├── Treebank ├── __init__.py └── CCGbank │ ├── Writers │ ├── __init__.py │ ├── _PargFileWriter.py │ └── _AutoFileWriter.py │ ├── __init__.py │ ├── profile_ccgbank.py │ ├── _CCGLeaf.py │ ├── _Printer.py │ ├── _Sentence.py │ ├── _Corpus.py │ ├── _File.py │ ├── _CCGFile.py │ ├── _Leaf.py │ ├── _CCGbank.py │ ├── _Node.py │ ├── _CCGSentence.py │ ├── _CCGNode.py │ └── _Production.py ├── ccg ├── grammar.py ├── __init__.py ├── lexicon.py ├── category.py ├── scat.py └── rules.py ├── README.md └── tests ├── test_lexicon.py ├── test_unify.py ├── test_secondary.py ├── test_parse.py ├── test_rules.py └── test_replace.py /Treebank/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Treebank/CCGbank/Writers/__init__.py: -------------------------------------------------------------------------------- 1 | from _AutoFileWriter import AutoFileWriter 2 | from _PargFileWriter import PargFileWriter 3 | -------------------------------------------------------------------------------- /Treebank/CCGbank/__init__.py: -------------------------------------------------------------------------------- 1 | from _CCGbank import CCGbank 2 | from _CCGFile import CCGFile 3 | from _CCGSentence import CCGSentence 4 | from _CCGNode import CCGNode 5 | from _CCGLeaf import CCGLeaf 6 | from _Production import Production, setComplexAdj 7 | -------------------------------------------------------------------------------- /ccg/grammar.py: -------------------------------------------------------------------------------- 1 | def read(loc): 2 | productions = [] 3 | for line in open(loc): 4 | if not line.strip(): 5 | continue 6 | freq, production = line.strip().split(' # ') 7 | production = production.replace('[nb]', '') 8 | parent, children = production.split(' --> ') 9 | children = children.split() 10 | left = children[0] 11 | if left == '((S[b]\NP)/NP)/': 12 | left = '(S[b]\NP)/NP' 13 | if len(children) == 2: 14 | right = children[1] 15 | else: 16 | right = None 17 | productions.append((parent, left, right, int(freq))) 18 | return productions 19 | 20 | 21 | -------------------------------------------------------------------------------- /ccg/__init__.py: -------------------------------------------------------------------------------- 1 | import ccg.category 2 | 3 | def isIdentical(c1, c2): 4 | return c1.exact_eq(c2) 5 | 6 | CONJ = ccg.category.from_string('conj') 7 | conj = CONJ 8 | COMMA = ccg.category.from_string(',{_}') 9 | SEMI_COLON = ccg.category.from_string(';{_}') 10 | COLON = ccg.category.from_string(':{_}') 11 | N = ccg.category.from_string('N') 12 | NP = ccg.category.from_string('NP') 13 | VP = ccg.category.from_string('S\NP') 14 | punct = { 15 | ',': True, 16 | ':': True, 17 | '.': True, 18 | ';': True, 19 | 'RRB': True, 20 | 'LRB': True, 21 | '-RRB-': True, 22 | '-LRB-': True, 23 | 'LQU': True, 24 | 'RQU': True, 25 | 'PUNCT': True 26 | } 27 | -------------------------------------------------------------------------------- /Treebank/CCGbank/profile_ccgbank.py: -------------------------------------------------------------------------------- 1 | import hotshot.stats 2 | import hotshot 3 | 4 | import Treebank.CCGbank 5 | import ccg.lexicon 6 | 7 | def load_files(): 8 | location = '/home/matt/code/repos/data/CCGbank1.2_np_v0.7' 9 | corpus = Treebank.CCGbank.CCGbank(path=location) 10 | ccg.lexicon.load() 11 | for i, child in enumerate(corpus.children()): 12 | if i == 100: 13 | break 14 | pass 15 | 16 | def pfile(function): 17 | prof = hotshot.Profile('/tmp/test.prof') 18 | prof.runcall(function) 19 | prof.close() 20 | stats = hotshot.stats.load('/tmp/test.prof') 21 | stats.strip_dirs() 22 | stats.sort_stats('time', 'calls') 23 | stats.print_stats(20) 24 | 25 | if __name__ == '__main__': 26 | pfile(load_files) 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | * Overview 2 | 3 | Manipulate Combinatory Categorial Grammar categories and derivations, for natural language processing research. 4 | 5 | The library is quite feature rich, but has a pretty messy API, and some bugs. 6 | 7 | The "killer feature" is the implementation of the CCG grammar rules and variable binding. After sentence.unify_vars() 8 | has been called, all categories will have all slots bound to "global" variables, which are unified to other 9 | variable bindings, and may have words attached. 10 | 11 | Aside from ugliness, there are two main sources of remaining problems: 12 | 13 | 1) Coordination is very difficult to get right with respect to unification, as we need a set of words, and we don't necessarily 14 | unify when we coordinate (think "red bus and green train". We do not unify "bus" and "train"!). 15 | 16 | 2) When a word is missing from the "markedup" file, we do a terrible job of guessing its annotation. 17 | 18 | -------------------------------------------------------------------------------- /tests/test_lexicon.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import ccg.lexicon 4 | 5 | class LexiconTests(unittest.TestCase): 6 | def test_entry(self): 7 | lexicon = ccg.lexicon._Lexicon() 8 | entry = """((S\NP)\(S\NP))/NP 9 | 2 (((S[X]{Y}\NP{Z}){Y}\(S[X]{Y}<1>\NP{Z}){Y}){_}/NP{W}<2>){_} 10 | 1 ncmod _ %f %li 11 | 2 dobj %l %f""" 12 | stag, annotated = lexicon._parse_entry(entry) 13 | self.assertEqual(stag, '((S\NP)\(S\NP))/NP') 14 | self.assertEqual(annotated, 15 | '(((S[X]{Y}\NP{Z}){Y}\(S[X]{Y}<1>\NP{Z}){Y}){_}/NP{W}<2>){_}') 16 | 17 | def test_all_annotated(self): 18 | lexicon = ccg.lexicon._Lexicon() 19 | for key, cat in lexicon.items(): 20 | if '{' in key: 21 | if key != cat.annotated: 22 | print cat.string 23 | self.assertEqual(key.replace('[nb]', ''), 24 | cat.annotated.replace('[nb]', '')) 25 | 26 | def test_all_supertags(self): 27 | lexicon = ccg.lexicon._Lexicon() 28 | for key, cat in lexicon.items(): 29 | if '{' not in key: 30 | self.assertEqual(key.replace('[nb]', ''), 31 | cat.string.replace('[nb]', '')) 32 | 33 | def test_load(self): 34 | ccg.lexicon.load() 35 | cat = ccg.category.from_string('S[dcl]\NP') 36 | self.assertEqual(cat.annotated, '(S[dcl]{_}\NP{Y}<1>){_}') 37 | 38 | 39 | if __name__ == '__main__': 40 | unittest.main() 41 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_CCGLeaf.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from ._Leaf import Leaf 4 | from ._CCGNode import CCGNode 5 | import ccg.scat 6 | 7 | class CCGLeaf(Leaf, CCGNode): 8 | neRE = re.compile(r'\|(?=[BI]-)') 9 | def __init__(self, **kwargs): 10 | textName = CCGLeaf.neRE.split(kwargs.pop('text')) 11 | if len(textName) == 2: 12 | text, entityTag = textName 13 | else: 14 | text = textName[0] 15 | entityTag = '' 16 | self.text = text 17 | self.entity = entityTag 18 | self.pos = kwargs.pop('pos') 19 | self.parg = kwargs.pop('parg') 20 | self.wordID = kwargs.pop('wordID') 21 | self.srl_args = {} 22 | CCGNode.__init__(self, headIdx=0, **kwargs) 23 | 24 | def sibling(self): 25 | return None 26 | 27 | def validate(self): 28 | return True 29 | 30 | def isAdjunct(self): 31 | return False 32 | 33 | 34 | def isPunct(self): 35 | return bool(self.label in ccg.punct) 36 | 37 | def changeLabel(self, newLabel): 38 | """ 39 | Change predicate-argument category 40 | """ 41 | oldLabel = self.parg 42 | newLabel = ccg.scat.SuperCat(newLabel) 43 | #newLabel.goldDeps = oldLabel.goldDeps 44 | #for head in oldLabel.heads(): 45 | # newLabel.addHead(head) 46 | self.parg = newLabel 47 | 48 | def head(self): 49 | return self 50 | 51 | def heads(self): 52 | return [self] 53 | 54 | @property 55 | def stag(self): 56 | return self.parent().label 57 | 58 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_Printer.py: -------------------------------------------------------------------------------- 1 | 2 | class Printer(object): 3 | """ 4 | Print a parse tree with good formatting 5 | """ 6 | 7 | def __call__(self, node): 8 | return self.actOn(node) 9 | 10 | def actOn(self, node): 11 | if node.isRoot(): 12 | return self._visitRoot(node) 13 | else: 14 | raise Break 15 | 16 | def _isLeaf(self, node): 17 | return node.isLeaf() 18 | 19 | def _visitRoot(self, node): 20 | """ 21 | Print each node's label, and track indentation 22 | """ 23 | self._indentation = 0 24 | self._lines = [] 25 | # Accrue print state 26 | self._printNode(node) 27 | # Ensure that brackets match 28 | assert self._indentation == 0 29 | return '\n'.join(self._lines) 30 | 31 | 32 | def _visitInternal(self, node): 33 | """ 34 | The visitor must control iteration itself, so only works on root. 35 | """ 36 | raise Break 37 | 38 | def _printNode(self, node): 39 | """ 40 | Print indentation, a bracket, then the node label. 41 | Then print the node's children, then a close bracket. 42 | """ 43 | indentation = ' '*self._indentation 44 | self._lines.append('%s(%s' % (indentation, node.label)) 45 | self._indentation += 1 46 | for child in node.children(): 47 | if self._isLeaf(child): 48 | self._printLeaf(child) 49 | else: 50 | self._printNode(child) 51 | self._lines[-1] = self._lines[-1] + ')' 52 | self._indentation -= 1 53 | 54 | def _printLeaf(self, node): 55 | self._lines[-1] = self._lines[-1] + ' %s' % (node.text) 56 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_Sentence.py: -------------------------------------------------------------------------------- 1 | from _Node import Node 2 | from _Printer import Printer 3 | 4 | class Sentence(Node): 5 | _printer = Printer() 6 | def __str__(self): 7 | return self._printer(self) 8 | 9 | def parent(self): 10 | """ 11 | Raises an error, because the root node has no parent 12 | """ 13 | raise AttributeError, "Cannot retrieve the parent of the root node! Current parse state:\n\n%s" % self.prettyPrint() 14 | 15 | def performOperation(self, operation): 16 | """ 17 | Accept a Visitor and call it on each child 18 | Goofy name/design is legacy from when I didn't know how to code :( 19 | """ 20 | operation.newStructure() 21 | operation.actOn(self) 22 | for node in self.depthList(): 23 | try: 24 | operation.actOn(node) 25 | # Give operations the opportunity to signal 26 | # when the work is complete 27 | except Break: 28 | break 29 | while operation.moreChanges: 30 | operation.actOn(self) 31 | for node in getattr(self, operation.listType)(): 32 | try: 33 | operation.actOn(node) 34 | # Give operations the opportunity to signal 35 | # when the work is complete 36 | except Break: 37 | break 38 | 39 | def isRoot(self): 40 | return True 41 | 42 | def _connectNodes(self, nodes, parentage): 43 | # Build the tree 44 | offsets = sorted(nodes.keys()) 45 | # Skip the top node 46 | offsets.pop(0) 47 | for key in offsets: 48 | node = nodes[key] 49 | parent = nodes[parentage[node]] 50 | parent.attachChild(node, len(parent)) 51 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_Corpus.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from _Node import Node 4 | from _File import File 5 | from _Sentence import Sentence 6 | 7 | class Corpus(Node): 8 | def parent(self): 9 | """ 10 | Raises an error, because the root node has no parent 11 | """ 12 | raise AttributeError, "Cannot retrieve the parent of the root node. Current parse state:\n\n%s" % self.prettyPrint() 13 | 14 | def attachChild(self, newChild): 15 | """ 16 | Append a file 17 | """ 18 | # Security isn't really an issue for Corpus, so just stick 19 | # it onto the list 20 | self._children.append(newChild) 21 | 22 | 23 | def performOperation(self, operation): 24 | """ 25 | Accept a Visitor and call it on each child 26 | Goofy name/design is legacy from when I didn't know how to code :( 27 | """ 28 | operation.newStructure() 29 | operation.actOn(self) 30 | for node in self.children(): 31 | operation.actOn(node) 32 | 33 | def child(self, index): 34 | """ 35 | Read a file by zero-index offset 36 | """ 37 | path = self._children[index] 38 | print >> sys.stderr, path 39 | return self.fileClass(path=path) 40 | 41 | 42 | def children(self): 43 | """ 44 | Generator to iterate through children 45 | """ 46 | for i in xrange(len(self._children)): 47 | yield self.child(i) 48 | 49 | def file(self, key): 50 | """ 51 | Read a file by path 52 | """ 53 | return self.fileClass(path=key) 54 | 55 | def sentence(self, key): 56 | filename, sentenceKey = key.split('~') 57 | file_ = self.file(filename) 58 | return file_.sentence(key) 59 | 60 | def sentences(self): 61 | for child in self.children(): 62 | for sentence in child.children(): 63 | yield sentence 64 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_File.py: -------------------------------------------------------------------------------- 1 | from _Node import Node 2 | 3 | class File(Node): 4 | """ 5 | A file in a treebank 6 | """ 7 | def __init__(self, **kwargs): 8 | self._IDDict = {} 9 | Node.__init__(self, **kwargs) 10 | 11 | def attachChild(self, newChild): 12 | """ 13 | Append a sentence 14 | """ 15 | # Security isn't really an issue for Files, so just append the new 16 | # Sentence without complaint 17 | self._children.append(newChild) 18 | self._IDDict[newChild.globalID] = newChild 19 | 20 | def detachChild(self, node): 21 | """ 22 | Delete a sentence 23 | """ 24 | self._children.remove(node) 25 | self._IDDict.pop(node.globalID) 26 | 27 | def sentence(self, key): 28 | """ 29 | Retrieve a sentence by key 30 | """ 31 | return self._IDDict[key] 32 | 33 | def prettyPrint(self): 34 | return "(%d %s)" % (self.localID, '\n\n\n'.join([child.prettyPrint() for child in self.children()])) 35 | 36 | def performOperation(self, operation): 37 | """ 38 | Accept a Visitor and call it on each child 39 | Goofy name/design is legacy from when I didn't know how to code :( 40 | """ 41 | operation.newStructure() 42 | operation.actOn(self) 43 | for node in getattr(self, operation.listType)(): 44 | try: 45 | operation.actOn(node) 46 | # Give operations the opportunity to signal 47 | # when the work is complete 48 | except Break: 49 | break 50 | while operation.moreChanges: 51 | operation.actOn(self) 52 | for node in getattr(self, operation.listType)(): 53 | try: 54 | operation.actOn(node) 55 | # Give operations the opportunity to signal 56 | # when the work is complete 57 | except Break: 58 | break 59 | -------------------------------------------------------------------------------- /Treebank/CCGbank/Writers/_PargFileWriter.py: -------------------------------------------------------------------------------- 1 | from _AutoFileWriter import AutoFileWriter 2 | import os 3 | from os.path import join as pjoin 4 | 5 | class PargFileWriter(AutoFileWriter): 6 | 7 | 8 | def writeFile(self, fileID, sentences): 9 | path = self._getPath(fileID) 10 | output = open(path, 'w') 11 | for sentence in sentences: 12 | output.write(sentence + '\n') 13 | output.close() 14 | 15 | def getSentenceStr(self, sentence): 16 | idLine = self._getIDLine(sentence) 17 | deps = [] 18 | for word in sentence.listWords(): 19 | for argHead, depType, argNum in word.parg.goldDependencies(): 20 | depStr = self._makeDep(word, argHead, argNum, depType) 21 | deps.append(depStr) 22 | deps.sort() 23 | deps.insert(0, idLine) 24 | deps.append('<\s>') 25 | return '\n'.join(deps) 26 | 27 | 28 | def _getPath(self, fileID): 29 | dirSect = fileID[4:6] 30 | directory = pjoin(self.directory, dirSect) 31 | if not os.path.exists(directory): 32 | os.mkdir(directory) 33 | return pjoin(directory, fileID.replace('auto', 'parg')) 34 | 35 | def _getIDLine(self, sentence): 36 | idLine = ' %d' % (sentence.globalID, sentence.getWord(-1).wordID) 37 | return idLine 38 | 39 | 40 | def _makeDep(self, head, arg, argNum, depType): 41 | """ 42 | A depedency between the ith word and the jth word (wordI and wordJ) 43 | where the jth word has the lexical (functor) category catJ, and the 44 | ith word is head of the constituent which fills the kth argument slot 45 | of catJ is described as: 46 | i j cat_j arg_k word_i word_j 47 | """ 48 | i = arg.wordID 49 | j = head.wordID 50 | catJ = str(head.parg) 51 | argK = argNum 52 | wordI = arg.text 53 | wordJ = head.text 54 | dep = '%d \t %d \t %s \t %d \t %s %s' % (i, j, catJ, argNum, wordI, wordJ) 55 | if depType != 'L': 56 | dep = dep + ' ' + depType 57 | return dep 58 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_CCGFile.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import os 4 | import re 5 | 6 | from ._File import File 7 | from ._CCGNode import CCGNode 8 | from ._CCGSentence import CCGSentence 9 | 10 | class CCGFile(File, CCGNode): 11 | mmRE = re.compile(r'(?<=[/\\])[\.]') 12 | def __init__(self, **kwargs): 13 | if 'string' in kwargs: 14 | text = kwargs.pop('string') 15 | path = kwargs.pop('path') 16 | else: 17 | path = kwargs.pop('path') 18 | text = open(path).read() 19 | # Hack for mmccg version 20 | text = self.mmRE.sub('', text) 21 | # Sometimes sentences start (( instead of ( (. This is an error, correct it 22 | filename = path.split('/')[-1] 23 | self.path = path 24 | self.filename = filename 25 | self.ID = filename 26 | self._IDDict = {} 27 | CCGNode.__init__(self, label='File', headIdx=0, **kwargs) 28 | self._parseFile(text) 29 | 30 | def _parseFile(self, text): 31 | lines = text.strip().split('\n') 32 | while lines: 33 | idLine = lines.pop(0) 34 | sentence = lines.pop(0) 35 | self._addSentence(idLine, sentence) 36 | 37 | def _addSentence(self, idLine, sentStr): 38 | try: 39 | globalID = idLine.split(' ')[0].split('=')[1] 40 | except: 41 | print sentStr 42 | print >> sys.stderr, idLine 43 | raise 44 | sentence = CCGSentence(globalID=globalID, string=sentStr, 45 | localID=self.length()) 46 | self.attachChild(sentence) 47 | 48 | pargSentsRE = re.compile(r' \d+\n(?:(\d.+?)\n)?<\\s>', re.DOTALL) 49 | def addPargDeps(self, pargPath=None): 50 | pargPath = self.path.rsplit('/', 2)[0].replace('AUTO', 'PARG') 51 | section = self.ID[4:6] 52 | fileLoc = os.path.join(pargPath, section, self.ID.replace('auto', 'parg')) 53 | text = open(fileLoc).read().strip() 54 | for i, matchObj in enumerate(CCGFile.pargSentsRE.finditer(text)): 55 | if not matchObj.groups()[0]: 56 | continue 57 | pargSent = matchObj.groups()[0] 58 | deps = [dep.split() for dep in pargSent.split('\n')] 59 | sentence = self.child(i) 60 | sentence.addPargDeps(deps) 61 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_Leaf.py: -------------------------------------------------------------------------------- 1 | from _Node import Node 2 | 3 | class Leaf(Node): 4 | """ 5 | A leaf of the parse tree -- ie, a word, punctuation or trace 6 | Cannot attach or retrieve children 7 | """ 8 | def __hash__(self): 9 | return self.wordID 10 | 11 | def isLeaf(self): 12 | return True 13 | 14 | def attachChild(self, newChild, index = None): 15 | raise AttachmentError, "Cannot add node\n\n%s\n\nto leaf:\n\n%s\n\nLeaves cannot have children." \ 16 | % (newChild.prettyPrint(), self.prettyPrint()) 17 | 18 | def length(self, constraint = None): 19 | return 0 20 | 21 | def child(self, index): 22 | """ 23 | Raises an error, because leaf nodes have no children 24 | """ 25 | raise AttributeError, "Cannot retrieve children from leaf nodes! Attempted on leaf:\n\n%s" % self.prettyPrint() 26 | 27 | def detachChild(self, node): 28 | """ 29 | Raises an error, because leaf nodes have no children 30 | """ 31 | raise AttributeError, "Cannot remove children from leaf nodes! Attempted on leaf:\n\n%s" % self.prettyPrint() 32 | 33 | def prettyPrint(self): 34 | return "(%s %s)" % (self.label, self.text) 35 | 36 | def listWords(self): 37 | return [self] 38 | 39 | def lemma(self): 40 | """ 41 | Get lemma from COMLEX entry, otherwise text 42 | """ 43 | if self.metadata.get('COMLEX'): 44 | return self.metadata['COMLEX'][0].features['ORTH'][0][1:-1] 45 | elif self.label in ['NNP', 'NNPS']: 46 | return self.text 47 | else: 48 | return self.text.lower() 49 | 50 | def isTrace(self): 51 | return bool(self.label == '-NONE-') 52 | 53 | 54 | def isPunct(self): 55 | punct = { 56 | ',': True, 57 | ':': True, 58 | '.': True, 59 | ';': True, 60 | 'RRB': True, 61 | 'LRB': True 62 | } 63 | return bool(self.label in punct) 64 | 65 | def nextWord(self): 66 | words = self.root().listWords() 67 | nextID = self.wordID + 1 68 | if nextID == len(words): 69 | return None 70 | else: 71 | assert self is not words[self.wordID+1] 72 | return words[self.wordID+1] 73 | 74 | def prevWord(self): 75 | if self.wordID == 0: 76 | return None 77 | words = self.root().listWords() 78 | return words[self.wordID - 1] 79 | -------------------------------------------------------------------------------- /tests/test_unify.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import ccg.rules 4 | import ccg.scat 5 | import ccg.lexicon 6 | import Treebank.CCGbank 7 | 8 | ccg.lexicon.load() 9 | 10 | class TestUnify(unittest.TestCase): 11 | def test_fapply_adjunct(self): 12 | c1 = ccg.scat.SuperCat('N/N') 13 | c2 = ccg.scat.SuperCat('N') 14 | parent = ccg.scat.SuperCat('N') 15 | production = ccg.rules.Production(c1, c2, parent) 16 | parent.bind_vars(production.result, parent.category, 17 | production.result.category) 18 | left_arg_global_vars = c1.get_vars(c1.argument) 19 | self.assertEqual(left_arg_global_vars, parent.get_vars(parent)) 20 | 21 | def test_fcomp(self): 22 | c1 = ccg.scat.SuperCat('(S[dcl]\NP)/(S[pss]\NP)') 23 | c2 = ccg.scat.SuperCat('(S[pss]\NP)/NP') 24 | parent = ccg.scat.SuperCat('(S[dcl]\NP)/NP') 25 | production = ccg.rules.Production(c1, c2, parent) 26 | parent.bind_vars(production.result, parent.category, 27 | production.result.category) 28 | left_arg_global_vars = c1.get_vars(c1.argument) 29 | right_result_global_vars = c2.get_vars(c2.result) 30 | self.assertEqual(right_result_global_vars, left_arg_global_vars) 31 | laa_global_vars = c1.get_vars(c1.argument.argument) 32 | ra_global_vars = c2.get_vars(c2.result.argument) 33 | self.assertEqual(laa_global_vars, ra_global_vars) 34 | self.assertFalse(laa_global_vars == left_arg_global_vars) 35 | self.assertEqual(parent.get_vars(parent.result.argument), 36 | c1.get_vars(c1.result.argument)) 37 | 38 | def test_fapply_sentence(self): 39 | sent_str = ("( ( ( " 40 | "() () ) ( () () )" 43 | " ) () )") 44 | sent = Treebank.CCGbank.CCGSentence(string=sent_str, globalID=0, 45 | localID=0) 46 | sent.unify_vars() 47 | ms, haag, plays, elianti, period = [w.stag for w in sent.listWords()] 48 | self.assertEqual(ms.get_vars(ms.argument), 49 | haag.get_vars(ms)) 50 | self.assertEqual(haag.get_vars(haag), 51 | plays.get_vars(plays.result.argument)) 52 | self.assertEqual(plays.get_vars((plays.argument)), 53 | elianti.get_vars(elianti)) 54 | 55 | 56 | if __name__ == '__main__': 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_CCGbank.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import os.path 4 | 5 | import ccg.lexicon 6 | from ._Corpus import Corpus 7 | from ._CCGNode import CCGNode 8 | from ._CCGFile import CCGFile 9 | 10 | 11 | 12 | class CCGbank(Corpus, CCGNode): 13 | fileClass = CCGFile 14 | def __init__(self, path=None, **kwargs): 15 | self._children = [] 16 | self.path = path 17 | for fileLoc in self._getFileList(self.path): 18 | self.attachChild(fileLoc) 19 | ccg.lexicon.load(os.path.join(path, 'markedup')) 20 | 21 | def child(self, index): 22 | """ 23 | Read a file by zero-index offset 24 | """ 25 | path = self._children[index] 26 | print >> sys.stderr, path 27 | return self.fileClass(path=path) 28 | 29 | def sentence(self, key): 30 | fileName, sentID = key.split('.') 31 | section = fileName[4:6] 32 | fileID = os.path.join(self.path, 'data', 'AUTO', section, fileName + 33 | '.auto') 34 | f = self.file(fileID) 35 | #pargLoc = fileID.rsplit('/', 2)[0].replace('AUTO', 'PARG') 36 | #f.addPargDeps(pargLoc) 37 | return f.sentence(key) 38 | 39 | def tokens(self): 40 | """ 41 | Generate tokens without parsing the files properly 42 | """ 43 | tokenRE = re.compile(r'') 44 | for path in self._children: 45 | string = open(path).read() 46 | for cat, pos, form in tokenRE.findall(string): 47 | yield form, pos, cat 48 | 49 | def section(self, sec): 50 | for i, fileLoc in enumerate(self._children): 51 | path, fileName = os.path.split(fileLoc) 52 | if int(fileName[4:6]) == sec: 53 | yield self.child(i) 54 | 55 | 56 | def section00(self): 57 | for i in xrange(99): 58 | yield self.child(i) 59 | 60 | def twoTo21(self): 61 | for i in xrange(199, 2074): 62 | yield self.child(i) 63 | 64 | def section23(self): 65 | for i in xrange(2157, 2257): 66 | yield self.child(i) 67 | 68 | def section24(self): 69 | for i in xrange(2257, self.length()): 70 | yield self.child(i) 71 | 72 | def _getFileList(self, location): 73 | """ 74 | Get all files below location 75 | """ 76 | paths = [] 77 | for path in [os.path.join(location, f) for f in os.listdir(location)]: 78 | if path.endswith('CVS'): 79 | continue 80 | elif path.startswith('.'): 81 | continue 82 | if os.path.isdir(path): 83 | paths.extend(self._getFileList(path)) 84 | elif path.endswith('.mrg') or path.endswith('.auto'): 85 | paths.append(path) 86 | paths.sort() 87 | return paths 88 | 89 | 90 | -------------------------------------------------------------------------------- /tests/test_secondary.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import ccg.category 4 | import ccg.lexicon 5 | 6 | ccg.lexicon.load() 7 | 8 | class TestSecondary(unittest.TestCase): 9 | def test_inner_result(self): 10 | c = ccg.category.from_string('(S[dcl]\NP)/NP') 11 | self.assertEqual(c.inner_result.annotated, 'S[dcl]{_}') 12 | c = ccg.category.from_string('((S\NP)\(S\NP))/NP') 13 | self.assertEqual(c.inner_result.annotated, 'S[X]{Y}') 14 | c = ccg.category.from_string('NP') 15 | self.assertEqual(c.inner_result, 'NP') 16 | 17 | def test_is_predicate(self): 18 | c = ccg.category.from_string('PP/NP') 19 | self.assertFalse(c.is_predicate) 20 | c = ccg.category.from_string('S[dcl]\NP') 21 | self.assertTrue(c.is_predicate) 22 | c = ccg.category.from_string(r'(S[adj]\NP)/(S[adj]\NP)') 23 | self.assertFalse(c.is_predicate) 24 | 25 | def test_is_adjunct(self): 26 | c = ccg.category.from_string('(S\NP)\(S\NP)') 27 | self.assertTrue(c.is_adjunct) 28 | c = ccg.category.from_string('((S[X]{Y}\NP{Z}){Y}/' 29 | '(S[X]{Y}\NP{Z*}){Y}<1>){_}') 30 | self.assertTrue(c.is_adjunct) 31 | c = ccg.category.from_string('(S[X]{_}/S[X]{Y}){_}') 32 | self.assertFalse(c.is_adjunct) 33 | c = ccg.category.from_string('(PP{Y}/PP{Y}){_}') 34 | self.assertTrue(c.is_adjunct) 35 | 36 | def test_has_adjunct(self): 37 | c = ccg.category.from_string('((S\NP)\(S\NP))/NP') 38 | self.assertTrue(c.has_adjunct) 39 | c = ccg.category.from_string('NP/N') 40 | self.assertFalse(c.has_adjunct) 41 | 42 | def test_is_aux(self): 43 | c = ccg.category.from_string('(S[dcl]\NP)/(S[dcl]\NP)') 44 | self.assertTrue(c.is_aux) 45 | c = ccg.category.from_string('(S[adj]\NP)/(S[adj]\NP)') 46 | self.assertTrue(c.is_aux) 47 | c = ccg.category.from_string('PP/NP') 48 | self.assertFalse(c.is_aux) 49 | 50 | def test_is_true_aux(self): 51 | c = ccg.category.from_string('(S[dcl]\NP)/(S[dcl]\NP)') 52 | self.assertTrue(c.is_true_aux) 53 | c = ccg.category.from_string('(S[adj]\NP)/(S[adj]\NP)') 54 | self.assertFalse(c.is_true_aux) 55 | c = ccg.category.from_string('PP/NP') 56 | self.assertFalse(c.is_true_aux) 57 | 58 | 59 | def test_srl_annot_string(self): 60 | stag = ccg.scat.SuperCat('(S[dcl]\NP)/NP') 61 | stag.srl_annot.add(('_', 'A0', 'Y')) 62 | stag.srl_annot.add(('Z', 'AM-TMP', '_')) 63 | n, stag_str, annotated, roles = stag.srl_string() 64 | assert n == 2 65 | self.assertEqual(stag_str, "(S[dcl]\\NP)/NP@X'A0'Y_Z'AM-TMP'X") 66 | self.assertEqual(annotated, 67 | "((S[dcl]{_}\\NP{Y}<1>){_}/NP{Z}<2>){_}@X'A0'Y_Z'AM-TMP'X") 68 | assert roles == ['1 A0 %l %f', '2 AM-TMP %f %l'] 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /Treebank/CCGbank/Writers/_AutoFileWriter.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join as pjoin 3 | import re 4 | 5 | import ccg.lexicon 6 | 7 | class AutoFileWriter: 8 | """ 9 | Write a .auto format file 10 | """ 11 | def __init__(self, **kwargs): 12 | if 'directory' in kwargs: 13 | self.setDir(kwargs.pop('directory')) 14 | if 'markedup' in kwargs and False: # Don't support markedup right now 15 | muLoc = kwargs.pop('markedup') 16 | entries, unused = ccg.Markedup.getEntries(muLoc) 17 | markedup = {} 18 | for entry in entries: 19 | markedup[entry.cat] = entry.toJulia() 20 | self.markedup = markedup 21 | else: 22 | self.markedup = {} 23 | 24 | def setDir(self, directory): 25 | if not os.path.exists(directory): 26 | print "Making %s" % directory 27 | os.makedirs(directory) 28 | self.directory = directory 29 | 30 | def getSentenceStr(self, sentence): 31 | lines = [] 32 | idLine = self._getIDLine(sentence.globalID) 33 | lines.append(idLine) 34 | lines.append(self._nodeString(sentence.child(0))) 35 | return '\n'.join(lines) 36 | 37 | def writeFile(self, fileID, sentences): 38 | path = self._getPath(fileID) 39 | output = open(path, 'w') 40 | for sentence in sentences: 41 | output.write(sentence + '\n') 42 | output.close() 43 | 44 | def _getPath(self, fileID): 45 | dirSect = fileID[4:6] 46 | directory = pjoin(self.directory, dirSect) 47 | if not os.path.exists(directory): 48 | os.mkdir(directory) 49 | return pjoin(directory, fileID) 50 | 51 | 52 | def _getIDLine(self, sentenceID): 53 | return "ID=%s PARSER=GOLD NUMPARSE=1" % sentenceID 54 | 55 | 56 | def _nodeString(self, node): 57 | if node.child(0).isLeaf(): 58 | return self._leafString(node) 59 | else: 60 | childStrings = [] 61 | for child in node.children(): 62 | childStrings.append(self._nodeString(child)) 63 | nodeString = '( %s )' % (node.label, node.headIdx, len(childStrings), ' '.join(childStrings)) 64 | return nodeString 65 | 66 | def _leafString(self, node): 67 | leaf = node.child(0) 68 | #if not leaf.parg: 69 | # print leaf 70 | # print leaf.parent() 71 | # raise StandardError 72 | annotated = self.markedup.get(leaf.stag, leaf.stag.annotated) 73 | #if leaf.stag != node.label: 74 | # print node.root().globalID 75 | # print node 76 | # print leaf 77 | # print leaf.stag 78 | # raise StandardError 79 | annot_strip_re = re.compile(r'<\d>|\*') 80 | stag_str = annot_strip_re.sub('', leaf.stag.annotated) 81 | stag_str += '@%s' % leaf.stag.srl_string() 82 | properties = [ 83 | leaf.stag.string, 84 | leaf.pos, 85 | leaf.label, 86 | leaf.text, 87 | stag_str] 88 | try: 89 | leafString = '()' % ' '.join(properties) 90 | except: 91 | print properties 92 | raise 93 | return leafString 94 | -------------------------------------------------------------------------------- /ccg/lexicon.py: -------------------------------------------------------------------------------- 1 | """ 2 | A lexicon loaded from a markedup file 3 | """ 4 | import os 5 | import os.path 6 | from collections import defaultdict 7 | 8 | import ccg.category 9 | 10 | _INIT_STR = "# now list the markedup categories" 11 | DEFAULT_PATH = os.path.join(os.path.split(__file__)[0], 'markedup') 12 | CATS = {} 13 | 14 | 15 | def load(path=DEFAULT_PATH): 16 | global CATS 17 | CATS = _Lexicon(path) 18 | 19 | class _Lexicon(dict): 20 | def __init__(self, path=DEFAULT_PATH): 21 | dict.__init__(self) 22 | self.cats = defaultdict(int) 23 | for entry in self._split_entries(open(path).read()): 24 | if not entry: 25 | continue 26 | entry = entry.strip() 27 | supertag, annotated = self._parse_entry(entry) 28 | self.add_entry(supertag, annotated) 29 | self.add_entry(supertag.replace('[nb]', ''), annotated.replace('[nb]', '')) 30 | 31 | def add_entry(self, supertag, annotated): 32 | annotated = annotated.split('@')[0] 33 | if supertag in self and annotated != self[supertag].annotated: 34 | #print supertag 35 | #print annotated 36 | #print self[supertag].annotated 37 | return None 38 | if '{R}' in annotated: 39 | return None 40 | try: 41 | category = ccg.category.from_string(annotated) 42 | except: 43 | print entry 44 | raise 45 | self[supertag] = category 46 | self[annotated] = category 47 | # Allow frequencies to be set 48 | self.cats[category] = 0 49 | 50 | def _split_entries(self, markedup): 51 | header, text = markedup.split(_INIT_STR) 52 | return text.split('\n\n') 53 | 54 | def _parse_entry(self, entry_str): 55 | lines = [line for line in entry_str.split('\n') 56 | if not line.startswith('#')] 57 | supertag = lines[0] 58 | n_args, annotated = lines[1].strip().split() 59 | return supertag, annotated 60 | 61 | class MarkedupEntry(object): 62 | def __init__(self, markedup_str): 63 | self.string = markedup_str 64 | lines = [l for l in markedup_str.split('\n') 65 | if not l.strip().startswith('#')] 66 | bare_category = lines.pop(0) 67 | n_slots, annotated_category = lines.pop(0).strip().split(' ') 68 | if lines and lines[0].startswith(' !'): 69 | alt_markedup = lines.pop(0)[4:] 70 | else: 71 | alt_markedup = '' 72 | slots = defaultdict(list) 73 | for line in lines: 74 | slot = Slot(line) 75 | slots[slot.n].append(slot) 76 | 77 | self.category = ccg.category.from_string(bare_category) 78 | self.annotated = ccg.category.from_string(annotated_category) 79 | self.n_grs = int(n_slots) 80 | if alt_markedup: 81 | self.alt_annotated = ccg.category.from_string(alt_markedup) 82 | else: 83 | self.alt_annotated = self.annotated 84 | self.grs = slots 85 | 86 | 87 | class Slot(object): 88 | def __init__(self, slot_str): 89 | pieces = slot_str.strip().split(' ') 90 | if pieces and pieces[-1].startswith('='): 91 | self.constraint_name = pieces.pop(-1) 92 | self.constraint_group = CONSTRAINT_GROUPS.get(self.constraint_name, set()) 93 | else: 94 | self.constraint_name = None 95 | self.constraint_group = set() 96 | 97 | if not pieces[-1].startswith('%') and pieces[-1] != 'ignore': 98 | self.subtype2 = pieces.pop(-1) 99 | else: 100 | self.subtype2 = None 101 | 102 | self.words = [p for p in pieces if p.startswith('%')] 103 | pieces = [p for p in pieces if not p.startswith('%')] 104 | 105 | self.n = int(pieces.pop(0)) 106 | self.label = pieces.pop(0) 107 | if pieces: 108 | self.subtype1 = pieces.pop(0) 109 | else: 110 | self.subtype1 = None 111 | assert not pieces 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /tests/test_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import ccg 3 | import ccg.category 4 | import ccg.lexicon 5 | ccg.lexicon.load() 6 | 7 | class TestAtomicRE(unittest.TestCase): 8 | def test_basic(self): 9 | basic = [('NP', 'NP', '', False, None)] 10 | self._run(basic) 11 | 12 | def test_feats(self): 13 | feats = [('NP[nb]', 'NP', '[nb]', False, None), 14 | ('PP[dcl]', 'PP', '[dcl]', False, None) 15 | ] 16 | self._run(feats) 17 | 18 | def test_conj(self): 19 | conj = [('NP[conj]', 'NP', '', True, None)] 20 | self._run(conj) 21 | 22 | def test_hat(self): 23 | hat = [('N^NP', 'NP', '', False, 'NP', 24 | 'N^S[dcl]', 'NP', '', False, 'S[dcl]', 25 | 'N^S[dcl][conj]', 'N', '', True, 'S[dcl]')] 26 | 27 | def test_feat_conj(self): 28 | feat_conj = [('NP[nom][conj]', 'NP', '[nom]', True, None)] 29 | self._run(feat_conj) 30 | 31 | def test_var(self): 32 | cat = ccg.category.from_string('NP{_}') 33 | self.assertEqual(cat.string, 'NP') 34 | self.assertEqual(cat.var, 0) 35 | 36 | def _run(self, cases): 37 | for cat, atom, feat, conj, hat in cases: 38 | category = ccg.category.from_string(cat) 39 | self.assertEqual(atom, category.cat) 40 | if feat != '[nb]': 41 | self.assertEqual(feat, category.feature) 42 | self.assertEqual(conj, category.conj) 43 | self.assertEqual(hat, category.hat) 44 | 45 | 46 | class TestComplex(unittest.TestCase): 47 | def test_basic(self): 48 | basic = [(r'(S[dcl]\NP)/NP', 'S[dcl]\NP', 'NP', None)] 49 | self._run(basic) 50 | 51 | def test_hat(self): 52 | hat = [(r'((S[dcl]\NP)/NP)^(NP\NP)', 'S[dcl]\NP', 'NP', 'NP\NP'), 53 | (r'NP^PP/N', 'NP^PP', 'N', None), 54 | (r'N^NP^(S/S)/NP', 'N^NP^(S/S)', 'NP', None), 55 | (r'N^(S[dcl]^NP/NP)/NP[conj]', 'N^(S[dcl]^NP/NP)', 'NP', 56 | None), 57 | (r'(NP/PP)^(S/S)/NP', '(NP/PP)^(S/S)', 'NP', None)] 58 | self._run(hat) 59 | 60 | def test_conj(self): 61 | category = ccg.category.from_string('(S\NP)\(S\NP)[conj]') 62 | self.assertEqual(category.conj, True) 63 | category = ccg.category.from_string('S[dcl]\NP[conj]') 64 | self.assertEqual(len(category.cats_by_var), 2) 65 | 66 | def test_complex(self): 67 | c = ('(((S[wq]{_}/PP{Y}<1>){_}/((S[q]{Z}<2>/PP{Y*}){Z}' 68 | '/(S[adj]{W*}\NP{V}){W*}){Z}){_}/(S[adj]{W}<3>' 69 | '\NP{V}){W}){_}') 70 | cat = ccg.category.from_string(c) 71 | self.assertEqual(c, cat.annotated) 72 | 73 | def _run(self, cases): 74 | for cat, result, argument, hat in cases: 75 | category = ccg.category.from_string(cat) 76 | self.assertEqual(cat, category.string) 77 | self.assertEqual(result, category.result.string) 78 | self.assertEqual(argument, category.argument.string) 79 | self.assertEqual(str(hat), str(category.hat)) 80 | 81 | def test_safety(self): 82 | cat = ccg.category.from_string(r'(S[dcl]\NP)/NP') 83 | #cat.result = _parse.Category('NP') 84 | 85 | def test_var(self): 86 | cat = ccg.category.from_string('(S[dcl]{_}\NP{Y}){_}') 87 | self.assertEqual(cat.string, 'S[dcl]\NP') 88 | self.assertEqual(cat.var, 0) 89 | self.assertEqual(cat.argument.var, 1) 90 | cat = ccg.category.from_string('((S[X]{Y}\NP{Z}){Y}/(S[X]{Y}\NP{Z}){Y}){_}') 91 | self.assertEqual(cat.string, '(S\NP)/(S\NP)') 92 | self.assertEqual(cat.result.result.var, 1) 93 | self.assertEqual(cat.argument.argument.var, 2) 94 | self.assertEqual(cat.argument.result.var, 1) 95 | self.assertEqual(cat.result.argument.var, 2) 96 | cat = ccg.category.from_string('(N{_}^(S[X]{Y}\S[X]{Y}){_}/NP{Y}){_}') 97 | self.assertEqual(cat.string, 'N^(S\S)/NP') 98 | self.assertEqual(cat.result.hat.result.var, 1) 99 | 100 | def test_multi_var(self): 101 | cat = ccg.category.from_string('(PP{Y,_}/NP{Y}){_}') 102 | assert cat.var == 0 103 | assert cat.var2 == -1 104 | assert cat.result.var == 1 105 | assert cat.result.var2 == 0 106 | assert cat.annotated == '(PP{Y,_}/NP{Y}){_}' 107 | 108 | def test_variable_guessing(self): 109 | cat_str = r'PP/(S[to]\NP)' 110 | assert cat_str not in ccg.lexicon.CATS 111 | cat = ccg.category.from_string(r'PP/(S[to]\NP)') 112 | # Need to fix this somehow 113 | assert cat.annotated != r'(PP{_}/(S[to]{_}\NP{Y}<1>){_}){_}' 114 | 115 | if __name__ == "__main__": 116 | unittest.main() 117 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_Node.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | 3 | class Node(object): 4 | globalID = 0 5 | def __init__(self, label): 6 | self.globalID = Node.globalID 7 | self._children = [] 8 | self._parent = None 9 | Node.globalID += 1 10 | self.label = label 11 | 12 | def __hash__(self): 13 | return hash(self.globalID) 14 | 15 | def reattach(self, newParent, index = None): 16 | """ 17 | Detach from current location and move to a new location 18 | in the tree 19 | """ 20 | depthList = self.depthList() 21 | lookup = {} 22 | for n in depthList: 23 | lookup[n] = True 24 | assert not newParent in lookup 25 | self._detachFromParent() 26 | newParent.attachChild(self, index) 27 | 28 | def attachChild(self, newChild, index = None): 29 | """ 30 | Attach a (parentless) child. If the child has a parent 31 | already, call its reattach method 32 | """ 33 | # Don't allow bidirectional parenthood 34 | assert not self is newChild 35 | if newChild.parent(): 36 | raise AttachmentError, 'Cannot attach node:\n\n%s\n\nto:\n\n%s\n\nNode is already attached to\n\n%s' \ 37 | % (newChild.prettyPrint(), self.prettyPrint(), newChild.parent().prettyPrint()) 38 | if index == None: 39 | bisect.insort_right(self._children, newChild) 40 | else: 41 | self._children.insert(index, newChild) 42 | newChild.setParent(self) 43 | 44 | 45 | def _detachFromParent(self): 46 | self._parent.detachChild(self) 47 | self._parent = None 48 | 49 | def detachChild(self, node): 50 | """ 51 | Detach a specific node. Deprecated; use node.prune() 52 | """ 53 | self._children.remove(node) 54 | 55 | 56 | def setParent(self, node): 57 | """ 58 | Set a node as parent. Does not add as child 59 | """ 60 | assert not self._parent 61 | self._parent = node 62 | 63 | 64 | def prettyPrint(self): 65 | """ 66 | Deprecated alias for __str__ 67 | """ 68 | return "(%s %s)" % (self.label, ' '.join([child.prettyPrint() for child in self.children()])) 69 | 70 | def parent(self): 71 | """ 72 | Returns _parent 73 | Should be change to property, perhaps 74 | """ 75 | return self._parent 76 | 77 | 78 | def child(self, index): 79 | """ 80 | Returns the child at index 81 | """ 82 | return self._children[index] 83 | 84 | def children(self): 85 | """ 86 | Generator for children 87 | """ 88 | # Must use list copy, lest the list change out from under the iteration 89 | for child in list(self._children): 90 | yield child 91 | 92 | def insert(self, node): 93 | """ 94 | Insert a node above self 95 | """ 96 | self.parent().replace(self, node) 97 | node.attachChild(self) 98 | 99 | def delete(self): 100 | """ 101 | Delete self from the tree, reattaching children to parent 102 | """ 103 | parent = self.parent() 104 | self.prune() 105 | for node in self.children(): 106 | node.reattach(parent) 107 | 108 | def replace(self, currentChild, replacement): 109 | """ 110 | Insert a new node where an old one was 111 | """ 112 | index = self._children.index(currentChild) 113 | if replacement.parent(): 114 | replacement.reattach(self, index) 115 | else: 116 | self.attachChild(replacement, index) 117 | currentChild.prune() 118 | 119 | def prune(self): 120 | """ 121 | Detach node from parent 122 | """ 123 | self._detachFromParent() 124 | 125 | def sortChildren(self): 126 | """ 127 | Sort children in-place. Should not be necessary, but just in case... 128 | """ 129 | decorated = [(c.getWordID(0), c) for c in self._children] 130 | decorated.sort() 131 | self._children = [d[1] for d in decorated] 132 | 133 | def depthList(self): 134 | """ 135 | Depth-first node list 136 | """ 137 | # Avoid recursion, for speed 138 | queue = list(self.children()) 139 | # Can't use enumerate because changing list in place 140 | # Must stay 1 ahead of the current index 141 | i = 0 142 | for node in queue: 143 | i += 1 144 | if not node.isLeaf(): 145 | for j, child in enumerate(node.children()): 146 | queue.insert(i+j, child) 147 | return queue 148 | 149 | def breadthList(self): 150 | """ 151 | Breadth-first node list 152 | """ 153 | children = [child for child in self.children()] 154 | for child in children: 155 | for subChild in child.children(): 156 | children.append(subChild) 157 | return children 158 | 159 | def getWordID(self, index): 160 | """ 161 | Word ID at index. Generally 0 or -1 162 | """ 163 | wordIDList = [word.wordID for word in self.listWords()] 164 | return wordIDList[index] 165 | 166 | def getWord(self, index): 167 | """ 168 | Word ID at index. Generally 0 or -1 169 | """ 170 | wordList = self.listWords() 171 | if not wordList: 172 | return None 173 | return wordList[index] 174 | 175 | def listWords(self): 176 | """ 177 | List the word yield of the node 178 | """ 179 | return [n for n in self.depthList() if n.isLeaf()] 180 | 181 | def length(self, constraint = None): 182 | """ 183 | Alias for __len__, except this allows a constraint function 184 | """ 185 | if constraint == None: 186 | return len(self._children) 187 | else: 188 | return len([c for c in self.children() if constraint(c)]) 189 | 190 | def siblings(self): 191 | """ 192 | Return a list of sibling nodes 193 | """ 194 | return [s for s in self.parent().children() if s != self] 195 | 196 | def isLeaf(self): 197 | return False 198 | 199 | def isRoot(self): 200 | return False 201 | 202 | def root(self): 203 | """ 204 | Return the Sentence node at the top of the tree 205 | """ 206 | node = self 207 | while not node.isRoot(): 208 | node = node.parent() 209 | return node 210 | 211 | def isUnary(self): 212 | if self.length() == 1 and not self.child(0).isLeaf(): 213 | return True 214 | else: 215 | return False 216 | 217 | def ancestors(self): 218 | """ 219 | Generate parents 220 | """ 221 | node = self 222 | while not node.isRoot(): 223 | node = node.parent() 224 | yield node 225 | 226 | ## 227 | ## def borders(self, node): 228 | ## """ 229 | ## Decide whether the nodes form a contiguous span of words 230 | ## """ 231 | ## first, second = sorted((self, node)) 232 | ## lastWord = first.getWord(-1) 233 | ## firstWord = second.getWord(0) 234 | ## if firstWord.wordID == lastWord.wordID + 1: 235 | ## return True 236 | ## else: 237 | ## return False 238 | 239 | 240 | # 'Rich comparison' must be used, because I want equality tests to check 241 | # object identity, and less than/greater 242 | # than comparisons to check ID for sorting 243 | 244 | def __eq__(self, other): 245 | return bool(self is other) 246 | 247 | def __ne__(self, other): 248 | return bool(self is not other) 249 | 250 | def __cmp__(self, obj): 251 | """ 252 | The deprecated complicated (and crushingly slow) cmp is used in the SFG 253 | stuff 254 | """ 255 | # return cmp(self.globalID, obj.globalID) 256 | selfID = float(self.getWordID(0)) 257 | objID = float(obj.getWordID(0)) 258 | if selfID == -1: 259 | return 0 260 | elif objID == -1: 261 | return 0 262 | else: 263 | return cmp(selfID, objID) 264 | 265 | def __len__(self): 266 | return self.length() 267 | 268 | def __nonzero__(self): 269 | return True 270 | 271 | def __str__(self): 272 | return self.prettyPrint() 273 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_CCGSentence.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | 4 | from ._CCGNode import CCGNode 5 | from ._CCGLeaf import CCGLeaf 6 | from ._Sentence import Sentence 7 | import ccg.scat 8 | 9 | class CCGSentence(Sentence, CCGNode): 10 | def __init__(self, **kwargs): 11 | if 'string' in kwargs: 12 | node = self._parseString(kwargs.pop('string')) 13 | elif 'node' in kwargs: 14 | node = kwargs.pop('node') 15 | globalID = kwargs.pop('globalID') 16 | self.localID = kwargs.pop('localID') 17 | CCGNode.__init__(self, label="S", headIdx=0, **kwargs) 18 | self.globalID = globalID 19 | self.attachChild(node) 20 | self.headIdx = 0 21 | 22 | 23 | def sibling(self): 24 | return None 25 | 26 | def addPargDeps(self, pargDeps): 27 | headDeps = {} 28 | for pargDep in pargDeps: 29 | if len(pargDep) == 6: 30 | i, j, catJ, argK, formI, formJ = pargDep 31 | if formI == '-colon-': 32 | formI = ':' 33 | if formJ == '-colon-': 34 | formJ = ':' 35 | depType = 'L' 36 | elif len(pargDep) == 7: 37 | i, j, catJ, argK, formI, formJ, depType = pargDep 38 | else: 39 | print pargDeps 40 | raise StandardError 41 | i = int(i) 42 | j = int(j) 43 | argK = int(argK) 44 | arg = self.getWord(i) 45 | head = self.getWord(j) 46 | if arg.text != formI or head.text != formJ: 47 | if formI == 'null' or formJ == 'null': 48 | continue 49 | #else: 50 | # print >> sys.stderr, "Mismatched dependency" 51 | # return None 52 | print self.globalID 53 | print '\n'.join('%d-%s' % (w.wordID, w.text) for w in self.listWords()) 54 | print arg 55 | print head 56 | print formI 57 | print formJ 58 | print pargDep 59 | print '\n'.join([' '.join(d) for d in pargDeps]) 60 | print self 61 | raise StandardError, "Mismatched dependency" 62 | headDeps.setdefault(head, {}).setdefault(argK, []).append((arg, depType)) 63 | # Initialise dependencies, so there's a slot there for unfilled deps 64 | for word in self.listWords(): 65 | goldDeps = [] 66 | for arg in word.parg.arguments: 67 | goldDeps.append([]) 68 | word.parg.goldDeps = goldDeps 69 | for head, itsDeps in headDeps.items(): 70 | cat = head.parg 71 | for argNum, deps in itsDeps.items(): 72 | for dep in deps: 73 | try: 74 | cat.goldDeps[argNum - 1].append(dep) 75 | except IndexError: 76 | # print >> sys.stderr, "Index error" 77 | # return None 78 | print self.globalID 79 | print '\n'.join('%d-%s' % (w.wordID, w.text) for w in self.listWords()) 80 | print head 81 | print cat 82 | print cat.arguments 83 | print itsDeps 84 | print cat.goldDeps 85 | for p in pargDeps: 86 | print p 87 | cat.goldDeps[argNum - 1].append(dep) 88 | 89 | def unify_vars(self): 90 | """ 91 | Traverse the nodes in the sentence, and unify their variables 92 | so that all nodes that have unified during the derivation have 93 | the same gloval variable. 94 | 95 | The nodes must be traversed bottom-up, and node labels must be 96 | replaced by the rule-product of their children. This is done 97 | because the parent nodes' annotations are not provided, and 98 | cannot be guessed. For example, in wsj_0200.0, 99 | there is the production: 100 | (S[dcl]\NP)/(S[to]\NP) --> (((S[dcl]\NP)/(S[to]\NP))/NP NP 101 | The annotation of the parent is _not_ the same as the one 102 | in the markedup file for that category --- in 103 | "it expects that to happen", "it" and "that" must not be 104 | coindexed. 105 | """ 106 | def unifyBranch(node): 107 | """ 108 | Start at bottom left corner of the tree. Walk 109 | upwards, at each point unifying the sibling 110 | by calling this function. 111 | """ 112 | current = node.getWord(0).parent() 113 | while current is not node: 114 | sibling = current.sibling() 115 | parent = current.parent() 116 | if sibling and not sibling.child(0).isLeaf(): 117 | unifyBranch(sibling) 118 | 119 | curLab = current.label 120 | assert curLab 121 | sibLab = sibling.label if sibling else None 122 | parLab = parent.label 123 | production = ccg.rules.Production(curLab, sibLab, parLab) 124 | result = production.result 125 | if result and result.exact_eq(parLab): 126 | parent.label = result 127 | current = parent 128 | assert current.label 129 | 130 | unifyBranch(self.child(0)) 131 | # Now bind the variables to the words 132 | for word in self.listWords(): 133 | word.stag.add_head(word) 134 | # Fix conjunctions 135 | # This is truly an evil hack, but it's very difficult to get it right. 136 | # We first find conjunction nodes and get their conjuncted variables 137 | # plus the set of nodes _outside_ their subtree (note that this 138 | # "outside the subtree" part is what makes this so hard to do in 139 | # pure unification) 140 | # Once we have them, we search the nodes outside for variable sets 141 | # that contain one but not all of the conjuncts, and then restore 142 | # the missing ones. 143 | conjVarSets = [] 144 | nodeSet = set(self.depthList()) 145 | for conjNode in nodeSet: 146 | if conjNode.length() < 2: 147 | continue 148 | if not conjNode.child(1).label.conj: 149 | continue 150 | varSet = set(v.get_ref() for v in conjNode.label.get_vars()) 151 | nodesBelowConj = set(conjNode.depthList()) 152 | nodesBelowConj.add(conjNode) 153 | nodesToCheck = nodeSet - nodesBelowConj 154 | conjVarSets.append((varSet, nodesToCheck)) 155 | 156 | for conjVars, nodes in conjVarSets: 157 | for node in nodes: 158 | if node.isLeaf() or node.isRoot(): 159 | continue 160 | scat = node.label 161 | for var, varSet in scat._var_table.items(): 162 | varSet = set(v.get_ref() for v in varSet) 163 | if not varSet.intersection(conjVars): 164 | continue 165 | words = set(v.word for v in varSet if v.word) 166 | for conjVar in conjVars: 167 | if conjVar.word not in words: 168 | scat.add_var(var, conjVar) 169 | 170 | # This returns 4 groups for compatibility with the 171 | # Root.parseString method 172 | bracketsRE = re.compile(r'(\()<([^>]+)>|()(\))') 173 | def _parseString(self, text): 174 | # The algorithm here is roughly, find and build the nodes, 175 | # and keep track of the parent. Then, later, connect the nodes together 176 | # into a tree 177 | # This is very similar to Root's, but it's not worth making 178 | # both unreadable/slow to shoe-horn them together... 179 | openBrackets = [] 180 | parentage = {} 181 | nodes = {} 182 | nWords = 0 183 | for match in self.bracketsRE.finditer(text): 184 | open_, nodeData, null, close = match.groups() 185 | if open_: 186 | assert not close 187 | openBrackets.append((nodeData, match.start())) 188 | else: 189 | assert close 190 | try: 191 | nodeData, start = openBrackets.pop() 192 | except: 193 | print text 194 | raise 195 | if nodeData.startswith('L'): 196 | newNode = self._makeLeaf(nodeData, nWords) 197 | nWords += 1 198 | else: 199 | newNode = self._makeNode(nodeData) 200 | if openBrackets: 201 | parentStart = openBrackets[-1][1] 202 | parentage[newNode] = parentStart 203 | else: 204 | top = newNode 205 | nodes[start] = newNode 206 | # Can use Root's method for this bit 207 | self._connectNodes(nodes, parentage) 208 | return top 209 | 210 | def _makeNode(self, nodeData): 211 | try: 212 | T, cat, headIdx, nChildren = nodeData.split() 213 | except: 214 | print >> sys.stderr, nodeData 215 | raise 216 | return CCGNode(label=ccg.scat.SuperCat(cat), headIdx=int(headIdx)) 217 | 218 | def _makeLeaf(self, nodeData, wordID): 219 | L, cat, ccgPos, ptbPos, text, annotCat = nodeData.split() 220 | if cat.endswith('/'): 221 | cat = cat[1:-2] 222 | if '@' in cat: 223 | cat, srl_annot_str = cat.split('@') 224 | else: 225 | srl_annot_str = '' 226 | # Check whether the @ is on the annotCat instead 227 | if not srl_annot_str and '@' in annotCat: 228 | annotCat, srl_annot_str = annotCat.split('@') 229 | if cat.endswith('/'): 230 | cat = cat[1:-2] 231 | cat = ccg.scat.SuperCat(cat) 232 | for srl_triple in srl_annot_str.split('_'): 233 | if not srl_triple: 234 | continue 235 | cat.srl_annot.add(tuple(srl_triple.replace('X', '_').split("'"))) 236 | parent = CCGNode(label=cat, headIdx=0) 237 | leaf = CCGLeaf(label=ptbPos, pos=ccgPos, text=text, 238 | parg=cat, wordID=wordID) 239 | parent.attachChild(leaf) 240 | return parent 241 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_CCGNode.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | 4 | import ccg.rules 5 | import ccg.scat 6 | from ._Node import Node 7 | 8 | class CCGNode(Node): 9 | def __init__(self, **kwargs): 10 | self.headIdx = kwargs.pop('headIdx') 11 | label = kwargs.pop('label') 12 | self.srl_labels = defaultdict(list) 13 | Node.__init__(self, label) 14 | 15 | def isProduction(self, **kwargs): 16 | """ 17 | Check whether the node matches a given production, by checking 18 | its label, and some combination of the labels of its sibling, 19 | parent, and children 20 | """ 21 | # Do this first for speed 22 | selfType = kwargs.pop('selfType') 23 | assert not '^' in str(selfType) 24 | #if not ccg.isIdentical(self.label.morphLess(), selfType): 25 | # return False 26 | # NB: breaking hats here! 27 | if not ccg.isIdentical(self.label, selfType): 28 | return False 29 | for nodeType, specified in kwargs.items(): 30 | assert not '^' in str(specified) 31 | if nodeType == 'parent': 32 | node = self.parent() 33 | elif nodeType == 'sibling': 34 | node = self.sibling() 35 | elif nodeType == 'left': 36 | if self.length() < 1: 37 | return False 38 | node = self.child(0) 39 | elif nodeType == 'right': 40 | if self.length() < 2: 41 | return False 42 | node = self.child(1) 43 | else: 44 | print nodeType 45 | raise StandardError 46 | if not node: 47 | return False 48 | try: 49 | label = node.label.morphLess() 50 | except AttributeError: 51 | label = ccg.scat.SuperCat(node.label).morphLess() 52 | if str(label) != str(specified): 53 | return False 54 | return True 55 | 56 | 57 | def changeLabel(self, newLabel): 58 | """ 59 | Replace the node's category with a new one, propagating the changes 60 | as appropriate. The propagation code is controlled by Production, 61 | using logic roughly documented in my thesis. 62 | """ 63 | if self.label.exact_eq(newLabel): 64 | return None 65 | #if ccg.isIdentical(newLabel, 'NP[nb]/N'): 66 | # newLabel = ccg.category.from_string('NP/N') 67 | c0 = self.child(0) 68 | if c0.isLeaf(): 69 | newLabel._var_table[0] = self.label._var_table[0] 70 | self.label = newLabel 71 | c0.changeLabel(newLabel) 72 | return None 73 | if self.isUnary(): 74 | # Don't produce unary rules like N --> S/S 75 | # Instead, create a new NP node and place it under self 76 | # This will make a S/S --> NP --> N chain 77 | # The exception is when we're adding arguments to NP. 78 | # Then what we want to do is add the arguments to N 79 | if newLabel.innerResult() == 'NP' and not newLabel.isAdjunct(): 80 | if not newLabel.isComplex(): 81 | nLabel = ccg.scat.SuperCat('N') 82 | else: 83 | args = [(a, s, {'hat': h}) for (r, a, s, h) in 84 | newLabel.deconstruct()] 85 | nLabel = ccg.scat.add_args(ccg.scat.SuperCat('N'), args) 86 | self.child(0).changeLabel(nLabel) 87 | newLabel._var_table[0] = self.label._var_table[0] 88 | self.label = newLabel 89 | return None 90 | if self.length() == 2: 91 | c1 = self.child(1) 92 | production = ccg.rules.Production(c0.label, c1.label, self.label) 93 | production.replace(newLabel) 94 | if not production.left.exact_eq(c0.label): 95 | c0.changeLabel(production.left) 96 | if c1 and not production.right.exact_eq(c1.label): 97 | c1.changeLabel(production.right) 98 | newLabel._var_table[0] = self.label._var_table[0] 99 | self.label = newLabel 100 | 101 | def sibling(self): 102 | """ 103 | If there is a sibling, return it, else return None 104 | """ 105 | for child in self.parent().children(): 106 | if child is not self: 107 | return child 108 | return None 109 | 110 | def validate(self): 111 | """ 112 | Check whether subtree composes 113 | 114 | Currently broken 115 | """ 116 | for child in self.children(): 117 | if not child.validate(): 118 | return False 119 | if child.isLeaf(): 120 | return True 121 | if self.isRoot(): 122 | return True 123 | label = self.label 124 | left = self.child(0).label 125 | if self.length() == 1: 126 | right = None 127 | else: 128 | right = self.child(1).label 129 | if ccg.validate(left, right, label): 130 | return True 131 | else: 132 | w1 = self.getWord(0).globalID 133 | return False 134 | 135 | def head(self): 136 | """ 137 | Return the leaf node that the CCGbank indices designate as the head 138 | Warning: These indices are sometimes unreliable, so this function 139 | may give incorrect results. 140 | 141 | Warning++!! Be especially careful of bugs introduced during rebanking, 142 | where the head indices have not been updated appropriately during node 143 | movement. 144 | """ 145 | head = self 146 | while not head.isLeaf(): 147 | if head.headIdx >= head.length(): 148 | #print >> sys.stderr, "Bad head idx: %s, %d" % (head, head.headIdx) 149 | head = head.child(-1) 150 | else: 151 | head = head.child(head.headIdx) 152 | return head 153 | 154 | def heads(self): 155 | if self.headIdx >= self.length(): 156 | print >> sys.stderr, "Bad head idx: %s" % self 157 | head = self.child(-1) 158 | else: 159 | head = self.child(self.headIdx) 160 | heads = [] 161 | if head.sibling() and head.sibling().label.conj: 162 | heads.extend(head.sibling().heads()) 163 | heads.extend(head.heads()) 164 | heads.sort() 165 | return heads 166 | 167 | 168 | def move(self, destination, headIdx): 169 | """ 170 | ccg trees are binary branching, so moving a node means inserting a 171 | new level in the tree and deleting a level at the old destination. 172 | This function is not responsible for ensuring valid labels, but does 173 | check whether moves would cause crossing brackets, and checks whether 174 | any words are stranded. Requires an index noting head directionality, 175 | so that the head() function does not break. 176 | """ 177 | if destination is self.sibling(): 178 | raise StandardError, "Moving to current location!" 179 | if destination.isLeaf(): 180 | raise StandardError, "Cannot move to leaf!" 181 | # Store the word list so that we can check it isn't disrupted 182 | origWords = ' '.join([w.text for w in self.root().listWords()]) 183 | # Check for crossing brackets 184 | firstNode, lastNode = sorted([self, destination]) 185 | lastYield = lastNode.listWords() 186 | edgeWords = [w for w in firstNode.listWords() if w not in lastYield] 187 | if not edgeWords: 188 | print firstNode 189 | print lastNode 190 | raise StandardError 191 | rightEdge = edgeWords[-1] 192 | leftEdge = lastYield[0] 193 | if rightEdge.wordID != (leftEdge.wordID - 1): 194 | raise StandardError, "Move would create non-contiguous word seq" 195 | # The actual move operation 196 | labelCopy = ccg.scat.SuperCat(destination.label) 197 | newParent = CCGNode(label=labelCopy, headIdx=headIdx) 198 | destination.insert(newParent) 199 | # Trim production by deleting sibling, moving its children up to parent 200 | oldParent = self.parent() 201 | oldSibling = self.sibling() 202 | oldSibling.prune() 203 | self.reattach(newParent) 204 | for node in oldSibling.children(): 205 | node.reattach(oldParent) 206 | # Post-validation 207 | # Parent should have same head idx as before, as it has same children 208 | oldParent.headIdx = oldSibling.headIdx 209 | if not newParent.listWords(): 210 | print self 211 | print destination 212 | raise StandardError 213 | newWords = ' '.join([w.text for w in self.root().listWords()]) 214 | if origWords != newWords: 215 | print origWords 216 | print newWords 217 | raise StandardError 218 | return newParent 219 | 220 | 221 | def typeRaise(self, tCat, slash): 222 | """ 223 | Add a type-raise node 224 | """ 225 | assert not tCat.conj 226 | assert not self.label.conj 227 | innerSlash = '\\' if slash == '/' else '/' 228 | newCat = ccg.scat.type_raise(tCat, slash, self.label) 229 | newNode = CCGNode(headIdx=0, label=newCat) 230 | self.insert(newNode) 231 | 232 | def isEntity(self, typeRequested=None): 233 | """ 234 | Check whether the node spans an entity 235 | """ 236 | words = self.listWords() 237 | if not words[0].entity.startswith('B'): 238 | return False 239 | typeSeen = words[0].entity.split('-')[1] 240 | if typeRequested and not typeSeen.startswith(typeRequested): 241 | return False 242 | matchTag = 'I-%s' % typeRequested 243 | for w in words[1:]: 244 | if not w.entity.startswith(matchTag): 245 | return False 246 | nextWord = words[-1].nextWord() 247 | if nextWord and nextWord.entity.startswith(matchTag): 248 | # Don't allow sentence-final periods to be entities 249 | if nextWord.text == '.' and not nextWord.nextWord(): 250 | nextWord.entity = '' 251 | else: 252 | return False 253 | return True 254 | ## def finalise(self): 255 | ## """ 256 | ## Once the changes to the tree are complete, it is worth building final word 257 | ## lists etc, and then telling methods to use them instead 258 | ## """ 259 | ## self._wordList = self.listWords() 260 | ## self._siblings = [s for s in self.parent().children() if s != self] 261 | ## self._breadthList = self.breadthList() 262 | ## self._depthList = self.depthList() 263 | ## self._head = self.head() 264 | ## self._finalised = True 265 | 266 | ## def addCatHeads(self): 267 | ## return StandardError, "Currently Broken!" 268 | ## # Find the highest left-side node with a head 269 | ## left = self._findNode() 270 | ## # Ensure that the node to the right of it has a head 271 | ## left, right, parent = self._prepareJunction(left) 272 | ## # Add the head 273 | ## self.addCatHead(parent, left, right) 274 | 275 | ## def _findNode(self): 276 | ## n = self 277 | ## while not n.label.hasHead(): 278 | ## n = n.child(0) 279 | ## if n.isLeaf(): 280 | ## n.parg.addHead(n.text) 281 | ## n.parent().label.unify(n.parg) 282 | ## return n.parent() 283 | ## return n 284 | ## 285 | ## def _prepareJunction(self, node): 286 | ## while node.parent().length() == 1: 287 | ## parent = node.parent() 288 | ## ccg.combineChildren(parent.label, node.label, None) 289 | ## if parent.isRoot(): 290 | ## return None, None 291 | ## node = parent 292 | ## left, right = node.parent().children() 293 | ## if not left.label.hasHead(): 294 | ## left.addCatHeads() 295 | ## if not right.label.hasHead(): 296 | ## right.addCatHeads() 297 | ## return left, right, node 298 | 299 | ## def addCatHead(self, left, right, parent): 300 | ## assert left.label.hasHead() 301 | ## assert right.label.hasHead() 302 | ## ccg.combineChildren(parent.label, left.label, right.label) 303 | -------------------------------------------------------------------------------- /tests/test_rules.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os.path 3 | 4 | from ccg import rules 5 | import ccg.scat 6 | import ccg.category 7 | import ccg.lexicon 8 | import Treebank.CCGbank 9 | 10 | ccg.lexicon.load() 11 | 12 | #class TestCatPaths(unittest.TestCase): 13 | # def test_tv(self): 14 | # cat = ccg.scat.SuperCat('((S[dcl]{_}\NP{Y}){_}/NP{Y}){_}') 15 | # self.assertEqual(cat.cats[(0, 0)], 'S[dcl]') 16 | # self.assertEqual(cat.cats[(1, )], 'NP') 17 | # self.assertEqual(cat.cats[(0, 1)], 'NP') 18 | # self.assertEqual(cat.active_features[(0, 0)], 'S[dcl]') 19 | # self.assertEqual(cat.cats_by_var[1], ['NP', 'NP']) 20 | # self.assertEqual([str(c) for c in cat.cats_by_var[0]], 21 | # ['(S[dcl]\NP)/NP', 'S[dcl]\NP', 'S[dcl]']) 22 | 23 | # def test_adv(self): 24 | # cat = ccg.scat.SuperCat('((S[X]{Y}\NP{Z}){Y}/(S[X]{Y}\NP{Z}){Y}){_}') 25 | # self.assertEqual(cat.cats[(0, 0)], 'S') 26 | # self.assertEqual(cat.cats[(1, 1)], 'NP') 27 | # self.assertEqual( 28 | # [str(c) for c in cat.cats_by_var[1]], 29 | # ['S\NP', 'S', 'S\NP', 'S']) 30 | class MockToken(str): 31 | def __init__(self, text): 32 | self.text = text 33 | str.__init__(self, text) 34 | 35 | 36 | 37 | class TestRules(unittest.TestCase): 38 | def test_fapply_basic(self): 39 | cat1 = ccg.scat.SuperCat(r'(NP{Y}/N{Y}){_}') 40 | cat2 = ccg.scat.SuperCat(r'N{_}') 41 | self.do_rule(rules.fapply, cat1, cat2, 'NP{_}') 42 | 43 | def test_fapply_feature(self): 44 | cat1 = ccg.scat.SuperCat('S/S') 45 | cat2 = ccg.scat.SuperCat('S[dcl]') 46 | self.do_rule(rules.fapply, cat1, cat2, 'S[dcl]{_}') 47 | self.do_rule(rules.fapply, 48 | ccg.scat.SuperCat('(S\NP)/(S\NP)'), 49 | ccg.scat.SuperCat(r'(S[pss]{_}\NP{Y}){_}'), 50 | '(S[pss]{_}\NP{Y}){_}') 51 | 52 | def test_bapply_basic(self): 53 | cat1 = ccg.scat.SuperCat(r'NP{_}') 54 | cat2 = ccg.scat.SuperCat(r'(S[dcl]{_}\NP{Y}){_}') 55 | self.do_rule(rules.bapply, cat1, cat2, 'S[dcl]{_}') 56 | 57 | def test_bapply_feature(self): 58 | cat1 = ccg.scat.SuperCat(r'S[ng]{_}') 59 | cat2 = ccg.scat.SuperCat(r'(S[X]{Y}\S[X]{Y}){_}') 60 | self.do_rule(rules.bapply, cat1, cat2, 'S[ng]{_}') 61 | 62 | def test_fcomp_basic(self): 63 | cat1 = ccg.scat.SuperCat(r'(NP{Y}/N{Y}){_}') 64 | cat2 = ccg.scat.SuperCat(r'(N{_}/PP{Y}){_}') 65 | self.do_rule(rules.fcomp, cat1, cat2, '(NP{_}/PP{Y}){_}') 66 | 67 | def test_fcomp_feature(self): 68 | cat1 = ccg.scat.SuperCat(r'(S[X]{Y}/S[X]{Y}){_}') 69 | cat2 = ccg.scat.SuperCat(r'(S[dcl]{_}/S[dcl]{Y}){_}') 70 | self.do_rule(rules.fcomp, cat1, cat2, '(S[dcl]{_}/S[dcl]{Y}){_}') 71 | cat1 = ccg.scat.SuperCat(r'((S[X]{Y}\NP{Z}){Y}/(S[X]{Y}\NP{Z}){Y}){_}') 72 | cat2 = ccg.scat.SuperCat(r'((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}') 73 | self.do_rule(rules.fcomp, cat1, cat2, 74 | '((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}') 75 | 76 | def test_bcomp_basic(self): 77 | c1 = ccg.scat.SuperCat(r'(NP{Y}\NP{Y}){_}') 78 | c2 = ccg.scat.SuperCat(r'(S[dcl]{_}\NP{Y}){_}') 79 | self.do_rule(rules.bcomp, c1, c2, '(S[dcl]{Y}\NP{Z}){_}') 80 | 81 | def test_bcomp_feature(self): 82 | c1 = ccg.scat.SuperCat(r'(S[dcl]{_}\NP{Y}){_}') 83 | c2 = ccg.scat.SuperCat(r'(S[X]{Y}\S[X]{Y}){_}') 84 | self.do_rule(rules.bcomp, c1, c2, '(S[dcl]{_}\NP{Y}){_}') 85 | 86 | def test_bxcomp_basic(self): 87 | c1 = ccg.scat.SuperCat(r'((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}') 88 | c2 = ccg.scat.SuperCat(r'((S[X]{Y}\NP{Z}){Y}\(S[X]{Y}\NP{Z}){Y}){_}') 89 | self.do_rule(rules.bxcomp, c1, c2, c1.annotated) 90 | 91 | def test_gfcomp_basic(self): 92 | c1 = ccg.scat.SuperCat(r'(PP{_}/S[em]{Y}){_}') 93 | c2 = ccg.scat.SuperCat(r'((S[em]{_}/NP{Y}){_}/Q{Z}){_}') 94 | self.do_rule(rules.fcomp, c1, c2, '((PP{Y}/NP{Z}){_}/Q{W}){_}') 95 | 96 | def test_gfcomp_feature(self): 97 | c1 = ccg.scat.SuperCat(r'S/S') 98 | c2 = ccg.scat.SuperCat(r'(S[dcl]\NP)/NP') 99 | self.assertFalse(rules.fcomp(c1, c2)) 100 | self.do_rule(rules.fxcomp, c1, c2, '((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}') 101 | 102 | def test_gbxcomp_cross_slash(self): 103 | c1 = ccg.scat.SuperCat(r'(S[dcl]\NP)/NP') 104 | c2 = ccg.scat.SuperCat(r'S\S') 105 | self.assertFalse(rules.bcomp(c1, c2)) 106 | self.do_rule(rules.bxcomp, c1, c2, '((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}') 107 | 108 | def test_gbxcomp_basic(self): 109 | c1 = ccg.scat.SuperCat(r'((NP{_}/PP{Y}){_}/S[em]{Z}){_}') 110 | c2 = ccg.scat.SuperCat(r'S[dcl]\NP') 111 | self.do_rule(rules.bxcomp, c1, c2, '((S[dcl]{Y}/PP{Z}){_}/S[em]{W}){_}') 112 | 113 | def test_bgxcomp_feature(self): 114 | c1 = ccg.scat.SuperCat('((S[dcl]\NP)/NP)/NP') 115 | c2 = ccg.scat.SuperCat('(S\NP)\(S\NP)') 116 | self.do_rule(rules.bxcomp, c1, c2, r'(((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}/NP{W}){_}') 117 | c1 = ccg.scat.SuperCat('((S[dcl]\NP[expl])/(NP\NP))/NP') 118 | c2 = ccg.scat.SuperCat('(S\NP)\(S\NP)') 119 | self.do_rule(rules.bxcomp, c1, c2, 120 | r'(((S[dcl]{_}\NP[expl]{Y}){_}/(NP{Z}\NP{Z}){W}){_}/NP{V}){_}') 121 | c1 = ccg.scat.SuperCat('((S[dcl]\NP[expl])/S[for])/(S[adj]\NP)') 122 | c2 = ccg.scat.SuperCat('(S\NP)\((S\NP)/S[for])') 123 | self.do_rule(rules.bxcomp, c1, c2, '((S[dcl]{_}\NP[expl]{Y}){_}/(S[adj]{Z}\NP{W}){Z}){_}') 124 | c1 = ccg.scat.SuperCat('(S[qem]/S[dcl])\((NP\NP)/NP)') 125 | c2 = ccg.scat.SuperCat('S\S') 126 | self.do_rule(rules.bxcomp, c1, c2, '((S[qem]{_}/S[dcl]{Y}){_}\\((NP{Z}\\NP{Z}){W}/NP{V}){U}){_}') 127 | 128 | def test_add_conj(self): 129 | c1 = ccg.scat.SuperCat('conj') 130 | c2 = ccg.scat.SuperCat('S[dcl]{_}^(S[X]{Y}\S[X]{Y}){_}') 131 | result = rules.add_conj(c1, c2) 132 | self.assertEqual(result.string, 'S[dcl]^(S\S)[conj]') 133 | c1 = ccg.scat.SuperCat('conj') 134 | c2 = ccg.scat.SuperCat('S[dcl]\NP') 135 | result = rules.add_conj(c1, c2) 136 | self.assertEqual(result.annotated, 137 | '((S[dcl]{Y}\NP{Z}<1>){_}\(S[dcl]{W}\NP{Z}<1>){W}){_}') 138 | c1 = ccg.scat.SuperCat('conj') 139 | c2 = ccg.scat.SuperCat('S[pss]\NP') 140 | result = rules.add_conj(c1, c2) 141 | 142 | def test_add_conj_head(self): 143 | c1 = ccg.scat.SuperCat('conj') 144 | c1.add_head(MockToken('and')) 145 | c2 = ccg.scat.SuperCat('NP') 146 | c2_head = MockToken('thing') 147 | c2.add_head(c2_head) 148 | result = rules.add_conj(c1, c2) 149 | self.assertTrue(result.has_head(c2_head)) 150 | 151 | def test_do_conj(self): 152 | c1 = ccg.scat.SuperCat('S[X]\NP') 153 | c2 = ccg.scat.SuperCat('S[dcl]\NP[conj]') 154 | self.assertEqual(c2.annotated, 155 | '((S[dcl]{Y}\NP{Z}<1>){_}\(S[dcl]{W}\NP{Z}<1>){W}){_}') 156 | self.assertEqual(c2.annotated, 157 | '((S[dcl]{Y}\NP{Z}<1>){_}\(S[dcl]{W}\NP{Z}<1>){W}){_}') 158 | self.assertFalse(rules.do_conj(c1, c2)) 159 | c1 = ccg.scat.SuperCat('S[dcl]\NP') 160 | c1_head = MockToken('plays') 161 | c1.add_head(c1_head) 162 | c2_head = MockToken('is') 163 | c2.add_head(c2_head) 164 | result = rules.do_conj(c1, c2) 165 | self.assertEqual(result, 'S[dcl]\NP') 166 | self.assertTrue(result.has_head(c1_head)) 167 | self.assertTrue(result.has_head(c2_head)) 168 | 169 | def test_comma_conj(self): 170 | c1 = ccg.scat.SuperCat(':') 171 | c2 = ccg.scat.SuperCat('NP') 172 | self.assertEqual(rules.comma_conj(c1, c2).string, 'NP[conj]') 173 | 174 | def test_fcomp_tr(self): 175 | c1 = ccg.scat.SuperCat('(S[X]{Y}/(S[X]{Y}\NP{_}){Y}){_}') 176 | c2 = ccg.scat.SuperCat('(S[dcl]\NP)/NP') 177 | result = rules.fcomp(c1, c2) 178 | self.assertEqual(result, 'S[dcl]/NP') 179 | 180 | def test_feature(self): 181 | c1 = ccg.scat.SuperCat('(N{_}/N[num]){_}') 182 | c2 = ccg.scat.SuperCat('N[num]') 183 | self.do_rule(rules.fapply, c1, c2, 'N{_}') 184 | 185 | 186 | def do_rule(self, rule, cat1, cat2, expected): 187 | cat1str = cat1.string 188 | cat2str = cat2.string 189 | result = rule(cat1, cat2) 190 | self.assertEqual(result, expected) 191 | self.assertEqual(str(cat1), cat1str) 192 | self.assertEqual(str(cat2), cat2str) 193 | self.assertEqual(result.annotated, expected) 194 | 195 | 196 | #def test_traise(self): 197 | # c1 = ccg.scat.SuperCat('NP') 198 | # par = ccg.scat.SuperCat('Q/(Q\NP)') 199 | # result = rules.traise(c1, par) 200 | # self.assertEqual(result, 'Q/(Q\NP)') 201 | # self.assertEqual(result.annotated, '(Q{Y}/(Q{Y}\NP{_}){Y}){_}') 202 | 203 | 204 | def test_minimise(self): 205 | cat = ccg.category.from_string('((S[dcl]{Y}\NP{Z}){Y}/NP{W}){Y}') 206 | min, var_map = rules.minimise_vars(cat, {}) 207 | self.assertEqual(min.annotated, '((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}') 208 | cat = ccg.category.from_string('(S[dcl]{Y}\NP{Z}){Y}') 209 | min, var_map = rules.minimise_vars(cat, {}) 210 | self.assertEqual(min.annotated, '(S[dcl]{_}\NP{Y}){_}') 211 | 212 | def test_badjunct_global(self): 213 | c1 = ccg.scat.SuperCat('S[pss]\NP') 214 | c2 = ccg.scat.SuperCat('(S\NP)\(S\NP)') 215 | parent = ccg.scat.SuperCat('S[pss]\NP') 216 | production = ccg.rules.Production(c1, c2, parent) 217 | 218 | def test_parent_annotation(self): 219 | c1 = ccg.scat.SuperCat('((S[dcl]\NP)/(S[to]\NP))/NP') 220 | c2 = ccg.scat.SuperCat('NP') 221 | parent = ccg.scat.SuperCat('(S[dcl]\NP)/(S[to]\NP)') 222 | production = ccg.rules.Production(c1, c2, parent) 223 | self.assertEqual(production.result.annotated, 224 | '((S[dcl]{_}\NP[Y]{Y}<1>){_}/(S[to]{Z}<2>\NP[Y]{W*}){Z}){_}') 225 | 226 | class TestTrees(unittest.TestCase): 227 | def test_conj_plays(self): 228 | elianti = ('( ( ( ( ' 229 | '() () ' 230 | ') ) ( ( ' 231 | '() ' 232 | '( () ' 233 | '() ) ) ' 234 | '( () ) ) )' 235 | '())') 236 | import Treebank.CCGbank 237 | sentence = Treebank.CCGbank.CCGSentence(string=elianti, globalID=0, 238 | localID=0) 239 | sentence.unify_vars() 240 | plays_and_is = sentence.getWord(2).parent().parent() 241 | annotated = plays_and_is.label.global_annotated() 242 | self.assertEqual(annotated, 243 | '((S[dcl]{plays,is}\NP{Haag}<1>){plays,is}/NP{Elianti}<2>){plays,is}') 244 | 245 | def test_conj_elianti(self): 246 | elianti = ('( ( ( ( ' 247 | '() () ' 248 | ') ) ( () ( ( ' 250 | '() ( ' 251 | '() () ) )' 252 | ') ) ) () )') 253 | sentence = Treebank.CCGbank.CCGSentence(string=elianti, globalID=0, 254 | localID=0) 255 | sentence.unify_vars() 256 | elianti_and_celamene = sentence.getWord(-2).parent().parent().parent() 257 | annotated = elianti_and_celamene.label.global_annotated() 258 | self.assertEqual(annotated, 'N{Elianti,Celamene}') 259 | 260 | def test_fcomp_sadj(self): 261 | c1 = ccg.scat.SuperCat(r'(S[dcl]\NP)/(S[adj]\NP)') 262 | c1_head = MockToken('is') 263 | c1.add_head(c1_head) 264 | c2 = ccg.scat.SuperCat(r'(S[adj]\NP)/NP') 265 | c2_head = MockToken('worth') 266 | c2.add_head(c2_head) 267 | production = ccg.rules.Production(c1, c2) 268 | assert production.result.has_head(c2_head) 269 | 270 | def test_lex_vars_stay(self): 271 | ccgbank_loc = '/usr/local/data/CCGbank1.2' 272 | ccgbank = Treebank.CCGbank.CCGbank(path=ccgbank_loc) 273 | ccg.lexicon.load(os.path.join(ccgbank_loc, 'markedup')) 274 | asbestos = ccgbank.child(2).child(0) 275 | asbestos.unify_vars() 276 | for word in asbestos.listWords(): 277 | self.assertTrue(word.stag.has_head(word)) 278 | 279 | if __name__ == '__main__': 280 | unittest.main() 281 | -------------------------------------------------------------------------------- /ccg/category.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import defaultdict 3 | 4 | import ccg.lexicon 5 | 6 | VARS = ['_', 'Y', 'Z', 'W', 'V', 'U', 'T', 'S', 'R', 'P', 'Q', 'O'] 7 | _FEATS = ['[dcl]', '[b]', '[pss]', '[ng]', '[pt]'] 8 | _ATOMIC_RE = re.compile(r'([a-zA-Z,\.;:]+)(\[[^\]]+\])?(\[conj\])?') 9 | _AUX_RE = re.compile(r'\(S\[(\w+)\]\\NP\)/\(S\[(\w+)\]\\NP\)') 10 | _PRED_RE = re.compile(r'\(*S\[(b|dcl|ng|pss|pt|to)\]') 11 | _PUNCT = set([',', ':', ';', '.', "LQU", "RQU", "--", 'RRB', 'LRB']) 12 | 13 | 14 | class Category(object): 15 | def __init__(self, result, slash='', argument=None, **kwargs): 16 | import ccg.scat 17 | if isinstance(result, ccg.scat.SuperCat): 18 | result = result.category 19 | if isinstance(argument, ccg.scat.SuperCat): 20 | argument = argument.category 21 | self.slash = slash 22 | self.argument = argument 23 | self.is_complex = bool(self.slash) 24 | if self.is_complex: 25 | self.result = result 26 | else: 27 | self.result = self 28 | self._cat = result if isinstance(result, str) else result.cat 29 | self.kwargs = kwargs 30 | self.hat = kwargs.get('hat') 31 | self.conj = kwargs.get('conj', False) 32 | self.var = kwargs.get('var', 0) 33 | self.var2 = kwargs.get('var2', -1) 34 | self.asterisk = kwargs.get('asterisk', False) 35 | self.feat_var = kwargs.get('feat_var') 36 | self.feature = kwargs.get('feature', '') 37 | self.arg_idx = kwargs.get('arg_idx') 38 | 39 | if self.is_complex: 40 | str_get = self._complex_strings 41 | cat_get = self._complex_cats 42 | else: 43 | str_get = self._atomic_strings 44 | cat_get = self._atomic_cats 45 | self.cats, self.cats_by_var, self.active_features = cat_get() 46 | self.next_var = max(self.cats_by_var) + 1 47 | self.cat, self.string, self.annotated = str_get() 48 | if not '^' in self.string: 49 | self.hatless = self.string 50 | else: 51 | self.hatless = None 52 | 53 | # Higher-order attributes. Could be properties, but I'm assuming 54 | # categories are immutable, and this should be more efficient 55 | self.str_as_piece = '(%s)' % self if (self.is_complex and not 56 | self.hat) else self.string 57 | # Result leaf is at (0, 0, ...) with the longest path 58 | self.inner_result = max((p, c) for p, c in self.cats.items() 59 | if not any(p))[1] 60 | self.is_predicate = bool(_PRED_RE.match(self.string)) 61 | self.is_adjunct = (self.result.exact_eq(self.argument) 62 | and self.result.var == self.argument.var 63 | and all(c for (p, c) in self.result.cats.items() 64 | if self.argument.cats[p].var == c.var)) 65 | self.has_adjunct = any(r[0].is_adjunct for r in self.deconstruct()) 66 | self.is_aux = bool(_AUX_RE.match(self.string)) 67 | self.is_true_aux = self.is_aux and self.inner_result.feature in _FEATS 68 | self.is_punct = (not self.is_complex and self.string in _PUNCT) 69 | self.is_type_raise = (self.is_complex 70 | and self.argument.is_complex 71 | and self.slash != self.argument.slash 72 | and self.result.exact_eq(self.argument.result)) 73 | self.forward = bool(self.is_complex and self.slash == '/') 74 | self.backward = bool(self.is_complex and self.slash == '\\') 75 | 76 | def __eq__(self, other): 77 | """ 78 | Check whether the featureless version of the 79 | other category matches self. Note that this means 80 | equality is not commutative 81 | """ 82 | if self is other: 83 | return True 84 | if isinstance(other, str): 85 | other = from_string(other) 86 | if self.is_complex != other.is_complex: 87 | return False 88 | # Fail on feature or hat if it's there and doesnt match 89 | if self.feature and other.feature and self.feature != other.feature: 90 | return False 91 | if self.hat and other.hat and self.hat != other.hat: 92 | return False 93 | if self.slash != other.slash: 94 | return False 95 | s_cats = self.cats 96 | o_cats = other.cats 97 | if len(s_cats.keys()) != len(o_cats.keys()): 98 | return False 99 | for path, s_cat in s_cats.items(): 100 | if path not in o_cats: 101 | return False 102 | if s_cat.is_complex: 103 | continue 104 | o_cat = o_cats[path] 105 | if s_cat.cat != o_cat.cat: 106 | return False 107 | if (s_cat.feature and o_cat.feature 108 | and s_cat.feature != o_cat.feature): 109 | return False 110 | if s_cat.hat and o_cat.hat and s_cat.hat != o_cat.hat: 111 | return False 112 | return True 113 | 114 | def __ne__(self, other): 115 | """ 116 | Apparently != doesn't call __eq__. Boo, hiss. 117 | """ 118 | if not self == other: 119 | return True 120 | else: 121 | return False 122 | 123 | def __str__(self): 124 | return self.string 125 | 126 | def __hash__(self): 127 | return hash(str(self)) 128 | 129 | def __repr__(self): 130 | return str(self) 131 | 132 | def __setattr__(self, attr, value): 133 | """ 134 | Make Categories immutable by ensuring values 135 | that have been set can never be over-written 136 | """ 137 | if attr in self.__dict__: 138 | raise AttributeError(attr) 139 | else: 140 | self.__dict__[attr] = value 141 | 142 | non_s_feat_re = re.compile(r'(?= 0 else '' 236 | arg_idx = '<%s>' % self.arg_idx if self.arg_idx else '' 237 | var_str = '{%s%s%s}%s' % (VARS[self.var], var2, asterisk, arg_idx) 238 | annot_pieces = [self._cat, feat_annot, hat_annot, var_str] 239 | 240 | if self.conj: 241 | pieces.append('[conj]') 242 | annot_cat = '%s{Y}' % ''.join(annot_pieces[:-1]) 243 | annotated = '(%s\%s){%s}' % (annot_cat, annot_cat, VARS[self.var]) 244 | else: 245 | annotated = ''.join(annot_pieces) 246 | return self._cat, ''.join(pieces), annotated 247 | 248 | def _complex_strings(self): 249 | res_str = self.result.str_as_piece 250 | arg_str = self.argument.str_as_piece 251 | cat = '%s%s%s' % (res_str, self.slash, arg_str) 252 | if self.hat: 253 | cat = '(%s)^%s' % (cat, self.hat.str_as_piece) 254 | 255 | res_annot = self.result.annotated 256 | arg_annot = self.argument.annotated 257 | asterisk = '*' if self.asterisk else '' 258 | arg_idx = '<%s>' % self.arg_idx if self.arg_idx else '' 259 | var_annot = '{%s%s}%s' % (VARS[self.var], asterisk, arg_idx) 260 | hat_annot = '^%s' % self.hat.annotated if self.hat else '' 261 | annot_cat = '(%s%s%s)%s%s' % (res_annot, self.slash, arg_annot, 262 | var_annot, hat_annot) 263 | 264 | # All this effort to get the correct annotation for conj 265 | # categories, when it (probably?) doesn't matter... 266 | if self.conj: 267 | non_conj = from_string(cat) 268 | var_map = dict((v, v+1) for v in non_conj.cats_by_var) 269 | 270 | result = ccg.rules.remap_vars(non_conj, var_map) 271 | result_str = result.annotated[:-3] + '{_}' 272 | var_map = dict((v, v) for v in result.cats_by_var) 273 | var_map[1] = max(var_map.keys()) + 1 274 | arg = ccg.rules.remap_vars(result, var_map).annotated 275 | annot_cat = '(%s\%s){_}' % (result_str, arg) 276 | string = '%s[conj]' % cat 277 | else: 278 | string = cat 279 | return cat, string, annot_cat 280 | 281 | 282 | def _atomic_cats(self): 283 | cats = {(): self} 284 | cats_by_var = {self.var: [self]} 285 | if self.var2 >= 0: 286 | cats_by_var[self.var2] = [self] 287 | active_features = {(): self} if self.feature else {} 288 | return cats, cats_by_var, active_features 289 | 290 | def _complex_cats(self): 291 | # Get list of all cats in tree and their position 292 | cats = {tuple(): self} 293 | active_features = {} 294 | cats_by_var = defaultdict(list) 295 | cats_by_var[self.var].append(self) 296 | if self.var2 >= 0 and self.var2 != self.var: 297 | cats_by_var[self.var2].append(self) 298 | for piece, path_prefix in ((self.result, 0), (self.argument, 1)): 299 | for path, cat in piece.cats.items(): 300 | cats[(path_prefix,) + path] = cat 301 | for var, cat_list in piece.cats_by_var.items(): 302 | cats_by_var[var].extend(cat_list) 303 | for path, cat in piece.active_features.items(): 304 | active_features[(path_prefix,) + path] = cat 305 | return cats, cats_by_var, active_features 306 | 307 | 308 | var_re = re.compile(r'\{(\w)(?:,(\w))?(\*)?\}$') 309 | def from_string(cat_str, **kwargs): 310 | global VARS 311 | assert cat_str 312 | assert cat_str.count('(') == cat_str.count(')') 313 | cat_str = cat_str.replace('[nb]', '') 314 | if not kwargs and cat_str in ccg.lexicon.CATS: 315 | return ccg.lexicon.CATS[cat_str] 316 | # Add a kwarg to stop subpieces being looked up in CATS 317 | kwargs['top'] = False 318 | if cat_str.endswith('>'): 319 | kwargs['arg_idx'] = cat_str[-2] 320 | cat_str = cat_str[:-3] 321 | 322 | if cat_str.endswith('[conj]'): 323 | kwargs['conj'] = True 324 | cat_str = cat_str[:-6] 325 | if cat_str in ccg.lexicon.CATS: 326 | annotated = ccg.lexicon.CATS[cat_str].annotated 327 | return from_string(annotated, **kwargs) 328 | elif 'conj' not in kwargs: 329 | kwargs['conj'] = False 330 | 331 | # Handle top-level hat 332 | hat_idx = cat_str.find('^') 333 | if hat_idx != -1 and cat_str.endswith('{_}'): 334 | assert 'hat' not in kwargs 335 | base_str = cat_str[:hat_idx] 336 | if base_str.count('(') == base_str.count(')'): 337 | kwargs['hat'] = from_string(cat_str[hat_idx + 1:]) 338 | return from_string(base_str, **kwargs) 339 | 340 | var_match = var_re.search(cat_str) 341 | if var_match is not None: 342 | var = var_match.group(1) 343 | var2 = var_match.group(2) 344 | kwargs['asterisk'] = var_match.group(3) 345 | kwargs['var'] = VARS.index(var) 346 | if var2: 347 | kwargs['var2'] = VARS.index(var2) 348 | cat_str = _strip_brackets(cat_str[:var_match.start()]) 349 | 350 | if '/' not in cat_str and '\\' not in cat_str: 351 | category = _parse_atomic(cat_str, kwargs) 352 | else: 353 | category = _parse_complex(cat_str, kwargs) 354 | 355 | #if not kwargs and '{' in cat_str: 356 | # print cat_str 357 | # lexicon.CATS[cat_str] = category 358 | # lexicon.CATS[category.string] = category 359 | return category 360 | 361 | def _parse_atomic(cat_str, kwargs): 362 | if '^' in cat_str: 363 | cat_str, hat_str = cat_str.split('^', 1) 364 | kwargs['hat'] = from_string(hat_str) 365 | assert cat_str 366 | match = _ATOMIC_RE.match(cat_str) 367 | if match is None: 368 | raise StandardError(cat_str) 369 | atom, feature, conj = match.groups() 370 | if feature: 371 | if feature[1].isupper(): 372 | kwargs['feat_var'] = feature 373 | else: 374 | kwargs['feature'] = feature 375 | return Category(atom, **kwargs) 376 | 377 | 378 | def _parse_complex(cat_str, kwargs): 379 | depth = 0 380 | slashes = set(('/', '\\')) 381 | hats = [] 382 | if not cat_str.count('(') == cat_str.count(')'): 383 | raise StandardError(cat_str) 384 | for i, c in enumerate(cat_str): 385 | if c == '(': 386 | depth += 1 387 | elif c == ')': 388 | depth -= 1 389 | elif depth == 0: 390 | if c in slashes: 391 | hats = [] 392 | result = from_string(_strip_brackets(cat_str[:i])) 393 | slash = cat_str[i] 394 | argument = from_string(_strip_brackets(cat_str[i + 1:])) 395 | return Category(result, slash, argument, **kwargs) 396 | elif c == '^': 397 | hats.append(i) 398 | assert depth >= 0 399 | else: 400 | assert hats 401 | i = hats[0] 402 | kwargs['hat'] = from_string(_strip_brackets(cat_str[i + 1:])) 403 | return from_string(_strip_brackets(cat_str[:i]), **kwargs) 404 | 405 | 406 | def _strip_brackets(cat_str): 407 | if not (cat_str.startswith('(') and cat_str.endswith(')')): 408 | return cat_str 409 | depth = 0 410 | for c in cat_str: 411 | if c == '(': 412 | depth += 1 413 | elif c == ')': 414 | depth -= 1 415 | if depth == 0 and (c == '/' or c == '\\' or c == '^'): 416 | return cat_str 417 | else: 418 | return cat_str[1:-1] 419 | 420 | 421 | -------------------------------------------------------------------------------- /tests/test_replace.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test category replacement, for changeLabel 3 | """ 4 | import unittest 5 | import os.path 6 | import random 7 | 8 | import ccg.scat 9 | import ccg.rules 10 | import ccg.lexicon 11 | import ccg.grammar 12 | 13 | ccg.lexicon.load() 14 | 15 | class TestReplace(unittest.TestCase): 16 | def test_fapply_basic(self): 17 | c1 = ccg.scat.SuperCat('PP/NP') 18 | c2 = ccg.scat.SuperCat('NP') 19 | parent = ccg.scat.SuperCat('PP') 20 | production = ccg.rules.Production(c1, c2, parent) 21 | production.replace(ccg.scat.SuperCat('NP')) 22 | assert production.left == 'NP/NP' 23 | assert production.left.annotated == '(NP{_}/NP{Y}<1>){_}' 24 | 25 | def test_fapply_adjunct(self): 26 | c1 = ccg.scat.SuperCat('N/N') 27 | c2 = ccg.scat.SuperCat('N') 28 | production = ccg.rules.Production(c1, c2) 29 | production.parent = production.result 30 | production.replace(ccg.scat.SuperCat('PP')) 31 | assert production.left == 'PP/PP' 32 | assert production.left.annotated == '(PP{Y}/PP{Y}){_}' 33 | assert production.right == 'PP' 34 | 35 | def test_fapply_adjunct_feature(self): 36 | c1 = ccg.scat.SuperCat('S/S') 37 | c2 = ccg.scat.SuperCat('S[dcl]') 38 | production = ccg.rules.Production(c1, c2) 39 | production.parent = production.result 40 | production.replace(ccg.scat.SuperCat('N[num]')) 41 | assert production.left.annotated == '(N[X]{Y}/N[X]{Y}){_}' 42 | assert production.right == 'N[num]' 43 | assert production.parent == 'N[num]' 44 | assert ccg.rules.fapply(production.left, production.right) == 'N[num]' 45 | 46 | def test_fcomp_adjunct(self): 47 | c1 = ccg.scat.SuperCat('NP/NP') 48 | c2 = ccg.scat.SuperCat('NP/N') 49 | parent = ccg.scat.SuperCat('NP/N') 50 | production = ccg.rules.Production(c1, c2, parent=parent) 51 | production.replace('(S[adj]\NP)/(S[adj]\NP)') 52 | production.replace(parent) 53 | self.assertEqual(production.left, c1) 54 | self.assertEqual(production.right, c2) 55 | 56 | def test_bapply_basic(self): 57 | c1 = ccg.scat.SuperCat('NP') 58 | c2 = ccg.scat.SuperCat('S[dcl]\NP') 59 | production = ccg.rules.Production(c1, c2) 60 | production.parent = production.result 61 | production.replace(ccg.scat.SuperCat('S[em]')) 62 | assert production.right == 'S[em]\NP' 63 | assert production.right.annotated == '(S[em]{_}\NP{Y}<1>){_}' 64 | 65 | def test_fcomp_basic(self): 66 | c1 = ccg.scat.SuperCat('NP/N') 67 | c2 = ccg.scat.SuperCat('N/N') 68 | production = ccg.rules.Production(c1, c2) 69 | production.parent = production.result 70 | assert production.rule == 'fcomp' 71 | production.replace(ccg.scat.SuperCat('PP/N')) 72 | assert production.left == 'PP/N' 73 | c1 = ccg.scat.SuperCat('NP/N') 74 | c2 = ccg.scat.SuperCat('(N/PP)/S[em]') 75 | production = ccg.rules.Production(c1, c2) 76 | production.parent = production.result 77 | production.replace(ccg.scat.SuperCat('(NP/S[em])/PP')) 78 | assert production.left == 'NP/N' 79 | assert production.right == '(N/S[em])/PP' 80 | c1 = ccg.scat.SuperCat('NP/N') 81 | c2 = ccg.scat.SuperCat('(N/PP)/S[em]') 82 | production = ccg.rules.Production(c1, c2) 83 | production.parent = production.result 84 | production.replace(ccg.scat.SuperCat('((PP{_}/PP{Y}){_}/S[em]{Z}){_}')) 85 | self.assertEqual(production.left, 'PP/N') 86 | self.assertEqual(production.right, '(N/PP)/S[em]') 87 | 88 | def test_fcomp2(self): 89 | c1 = ccg.scat.SuperCat('((S[dcl]\NP)/PP)/PP') 90 | c2 = ccg.scat.SuperCat('PP/NP') 91 | production = ccg.rules.Production(c1, c2) 92 | production.parent = production.result 93 | assert production.rule == 'fcomp' 94 | production.replace(ccg.scat.SuperCat('((S[ng]\NP)/NP)/NP')) 95 | self.assertEqual(production.right, 'PP/NP') 96 | 97 | def test_bxcomp(self): 98 | # (NP\NP)/NP NP\NP --> (NP\NP)/NP 99 | # (((S[dcl]\NP)/(S[to]\NP))/PP)/NP 100 | c1 = ccg.scat.SuperCat('(NP\NP)/NP') 101 | c2 = ccg.scat.SuperCat('NP\NP') 102 | production = ccg.rules.Production(c1, c2, rule='bxcomp') 103 | production.parent = production.result 104 | assert production.parent == '(NP\NP)/NP' 105 | production.replace('(((S[dcl]\NP)/(S[to]\NP))/PP)/NP') 106 | self.assertEqual(production.left, '(((S[dcl]\NP)/(S[to]\NP))/PP)/NP') 107 | self.assertEqual(production.right, '(S\NP)\(S\NP)') 108 | production.replace('(NP\NP)/NP') 109 | print production.rule 110 | self.assertEqual(production.left.string, c1.string) 111 | self.assertEqual(production.right.string, c2.string) 112 | 113 | 114 | 115 | def test_fcomp_aux(self): 116 | # (S[dcl]\NP)/(S[ng]\NP) (S[ng]\NP)/NP --> (S[dcl]\NP)/NP 117 | # to: 118 | # (S/S)/(S[ng]\NP) (S[ng]\NP)/S[dcl] --> (S/S)/S[dcl] 119 | c1 = ccg.scat.SuperCat('(S[dcl]\NP)/(S[ng]\NP)') 120 | c2 = ccg.scat.SuperCat('(S[ng]\NP)/NP') 121 | production = ccg.rules.Production(c1, c2) 122 | production.parent = production.result 123 | assert production.rule == 'fcomp' 124 | assert production.parent == '(S[dcl]\NP)/NP' 125 | production.replace(ccg.scat.SuperCat('(S/S)/S[dcl]')) 126 | self.assertEqual(production.left, '(S/S)/(S[ng]\NP)') 127 | self.assertEqual(production.right, '(S[ng]\NP)/S[dcl]') 128 | 129 | def test_bcomp_traise(self): 130 | # ((S[dcl]\NP)/PP)/NP (S\NP)\((S\NP)/PP) --> (S[dcl]\NP)/NP 131 | # (((S[b]\NP)/PP)/(S[b]\NP))/NP (S\NP)\((S\NP)/PP) 132 | # --> ((S[b]\NP)/(S[b]\NP))/NP 133 | c1 = ccg.scat.SuperCat('((S[dcl]\NP)/PP)/NP') 134 | c2 = ccg.scat.SuperCat('(S\NP)\((S\NP)/PP)') 135 | production = ccg.rules.Production(c1, c2) 136 | production.parent = production.result 137 | production.replace(ccg.scat.SuperCat('((S[b]\NP)/(S[b]\NP))/NP')) 138 | self.assertEqual(production.left, '(((S[b]\NP)/PP)/(S[b]\NP))/NP') 139 | 140 | def test_badjunct(self): 141 | # (S[pt]\NP)/S[em] (S\NP)\(S\NP) --> (S[pt]\NP)/S[em] 142 | # PP 143 | # New left: PP 144 | # New right: PP\PP 145 | # (S[pt]\NP)/S[em] (S[pt]\NP)/S[em] 146 | # ((S\NP)/S)\((S\NP)/S) (S\NP)\(S\NP) 147 | # badjunct 148 | c1 = ccg.scat.SuperCat('(S[pt]\NP)/S[em]') 149 | c2 = ccg.scat.SuperCat('(S\NP)\(S\NP)') 150 | production = ccg.rules.Production(c1, c2, rule='bxcomp') 151 | production.parent = production.result 152 | assert production.rule == 'badjunct' 153 | production.replace(ccg.scat.SuperCat('PP')) 154 | production.replace(ccg.scat.SuperCat('(S[pt]\NP)/S[em]')) 155 | self.assertEqual(production.left, c1) 156 | self.assertEqual(production.right.string, c2.string) 157 | 158 | 159 | 160 | def test_make_adjunct(self): 161 | cat = ccg.scat.SuperCat('N/N') 162 | stripped = ccg.rules.strip_features(cat) 163 | self.assertEqual(stripped.annotated, '(N[X]{Y}/N[X]{Y}<1>){_}') 164 | adjunct = ccg.scat.make_adjunct(cat, '/') 165 | self.assertEqual(adjunct, '(N/N)/(N/N)') 166 | 167 | def test_punct(self): 168 | c1 = ccg.scat.SuperCat('PP/NP') 169 | c2 = ccg.scat.SuperCat("RQU") 170 | parent = ccg.scat.SuperCat('PP/NP') 171 | production = ccg.rules.Production(c1, c2, parent=parent) 172 | production.replace(ccg.scat.SuperCat('S[dcl]\NP')) 173 | assert production.right == "RQU" 174 | assert production.left == 'S[dcl]\NP' 175 | c1 = ccg.scat.SuperCat(',') 176 | c2 = ccg.scat.SuperCat('(S\NP)/(S\NP)') 177 | production = ccg.rules.Production(c1, c2, rule='left_punct') 178 | production.parent = production.result 179 | assert production.parent.annotated == c2.annotated 180 | production.replace(ccg.scat.SuperCat('NP/N')) 181 | assert production.left == ',' 182 | assert production.right.annotated == '(NP{Y}/N{Y}<1>){_}' 183 | 184 | def test_add_conj(self): 185 | c1 = ccg.scat.SuperCat('conj') 186 | c2 = ccg.scat.SuperCat('PP/NP') 187 | production = ccg.rules.Production(c1, c2) 188 | production.parent = production.result 189 | production.replace(ccg.scat.SuperCat('S[dcl]\NP[conj]')) 190 | assert production.right == 'S[dcl]\NP' 191 | assert production.left == 'conj' 192 | 193 | def test_do_conj(self): 194 | c1 = ccg.scat.SuperCat('S[dcl]\NP') 195 | c2 = ccg.scat.SuperCat('S[dcl]\NP[conj]') 196 | production = ccg.rules.Production(c1, c2) 197 | production.parent = production.result 198 | production.replace('(S[dcl]\NP)/NP') 199 | assert production.left == '(S[dcl]\NP)/NP' 200 | assert production.right == '(S[dcl]\NP)/NP[conj]' 201 | 202 | def test_type_raise1(self): 203 | c1 = ccg.scat.SuperCat('S/(S\NP)') 204 | c2 = ccg.scat.SuperCat('(S[dcl]\NP)/NP') 205 | production = ccg.rules.Production(c1, c2) 206 | production.parent = production.result 207 | production.replace('((S[dcl]\NP)/NP)/NP') 208 | self.assertEqual(production.left, c1.string) 209 | self.assertEqual(production.left.annotated, c1.annotated) 210 | 211 | def test_type_raise2(self): 212 | # S/(S\NP) (S[dcl]\NP)/NP --> S[dcl]/NP 213 | # (N/N)/((N/N)\NP) ((N/N)\NP)/N --> (N/N)/N 214 | c1 = ccg.scat.SuperCat('S/(S\NP)') 215 | c2 = ccg.scat.SuperCat('(S[dcl]\NP)/NP') 216 | production = ccg.rules.Production(c1, c2) 217 | production.parent = production.result 218 | assert production.rule == 'ftraise_comp' 219 | production.replace('(N/N)/N') 220 | self.assertEqual(production.left, '(N/N)/((N/N)\NP)') 221 | self.assertEqual(production.right, '((N/N)\NP)/N') 222 | production.replace('S[dcl]/NP') 223 | self.assertEqual(production.right, c2) 224 | self.assertEqual(production.left, c1) 225 | 226 | def test_type_raise3(self): 227 | # (S[pss]\NP)/(S[adj]\NP) (S\NP)\((S\NP)/(S[adj]\NP)) --> 228 | # S[pss]\NP 229 | # ((S/S)/(S[ad]\NP))\NP (S/S)\((S/S)/(S[adj]\NP)) --> 230 | # (S/S)\NP 231 | c1 = ccg.scat.SuperCat('(S[pss]\NP)/(S[adj]\NP)') 232 | c2 = ccg.scat.SuperCat('(S\NP)\((S\NP)/(S[adj]\NP))') 233 | production = ccg.rules.Production(c1, c2) 234 | production.parent = production.result 235 | production.replace(ccg.scat.SuperCat('(S/S)\NP')) 236 | self.assertEqual(production.left.string, 237 | '((S/S)\NP)/(S[adj]\NP)') 238 | self.assertEqual(production.right.string, 239 | '((S/S)\NP)\(((S/S)\NP)/(S[adj]\NP))') 240 | production.replace(ccg.scat.SuperCat('S[pss]\NP')) 241 | self.assertEqual(production.left.string, c1.string) 242 | 243 | def test_type_raise4(self): 244 | # ((S[pt]\NP)/PP)/NP (S\NP)\((S\NP)/PP) --> (S[pt]\NP)/NP 245 | # ((S[q]/PP)/(S[pss]\NP))/NP S\(S/PP) --> 246 | # (S[q]/(S[pss]\NP))/NP 247 | c1 = ccg.scat.SuperCat('((S[pt]\NP)/PP)/NP') 248 | c2 = ccg.scat.SuperCat('(S\NP)\((S\NP)/PP)') 249 | production = ccg.rules.Production(c1, c2) 250 | production.parent = production.result 251 | assert production.rule == 'btraise_comp' 252 | production.replace(ccg.scat.SuperCat('(S[q]/(S[pss]\NP))/NP')) 253 | self.assertEqual(production.left.string, '((S[q]/PP)/(S[pss]\NP))/NP') 254 | self.assertEqual(production.right.string, 'S\(S/PP)') 255 | production.replace(ccg.scat.SuperCat('(S[pt]\NP)/NP')) 256 | self.assertEqual(production.left.string, c1.string) 257 | self.assertEqual(production.right.string, c2.string) 258 | 259 | 260 | 261 | 262 | def test_feature_passing(self): 263 | c1 = ccg.scat.SuperCat('(S[X]{Y}/(S[X]{Y}/NP{_}){Y}){_}') 264 | c2 = ccg.scat.SuperCat('S[dcl]/NP') 265 | production = ccg.rules.Production(c1, c2) 266 | production.parent = production.result 267 | production.replace('S[ng]') 268 | self.assertEqual(production.left, c1) 269 | 270 | def test_replace_bug(self): 271 | # (S[b]\NP)/(S[ng]\NP) S[ng]\NP --> S[b]\NP 272 | # (((S[b]\NP)/(S[to]\NP))/(S[adj]\NP))/NP[expl] 273 | left = ccg.scat.SuperCat('(S[b]\NP)/(S[ng]\NP)') 274 | right = ccg.scat.SuperCat('S[ng]\NP') 275 | parent = ccg.scat.SuperCat('S[b]\NP') 276 | production = ccg.rules.Production(left, right, parent=parent) 277 | replacement = ccg.scat.SuperCat( 278 | '(((S[b]\NP)/(S[to]\NP))/(S[adj]\NP))/NP[expl]') 279 | production.replace(replacement) 280 | 281 | 282 | def test_determiner_apply_replace(self): 283 | left = ccg.scat.SuperCat('NP/N') 284 | right = ccg.scat.SuperCat('N') 285 | production = ccg.rules.Production(left, right) 286 | production.parent = production.result 287 | replacement = ccg.scat.SuperCat('NP/PP') 288 | production.replace(replacement) 289 | self.assertEqual(production.left.annotated, '(NP{Y}/N{Y}<1>){_}') 290 | self.assertEqual(production.right.annotated, '(N{_}/PP{Y}<1>){_}') 291 | 292 | 293 | def test_possessive_apply_replace(self): 294 | left = ccg.scat.SuperCat('NP/(N/PP)') 295 | right = ccg.scat.SuperCat('N/PP') 296 | production = ccg.rules.Production(left, right) 297 | production.parent = production.result 298 | replacement = ccg.scat.SuperCat('NP/PP') 299 | production.replace(replacement) 300 | self.assertEqual(production.left, 'NP/(N/PP)') 301 | self.assertEqual(production.right, '(N/PP)/PP') 302 | 303 | def test_all_round_trips(self): 304 | """ 305 | Test a round-trip replacement for every production rule 306 | """ 307 | random.seed(0) 308 | grammar_loc = os.path.join(os.path.split(__file__)[0], 309 | 'wsjfull.grammar') 310 | cats = ccg.lexicon.CATS.values() 311 | for parent, left, right, freq in ccg.grammar.read(grammar_loc): 312 | if right is None: 313 | continue 314 | if left not in ccg.lexicon.CATS or \ 315 | right not in ccg.lexicon.CATS: 316 | continue 317 | # Ignore these productions, where I prefer my answer: 318 | # 5: (PP/NP)/(PP/NP) PP/NP --> PP/NP RTed to PP/PP PP/NP --> PP/NP 319 | if parent == 'PP/NP' and left == '(PP/NP)/(PP/NP)' \ 320 | and right == 'PP/NP': 321 | continue 322 | # 4: ((S[adj]\NP)/PP)/((S[adj]\NP)/PP) (S[adj]\NP)/PP 323 | # --> (S[adj]\NP)/PP RTed to (S[adj]\NP)/(S[adj]\NP) on left 324 | if left == '((S[adj]\NP)/PP)/((S[adj]\NP)/PP)' \ 325 | and right == '(S[adj]\NP)/PP' and parent == '(S[adj]\NP)/PP': 326 | continue 327 | # 2: ((S[dcl]\NP)/(S[adj]\NP))/NP (S\NP)\(((S\NP)/(S[adj]\NP))/NP) 328 | # --> S[dcl]\NP 329 | # Broken category 330 | if right == '(S\NP)\(((S\NP)/(S[adj]\NP))/NP)': 331 | continue 332 | #print "%d: %s %s --> %s" % (freq, left, right, parent) 333 | c1 = ccg.scat.SuperCat(left) 334 | c1_annot = c1.annotated 335 | c1_str = c1.string 336 | c2 = ccg.scat.SuperCat(right) 337 | c2_annot = c2.annotated 338 | c2_str = c2.string 339 | parent = ccg.scat.SuperCat(parent) 340 | production = ccg.rules.Production(c1, c2, parent=parent) 341 | if production.left.is_type_raise \ 342 | and production.right.is_type_raise: 343 | continue 344 | rule = production.rule 345 | replace_with = random.choice(cats) 346 | replacement = ccg.scat.SuperCat(replace_with) 347 | if parent.conj: 348 | replacement = ccg.scat.change_kwarg(replacement, conj=True) 349 | production.replace(replacement) 350 | # Don't expect RT if replacement forces rule change 351 | if production.rule != rule: 352 | continue 353 | #print 'New left: %s' % production.left.string 354 | #print 'New right: %s' % production.right.string 355 | production.replace(parent) 356 | # Accept (S\NP)|(S\NP) for S|S 357 | if production.left.string == '(S\NP)/(S\NP)' and c1_str == 'S/S': 358 | continue 359 | elif production.right.string == '(S\NP)\(S\NP)' and c2_str == 'S\S': 360 | continue 361 | self.assertEqual(production.left.string, c1_str) 362 | self.assertEqual(production.right.string, c2_str) 363 | 364 | 365 | 366 | if __name__ == '__main__': 367 | unittest.main() 368 | -------------------------------------------------------------------------------- /ccg/scat.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import re 3 | 4 | import ccg.category 5 | import ccg.rules 6 | 7 | class SuperCat(object): 8 | """ 9 | A top-level category, participating in a derivation. Manages 10 | variable coindexation between a CCG category and HLDS terms. 11 | Tracks variable coindexation 12 | during productions. Unlike Category objects, is mutable. 13 | """ 14 | def __init__(self, category, hlds=None, word_bindings=None): 15 | if isinstance(category, str): 16 | category = ccg.category.from_string(category) 17 | elif isinstance(category, SuperCat): 18 | category = category.category 19 | else: 20 | assert isinstance(category, ccg.category.Category) 21 | # Have a unique variable ID for each category variable. 22 | # Store the mapping from unique IDs to category vars and vice versa 23 | var_table = {} 24 | for var in category.cats_by_var: 25 | var_id = Variable() 26 | var_table[var] = set([var_id]) 27 | self._var_table = var_table 28 | self.category = category 29 | self.hlds_children = defaultdict(set) 30 | self.hlds_parents = defaultdict(set) 31 | self.hlds_passed = set() 32 | self.srl_annot = set() 33 | 34 | 35 | def __getattr__(self, attr): 36 | if attr in self.__dict__: 37 | return self.__dict__[attr] 38 | elif hasattr(self.category, attr): 39 | return getattr(self.category, attr) 40 | else: 41 | raise AttributeError(attr) 42 | 43 | def __eq__(self, other): 44 | return self.category == other 45 | 46 | def __ne__(self, other): 47 | return self.category != other 48 | 49 | def __str__(self): 50 | return str(self.category) 51 | 52 | def __hash__(self): 53 | return hash(self.category) 54 | 55 | def __repr__(self): 56 | return repr(self.category) 57 | 58 | def global_annotated(self): 59 | annotated = self.annotated.replace('*}', '}') 60 | for var in self.cats_by_var: 61 | global_vars = [v for v in self._var_table[var] if v.word is not None] 62 | try: 63 | global_vars = sorted(global_vars, key=lambda gv: gv.word) 64 | except: 65 | raise 66 | global_str = '{%s}' % ','.join(str(v) for v in global_vars) 67 | var_str = ccg.category.VARS[var] 68 | annotated = annotated.replace('{%s}' % var_str, global_str) 69 | return annotated 70 | 71 | def bind_vars(self, other, self_cat, other_cat): 72 | """ 73 | Unify the global variables of a piece of this scat against 74 | the piece of another scat. Use the other's variable 75 | table to retrieve the other's global variables. 76 | """ 77 | assert self_cat == other_cat 78 | to_unify = set() 79 | for path, self_piece in self_cat.cats.items(): 80 | other_piece = other_cat.cats[path] 81 | self_var = self_piece.var 82 | other_var = other_piece.var 83 | if not self.can_unify(other, self_var, other_var): 84 | return False 85 | to_unify.add((self_piece.var, other_piece.var)) 86 | for self_var, other_var in to_unify: 87 | self_cats = self_cat.cats_by_var[self_var] 88 | other_cats = other_cat.cats_by_var[other_var] 89 | self.unify_globals_at_var(other, self_var, other_var) 90 | return True 91 | 92 | def add_hlds_child(self, relation, global_var): 93 | """ 94 | Set a child dependency 95 | """ 96 | self.hlds_children[global_var].add(relation) 97 | 98 | def add_hlds_parent(self, relation, global_var): 99 | self.hlds_parents[global_var].add(relation) 100 | 101 | def add_hlds_passed(self, parent_var, relation, child_var): 102 | self.hlds_passed.add((parent_var, relation, child_var)) 103 | 104 | 105 | def get_vars(self, cat=None): 106 | if cat is None: 107 | cat = self 108 | return set(v.get_ref() for v in self._var_table[cat.var]) 109 | 110 | def add_var(self, i, var): 111 | ref = var.get_ref() 112 | self._var_table[i] = set(v.get_ref() for v in self._var_table[i]) 113 | self._var_table[i].add(ref) 114 | 115 | def unify_globals_at_var(self, other, var, other_var=None): 116 | if other_var is None: 117 | other_var = var 118 | s_vars = self._var_table[var] 119 | o_vars = other._var_table[other_var] 120 | if len(s_vars) == len(o_vars) == 1: 121 | list(s_vars)[0].unify(list(o_vars)[0]) 122 | else: 123 | list(s_vars)[0].unify(list(o_vars)[0]) 124 | # The unification is not complete here, which may cause problems. 125 | # But cannot unify other to both in self, or self vars will 126 | # be unified to each other :( 127 | var_set = set([v.get_ref() for v in s_vars.union(o_vars)]) 128 | self._var_table[var] = var_set 129 | other._var_table[other_var] = var_set 130 | 131 | def can_unify(self, other, var, other_var): 132 | s_vars = self._var_table[var] 133 | s_words = set([s_var.word for s_var in s_vars]) 134 | o_vars = other._var_table[other_var] 135 | for s_var in s_vars: 136 | s_word = s_var.word 137 | if not s_word: 138 | continue 139 | for o_var in o_vars: 140 | o_word = o_var.word 141 | # Patch for conjunction, see wsj_0047.11 for eg 142 | # May be bad idea? 143 | if o_word and o_word not in s_words: 144 | return False 145 | return True 146 | 147 | 148 | def has_head(self, word, cat=None): 149 | for v in self.get_vars(cat): 150 | if v.word is word: 151 | return True 152 | else: 153 | return False 154 | 155 | def has_dep(self, word): 156 | if self.has_head(word): 157 | return False 158 | for r, a, s, k in self.deconstruct(): 159 | if self.has_head(word, a): 160 | return True 161 | return False 162 | 163 | def add_head(self, word): 164 | s_vars = self.get_vars() 165 | #assert len(s_vars) == 1 166 | for v in s_vars: 167 | # Sadly this fails too often :( 168 | # When it does it indicates a real problem, but the problems 169 | # are quite difficult to solve... 170 | #assert not v.word 171 | v.word = word 172 | 173 | def heads(self, cat=None): 174 | if cat is None: 175 | cat = self 176 | return sorted(set([v.word for v in self.get_vars(cat) if v.word])) 177 | 178 | def deconstruct(self): 179 | for r, a, s, k in self.category.deconstruct(): 180 | k = dict(k) 181 | k['arg_global_vars'] = self.get_vars(a) 182 | yield r, a, s, k 183 | 184 | 185 | def cats_at_global(self, global_var): 186 | """ 187 | Find all cats whose vars map to this var's value. 188 | Can't simply have a reverse index, because var's values change 189 | on unification 190 | """ 191 | cats = set() 192 | val = global_var.val 193 | for cat_var, var_set in self._var_table.items(): 194 | for var in var_set: 195 | if var.val == val: 196 | for cat in self.cats_by_var[cat_var]: 197 | cats.add(cat) 198 | return cats 199 | 200 | def all_globals(self): 201 | global_vars = set() 202 | for var_set in self._var_table.values(): 203 | global_vars.update(v.get_ref() for v in var_set) 204 | return global_vars 205 | 206 | def map_letters_to_words(self): 207 | """ 208 | Return a dictionary mapping letter-variables e.g. _, Y, Z 209 | to word sets, e.g. {'_': set(Pierre, Holly)}, where 210 | Pierre and Holly are CCGLeaf instances 211 | """ 212 | mapping = {} 213 | for var, cats in self.cats_by_var.items(): 214 | letter_var = ccg.category.VARS[var] 215 | heads = self.heads(cats[0]) 216 | mapping[letter_var] = heads 217 | return mapping 218 | 219 | def add_srl_annot_from_srl_string(self, srl_annot_str): 220 | """ 221 | Populate the srl_annot set with triples from an 222 | srl_string. srl_strings look like X'P:A0'Y_X'P:A1'Z 223 | """ 224 | assert not self.srl_annot, '%s %s' % (self.srl_annot, self.annotated) 225 | if srl_annot_str == '@': 226 | return None 227 | for srl_triple in srl_annot_str.split('_'): 228 | if not srl_triple: 229 | continue 230 | srl_triple = srl_triple.replace('X', '_') 231 | srl_triple = srl_triple.replace('E_T', 'EXT') 232 | head_letter, label, child_letter = srl_triple.split("'") 233 | head_var = ccg.category.VARS.index(head_letter) 234 | child_var = ccg.category.VARS.index(child_letter) 235 | if head_var not in self._var_table or child_var not in self._var_table: 236 | err = "Var not found from srl_string %s for cat %s" 237 | raise StandardError, err % (srl_string, self.annotated) 238 | srl_tuple = tuple(srl_triple.split("'")) 239 | self.srl_annot.add(srl_tuple) 240 | 241 | 242 | 243 | def convert_hlds_to_srl_annot(self): 244 | """ 245 | For each SRL label bound to the category, print 246 | X label Y, where X and Y are the local variables 247 | for the head and child. 248 | """ 249 | labels = set() 250 | for var, srl_labels in self.hlds_parents.items(): 251 | for cat in self.cats_at_global(var): 252 | for label in srl_labels: 253 | # Child of hlds_parents is always own lexical variable 254 | labels.add((ccg.category.VARS[cat.var], label, '_')) 255 | for var, srl_labels in self.hlds_children.items(): 256 | for cat in self.cats_at_global(var): 257 | for label in srl_labels: 258 | # Parent of hlds_children is always own lexical variable 259 | labels.add(('_', label, ccg.category.VARS[cat.var])) 260 | for var1, label, var2 in self.hlds_passed: 261 | for cat1 in self.cats_at_global(var1): 262 | for cat2 in self.cats_at_global(var2): 263 | labels.add((ccg.category.VARS[cat1.var], label, 264 | ccg.category.VARS[cat2.var])) 265 | self.srl_annot = labels 266 | 267 | 268 | annot_strip_re = re.compile(r'<\d>') 269 | var_find_re = re.compile(r'(?<={)[A-Z]') 270 | def srl_string(self): 271 | """ 272 | Create an annotated string referencing semantic roles, and 273 | markedup entries for the role dependencies 274 | """ 275 | triple_strs = ["'".join(triple).replace('_', 'X') for triple in 276 | self.srl_annot] 277 | triple_strs.sort() 278 | stag_annot = '_'.join(triple_strs) 279 | stag_str = '%s@%s' % (self.string, stag_annot) 280 | seen_vars = set() 281 | roles = [] 282 | for head, label, child in self.srl_annot: 283 | if head == '_' and child == '_': 284 | roles.append(('_', label, ' %l %l')) 285 | continue 286 | elif head == '_': 287 | var = child 288 | lf = '%l %f' 289 | elif child == '_': 290 | var = head 291 | lf = '%f %l' 292 | else: 293 | raise Exception 294 | seen_vars.add(var) 295 | roles.append((var, label, lf)) 296 | for var, cats in self.cats_by_var.items(): 297 | var = ccg.category.VARS[var] 298 | if var == 0 or var in seen_vars: 299 | continue 300 | for cat in cats: 301 | if cat.arg_idx: 302 | seen_vars.add(var) 303 | roles.append((var, 'ignore', '')) 304 | var_to_args = {} 305 | for v in self.var_find_re.findall(self.annotated): 306 | if v in seen_vars: 307 | var_to_args.setdefault(v, len(var_to_args) + 1) 308 | roles = ['%d %s %s' % (var_to_args.get(v, 0), l, lf) for v, l, lf in roles] 309 | roles.sort() 310 | annotated = self.annot_strip_re.sub('', self.annotated) 311 | # Add argument numbers to string 312 | # We need to do the replacement at the rightmost point, 313 | # so reverse the string and add the replacement backwards 314 | annotated = ''.join(reversed(annotated)) 315 | # Remove the *'s, as they're irrelevant to us 316 | # Um why do we need the rightmost point? 317 | for var, arg_num in var_to_args.items(): 318 | var_annot = '}%s{' % var 319 | #var_annot = '{%s}' % var 320 | assert var_annot in annotated, annotated + ' ' + var_annot 321 | var_arg = ('>%d<' % arg_num) + var_annot 322 | #var_arg = '%s<%d>' % (var_annot, arg_num) 323 | annotated = annotated.replace(var_annot, var_arg, 1) 324 | annotated = annotated.replace('*', '') 325 | annotated = ''.join(reversed(annotated)) # Unreverse now that we're done 326 | # Append the @ annotation to the annotated string 327 | annotated = '%s@%s' % (annotated, stag_annot) 328 | return len(var_to_args), stag_str, annotated, roles 329 | 330 | def srl_deps_from_annot(self): 331 | var_map = dict((var, i) for i, var in enumerate(ccg.category.VARS)) 332 | for head_var, label, child_var in sorted(self.srl_annot): 333 | head_globals = self._var_table[var_map[head_var]] 334 | child_globals = self._var_table[var_map[child_var]] 335 | for head_global in head_globals: 336 | for child_global in child_globals: 337 | if head_global.word and child_global.word: 338 | yield head_global.word, label, child_global.word 339 | 340 | 341 | class Variable(object): 342 | _next = 0 343 | def __init__(self): 344 | Variable._next += 1 345 | self._val = Variable._next 346 | self._ref = None 347 | self._word = None 348 | 349 | def __eq__(self, other): 350 | return self.val == other.val 351 | 352 | def __ne__(self, other): 353 | return not self == other 354 | 355 | def __cmp__(self, other): 356 | return cmp(self.val, other.val) 357 | 358 | def __hash__(self): 359 | return hash(self.val) 360 | 361 | @property 362 | def val(self): 363 | return self.get_ref()._val 364 | 365 | @property 366 | def word(self): 367 | return self.get_ref()._word 368 | 369 | @word.setter 370 | def word(self, word): 371 | self.get_ref()._word = word 372 | 373 | def get_ref(self): 374 | var = self 375 | while var._ref is not None: 376 | var = var._ref 377 | return var 378 | 379 | def __str__(self): 380 | ref = self.get_ref() 381 | if ref._word: 382 | return ref._word.text 383 | else: 384 | return 'v%d' % ref._val 385 | 386 | def __repr__(self): 387 | return str(self) 388 | 389 | def unify(self, other): 390 | if self is other: 391 | return None 392 | self_ref = self.get_ref() 393 | other_ref = other.get_ref() 394 | if self_ref is other_ref: 395 | return None 396 | ### nicky_random_debugging_destruction - commented out: 397 | ### wsj_0023.3 (percent) breaks with this assert statement. 33 % of ... 398 | ### assert not (self_ref._word and other_ref._word) 399 | other_ref._ref = self_ref 400 | if other_ref._word and not self_ref._word: 401 | self_ref._word = other_ref._word 402 | if self_ref._word and not other_ref._word: 403 | other_ref._word = self_ref._word 404 | 405 | 406 | def replace_result(scat, new_res): 407 | assert scat.is_complex 408 | arg = scat.argument 409 | var_map = {} 410 | res_vars = new_res.cats_by_var 411 | next_var = max(res_vars) + 1 412 | for var in arg.cats_by_var: 413 | if var in res_vars and var not in var_map: 414 | var_map[var] = next_var 415 | next_var += 1 416 | arg = ccg.rules.remap_vars(arg, var_map) 417 | new_cat = ccg.category.Category(new_res, scat.slash, arg, **scat.kwargs) 418 | return SuperCat(new_cat) 419 | 420 | def replace_inner_result(scat, new_res): 421 | raise Exception("Not implemented yet") 422 | 423 | def add_args(res, args, reorder = False): 424 | if reorder: 425 | res, args = reorder_args(res, args) 426 | for arg, slash, kwargs in args: 427 | if 'arg_global_var' in kwargs: 428 | global_var = kwargs.pop('arg_global_var') 429 | else: 430 | global_var = None 431 | res = add_arg(res, slash, arg, **kwargs) 432 | if global_var: 433 | # Unify the variable with the one passed in 434 | for var in res.get_vars(res.argument): 435 | var.unify(global_var) 436 | assert res.var == 0 437 | return res 438 | 439 | def reorder_args(res, args): 440 | # Order args so that, for non-adjunct args, backward args are always added 441 | # first. 442 | backward = [] 443 | forward = [] 444 | for arg, slash, kwargs in args: 445 | #if kwargs.get('var', res.var) != res.var or kwargs.get('hat'): 446 | # res = ccg.category.Category(res, slash, arg, **kwargs) 447 | if slash == '/': 448 | forward.append((arg, slash, kwargs)) 449 | else: 450 | backward.append((arg, slash, kwargs)) 451 | return res, backward + forward 452 | 453 | 454 | 455 | def add_arg(result, slash, arg, revar=True, **kwargs): 456 | # Revar means to assume the extra arg is not coindexed to something 457 | if revar: 458 | arg = change_kwarg(arg, var=result.next_var) 459 | category = ccg.category.Category(result, slash, 460 | arg, **kwargs) 461 | new_scat = SuperCat(category) 462 | if hasattr(result, 'bind_vars'): 463 | new_scat.bind_vars(result, new_scat.result, result.category) 464 | if hasattr(arg, 'bind_vars'): 465 | new_scat.bind_vars(arg, new_scat.argument, arg.category) 466 | return new_scat 467 | 468 | def make_adjunct(cat, slash, force_dep=True): 469 | # Decide which category to base adjunct on 470 | if cat.is_complex: 471 | for res, arg, s, _ in reversed(list(cat.deconstruct())): 472 | if force_dep and res.var != cat.var: 473 | continue 474 | # Don't reduce (S\NP)|(S\NP) to S|S 475 | if res.var == 0 and \ 476 | not (res == 'S' and arg == 'NP' and s == '\\'): 477 | cat = res 478 | break 479 | else: 480 | cat = res if not force_dep else cat 481 | var_map = {0: cat.next_var} 482 | new_cat = ccg.rules.remap_vars(cat, var_map) 483 | new_cat = ccg.rules.strip_features(new_cat) 484 | return SuperCat(ccg.category.Category(new_cat, slash, new_cat, var=0)) 485 | 486 | def change_kwarg(cat, **kwargs): 487 | cat_kwargs = cat.kwargs.copy() 488 | cat_kwargs.update(kwargs) 489 | new_cat = ccg.category.Category(cat.result, cat.slash, cat.argument, 490 | **cat_kwargs) 491 | if hasattr(cat, 'bind_vars'): 492 | new_scat = SuperCat(new_cat) 493 | new_scat.bind_vars(cat, new_cat, cat.category) 494 | return new_scat 495 | else: 496 | return new_cat 497 | 498 | def type_raise(t_cat, slash, arg_cat): 499 | t_cat = ccg.rules.strip_features(t_cat) 500 | next_var = arg_cat.next_var 501 | var_map = {} 502 | for var in t_cat.cats_by_var: 503 | var_map.setdefault(var, len(var_map.keys()) + next_var) 504 | t_cat = ccg.rules.remap_vars(t_cat, var_map) 505 | inner_slash = '\\' if slash == '/' else '/' 506 | argument = add_arg(t_cat, inner_slash, arg_cat, var=t_cat.var, revar=False) 507 | return add_arg(t_cat, slash, argument, var=arg_cat.var) 508 | -------------------------------------------------------------------------------- /ccg/rules.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | from ccg.category import from_string, Category 4 | import ccg 5 | import re 6 | 7 | VARS = ['_', 'Y', 'Z', 'W', 'V', 'U', 'T', 'S'] 8 | _ARG_IDX_RE = re.compile(r'<\d>') 9 | MAX_COMP_DEPTH = 3 10 | 11 | 12 | class Production(object): 13 | """ 14 | A CCG production rule. Tracks combinators used, 15 | unification, and manages change propagation. 16 | """ 17 | combinators = ['add_conj', 'do_conj', 18 | 'fapply', 'bapply', 'bcomp', 'fcomp', 19 | 'bxcomp', 'fxcomp', 20 | 'left_punct', 'right_punct', 'comma_conj'] 21 | def __init__(self, left, right, parent=None, rule=None): 22 | assert left 23 | self.left = left 24 | self.right = right 25 | self._y = None 26 | self._x = None 27 | if rule: 28 | combinator = getattr(self, rule) 29 | result, depth = combinator(left, right) 30 | else: 31 | rule, result, depth = self.get_rule(left, right, parent) 32 | self.depth = depth 33 | self.result = result 34 | if not right: 35 | pass 36 | elif left.is_adjunct and rule.startswith('f') and self._y and \ 37 | self._y.var == right.var: 38 | rule = 'fadjunct' 39 | elif right.is_adjunct and rule.startswith('b') and self._y and \ 40 | self._y.var == left.var: 41 | rule = 'badjunct' 42 | elif left.is_type_raise and rule.startswith('f'): 43 | rule = 'ftraise_comp' 44 | elif right.is_type_raise and rule.startswith('b'): 45 | rule = 'btraise_comp' 46 | self.rule = rule 47 | self.parent = parent 48 | self.force_dep = True 49 | 50 | def __str__(self): 51 | return '%s %s --> %s (%s)' % (self.left, self.right, self.parent, 52 | self.rule) 53 | 54 | def get_rule(self, left, right, parent = None): 55 | if right is None: 56 | assert parent 57 | unary_rules = [('traise', self.traise), ('unary', self.unary)] 58 | for rule, combinator in unary_rules: 59 | result, depth = combinator(parent, left) 60 | if result and parent.exact_eq(result): 61 | return rule, result, depth 62 | else: 63 | return 'unary', None, 0 64 | for rule in self.combinators: 65 | combinator = getattr(self, rule) 66 | result, depth = combinator(left, right) 67 | if result and ((not parent) or parent.exact_eq(result)): 68 | return rule, result, depth 69 | else: 70 | if parent: 71 | result, depth = self.binary(left, right, parent) 72 | if result and parent.exact_eq(result): 73 | return 'binary', result, 0 74 | return 'invalid', parent, 0 75 | 76 | 77 | def replace(self, new): 78 | if not isinstance(new, ccg.scat.SuperCat): 79 | new = ccg.scat.SuperCat(new) 80 | assert self.parent 81 | if self.rule == 'fapply': 82 | left, right = self._apply_replace(self.left, self.right, new) 83 | elif self.rule == 'bapply': 84 | right, left = self._apply_replace(self.right, self.left, new) 85 | elif self.rule == 'fcomp' or self.rule == 'fxcomp': 86 | left, right = self._comp_replace(self.left, self.right, new) 87 | elif self.rule == 'bcomp' or self.rule == 'bxcomp': 88 | right, left = self._comp_replace(self.right, self.left, new) 89 | elif self.rule == 'fadjunct': 90 | left, right = self._adjunct_replace(self.left, self.right, new) 91 | elif self.rule == 'badjunct': 92 | right, left = self._adjunct_replace(self.right, self.left, new) 93 | elif self.rule == 'left_punct': 94 | left = self.left 95 | right = new 96 | elif self.rule == 'ftraise_comp': 97 | left, right = self._traise_comp_replace(self.left, self.right, new) 98 | elif self.rule == 'btraise_comp': 99 | right, left = self._traise_comp_replace(self.right, self.left, new) 100 | elif self.rule == 'right_punct': 101 | right = self.right 102 | left = new 103 | elif new.conj and \ 104 | (self.rule == 'add_conj' or self.rule == 'comma_conj'): 105 | left = self.left 106 | right = ccg.scat.change_kwarg(new, conj=False) 107 | elif self.rule == 'do_conj': 108 | left = new 109 | right = ccg.scat.change_kwarg(new, conj=True) 110 | elif self.rule == 'traise': 111 | left = self._traise_replace(self.left, new) 112 | right = None 113 | elif self.rule == 'unary': 114 | left = self.left 115 | right = self.right 116 | elif self.rule == 'invalid' or self.rule == 'unary': 117 | left = self.left 118 | right = self.right 119 | else: 120 | raise Exception(self.rule) 121 | self.left = left 122 | self.right = right 123 | self.parent = new 124 | return left, right 125 | 126 | def _apply_replace(self, func, arg, new): 127 | # Special case for determiners where new has grown an argument 128 | determiners = ['NP/N', 'NP/(N/PP)', 'PP/NP'] 129 | if func in determiners and new.is_complex and \ 130 | new.inner_result == self.parent.inner_result: 131 | args = [(a, s, k) for r, a, s, k in new.deconstruct()] 132 | new_arg = ccg.scat.add_args(arg, args) 133 | return func, new_arg 134 | if func.result.feat_var: # Preserve feature passing 135 | new = ccg.scat.change_kwarg(new, feature='', 136 | feat_var=func.result.feat_var) 137 | new_func = ccg.scat.replace_result(func, new) 138 | return new_func, arg 139 | 140 | def _adjunct_replace(self, func, arg, new): 141 | new_func = ccg.scat.make_adjunct(new, func.slash, True) 142 | return new_func, new 143 | 144 | def _comp_replace(self, func, arg, new): 145 | # If new isn't complex we can't compose. Just use application. 146 | if not new.is_complex: 147 | functor = ccg.scat.add_arg(new, func.slash, arg, **func.kwargs) 148 | self.rule = 'fapply' 149 | return functor, arg 150 | # Y category must be the same as before, as it's not in the parent 151 | # Give the argument and result the Ys they each had originally 152 | res_y = func.argument 153 | for arg_y, slash, z, _ in arg.deconstruct(): 154 | if arg_y == res_y: 155 | break 156 | else: 157 | if func.argument == arg: 158 | res_y = arg 159 | else: 160 | raise Exception 161 | orig_x = func.result 162 | # Get the new Z (or Zs in the case of generalised comp) 163 | dollars = [] 164 | for res, arg, slash, kwargs in new.deconstruct(): 165 | dollars.append((arg, slash, kwargs)) 166 | if res.is_adjunct: 167 | break 168 | #if res == 'S\NP': 169 | # break 170 | else: 171 | # Don't accumulate arguments on adjuncts or determiners 172 | if func != 'NP/N' and not func.is_adjunct: 173 | res = new.result 174 | dollars = [(new.argument, new.slash, new.kwargs)] 175 | if orig_x.feat_var: # Preserves feature-passing 176 | res = ccg.scat.change_kwarg(res, feature='', 177 | feat_var=orig_x.feat_var, 178 | var=orig_x.var) 179 | if func.is_adjunct: 180 | if new.is_adjunct: 181 | functor = ccg.scat.SuperCat(new) 182 | elif new.inner_result.var != new.var: 183 | if any(c == 'S\NP' for p, c in new.cats.items() if not any(p)): 184 | functor = ccg.scat.make_adjunct(ccg.VP, func.slash) 185 | else: 186 | functor = ccg.scat.make_adjunct(new.inner_result, func.slash) 187 | else: 188 | functor = ccg.scat.make_adjunct(new, func.slash) 189 | return functor, ccg.scat.SuperCat(new) 190 | functor = ccg.scat.add_arg(res, func.slash, res_y, **func.kwargs) 191 | dollars.reverse() 192 | argument = ccg.scat.add_args(arg_y, dollars) 193 | return functor, argument 194 | 195 | def _traise_replace(self, child, new): 196 | assert new.is_type_raise 197 | left = ccg.scat.SuperCat(new.argument.argument) 198 | new.bind_vars(new, new.argument.argument, left) 199 | return left 200 | 201 | def _traise_comp_replace(self, func, arg, new): 202 | # Type-raise-type-raise composition is a special case used for 203 | # argument cluster coordination. It's dangerous to clobber it 204 | # with a non-type-raised new category. 205 | if arg.is_type_raise and not new.is_type_raise: 206 | assert func.is_type_raise 207 | raise Exception("Should not replace raise-raise composition with" 208 | "non-raised category.") 209 | # New == T/$ 210 | # Func == T/(T\R) 211 | # Arg == (T\R)/$ 212 | r = func.argument.argument 213 | dollars = [] 214 | for t, z, slash, kwargs in new.deconstruct(): 215 | dollars.append((z, slash, kwargs)) 216 | if t.is_adjunct: 217 | break 218 | else: 219 | if not new.is_complex: 220 | t = new 221 | # Now, where do we place the R relative to the $s? Let's say we have 222 | # R=(/PP) and $s=[(/NP), (\NP)] (where last will be added first) 223 | # We could redefine T to T\NP, so that we get 224 | # an argument cat of (T\NP)/PP)/NP. OR, we could keep T, and get 225 | # ((T/PP)\NP)/NP. They're equivalent, but the latter will get 226 | # non-standard cats. So what we must do is check whether the slashes 227 | # for the last dollar and the R disagree. If they do, we should redefine 228 | # T to append the last dollar, which is popped. 229 | if dollars and dollars[-1][1] == '\\' and func.argument.slash == '/': 230 | last_arg, last_slash, last_kwarg = dollars.pop() 231 | t = ccg.scat.add_arg(t, last_slash, last_arg, **last_kwarg) 232 | dollars.append((r, func.argument.slash, {})) 233 | dollars.reverse() 234 | functor = ccg.scat.type_raise(t, func.slash, r) 235 | argument = ccg.scat.add_args(t, dollars) 236 | return functor, argument 237 | 238 | 239 | 240 | 241 | def fapply(self, left, right): 242 | if not self._check_dir(left, '/'): 243 | return False, 0 244 | return self._application(left, right) 245 | 246 | def bapply(self, left, right): 247 | if not self._check_dir(right, '\\'): 248 | return False, 0 249 | return self._application(right, left) 250 | 251 | def fcomp(self, left, right): # Don't do general for now 252 | if not self._check_dir(left, '/') or not self._check_dir(right, '/'): 253 | return False, 0 254 | return self._composition(left, right) 255 | 256 | def bcomp(self, left, right): 257 | if not self._check_dir(left, '\\') or not self._check_dir(right, '\\'): 258 | return False, 0 259 | return self._composition(right, left) 260 | 261 | def fxcomp(self, left, right): 262 | if not left.is_complex or not right.is_complex: 263 | return False, 0 264 | if not self._check_dir(left, '/'): 265 | return False, 0 266 | return self._composition(left, right, crossing=True) 267 | 268 | def bxcomp(self, left, right): 269 | if not left.is_complex and self._check_dir(right, '\\'): 270 | return False, 0 271 | return self._composition(right, left, crossing=True) 272 | 273 | def add_conj(self, left, right): 274 | """ 275 | Multi-variables for conj is so far a failure. Make conjuncted 276 | constituents headed by the conjunction 277 | """ 278 | if left != ccg.CONJ or right.conj: 279 | return False, 0 280 | return self._do_add_conj(left, right) 281 | 282 | def _do_add_conj(self, left, right): 283 | # This should take care of variable binding too 284 | scat = ccg.scat.change_kwarg(right, conj=True) 285 | return scat, 0 286 | 287 | def comma_conj(self, left, right): 288 | if left != ccg.COMMA and left != ccg.SEMI_COLON and left != ccg.COLON: 289 | return False, 0 290 | return self._do_add_conj(left, right) 291 | 292 | def do_conj(self, left, right): 293 | if not right.conj: 294 | return False, 0 295 | if left.conj: 296 | return False, 0 297 | new_right = ccg.scat.change_kwarg(right, conj=False) 298 | if not new_right.exact_eq(left): 299 | return False, 0 300 | for path, right_cat in new_right.cats.items(): 301 | if right_cat.var > 0: 302 | new_right.unify_globals_at_var(left, right_cat.var, 303 | left.cats[path].var) 304 | for var in left.get_vars(): 305 | new_right.add_var(0, var) 306 | return new_right, 0 307 | 308 | def left_punct(self, left, right): 309 | if not left.is_punct: 310 | return False, 0 311 | return right, 0 312 | 313 | def right_punct(self, left, right): 314 | if not right.is_punct: 315 | return False, 0 316 | return left, 0 317 | 318 | def traise(self, parent, child): 319 | # Type raising 320 | if not parent.is_complex: 321 | return False, 0 322 | if not parent.argument.is_complex: 323 | return False, 0 324 | if not parent.result.exact_eq(parent.argument.result): 325 | return False, 0 326 | if not parent.argument.argument.exact_eq(child): 327 | return False, 0 328 | result = ccg.scat.type_raise(parent.result, parent.slash, child) 329 | result.bind_vars(child, result.argument.argument, child.category) 330 | return result, 0 331 | 332 | def unary(self, parent, child): 333 | key = (parent.string, child.string) 334 | if key not in TypeChanging.rules: 335 | return False, 0 336 | else: 337 | result = ccg.scat.SuperCat(parent.category) 338 | bindings = TypeChanging.rules[key] 339 | for parent_var, child_var in bindings: 340 | try: 341 | result.unify_globals_at_var(child, parent_var, child_var) 342 | except KeyError: 343 | raise 344 | return result, 0 345 | 346 | def binary(self, left, right, parent): 347 | key = (parent.string, left.string, right.string) 348 | if key not in BinaryTypeChanging.rules: 349 | return False, 0 350 | else: 351 | bindings = BinaryTypeChanging.rules[key] 352 | result = ccg.scat.SuperCat(parent.category) 353 | for parent_var, left_var, right_var in bindings: 354 | if left_var is None: 355 | assert right_var is not None 356 | try: 357 | result.unify_globals_at_var( 358 | right, parent_var, right_var) 359 | except KeyError: 360 | raise 361 | elif right_var is None: 362 | assert left_var is not None 363 | try: 364 | result.unify_globals_at_var(left, parent_var, left_var) 365 | except KeyError: 366 | raise 367 | else: 368 | raise Exception 369 | return result, 0 370 | 371 | def _application(self, functor, argument): 372 | if functor.conj or argument.conj: 373 | return False, 0 374 | if functor.argument != argument: 375 | return False, 0 376 | has_bound = functor.bind_vars(argument, functor.argument, argument.category) 377 | if not has_bound: 378 | return False, 0 379 | result = functor.result 380 | c1_to_c2, c2_to_c1 = self._var_to_feats(functor.argument, argument) 381 | result, var_map = minimise_vars(result, c1_to_c2) 382 | self._x = functor.result 383 | self._y = argument 384 | result_scat = ccg.scat.SuperCat(result) 385 | functor.bind_vars(result_scat, functor.result, result_scat.category) 386 | return result_scat, 0 387 | 388 | def _composition(self, functor, arg, crossing = False): 389 | if functor.conj or arg.conj: 390 | return False, 0 391 | if not functor.is_complex or not arg.is_complex: 392 | return False, 0 393 | depth = 0 394 | # X/Y (Y/Z_1)/Z_2 etc 395 | x_y = functor.argument 396 | self._x = functor.result 397 | yz = arg 398 | zs = [] 399 | 400 | while depth < MAX_COMP_DEPTH and yz.is_complex: 401 | zs.append((yz.argument, yz.slash, yz.kwargs.copy())) 402 | if yz.result != x_y: 403 | yz = yz.result 404 | depth += 1 405 | else: 406 | self._y = yz.result 407 | break 408 | else: 409 | return False, 0 410 | 411 | # For non-crossing composition, the slashes must be consistent. 412 | # For crossing composition, they must be inconsistent. 413 | if all(s == functor.slash for (arg, s, k) in zs) == crossing: 414 | return False, 0 415 | functor.bind_vars(arg, x_y, self._y) 416 | max_var = max(functor.cats_by_var) + 1 417 | arg_to_final = self._map_vars(x_y, yz.result, max_var, arg.cats_by_var) 418 | curr_cat = functor.result 419 | for z, slash, kwargs in reversed(zs): 420 | z = remap_vars(z, arg_to_final) 421 | kwargs['var'] = arg_to_final.get(kwargs.get('var', 99), max_var) 422 | curr_cat = Category(curr_cat, slash, z, **kwargs) 423 | 424 | c1_to_c2, c2_to_c1 = self._var_to_feats(functor.argument, yz.result) 425 | c1_to_c2.update(c2_to_c1) 426 | result, var_map = minimise_vars(curr_cat, c1_to_c2) 427 | 428 | # Bind the global variables 429 | scat = ccg.scat.SuperCat(result) 430 | 431 | # Take outer var from arg 432 | scat.unify_globals_at_var(arg, 0) 433 | 434 | arg_res = arg 435 | for result, z, _, _ in scat.deconstruct(): 436 | scat.bind_vars(arg, z, arg_res.argument) 437 | if result == functor.result: 438 | scat.bind_vars(functor, result, functor.result) 439 | break 440 | arg_res = arg_res.result 441 | return scat, depth 442 | 443 | def _check_dir(self, cat, slash): 444 | if not cat.is_complex: 445 | return False 446 | if cat.slash != slash: 447 | return False 448 | return True 449 | 450 | def _map_vars(self, func_u, arg_u, next_var, arg_vars): 451 | # Map variables from argument to functor 452 | arg_to_final = {} 453 | for path, acat in arg_u.cats.items(): 454 | fcat = func_u.cats[path] 455 | arg_to_final[acat.var] = fcat.var 456 | for var in arg_vars: 457 | if var not in arg_to_final: 458 | arg_to_final[var] = next_var 459 | next_var += 1 460 | return arg_to_final 461 | 462 | def _var_to_feats(self, cat1, cat2): 463 | """ 464 | Map feature variables to feature values for the unified pieces 465 | """ 466 | c1_to_c2 = {} 467 | c2_to_c1 = {} 468 | for path, sub1 in cat1.cats.items(): 469 | sub2 = cat2.cats[path] 470 | if sub1.feat_var and sub2.feature: 471 | c1_to_c2[sub1.feat_var] = sub2.feature 472 | elif sub2.feat_var and sub1.feature: 473 | c2_to_c1[sub2.feat_var] = sub1.feature 474 | return c1_to_c2, c2_to_c1 475 | 476 | class TypeChanging(object): 477 | # Note which variables to bind 478 | rules = { 479 | ('NP', 'N'): [(0, 0)], 480 | ('NP\NP', 'S[dcl]\NP'): [(0, 0), (1, 1)], 481 | ('NP\NP', 'S[pss]\NP'): [(0, 0), (1, 1)], 482 | ('NP\NP', 'S[adj]\NP'): [(0, 0), (1, 1)], 483 | ('NP\NP', 'S[ng]\NP'): [(0, 0), (1, 1)], 484 | ('NP\NP', 'S[to]\NP'): [(0, 0), (1, 1)], 485 | ('N\N', 'S[pss]\NP'): [(0, 0), (1, 1)], 486 | ('N\N', 'S[ng]\NP'): [(0, 0), (1, 1)], 487 | ('N\N', 'S[adj]\NP'): [(0, 0), (1, 1)], 488 | ('N\N', 'S[dcl]/NP'): [(0, 0), (1, 1)], 489 | ('(S\NP)\(S\NP)', 'S\NP'): [(0, 0), (2, 1)], 490 | ('(S\NP)\(S\NP)', 'S[ng]\NP'): [(0, 0), (2, 1)], 491 | ('(S\NP)/(S\NP)', 'S\NP'): [(0, 0), (2, 1)], 492 | ('NP\NP', 'S[dcl]/NP'): [(0, 0), (1, 1)], 493 | ('NP', 'S\NP'): [(0, 0)], 494 | ('S/S', 'S\NP'): [(0, 0)], 495 | ('NP\NP', 'S'): [(0, 0)], 496 | ('S/S', 'S\NP'): [(0, 0)], 497 | ('S/S', 'S\NP'): [(0, 0)], 498 | ('NP/PP', 'N/PP'): [(0, 0), (1, 1)], 499 | ('(NP/PP)/PP', '(N/PP)/PP'): [(0, 0), (1, 1), (2, 2)], 500 | ('((NP/PP)/PP)/PP', '((N/PP)/PP)/PP'): [(0, 0), (1, 1), (2, 2), (3, 3)] 501 | } 502 | 503 | class BinaryTypeChanging(object): 504 | rules = { 505 | # For rebanking 506 | ('NP\NP', ',', 'S[pss]\NP'): [(0, None, 0), (1, None, 1)], 507 | ('NP\NP', ',', 'S[ng]\NP'): [(0, None, 0), (1, None, 1)], 508 | ('NP\NP', ',', 'S[adj]\NP'): [(0, None, 0), (1, None, 1)], 509 | ('NP\NP', ',', 'S[dcl]\NP'): [(0, None, 0), (1, None, 1)], 510 | ('NP\NP', ',', 'S[dcl]/NP'): [(0, None, 0), (1, None, 1)], 511 | ('S/S', 'S[dcl]/S[dcl]', ','): [(0, 0, None), (1, 1, None)], 512 | ('(S\NP)\(S\NP)', ',', 'NP'): [(0, None, 0)], 513 | ('(S\NP)/(S\NP)', 'S[dcl]/S[dcl]', ','): [(0, 0, None), (1, 1, None)], 514 | ('(S\NP)\(S\NP)', 'S[dcl]/S[dcl]', ','): [(0, 0, None), (1, 1, None)], 515 | ('S/S', 'NP', ','): [(0, 0, None)], 516 | ('S\S', 'S[dcl]/S[dcl]', ','): [(0, 0, None), (1, 1, None)], 517 | ('S/S', 'S[dcl]\S[dcl]', ','): [(0, 0, None), (1, 1, None)], 518 | ('S[adj]\NP[conj]', 'conj', 'PP'): [(0, None, 0)], 519 | ('S[adj]\NP[conj]', 'conj', 'NP'): [(0, None, 0)], 520 | ('NP[conj]', 'conj', 'S[adj]\NP'): [(0, None, 0)], 521 | ('S/S', 'S[dcl]', ','): [(0, 0, None)], 522 | ('(S\NP)/(S\NP)', 'S[dcl]\S[dcl]', ','): [(0, 0, None), (1, 1, None)], 523 | ('NP\NP', 'S[dcl]/S[dcl]', ','): [(0, 0, None), (1, 1, None)], 524 | ('S[adj]\NP[conj]', 'conj', 'S[ng]\NP'): [(0, None, 0), (1, None, 1)], 525 | ('(S\NP)\(S\NP)', 'S[dcl]\S[dcl]', ','): [(0, 0, None), (1, 1, None)], 526 | ('S[pss]\NP[conj]', 'conj', 'S[ng]\NP'): [(0, None, 0), (1, None, 1)] 527 | } 528 | 529 | 530 | def fapply(left, right): 531 | return Production(left, right, rule='fapply').result 532 | 533 | def bapply(left, right): 534 | return Production(left, right, rule='bapply').result 535 | 536 | def fcomp(left, right): 537 | return Production(left, right, rule='fcomp').result 538 | 539 | def bcomp(left, right): 540 | return Production(left, right, rule='bcomp').result 541 | 542 | def fxcomp(left, right): 543 | return Production(left, right, rule='fxcomp').result 544 | 545 | def bxcomp(left, right): 546 | return Production(left, right, rule='bxcomp').result 547 | 548 | def add_conj(left, right): 549 | return Production(left, right, rule='add_conj').result 550 | 551 | def do_conj(left, right): 552 | return Production(left, right, rule='do_conj').result 553 | 554 | def comma_conj(left, right): 555 | return Production(left, right, rule='comma_conj').result 556 | 557 | def left_punct(left, right): 558 | return Production(left, right, rule='left_punct').result 559 | 560 | def right_punct(left, right): 561 | return Production(left, right, rule='right_punct').result 562 | 563 | def traise(left, parent): 564 | return Production(left, None, parent).result 565 | 566 | def binary(left, right, parent): 567 | return Production(left, right, parent).result 568 | 569 | def minimise_vars(cat, fvars, seen_vars = None, fvar_freqs = None): 570 | def _kwargs(cat): 571 | # nonlocal seen_vars, feat_vars, feat_vars 572 | kwargs = cat.kwargs.copy() 573 | kwargs['var'] = seen_vars[cat.var] 574 | kwargs['arg_idx'] = '' 575 | if cat.feat_var in fvars: 576 | kwargs['feat_var'] = '' 577 | kwargs['feature'] = fvars[cat.feat_var] 578 | elif fvar_freqs[cat.feat_var] == 1: 579 | kwargs['feat_var'] = '' 580 | return kwargs 581 | 582 | # Return the cat unchanged if there are no gaps in the vars 583 | # and we do not have a variable map, and last var is head 584 | if not seen_vars and not fvars and cat.var == 0: 585 | vars = cat.cats_by_var.keys() 586 | if len(vars) == max(vars) + 1: 587 | return cat, {} 588 | 589 | if seen_vars is None: 590 | seen_vars = defaultdict(lambda: len(seen_vars)) 591 | seen_vars[cat.var] # Maps outer var to 0 592 | fvar_freqs = defaultdict(int) 593 | for c in cat.cats.values(): 594 | if c.feat_var and c.feat_var not in fvars: 595 | fvar_freqs[c.feat_var] += 1 596 | 597 | if not cat.is_complex: 598 | return Category(cat.cat, **_kwargs(cat)), seen_vars 599 | 600 | cats = [(p, c) for p, c in cat.cats.items() if not any(p)] 601 | cats.sort() 602 | cats.reverse() 603 | inner = cats.pop(0)[1] 604 | curr_cat = Category(inner.cat, **_kwargs(inner)) 605 | for path, cat in cats: 606 | if cat.argument.is_complex: 607 | arg, seen_vars = minimise_vars(cat.argument, fvars, seen_vars, 608 | fvar_freqs) 609 | else: 610 | arg = Category(cat.argument.cat, **_kwargs(cat.argument)) 611 | curr_cat = Category(curr_cat, cat.slash, arg, **_kwargs(cat)) 612 | return curr_cat, seen_vars 613 | 614 | def remap_vars(cat, var_map): 615 | if not var_map: 616 | return cat 617 | kwargs = cat.kwargs.copy() 618 | kwargs['var'] = var_map.get(cat.var, max(var_map.values()) + 1) 619 | if cat.is_complex: 620 | result = remap_vars(cat.result, var_map) 621 | argument = remap_vars(cat.argument, var_map) 622 | return Category(result, cat.slash, argument, **kwargs) 623 | else: 624 | try: 625 | return Category(cat.cat, **kwargs) 626 | except: 627 | print cat.cat 628 | print kwargs 629 | raise 630 | 631 | def strip_features(cat): 632 | def next_var(): 633 | if not feat_map: 634 | return '[X]' 635 | else: 636 | return VARS[len(feat_map.keys())] 637 | feat_map = {} 638 | for c in cat.cats.values(): 639 | if c.feature and c.feature not in feat_map: 640 | feat_map[c.feature] = next_var() 641 | assert not (c.feat_var and c.feat_var in feat_map) 642 | return feats_to_vars(cat, feat_map) 643 | 644 | def feats_to_vars(cat, feat_map): 645 | if not feat_map: 646 | return cat 647 | kwargs = cat.kwargs.copy() 648 | if cat.feature and cat.feature != '[adj]': 649 | kwargs['feat_var'] = feat_map[cat.feature] 650 | kwargs['feature'] = '' 651 | if cat.is_complex: 652 | result = feats_to_vars(cat.result, feat_map) 653 | argument = feats_to_vars(cat.argument, feat_map) 654 | return ccg.scat.SuperCat(ccg.category.Category(result, cat.slash, 655 | argument, **kwargs)) 656 | else: 657 | return ccg.scat.SuperCat(ccg.category.Category(cat.cat, **kwargs)) 658 | 659 | -------------------------------------------------------------------------------- /Treebank/CCGbank/_Production.py: -------------------------------------------------------------------------------- 1 | """ 2 | Update a subtree to reflect its parent category 3 | 4 | We do this by detecting the type of production 5 | 6 | Unary 7 | Unlicensed -- Does not conform to any ccg rule 8 | Adjunction -- The functor category is an adjunct 9 | Auxiliary -- The functor category is an auxiliary 10 | Application -- The functor category is a predicate applying to the argument 11 | Composition, result change -- The result cat of the parent has changed 12 | Composition, argument change -- The argument structure of the parent has changed 13 | 14 | The detection algorithm also finds the functor category (the category whose 15 | result is preserved in the case of composition) 16 | """ 17 | import ccg.category 18 | from copy import deepcopy as dcopy 19 | import re 20 | class _Production(object): 21 | """ 22 | Represents a production rule, of a given type. The parent can then be 23 | replaced and the changes reflected in the children, in a way that's 24 | customised according to the rule type 25 | """ 26 | def __init__(self, left, right, parent, functorPos = None): 27 | self.left = ccg.category.from_string(str(left)) 28 | self.right = ccg.category.from_string(str(right)) 29 | self.parent = dcopy(parent) 30 | if functorPos != None: 31 | self._functor = functorPos 32 | else: 33 | self._functor = self.findFunctor(left, right, parent) 34 | self.unify() 35 | 36 | def unify(self): 37 | pass 38 | 39 | def _getFunctor(self): 40 | if self._functor == 0: 41 | return self.left 42 | else: 43 | return self.right 44 | 45 | def _setFunctor(self, functor): 46 | if self._functor == 0: 47 | self.left = functor 48 | else: 49 | self.right = functor 50 | 51 | def _getArg(self): 52 | if self._functor == 1: 53 | return self.left 54 | else: 55 | return self.right 56 | 57 | def _setArg(self, arg): 58 | if self._functor == 1: 59 | self.left = arg 60 | else: 61 | self.right = arg 62 | 63 | def replaceResult(self, cat, newResult): 64 | argument = dcopy(cat.argument) 65 | slash = cat.slash 66 | conj = cat.conj 67 | result = dcopy(newResult) 68 | return ccg.ComplexCategory(result, argument, slash, conj) 69 | 70 | def addArgs(self, cat, args): 71 | cat = dcopy(cat) 72 | args.reverse() 73 | for arg, slash, morph in args: 74 | cat = ccg.ComplexCategory(cat, arg, slash, False) 75 | cat.morph = dcopy(morph) 76 | return cat 77 | 78 | def __str__(self): 79 | return "%s %s --> %s %s" % (self.label, self.parent, self.left, self.right) 80 | 81 | def _removeFeatures(self, cat, removeFeat): 82 | if cat.isComplex(): 83 | cat.morph = None 84 | for result, argument, slash, morph in cat.deconstruct(): 85 | self._removeFeatures(result, removeFeat) 86 | self._removeFeatures(argument, removeFeat) 87 | else: 88 | if removeFeat: 89 | cat.feature = '' 90 | cat.morph = None 91 | 92 | @property 93 | def head(self): 94 | return self._getHead() 95 | 96 | 97 | functor = property(_getFunctor, _setFunctor) 98 | argument = property(_getArg, _setArg) 99 | 100 | class Application(_Production): 101 | label = 'a' 102 | def findFunctor(left, right, parent): 103 | candidates = [(left, '/', right, 0), (right, '\\', left, 1)] 104 | for functor, slash, arg, position in candidates: 105 | if functor.isComplex() \ 106 | and functor.slash == slash \ 107 | and functor.argument == arg \ 108 | and functor.result == parent \ 109 | and not functor.isAdjunct(): 110 | return position 111 | else: 112 | return -1 113 | 114 | def unify(self): 115 | self.functor.argument.unify(self.argument) 116 | 117 | findFunctor = staticmethod(findFunctor) 118 | 119 | def replace(self, new): 120 | self.functor = self.replaceResult(self.functor, new) 121 | return self.left, self.right 122 | 123 | def _getHead(self): 124 | return self.functor 125 | 126 | class TRaiseApplication(Application): 127 | label = 'tra' 128 | def findFunctor(left, right, parent): 129 | direction = Application.findFunctor(left, right, parent) 130 | if direction == -1: 131 | return direction 132 | else: 133 | functor = [left, right][direction] 134 | if functor.isTypeRaise(): 135 | return direction 136 | else: 137 | return -1 138 | findFunctor = staticmethod(findFunctor) 139 | 140 | def unify(self): 141 | pass 142 | 143 | def replace(self, new): 144 | x = dcopy(self.argument.argument) 145 | y = dcopy(new) 146 | featLessY = dcopy(y) 147 | self._removeFeatures(featLessY, True) 148 | if self._functor == 0: 149 | innerSlash = '\\' 150 | outerSlash = '/' 151 | else: 152 | innerSlash = '/' 153 | outerSlash = '\\' 154 | tRaiseArgStr = r'(%s%s%s)' % (featLessY.strAsPiece(), innerSlash, x.strAsPiece()) 155 | catStr = '%s%s%s' % (featLessY.strAsPiece(), outerSlash, tRaiseArgStr) 156 | self.functor = ccg.category.from_string(catStr) 157 | self.argument = self.addArgs(dcopy(y), [(dcopy(x), innerSlash, None)]) 158 | return self.left, self.right 159 | 160 | def _getHead(self): 161 | return self.argument 162 | 163 | 164 | 165 | 166 | 167 | class Composition(_Production): 168 | label = 'c' 169 | def findFunctor(left, right, parent): 170 | """ 171 | Composition is of the form X/Y Y/$ -> X|$ or Y|$ X\Y 172 | 173 | We call the X|Y category the functor. 174 | """ 175 | if (not left.isComplex()) or (not right.isComplex()): 176 | return -1 177 | if left.conj or right.conj: 178 | return -1 179 | oLeft = left 180 | left = dcopy(left) 181 | right = dcopy(right) 182 | if right.slash == '\\': 183 | for result, argument, slash, morph in left.deconstruct(): 184 | if right.argument.unify(result): 185 | return 1 186 | elif left.slash == '/' and right.slash == '/': 187 | for result, argument, slash, morph in right.deconstruct(): 188 | if left.argument.unify(result) and slash == '/': 189 | return 0 190 | return -1 191 | 192 | findFunctor = staticmethod(findFunctor) 193 | 194 | def unify(self): 195 | yCat = self.functor.argument 196 | for potentialY, argument, slash, morph in self.argument.deconstruct(): 197 | if yCat.unify(potentialY): 198 | break 199 | else: 200 | raise StandardError 201 | self._y = yCat 202 | self._x = self.functor.result 203 | 204 | 205 | 206 | def replace(self, new, xCat = None): 207 | """ 208 | Get the X and $ components of the new category, and change children 209 | accordingly. Note that because the Y element is not represented in the 210 | parent, this must be invariant. So we're going to be replacing the result 211 | of the functor, and/or the $ of the arguments. 212 | """ 213 | # If new is atomic, we really can't do much composing 214 | # Make it apply into the new category instead 215 | if not new.isComplex(): 216 | if self._functor == 0: 217 | slash = '/' 218 | else: 219 | slash = '\\' 220 | newFunc = ccg.ComplexCategory(dcopy(new), dcopy(self.argument), slash, False) 221 | self.functor = newFunc 222 | return self.left, self.right 223 | # If no X supplied, try the old X 224 | if not xCat: 225 | xCat = dcopy(self._x) 226 | # Take a copy of new so that unification doesn't mess things up 227 | new = dcopy(new) 228 | oldResults = self._getResultArgs(self.parent) 229 | dollarCats = [] 230 | # Handle general comp 231 | for result, argument, slash, morph in new.deconstruct(): 232 | dollarCats.append((argument, slash, morph)) 233 | # So pass first round 234 | # Unify to pass features and morph 235 | if xCat.unify(result): 236 | break 237 | else: 238 | # Otherwise, make it non-generalised composition 239 | xCat = dcopy(new.result) 240 | dollarCats = [(new.argument, new.slash, new.morph)] 241 | functor = self.functor 242 | 243 | self.functor = self.addArgs(xCat, [(functor.argument, functor.slash, functor.morph)]) 244 | self.argument = self.addArgs(functor.argument, dollarCats) 245 | return (self.left, self.right) 246 | 247 | 248 | def _getResultArgs(category): 249 | """ 250 | Pair a result with the arguments up to that point 251 | """ 252 | results = {} 253 | args = [] 254 | for result, argument, slash, morph in category.deconstruct(): 255 | args.append((argument, slash, morph)) 256 | # Use featureless because we don't want to deal with feature passing, eg 257 | # S[dcl]/NP --> S/(S\NP) (S[dcl]\NP)/NP 258 | results[result] = list(args) 259 | 260 | return results 261 | 262 | _getResultArgs = staticmethod(_getResultArgs) 263 | 264 | def _getHead(self): 265 | # Follow the C&C parser's wrong policy of always left heading 266 | return self.left 267 | 268 | 269 | class Determination(Application, Composition): 270 | """ 271 | Purely for the infuriating NP -> NP[nb]/N N rule 272 | """ 273 | label = 't' 274 | def findFunctor(left, right, parent): 275 | if str(left) == 'NP[nb]/N' and str(right) == 'N' and str(parent) == 'NP': 276 | return 0 277 | elif left.isComplex() and not left.isAdjunct() and not ccg.isIdentical(left.innerResult(), ccg.N) and ccg.isIdentical(left.argument, ccg.N) and ccg.isIdentical(ccg.N, right): 278 | return 0 279 | else: 280 | return -1 281 | 282 | def replace(self, new): 283 | if ccg.isIdentical(new, ccg.NP): 284 | self.functor = ccg.category.from_string('NP[nb]/N') 285 | return self.left, self.right 286 | # If possible, prefer to use composition than to stick args onto the NP 287 | if new.innerResult() == 'NP': 288 | self._x = self.functor.result 289 | Composition.replace(self, new) 290 | else: 291 | Application.replace(self, new) 292 | 293 | findFunctor = staticmethod(findFunctor) 294 | 295 | def _getHead(self): 296 | return self.argument 297 | 298 | class AdjunctDetermination(Application): 299 | """ 300 | Handle cases like ((S\NP)\(S\NP))/N 301 | """ 302 | label = 'jt' 303 | 304 | def findFunctor(left, right, parent): 305 | if parent.isAdjunct() and left.result == parent and right == ccg.N: 306 | return 0 307 | else: 308 | return -1 309 | 310 | findFunctor = staticmethod(findFunctor) 311 | 312 | def replace(self, new): 313 | if ccg.isIdentical(new, ccg.NP): 314 | self.functor = ccg.category.from_string(r'NP[nb]/N') 315 | return self.left, self.right 316 | elif new.isAdjunct(): 317 | self.functor = ccg.addArg(new, self.right, '/') 318 | else: 319 | # Define this when I've got an example 320 | # wsj_0029.13 -- noisy, requires fixing 321 | self.functor = ccg.addArg(new, self.right, '/') 322 | #print self 323 | #print new 324 | #raise StandardError 325 | 326 | def _getHead(self): 327 | return self.argument 328 | 329 | 330 | 331 | class TRaiseComp(Composition): 332 | """ 333 | Composition between a type raised category and an argument 334 | """ 335 | label = 'r' 336 | def findFunctor(left, right, parent): 337 | answer = Composition.findFunctor(left, right, parent) 338 | if answer == 0: 339 | if TRaiseComp.isTypeRaise(left, '/', '\\'): 340 | return 0 341 | if answer == 1: 342 | if TRaiseComp.isTypeRaise(right, '\\', '/'): 343 | return 1 344 | return -1 345 | 346 | def replace(self, new): 347 | left, right = Composition.replace(self, new) 348 | self._removeFeatures(self.functor, True) 349 | if new.isAdjunct(): 350 | # To produce an adjunct we can't have features on the argument 351 | self._removeFeatures(self.argument, True) 352 | 353 | def isTypeRaise(cat, slash1, slash2): 354 | if cat.isComplex() and cat.slash == slash1 and cat.argument.isComplex() and cat.argument.slash == slash2: 355 | if ccg.isIdentical(cat.result, cat.argument.result): 356 | return True 357 | return False 358 | 359 | findFunctor = staticmethod(findFunctor) 360 | isTypeRaise = staticmethod(isTypeRaise) 361 | 362 | def _getHead(self): 363 | # Cover cases like N/(N\N) (N\N)/N 364 | if self.left.isTypeRaise() and self.right.isTypeRaise(): 365 | return self.left 366 | else: 367 | return self.argument 368 | 369 | 370 | class Adjunction(_Production): 371 | """ 372 | Adjunction is of the form X/X X$ -> X$ or X$ X\X -> X$ 373 | """ 374 | label = 'd' 375 | def findFunctor(left, right, parent): 376 | if left.conj or right.conj: 377 | return -1 378 | candidates = [(left, right, 0), (right, left, 1)] 379 | for adjunct, head, position in candidates: 380 | if adjunct.isAdjunct() and head == parent: 381 | # Test that adjunct applies 382 | # Guards against composition cases like (S[b]\NP)/NP NP/NP 383 | adjArg = adjunct.argument 384 | if adjArg == head: 385 | return position 386 | for result, argument, slash, morph in head.deconstruct(): 387 | if adjArg == result: 388 | return position 389 | else: 390 | return -1 391 | 392 | findFunctor = staticmethod(findFunctor) 393 | 394 | def replace(self, new, forceApp = False): 395 | global complexAdj 396 | 397 | # Under forceApp, composition is disallowed -- so the argument, 398 | # functor components and parent must all be identical 399 | if forceApp: 400 | self.argument = dcopy(new) 401 | self.functor = ccg.ComplexCategory(dcopy(new), dcopy(new), self.functor.slash, False) 402 | return None 403 | args = [] 404 | if not new.isComplex(): 405 | x = new 406 | elif new.isAdjunct(): 407 | x = new 408 | # The complexAdj flag indicates whether to replicate CCGbank analysis and use 409 | # (S\NP)|(S\NP) adjuncts. Otherwise the natural thing is S\S adjuncts 410 | elif complexAdj and ccg.VP == new: 411 | x = new 412 | # For parser compatibility, don't backwards compose into NP 413 | elif self._functor == 1 and new.innerResult() != 'S': 414 | x = new 415 | else: 416 | lastCat = new 417 | lastArgs = [] 418 | # Select either: an adjunct, S\NP, or an atom 419 | for result, argument, slash, morph in new.deconstruct(): 420 | args.append((argument, slash, morph)) 421 | # Ensure the slash directions work 422 | # If the functor's to the left, cannot cross-compose into a backslash -- unless not complexAdj!s 423 | if complexAdj and self._functor == 0 and slash == '\\': 424 | continue 425 | # Don't back-cross compose into non-S 426 | if self._functor == 1 and slash == '/' and result.innerResult() != 'S': 427 | continue 428 | # Taken this out for the (S[dcl]\S[dcl])\NP S\S test case, but might need it again 429 | if result.isAdjunct(): 430 | x = result 431 | break 432 | elif complexAdj and result == r'S\NP': 433 | x = result 434 | break 435 | elif not result.isComplex(): 436 | x = result 437 | break 438 | # In case the slashes don't work out for a while (ie cross composition), store the last valid 439 | # place to compose into, and its arg set 440 | lastCat = result 441 | lastArgs = list(args) 442 | else: 443 | x = lastCat 444 | args = lastArgs 445 | x = dcopy(x) 446 | newArg = self.addArgs(x, args) 447 | # If the new label has a morph category, add it to the argument, as per wsj_1057.57 448 | if new == newArg: 449 | newArg.morph = dcopy(new.morph) 450 | # If we currently have features on the old X, don't remove them 451 | # This can cause entropy reduction, as per wsj_1824.28 452 | oldX = self.functor.argument 453 | if x.morphLess() == oldX.morphLess(): 454 | removeFeat = False 455 | else: 456 | removeFeat = True 457 | self._removeFeatures(x, removeFeat) 458 | newFunctor = ccg.ComplexCategory(x, x, self.functor.slash, False) 459 | self.functor = newFunctor 460 | self.argument = newArg 461 | 462 | def _getHead(self): 463 | return self.argument 464 | 465 | class AdjComp(_Production): 466 | """ 467 | Composition of adjuncts of the form X|X X|X -> X|X 468 | """ 469 | label = 'o' 470 | def findFunctor(left, right, parent): 471 | if left.conj or right.conj or parent.conj: 472 | return -1 473 | if left.isAdjunct() and right.isAdjunct() and parent.isAdjunct(): 474 | if left.argument == right.argument == parent.argument: 475 | if right.slash == '\\': 476 | return 1 477 | elif left.slash == '/': 478 | return 0 479 | return -1 480 | 481 | def replace(self, new): 482 | if new.isAdjunct(): 483 | x = new.result 484 | functor = ccg.ComplexCategory(dcopy(x), dcopy(x), self.functor.slash, False) 485 | argument = ccg.ComplexCategory(dcopy(x), dcopy(x), self.argument.slash, False) 486 | self.functor = functor 487 | self.argument = argument 488 | else: 489 | x = new 490 | functor = ccg.ComplexCategory(dcopy(x), dcopy(x), self.functor.slash, False) 491 | argument = dcopy(x) 492 | self.functor = functor 493 | self.argument = argument 494 | 495 | findFunctor = staticmethod(findFunctor) 496 | 497 | def _getHead(self): 498 | # Doesn't matter, but follow C&C parser's incorrect left head policy 499 | return self.left 500 | 501 | class AddConj(_Production): 502 | label = 'n' 503 | def findFunctor(left, right, parent): 504 | if not parent.conj: 505 | return -1 506 | if left.conj or right.conj: 507 | return -1 508 | elif left == ccg.conj: 509 | functor = 0 510 | arg = right 511 | elif right == ccg.conj: 512 | functor = 1 513 | arg = left 514 | elif left.isPunct(): 515 | functor = 0 516 | arg = right 517 | elif right.isPunct(): 518 | functor = 1 519 | arg = left 520 | else: 521 | return -1 522 | parentStr = str(parent).replace('[conj]', '') 523 | if str(arg) != parentStr: 524 | return -1 525 | else: 526 | return functor 527 | 528 | findFunctor = staticmethod(findFunctor) 529 | 530 | def replace(self, new): 531 | self.argument = dcopy(new) 532 | self.argument.conj = False 533 | 534 | def _getHead(self): 535 | return self.argument 536 | 537 | class Conjunction(_Production): 538 | label = 'j' 539 | def findFunctor(left, right, parent): 540 | if parent.conj: 541 | return -1 542 | elif left.conj: 543 | return 0 544 | elif right.conj: 545 | return 1 546 | else: 547 | return -1 548 | 549 | findFunctor = staticmethod(findFunctor) 550 | 551 | def replace(self, new): 552 | functor = dcopy(new) 553 | functor.conj = True 554 | self.functor = functor 555 | self.argument = dcopy(new) 556 | 557 | def _getHead(self): 558 | # Follow CCGbank in heading to the left 559 | return self.left 560 | 561 | class Auxiliary(_Production): 562 | label = 'x' 563 | def findFunctor(left, right, parent): 564 | # Recent change, might break something 565 | if left.isTrueAux(): 566 | return 0 567 | elif right.isTrueAux(): 568 | return 1 569 | else: 570 | return -1 571 | 572 | findFunctor = staticmethod(findFunctor) 573 | 574 | def replace(self, new): 575 | # Wrong: need feature from functor result 576 | self.argument = dcopy(new) 577 | 578 | def _getHead(self): 579 | # Depart from CCGbank in taking the argument 580 | return self.argument 581 | 582 | class Punctuation(_Production): 583 | label = 'p' 584 | def findFunctor(left, right, parent): 585 | """ 586 | Functor is the punctuation symbol. Ensure this isn't conjunctive punctuation. 587 | """ 588 | if left.isPunct(): 589 | if not ccg.isIdentical(right, parent): 590 | return -1 591 | else: 592 | return 0 593 | elif right.isPunct(): 594 | if not ccg.isIdentical(left, parent): 595 | return -1 596 | else: 597 | return 1 598 | return -1 599 | findFunctor = staticmethod(findFunctor) 600 | 601 | def replace(self, new): 602 | self.argument = dcopy(new) 603 | 604 | def _getHead(self): 605 | return self.argument 606 | 607 | class UnHat(_Production): 608 | label = 'h' 609 | 610 | def findFunctor(left, right, parent): 611 | if right: 612 | return -1 613 | if not ccg.isIdentical(parent, left.morph): 614 | return -1 615 | return 0 616 | 617 | findFunctor = staticmethod(findFunctor) 618 | 619 | def replace(self, new): 620 | self.functor = self.functor 621 | self.functor.morph = dcopy(new) 622 | 623 | 624 | 625 | 626 | class Unary(_Production): 627 | label = 'u' 628 | def findFunctor(left, right, parent): 629 | if not right: 630 | return 0 631 | else: 632 | return -1 633 | 634 | findFunctor = staticmethod(findFunctor) 635 | 636 | 637 | def replace(self, new): 638 | pass 639 | 640 | def _getHead(self): 641 | return self.left 642 | 643 | 644 | 645 | class Invalid(_Production): 646 | label = 'i' 647 | def findFunctor(left, right, parent): 648 | """ 649 | Functor is arbitrarily the left side, unless involves conj or punct 650 | 651 | This is order dependent on the other rules, as we do not want to check that the production is 652 | in fact invalid. 653 | """ 654 | if right == ccg.conj or right.isPunct(): 655 | return 1 656 | else: 657 | return 0 658 | 659 | 660 | findFunctor = staticmethod(findFunctor) 661 | 662 | def replace(self, new): 663 | """ 664 | Conj and punct cases are particularly common for invalid, so 665 | percolate the new label down for these 666 | """ 667 | if self.functor == ccg.conj or self.functor.isPunct(): 668 | # Provide exception case for transformation punctuation 669 | if self.functor == ',' and new.isAdjunct(): 670 | return None 671 | self.argument = dcopy(new) 672 | self.argument.conj = None 673 | else: 674 | pass 675 | 676 | def _getHead(self): 677 | return self.functor 678 | 679 | def Production(left, right, parent): 680 | """ 681 | Allocate a Production 682 | """ 683 | binary = [AdjunctDetermination, 684 | Determination, 685 | TRaiseApplication, 686 | Application, 687 | AdjComp, 688 | Adjunction, 689 | TRaiseComp, 690 | Composition, 691 | Conjunction, 692 | AddConj, 693 | Punctuation, 694 | Invalid] 695 | unary = [UnHat, Unary] 696 | productions = binary if right else unary 697 | for productionClass in productions: 698 | functorPos = productionClass.findFunctor(left, right, parent) 699 | if functorPos != -1: 700 | return productionClass(left, right, parent, functorPos) 701 | 702 | 703 | 704 | def testLabels(): 705 | """ 706 | Check production type assignment against manually annotated production rules 707 | """ 708 | location = '/home/mhonn/code/mhonn/Treebank/CCGBank/productionTypes.txt' 709 | for label, parent, left, right, line in readProductions(location): 710 | if label == 'x': 711 | continue 712 | if not label: 713 | continue 714 | if label and not right and label != 'u': 715 | print line 716 | raise StandardError 717 | if right: 718 | production = Production(left, right, parent) 719 | if production.label != label: 720 | print "Incorrect: %s" % production.label 721 | print line 722 | raise StandardError 723 | else: 724 | print "Correct! %s" % line 725 | 726 | def testReplacements(): 727 | location = '/home/mhonn/code/mhonn/Treebank/CCGBank/productionTypes.txt' 728 | for label, parent, left, right, line in readProductions(location): 729 | # if line != 'd 20352 # S[dcl]\NP --> S[dcl]\NP (S\NP)\(S\NP)': 730 | # continue 731 | # if label == 'x': continue 732 | if not label: 733 | break 734 | if right: 735 | production = Production(left, right, parent) 736 | production.replace(parent) 737 | if label != production.label: 738 | print line 739 | print production 740 | if not ccg.isIdentical(production.left, left) or not ccg.isIdentical(production.right, right): 741 | print line 742 | print production 743 | 744 | def readProductions(location): 745 | for line in open(location): 746 | line = line.strip() 747 | if not line: 748 | continue 749 | if line.startswith('#'): 750 | continue 751 | front, production = line.split(' # ') 752 | pieces = front.split() 753 | freq = int(pieces.pop()) 754 | if pieces: 755 | label = pieces[0] 756 | else: 757 | label = None 758 | parent, children = production.split(' --> ') 759 | parent = ccg.category.from_string(parent) 760 | pieces = children.split() 761 | left = ccg.category.from_string(pieces.pop(0)) 762 | if pieces: 763 | right = ccg.category.from_string(pieces[0]) 764 | else: 765 | right = None 766 | yield label, parent, left, right, line 767 | 768 | def setComplexAdj(value): 769 | global complexAdj 770 | complexAdj = value 771 | 772 | complexAdj = True 773 | if __name__ == '__main__': 774 | if 1: 775 | parent = ccg.category.from_string(r'NP\NP') 776 | cat1 = ccg.category.from_string(r'(S[dc]\NP)^(NP\NP)') 777 | cat2 = None 778 | production = Production(cat1, cat2, parent) 779 | print production 780 | production.replace(ccg.category.from_string(r'N\N')) 781 | print production.label 782 | print production.left 783 | print production.right 784 | if 0: 785 | cat1 = ccg.category.from_string('NP[nb]/N') 786 | cat2 = ccg.category.from_string('N') 787 | result = ccg.category.from_string('NP') 788 | production = Production(cat1, cat2, result) 789 | production.replace(ccg.category.from_string('NP/(S\NP)')) 790 | print production.label 791 | print production.left 792 | print production.right 793 | if 0: 794 | testReplacements() 795 | --------------------------------------------------------------------------------