├── Treebank
    ├── __init__.py
    └── CCGbank
    │   ├── Writers
    │       ├── __init__.py
    │       ├── _PargFileWriter.py
    │       └── _AutoFileWriter.py
    │   ├── __init__.py
    │   ├── profile_ccgbank.py
    │   ├── _CCGLeaf.py
    │   ├── _Printer.py
    │   ├── _Sentence.py
    │   ├── _Corpus.py
    │   ├── _File.py
    │   ├── _CCGFile.py
    │   ├── _Leaf.py
    │   ├── _CCGbank.py
    │   ├── _Node.py
    │   ├── _CCGSentence.py
    │   ├── _CCGNode.py
    │   └── _Production.py
├── ccg
    ├── grammar.py
    ├── __init__.py
    ├── lexicon.py
    ├── category.py
    ├── scat.py
    └── rules.py
├── README.md
└── tests
    ├── test_lexicon.py
    ├── test_unify.py
    ├── test_secondary.py
    ├── test_parse.py
    ├── test_rules.py
    └── test_replace.py


/Treebank/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/Writers/__init__.py:
--------------------------------------------------------------------------------
1 | from _AutoFileWriter import AutoFileWriter
2 | from _PargFileWriter import PargFileWriter
3 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/__init__.py:
--------------------------------------------------------------------------------
1 | from _CCGbank import CCGbank
2 | from _CCGFile import CCGFile
3 | from _CCGSentence import CCGSentence
4 | from _CCGNode import CCGNode
5 | from _CCGLeaf import CCGLeaf
6 | from _Production import Production, setComplexAdj
7 | 


--------------------------------------------------------------------------------
/ccg/grammar.py:
--------------------------------------------------------------------------------
 1 | def read(loc):
 2 |     productions = []
 3 |     for line in open(loc):
 4 |         if not line.strip():
 5 |             continue
 6 |         freq, production = line.strip().split(' # ')
 7 |         production = production.replace('[nb]', '')
 8 |         parent, children = production.split(' --> ')
 9 |         children = children.split()
10 |         left = children[0]
11 |         if left == '((S[b]\NP)/NP)/':
12 |             left = '(S[b]\NP)/NP'
13 |         if len(children) == 2:
14 |             right = children[1]
15 |         else:
16 |             right = None
17 |         productions.append((parent, left, right, int(freq)))
18 |     return productions
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/ccg/__init__.py:
--------------------------------------------------------------------------------
 1 | import ccg.category
 2 | 
 3 | def isIdentical(c1, c2):
 4 |     return c1.exact_eq(c2)
 5 | 
 6 | CONJ = ccg.category.from_string('conj')
 7 | conj = CONJ
 8 | COMMA = ccg.category.from_string(',{_}')
 9 | SEMI_COLON = ccg.category.from_string(';{_}')
10 | COLON = ccg.category.from_string(':{_}')
11 | N = ccg.category.from_string('N')
12 | NP = ccg.category.from_string('NP')
13 | VP = ccg.category.from_string('S\NP')
14 | punct = {
15 |         ',': True,
16 |         ':': True,
17 |         '.': True,
18 |         ';': True,
19 |         'RRB': True,
20 |         'LRB': True,
21 |         '-RRB-': True,
22 |         '-LRB-': True,
23 |         'LQU': True,
24 |         'RQU': True,
25 |         'PUNCT': True
26 |     }
27 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/profile_ccgbank.py:
--------------------------------------------------------------------------------
 1 | import hotshot.stats
 2 | import hotshot
 3 | 
 4 | import Treebank.CCGbank
 5 | import ccg.lexicon
 6 | 
 7 | def load_files():
 8 |     location = '/home/matt/code/repos/data/CCGbank1.2_np_v0.7'
 9 |     corpus = Treebank.CCGbank.CCGbank(path=location)
10 |     ccg.lexicon.load()
11 |     for i, child in enumerate(corpus.children()):
12 |         if i == 100:
13 |             break
14 |         pass
15 | 
def pfile(function, prof_path='/tmp/test.prof', limit=20):
    """
    Profile a zero-argument callable with hotshot and print the top entries.

    function: callable run with no arguments under the profiler
    prof_path: file the raw profile is written to and then loaded back from
    limit: number of rows passed to print_stats
    """
    prof = hotshot.Profile(prof_path)
    try:
        prof.runcall(function)
    finally:
        # Close the profile log even when the profiled call raises
        prof.close()
    stats = hotshot.stats.load(prof_path)
    stats.strip_dirs()
    stats.sort_stats('time', 'calls')
    stats.print_stats(limit)


if __name__ == '__main__':
    pfile(load_files)
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Overview
 2 | 
 3 | Manipulate Combinatory Categorial Grammar categories and derivations, for natural language processing research.
 4 | 
 5 | The library is quite feature-rich, but it has a fairly messy API and some bugs.
 6 | 
 7 | The "killer feature" is the implementation of the CCG grammar rules and variable binding. After sentence.unify_vars()
 8 | has been called, all categories will have all slots bound to "global" variables, which are unified to other
 9 | variable bindings, and may have words attached.
10 | 
11 | Aside from ugliness, there are two main sources of remaining problems:
12 | 
13 | 1) Coordination is very difficult to get right with respect to unification, as we need a set of words, and we don't necessarily
14 | unify when we coordinate (think "red bus and green train": we do not unify "bus" and "train"!).
15 | 
16 | 2) When a word is missing from the "markedup" file, we do a terrible job of guessing its annotation.
17 | 
18 | 


--------------------------------------------------------------------------------
/tests/test_lexicon.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import ccg.lexicon
 4 | 
 5 | class LexiconTests(unittest.TestCase):
 6 |     def test_entry(self):
 7 |         lexicon = ccg.lexicon._Lexicon()
 8 |         entry = """((S\NP)\(S\NP))/NP
 9 |   2 (((S[X]{Y}\NP{Z}){Y}\(S[X]{Y}<1>\NP{Z}){Y}){_}/NP{W}<2>){_}
10 |   1 ncmod _ %f %li
11 |   2 dobj %l %f"""
12 |         stag, annotated = lexicon._parse_entry(entry)
13 |         self.assertEqual(stag, '((S\NP)\(S\NP))/NP')
14 |         self.assertEqual(annotated,
15 |             '(((S[X]{Y}\NP{Z}){Y}\(S[X]{Y}<1>\NP{Z}){Y}){_}/NP{W}<2>){_}')
16 | 
17 |     def test_all_annotated(self):
18 |         lexicon = ccg.lexicon._Lexicon()
19 |         for key, cat in lexicon.items():
20 |             if '{' in key:
21 |                 if key != cat.annotated:
22 |                     print cat.string
23 |                 self.assertEqual(key.replace('[nb]', ''),
24 |                                  cat.annotated.replace('[nb]', ''))
25 | 
26 |     def test_all_supertags(self):
27 |         lexicon = ccg.lexicon._Lexicon()
28 |         for key, cat in lexicon.items():
29 |             if '{' not in key:
30 |                 self.assertEqual(key.replace('[nb]', ''),
31 |                                  cat.string.replace('[nb]', ''))
32 | 
33 |     def test_load(self):
34 |         ccg.lexicon.load()
35 |         cat = ccg.category.from_string('S[dcl]\NP')
36 |         self.assertEqual(cat.annotated, '(S[dcl]{_}\NP{Y}<1>){_}')
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     unittest.main()
41 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_CCGLeaf.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from ._Leaf import Leaf
 4 | from ._CCGNode import CCGNode
 5 | import ccg.scat
 6 | 
 7 | class CCGLeaf(Leaf, CCGNode):
 8 |     neRE = re.compile(r'\|(?=[BI]-)')
 9 |     def __init__(self, **kwargs):
10 |         textName = CCGLeaf.neRE.split(kwargs.pop('text'))
11 |         if len(textName) == 2:
12 |             text, entityTag = textName 
13 |         else:
14 |             text = textName[0]
15 |             entityTag = ''
16 |         self.text = text
17 |         self.entity = entityTag
18 |         self.pos = kwargs.pop('pos')
19 |         self.parg = kwargs.pop('parg')
20 |         self.wordID = kwargs.pop('wordID')
21 |         self.srl_args = {}
22 |         CCGNode.__init__(self, headIdx=0, **kwargs)
23 |         
24 |     def sibling(self):
25 |         return None
26 | 
27 |     def validate(self):
28 |         return True
29 | 
30 |     def isAdjunct(self):
31 |         return False
32 | 
33 |     
34 |     def isPunct(self):
35 |         return bool(self.label in ccg.punct)
36 | 
37 |     def changeLabel(self, newLabel):
38 |         """
39 |         Change predicate-argument category
40 |         """
41 |         oldLabel = self.parg
42 |         newLabel = ccg.scat.SuperCat(newLabel)
43 |         #newLabel.goldDeps = oldLabel.goldDeps
44 |         #for head in oldLabel.heads():
45 |         #    newLabel.addHead(head)
46 |         self.parg = newLabel
47 | 
48 |     def head(self):
49 |         return self
50 | 
51 |     def heads(self):
52 |         return [self]
53 | 
54 |     @property
55 |     def stag(self):
56 |         return self.parent().label
57 | 
58 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_Printer.py:
--------------------------------------------------------------------------------
 1 | 
 2 | class Printer(object):
 3 |     """
 4 |     Print a parse tree with good formatting
 5 |     """
 6 |     
 7 |     def __call__(self, node):
 8 |         return self.actOn(node)
 9 |     
10 |     def actOn(self, node):
11 |         if node.isRoot():
12 |             return self._visitRoot(node)
13 |         else:
14 |             raise Break
15 |     
16 |     def _isLeaf(self, node):
17 |         return node.isLeaf()
18 | 
19 |     def _visitRoot(self, node):
20 |         """
21 |         Print each node's label, and track indentation
22 |         """
23 |         self._indentation = 0
24 |         self._lines = []
25 |         # Accrue print state
26 |         self._printNode(node)
27 |         # Ensure that brackets match
28 |         assert self._indentation == 0
29 |         return '\n'.join(self._lines)
30 | 
31 |             
32 |     def _visitInternal(self, node):
33 |         """
34 |         The visitor must control iteration itself, so only works on root.
35 |         """
36 |         raise Break
37 |             
38 |     def _printNode(self, node):
39 |         """
40 |         Print indentation, a bracket, then the node label.
41 |         Then print the node's children, then a close bracket.
42 |         """
43 |         indentation = '  '*self._indentation
44 |         self._lines.append('%s(%s' % (indentation, node.label))
45 |         self._indentation += 1
46 |         for child in node.children():
47 |             if self._isLeaf(child):
48 |                 self._printLeaf(child)
49 |             else:
50 |                 self._printNode(child)
51 |         self._lines[-1] = self._lines[-1] + ')'
52 |         self._indentation -= 1
53 | 
54 |     def _printLeaf(self, node):
55 |         self._lines[-1] = self._lines[-1] + ' %s' % (node.text)
56 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_Sentence.py:
--------------------------------------------------------------------------------
 1 | from _Node import Node
 2 | from _Printer import Printer
 3 | 
 4 | class Sentence(Node):
 5 |     _printer = Printer()
 6 |     def __str__(self):
 7 |         return self._printer(self)
 8 |         
 9 |     def parent(self):
10 |         """
11 |         Raises an error, because the root node has no parent
12 |         """
13 |         raise AttributeError, "Cannot retrieve the parent of the root node! Current parse state:\n\n%s" % self.prettyPrint()
14 |         
15 |     def performOperation(self, operation):
16 |         """
17 |         Accept a Visitor and call it on each child
18 |         Goofy name/design is legacy from when I didn't know how to code :(
19 |         """
20 |         operation.newStructure()
21 |         operation.actOn(self)
22 |         for node in self.depthList():
23 |             try:
24 |                 operation.actOn(node)
25 |             # Give operations the opportunity to signal
26 |             # when the work is complete
27 |             except Break:
28 |                 break
29 |         while operation.moreChanges:
30 |             operation.actOn(self)
31 |             for node in getattr(self, operation.listType)():
32 |                 try:
33 |                     operation.actOn(node)
34 |                 # Give operations the opportunity to signal
35 |                 # when the work is complete
36 |                 except Break:
37 |                     break
38 | 
39 |     def isRoot(self):
40 |         return True
41 | 
42 |     def _connectNodes(self, nodes, parentage):
43 |         # Build the tree
44 |         offsets = sorted(nodes.keys())
45 |         # Skip the top node
46 |         offsets.pop(0)
47 |         for key in offsets:
48 |             node = nodes[key]
49 |             parent = nodes[parentage[node]]
50 |             parent.attachChild(node, len(parent))
51 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_Corpus.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | from _Node import Node
 4 | from _File import File
 5 | from _Sentence import Sentence
 6 | 
 7 | class Corpus(Node):
 8 |     def parent(self):
 9 |         """
10 |         Raises an error, because the root node has no parent
11 |         """
12 |         raise AttributeError, "Cannot retrieve the parent of the root node. Current parse state:\n\n%s" % self.prettyPrint()
13 |         
14 |     def attachChild(self, newChild):
15 |         """
16 |         Append a file
17 |         """
18 |         # Security isn't really an issue for Corpus, so just stick
19 |         # it onto the list
20 |         self._children.append(newChild)
21 | 
22 |         
23 |     def performOperation(self, operation):
24 |         """
25 |         Accept a Visitor and call it on each child
26 |         Goofy name/design is legacy from when I didn't know how to code :(
27 |         """
28 |         operation.newStructure()
29 |         operation.actOn(self)
30 |         for node in self.children():
31 |             operation.actOn(node)
32 |             
33 |     def child(self, index):
34 |         """
35 |         Read a file by zero-index offset
36 |         """
37 |         path = self._children[index]
38 |         print >> sys.stderr, path
39 |         return self.fileClass(path=path)
40 |      
41 |     
42 |     def children(self):
43 |         """
44 |         Generator to iterate through children
45 |         """
46 |         for i in xrange(len(self._children)):
47 |             yield self.child(i)
48 |     
49 |     def file(self, key):
50 |         """
51 |         Read a file by path
52 |         """
53 |         return self.fileClass(path=key)
54 |             
55 |     def sentence(self, key):
56 |         filename, sentenceKey = key.split('~')
57 |         file_ = self.file(filename)
58 |         return file_.sentence(key)
59 | 
60 |     def sentences(self):
61 |         for child in self.children():
62 |             for sentence in child.children():
63 |                 yield sentence
64 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_File.py:
--------------------------------------------------------------------------------
 1 | from _Node import Node
 2 | 
 3 | class File(Node):
 4 |     """
 5 |     A file in a treebank
 6 |     """
 7 |     def __init__(self, **kwargs):
 8 |         self._IDDict = {}
 9 |         Node.__init__(self, **kwargs)
10 | 
11 |     def attachChild(self, newChild):
12 |         """
13 |         Append a sentence
14 |         """
15 |         # Security isn't really an issue for Files, so just append the new
16 |         # Sentence without complaint
17 |         self._children.append(newChild)
18 |         self._IDDict[newChild.globalID] = newChild
19 |     
20 |     def detachChild(self, node):
21 |         """
22 |         Delete a sentence
23 |         """
24 |         self._children.remove(node)
25 |         self._IDDict.pop(node.globalID)
26 |     
27 |     def sentence(self, key):
28 |         """
29 |         Retrieve a sentence by key
30 |         """
31 |         return self._IDDict[key]
32 |     
33 |     def prettyPrint(self):
34 |         return "(%d %s)" % (self.localID, '\n\n\n'.join([child.prettyPrint() for child in self.children()]))
35 |         
36 |     def performOperation(self, operation):
37 |         """
38 |         Accept a Visitor and call it on each child
39 |         Goofy name/design is legacy from when I didn't know how to code :(
40 |         """
41 |         operation.newStructure()
42 |         operation.actOn(self)
43 |         for node in getattr(self, operation.listType)():
44 |             try:
45 |                 operation.actOn(node)
46 |             # Give operations the opportunity to signal
47 |             # when the work is complete
48 |             except Break:
49 |                 break
50 |         while operation.moreChanges:
51 |             operation.actOn(self)
52 |             for node in getattr(self, operation.listType)():
53 |                 try:
54 |                     operation.actOn(node)
55 |                 # Give operations the opportunity to signal
56 |                 # when the work is complete
57 |                 except Break:
58 |                     break
59 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/Writers/_PargFileWriter.py:
--------------------------------------------------------------------------------
 1 | from _AutoFileWriter import AutoFileWriter
 2 | import os
 3 | from os.path import join as pjoin
 4 | 
 5 | class PargFileWriter(AutoFileWriter):
 6 | 
 7 | 
 8 |     def writeFile(self, fileID, sentences):
 9 |         path = self._getPath(fileID)
10 |         output = open(path, 'w')
11 |         for sentence in sentences:
12 |             output.write(sentence + '\n')
13 |         output.close()
14 | 
15 |     def getSentenceStr(self, sentence):
16 |         idLine = self._getIDLine(sentence)
17 |         deps = []
18 |         for word in sentence.listWords():
19 |             for argHead, depType, argNum in word.parg.goldDependencies():
20 |                 depStr = self._makeDep(word, argHead, argNum, depType)
21 |                 deps.append(depStr)
22 |         deps.sort()
23 |         deps.insert(0, idLine)
24 |         deps.append('<\s>')
25 |         return '\n'.join(deps)
26 |         
27 | 
28 |     def _getPath(self, fileID):
29 |         dirSect = fileID[4:6]
30 |         directory = pjoin(self.directory, dirSect)
31 |         if not os.path.exists(directory):
32 |             os.mkdir(directory)
33 |         return pjoin(directory, fileID.replace('auto', 'parg'))
34 | 
35 |     def _getIDLine(self, sentence):
36 |         idLine = '<s id="%s"> %d' % (sentence.globalID, sentence.getWord(-1).wordID)
37 |         return idLine
38 |         
39 | 
40 |     def _makeDep(self, head, arg, argNum, depType):
41 |         """
42 |         A depedency between the ith word and the jth word (wordI and wordJ)
43 |         where the jth word has the lexical (functor) category catJ, and the
44 |         ith word is head of the constituent which fills the kth argument slot
45 |         of catJ is described as:
46 |         i j cat_j arg_k word_i word_j
47 |         """
48 |         i = arg.wordID
49 |         j = head.wordID
50 |         catJ = str(head.parg)
51 |         argK = argNum
52 |         wordI = arg.text
53 |         wordJ = head.text
54 |         dep = '%d \t %d \t %s \t %d \t %s %s' % (i, j, catJ, argNum, wordI, wordJ)
55 |         if depType != 'L':
56 |             dep = dep + ' ' + depType
57 |         return dep
58 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_CCGFile.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import sys
 3 | import os
 4 | import re
 5 | 
 6 | from ._File import File
 7 | from ._CCGNode import CCGNode
 8 | from ._CCGSentence import CCGSentence
 9 | 
class CCGFile(File, CCGNode):
    """
    One .auto file: alternating id lines and derivation lines, each pair
    parsed into a CCGSentence child
    """
    # Matches a '.' that directly follows a '/' or '\'
    mmRE = re.compile(r'(?<=[/\\])[\.]')
    # One sentence block of a .parg file; group 1 holds its dependency
    # lines, or is None when the block has none
    pargSentsRE = re.compile(r'<s id="[^"]+\.\d+"> \d+\n(?:(\d.+?)\n)?<\\s>', re.DOTALL)

    def __init__(self, **kwargs):
        """
        Keyword arguments: path (required) and, optionally, string with the
        file's text. Without string the text is read from path. Remaining
        keyword arguments go to CCGNode.__init__.
        """
        path = kwargs.pop('path')
        if 'string' in kwargs:
            text = kwargs.pop('string')
        else:
            # Close the handle as soon as the text has been read
            with open(path) as handle:
                text = handle.read()
        # Hack for mmccg version
        text = self.mmRE.sub('', text)
        filename = path.split('/')[-1]
        self.path = path
        self.filename = filename
        self.ID = filename
        self._IDDict = {}
        CCGNode.__init__(self, label='File', headIdx=0, **kwargs)
        self._parseFile(text)

    def _parseFile(self, text):
        """
        Consume the text two lines at a time: an id line, then the sentence.
        Raises IndexError when an id line has no sentence line after it.
        """
        lines = text.strip().split('\n')
        # Step by index rather than pop(0), which shifts the list every time
        for start in range(0, len(lines), 2):
            self._addSentence(lines[start], lines[start + 1])

    def _addSentence(self, idLine, sentStr):
        """
        Parse one sentence and attach it. The globalID is whatever follows
        '=' in the first space-separated field of the id line.
        """
        try:
            globalID = idLine.split(' ')[0].split('=')[1]
        except IndexError:
            # Report the offending pair on stderr, then let the error surface
            sys.stderr.write('%s\n%s\n' % (sentStr, idLine))
            raise
        sentence = CCGSentence(globalID=globalID, string=sentStr,
                               localID=self.length())
        self.attachChild(sentence)

    def addPargDeps(self, pargPath=None):
        """
        Read the matching .parg file and hand each sentence its dependencies.

        pargPath: directory that holds the per-section .parg folders. When
        omitted it is derived from self.path by dropping the last two path
        components and replacing 'AUTO' with 'PARG'.
        """
        if pargPath is None:
            pargPath = self.path.rsplit('/', 2)[0].replace('AUTO', 'PARG')
        section = self.ID[4:6]
        fileLoc = os.path.join(pargPath, section, self.ID.replace('auto', 'parg'))
        with open(fileLoc) as handle:
            text = handle.read().strip()
        for i, matchObj in enumerate(CCGFile.pargSentsRE.finditer(text)):
            pargSent = matchObj.group(1)
            if not pargSent:
                # A block without dependency lines still advances the index
                continue
            deps = [dep.split() for dep in pargSent.split('\n')]
            self.child(i).addPargDeps(deps)
61 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_Leaf.py:
--------------------------------------------------------------------------------
 1 | from _Node import Node
 2 | 
 3 | class Leaf(Node):
 4 |     """
 5 |     A leaf of the parse tree -- ie, a word, punctuation or trace
 6 |     Cannot attach or retrieve children
 7 |     """
 8 |     def __hash__(self):
 9 |         return self.wordID
10 | 
11 |     def isLeaf(self):
12 |         return True
13 |     
14 |     def attachChild(self, newChild, index = None):
15 |         raise AttachmentError, "Cannot add node\n\n%s\n\nto leaf:\n\n%s\n\nLeaves cannot have children." \
16 |         % (newChild.prettyPrint(), self.prettyPrint())
17 | 
18 |     def length(self, constraint = None):
19 |         return 0
20 |         
21 |     def child(self, index):
22 |         """
23 |         Raises an error, because leaf nodes have no children
24 |         """
25 |         raise AttributeError, "Cannot retrieve children from leaf nodes! Attempted on leaf:\n\n%s" % self.prettyPrint()
26 |         
27 |     def detachChild(self, node):
28 |         """
29 |         Raises an error, because leaf nodes have no children
30 |         """
31 |         raise AttributeError, "Cannot remove children from leaf nodes! Attempted on leaf:\n\n%s" % self.prettyPrint()
32 |         
33 |     def prettyPrint(self):
34 |         return "(%s %s)" % (self.label, self.text) 
35 |     
36 |     def listWords(self):
37 |         return [self]
38 |         
39 |     def lemma(self):
40 |         """
41 |         Get lemma from COMLEX entry, otherwise text
42 |         """
43 |         if self.metadata.get('COMLEX'):
44 |             return self.metadata['COMLEX'][0].features['ORTH'][0][1:-1]
45 |         elif self.label in ['NNP', 'NNPS']:
46 |             return self.text
47 |         else:
48 |             return self.text.lower()
49 | 
50 |     def isTrace(self):
51 |         return bool(self.label == '-NONE-')
52 |         
53 | 
54 |     def isPunct(self):
55 |         punct = {
56 |         ',': True,
57 |         ':': True,
58 |         '.': True,
59 |         ';': True,
60 |         'RRB': True,
61 |         'LRB': True
62 |         }
63 |         return bool(self.label in punct)
64 | 
65 |     def nextWord(self):
66 |         words = self.root().listWords()
67 |         nextID = self.wordID + 1
68 |         if nextID == len(words):
69 |             return None
70 |         else:
71 |             assert self is not words[self.wordID+1]
72 |             return words[self.wordID+1]
73 | 
74 |     def prevWord(self):
75 |         if self.wordID == 0:
76 |             return None
77 |         words = self.root().listWords()
78 |         return words[self.wordID - 1]
79 | 


--------------------------------------------------------------------------------
/tests/test_unify.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import ccg.rules
 4 | import ccg.scat
 5 | import ccg.lexicon
 6 | import Treebank.CCGbank
 7 | 
 8 | ccg.lexicon.load()
 9 | 
10 | class TestUnify(unittest.TestCase):
11 |     def test_fapply_adjunct(self):
12 |         c1 = ccg.scat.SuperCat('N/N')
13 |         c2 = ccg.scat.SuperCat('N')
14 |         parent = ccg.scat.SuperCat('N')
15 |         production = ccg.rules.Production(c1, c2, parent)
16 |         parent.bind_vars(production.result, parent.category,
17 |                          production.result.category)
18 |         left_arg_global_vars = c1.get_vars(c1.argument)
19 |         self.assertEqual(left_arg_global_vars, parent.get_vars(parent))
20 |     
21 |     def test_fcomp(self):
22 |         c1 = ccg.scat.SuperCat('(S[dcl]\NP)/(S[pss]\NP)')
23 |         c2 = ccg.scat.SuperCat('(S[pss]\NP)/NP')
24 |         parent = ccg.scat.SuperCat('(S[dcl]\NP)/NP')
25 |         production = ccg.rules.Production(c1, c2, parent)
26 |         parent.bind_vars(production.result, parent.category,
27 |                          production.result.category)
28 |         left_arg_global_vars = c1.get_vars(c1.argument)
29 |         right_result_global_vars = c2.get_vars(c2.result)
30 |         self.assertEqual(right_result_global_vars, left_arg_global_vars)
31 |         laa_global_vars = c1.get_vars(c1.argument.argument)
32 |         ra_global_vars = c2.get_vars(c2.result.argument)
33 |         self.assertEqual(laa_global_vars, ra_global_vars)
34 |         self.assertFalse(laa_global_vars == left_arg_global_vars)
35 |         self.assertEqual(parent.get_vars(parent.result.argument),
36 |                          c1.get_vars(c1.result.argument))
37 | 
38 |     def test_fapply_sentence(self):
39 |         sent_str = ("(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> "
40 |         "(<L NP/N NNP NNP Ms. NP_254/N_254>) (<L N NNP NNP Haag N>) ) (<T "
41 |         "S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBZ VBZ plays "
42 |         "(S[dcl]\NP_241)/NP_242>) (<L NP NNP NNP Elianti NP>) )"
43 |         " ) (<L . . . . .>) )")
44 |         sent = Treebank.CCGbank.CCGSentence(string=sent_str, globalID=0,
45 |                                             localID=0)
46 |         sent.unify_vars()
47 |         ms, haag, plays, elianti, period = [w.stag for w in sent.listWords()]
48 |         self.assertEqual(ms.get_vars(ms.argument),
49 |                          haag.get_vars(ms))
50 |         self.assertEqual(haag.get_vars(haag),
51 |                          plays.get_vars(plays.result.argument))
52 |         self.assertEqual(plays.get_vars((plays.argument)),
53 |                          elianti.get_vars(elianti))
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     unittest.main()
58 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_CCGbank.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import re
 3 | import os.path
 4 | 
 5 | import ccg.lexicon
 6 | from ._Corpus import Corpus
 7 | from ._CCGNode import CCGNode
 8 | from ._CCGFile import CCGFile
 9 | 
10 | 
11 | 
class CCGbank(Corpus, CCGNode):
    """
    A CCGbank corpus rooted at a directory. Children are stored as sorted
    file paths and opened as CCGFile objects on demand.
    """
    fileClass = CCGFile

    def __init__(self, path=None, **kwargs):
        """
        Collect every .mrg/.auto file below path and load the lexicon from
        the 'markedup' file directly inside path.
        """
        self._children = []
        self.path = path
        for fileLoc in self._getFileList(self.path):
            self.attachChild(fileLoc)
        ccg.lexicon.load(os.path.join(path, 'markedup'))

    def child(self, index):
        """
        Read a file by zero-index offset
        """
        path = self._children[index]
        # Echo the path being opened on stderr
        print >> sys.stderr, path
        return self.fileClass(path=path)

    def sentence(self, key):
        """
        Fetch a sentence by a '<fileName>.<sentID>' key. The file is looked
        for at <path>/data/AUTO/<section>/<fileName>.auto, where section is
        characters 4-5 of fileName.
        """
        # Unpacking into two names enforces exactly one '.' in the key
        fileName, _ = key.split('.')
        section = fileName[4:6]
        fileID = os.path.join(self.path, 'data', 'AUTO', section, fileName +
                              '.auto')
        return self.file(fileID).sentence(key)

    def tokens(self):
        """
        Generate (form, pos, cat) triples by scanning the raw file text with
        a regular expression, without parsing the files properly
        """
        tokenRE = re.compile(r'<L (\S+) \S+ (\S+) (\S+) \S+>')
        for path in self._children:
            # Close each file before moving on to the next one
            with open(path) as handle:
                string = handle.read()
            for cat, pos, form in tokenRE.findall(string):
                yield form, pos, cat

    def section(self, sec):
        """
        Yield the files whose name has the integer sec at characters 4-5
        """
        for i, fileLoc in enumerate(self._children):
            fileName = os.path.basename(fileLoc)
            if int(fileName[4:6]) == sec:
                yield self.child(i)

    # The four generators below select files by fixed index ranges.
    # NOTE(review): these presumably assume the standard, complete file
    # list in sorted order -- confirm before using them on a partial corpus.

    def section00(self):
        for i in xrange(99):
            yield self.child(i)

    def twoTo21(self):
        for i in xrange(199, 2074):
            yield self.child(i)

    def section23(self):
        for i in xrange(2157, 2257):
            yield self.child(i)

    def section24(self):
        for i in xrange(2257, self.length()):
            yield self.child(i)

    def _getFileList(self, location):
        """
        Get all .mrg and .auto files below location, sorted, skipping
        entries whose name ends in 'CVS' or starts with '.'
        """
        paths = []
        for name in os.listdir(location):
            # Test the entry name, not the joined path: the joined path
            # begins with `location`, so a leading-dot test on it never
            # matched hidden entries and rejected everything under a root
            # such as './data'
            if name.endswith('CVS') or name.startswith('.'):
                continue
            path = os.path.join(location, name)
            if os.path.isdir(path):
                paths.extend(self._getFileList(path))
            elif path.endswith('.mrg') or path.endswith('.auto'):
                paths.append(path)
        paths.sort()
        return paths
88 | 
89 |             
90 | 


--------------------------------------------------------------------------------
/tests/test_secondary.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import ccg.category
 4 | import ccg.lexicon
 5 | 
 6 | ccg.lexicon.load()
 7 | 
 8 | class TestSecondary(unittest.TestCase):
 9 |     def test_inner_result(self):
10 |         c = ccg.category.from_string('(S[dcl]\NP)/NP')
11 |         self.assertEqual(c.inner_result.annotated, 'S[dcl]{_}')
12 |         c = ccg.category.from_string('((S\NP)\(S\NP))/NP')
13 |         self.assertEqual(c.inner_result.annotated, 'S[X]{Y}')
14 |         c = ccg.category.from_string('NP')
15 |         self.assertEqual(c.inner_result, 'NP')
16 | 
17 |     def test_is_predicate(self):
18 |         c = ccg.category.from_string('PP/NP')
19 |         self.assertFalse(c.is_predicate)
20 |         c = ccg.category.from_string('S[dcl]\NP')
21 |         self.assertTrue(c.is_predicate)
22 |         c = ccg.category.from_string(r'(S[adj]\NP)/(S[adj]\NP)')
23 |         self.assertFalse(c.is_predicate)
24 | 
25 |     def test_is_adjunct(self):
26 |         c = ccg.category.from_string('(S\NP)\(S\NP)')
27 |         self.assertTrue(c.is_adjunct)
28 |         c = ccg.category.from_string('((S[X]{Y}\NP{Z}){Y}/'
29 |                                      '(S[X]{Y}\NP{Z*}){Y}<1>){_}')
30 |         self.assertTrue(c.is_adjunct)
31 |         c = ccg.category.from_string('(S[X]{_}/S[X]{Y}){_}')
32 |         self.assertFalse(c.is_adjunct)
33 |         c = ccg.category.from_string('(PP{Y}/PP{Y}){_}')
34 |         self.assertTrue(c.is_adjunct)
35 | 
36 |     def test_has_adjunct(self):
37 |         c = ccg.category.from_string('((S\NP)\(S\NP))/NP')
38 |         self.assertTrue(c.has_adjunct)
39 |         c = ccg.category.from_string('NP/N')
40 |         self.assertFalse(c.has_adjunct)
41 | 
42 |     def test_is_aux(self):
43 |         c = ccg.category.from_string('(S[dcl]\NP)/(S[dcl]\NP)')
44 |         self.assertTrue(c.is_aux)
45 |         c = ccg.category.from_string('(S[adj]\NP)/(S[adj]\NP)')
46 |         self.assertTrue(c.is_aux)
47 |         c = ccg.category.from_string('PP/NP')
48 |         self.assertFalse(c.is_aux)
49 | 
50 |     def test_is_true_aux(self):
51 |         c = ccg.category.from_string('(S[dcl]\NP)/(S[dcl]\NP)')
52 |         self.assertTrue(c.is_true_aux)
53 |         c = ccg.category.from_string('(S[adj]\NP)/(S[adj]\NP)')
54 |         self.assertFalse(c.is_true_aux)
55 |         c = ccg.category.from_string('PP/NP')
56 |         self.assertFalse(c.is_true_aux)
57 | 
58 | 
59 |     def test_srl_annot_string(self):
60 |         stag = ccg.scat.SuperCat('(S[dcl]\NP)/NP')
61 |         stag.srl_annot.add(('_', 'A0', 'Y'))
62 |         stag.srl_annot.add(('Z', 'AM-TMP', '_'))
63 |         n, stag_str, annotated, roles = stag.srl_string()
64 |         assert n == 2
65 |         self.assertEqual(stag_str, "(S[dcl]\\NP)/NP@X'A0'Y_Z'AM-TMP'X")
66 |         self.assertEqual(annotated,
67 |                          "((S[dcl]{_}\\NP{Y}<1>){_}/NP{Z}<2>){_}@X'A0'Y_Z'AM-TMP'X")
68 |         assert roles == ['1 A0 %l %f', '2 AM-TMP %f %l']
69 | 
70 | 
71 | if __name__ == '__main__':  # run the tests when executed as a script
72 |     unittest.main()
73 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/Writers/_AutoFileWriter.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from os.path import join as pjoin
 3 | import re
 4 | 
 5 | import ccg.lexicon
 6 | 
 7 | class AutoFileWriter:
 8 |     """
 9 |     Write a .auto format file
10 |     """
11 |     def __init__(self, **kwargs):
12 |         if 'directory' in kwargs:
13 |             self.setDir(kwargs.pop('directory'))
14 |         if 'markedup' in kwargs and False: # Don't support markedup right now
15 |             muLoc = kwargs.pop('markedup')
16 |             entries, unused = ccg.Markedup.getEntries(muLoc)
17 |             markedup = {}
18 |             for entry in entries:
19 |                 markedup[entry.cat] = entry.toJulia()
20 |             self.markedup = markedup
21 |         else:
22 |             self.markedup = {}
23 |         
24 |     def setDir(self, directory):
25 |         if not os.path.exists(directory):
26 |             print "Making %s" % directory
27 |             os.makedirs(directory)
28 |         self.directory = directory
29 |         
30 |     def getSentenceStr(self, sentence):
31 |         lines = []
32 |         idLine = self._getIDLine(sentence.globalID)
33 |         lines.append(idLine)
34 |         lines.append(self._nodeString(sentence.child(0)))
35 |         return '\n'.join(lines)
36 | 
37 |     def writeFile(self, fileID, sentences):
38 |         path = self._getPath(fileID)
39 |         output = open(path, 'w')
40 |         for sentence in sentences:
41 |             output.write(sentence + '\n')
42 |         output.close()
43 | 
44 |     def _getPath(self, fileID):
45 |         dirSect = fileID[4:6]
46 |         directory = pjoin(self.directory, dirSect)
47 |         if not os.path.exists(directory):
48 |             os.mkdir(directory)
49 |         return pjoin(directory, fileID)
50 |             
51 | 
52 |     def _getIDLine(self, sentenceID):
53 |         return "ID=%s PARSER=GOLD NUMPARSE=1" % sentenceID
54 |         
55 |         
56 |     def _nodeString(self, node):
57 |         if node.child(0).isLeaf():
58 |             return self._leafString(node)
59 |         else:
60 |             childStrings = []
61 |             for child in node.children():
62 |                 childStrings.append(self._nodeString(child))
63 |             nodeString = '(<T %s %d %d> %s )' % (node.label, node.headIdx, len(childStrings), ' '.join(childStrings))
64 |             return nodeString
65 | 
66 |     def _leafString(self, node):
67 |         leaf = node.child(0)
68 |         #if not leaf.parg:
69 |         #    print leaf
70 |         #    print leaf.parent()
71 |         #    raise StandardError
72 |         annotated = self.markedup.get(leaf.stag, leaf.stag.annotated)
73 |         #if leaf.stag != node.label:
74 |         #    print node.root().globalID
75 |         #    print node
76 |         #    print leaf
77 |         #    print leaf.stag
78 |         #    raise StandardError
79 |         annot_strip_re = re.compile(r'<\d>|\*')
80 |         stag_str = annot_strip_re.sub('', leaf.stag.annotated)
81 |         stag_str += '@%s' % leaf.stag.srl_string()
82 |         properties = [
83 |             leaf.stag.string,
84 |             leaf.pos,
85 |             leaf.label,
86 |             leaf.text,
87 |             stag_str]
88 |         try:
89 |             leafString = '(<L %s>)' % ' '.join(properties)
90 |         except:
91 |             print properties
92 |             raise
93 |         return leafString
94 | 


--------------------------------------------------------------------------------
/ccg/lexicon.py:
--------------------------------------------------------------------------------
  1 | """
  2 | A lexicon loaded from a markedup file
  3 | """
  4 | import os
  5 | import os.path
  6 | from collections import defaultdict
  7 | 
  8 | import ccg.category
  9 | 
 10 | _INIT_STR = "# now list the markedup categories" 
 11 | DEFAULT_PATH = os.path.join(os.path.split(__file__)[0], 'markedup')
 12 | CATS = {}
 13 | 
 14 | 
 15 | def load(path=DEFAULT_PATH):
 16 |     global CATS
 17 |     CATS = _Lexicon(path)
 18 | 
 19 | class _Lexicon(dict):
 20 |     def __init__(self, path=DEFAULT_PATH):
 21 |         dict.__init__(self)
 22 |         self.cats = defaultdict(int)
 23 |         for entry in self._split_entries(open(path).read()):
 24 |             if not entry:
 25 |                 continue
 26 |             entry = entry.strip()
 27 |             supertag, annotated = self._parse_entry(entry)
 28 |             self.add_entry(supertag, annotated)
 29 |             self.add_entry(supertag.replace('[nb]', ''), annotated.replace('[nb]', ''))
 30 | 
 31 |     def add_entry(self, supertag, annotated):
 32 |         annotated = annotated.split('@')[0]
 33 |         if supertag in self and annotated != self[supertag].annotated:
 34 |             #print supertag
 35 |             #print annotated
 36 |             #print self[supertag].annotated
 37 |             return None
 38 |         if '{R}' in annotated:
 39 |            return None 
 40 |         try:
 41 |             category = ccg.category.from_string(annotated)
 42 |         except:
 43 |             print entry
 44 |             raise
 45 |         self[supertag] = category
 46 |         self[annotated] = category
 47 |         # Allow frequencies to be set
 48 |         self.cats[category] = 0
 49 | 
 50 |     def _split_entries(self, markedup):
 51 |         header, text = markedup.split(_INIT_STR)
 52 |         return text.split('\n\n')
 53 | 
 54 |     def _parse_entry(self, entry_str):
 55 |         lines = [line for line in entry_str.split('\n')
 56 |                  if not line.startswith('#')]
 57 |         supertag = lines[0]
 58 |         n_args, annotated = lines[1].strip().split()
 59 |         return supertag, annotated
 60 | 
 61 | class MarkedupEntry(object):
 62 |     def __init__(self, markedup_str):
 63 |         self.string = markedup_str
 64 |         lines = [l for l in markedup_str.split('\n')
 65 |                 if not l.strip().startswith('#')]
 66 |         bare_category = lines.pop(0)
 67 |         n_slots, annotated_category = lines.pop(0).strip().split(' ')
 68 |         if lines and lines[0].startswith('  !'):
 69 |             alt_markedup = lines.pop(0)[4:]
 70 |         else:
 71 |             alt_markedup = ''
 72 |         slots = defaultdict(list)
 73 |         for line in lines:
 74 |             slot = Slot(line)
 75 |             slots[slot.n].append(slot)
 76 | 
 77 |         self.category = ccg.category.from_string(bare_category)
 78 |         self.annotated = ccg.category.from_string(annotated_category)
 79 |         self.n_grs = int(n_slots)
 80 |         if alt_markedup:
 81 |             self.alt_annotated = ccg.category.from_string(alt_markedup)
 82 |         else:
 83 |             self.alt_annotated = self.annotated
 84 |         self.grs = slots
 85 | 
 86 | 
 87 | class Slot(object):
 88 |     def __init__(self, slot_str):
 89 |         pieces = slot_str.strip().split(' ')
 90 |         if pieces and pieces[-1].startswith('='):
 91 |             self.constraint_name = pieces.pop(-1)
 92 |             self.constraint_group = CONSTRAINT_GROUPS.get(self.constraint_name, set())
 93 |         else:
 94 |             self.constraint_name = None
 95 |             self.constraint_group = set()
 96 | 
 97 |         if not pieces[-1].startswith('%') and pieces[-1] != 'ignore':
 98 |             self.subtype2 = pieces.pop(-1)
 99 |         else:
100 |             self.subtype2 = None
101 | 
102 |         self.words = [p for p in pieces if p.startswith('%')]
103 |         pieces = [p for p in pieces if not p.startswith('%')]
104 | 
105 |         self.n = int(pieces.pop(0))
106 |         self.label = pieces.pop(0)
107 |         if pieces:
108 |             self.subtype1 = pieces.pop(0)
109 |         else:
110 |             self.subtype1 = None
111 |         assert not pieces
112 | 
113 | 
114 | 
115 | 


--------------------------------------------------------------------------------
/tests/test_parse.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import ccg
  3 | import ccg.category
  4 | import ccg.lexicon
  5 | ccg.lexicon.load()  # fill ccg.lexicon.CATS from the default markedup file
  6 | 
  7 | class TestAtomicRE(unittest.TestCase):
  8 |     def test_basic(self):
  9 |         basic = [('NP', 'NP', '', False, None)]
 10 |         self._run(basic)
 11 | 
 12 |     def test_feats(self):
 13 |         feats = [('NP[nb]', 'NP', '[nb]', False, None),
 14 |                  ('PP[dcl]', 'PP', '[dcl]', False, None)
 15 |                 ]
 16 |         self._run(feats)
 17 | 
 18 |     def test_conj(self):
 19 |         conj = [('NP[conj]', 'NP', '', True, None)]
 20 |         self._run(conj)
 21 | 
 22 |     def test_hat(self):
 23 |         hat = [('N^NP', 'NP', '', False, 'NP',
 24 |                 'N^S[dcl]', 'NP', '', False, 'S[dcl]',
 25 |                 'N^S[dcl][conj]', 'N', '', True, 'S[dcl]')]
 26 | 
 27 |     def test_feat_conj(self):
 28 |         feat_conj = [('NP[nom][conj]', 'NP', '[nom]', True, None)]
 29 |         self._run(feat_conj)
 30 | 
 31 |     def test_var(self):
 32 |         cat = ccg.category.from_string('NP{_}')
 33 |         self.assertEqual(cat.string, 'NP')
 34 |         self.assertEqual(cat.var, 0)
 35 |         
 36 |     def _run(self, cases):
 37 |         for cat, atom, feat, conj, hat in cases:
 38 |             category = ccg.category.from_string(cat)
 39 |             self.assertEqual(atom, category.cat)
 40 |             if feat != '[nb]':
 41 |                 self.assertEqual(feat, category.feature)
 42 |             self.assertEqual(conj, category.conj)
 43 |             self.assertEqual(hat, category.hat)
 44 | 
 45 | 
 46 | class TestComplex(unittest.TestCase):
 47 |     def test_basic(self):
 48 |         basic = [(r'(S[dcl]\NP)/NP', 'S[dcl]\NP', 'NP', None)]
 49 |         self._run(basic)
 50 | 
 51 |     def test_hat(self):
 52 |         hat = [(r'((S[dcl]\NP)/NP)^(NP\NP)', 'S[dcl]\NP', 'NP', 'NP\NP'),
 53 |                (r'NP^PP/N', 'NP^PP', 'N', None),
 54 |                (r'N^NP^(S/S)/NP', 'N^NP^(S/S)', 'NP', None),
 55 |                (r'N^(S[dcl]^NP/NP)/NP[conj]', 'N^(S[dcl]^NP/NP)', 'NP',
 56 |                 None),
 57 |                (r'(NP/PP)^(S/S)/NP', '(NP/PP)^(S/S)', 'NP', None)]
 58 |         self._run(hat)
 59 | 
 60 |     def test_conj(self):
 61 |         category = ccg.category.from_string('(S\NP)\(S\NP)[conj]')
 62 |         self.assertEqual(category.conj, True)
 63 |         category = ccg.category.from_string('S[dcl]\NP[conj]')
 64 |         self.assertEqual(len(category.cats_by_var), 2)
 65 | 
 66 |     def test_complex(self):
 67 |         c = ('(((S[wq]{_}/PP{Y}<1>){_}/((S[q]{Z}<2>/PP{Y*}){Z}'
 68 |              '/(S[adj]{W*}\NP{V}){W*}){Z}){_}/(S[adj]{W}<3>'
 69 |              '\NP{V}){W}){_}')
 70 |         cat = ccg.category.from_string(c)
 71 |         self.assertEqual(c, cat.annotated)
 72 | 
 73 |     def _run(self, cases):
 74 |         for cat, result, argument, hat in cases:
 75 |             category = ccg.category.from_string(cat)
 76 |             self.assertEqual(cat, category.string)
 77 |             self.assertEqual(result, category.result.string)
 78 |             self.assertEqual(argument, category.argument.string)
 79 |             self.assertEqual(str(hat), str(category.hat))
 80 | 
 81 |     def test_safety(self):
 82 |         cat = ccg.category.from_string(r'(S[dcl]\NP)/NP')
 83 |         #cat.result = _parse.Category('NP')
 84 |         
 85 |     def test_var(self):
 86 |         cat = ccg.category.from_string('(S[dcl]{_}\NP{Y}){_}')
 87 |         self.assertEqual(cat.string, 'S[dcl]\NP')
 88 |         self.assertEqual(cat.var, 0)
 89 |         self.assertEqual(cat.argument.var, 1)
 90 |         cat = ccg.category.from_string('((S[X]{Y}\NP{Z}){Y}/(S[X]{Y}\NP{Z}){Y}){_}')
 91 |         self.assertEqual(cat.string, '(S\NP)/(S\NP)')
 92 |         self.assertEqual(cat.result.result.var, 1)
 93 |         self.assertEqual(cat.argument.argument.var, 2)
 94 |         self.assertEqual(cat.argument.result.var, 1)
 95 |         self.assertEqual(cat.result.argument.var, 2)
 96 |         cat = ccg.category.from_string('(N{_}^(S[X]{Y}\S[X]{Y}){_}/NP{Y}){_}')
 97 |         self.assertEqual(cat.string, 'N^(S\S)/NP')
 98 |         self.assertEqual(cat.result.hat.result.var, 1)
 99 | 
100 |     def test_multi_var(self):
101 |         cat = ccg.category.from_string('(PP{Y,_}/NP{Y}){_}')
102 |         assert cat.var == 0
103 |         assert cat.var2 == -1
104 |         assert cat.result.var == 1
105 |         assert cat.result.var2 == 0
106 |         assert cat.annotated == '(PP{Y,_}/NP{Y}){_}'
107 | 
108 |     def test_variable_guessing(self):
109 |         cat_str = r'PP/(S[to]\NP)'
110 |         assert cat_str not in ccg.lexicon.CATS
111 |         cat = ccg.category.from_string(r'PP/(S[to]\NP)')
112 |         # Need to fix this somehow
113 |         assert cat.annotated != r'(PP{_}/(S[to]{_}\NP{Y}<1>){_}){_}'
114 | 
115 | if __name__ == "__main__":  # run the tests when executed as a script
116 |     unittest.main()
117 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_Node.py:
--------------------------------------------------------------------------------
  1 | import bisect
  2 | 
  3 | class Node(object):
  4 |     globalID = 0
  5 |     def __init__(self, label):
  6 |         self.globalID = Node.globalID
  7 |         self._children = []
  8 |         self._parent = None
  9 |         Node.globalID += 1
 10 |         self.label = label
 11 |         
 12 |     def __hash__(self):
 13 |         return hash(self.globalID)
 14 | 
 15 |     def reattach(self, newParent, index = None):
 16 |         """
 17 |         Detach from current location and move to a new location
 18 |         in the tree
 19 |         """
 20 |         depthList = self.depthList()
 21 |         lookup = {}
 22 |         for n in depthList:
 23 |             lookup[n] = True
 24 |         assert not newParent in lookup
 25 |         self._detachFromParent()
 26 |     	newParent.attachChild(self, index)
 27 |         
 28 |     def attachChild(self, newChild, index = None):
 29 |         """
 30 |         Attach a (parentless) child. If the child has a parent
 31 |         already, call its reattach method
 32 |         """
 33 |         # Don't allow bidirectional parenthood
 34 |         assert not self is newChild
 35 |         if newChild.parent():
 36 |             raise AttachmentError, 'Cannot attach node:\n\n%s\n\nto:\n\n%s\n\nNode is already attached to\n\n%s' \
 37 |             % (newChild.prettyPrint(), self.prettyPrint(), newChild.parent().prettyPrint())
 38 |         if index == None:
 39 |             bisect.insort_right(self._children, newChild)
 40 |         else:
 41 |             self._children.insert(index, newChild)
 42 |         newChild.setParent(self)
 43 |         
 44 |         
 45 |     def _detachFromParent(self):
 46 |         self._parent.detachChild(self)
 47 |         self._parent = None
 48 |         
 49 |     def detachChild(self, node):
 50 |         """
 51 |         Detach a specific node. Deprecated; use node.prune()
 52 |         """
 53 |         self._children.remove(node)
 54 |         
 55 |         
 56 |     def setParent(self, node):
 57 |         """
 58 |         Set a node as parent. Does not add as child
 59 |         """
 60 |         assert not self._parent
 61 |         self._parent = node
 62 |  
 63 |         
 64 |     def prettyPrint(self):
 65 |         """
 66 |         Deprecated alias for __str__
 67 |         """
 68 |         return "(%s %s)" % (self.label, ' '.join([child.prettyPrint() for child in self.children()]))
 69 | 
 70 |     def parent(self):
 71 |         """
 72 |         Returns _parent
 73 |         Should be change to property, perhaps
 74 |         """
 75 |         return self._parent
 76 |         
 77 |         
 78 |     def child(self, index):
 79 |         """
 80 |         Returns the child at index
 81 |         """
 82 |         return self._children[index]    
 83 |         
 84 |     def children(self):
 85 |         """
 86 |         Generator for children
 87 |         """
 88 |         # Must use list copy, lest the list change out from under the iteration
 89 |         for child in list(self._children):
 90 |             yield child
 91 |             
 92 |     def insert(self, node):
 93 |         """
 94 |         Insert a node above self
 95 |         """
 96 |         self.parent().replace(self, node)
 97 |         node.attachChild(self)
 98 | 
 99 |     def delete(self):
100 |         """
101 |         Delete self from the tree, reattaching children to parent
102 |         """
103 |         parent = self.parent()
104 |         self.prune()
105 |         for node in self.children():
106 |             node.reattach(parent)
107 | 
108 |     def replace(self, currentChild, replacement):
109 |         """
110 |         Insert a new node where an old one was
111 |         """
112 |         index = self._children.index(currentChild)
113 |         if replacement.parent():
114 |             replacement.reattach(self, index)
115 |         else:
116 |             self.attachChild(replacement, index)
117 |         currentChild.prune()
118 |     
119 |     def prune(self):
120 |         """
121 |         Detach node from parent
122 |         """
123 |         self._detachFromParent()
124 |         
125 |     def sortChildren(self):
126 |         """
127 |         Sort children in-place. Should not be necessary, but just in case...
128 |         """
129 |         decorated = [(c.getWordID(0), c) for c in self._children]
130 |         decorated.sort()
131 |         self._children = [d[1] for d in decorated]
132 |         
133 |     def depthList(self):
134 |         """
135 |         Depth-first node list
136 |         """
137 |         # Avoid recursion, for speed
138 |         queue = list(self.children())
139 |         # Can't use enumerate because changing list in place
140 |         # Must stay 1 ahead of the current index
141 |         i = 0
142 |         for node in queue:
143 |             i += 1
144 |             if not node.isLeaf():
145 |                 for j, child in enumerate(node.children()):
146 |                     queue.insert(i+j, child)
147 |         return queue
148 |         
149 |     def breadthList(self):
150 |         """
151 |         Breadth-first node list
152 |         """
153 |         children = [child for child in self.children()]
154 |         for child in children:
155 |             for subChild in child.children():
156 |                 children.append(subChild)
157 |         return children
158 |         
159 |     def getWordID(self, index):
160 |         """
161 |         Word ID at index. Generally 0 or -1
162 |         """
163 |         wordIDList = [word.wordID for word in self.listWords()]
164 |         return wordIDList[index]
165 |         
166 |     def getWord(self, index):
167 |         """
168 |         Word ID at index. Generally 0 or -1
169 |         """
170 |         wordList = self.listWords()
171 |         if not wordList:
172 |             return None
173 |         return wordList[index]
174 |         
175 |     def listWords(self):
176 |         """
177 |         List the word yield of the node
178 |         """
179 |         return [n for n in self.depthList() if n.isLeaf()]
180 |         
181 |     def length(self, constraint = None):
182 |         """
183 |         Alias for __len__, except this allows a constraint function
184 |         """
185 |         if constraint == None:
186 |             return len(self._children)
187 |         else:
188 |             return len([c for c in self.children() if constraint(c)])
189 |     
190 |     def siblings(self):
191 |         """
192 |         Return a list of sibling nodes
193 |         """
194 |         return [s for s in self.parent().children() if s != self]
195 | 
196 |     def isLeaf(self):
197 |         return False
198 | 
199 |     def isRoot(self):
200 |         return False
201 | 
202 |     def root(self):
203 |         """
204 |         Return the Sentence node at the top of the tree
205 |         """
206 |         node = self
207 |         while not node.isRoot():
208 |             node = node.parent()
209 |         return node
210 | 
211 |     def isUnary(self):
212 |         if self.length() == 1 and not self.child(0).isLeaf():
213 |             return True
214 |         else:
215 |             return False
216 | 
217 |     def ancestors(self):
218 |         """
219 |         Generate parents
220 |         """
221 |         node = self
222 |         while not node.isRoot():
223 |             node = node.parent()
224 |             yield node
225 |     
226 | ##
227 | ##    def borders(self, node):
228 | ##        """
229 | ##        Decide whether the nodes form a contiguous span of words
230 | ##        """
231 | ##        first, second = sorted((self, node))
232 | ##        lastWord = first.getWord(-1)
233 | ##        firstWord = second.getWord(0)
234 | ##        if firstWord.wordID == lastWord.wordID + 1:
235 | ##            return True
236 | ##        else:
237 | ##            return False
238 | 
239 |     
240 |     # 'Rich comparison' must be used, because I want equality tests to check
241 |     # object identity, and less than/greater
242 |     # than comparisons to check ID for sorting
243 |     
244 |     def __eq__(self, other):
245 |         return bool(self is other)
246 |     
247 |     def __ne__(self, other):
248 |         return bool(self is not other)
249 |     
250 |     def __cmp__(self, obj):
251 |         """
252 |         The deprecated complicated (and crushingly slow) cmp is used in the SFG
253 |         stuff
254 |         """
255 |        # return cmp(self.globalID, obj.globalID)
256 |         selfID = float(self.getWordID(0))
257 |         objID = float(obj.getWordID(0))
258 |         if selfID == -1:
259 |             return 0
260 |         elif objID == -1:
261 |             return 0
262 |         else:
263 |             return cmp(selfID, objID)
264 | 
265 |     def __len__(self):
266 |         return self.length()
267 |         
268 |     def __nonzero__(self):
269 |         return True
270 |         
271 |     def __str__(self):
272 |         return self.prettyPrint()
273 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_CCGSentence.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import sys
  3 | 
  4 | from ._CCGNode import CCGNode
  5 | from ._CCGLeaf import CCGLeaf
  6 | from ._Sentence import Sentence
  7 | import ccg.scat
  8 | 
  9 | class CCGSentence(Sentence, CCGNode):
 10 |     def __init__(self, **kwargs):
 11 |         if 'string' in kwargs:
 12 |             node = self._parseString(kwargs.pop('string'))
 13 |         elif 'node' in kwargs:
 14 |             node = kwargs.pop('node')
 15 |         globalID = kwargs.pop('globalID')
 16 |         self.localID = kwargs.pop('localID')
 17 |         CCGNode.__init__(self, label="S", headIdx=0, **kwargs)
 18 |         self.globalID = globalID
 19 |         self.attachChild(node)
 20 |         self.headIdx = 0
 21 |         
 22 | 
 23 |     def sibling(self):
 24 |         return None
 25 | 
 26 |     def addPargDeps(self, pargDeps):
 27 |         headDeps = {}
 28 |         for pargDep in pargDeps:
 29 |             if len(pargDep) == 6:
 30 |                 i, j, catJ, argK, formI, formJ = pargDep
 31 |                 if formI == '-colon-':
 32 |                     formI = ':'
 33 |                 if formJ == '-colon-':
 34 |                     formJ = ':'
 35 |                 depType = 'L'
 36 |             elif len(pargDep) == 7:
 37 |                 i, j, catJ, argK, formI, formJ, depType = pargDep
 38 |             else:
 39 |                 print pargDeps
 40 |                 raise StandardError
 41 |             i = int(i)
 42 |             j = int(j)
 43 |             argK = int(argK)
 44 |             arg = self.getWord(i)
 45 |             head = self.getWord(j)
 46 |             if arg.text != formI or head.text != formJ:
 47 |                 if formI == 'null' or formJ == 'null':
 48 |                     continue
 49 |                 #else:
 50 |                 #    print >> sys.stderr, "Mismatched dependency"
 51 |                 #    return None
 52 |                 print self.globalID
 53 |                 print '\n'.join('%d-%s' % (w.wordID, w.text) for w in self.listWords())
 54 |                 print arg
 55 |                 print head
 56 |                 print formI
 57 |                 print formJ
 58 |                 print pargDep
 59 |                 print '\n'.join([' '.join(d) for d in pargDeps])
 60 |                 print self
 61 |                 raise StandardError, "Mismatched dependency"
 62 |             headDeps.setdefault(head, {}).setdefault(argK, []).append((arg, depType))
 63 |         # Initialise dependencies, so there's a slot there for unfilled deps
 64 |         for word in self.listWords():
 65 |             goldDeps = []
 66 |             for arg in word.parg.arguments:
 67 |                 goldDeps.append([])
 68 |             word.parg.goldDeps = goldDeps
 69 |         for head, itsDeps in headDeps.items():
 70 |             cat = head.parg
 71 |             for argNum, deps in itsDeps.items():
 72 |                 for dep in deps:
 73 |                     try:
 74 |                         cat.goldDeps[argNum - 1].append(dep)
 75 |                     except IndexError:
 76 |                     #    print >> sys.stderr, "Index error"
 77 |                     #    return None
 78 |                         print self.globalID
 79 |                         print '\n'.join('%d-%s' % (w.wordID, w.text) for w in self.listWords())
 80 |                         print head
 81 |                         print cat
 82 |                         print cat.arguments
 83 |                         print itsDeps
 84 |                         print cat.goldDeps
 85 |                         for p in pargDeps:
 86 |                             print p
 87 |                         cat.goldDeps[argNum - 1].append(dep)
 88 | 
 89 |     def unify_vars(self):
 90 |         """
 91 |         Traverse the nodes in the sentence, and unify their variables
 92 |         so that all nodes that have unified during the derivation have
 93 |         the same global variable.
 94 | 
 95 |         The nodes must be traversed bottom-up, and node labels must be
 96 |         replaced by the rule-product of their children. This is done
 97 |         because the parent nodes' annotations are not provided, and
 98 |         cannot be guessed. For example, in wsj_0200.0,
 99 |         there is the production:
100 |             (S[dcl]\NP)/(S[to]\NP) --> (((S[dcl]\NP)/(S[to]\NP))/NP NP
101 |         The annotation of the parent is _not_ the same as the one
102 |         in the markedup file for that category --- in
103 |         "it expects that to happen", "it" and "that" must not be
104 |         coindexed.
105 |         """
106 |         def unifyBranch(node):
107 |             """
108 |             Start at bottom left corner of the tree. Walk
109 |             upwards, at each point unifying the sibling
110 |             by calling this function.
111 |             """
112 |             # Begin at the parent of the first word below node
113 |             current = node.getWord(0).parent()
114 |             while current is not node:
115 |                 sibling = current.sibling()
116 |                 parent = current.parent()
117 |                 # Process the sibling's own subtree first, unless the
118 |                 # sibling's first child is a leaf
119 |                 if sibling and not sibling.child(0).isLeaf():
120 |                     unifyBranch(sibling)
121 | 
122 |                 curLab = current.label
123 |                 assert curLab
124 |                 sibLab = sibling.label if sibling else None
125 |                 parLab = parent.label
126 |                 # NOTE(review): ccg.rules is not imported in this module
127 |                 # (only ccg.scat is); presumably it is loaded as a side
128 |                 # effect of another import -- confirm
129 |                 production = ccg.rules.Production(curLab, sibLab, parLab)
130 |                 result = production.result
131 |                 # The parent is relabelled only when the production has a
132 |                 # result that passes exact_eq against the existing label
133 |                 if result and result.exact_eq(parLab):
134 |                     parent.label = result
135 |                 current = parent
136 |                 assert current.label
137 |         
138 |         unifyBranch(self.child(0))
139 |         # Now bind the variables to the words
140 |         for word in self.listWords():
141 |             word.stag.add_head(word)
142 |         # Fix conjunctions
143 |         # This is truly an evil hack, but it's very difficult to get it right.
144 |         # We first find conjunction nodes and get their conjuncted variables
145 |         # plus the set of nodes _outside_ their subtree (note that this
146 |         # "outside the subtree" part is what makes this so hard to do in
147 |         # pure unification)
148 |         # Once we have them, we search the nodes outside for variable sets
149 |         # that contain one but not all of the conjuncts, and then restore
150 |         # the missing ones.
151 |         conjVarSets = []
152 |         nodeSet = set(self.depthList())
153 |         for conjNode in nodeSet:
154 |             # Nodes with fewer than two children are skipped
155 |             if conjNode.length() < 2:
156 |                 continue
157 |             # A node is treated as a conjunction node when the label of
158 |             # its second child has conj set
159 |             if not conjNode.child(1).label.conj:
160 |                 continue
161 |             varSet = set(v.get_ref() for v in conjNode.label.get_vars())
162 |             nodesBelowConj = set(conjNode.depthList())
163 |             nodesBelowConj.add(conjNode)
164 |             nodesToCheck = nodeSet - nodesBelowConj
165 |             conjVarSets.append((varSet, nodesToCheck))
166 |         
167 |         for conjVars, nodes in conjVarSets:
168 |             for node in nodes:
169 |                 if node.isLeaf() or node.isRoot():
170 |                     continue
171 |                 scat = node.label
172 |                 for var, varSet in scat._var_table.items():
173 |                     varSet = set(v.get_ref() for v in varSet)
174 |                     # Only variable sets sharing a variable with the
175 |                     # conjunction are touched
176 |                     if not varSet.intersection(conjVars):
177 |                         continue
178 |                     words = set(v.word for v in varSet if v.word)
179 |                     # Add each conjunct variable whose word is missing
180 |                     for conjVar in conjVars:
181 |                         if conjVar.word not in words:
182 |                             scat.add_var(var, conjVar)
169 | 
    # Tokeniser for the bracketed tree text. Each match is either an opening
    # "(<...>" (group 1 is the "(" and group 2 the text between the angle
    # brackets) or a closing ")" (group 3 is an empty placeholder and group 4
    # is the ")"). The placeholder keeps the group count at 4 for
    # compatibility with the Root.parseString method.
    bracketsRE = re.compile(r'(\()<([^>]+)>|()(\))')
173 |     def _parseString(self, text):
174 |         # The algorithm here is roughly, find and build the nodes,
175 |         # and keep track of the parent. Then, later, connect the nodes together
176 |         # into a tree
177 |         # This is very similar to Root's, but it's not worth making
178 |         # both unreadable/slow to shoe-horn them together...
179 |         openBrackets = []
180 |         parentage = {}
181 |         nodes = {}
182 |         nWords = 0
183 |         for match in self.bracketsRE.finditer(text):
184 |             open_, nodeData, null, close = match.groups()
185 |             if open_:
186 |                 assert not close
187 |                 openBrackets.append((nodeData, match.start()))
188 |             else:
189 |                 assert close
190 |                 try:
191 |                     nodeData, start = openBrackets.pop()
192 |                 except:
193 |                     print text
194 |                     raise
195 |                 if nodeData.startswith('L'):
196 |                     newNode = self._makeLeaf(nodeData, nWords)
197 |                     nWords += 1
198 |                 else:
199 |                     newNode = self._makeNode(nodeData)
200 |                 if openBrackets:
201 |                     parentStart = openBrackets[-1][1]
202 |                     parentage[newNode] = parentStart
203 |                 else:
204 |                     top = newNode
205 |                 nodes[start] = newNode
206 |         # Can use Root's method for this bit
207 |         self._connectNodes(nodes, parentage)
208 |         return top
209 | 
210 |     def _makeNode(self, nodeData):
211 |         try:
212 |             T, cat, headIdx, nChildren = nodeData.split()
213 |         except:
214 |             print >> sys.stderr, nodeData
215 |             raise
216 |         return CCGNode(label=ccg.scat.SuperCat(cat), headIdx=int(headIdx))
217 | 
218 |     def _makeLeaf(self, nodeData, wordID):
219 |         L, cat, ccgPos, ptbPos, text, annotCat = nodeData.split()
220 |         if cat.endswith('/'):
221 |             cat = cat[1:-2]
222 |         if '@' in cat:
223 |             cat, srl_annot_str = cat.split('@')
224 |         else:
225 |             srl_annot_str = ''
226 |         # Check whether the @ is on the annotCat instead
227 |         if not srl_annot_str and '@' in annotCat:
228 |             annotCat, srl_annot_str = annotCat.split('@')
229 |         if cat.endswith('/'):
230 |             cat = cat[1:-2]
231 |         cat = ccg.scat.SuperCat(cat)
232 |         for srl_triple in srl_annot_str.split('_'):
233 |             if not srl_triple:
234 |                 continue
235 |             cat.srl_annot.add(tuple(srl_triple.replace('X', '_').split("'")))
236 |         parent = CCGNode(label=cat, headIdx=0)
237 |         leaf = CCGLeaf(label=ptbPos, pos=ccgPos, text=text,
238 |                        parg=cat, wordID=wordID)
239 |         parent.attachChild(leaf)
240 |         return parent
241 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_CCGNode.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from collections import defaultdict
  3 | 
  4 | import ccg.rules
  5 | import ccg.scat
  6 | from ._Node import Node
  7 | 
  8 | class CCGNode(Node):
  9 |     def __init__(self, **kwargs):
 10 |         self.headIdx = kwargs.pop('headIdx')
 11 |         label = kwargs.pop('label')
 12 |         self.srl_labels = defaultdict(list)
 13 |         Node.__init__(self, label)
 14 |         
 15 |     def isProduction(self, **kwargs):
 16 |         """
 17 |         Check whether the node matches a given production, by checking
 18 |         its label, and some combination of the labels of its sibling,
 19 |         parent, and children
 20 |         """
 21 |         # Do this first for speed
 22 |         selfType = kwargs.pop('selfType')
 23 |         assert not '^' in str(selfType)
 24 |         #if not ccg.isIdentical(self.label.morphLess(), selfType):
 25 |         #    return False
 26 |         # NB: breaking hats here!
 27 |         if not ccg.isIdentical(self.label, selfType):
 28 |             return False
 29 |         for nodeType, specified in kwargs.items():
 30 |             assert not '^' in str(specified)
 31 |             if nodeType == 'parent':
 32 |                 node = self.parent()
 33 |             elif nodeType == 'sibling':
 34 |                 node = self.sibling()
 35 |             elif nodeType == 'left':
 36 |                 if self.length() < 1:
 37 |                     return False
 38 |                 node = self.child(0)
 39 |             elif nodeType == 'right':
 40 |                 if self.length() < 2:
 41 |                     return False
 42 |                 node = self.child(1)
 43 |             else:
 44 |                 print nodeType
 45 |                 raise StandardError
 46 |             if not node:
 47 |                 return False
 48 |             try:
 49 |                 label = node.label.morphLess()
 50 |             except AttributeError:
 51 |                 label = ccg.scat.SuperCat(node.label).morphLess()
 52 |             if str(label) != str(specified):
 53 |                 return False
 54 |         return True
 55 |             
 56 |     
    def changeLabel(self, newLabel):
        """
        Replace the node's category with a new one, propagating the changes
        as appropriate. The propagation code is controlled by Production,
        using logic roughly documented in my thesis.

        newLabel is mutated: entry 0 of its _var_table is overwritten with
        entry 0 of the current label's table before it is installed.
        Always returns None.
        """
        # Nothing to do if the category is already exactly the requested one.
        if self.label.exact_eq(newLabel):
            return None
        #if ccg.isIdentical(newLabel, 'NP[nb]/N'):
        #    newLabel = ccg.category.from_string('NP/N')
        c0 = self.child(0)
        if c0.isLeaf():
            # Pre-terminal: install the label here and hand it to the word.
            newLabel._var_table[0] = self.label._var_table[0]
            self.label = newLabel
            c0.changeLabel(newLabel)
            return None
        if self.isUnary():
            # Don't produce unary rules like N --> S/S
            # Instead, create a new NP node and place it under self
            # This will make a S/S --> NP --> N chain
            # The exception is when we're adding arguments to NP.
            # Then what we want to do is add the arguments to N
            if newLabel.innerResult() == 'NP' and not newLabel.isAdjunct():
                if not newLabel.isComplex():
                    # NOTE(review): nLabel is never used on this path and
                    # control falls through to the code below; the four
                    # statements after add_args in the else branch may have
                    # been meant to run for both cases -- confirm.
                    nLabel = ccg.scat.SuperCat('N')
                else:
                    # Rebuild the new label's arguments on top of N and push
                    # that category down to the single child.
                    args = [(a, s, {'hat': h}) for (r, a, s, h) in
                            newLabel.deconstruct()]
                    nLabel = ccg.scat.add_args(ccg.scat.SuperCat('N'), args)
                    self.child(0).changeLabel(nLabel)
                    newLabel._var_table[0] = self.label._var_table[0]
                    self.label = newLabel
                    return None
        if self.length() == 2:
            # Binary node: let Production work out what the children must
            # become for the parent to be newLabel, and recurse into any
            # child whose category changes as a result.
            c1 = self.child(1)
            production = ccg.rules.Production(c0.label, c1.label, self.label)
            production.replace(newLabel)
            if not production.left.exact_eq(c0.label):
                c0.changeLabel(production.left)
            if c1 and not production.right.exact_eq(c1.label):
                c1.changeLabel(production.right)
        newLabel._var_table[0] = self.label._var_table[0]
        self.label = newLabel
100 |             
101 |     def sibling(self):
102 |         """
103 |         If there is a sibling, return it, else return None
104 |         """
105 |         for child in self.parent().children():
106 |             if child is not self:
107 |                 return child
108 |         return None
109 | 
110 |     def validate(self):
111 |         """
112 |         Check whether subtree composes
113 | 
114 |         Currently broken
115 |         """
116 |         for child in self.children():
117 |             if not child.validate():
118 |                 return False
119 |             if child.isLeaf():
120 |                 return True
121 |         if self.isRoot():
122 |             return True
123 |         label = self.label
124 |         left = self.child(0).label
125 |         if self.length() == 1:
126 |             right = None
127 |         else:
128 |             right = self.child(1).label
129 |         if ccg.validate(left, right, label):
130 |             return True
131 |         else:
132 |             w1 = self.getWord(0).globalID
133 |             return False
134 | 
135 |     def head(self):
136 |         """
137 |         Return the leaf node that the CCGbank indices designate as the head
138 |         Warning: These indices are sometimes unreliable, so this function
139 |         may give incorrect results.
140 | 
141 |         Warning++!! Be especially careful of bugs introduced during rebanking,
142 |         where the head indices have not been updated appropriately during node
143 |         movement.
144 |         """
145 |         head = self
146 |         while not head.isLeaf():
147 |             if head.headIdx >= head.length():
148 |                 #print >> sys.stderr, "Bad head idx: %s, %d" % (head, head.headIdx)
149 |                 head = head.child(-1)
150 |             else:
151 |                 head = head.child(head.headIdx)
152 |         return head
153 | 
154 |     def heads(self):
155 |         if self.headIdx >= self.length():
156 |                 print >> sys.stderr, "Bad head idx: %s" % self
157 |                 head = self.child(-1)
158 |         else:
159 |             head = self.child(self.headIdx)
160 |         heads = []
161 |         if head.sibling() and head.sibling().label.conj:
162 |             heads.extend(head.sibling().heads())
163 |         heads.extend(head.heads())
164 |         heads.sort()
165 |         return heads
166 |                 
167 | 
    def move(self, destination, headIdx):
        """
        ccg trees are binary branching, so moving a node means inserting a
        new level in the tree and deleting a level at the old destination.
        This function is not responsible for ensuring valid labels, but does
        check whether moves would cause crossing brackets, and checks whether
        any words are stranded. Requires an index noting head directionality,
        so that the head() function does not break.

        destination -- the node that self will become a sibling of; must not
                       be a leaf or self's current sibling.
        headIdx     -- head index given to the newly inserted parent node.

        Returns the new parent node, which carries a copy of destination's
        label. Raises StandardError if the move is a no-op, targets a leaf,
        would break the word sequence, or leaves the new parent without
        words.
        """
        if destination is self.sibling():
            raise StandardError, "Moving to current location!"
        if destination.isLeaf():
            raise StandardError, "Cannot move to leaf!"
        # Store the word list so that we can check it isn't disrupted
        origWords = ' '.join([w.text for w in self.root().listWords()])
        # Check for crossing brackets
        # NOTE(review): sorted() relies on the ordering defined for nodes
        # elsewhere; presumably it puts the leftmost node first -- confirm.
        firstNode, lastNode = sorted([self, destination])
        lastYield = lastNode.listWords()
        # Words of the first node that are not also covered by the last node.
        edgeWords = [w for w in firstNode.listWords() if w not in lastYield]
        if not edgeWords:
            print firstNode
            print lastNode
            raise StandardError
        rightEdge = edgeWords[-1]
        leftEdge = lastYield[0]
        # The two spans must be directly adjacent in the sentence.
        if rightEdge.wordID != (leftEdge.wordID - 1):
            raise StandardError, "Move would create non-contiguous word seq"
        # The actual move operation
        labelCopy = ccg.scat.SuperCat(destination.label)
        newParent = CCGNode(label=labelCopy, headIdx=headIdx)
        destination.insert(newParent)
        # Trim production by deleting sibling, moving its children up to parent
        # (both are looked up before anything is detached)
        oldParent = self.parent()
        oldSibling = self.sibling()
        oldSibling.prune()
        self.reattach(newParent)
        for node in oldSibling.children():
            node.reattach(oldParent)
        # Post-validation
        # Parent should have same head idx as before, as it has same children
        oldParent.headIdx = oldSibling.headIdx
        if not newParent.listWords():
            print self
            print destination
            raise StandardError
        # The sentence must read exactly as it did before the move.
        newWords = ' '.join([w.text for w in self.root().listWords()])
        if origWords != newWords:
            print origWords
            print newWords
            raise StandardError
        return newParent
219 |         
220 | 
221 |     def typeRaise(self, tCat, slash):
222 |         """
223 |         Add a type-raise node
224 |         """
225 |         assert not tCat.conj
226 |         assert not self.label.conj
227 |         innerSlash = '\\' if slash == '/' else '/'
228 |         newCat = ccg.scat.type_raise(tCat, slash, self.label)
229 |         newNode = CCGNode(headIdx=0, label=newCat)
230 |         self.insert(newNode)
231 | 
232 |     def isEntity(self, typeRequested=None):
233 |         """
234 |         Check whether the node spans an entity
235 |         """
236 |         words = self.listWords()
237 |         if not words[0].entity.startswith('B'):
238 |             return False
239 |         typeSeen = words[0].entity.split('-')[1]
240 |         if typeRequested and not typeSeen.startswith(typeRequested):
241 |             return False
242 |         matchTag = 'I-%s' % typeRequested
243 |         for w in words[1:]:
244 |             if not w.entity.startswith(matchTag):
245 |                 return False
246 |         nextWord = words[-1].nextWord()
247 |         if nextWord and nextWord.entity.startswith(matchTag):
248 |             # Don't allow sentence-final periods to be entities
249 |             if nextWord.text == '.' and not nextWord.nextWord():
250 |                 nextWord.entity = ''
251 |             else:
252 |                 return False
253 |         return True
254 | ##    def finalise(self):
255 | ##        """
256 | ##        Once the changes to the tree are complete, it is worth building final word
257 | ##        lists etc, and then telling methods to use them instead
258 | ##        """
259 | ##        self._wordList = self.listWords()
260 | ##        self._siblings = [s for s in self.parent().children() if s != self]
261 | ##        self._breadthList = self.breadthList()
262 | ##        self._depthList = self.depthList()
263 | ##        self._head = self.head()
264 | ##        self._finalised = True
265 | 
266 | ##    def addCatHeads(self):
267 | ##        return StandardError, "Currently Broken!"
268 | ##        # Find the highest left-side node with a head
269 | ##        left = self._findNode()
270 | ##        # Ensure that the node to the right of it has a head
271 | ##        left, right, parent = self._prepareJunction(left)
272 | ##        # Add the head
273 | ##        self.addCatHead(parent, left, right)
274 | 
275 | ##    def _findNode(self):
276 | ##        n = self
277 | ##        while not n.label.hasHead():
278 | ##            n = n.child(0)
279 | ##            if n.isLeaf():
280 | ##                n.parg.addHead(n.text)
281 | ##                n.parent().label.unify(n.parg)
282 | ##                return n.parent()
283 | ##        return n
284 | ##
285 | ##    def _prepareJunction(self, node):
286 | ##        while node.parent().length() == 1:
287 | ##            parent = node.parent()
288 | ##            ccg.combineChildren(parent.label, node.label, None)
289 | ##            if parent.isRoot():
290 | ##                return None, None
291 | ##            node = parent
292 | ##        left, right = node.parent().children()
293 | ##        if not left.label.hasHead():
294 | ##            left.addCatHeads()
295 | ##        if not right.label.hasHead():
296 | ##            right.addCatHeads()
297 | ##        return left, right, node
298 | 
299 | ##    def addCatHead(self, left, right, parent):
300 | ##        assert left.label.hasHead()
301 | ##        assert right.label.hasHead()
302 | ##        ccg.combineChildren(parent.label, left.label, right.label)        
303 | 


--------------------------------------------------------------------------------
/tests/test_rules.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import os.path
  3 | 
  4 | from ccg import rules
  5 | import ccg.scat
  6 | import ccg.category
  7 | import ccg.lexicon
  8 | import Treebank.CCGbank
  9 | 
# Load the lexicon once, at import time, before any test builds a category.
# NOTE(review): presumably the annotated strings the tests expect come from
# this default lexicon -- confirm.
ccg.lexicon.load()
 11 | 
 12 | #class TestCatPaths(unittest.TestCase):
 13 | #    def test_tv(self):
 14 | #        cat = ccg.scat.SuperCat('((S[dcl]{_}\NP{Y}){_}/NP{Y}){_}')
 15 | #        self.assertEqual(cat.cats[(0, 0)], 'S[dcl]')
 16 | #        self.assertEqual(cat.cats[(1, )], 'NP')
 17 | #        self.assertEqual(cat.cats[(0, 1)], 'NP')
 18 | #        self.assertEqual(cat.active_features[(0, 0)], 'S[dcl]')
 19 | #        self.assertEqual(cat.cats_by_var[1], ['NP', 'NP'])
 20 | #        self.assertEqual([str(c) for c in cat.cats_by_var[0]],
 21 | #                         ['(S[dcl]\NP)/NP', 'S[dcl]\NP', 'S[dcl]'])
 22 | 
 23 | #    def test_adv(self):
 24 | #        cat = ccg.scat.SuperCat('((S[X]{Y}\NP{Z}){Y}/(S[X]{Y}\NP{Z}){Y}){_}')
 25 | #        self.assertEqual(cat.cats[(0, 0)], 'S')
 26 | #        self.assertEqual(cat.cats[(1, 1)], 'NP')
 27 | #        self.assertEqual(
 28 | #            [str(c) for c in cat.cats_by_var[1]],
 29 | #            ['S\NP', 'S', 'S\NP', 'S'])
 30 | class MockToken(str):
 31 |     def __init__(self, text):
 32 |         self.text = text
 33 |         str.__init__(self, text)
 34 | 
 35 | 
 36 | 
 37 | class TestRules(unittest.TestCase):
 38 |     def test_fapply_basic(self):
 39 |         cat1 = ccg.scat.SuperCat(r'(NP{Y}/N{Y}){_}')
 40 |         cat2 = ccg.scat.SuperCat(r'N{_}')
 41 |         self.do_rule(rules.fapply, cat1, cat2, 'NP{_}')
 42 | 
 43 |     def test_fapply_feature(self):
 44 |         cat1 = ccg.scat.SuperCat('S/S')
 45 |         cat2 = ccg.scat.SuperCat('S[dcl]')
 46 |         self.do_rule(rules.fapply, cat1, cat2, 'S[dcl]{_}')
 47 |         self.do_rule(rules.fapply,
 48 |           ccg.scat.SuperCat('(S\NP)/(S\NP)'),
 49 |           ccg.scat.SuperCat(r'(S[pss]{_}\NP{Y}){_}'),
 50 |           '(S[pss]{_}\NP{Y}){_}')
 51 | 
 52 |     def test_bapply_basic(self):
 53 |         cat1 = ccg.scat.SuperCat(r'NP{_}')
 54 |         cat2 = ccg.scat.SuperCat(r'(S[dcl]{_}\NP{Y}){_}')
 55 |         self.do_rule(rules.bapply, cat1, cat2, 'S[dcl]{_}')
 56 |     
 57 |     def test_bapply_feature(self):
 58 |         cat1 = ccg.scat.SuperCat(r'S[ng]{_}')
 59 |         cat2 = ccg.scat.SuperCat(r'(S[X]{Y}\S[X]{Y}){_}')
 60 |         self.do_rule(rules.bapply, cat1, cat2, 'S[ng]{_}')
 61 | 
 62 |     def test_fcomp_basic(self):
 63 |         cat1 = ccg.scat.SuperCat(r'(NP{Y}/N{Y}){_}')
 64 |         cat2 = ccg.scat.SuperCat(r'(N{_}/PP{Y}){_}')
 65 |         self.do_rule(rules.fcomp, cat1, cat2, '(NP{_}/PP{Y}){_}')
 66 |     
 67 |     def test_fcomp_feature(self):
 68 |         cat1 = ccg.scat.SuperCat(r'(S[X]{Y}/S[X]{Y}){_}')
 69 |         cat2 = ccg.scat.SuperCat(r'(S[dcl]{_}/S[dcl]{Y}){_}')
 70 |         self.do_rule(rules.fcomp, cat1, cat2, '(S[dcl]{_}/S[dcl]{Y}){_}')
 71 |         cat1 = ccg.scat.SuperCat(r'((S[X]{Y}\NP{Z}){Y}/(S[X]{Y}\NP{Z}){Y}){_}')
 72 |         cat2 = ccg.scat.SuperCat(r'((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}')
 73 |         self.do_rule(rules.fcomp, cat1, cat2,
 74 |                       '((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}')
 75 | 
 76 |     def test_bcomp_basic(self):
 77 |         c1 = ccg.scat.SuperCat(r'(NP{Y}\NP{Y}){_}')
 78 |         c2 = ccg.scat.SuperCat(r'(S[dcl]{_}\NP{Y}){_}')
 79 |         self.do_rule(rules.bcomp, c1, c2, '(S[dcl]{Y}\NP{Z}){_}')
 80 |     
 81 |     def test_bcomp_feature(self):
 82 |         c1 = ccg.scat.SuperCat(r'(S[dcl]{_}\NP{Y}){_}')
 83 |         c2 = ccg.scat.SuperCat(r'(S[X]{Y}\S[X]{Y}){_}')
 84 |         self.do_rule(rules.bcomp, c1, c2, '(S[dcl]{_}\NP{Y}){_}')
 85 | 
 86 |     def test_bxcomp_basic(self):
 87 |         c1 = ccg.scat.SuperCat(r'((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}')
 88 |         c2 = ccg.scat.SuperCat(r'((S[X]{Y}\NP{Z}){Y}\(S[X]{Y}\NP{Z}){Y}){_}')
 89 |         self.do_rule(rules.bxcomp, c1, c2, c1.annotated)
 90 | 
 91 |     def test_gfcomp_basic(self):
 92 |         c1 = ccg.scat.SuperCat(r'(PP{_}/S[em]{Y}){_}')
 93 |         c2 = ccg.scat.SuperCat(r'((S[em]{_}/NP{Y}){_}/Q{Z}){_}')
 94 |         self.do_rule(rules.fcomp, c1, c2, '((PP{Y}/NP{Z}){_}/Q{W}){_}')
 95 | 
 96 |     def test_gfcomp_feature(self):
 97 |         c1 = ccg.scat.SuperCat(r'S/S')
 98 |         c2 = ccg.scat.SuperCat(r'(S[dcl]\NP)/NP')
 99 |         self.assertFalse(rules.fcomp(c1, c2))
100 |         self.do_rule(rules.fxcomp, c1, c2, '((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}')
101 | 
102 |     def test_gbxcomp_cross_slash(self):
103 |         c1 = ccg.scat.SuperCat(r'(S[dcl]\NP)/NP')
104 |         c2 = ccg.scat.SuperCat(r'S\S')
105 |         self.assertFalse(rules.bcomp(c1, c2))
106 |         self.do_rule(rules.bxcomp, c1, c2, '((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}')
107 | 
108 |     def test_gbxcomp_basic(self):
109 |         c1 = ccg.scat.SuperCat(r'((NP{_}/PP{Y}){_}/S[em]{Z}){_}')
110 |         c2 = ccg.scat.SuperCat(r'S[dcl]\NP')
111 |         self.do_rule(rules.bxcomp, c1, c2, '((S[dcl]{Y}/PP{Z}){_}/S[em]{W}){_}')
112 | 
113 |     def test_bgxcomp_feature(self):
114 |         c1 = ccg.scat.SuperCat('((S[dcl]\NP)/NP)/NP')
115 |         c2 = ccg.scat.SuperCat('(S\NP)\(S\NP)')
116 |         self.do_rule(rules.bxcomp, c1, c2, r'(((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}/NP{W}){_}')
117 |         c1 = ccg.scat.SuperCat('((S[dcl]\NP[expl])/(NP\NP))/NP')
118 |         c2 = ccg.scat.SuperCat('(S\NP)\(S\NP)')
119 |         self.do_rule(rules.bxcomp, c1, c2,
120 |                      r'(((S[dcl]{_}\NP[expl]{Y}){_}/(NP{Z}\NP{Z}){W}){_}/NP{V}){_}')
121 |         c1 = ccg.scat.SuperCat('((S[dcl]\NP[expl])/S[for])/(S[adj]\NP)')
122 |         c2 = ccg.scat.SuperCat('(S\NP)\((S\NP)/S[for])')
123 |         self.do_rule(rules.bxcomp, c1, c2, '((S[dcl]{_}\NP[expl]{Y}){_}/(S[adj]{Z}\NP{W}){Z}){_}')
124 |         c1 = ccg.scat.SuperCat('(S[qem]/S[dcl])\((NP\NP)/NP)')
125 |         c2 = ccg.scat.SuperCat('S\S')
126 |         self.do_rule(rules.bxcomp, c1, c2, '((S[qem]{_}/S[dcl]{Y}){_}\\((NP{Z}\\NP{Z}){W}/NP{V}){U}){_}')
127 | 
128 |     def test_add_conj(self):
129 |         c1 = ccg.scat.SuperCat('conj')
130 |         c2 = ccg.scat.SuperCat('S[dcl]{_}^(S[X]{Y}\S[X]{Y}){_}')
131 |         result = rules.add_conj(c1, c2)
132 |         self.assertEqual(result.string, 'S[dcl]^(S\S)[conj]')
133 |         c1 = ccg.scat.SuperCat('conj')
134 |         c2 = ccg.scat.SuperCat('S[dcl]\NP')
135 |         result = rules.add_conj(c1, c2)
136 |         self.assertEqual(result.annotated,
137 |                          '((S[dcl]{Y}\NP{Z}<1>){_}\(S[dcl]{W}\NP{Z}<1>){W}){_}')
138 |         c1 = ccg.scat.SuperCat('conj')
139 |         c2 = ccg.scat.SuperCat('S[pss]\NP')
140 |         result = rules.add_conj(c1, c2)
141 | 
142 |     def test_add_conj_head(self):
143 |         c1 = ccg.scat.SuperCat('conj')
144 |         c1.add_head(MockToken('and'))
145 |         c2 = ccg.scat.SuperCat('NP')
146 |         c2_head = MockToken('thing')
147 |         c2.add_head(c2_head)
148 |         result = rules.add_conj(c1, c2)
149 |         self.assertTrue(result.has_head(c2_head))
150 | 
151 |     def test_do_conj(self):
152 |         c1 = ccg.scat.SuperCat('S[X]\NP')
153 |         c2 = ccg.scat.SuperCat('S[dcl]\NP[conj]')
154 |         self.assertEqual(c2.annotated,
155 |             '((S[dcl]{Y}\NP{Z}<1>){_}\(S[dcl]{W}\NP{Z}<1>){W}){_}')
156 |         self.assertEqual(c2.annotated, 
157 |             '((S[dcl]{Y}\NP{Z}<1>){_}\(S[dcl]{W}\NP{Z}<1>){W}){_}')
158 |         self.assertFalse(rules.do_conj(c1, c2))
159 |         c1 = ccg.scat.SuperCat('S[dcl]\NP')
160 |         c1_head = MockToken('plays')
161 |         c1.add_head(c1_head)
162 |         c2_head = MockToken('is')
163 |         c2.add_head(c2_head)
164 |         result = rules.do_conj(c1, c2)
165 |         self.assertEqual(result, 'S[dcl]\NP')
166 |         self.assertTrue(result.has_head(c1_head))
167 |         self.assertTrue(result.has_head(c2_head))
168 | 
169 |     def test_comma_conj(self):
170 |         c1 = ccg.scat.SuperCat(':')
171 |         c2 = ccg.scat.SuperCat('NP')
172 |         self.assertEqual(rules.comma_conj(c1, c2).string, 'NP[conj]')
173 | 
174 |     def test_fcomp_tr(self):
175 |         c1 = ccg.scat.SuperCat('(S[X]{Y}/(S[X]{Y}\NP{_}){Y}){_}')
176 |         c2 = ccg.scat.SuperCat('(S[dcl]\NP)/NP')
177 |         result = rules.fcomp(c1, c2)
178 |         self.assertEqual(result, 'S[dcl]/NP')
179 | 
180 |     def test_feature(self):
181 |         c1 = ccg.scat.SuperCat('(N{_}/N[num]){_}')
182 |         c2 = ccg.scat.SuperCat('N[num]')
183 |         self.do_rule(rules.fapply, c1, c2, 'N{_}')
184 |                          
185 | 
186 |     def do_rule(self, rule, cat1, cat2, expected):
187 |         cat1str = cat1.string
188 |         cat2str = cat2.string
189 |         result = rule(cat1, cat2)
190 |         self.assertEqual(result, expected)
191 |         self.assertEqual(str(cat1), cat1str)
192 |         self.assertEqual(str(cat2), cat2str)
193 |         self.assertEqual(result.annotated, expected)
194 | 
195 | 
196 |     #def test_traise(self):
197 |     #    c1 = ccg.scat.SuperCat('NP')
198 |     #    par = ccg.scat.SuperCat('Q/(Q\NP)')
199 |     #    result = rules.traise(c1, par)
200 |     #    self.assertEqual(result, 'Q/(Q\NP)')
201 |     #    self.assertEqual(result.annotated, '(Q{Y}/(Q{Y}\NP{_}){Y}){_}')
202 |     
203 | 
204 |     def test_minimise(self):
205 |         cat = ccg.category.from_string('((S[dcl]{Y}\NP{Z}){Y}/NP{W}){Y}')
206 |         min, var_map = rules.minimise_vars(cat, {})
207 |         self.assertEqual(min.annotated, '((S[dcl]{_}\NP{Y}){_}/NP{Z}){_}')
208 |         cat = ccg.category.from_string('(S[dcl]{Y}\NP{Z}){Y}')
209 |         min, var_map = rules.minimise_vars(cat, {})
210 |         self.assertEqual(min.annotated, '(S[dcl]{_}\NP{Y}){_}')
211 | 
212 |     def test_badjunct_global(self):
213 |         c1 = ccg.scat.SuperCat('S[pss]\NP')
214 |         c2 = ccg.scat.SuperCat('(S\NP)\(S\NP)')
215 |         parent = ccg.scat.SuperCat('S[pss]\NP')
216 |         production = ccg.rules.Production(c1, c2, parent)
217 | 
218 |     def test_parent_annotation(self):
219 |         c1 = ccg.scat.SuperCat('((S[dcl]\NP)/(S[to]\NP))/NP')
220 |         c2 = ccg.scat.SuperCat('NP')
221 |         parent = ccg.scat.SuperCat('(S[dcl]\NP)/(S[to]\NP)')
222 |         production = ccg.rules.Production(c1, c2, parent)
223 |         self.assertEqual(production.result.annotated,
224 |                          '((S[dcl]{_}\NP[Y]{Y}<1>){_}/(S[to]{Z}<2>\NP[Y]{W*}){Z}){_}')
225 | 
class TestTrees(unittest.TestCase):
    """Check variable unification and head assignment on whole trees.

    Tree and category strings are raw strings so that their backslashes are
    never read as escape sequences.
    """

    def test_conj_plays(self):
        # Coordinated verbs: "plays and is" must share subject and object.
        elianti = (r'(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> '
                   r'(<L N/N NNP NNP Ms. N_254/N_254>) (<L N NNP NNP Haag N>) '
                   r') ) (<T S[dcl]\NP 0 2> (<T (S[dcl]\NP)/NP 1 2> '
                   r'(<L (S[dcl]\NP)/NP VBZ VBZ plays (S[dcl]\NP_241)/NP_242>) '
                   r'(<T (S[dcl]\NP)/NP[conj] 1 2> (<L conj CC CC and conj>) '
                   r'(<L (S[dcl]\NP)/NP VBZ VBZ is (S[dcl]\NP_241)/NP_242>) ) ) '
                   r'(<T NP 0 1> (<L N NNP NNP Elianti N>) ) ) )'
                   r'(<L . . . . .>))')
        sentence = Treebank.CCGbank.CCGSentence(string=elianti, globalID=0,
                                                localID=0)
        sentence.unify_vars()
        plays_and_is = sentence.getWord(2).parent().parent()
        annotated = plays_and_is.label.global_annotated()
        self.assertEqual(annotated,
                         r'((S[dcl]{plays,is}\NP{Haag}<1>){plays,is}/NP{Elianti}<2>){plays,is}')

    def test_conj_elianti(self):
        # Coordinated nouns: both conjuncts must head the N node.
        elianti = (r'(<T S[dcl] 0 2> (<T S[dcl] 1 2> (<T NP 0 1> (<T N 1 2> '
                   r'(<L N/N NNP NNP Ms. N_254/N_254>) (<L N NNP NNP Haag N>) '
                   r') ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP VBZ VBZ plays '
                   r'(S[dcl]\NP_241)/NP_242>) (<T NP 0 1> (<T N 1 2> '
                   r'(<L N NNP NNP Elianti N>) (<T N[conj] 1 2> '
                   r'(<L conj CC CC and conj>) (<L N NN NN Celamene NN>) ) )'
                   r') ) ) (<L . . . . .>) )')
        sentence = Treebank.CCGbank.CCGSentence(string=elianti, globalID=0,
                                                localID=0)
        sentence.unify_vars()
        elianti_and_celamene = sentence.getWord(-2).parent().parent().parent()
        annotated = elianti_and_celamene.label.global_annotated()
        self.assertEqual(annotated, 'N{Elianti,Celamene}')

    def test_fcomp_sadj(self):
        c1 = ccg.scat.SuperCat(r'(S[dcl]\NP)/(S[adj]\NP)')
        c1_head = MockToken('is')
        c1.add_head(c1_head)
        c2 = ccg.scat.SuperCat(r'(S[adj]\NP)/NP')
        c2_head = MockToken('worth')
        c2.add_head(c2_head)
        production = ccg.rules.Production(c1, c2)
        self.assertTrue(production.result.has_head(c2_head))

    def test_lex_vars_stay(self):
        # Needs a local copy of CCGbank at the hard-coded path below.
        ccgbank_loc = '/usr/local/data/CCGbank1.2'
        ccgbank = Treebank.CCGbank.CCGbank(path=ccgbank_loc)
        ccg.lexicon.load(os.path.join(ccgbank_loc, 'markedup'))
        asbestos = ccgbank.child(2).child(0)
        asbestos.unify_vars()
        # Every word must still head its own supertag after unification.
        for word in asbestos.listWords():
            self.assertTrue(word.stag.has_head(word))
278 | 
# Run the tests when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
281 | 


--------------------------------------------------------------------------------
/ccg/category.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from collections import defaultdict
  3 | 
  4 | import ccg.lexicon
  5 | 
# Variable letters used in annotated category strings. A category stores a
# variable as an index into this list ('_' is index 0).
VARS = ['_', 'Y', 'Z', 'W', 'V', 'U', 'T', 'S', 'R', 'P', 'Q', 'O']
# Features the inner result of an auxiliary must carry for ``is_true_aux``.
_FEATS = ['[dcl]', '[b]', '[pss]', '[ng]', '[pt]']
# Atom name, optional bracketed feature, optional trailing [conj] marker.
_ATOMIC_RE = re.compile(r'([a-zA-Z,\.;:]+)(\[[^\]]+\])?(\[conj\])?')
# Strings of the shape (S[x]\NP)/(S[y]\NP); drives ``is_aux``.
_AUX_RE = re.compile(r'\(S\[(\w+)\]\\NP\)/\(S\[(\w+)\]\\NP\)')
# Strings that start (after any opening brackets) with S carrying one of the
# listed features; drives ``is_predicate``.
_PRED_RE = re.compile(r'\(*S\[(b|dcl|ng|pss|pt|to)\]')
# Atomic category strings treated as punctuation by ``is_punct``.
_PUNCT = set([',', ':', ';', '.', "LQU", "RQU", "--", 'RRB', 'LRB'])
 12 | 
 13 | 
 14 | class Category(object):
 15 |     def __init__(self, result, slash='', argument=None, **kwargs):
 16 |         import ccg.scat
 17 |         if isinstance(result, ccg.scat.SuperCat):
 18 |             result = result.category
 19 |         if isinstance(argument, ccg.scat.SuperCat):
 20 |             argument = argument.category
 21 |         self.slash = slash
 22 |         self.argument = argument
 23 |         self.is_complex = bool(self.slash)
 24 |         if self.is_complex:
 25 |             self.result = result
 26 |         else:
 27 |             self.result = self
 28 |             self._cat = result if isinstance(result, str) else result.cat
 29 |         self.kwargs = kwargs
 30 |         self.hat = kwargs.get('hat')
 31 |         self.conj = kwargs.get('conj', False)
 32 |         self.var = kwargs.get('var', 0)
 33 |         self.var2 = kwargs.get('var2', -1)
 34 |         self.asterisk = kwargs.get('asterisk', False)
 35 |         self.feat_var = kwargs.get('feat_var')
 36 |         self.feature = kwargs.get('feature', '')
 37 |         self.arg_idx = kwargs.get('arg_idx')
 38 | 
 39 |         if self.is_complex:
 40 |             str_get = self._complex_strings
 41 |             cat_get = self._complex_cats
 42 |         else:
 43 |             str_get = self._atomic_strings
 44 |             cat_get = self._atomic_cats
 45 |         self.cats, self.cats_by_var, self.active_features = cat_get()
 46 |         self.next_var = max(self.cats_by_var) + 1
 47 |         self.cat, self.string, self.annotated = str_get()
 48 |         if not '^' in self.string:
 49 |             self.hatless = self.string
 50 |         else:
 51 |             self.hatless = None
 52 | 
 53 |         # Higher-order attributes. Could be properties, but I'm assuming
 54 |         # categories are immutable, and this should be more efficient
 55 |         self.str_as_piece = '(%s)' % self if (self.is_complex and not
 56 |                                               self.hat) else self.string
 57 |         # Result leaf is at (0, 0, ...) with the longest path
 58 |         self.inner_result = max((p, c) for p, c in self.cats.items()
 59 |                                 if not any(p))[1]
 60 |         self.is_predicate = bool(_PRED_RE.match(self.string))
 61 |         self.is_adjunct = (self.result.exact_eq(self.argument) 
 62 |                            and self.result.var == self.argument.var
 63 |                            and all(c for (p, c) in self.result.cats.items()
 64 |                                    if self.argument.cats[p].var == c.var))
 65 |         self.has_adjunct = any(r[0].is_adjunct for r in self.deconstruct())
 66 |         self.is_aux = bool(_AUX_RE.match(self.string))
 67 |         self.is_true_aux = self.is_aux and self.inner_result.feature in _FEATS
 68 |         self.is_punct = (not self.is_complex and self.string in _PUNCT)
 69 |         self.is_type_raise = (self.is_complex
 70 |                               and self.argument.is_complex
 71 |                               and self.slash != self.argument.slash
 72 |                               and self.result.exact_eq(self.argument.result))
 73 |         self.forward = bool(self.is_complex and self.slash == '/')
 74 |         self.backward = bool(self.is_complex and self.slash == '\\')
 75 | 
 76 |     def __eq__(self, other):
 77 |         """
 78 |         Check whether the featureless version of the
 79 |         other category matches self. Note that this means
 80 |         equality is not commutative
 81 |         """
 82 |         if self is other:
 83 |             return True
 84 |         if isinstance(other, str):
 85 |             other = from_string(other)
 86 |         if self.is_complex != other.is_complex:
 87 |             return False
 88 |         # Fail on feature or hat if it's there and doesnt match
 89 |         if self.feature and other.feature and self.feature != other.feature:
 90 |             return False
 91 |         if self.hat and other.hat and self.hat != other.hat:
 92 |             return False
 93 |         if self.slash != other.slash:
 94 |             return False
 95 |         s_cats = self.cats
 96 |         o_cats = other.cats
 97 |         if len(s_cats.keys()) != len(o_cats.keys()):
 98 |             return False
 99 |         for path, s_cat in s_cats.items():
100 |             if path not in o_cats:
101 |                 return False
102 |             if s_cat.is_complex:
103 |                 continue
104 |             o_cat = o_cats[path]
105 |             if s_cat.cat != o_cat.cat:
106 |                 return False
107 |             if (s_cat.feature and o_cat.feature 
108 |                 and s_cat.feature != o_cat.feature):
109 |                 return False
110 |             if s_cat.hat and o_cat.hat and s_cat.hat != o_cat.hat:
111 |                 return False
112 |         return True 
113 | 
114 |     def __ne__(self, other):
115 |         """
116 |         Apparently != doesn't call __eq__. Boo, hiss.
117 |         """
118 |         if not self == other:
119 |             return True
120 |         else:
121 |             return False
122 | 
123 |     def __str__(self):
124 |         return self.string
125 | 
126 |     def __hash__(self):
127 |         return hash(str(self))
128 | 
129 |     def __repr__(self):
130 |         return str(self)
131 | 
132 |     def __setattr__(self, attr, value):
133 |         """
134 |         Make Categories immutable by ensuring values
135 |         that have been set can never be over-written
136 |         """
137 |         if attr in self.__dict__:
138 |             raise AttributeError(attr)
139 |         else:
140 |             self.__dict__[attr] = value
141 | 
142 |     non_s_feat_re = re.compile(r'(?<!S)\[\w+]+')
143 |     def exact_eq(self, other):
144 |         if self is other:
145 |             return True
146 |         elif other is None:
147 |             return False
148 |         if self.conj != other.conj:
149 |             return False
150 |         else:
151 |             # Succeed if features are different on non-S nodes
152 |             self_str = self.non_s_feat_re.sub('', self.string)
153 |             other_str  =self.non_s_feat_re.sub('', other.string)
154 |             return self_str == other_str
155 |         #elif isinstance(other, str):
156 |         #    return self.string == other
157 |         #elif isinstance(other, Category):
158 |         #    return self.string == other.string
159 |         #elif isinstance(other, ccg.scat.SuperCat):
160 |         #    return self.string == other.string
161 |         #else:
162 |         #    return False
163 | 
164 |     def deconstruct(self):
165 |         """
166 |         Yields result, argument, slash and kwargs for
167 |         each node on result branch of the category tree
168 |         """
169 |         cat = self
170 |         while cat.is_complex:
171 |             yield cat.result, cat.argument, cat.slash, self.kwargs
172 |             cat = cat.result
173 | 
174 |     # Backwards compatibility
175 |     def isPredicate(self):
176 |         return self.is_predicate
177 | 
178 |     def isAdjunct(self):
179 |         return self.is_adjunct
180 | 
181 |     def isPunct(self):
182 |         return self.is_punct
183 | 
184 |     def isAux(self):
185 |         return self.is_aux
186 | 
187 |     def isTrueAux(self):
188 |         return self.is_true_aux
189 | 
190 |     def isTypeRaise(self):
191 |         return self.is_type_raise
192 | 
193 |     def adjunctResult(self):
194 |         return self.adjunct_result
195 | 
196 |     def innerResult(self):
197 |         return self.inner_result
198 | 
199 |     def isAux(self):
200 |         return self.is_aux
201 | 
202 |     def isComplex(self):
203 |         return self.is_complex
204 | 
205 |     @property
206 |     def morph(self):
207 |         return self.hat
208 | 
209 |     @property
210 |     def hasMorph(self):
211 |         return '^' in self.string
212 | 
213 |     def morphLess(self, as_piece=False):
214 |         if as_piece and self.is_complex:
215 |             return '(%s)' % self.hatless
216 |         else:
217 |             return self.hatless
218 | 
219 |     def featLess(self):
220 |         return self.featless
221 | 
222 |     # Unsupported: heads, headGen, addHead, unify,
223 |     # headShare, headRef, strAsPiece, dependencies,
224 |     # goldDependencies, fullPrint
225 | 
226 | 
227 |     def _atomic_strings(self):
228 |         hat_str = '^%s' % self.hat.str_as_piece if self.hat else ''
229 |         feat_str = self.feature
230 |         pieces = [self._cat, feat_str, hat_str]
231 | 
232 |         feat_annot = self.feat_var if self.feat_var else self.feature
233 |         hat_annot = '^%s' % self.hat.annotated if self.hat else ''
234 |         asterisk = '*' if self.asterisk else ''
235 |         var2 = ',%s' % VARS[self.var2] if self.var2 >= 0 else ''
236 |         arg_idx = '<%s>' % self.arg_idx if self.arg_idx else ''
237 |         var_str = '{%s%s%s}%s' % (VARS[self.var], var2, asterisk, arg_idx)
238 |         annot_pieces = [self._cat, feat_annot, hat_annot, var_str]
239 | 
240 |         if self.conj:
241 |             pieces.append('[conj]')
242 |             annot_cat = '%s{Y}' % ''.join(annot_pieces[:-1])
243 |             annotated = '(%s\%s){%s}' % (annot_cat, annot_cat, VARS[self.var])
244 |         else:
245 |             annotated = ''.join(annot_pieces)
246 |         return self._cat, ''.join(pieces), annotated
247 | 
248 |     def _complex_strings(self):
249 |         res_str = self.result.str_as_piece
250 |         arg_str = self.argument.str_as_piece
251 |         cat = '%s%s%s' % (res_str, self.slash, arg_str)
252 |         if self.hat:
253 |             cat = '(%s)^%s' % (cat, self.hat.str_as_piece)
254 | 
255 |         res_annot = self.result.annotated
256 |         arg_annot = self.argument.annotated
257 |         asterisk = '*' if self.asterisk else ''
258 |         arg_idx = '<%s>' % self.arg_idx if self.arg_idx else ''
259 |         var_annot = '{%s%s}%s' % (VARS[self.var], asterisk, arg_idx)
260 |         hat_annot = '^%s' % self.hat.annotated if self.hat else ''
261 |         annot_cat = '(%s%s%s)%s%s' % (res_annot, self.slash, arg_annot,
262 |                                       var_annot, hat_annot)
263 |         
264 |         # All this effort to get the correct annotation for conj
265 |         # categories, when it (probably?) doesn't matter...
266 |         if self.conj:
267 |             non_conj = from_string(cat)
268 |             var_map = dict((v, v+1) for v in non_conj.cats_by_var)
269 | 
270 |             result = ccg.rules.remap_vars(non_conj, var_map)
271 |             result_str = result.annotated[:-3] + '{_}'
272 |             var_map = dict((v, v) for v in result.cats_by_var)
273 |             var_map[1] = max(var_map.keys()) + 1
274 |             arg = ccg.rules.remap_vars(result, var_map).annotated
275 |             annot_cat = '(%s\%s){_}' % (result_str, arg)
276 |             string = '%s[conj]' % cat
277 |         else:
278 |             string = cat
279 |         return cat, string, annot_cat
280 | 
281 |  
282 |     def _atomic_cats(self):
283 |         cats = {(): self}
284 |         cats_by_var = {self.var: [self]}
285 |         if self.var2 >= 0:
286 |             cats_by_var[self.var2] = [self]
287 |         active_features = {(): self} if self.feature else {}
288 |         return cats, cats_by_var, active_features
289 | 
290 |     def _complex_cats(self):
291 |         # Get list of all cats in tree and their position
292 |         cats = {tuple(): self}
293 |         active_features = {}
294 |         cats_by_var = defaultdict(list)
295 |         cats_by_var[self.var].append(self)
296 |         if self.var2 >= 0 and self.var2 != self.var:
297 |             cats_by_var[self.var2].append(self)
298 |         for piece, path_prefix in ((self.result, 0), (self.argument, 1)):
299 |             for path, cat in piece.cats.items():
300 |                 cats[(path_prefix,) + path] = cat
301 |             for var, cat_list in piece.cats_by_var.items():
302 |                 cats_by_var[var].extend(cat_list)
303 |             for path, cat in piece.active_features.items():
304 |                 active_features[(path_prefix,) + path] = cat
305 |         return cats, cats_by_var, active_features
306 | 
307 | 
# Trailing variable annotation: {V}, {V,W} or {V*}
var_re = re.compile(r'\{(\w)(?:,(\w))?(\*)?\}$')
def from_string(cat_str, **kwargs):
    """
    Parse a plain or annotated category string into a Category.

    With no keyword arguments, a string already present in
    ccg.lexicon.CATS is returned straight from that table. Otherwise the
    trailing ``<n>`` argument index, the ``[conj]`` marker, a top-level hat
    and the trailing ``{...}`` variable annotation are peeled off in turn
    and recorded in ``kwargs`` before the remainder is handed to the atomic
    or complex parser. ``[nb]`` features are discarded.
    """
    assert cat_str
    assert cat_str.count('(') == cat_str.count(')')
    cat_str = cat_str.replace('[nb]', '')
    known_cats = ccg.lexicon.CATS
    if not kwargs and cat_str in known_cats:
        return known_cats[cat_str]
    # Add a kwarg to stop subpieces being looked up in CATS
    kwargs['top'] = False
    if cat_str.endswith('>'):
        # Strip the three-character suffix and keep the character inside it
        kwargs['arg_idx'] = cat_str[-2]
        cat_str = cat_str[:-3]

    if cat_str.endswith('[conj]'):
        kwargs['conj'] = True
        cat_str = cat_str[:-len('[conj]')]
        if cat_str in known_cats:
            # Re-parse the known annotation, now flagged as a conjunct
            return from_string(known_cats[cat_str].annotated, **kwargs)
    else:
        kwargs.setdefault('conj', False)

    # Handle top-level hat
    if '^' in cat_str and cat_str.endswith('{_}'):
        assert 'hat' not in kwargs
        base_str, _, hat_str = cat_str.partition('^')
        # Balanced brackets before the hat mean the hat is not nested
        if base_str.count('(') == base_str.count(')'):
            kwargs['hat'] = from_string(hat_str)
            return from_string(base_str, **kwargs)

    var_match = var_re.search(cat_str)
    if var_match is not None:
        first_var, second_var, star = var_match.groups()
        kwargs['asterisk'] = star
        kwargs['var'] = VARS.index(first_var)
        if second_var:
            kwargs['var2'] = VARS.index(second_var)
        cat_str = _strip_brackets(cat_str[:var_match.start()])

    has_slash = '/' in cat_str or '\\' in cat_str
    parse = _parse_complex if has_slash else _parse_atomic
    return parse(cat_str, kwargs)
360 | 
def _parse_atomic(cat_str, kwargs):
    """
    Build an atomic Category from a slash-free string such as ``N`` or
    ``S[dcl]``, optionally followed by ``^`` and a hat category.

    ``kwargs`` is updated in place (``hat``, ``feat_var`` or ``feature`` may
    be added) and is then passed on to ``Category``.

    Raises ValueError when ``cat_str`` does not start with a valid atom.
    """
    if '^' in cat_str:
        cat_str, hat_str = cat_str.split('^', 1)
        kwargs['hat'] = from_string(hat_str)
    assert cat_str
    match = _ATOMIC_RE.match(cat_str)
    if match is None:
        # ValueError is a StandardError subclass on Python 2, so existing
        # handlers still catch it, and unlike StandardError it also exists
        # on Python 3.
        raise ValueError(cat_str)
    # The [conj] group is matched by the pattern but not used here
    atom, feature, _conj = match.groups()
    if feature:
        # feature includes its brackets; an upper-case first letter (e.g.
        # [X]) is stored as a feature variable rather than a feature
        if feature[1].isupper():
            kwargs['feat_var'] = feature
        else:
            kwargs['feature'] = feature
    return Category(atom, **kwargs)
376 | 
377 | 
def _parse_complex(cat_str, kwargs):
    """
    Build a Category from a string that contains a slash.

    The string is split at the first slash found outside all brackets; both
    sides are parsed with ``from_string`` and combined using ``kwargs``. If
    no such slash exists, the string is split at the first top-level ``^``
    instead: the right-hand side becomes ``kwargs['hat']`` and the left-hand
    side is parsed with those kwargs.

    Raises ValueError when the brackets are unbalanced.
    """
    depth = 0
    first_hat = None
    if cat_str.count('(') != cat_str.count(')'):
        # ValueError is a StandardError subclass on Python 2, so existing
        # handlers still catch it, and it also exists on Python 3.
        raise ValueError(cat_str)
    for i, c in enumerate(cat_str):
        if c == '(':
            depth += 1
        elif c == ')':
            depth -= 1
        elif depth == 0:
            if c == '/' or c == '\\':
                result = from_string(_strip_brackets(cat_str[:i]))
                argument = from_string(_strip_brackets(cat_str[i + 1:]))
                return Category(result, c, argument, **kwargs)
            elif c == '^' and first_hat is None:
                first_hat = i
        assert depth >= 0
    # No top-level slash: the string must be a bracketed category with a hat
    assert first_hat is not None
    kwargs['hat'] = from_string(_strip_brackets(cat_str[first_hat + 1:]))
    return from_string(_strip_brackets(cat_str[:first_hat]), **kwargs)
404 | 
405 | 
406 | def _strip_brackets(cat_str):
407 |     if not (cat_str.startswith('(') and cat_str.endswith(')')):
408 |         return cat_str
409 |     depth = 0
410 |     for c in cat_str:
411 |         if c == '(':
412 |             depth += 1
413 |         elif c == ')':
414 |             depth -= 1
415 |         if depth == 0 and (c == '/' or c == '\\' or c == '^'):
416 |             return cat_str
417 |     else:
418 |         return cat_str[1:-1]
419 | 
420 | 
421 | 


--------------------------------------------------------------------------------
/tests/test_replace.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Test category replacement, for changeLabel
  3 | """
  4 | import unittest
  5 | import os.path
  6 | import random
  7 | 
  8 | import ccg.scat
  9 | import ccg.rules
 10 | import ccg.lexicon
 11 | import ccg.grammar
 12 | 
 13 | ccg.lexicon.load()
 14 | 
 15 | class TestReplace(unittest.TestCase):
 16 |     def test_fapply_basic(self):
 17 |         c1 = ccg.scat.SuperCat('PP/NP')
 18 |         c2 = ccg.scat.SuperCat('NP')
 19 |         parent = ccg.scat.SuperCat('PP')
 20 |         production = ccg.rules.Production(c1, c2, parent)
 21 |         production.replace(ccg.scat.SuperCat('NP'))
 22 |         assert production.left == 'NP/NP'
 23 |         assert production.left.annotated == '(NP{_}/NP{Y}<1>){_}'
 24 |         
 25 |     def test_fapply_adjunct(self):
 26 |         c1 = ccg.scat.SuperCat('N/N')
 27 |         c2 = ccg.scat.SuperCat('N')
 28 |         production = ccg.rules.Production(c1, c2)
 29 |         production.parent = production.result
 30 |         production.replace(ccg.scat.SuperCat('PP'))
 31 |         assert production.left == 'PP/PP'
 32 |         assert production.left.annotated == '(PP{Y}/PP{Y}){_}'
 33 |         assert production.right == 'PP'
 34 |         
 35 |     def test_fapply_adjunct_feature(self):
 36 |         c1 = ccg.scat.SuperCat('S/S')
 37 |         c2 = ccg.scat.SuperCat('S[dcl]')
 38 |         production = ccg.rules.Production(c1, c2)
 39 |         production.parent = production.result
 40 |         production.replace(ccg.scat.SuperCat('N[num]'))
 41 |         assert production.left.annotated == '(N[X]{Y}/N[X]{Y}){_}'
 42 |         assert production.right == 'N[num]'
 43 |         assert production.parent == 'N[num]'
 44 |         assert ccg.rules.fapply(production.left, production.right) == 'N[num]'
 45 | 
 46 |     def test_fcomp_adjunct(self):
 47 |         c1 = ccg.scat.SuperCat('NP/NP')
 48 |         c2 = ccg.scat.SuperCat('NP/N')
 49 |         parent = ccg.scat.SuperCat('NP/N')
 50 |         production = ccg.rules.Production(c1, c2, parent=parent)
 51 |         production.replace('(S[adj]\NP)/(S[adj]\NP)')
 52 |         production.replace(parent)
 53 |         self.assertEqual(production.left, c1)
 54 |         self.assertEqual(production.right, c2)
 55 | 
 56 |     def test_bapply_basic(self):
 57 |         c1 = ccg.scat.SuperCat('NP')
 58 |         c2 = ccg.scat.SuperCat('S[dcl]\NP')
 59 |         production = ccg.rules.Production(c1, c2)
 60 |         production.parent = production.result
 61 |         production.replace(ccg.scat.SuperCat('S[em]'))
 62 |         assert production.right == 'S[em]\NP'
 63 |         assert production.right.annotated == '(S[em]{_}\NP{Y}<1>){_}'
 64 | 
 65 |     def test_fcomp_basic(self):
 66 |         c1 = ccg.scat.SuperCat('NP/N')
 67 |         c2 = ccg.scat.SuperCat('N/N')
 68 |         production = ccg.rules.Production(c1, c2)
 69 |         production.parent = production.result
 70 |         assert production.rule == 'fcomp'
 71 |         production.replace(ccg.scat.SuperCat('PP/N'))
 72 |         assert production.left == 'PP/N'
 73 |         c1 = ccg.scat.SuperCat('NP/N')
 74 |         c2 = ccg.scat.SuperCat('(N/PP)/S[em]')
 75 |         production = ccg.rules.Production(c1, c2)
 76 |         production.parent = production.result
 77 |         production.replace(ccg.scat.SuperCat('(NP/S[em])/PP'))
 78 |         assert production.left == 'NP/N'
 79 |         assert production.right == '(N/S[em])/PP'
 80 |         c1 = ccg.scat.SuperCat('NP/N')
 81 |         c2 = ccg.scat.SuperCat('(N/PP)/S[em]')
 82 |         production = ccg.rules.Production(c1, c2)
 83 |         production.parent = production.result
 84 |         production.replace(ccg.scat.SuperCat('((PP{_}/PP{Y}){_}/S[em]{Z}){_}'))
 85 |         self.assertEqual(production.left, 'PP/N')
 86 |         self.assertEqual(production.right, '(N/PP)/S[em]')
 87 | 
 88 |     def test_fcomp2(self):
 89 |         c1 = ccg.scat.SuperCat('((S[dcl]\NP)/PP)/PP')
 90 |         c2 = ccg.scat.SuperCat('PP/NP')
 91 |         production = ccg.rules.Production(c1, c2)
 92 |         production.parent = production.result
 93 |         assert production.rule == 'fcomp'
 94 |         production.replace(ccg.scat.SuperCat('((S[ng]\NP)/NP)/NP'))
 95 |         self.assertEqual(production.right, 'PP/NP')
 96 | 
 97 |     def test_bxcomp(self):
 98 |         # (NP\NP)/NP NP\NP --> (NP\NP)/NP
 99 |         # (((S[dcl]\NP)/(S[to]\NP))/PP)/NP
100 |         c1 = ccg.scat.SuperCat('(NP\NP)/NP')
101 |         c2 = ccg.scat.SuperCat('NP\NP')
102 |         production = ccg.rules.Production(c1, c2, rule='bxcomp')
103 |         production.parent = production.result
104 |         assert production.parent == '(NP\NP)/NP'
105 |         production.replace('(((S[dcl]\NP)/(S[to]\NP))/PP)/NP')
106 |         self.assertEqual(production.left, '(((S[dcl]\NP)/(S[to]\NP))/PP)/NP')
107 |         self.assertEqual(production.right, '(S\NP)\(S\NP)')
108 |         production.replace('(NP\NP)/NP')
109 |         print production.rule
110 |         self.assertEqual(production.left.string, c1.string)
111 |         self.assertEqual(production.right.string, c2.string)
112 | 
113 | 
114 | 
115 |     def test_fcomp_aux(self):
116 |         # (S[dcl]\NP)/(S[ng]\NP) (S[ng]\NP)/NP --> (S[dcl]\NP)/NP
117 |         # to:
118 |         # (S/S)/(S[ng]\NP) (S[ng]\NP)/S[dcl] --> (S/S)/S[dcl]
119 |         c1 = ccg.scat.SuperCat('(S[dcl]\NP)/(S[ng]\NP)')
120 |         c2 = ccg.scat.SuperCat('(S[ng]\NP)/NP')
121 |         production = ccg.rules.Production(c1, c2)
122 |         production.parent = production.result
123 |         assert production.rule == 'fcomp'
124 |         assert production.parent == '(S[dcl]\NP)/NP'
125 |         production.replace(ccg.scat.SuperCat('(S/S)/S[dcl]'))
126 |         self.assertEqual(production.left, '(S/S)/(S[ng]\NP)')
127 |         self.assertEqual(production.right, '(S[ng]\NP)/S[dcl]')
128 |     
129 |     def test_bcomp_traise(self):
130 |         # ((S[dcl]\NP)/PP)/NP (S\NP)\((S\NP)/PP) --> (S[dcl]\NP)/NP
131 |         # (((S[b]\NP)/PP)/(S[b]\NP))/NP (S\NP)\((S\NP)/PP)
132 |         # --> ((S[b]\NP)/(S[b]\NP))/NP
133 |         c1 = ccg.scat.SuperCat('((S[dcl]\NP)/PP)/NP')
134 |         c2 = ccg.scat.SuperCat('(S\NP)\((S\NP)/PP)')
135 |         production = ccg.rules.Production(c1, c2)
136 |         production.parent = production.result
137 |         production.replace(ccg.scat.SuperCat('((S[b]\NP)/(S[b]\NP))/NP'))
138 |         self.assertEqual(production.left, '(((S[b]\NP)/PP)/(S[b]\NP))/NP')
139 | 
140 |     def test_badjunct(self):
141 |         # (S[pt]\NP)/S[em] (S\NP)\(S\NP) --> (S[pt]\NP)/S[em]
142 |         # PP
143 |         # New left: PP
144 |         # New right: PP\PP
145 |         # (S[pt]\NP)/S[em] (S[pt]\NP)/S[em]
146 |         # ((S\NP)/S)\((S\NP)/S) (S\NP)\(S\NP)
147 |         # badjunct
148 |         c1 = ccg.scat.SuperCat('(S[pt]\NP)/S[em]')
149 |         c2 = ccg.scat.SuperCat('(S\NP)\(S\NP)')
150 |         production = ccg.rules.Production(c1, c2, rule='bxcomp')
151 |         production.parent = production.result
152 |         assert production.rule == 'badjunct'
153 |         production.replace(ccg.scat.SuperCat('PP'))
154 |         production.replace(ccg.scat.SuperCat('(S[pt]\NP)/S[em]'))
155 |         self.assertEqual(production.left, c1)
156 |         self.assertEqual(production.right.string, c2.string)
157 | 
158 | 
159 | 
160 |     def test_make_adjunct(self):
161 |         cat = ccg.scat.SuperCat('N/N')
162 |         stripped = ccg.rules.strip_features(cat)
163 |         self.assertEqual(stripped.annotated, '(N[X]{Y}/N[X]{Y}<1>){_}')
164 |         adjunct = ccg.scat.make_adjunct(cat, '/')
165 |         self.assertEqual(adjunct, '(N/N)/(N/N)')
166 |         
167 |     def test_punct(self):
168 |         c1 = ccg.scat.SuperCat('PP/NP')
169 |         c2 = ccg.scat.SuperCat("RQU")
170 |         parent = ccg.scat.SuperCat('PP/NP')
171 |         production = ccg.rules.Production(c1, c2, parent=parent)
172 |         production.replace(ccg.scat.SuperCat('S[dcl]\NP'))
173 |         assert production.right == "RQU"
174 |         assert production.left == 'S[dcl]\NP'
175 |         c1 = ccg.scat.SuperCat(',')
176 |         c2 = ccg.scat.SuperCat('(S\NP)/(S\NP)')
177 |         production = ccg.rules.Production(c1, c2, rule='left_punct')
178 |         production.parent = production.result
179 |         assert production.parent.annotated == c2.annotated
180 |         production.replace(ccg.scat.SuperCat('NP/N'))
181 |         assert production.left == ','
182 |         assert production.right.annotated == '(NP{Y}/N{Y}<1>){_}'
183 |     
184 |     def test_add_conj(self):
185 |         c1 = ccg.scat.SuperCat('conj')
186 |         c2 = ccg.scat.SuperCat('PP/NP')
187 |         production = ccg.rules.Production(c1, c2)
188 |         production.parent = production.result
189 |         production.replace(ccg.scat.SuperCat('S[dcl]\NP[conj]'))
190 |         assert production.right == 'S[dcl]\NP'
191 |         assert production.left == 'conj'
192 | 
193 |     def test_do_conj(self):
194 |         c1 = ccg.scat.SuperCat('S[dcl]\NP')
195 |         c2 = ccg.scat.SuperCat('S[dcl]\NP[conj]')
196 |         production = ccg.rules.Production(c1, c2)
197 |         production.parent = production.result
198 |         production.replace('(S[dcl]\NP)/NP')
199 |         assert production.left == '(S[dcl]\NP)/NP'
200 |         assert production.right == '(S[dcl]\NP)/NP[conj]'
201 | 
202 |     def test_type_raise1(self):
203 |         c1 = ccg.scat.SuperCat('S/(S\NP)')
204 |         c2 = ccg.scat.SuperCat('(S[dcl]\NP)/NP')
205 |         production = ccg.rules.Production(c1, c2)
206 |         production.parent = production.result
207 |         production.replace('((S[dcl]\NP)/NP)/NP')
208 |         self.assertEqual(production.left, c1.string)
209 |         self.assertEqual(production.left.annotated, c1.annotated)
210 | 
211 |     def test_type_raise2(self):
212 |         # S/(S\NP) (S[dcl]\NP)/NP --> S[dcl]/NP
213 |         # (N/N)/((N/N)\NP) ((N/N)\NP)/N --> (N/N)/N
214 |         c1 = ccg.scat.SuperCat('S/(S\NP)')
215 |         c2 = ccg.scat.SuperCat('(S[dcl]\NP)/NP')
216 |         production = ccg.rules.Production(c1, c2)
217 |         production.parent = production.result
218 |         assert production.rule == 'ftraise_comp'
219 |         production.replace('(N/N)/N')
220 |         self.assertEqual(production.left, '(N/N)/((N/N)\NP)')
221 |         self.assertEqual(production.right, '((N/N)\NP)/N')
222 |         production.replace('S[dcl]/NP')
223 |         self.assertEqual(production.right, c2)
224 |         self.assertEqual(production.left, c1)
225 | 
226 |     def test_type_raise3(self):
227 |         # (S[pss]\NP)/(S[adj]\NP) (S\NP)\((S\NP)/(S[adj]\NP)) -->
228 |         # S[pss]\NP
229 |         # ((S/S)/(S[ad]\NP))\NP (S/S)\((S/S)/(S[adj]\NP)) -->
230 |         # (S/S)\NP
231 |         c1 = ccg.scat.SuperCat('(S[pss]\NP)/(S[adj]\NP)')
232 |         c2 = ccg.scat.SuperCat('(S\NP)\((S\NP)/(S[adj]\NP))')
233 |         production = ccg.rules.Production(c1, c2)
234 |         production.parent = production.result
235 |         production.replace(ccg.scat.SuperCat('(S/S)\NP'))
236 |         self.assertEqual(production.left.string,
237 |                          '((S/S)\NP)/(S[adj]\NP)')
238 |         self.assertEqual(production.right.string,
239 |                          '((S/S)\NP)\(((S/S)\NP)/(S[adj]\NP))')
240 |         production.replace(ccg.scat.SuperCat('S[pss]\NP'))
241 |         self.assertEqual(production.left.string, c1.string)
242 | 
243 |     def test_type_raise4(self):
244 |         # ((S[pt]\NP)/PP)/NP (S\NP)\((S\NP)/PP) --> (S[pt]\NP)/NP
245 |         # ((S[q]/PP)/(S[pss]\NP))/NP S\(S/PP) -->
246 |         # (S[q]/(S[pss]\NP))/NP
247 |         c1 = ccg.scat.SuperCat('((S[pt]\NP)/PP)/NP')
248 |         c2 = ccg.scat.SuperCat('(S\NP)\((S\NP)/PP)')
249 |         production = ccg.rules.Production(c1, c2)
250 |         production.parent = production.result
251 |         assert production.rule == 'btraise_comp'
252 |         production.replace(ccg.scat.SuperCat('(S[q]/(S[pss]\NP))/NP'))
253 |         self.assertEqual(production.left.string, '((S[q]/PP)/(S[pss]\NP))/NP')
254 |         self.assertEqual(production.right.string, 'S\(S/PP)')
255 |         production.replace(ccg.scat.SuperCat('(S[pt]\NP)/NP'))
256 |         self.assertEqual(production.left.string, c1.string)
257 |         self.assertEqual(production.right.string, c2.string)
258 | 
259 | 
260 | 
261 | 
262 |     def test_feature_passing(self):
263 |         c1 = ccg.scat.SuperCat('(S[X]{Y}/(S[X]{Y}/NP{_}){Y}){_}')
264 |         c2 = ccg.scat.SuperCat('S[dcl]/NP')
265 |         production = ccg.rules.Production(c1, c2)
266 |         production.parent = production.result
267 |         production.replace('S[ng]')
268 |         self.assertEqual(production.left, c1)
269 | 
270 |     def test_replace_bug(self):
271 |         # (S[b]\NP)/(S[ng]\NP) S[ng]\NP --> S[b]\NP
272 |         # (((S[b]\NP)/(S[to]\NP))/(S[adj]\NP))/NP[expl]
273 |         left = ccg.scat.SuperCat('(S[b]\NP)/(S[ng]\NP)')
274 |         right = ccg.scat.SuperCat('S[ng]\NP')
275 |         parent = ccg.scat.SuperCat('S[b]\NP')
276 |         production = ccg.rules.Production(left, right, parent=parent)
277 |         replacement = ccg.scat.SuperCat(
278 |             '(((S[b]\NP)/(S[to]\NP))/(S[adj]\NP))/NP[expl]')
279 |         production.replace(replacement)
280 | 
281 | 
282 |     def test_determiner_apply_replace(self):
283 |         left = ccg.scat.SuperCat('NP/N')
284 |         right = ccg.scat.SuperCat('N')
285 |         production = ccg.rules.Production(left, right)
286 |         production.parent = production.result
287 |         replacement = ccg.scat.SuperCat('NP/PP')
288 |         production.replace(replacement)
289 |         self.assertEqual(production.left.annotated, '(NP{Y}/N{Y}<1>){_}')
290 |         self.assertEqual(production.right.annotated, '(N{_}/PP{Y}<1>){_}')
291 | 
292 | 
293 |     def test_possessive_apply_replace(self):
294 |         left = ccg.scat.SuperCat('NP/(N/PP)')
295 |         right = ccg.scat.SuperCat('N/PP')
296 |         production = ccg.rules.Production(left, right)
297 |         production.parent = production.result
298 |         replacement = ccg.scat.SuperCat('NP/PP')
299 |         production.replace(replacement)
300 |         self.assertEqual(production.left, 'NP/(N/PP)')
301 |         self.assertEqual(production.right, '(N/PP)/PP')
302 | 
303 |     def test_all_round_trips(self):
304 |         """
305 |         Test a round-trip replacement for every production rule
306 |         """
307 |         random.seed(0)
308 |         grammar_loc = os.path.join(os.path.split(__file__)[0],
309 |                                    'wsjfull.grammar')
310 |         cats = ccg.lexicon.CATS.values()
311 |         for parent, left, right, freq in ccg.grammar.read(grammar_loc):
312 |             if right is None:
313 |                 continue
314 |             if left not in ccg.lexicon.CATS or \
315 |                right not in ccg.lexicon.CATS:
316 |                 continue
317 |             # Ignore these productions, where I prefer my answer:
318 |             # 5: (PP/NP)/(PP/NP) PP/NP --> PP/NP RTed to PP/PP PP/NP --> PP/NP
319 |             if parent == 'PP/NP' and left == '(PP/NP)/(PP/NP)' \
320 |                and right == 'PP/NP':
321 |                 continue
322 |             # 4: ((S[adj]\NP)/PP)/((S[adj]\NP)/PP) (S[adj]\NP)/PP 
323 |             # --> (S[adj]\NP)/PP RTed to (S[adj]\NP)/(S[adj]\NP) on left
324 |             if left == '((S[adj]\NP)/PP)/((S[adj]\NP)/PP)' \
325 |                and right == '(S[adj]\NP)/PP' and parent == '(S[adj]\NP)/PP':
326 |                 continue
327 |             # 2: ((S[dcl]\NP)/(S[adj]\NP))/NP (S\NP)\(((S\NP)/(S[adj]\NP))/NP)
328 |             # --> S[dcl]\NP
329 |             # Broken category
330 |             if right == '(S\NP)\(((S\NP)/(S[adj]\NP))/NP)':
331 |                 continue
332 |             #print "%d: %s %s --> %s" % (freq, left, right, parent)
333 |             c1 = ccg.scat.SuperCat(left)
334 |             c1_annot = c1.annotated
335 |             c1_str = c1.string
336 |             c2 = ccg.scat.SuperCat(right)
337 |             c2_annot = c2.annotated
338 |             c2_str = c2.string
339 |             parent = ccg.scat.SuperCat(parent)
340 |             production = ccg.rules.Production(c1, c2, parent=parent)
341 |             if production.left.is_type_raise \
342 |                and production.right.is_type_raise:
343 |                 continue
344 |             rule = production.rule
345 |             replace_with = random.choice(cats)
346 |             replacement = ccg.scat.SuperCat(replace_with)
347 |             if parent.conj:
348 |                 replacement = ccg.scat.change_kwarg(replacement, conj=True)
349 |             production.replace(replacement)
350 |             # Don't expect RT if replacement forces rule change
351 |             if production.rule != rule:
352 |                 continue
353 |             #print 'New left: %s' % production.left.string
354 |             #print 'New right: %s' % production.right.string
355 |             production.replace(parent)
356 |             # Accept (S\NP)|(S\NP) for S|S
357 |             if production.left.string == '(S\NP)/(S\NP)' and c1_str == 'S/S':
358 |                 continue
359 |             elif production.right.string == '(S\NP)\(S\NP)' and c2_str == 'S\S':
360 |                 continue
361 |             self.assertEqual(production.left.string, c1_str)
362 |             self.assertEqual(production.right.string, c2_str)
363 | 
364 | 
365 | 
# Run the unittest command-line runner when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
368 | 


--------------------------------------------------------------------------------
/ccg/scat.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | import re
  3 | 
  4 | import ccg.category
  5 | import ccg.rules
  6 | 
  7 | class SuperCat(object):
  8 |     """
  9 |     A top-level category, participating in a derivation. Manages
 10 |     variable coindexation between a CCG category and HLDS terms.
 11 |     Tracks variable coindexation
 12 |     during productions. Unlike Category objects, is mutable.
 13 |     """
 14 |     def __init__(self, category, hlds=None, word_bindings=None):
 15 |         if isinstance(category, str):
 16 |             category = ccg.category.from_string(category)
 17 |         elif isinstance(category, SuperCat):
 18 |             category = category.category
 19 |         else:
 20 |             assert isinstance(category, ccg.category.Category)
 21 |         # Have a unique variable ID for each category variable.
 22 |         # Store the mapping from unique IDs to category vars and vice versa
 23 |         var_table = {}
 24 |         for var in category.cats_by_var:
 25 |             var_id = Variable()
 26 |             var_table[var] = set([var_id])
 27 |         self._var_table = var_table
 28 |         self.category = category
 29 |         self.hlds_children = defaultdict(set)
 30 |         self.hlds_parents = defaultdict(set)
 31 |         self.hlds_passed = set()
 32 |         self.srl_annot = set()
 33 |     
 34 | 
 35 |     def __getattr__(self, attr):
 36 |         if attr in self.__dict__:
 37 |             return self.__dict__[attr]
 38 |         elif hasattr(self.category, attr):
 39 |             return getattr(self.category, attr)
 40 |         else:
 41 |             raise AttributeError(attr)
 42 |  
 43 |     def __eq__(self, other):
 44 |         return self.category == other
 45 | 
 46 |     def __ne__(self, other):
 47 |         return self.category != other
 48 | 
 49 |     def __str__(self):
 50 |         return str(self.category)
 51 | 
 52 |     def __hash__(self):
 53 |         return hash(self.category)
 54 | 
 55 |     def __repr__(self):
 56 |         return repr(self.category)
 57 | 
 58 |     def global_annotated(self):
 59 |         annotated = self.annotated.replace('*}', '}')
 60 |         for var in self.cats_by_var:
 61 |             global_vars = [v for v in self._var_table[var] if v.word is not None]
 62 |             try:
 63 |                 global_vars = sorted(global_vars, key=lambda gv: gv.word)
 64 |             except:
 65 |                 raise
 66 |             global_str = '{%s}' % ','.join(str(v) for v in global_vars)
 67 |             var_str = ccg.category.VARS[var]
 68 |             annotated = annotated.replace('{%s}' % var_str, global_str)
 69 |         return annotated
 70 | 
 71 |     def bind_vars(self, other, self_cat, other_cat):
 72 |         """
 73 |         Unify the global variables of a piece of this scat against
 74 |         the piece of another scat. Use the other's variable
 75 |         table to retrieve the other's global variables.
 76 |         """
 77 |         assert self_cat == other_cat
 78 |         to_unify = set()
 79 |         for path, self_piece in self_cat.cats.items():
 80 |             other_piece = other_cat.cats[path]
 81 |             self_var = self_piece.var
 82 |             other_var = other_piece.var
 83 |             if not self.can_unify(other, self_var, other_var):
 84 |                 return False
 85 |             to_unify.add((self_piece.var, other_piece.var))
 86 |         for self_var, other_var in to_unify:
 87 |             self_cats = self_cat.cats_by_var[self_var]
 88 |             other_cats = other_cat.cats_by_var[other_var]
 89 |             self.unify_globals_at_var(other, self_var, other_var)
 90 |         return True
 91 | 
 92 |     def add_hlds_child(self, relation, global_var):
 93 |         """
 94 |         Set a child dependency
 95 |         """
 96 |         self.hlds_children[global_var].add(relation)
 97 | 
 98 |     def add_hlds_parent(self, relation, global_var):
 99 |         self.hlds_parents[global_var].add(relation)
100 | 
101 |     def add_hlds_passed(self, parent_var, relation, child_var):
102 |         self.hlds_passed.add((parent_var, relation, child_var))
103 |         
104 |    
105 |     def get_vars(self, cat=None):
106 |         if cat is None:
107 |             cat = self
108 |         return set(v.get_ref() for v in self._var_table[cat.var])
109 | 
110 |     def add_var(self, i, var):
111 |         ref = var.get_ref()
112 |         self._var_table[i] = set(v.get_ref() for v in self._var_table[i])
113 |         self._var_table[i].add(ref)
114 | 
115 |     def unify_globals_at_var(self, other, var, other_var=None):
116 |         if other_var is None:
117 |             other_var = var
118 |         s_vars = self._var_table[var]
119 |         o_vars = other._var_table[other_var]
120 |         if len(s_vars) == len(o_vars) == 1:
121 |             list(s_vars)[0].unify(list(o_vars)[0])
122 |         else:
123 |             list(s_vars)[0].unify(list(o_vars)[0])
124 |             # The unification is not complete here, which may cause problems.
125 |             # But cannot unify other to both in self, or self vars will
126 |             # be unified to each other :(
127 |             var_set = set([v.get_ref() for v in s_vars.union(o_vars)])
128 |             self._var_table[var] = var_set
129 |             other._var_table[other_var] = var_set
130 | 
131 |     def can_unify(self, other, var, other_var):
132 |         s_vars = self._var_table[var]
133 |         s_words = set([s_var.word for s_var in s_vars])
134 |         o_vars = other._var_table[other_var]
135 |         for s_var in s_vars:
136 |             s_word = s_var.word
137 |             if not s_word:
138 |                 continue
139 |             for o_var in o_vars:
140 |                 o_word = o_var.word
141 |                 # Patch for conjunction, see wsj_0047.11 for eg
142 |                 # May be bad idea?
143 |                 if o_word and o_word not in s_words:
144 |                     return False
145 |         return True
146 | 
147 | 
148 |     def has_head(self, word, cat=None):
149 |         for v in self.get_vars(cat):
150 |             if v.word is word:
151 |                 return True
152 |         else:
153 |             return False
154 | 
155 |     def has_dep(self, word):
156 |         if self.has_head(word):
157 |             return False
158 |         for r, a, s, k in self.deconstruct():
159 |             if self.has_head(word, a):
160 |                 return True
161 |         return False
162 | 
163 |     def add_head(self, word):
164 |         s_vars = self.get_vars()
165 |         #assert len(s_vars) == 1
166 |         for v in s_vars:
167 |             # Sadly this fails too often :(
168 |             # When it does it indicates a real problem, but the problems
169 |             # are quite difficult to solve...
170 |             #assert not v.word
171 |             v.word = word
172 | 
173 |     def heads(self, cat=None):
174 |         if cat is None:
175 |             cat = self
176 |         return sorted(set([v.word for v in self.get_vars(cat) if v.word]))
177 | 
178 |     def deconstruct(self):
179 |         for r, a, s, k in self.category.deconstruct():
180 |             k = dict(k)
181 |             k['arg_global_vars'] = self.get_vars(a)
182 |             yield r, a, s, k
183 | 
184 | 
185 |     def cats_at_global(self, global_var):
186 |         """
187 |         Find all cats whose vars map to this var's value.
188 |         Can't simply have a reverse index, because var's values change
189 |         on unification
190 |         """
191 |         cats = set()
192 |         val = global_var.val
193 |         for cat_var, var_set in self._var_table.items():
194 |             for var in var_set:
195 |                 if var.val == val:
196 |                     for cat in self.cats_by_var[cat_var]:
197 |                         cats.add(cat)
198 |         return cats
199 | 
200 |     def all_globals(self):
201 |         global_vars = set()
202 |         for var_set in self._var_table.values():
203 |             global_vars.update(v.get_ref() for v in var_set)
204 |         return global_vars
205 | 
206 |     def map_letters_to_words(self):
207 |         """
208 |         Return a dictionary mapping letter-variables e.g. _, Y, Z
209 |         to word sets, e.g. {'_': set(Pierre, Holly)}, where
210 |         Pierre and Holly are CCGLeaf instances
211 |         """
212 |         mapping = {}
213 |         for var, cats in self.cats_by_var.items():
214 |             letter_var = ccg.category.VARS[var]
215 |             heads = self.heads(cats[0])
216 |             mapping[letter_var] = heads
217 |         return mapping
218 | 
219 |     def add_srl_annot_from_srl_string(self, srl_annot_str):
220 |         """
221 |         Populate the srl_annot set with triples from an 
222 |         srl_string. srl_strings look like X'P:A0'Y_X'P:A1'Z
223 |         """
224 |         assert not self.srl_annot, '%s %s' % (self.srl_annot, self.annotated)
225 |         if srl_annot_str == '@':
226 |             return None
227 |         for srl_triple in srl_annot_str.split('_'):
228 |             if not srl_triple:
229 |                 continue
230 |             srl_triple = srl_triple.replace('X', '_')
231 |             srl_triple = srl_triple.replace('E_T', 'EXT')
232 |             head_letter, label, child_letter = srl_triple.split("'")
233 |             head_var = ccg.category.VARS.index(head_letter)
234 |             child_var = ccg.category.VARS.index(child_letter)
235 |             if head_var not in self._var_table or child_var not in self._var_table:
236 |                 err = "Var not found from srl_string %s for cat %s"
237 |                 raise StandardError, err % (srl_string, self.annotated)
238 |             srl_tuple = tuple(srl_triple.split("'"))
239 |             self.srl_annot.add(srl_tuple)
240 | 
241 | 
242 | 
243 |     def convert_hlds_to_srl_annot(self):
244 |         """
245 |         For each SRL label bound to the category, print
246 |         X label Y, where X and Y are the local variables
247 |         for the head and child.
248 |         """
249 |         labels = set()
250 |         for var, srl_labels in self.hlds_parents.items():
251 |             for cat in self.cats_at_global(var):
252 |                 for label in srl_labels:
253 |                     # Child of hlds_parents is always own lexical variable
254 |                     labels.add((ccg.category.VARS[cat.var], label, '_'))
255 |         for var, srl_labels in self.hlds_children.items():
256 |             for cat in self.cats_at_global(var):
257 |                 for label in srl_labels:
258 |                     # Parent of hlds_children is always own lexical variable
259 |                     labels.add(('_', label, ccg.category.VARS[cat.var]))
260 |         for var1, label, var2 in self.hlds_passed:
261 |             for cat1 in self.cats_at_global(var1):
262 |                 for cat2 in self.cats_at_global(var2):
263 |                     labels.add((ccg.category.VARS[cat1.var], label,
264 |                                 ccg.category.VARS[cat2.var]))
265 |         self.srl_annot = labels
266 | 
267 | 
268 |     annot_strip_re = re.compile(r'<\d>')
269 |     var_find_re = re.compile(r'(?<={)[A-Z]')
270 |     def srl_string(self):
271 |         """
272 |         Create an annotated string referencing semantic roles, and
273 |         markedup entries for the role dependencies
274 |         """
275 |         triple_strs = ["'".join(triple).replace('_', 'X') for triple in
276 |                        self.srl_annot]
277 |         triple_strs.sort()
278 |         stag_annot =  '_'.join(triple_strs)
279 |         stag_str = '%s@%s' % (self.string, stag_annot)
280 |         seen_vars = set()
281 |         roles = []
282 |         for head, label, child in self.srl_annot:
283 |             if head == '_' and child == '_':
284 |                 roles.append(('_', label, ' %l %l'))
285 |                 continue
286 |             elif head == '_':
287 |                 var = child
288 |                 lf = '%l %f'
289 |             elif child == '_':
290 |                 var = head
291 |                 lf = '%f %l'
292 |             else:
293 |                 raise Exception
294 |             seen_vars.add(var)
295 |             roles.append((var, label, lf))
296 |         for var, cats in self.cats_by_var.items():
297 |             var = ccg.category.VARS[var]
298 |             if var == 0 or var in seen_vars:
299 |                 continue
300 |             for cat in cats:
301 |                 if cat.arg_idx:
302 |                     seen_vars.add(var)
303 |                     roles.append((var, 'ignore', ''))
304 |         var_to_args = {}
305 |         for v in self.var_find_re.findall(self.annotated):
306 |             if v in seen_vars:
307 |                 var_to_args.setdefault(v, len(var_to_args) + 1)
308 |         roles = ['%d %s %s' % (var_to_args.get(v, 0), l, lf) for v, l, lf in roles]
309 |         roles.sort()
310 |         annotated = self.annot_strip_re.sub('', self.annotated)
311 |         # Add argument numbers to string
312 |         # We need to do the replacement at the rightmost point,
313 |         # so reverse the string and add the replacement backwards
314 |         annotated = ''.join(reversed(annotated))
315 |         # Remove the *'s, as they're irrelevant to us
316 |         # Um why do we need the rightmost point?
317 |         for var, arg_num in var_to_args.items():
318 |             var_annot = '}%s{' % var
319 |             #var_annot = '{%s}' % var
320 |             assert var_annot in annotated, annotated + ' ' + var_annot
321 |             var_arg = ('>%d<' % arg_num) + var_annot
322 |             #var_arg = '%s<%d>' % (var_annot, arg_num)
323 |             annotated = annotated.replace(var_annot, var_arg, 1)
324 |         annotated = annotated.replace('*', '')
325 |         annotated = ''.join(reversed(annotated)) # Unreverse now that we're done
326 |         # Append the @ annotation to the annotated string
327 |         annotated = '%s@%s' % (annotated, stag_annot)
328 |         return len(var_to_args), stag_str, annotated, roles
329 |     
330 |     def srl_deps_from_annot(self):
331 |         var_map = dict((var, i) for i, var in enumerate(ccg.category.VARS))
332 |         for head_var, label, child_var in sorted(self.srl_annot):
333 |             head_globals = self._var_table[var_map[head_var]]
334 |             child_globals = self._var_table[var_map[child_var]]
335 |             for head_global in head_globals:
336 |                 for child_global in child_globals:
337 |                     if head_global.word and child_global.word:
338 |                         yield head_global.word, label, child_global.word
339 | 
340 | 
class Variable(object):
    """
    A global variable used for coindexation.

    Each instance gets a fresh integer value from a class-wide counter.
    Unifying two variables links one to the other through ``_ref``; the
    value and word of a variable are always read from (and written to)
    the variable found at the end of its ``_ref`` chain.
    """
    _next = 0

    def __init__(self):
        fresh = Variable._next + 1
        Variable._next = fresh
        self._val = fresh
        self._ref = None
        self._word = None

    def __eq__(self, other):
        # Equal when both resolve to the same value
        mine = self.val
        theirs = other.val
        return mine == theirs

    def __ne__(self, other):
        if self == other:
            return False
        return True

    def __cmp__(self, other):
        return cmp(self.val, other.val)

    def __hash__(self):
        return hash(self.val)

    @property
    def val(self):
        """Integer value of the variable this one resolves to."""
        root = self.get_ref()
        return root._val

    @property
    def word(self):
        """Word stored on the variable this one resolves to."""
        root = self.get_ref()
        return root._word

    @word.setter
    def word(self, word):
        root = self.get_ref()
        root._word = word

    def get_ref(self):
        """Follow ``_ref`` links and return the last variable reached."""
        current = self
        while True:
            following = current._ref
            if following is None:
                return current
            current = following

    def __str__(self):
        root = self.get_ref()
        if not root._word:
            return 'v%d' % root._val
        return root._word.text

    def __repr__(self):
        return str(self)

    def unify(self, other):
        """
        Link ``other``'s resolved variable to this one's, sharing a word if
        exactly one side has one. Does nothing if both already resolve to
        the same variable.
        """
        if self is other:
            return None
        keeper = self.get_ref()
        absorbed = other.get_ref()
        if keeper is absorbed:
            return None
        ### nicky_random_debugging_destruction - commented out:
        ### wsj_0023.3 (percent) breaks with this assert statement. 33 % of ...
        ### assert not (keeper._word and absorbed._word)
        absorbed._ref = keeper
        if absorbed._word and not keeper._word:
            keeper._word = absorbed._word
        if keeper._word and not absorbed._word:
            absorbed._word = keeper._word
404 | 
405 | 
def replace_result(scat, new_res):
    """
    Build a new SuperCat with ``new_res`` as the result and ``scat``'s
    slash, argument and kwargs. Argument variables that also occur in
    ``new_res`` are renumbered past ``new_res``'s largest variable.
    """
    assert scat.is_complex
    argument = scat.argument
    taken = new_res.cats_by_var
    fresh = max(taken) + 1
    renaming = {}
    for var in argument.cats_by_var:
        if var not in taken or var in renaming:
            continue
        renaming[var] = fresh
        fresh += 1
    argument = ccg.rules.remap_vars(argument, renaming)
    rebuilt = ccg.category.Category(new_res, scat.slash, argument,
                                    **scat.kwargs)
    return SuperCat(rebuilt)
419 | 
def replace_inner_result(scat, new_res):
    """
    Placeholder; always raises.

    NotImplementedError is a subclass of Exception, so callers catching
    Exception are unaffected.
    """
    raise NotImplementedError("Not implemented yet")
422 | 
def add_args(res, args, reorder = False):
    """
    Add each (arg, slash, kwargs) triple in ``args`` to ``res`` in turn
    and return the resulting category.

    If ``reorder`` is true the triples are first reordered with
    reorder_args. A triple's kwargs may carry 'arg_global_var', a global
    variable that is unified with the globals of the newly added
    argument. The caller's kwargs dicts are left unmodified.
    """
    if reorder:
        res, args = reorder_args(res, args)
    for arg, slash, kwargs in args:
        # Work on a copy so popping the key does not mutate the caller's dict
        kwargs = dict(kwargs)
        # NOTE(review): SuperCat.deconstruct stores its globals under
        # 'arg_global_vars' (plural), which this key does not match --
        # confirm which spelling is intended.
        global_var = kwargs.pop('arg_global_var', None)
        res = add_arg(res, slash, arg, **kwargs)
        if global_var:
            # Unify the variable with the one passed in
            for var in res.get_vars(res.argument):
                var.unify(global_var)
    assert res.var == 0
    return res
438 | 
def reorder_args(res, args):
    """
    Return ``res`` unchanged together with the (arg, slash, kwargs)
    triples reordered so that all non-forward ('/') triples come before
    the forward ones; relative order within each group is kept.
    """
    forward_args = []
    other_args = []
    for arg, slash, kwargs in args:
        bucket = forward_args if slash == '/' else other_args
        bucket.append((arg, slash, kwargs))
    return res, other_args + forward_args
452 | 
453 | 
454 | 
def add_arg(result, slash, arg, revar=True, **kwargs):
    """
    Return a new SuperCat built from ``result``, ``slash`` and ``arg``.

    With ``revar`` the argument is first given ``result``'s next free
    variable, i.e. it is assumed not to be coindexed to anything. The
    globals of ``result`` and ``arg`` are bound into the new category
    when they support bind_vars.
    """
    if revar:
        arg = change_kwarg(arg, var=result.next_var)
    combined = SuperCat(ccg.category.Category(result, slash, arg, **kwargs))
    for source, piece_name in ((result, 'result'), (arg, 'argument')):
        if hasattr(source, 'bind_vars'):
            combined.bind_vars(source, getattr(combined, piece_name),
                               source.category)
    return combined
467 | 
def make_adjunct(cat, slash, force_dep=True):
    """
    Return a SuperCat of the form X|X (with ``slash`` as |) based on
    ``cat`` or one of its results, with features stripped and variable 0
    remapped to the base's next free variable.
    """
    # Decide which category to base adjunct on
    if cat.is_complex:
        found = False
        for result, argument, direction, _ in reversed(list(cat.deconstruct())):
            if force_dep and result.var != cat.var:
                continue
            if result.var != 0:
                continue
            # Don't reduce (S\NP)|(S\NP) to S|S
            if result == 'S' and argument == 'NP' and direction == '\\':
                continue
            found = True
            break
        # Without a match, fall back to the last piece visited unless a
        # dependency on the original variable is being forced.
        if found or not force_dep:
            cat = result
    shifted = ccg.rules.remap_vars(cat, {0: cat.next_var})
    shifted = ccg.rules.strip_features(shifted)
    adjunct = ccg.category.Category(shifted, slash, shifted, var=0)
    return SuperCat(adjunct)
485 | 
def change_kwarg(cat, **kwargs):
    """
    Return a copy of ``cat`` with the given keyword arguments overridden.

    The copy is a SuperCat with its variables bound to ``cat``'s when
    ``cat`` supports bind_vars, and a plain Category otherwise.
    """
    merged = cat.kwargs.copy()
    merged.update(kwargs)
    rebuilt = ccg.category.Category(cat.result, cat.slash, cat.argument,
                                    **merged)
    if not hasattr(cat, 'bind_vars'):
        return rebuilt
    wrapped = SuperCat(rebuilt)
    wrapped.bind_vars(cat, rebuilt, cat.category)
    return wrapped
497 | 
def type_raise(t_cat, slash, arg_cat):
    """
    Build the type-raised category T|(T|'X), where T is ``t_cat`` with
    features stripped and variables renumbered past ``arg_cat``'s, X is
    ``arg_cat``, | is ``slash`` and |' is the opposite slash.
    """
    t_cat = ccg.rules.strip_features(t_cat)
    base = arg_cat.next_var
    renumber = {}
    for var in t_cat.cats_by_var:
        if var not in renumber:
            renumber[var] = len(renumber.keys()) + base
    t_cat = ccg.rules.remap_vars(t_cat, renumber)
    if slash == '/':
        inner_slash = '\\'
    else:
        inner_slash = '/'
    inner = add_arg(t_cat, inner_slash, arg_cat, var=t_cat.var, revar=False)
    return add_arg(t_cat, slash, inner, var=arg_cat.var)
508 | 


--------------------------------------------------------------------------------
/ccg/rules.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | 
  3 | from ccg.category import from_string, Category
  4 | import ccg
  5 | import re
  6 | 
# Letter names for category variables; presumably indexed by variable
# number, with '_' at index 0 -- TODO confirm against ccg.category.VARS.
VARS = ['_', 'Y', 'Z', 'W', 'V', 'U', 'T', 'S']
# Matches a single-digit argument index marker such as <1>.
_ARG_IDX_RE = re.compile(r'<\d>')
# NOTE(review): looks like an upper bound on composition depth; its use is
# not visible in this part of the file -- confirm.
MAX_COMP_DEPTH = 3
 10 | 
 11 | 
 12 | class Production(object):
 13 |     """
 14 |     A CCG production rule. Tracks combinators used,
 15 |     unification, and manages change propagation.
 16 |     """
 17 |     combinators = ['add_conj', 'do_conj',
 18 |                    'fapply', 'bapply', 'bcomp', 'fcomp',
 19 |                    'bxcomp', 'fxcomp',
 20 |                    'left_punct', 'right_punct', 'comma_conj']
 21 |     def __init__(self, left, right, parent=None, rule=None):
 22 |         assert left
 23 |         self.left = left
 24 |         self.right = right
 25 |         self._y = None
 26 |         self._x = None
 27 |         if rule:
 28 |             combinator = getattr(self, rule)
 29 |             result, depth = combinator(left, right)
 30 |         else:
 31 |             rule, result, depth = self.get_rule(left, right, parent)
 32 |             self.depth = depth
 33 |         self.result = result
 34 |         if not right:
 35 |             pass
 36 |         elif left.is_adjunct and rule.startswith('f') and self._y and \
 37 |             self._y.var == right.var:
 38 |             rule = 'fadjunct'
 39 |         elif right.is_adjunct and rule.startswith('b') and self._y and \
 40 |                 self._y.var == left.var:
 41 |             rule = 'badjunct'
 42 |         elif left.is_type_raise and rule.startswith('f'):
 43 |             rule = 'ftraise_comp'
 44 |         elif right.is_type_raise and rule.startswith('b'):
 45 |             rule = 'btraise_comp'
 46 |         self.rule = rule
 47 |         self.parent = parent
 48 |         self.force_dep = True
 49 | 
 50 |     def __str__(self):
 51 |         return '%s %s --> %s (%s)' % (self.left, self.right, self.parent,
 52 |                                       self.rule)
 53 | 
 54 |     def get_rule(self, left, right, parent = None):
 55 |         if right is None:
 56 |             assert parent
 57 |             unary_rules = [('traise', self.traise), ('unary', self.unary)]
 58 |             for rule, combinator in unary_rules:
 59 |                 result, depth = combinator(parent, left)
 60 |                 if result and parent.exact_eq(result):
 61 |                     return rule, result, depth
 62 |             else:
 63 |                 return 'unary', None, 0
 64 |         for rule in self.combinators:
 65 |             combinator = getattr(self, rule)
 66 |             result, depth = combinator(left, right)
 67 |             if result and ((not parent) or parent.exact_eq(result)):
 68 |                 return rule, result, depth
 69 |         else:
 70 |             if parent:
 71 |                 result, depth = self.binary(left, right, parent)
 72 |                 if result and parent.exact_eq(result):
 73 |                     return 'binary', result, 0
 74 |             return 'invalid', parent, 0
 75 | 
 76 | 
 77 |     def replace(self, new):
 78 |         if not isinstance(new, ccg.scat.SuperCat):
 79 |             new = ccg.scat.SuperCat(new)
 80 |         assert self.parent
 81 |         if self.rule == 'fapply':
 82 |             left, right = self._apply_replace(self.left, self.right, new)
 83 |         elif self.rule == 'bapply':
 84 |             right, left = self._apply_replace(self.right, self.left, new)
 85 |         elif self.rule == 'fcomp' or self.rule == 'fxcomp':
 86 |             left, right = self._comp_replace(self.left, self.right, new)
 87 |         elif self.rule == 'bcomp' or self.rule == 'bxcomp':
 88 |             right, left = self._comp_replace(self.right, self.left, new)
 89 |         elif self.rule == 'fadjunct':
 90 |             left, right = self._adjunct_replace(self.left, self.right, new)
 91 |         elif self.rule == 'badjunct':
 92 |             right, left = self._adjunct_replace(self.right, self.left, new)
 93 |         elif self.rule == 'left_punct':
 94 |             left = self.left
 95 |             right = new
 96 |         elif self.rule == 'ftraise_comp':
 97 |             left, right = self._traise_comp_replace(self.left, self.right, new)
 98 |         elif self.rule == 'btraise_comp':
 99 |             right, left = self._traise_comp_replace(self.right, self.left, new)
100 |         elif self.rule == 'right_punct':
101 |             right = self.right
102 |             left = new
103 |         elif new.conj and \
104 |                 (self.rule == 'add_conj' or self.rule == 'comma_conj'):
105 |             left = self.left
106 |             right = ccg.scat.change_kwarg(new, conj=False)
107 |         elif self.rule == 'do_conj':
108 |             left = new
109 |             right = ccg.scat.change_kwarg(new, conj=True)
110 |         elif self.rule == 'traise':
111 |             left = self._traise_replace(self.left, new)
112 |             right = None
113 |         elif self.rule == 'unary':
114 |             left = self.left
115 |             right = self.right
116 |         elif self.rule == 'invalid' or self.rule == 'unary':
117 |             left = self.left
118 |             right = self.right
119 |         else:
120 |             raise Exception(self.rule)
121 |         self.left = left
122 |         self.right = right
123 |         self.parent = new
124 |         return left, right
125 | 
126 |     def _apply_replace(self, func, arg, new):
127 |         # Special case for determiners where new has grown an argument
128 |         determiners = ['NP/N', 'NP/(N/PP)', 'PP/NP']
129 |         if func in determiners and new.is_complex and \
130 |            new.inner_result == self.parent.inner_result:
131 |             args = [(a, s, k) for r, a, s, k in new.deconstruct()]
132 |             new_arg = ccg.scat.add_args(arg, args)
133 |             return func, new_arg
134 |         if func.result.feat_var: # Preserve feature passing
135 |             new = ccg.scat.change_kwarg(new, feature='',
136 |                                         feat_var=func.result.feat_var)
137 |         new_func = ccg.scat.replace_result(func, new)
138 |         return new_func, arg
139 | 
140 |     def _adjunct_replace(self, func, arg, new):
141 |         new_func = ccg.scat.make_adjunct(new, func.slash, True)
142 |         return new_func, new
143 |             
    def _comp_replace(self, func, arg, new):
        """
        Work out the children of a composition whose parent becomes ``new``.

        Returns a ``(functor, argument)`` pair. When ``new`` is not complex
        the production is turned into an application and ``self.rule`` is
        overwritten with 'fapply' as a side effect. Raises a bare
        ``Exception`` when the functor's argument cannot be located in
        ``arg``.
        """
        # If new isn't complex we can't compose. Just use application.
        if not new.is_complex:
            functor = ccg.scat.add_arg(new, func.slash, arg, **func.kwargs)
            # NOTE(review): the rule is set to 'fapply' whatever func.slash
            # is -- confirm a backward functor is not expected here.
            self.rule = 'fapply'
            return functor, arg
        # Y category must be the same as before, as it's not in the parent
        # Give the argument and result the Ys they each had originally
        res_y = func.argument
        # Search the results of arg for the one equal to the functor's
        # argument; arg_y stays bound to it after the break and is used below.
        for arg_y, slash, z, _ in arg.deconstruct():
            if arg_y == res_y:
                break
        else:
            # No result matched. arg_y keeps the last value yielded by the
            # loop (and is unbound if deconstruct() yielded nothing).
            if func.argument == arg:
                res_y = arg
            else:
                raise Exception
        orig_x = func.result
        # Get the new Z (or Zs in the case of generalised comp)
        dollars = []
        # NOTE: this loop rebinds the ``arg`` parameter; the original argument
        # category is not read again after this point.
        for res, arg, slash, kwargs in new.deconstruct():
            dollars.append((arg, slash, kwargs))
            if res.is_adjunct:
                break
            #if res == 'S\NP':
            #    break
        else:
            # Don't accumulate arguments on adjuncts or determiners
            if func != 'NP/N' and not func.is_adjunct:
                res = new.result
                dollars = [(new.argument, new.slash, new.kwargs)]
        if orig_x.feat_var: # Preserves feature-passing
                        res = ccg.scat.change_kwarg(res, feature='',
                                        feat_var=orig_x.feat_var,
                                        var=orig_x.var)
        if func.is_adjunct:
            # An adjunct functor is rebuilt around new (or a piece of it) and
            # a fresh SuperCat copy of new is returned as the argument.
            if new.is_adjunct:
                functor = ccg.scat.SuperCat(new)
            elif new.inner_result.var != new.var:
                # any(p) is false for paths whose elements are all falsy --
                # presumably the result spine of new; TODO confirm.
                if any(c == 'S\NP' for p, c in new.cats.items() if not any(p)):
                    functor = ccg.scat.make_adjunct(ccg.VP, func.slash)
                else:
                    functor = ccg.scat.make_adjunct(new.inner_result, func.slash)
            else:
                functor = ccg.scat.make_adjunct(new, func.slash)
            return functor, ccg.scat.SuperCat(new)
        functor = ccg.scat.add_arg(res, func.slash, res_y, **func.kwargs)
        # dollars was collected outermost-first; reverse it before handing it
        # to add_args.
        dollars.reverse()
        argument = ccg.scat.add_args(arg_y, dollars)
        return functor, argument
194 | 
195 |     def _traise_replace(self, child, new):
196 |         assert new.is_type_raise
197 |         left = ccg.scat.SuperCat(new.argument.argument)
198 |         new.bind_vars(new, new.argument.argument, left)
199 |         return left
200 | 
201 |     def _traise_comp_replace(self, func, arg, new):
202 |         # Type-raise-type-raise composition is a special case used for
203 |         # argument cluster coordination. It's dangerous to clobber it
204 |         # with a non-type-raised new category.
205 |         if arg.is_type_raise and not new.is_type_raise:
206 |             assert func.is_type_raise
207 |             raise Exception("Should not replace raise-raise composition with"
208 |                             "non-raised category.")
209 |         # New == T/$
210 |         # Func == T/(T\R)
211 |         # Arg == (T\R)/$
212 |         r = func.argument.argument
213 |         dollars = []
214 |         for t, z, slash, kwargs in new.deconstruct():
215 |             dollars.append((z, slash, kwargs))
216 |             if t.is_adjunct:
217 |                 break
218 |         else:
219 |             if not new.is_complex:
220 |                 t = new
221 |         # Now, where do we place the R relative to the $s? Let's say we have
222 |         # R=(/PP) and $s=[(/NP), (\NP)] (where last will be added first)
223 |         # We could redefine T to T\NP, so that we get
224 |         # an argument cat of (T\NP)/PP)/NP. OR, we could keep T, and get
225 |         # ((T/PP)\NP)/NP. They're equivalent, but the latter will get
226 |         # non-standard cats. So what we must do is check whether the slashes
227 |         # for the last dollar and the R disagree. If they do, we should redefine
228 |         # T to append the last dollar, which is popped.
229 |         if dollars and dollars[-1][1] == '\\' and func.argument.slash == '/':
230 |             last_arg, last_slash, last_kwarg = dollars.pop()
231 |             t = ccg.scat.add_arg(t, last_slash, last_arg, **last_kwarg)
232 |         dollars.append((r, func.argument.slash, {}))
233 |         dollars.reverse()
234 |         functor = ccg.scat.type_raise(t, func.slash, r)
235 |         argument = ccg.scat.add_args(t, dollars)
236 |         return functor, argument
237 |         
238 | 
239 | 
240 | 
241 |     def fapply(self, left, right):
242 |         if not self._check_dir(left, '/'):
243 |             return False, 0
244 |         return self._application(left, right)
245 | 
246 |     def bapply(self, left, right):
247 |         if not self._check_dir(right, '\\'):
248 |             return False, 0
249 |         return self._application(right, left)
250 | 
251 |     def fcomp(self, left, right): # Don't do general for now
252 |         if not self._check_dir(left, '/') or not self._check_dir(right, '/'):
253 |             return False, 0
254 |         return self._composition(left, right)
255 | 
256 |     def bcomp(self, left, right):
257 |         if not self._check_dir(left, '\\') or not self._check_dir(right, '\\'):
258 |             return False, 0
259 |         return self._composition(right, left)
260 | 
261 |     def fxcomp(self, left, right):
262 |         if not left.is_complex or not right.is_complex:
263 |             return False, 0
264 |         if not self._check_dir(left, '/'):
265 |             return False, 0
266 |         return self._composition(left, right, crossing=True)
267 | 
268 |     def bxcomp(self, left, right):
269 |         if not left.is_complex and self._check_dir(right, '\\'):
270 |             return False, 0
271 |         return self._composition(right, left, crossing=True)
272 | 
273 |     def add_conj(self, left, right):
274 |         """
275 |         Multi-variables for conj is so far a failure. Make conjuncted
276 |         constituents headed by the conjunction
277 |         """
278 |         if left != ccg.CONJ or right.conj:
279 |             return False, 0
280 |         return self._do_add_conj(left, right)
281 | 
282 |     def _do_add_conj(self, left, right):
283 |         # This should take care of variable binding too
284 |         scat = ccg.scat.change_kwarg(right, conj=True)
285 |         return scat, 0
286 | 
287 |     def comma_conj(self, left, right):
288 |         if left != ccg.COMMA and left != ccg.SEMI_COLON and left != ccg.COLON:
289 |             return False, 0
290 |         return self._do_add_conj(left, right)
291 | 
292 |     def do_conj(self, left, right):
293 |         if not right.conj:
294 |             return False, 0
295 |         if left.conj:
296 |             return False, 0
297 |         new_right = ccg.scat.change_kwarg(right, conj=False)
298 |         if not new_right.exact_eq(left):
299 |             return False, 0
300 |         for path, right_cat in new_right.cats.items():
301 |             if right_cat.var > 0:
302 |                 new_right.unify_globals_at_var(left, right_cat.var,
303 |                                                left.cats[path].var)
304 |         for var in left.get_vars():
305 |             new_right.add_var(0, var)
306 |         return new_right, 0
307 | 
308 |     def left_punct(self, left, right):
309 |         if not left.is_punct:
310 |             return False, 0
311 |         return right, 0
312 | 
313 |     def right_punct(self, left, right):
314 |         if not right.is_punct:
315 |             return False, 0
316 |         return left, 0
317 | 
318 |     def traise(self, parent, child):
319 |         # Type raising
320 |         if not parent.is_complex:
321 |             return False, 0
322 |         if not parent.argument.is_complex:
323 |             return False, 0
324 |         if not parent.result.exact_eq(parent.argument.result):
325 |             return False, 0
326 |         if not parent.argument.argument.exact_eq(child):
327 |             return False, 0
328 |         result = ccg.scat.type_raise(parent.result, parent.slash, child)
329 |         result.bind_vars(child, result.argument.argument, child.category)
330 |         return result, 0
331 | 
332 |     def unary(self, parent, child):
333 |         key = (parent.string, child.string)
334 |         if key not in TypeChanging.rules:
335 |             return False, 0
336 |         else:
337 |             result = ccg.scat.SuperCat(parent.category)
338 |             bindings = TypeChanging.rules[key]
339 |             for parent_var, child_var in bindings:
340 |                 try:
341 |                     result.unify_globals_at_var(child, parent_var, child_var)
342 |                 except KeyError:
343 |                     raise
344 |             return result, 0
345 |     
346 |     def binary(self, left, right, parent):
347 |         key = (parent.string, left.string, right.string)
348 |         if key not in BinaryTypeChanging.rules:
349 |             return False, 0
350 |         else:
351 |             bindings = BinaryTypeChanging.rules[key]
352 |             result = ccg.scat.SuperCat(parent.category)
353 |             for parent_var, left_var, right_var in bindings:
354 |                 if left_var is None:
355 |                     assert right_var is not None
356 |                     try:
357 |                         result.unify_globals_at_var(
358 |                             right, parent_var, right_var)
359 |                     except KeyError:
360 |                         raise
361 |                 elif right_var is None:
362 |                     assert left_var is not None
363 |                     try:
364 |                         result.unify_globals_at_var(left, parent_var, left_var)
365 |                     except KeyError:
366 |                         raise
367 |                 else:
368 |                     raise Exception
369 |             return result, 0
370 | 
371 |     def _application(self, functor, argument):
372 |         if functor.conj or argument.conj:
373 |             return False, 0
374 |         if functor.argument != argument:
375 |             return False, 0
376 |         has_bound = functor.bind_vars(argument, functor.argument, argument.category)
377 |         if not has_bound:
378 |             return False, 0
379 |         result = functor.result
380 |         c1_to_c2, c2_to_c1 = self._var_to_feats(functor.argument, argument)
381 |         result, var_map = minimise_vars(result, c1_to_c2)
382 |         self._x = functor.result
383 |         self._y = argument
384 |         result_scat = ccg.scat.SuperCat(result)
385 |         functor.bind_vars(result_scat, functor.result, result_scat.category)
386 |         return result_scat, 0
387 | 
    def _composition(self, functor, arg, crossing = False):
        """
        Compose ``functor`` with ``arg``, up to MAX_COMP_DEPTH layers deep.

        Returns ``(False, 0)`` when composition does not apply. Otherwise
        returns ``(scat, depth)``, where ``scat`` is the composed SuperCat
        and ``depth`` is the number of extra result layers of ``arg`` that
        were stepped through before the functor's argument was found.

        Side effects: ``self._x`` is set to the functor's result as soon as
        both categories are known to be complex, and ``self._y`` to the
        matched result of ``arg``.
        """
        if functor.conj or arg.conj:
            return False, 0
        if not functor.is_complex or not arg.is_complex:
            return False, 0
        depth = 0
        # X/Y (Y/Z_1)/Z_2 etc
        x_y = functor.argument
        self._x = functor.result
        yz = arg
        zs = []

        # Walk down the results of arg, collecting each argument layer, until
        # a result equal to the functor's argument is found.
        while depth < MAX_COMP_DEPTH and yz.is_complex:
            zs.append((yz.argument, yz.slash, yz.kwargs.copy()))
            if yz.result != x_y:
                yz = yz.result
                depth += 1
            else:
                self._y = yz.result
                break
        else:
            # Ran out of depth or reached an atomic category without a match.
            return False, 0

        # For non-crossing composition, the slashes must be consistent.
        # For crossing composition, they must be inconsistent.
        if all(s == functor.slash for (arg, s, k) in zs) == crossing:
            return False, 0
        functor.bind_vars(arg, x_y, self._y)
        # Variables of arg that do not unify with the functor are numbered
        # from one past the functor's highest variable.
        max_var = max(functor.cats_by_var) + 1
        arg_to_final = self._map_vars(x_y, yz.result, max_var, arg.cats_by_var)
        curr_cat = functor.result
        # Re-attach the collected layers, innermost first, on the functor's
        # result.
        for z, slash, kwargs in reversed(zs):
            z = remap_vars(z, arg_to_final)
            # 99 is a placeholder key for a missing 'var'; when it is absent
            # from the map the layer falls back to max_var.
            kwargs['var'] = arg_to_final.get(kwargs.get('var', 99), max_var)
            curr_cat = Category(curr_cat, slash, z, **kwargs)

        c1_to_c2, c2_to_c1 = self._var_to_feats(functor.argument, yz.result)
        c1_to_c2.update(c2_to_c1)
        result, var_map = minimise_vars(curr_cat, c1_to_c2)

        # Bind the global variables
        scat = ccg.scat.SuperCat(result)

        # Take outer var from arg
        scat.unify_globals_at_var(arg, 0)

        # Walk the new category and arg in step, binding each argument layer
        # to the matching layer of arg until the functor's result is reached.
        # NOTE: ``result`` is rebound by this loop.
        arg_res = arg
        for result, z, _, _ in scat.deconstruct():
            scat.bind_vars(arg, z, arg_res.argument)
            if result == functor.result:
                scat.bind_vars(functor, result, functor.result)
                break
            arg_res = arg_res.result
        return scat, depth
442 | 
443 |     def _check_dir(self, cat, slash):
444 |         if not cat.is_complex:
445 |             return False
446 |         if cat.slash != slash:
447 |             return False
448 |         return True
449 | 
450 |     def _map_vars(self, func_u, arg_u, next_var, arg_vars):
451 |         # Map variables from argument to functor
452 |         arg_to_final = {}
453 |         for path, acat in arg_u.cats.items():
454 |             fcat = func_u.cats[path]
455 |             arg_to_final[acat.var] = fcat.var
456 |         for var in arg_vars:
457 |             if var not in arg_to_final:
458 |                 arg_to_final[var] = next_var
459 |                 next_var += 1
460 |         return arg_to_final
461 | 
462 |     def _var_to_feats(self, cat1, cat2):
463 |         """
464 |         Map feature variables to feature values for the unified pieces
465 |         """
466 |         c1_to_c2 = {}
467 |         c2_to_c1 = {}
468 |         for path, sub1 in cat1.cats.items():
469 |             sub2 = cat2.cats[path]
470 |             if sub1.feat_var and sub2.feature:
471 |                 c1_to_c2[sub1.feat_var] = sub2.feature
472 |             elif sub2.feat_var and sub1.feature:
473 |                 c2_to_c1[sub2.feat_var] = sub1.feature
474 |         return c1_to_c2, c2_to_c1
475 | 
class TypeChanging(object):
    """
    Table of the licensed unary type-changing rules.

    ``rules`` maps a ``(parent string, child string)`` pair to the list of
    ``(parent variable, child variable)`` pairs to unify when the rule is
    applied.
    """
    # Raw strings keep the backslashes literal: in a normal string a
    # backslash followed by N starts a named-character escape on Python 3.
    # The ('S/S', 'S\NP') rule used to be listed three times; a dict literal
    # keeps only the last duplicate, so one entry is equivalent.
    # Note which variables to bind
    rules = {
        ('NP', 'N'): [(0, 0)],
        (r'NP\NP', r'S[dcl]\NP'): [(0, 0), (1, 1)],
        (r'NP\NP', r'S[pss]\NP'): [(0, 0), (1, 1)],
        (r'NP\NP', r'S[adj]\NP'): [(0, 0), (1, 1)],
        (r'NP\NP', r'S[ng]\NP'): [(0, 0), (1, 1)],
        (r'NP\NP', r'S[to]\NP'): [(0, 0), (1, 1)],
        (r'N\N', r'S[pss]\NP'): [(0, 0), (1, 1)],
        (r'N\N', r'S[ng]\NP'): [(0, 0), (1, 1)],
        (r'N\N', r'S[adj]\NP'): [(0, 0), (1, 1)],
        (r'N\N', 'S[dcl]/NP'): [(0, 0), (1, 1)],
        (r'(S\NP)\(S\NP)', r'S\NP'): [(0, 0), (2, 1)],
        (r'(S\NP)\(S\NP)', r'S[ng]\NP'): [(0, 0), (2, 1)],
        (r'(S\NP)/(S\NP)', r'S\NP'): [(0, 0), (2, 1)],
        (r'NP\NP', 'S[dcl]/NP'): [(0, 0), (1, 1)],
        ('NP', r'S\NP'): [(0, 0)],
        ('S/S', r'S\NP'): [(0, 0)],
        (r'NP\NP', 'S'): [(0, 0)],
        ('NP/PP', 'N/PP'): [(0, 0), (1, 1)],
        ('(NP/PP)/PP', '(N/PP)/PP'): [(0, 0), (1, 1), (2, 2)],
        ('((NP/PP)/PP)/PP', '((N/PP)/PP)/PP'): [(0, 0), (1, 1), (2, 2), (3, 3)]
        }
502 | 
class BinaryTypeChanging(object):
    """
    Table of the licensed binary type-changing rules.

    ``rules`` maps a ``(parent string, left string, right string)`` triple to
    a list of ``(parent variable, left variable, right variable)`` bindings.
    Every binding in the table leaves exactly one child slot as None.
    """
    # Raw strings keep the backslashes literal: in a normal string a
    # backslash followed by N starts a named-character escape on Python 3.
    rules = {
        # For rebanking
        (r'NP\NP', ',', r'S[pss]\NP'): [(0, None, 0), (1, None, 1)],
        (r'NP\NP', ',', r'S[ng]\NP'): [(0, None, 0), (1, None, 1)],
        (r'NP\NP', ',', r'S[adj]\NP'): [(0, None, 0), (1, None, 1)],
        (r'NP\NP', ',', r'S[dcl]\NP'): [(0, None, 0), (1, None, 1)],
        (r'NP\NP', ',', 'S[dcl]/NP'): [(0, None, 0), (1, None, 1)],
        ('S/S', 'S[dcl]/S[dcl]', ','): [(0, 0, None), (1, 1, None)],
        (r'(S\NP)\(S\NP)', ',', 'NP'): [(0, None, 0)],
        (r'(S\NP)/(S\NP)', 'S[dcl]/S[dcl]', ','): [(0, 0, None), (1, 1, None)],
        (r'(S\NP)\(S\NP)', 'S[dcl]/S[dcl]', ','): [(0, 0, None), (1, 1, None)],
        ('S/S', 'NP', ','): [(0, 0, None)],
        (r'S\S', 'S[dcl]/S[dcl]', ','): [(0, 0, None), (1, 1, None)],
        ('S/S', r'S[dcl]\S[dcl]', ','): [(0, 0, None), (1, 1, None)],
        (r'S[adj]\NP[conj]', 'conj', 'PP'): [(0, None, 0)],
        (r'S[adj]\NP[conj]', 'conj', 'NP'): [(0, None, 0)],
        ('NP[conj]', 'conj', r'S[adj]\NP'): [(0, None, 0)],
        ('S/S', 'S[dcl]', ','): [(0, 0, None)],
        (r'(S\NP)/(S\NP)', r'S[dcl]\S[dcl]', ','): [(0, 0, None), (1, 1, None)],
        (r'NP\NP', 'S[dcl]/S[dcl]', ','): [(0, 0, None), (1, 1, None)],
        (r'S[adj]\NP[conj]', 'conj', r'S[ng]\NP'): [(0, None, 0), (1, None, 1)],
        (r'(S\NP)\(S\NP)', r'S[dcl]\S[dcl]', ','): [(0, 0, None), (1, 1, None)],
        (r'S[pss]\NP[conj]', 'conj', r'S[ng]\NP'): [(0, None, 0), (1, None, 1)]
    }
528 | 
529 | 
def fapply(left, right):
    """Return the result of an 'fapply' Production over left and right."""
    production = Production(left, right, rule='fapply')
    return production.result
532 | 
def bapply(left, right):
    """Return the result of a 'bapply' Production over left and right."""
    production = Production(left, right, rule='bapply')
    return production.result
535 | 
def fcomp(left, right):
    """Return the result of an 'fcomp' Production over left and right."""
    production = Production(left, right, rule='fcomp')
    return production.result
538 | 
def bcomp(left, right):
    """Return the result of a 'bcomp' Production over left and right."""
    production = Production(left, right, rule='bcomp')
    return production.result
541 | 
def fxcomp(left, right):
    """Return the result of an 'fxcomp' Production over left and right."""
    production = Production(left, right, rule='fxcomp')
    return production.result
544 | 
def bxcomp(left, right):
    """Return the result of a 'bxcomp' Production over left and right."""
    production = Production(left, right, rule='bxcomp')
    return production.result
547 | 
def add_conj(left, right):
    """Return the result of an 'add_conj' Production over left and right."""
    production = Production(left, right, rule='add_conj')
    return production.result
550 | 
def do_conj(left, right):
    """Return the result of a 'do_conj' Production over left and right."""
    production = Production(left, right, rule='do_conj')
    return production.result
553 | 
def comma_conj(left, right):
    """Return the result of a 'comma_conj' Production over left and right."""
    production = Production(left, right, rule='comma_conj')
    return production.result
556 | 
def left_punct(left, right):
    """Return the result of a 'left_punct' Production over left and right."""
    production = Production(left, right, rule='left_punct')
    return production.result
559 | 
def right_punct(left, right):
    """Return the result of a 'right_punct' Production over left and right."""
    production = Production(left, right, rule='right_punct')
    return production.result
562 | 
def traise(left, parent):
    """
    Return the result of a Production built from a single child and its
    parent; None is passed in the right-child position.
    """
    production = Production(left, None, parent)
    return production.result
565 | 
def binary(left, right, parent):
    """
    Return the result of a Production built from both children and an
    explicit parent.
    """
    production = Production(left, right, parent)
    return production.result
568 | 
def minimise_vars(cat, fvars, seen_vars = None, fvar_freqs = None):
    """
    Rebuild ``cat`` with its variables renumbered in order of first use and
    its feature variables resolved.

    ``fvars`` maps feature variables to feature values; a sub-category whose
    feature variable is in ``fvars`` gets that feature instead. A feature
    variable that occurs exactly once in the category is dropped.
    ``seen_vars`` and ``fvar_freqs`` carry state through the recursive calls
    and are created on the outermost call.

    Returns ``(category, var_map)``. ``var_map`` maps old variable numbers to
    new ones, or is an empty dict when ``cat`` is returned unchanged.
    """
    def _kwargs(cat):
        # Build the keyword arguments for the rebuilt copy of ``cat``.
        # Looking up seen_vars allocates the next free number the first time
        # a variable is met, so the order of _kwargs calls fixes the
        # numbering.
        # nonlocal seen_vars, feat_vars, feat_vars
        kwargs = cat.kwargs.copy()
        kwargs['var'] = seen_vars[cat.var]
        kwargs['arg_idx'] = ''
        if cat.feat_var in fvars:
            kwargs['feat_var'] = ''
            kwargs['feature'] = fvars[cat.feat_var]
        elif fvar_freqs[cat.feat_var] == 1:
            kwargs['feat_var'] = ''
        return kwargs

    # Return the cat unchanged if there are no gaps in the vars
    # and we do not have a variable map, and last var is head
    if not seen_vars and not fvars and cat.var == 0:
        vars = cat.cats_by_var.keys()
        if len(vars) == max(vars) + 1:
            return cat, {}

    if seen_vars is None:
        # Outermost call: each new key is assigned the current size of the
        # map, which gives consecutive numbers starting from 0.
        seen_vars = defaultdict(lambda: len(seen_vars))
        seen_vars[cat.var] # Maps outer var to 0
        # Count how often each unresolved feature variable occurs.
        fvar_freqs = defaultdict(int)
        for c in cat.cats.values():
            if c.feat_var and c.feat_var not in fvars:
                fvar_freqs[c.feat_var] += 1

    if not cat.is_complex:
        return Category(cat.cat, **_kwargs(cat)), seen_vars

    # Keep only the sub-categories whose path elements are all falsy --
    # presumably the result spine; TODO confirm against the path encoding.
    # After the sort and reverse the first entry is used as the innermost
    # result.
    cats = [(p, c) for p, c in cat.cats.items() if not any(p)]
    cats.sort()
    cats.reverse()
    inner = cats.pop(0)[1]
    curr_cat = Category(inner.cat, **_kwargs(inner))
    # Rebuild outwards, one argument layer at a time. NOTE: the loop rebinds
    # the ``cat`` parameter.
    for path, cat in cats:
        if cat.argument.is_complex:
            arg, seen_vars = minimise_vars(cat.argument, fvars, seen_vars,
                                           fvar_freqs)
        else:
            arg = Category(cat.argument.cat, **_kwargs(cat.argument))
        curr_cat = Category(curr_cat, cat.slash, arg, **_kwargs(cat))
    return curr_cat, seen_vars
613 | 
def remap_vars(cat, var_map):
    """
    Rebuild ``cat`` with every variable renumbered through ``var_map``.

    A variable missing from ``var_map`` is replaced by one more than the
    largest value in the map. ``cat`` is returned untouched when ``var_map``
    is empty or None.

    If building an atomic category fails, the category string and keyword
    arguments are printed and the exception is re-raised.
    """
    if not var_map:
        return cat
    kwargs = cat.kwargs.copy()
    kwargs['var'] = var_map.get(cat.var, max(var_map.values()) + 1)
    if cat.is_complex:
        result = remap_vars(cat.result, var_map)
        argument = remap_vars(cat.argument, var_map)
        return Category(result, cat.slash, argument, **kwargs)
    try:
        return Category(cat.cat, **kwargs)
    except Exception:
        # Single-argument print() behaves the same as a statement on
        # Python 2 and as a function on Python 3.
        print(cat.cat)
        print(kwargs)
        raise
630 | 
def strip_features(cat):
    """
    Replace the features in ``cat`` with feature variables.

    The first distinct feature met maps to '[X]'; each later one takes the
    entry of VARS indexed by the number of features mapped so far. The
    substitution itself is done by ``feats_to_vars``.
    """
    feat_map = {}
    for sub_cat in cat.cats.values():
        feature = sub_cat.feature
        if feature and feature not in feat_map:
            if feat_map:
                feat_map[feature] = VARS[len(feat_map)]
            else:
                feat_map[feature] = '[X]'
        assert not (sub_cat.feat_var and sub_cat.feat_var in feat_map)
    return feats_to_vars(cat, feat_map)
643 | 
def feats_to_vars(cat, feat_map):
    """
    Rebuild ``cat`` as a SuperCat with features swapped for the feature
    variables given in ``feat_map``.

    The '[adj]' feature is left in place. ``cat`` is returned untouched when
    ``feat_map`` is empty.
    """
    if not feat_map:
        return cat
    kwargs = cat.kwargs.copy()
    if cat.feature and cat.feature != '[adj]':
        kwargs.update(feat_var=feat_map[cat.feature], feature='')
    if cat.is_complex:
        new_result = feats_to_vars(cat.result, feat_map)
        new_argument = feats_to_vars(cat.argument, feat_map)
        rebuilt = ccg.category.Category(new_result, cat.slash, new_argument,
                                        **kwargs)
    else:
        rebuilt = ccg.category.Category(cat.cat, **kwargs)
    return ccg.scat.SuperCat(rebuilt)
658 | 
659 | 


--------------------------------------------------------------------------------
/Treebank/CCGbank/_Production.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Update a subtree to reflect its parent category
  3 | 
  4 | We do this by detecting the type of production
  5 | 
  6 | Unary
  7 | Unlicensed -- Does not conform to any ccg rule
  8 | Adjunction -- The functor category is an adjunct
  9 | Auxiliary -- The functor category is an auxiliary
 10 | Application -- The functor category is a predicate applying to the argument
 11 | Composition, result change -- The result cat of the parent has changed
 12 | Composition, argument change -- The argument structure of the parent has changed
 13 | 
 14 | The detection algorithm also finds the functor category (the category whose
 15 | result is preserved in the case of composition)
 16 | """
 17 | import ccg.category
 18 | from copy import deepcopy as dcopy
 19 | import re
 20 | class _Production(object):
 21 |     """
 22 |     Represents a production rule, of a given type. The parent can then be
 23 |     replaced and the changes reflected in the children, in a way that's
 24 |     customised according to the rule type
 25 |     """
 26 |     def __init__(self, left, right, parent, functorPos = None):
 27 |         self.left = ccg.category.from_string(str(left))
 28 |         self.right = ccg.category.from_string(str(right))
 29 |         self.parent = dcopy(parent)
 30 |         if functorPos != None:
 31 |             self._functor = functorPos
 32 |         else:
 33 |             self._functor = self.findFunctor(left, right, parent)
 34 |         self.unify()
 35 | 
 36 |     def unify(self):
 37 |         pass
 38 |     
 39 |     def _getFunctor(self):
 40 |         if self._functor == 0:
 41 |             return self.left
 42 |         else:
 43 |             return self.right
 44 | 
 45 |     def _setFunctor(self, functor):
 46 |         if self._functor == 0:
 47 |             self.left = functor
 48 |         else:
 49 |             self.right = functor
 50 | 
 51 |     def _getArg(self):
 52 |         if self._functor == 1:
 53 |             return self.left
 54 |         else:
 55 |             return self.right
 56 | 
 57 |     def _setArg(self, arg):
 58 |         if self._functor == 1:
 59 |             self.left = arg
 60 |         else:
 61 |             self.right = arg
 62 | 
 63 |     def replaceResult(self, cat, newResult):
 64 |         argument = dcopy(cat.argument)
 65 |         slash = cat.slash
 66 |         conj = cat.conj
 67 |         result = dcopy(newResult)
 68 |         return ccg.ComplexCategory(result, argument, slash, conj)
 69 | 
 70 |     def addArgs(self, cat, args):
 71 |         cat = dcopy(cat)
 72 |         args.reverse()
 73 |         for arg, slash, morph in args:
 74 |             cat = ccg.ComplexCategory(cat, arg, slash, False)
 75 |             cat.morph = dcopy(morph)
 76 |         return cat
 77 | 
 78 |     def __str__(self):
 79 |         return "%s %s --> %s %s" % (self.label, self.parent, self.left, self.right)
 80 | 
 81 |     def _removeFeatures(self, cat, removeFeat):
 82 |         if cat.isComplex():
 83 |             cat.morph = None
 84 |             for result, argument, slash, morph in cat.deconstruct():
 85 |                 self._removeFeatures(result, removeFeat)
 86 |                 self._removeFeatures(argument, removeFeat)
 87 |         else:
 88 |             if removeFeat:
 89 |                 cat.feature = ''
 90 |             cat.morph = None
 91 | 
 92 |     @property
 93 |     def head(self):
 94 |         return self._getHead()
 95 | 
 96 | 
 97 |     functor = property(_getFunctor, _setFunctor)
 98 |     argument = property(_getArg, _setArg)
 99 | 
class Application(_Production):
    """
    Application production: the functor is a non-adjunct complex category
    whose argument is the other child and whose result is the parent.
    """
    label = 'a'

    @staticmethod
    def findFunctor(left, right, parent):
        """
        Return 0 if the left child is the functor, 1 if the right child is,
        and -1 if neither child applies to the other to give ``parent``.
        """
        candidates = ((left, '/', right), (right, '\\', left))
        for position, (functor, slash, arg) in enumerate(candidates):
            if not functor.isComplex():
                continue
            if functor.slash == slash \
            and functor.argument == arg \
            and functor.result == parent \
            and not functor.isAdjunct():
                return position
        return -1

    def unify(self):
        """Unify the functor's argument with the argument child."""
        self.functor.argument.unify(self.argument)

    def replace(self, new):
        """
        Give the functor ``new`` as its result and return the
        ``(left, right)`` children.
        """
        updated = self.replaceResult(self.functor, new)
        self.functor = updated
        return self.left, self.right

    def _getHead(self):
        # The functor heads an application.
        return self.functor
125 | 
class TRaiseApplication(Application):
    """
    Application in which the functor is a type-raised category.
    """
    label = 'tra'

    @staticmethod
    def findFunctor(left, right, parent):
        """
        Same as ``Application.findFunctor``, but also return -1 when the
        functor found is not type-raised.
        """
        direction = Application.findFunctor(left, right, parent)
        if direction != -1 and not (left, right)[direction].isTypeRaise():
            return -1
        return direction

    def unify(self):
        """No unification is performed for type-raised application."""
        pass

    def replace(self, new):
        """
        Rebuild both children around the new parent and return
        ``(left, right)``.

        The functor is re-parsed from a string built from a feature-stripped
        copy of ``new`` and the old argument's argument; the argument child
        becomes ``new`` with that category added as an argument.
        """
        x = dcopy(self.argument.argument)
        y = dcopy(new)
        featLessY = dcopy(y)
        self._removeFeatures(featLessY, True)
        # The inner slash points the opposite way to the outer one.
        if self._functor == 0:
            outerSlash, innerSlash = '/', '\\'
        else:
            outerSlash, innerSlash = '\\', '/'
        strippedPiece = featLessY.strAsPiece()
        tRaiseArgStr = r'(%s%s%s)' % (strippedPiece, innerSlash, x.strAsPiece())
        catStr = '%s%s%s' % (strippedPiece, outerSlash, tRaiseArgStr)
        self.functor = ccg.category.from_string(catStr)
        self.argument = self.addArgs(dcopy(y), [(dcopy(x), innerSlash, None)])
        return self.left, self.right

    def _getHead(self):
        # The argument child heads a type-raised application.
        return self.argument
162 |         
163 | 
164 | 
165 | 
166 | 
class Composition(_Production):
    """
    Composition production. The child whose argument is cancelled against a
    result of the other child is called the functor.
    """
    label = 'c'

    @staticmethod
    def findFunctor(left, right, parent):
        """
        Composition is of the form X/Y Y/$ -> X|$ or Y|$ X\\Y

        We call the X|Y category the functor.

        Returns 0 or 1 for the functor's position, or -1 when the children do
        not compose. ``parent`` is not consulted.
        """
        if (not left.isComplex()) or (not right.isComplex()):
            return -1
        if left.conj or right.conj:
            return -1
        # Work on copies so the trial unification leaves the inputs alone.
        left = dcopy(left)
        right = dcopy(right)
        if right.slash == '\\':
            for result, argument, slash, morph in left.deconstruct():
                if right.argument.unify(result):
                    return 1
        elif left.slash == '/' and right.slash == '/':
            for result, argument, slash, morph in right.deconstruct():
                if left.argument.unify(result) and slash == '/':
                    return 0
        return -1

    def unify(self):
        """
        Unify the functor's argument (Y) with the first result of the
        argument child that accepts it, and record X and Y on the instance.

        Raises ``ValueError`` when no result unifies with Y.
        """
        yCat = self.functor.argument
        for potentialY, argument, slash, morph in self.argument.deconstruct():
            if yCat.unify(potentialY):
                break
        else:
            # ValueError derives from StandardError on Python 2, so handlers
            # written for the old exception type still catch it.
            raise ValueError("no result of %s unifies with %s"
                             % (self.argument, yCat))
        self._y = yCat
        self._x = self.functor.result

    def replace(self, new, xCat = None):
        """
        Get the X and $ components of the new category, and change children
        accordingly. Note that because the Y element is not represented in the
        parent, this must be invariant. So we're going to be replacing the result
        of the functor, and/or the $ of the arguments.

        Returns the ``(left, right)`` children.
        """
        # If new is atomic, we really can't do much composing
        # Make it apply into the new category instead
        if not new.isComplex():
            if self._functor == 0:
                slash = '/'
            else:
                slash = '\\'
            newFunc = ccg.ComplexCategory(dcopy(new), dcopy(self.argument), slash, False)
            self.functor = newFunc
            return self.left, self.right
        # If no X supplied, try the old X
        if not xCat:
            xCat = dcopy(self._x)
        # Take a copy of new so that unification doesn't mess things up
        new = dcopy(new)
        dollarCats = []
        # Handle general comp
        for result, argument, slash, morph in new.deconstruct():
            dollarCats.append((argument, slash, morph))
            # So pass first round
            # Unify to pass features and morph
            if xCat.unify(result):
                break
        else:
            # Otherwise, make it non-generalised composition
            xCat = dcopy(new.result)
            dollarCats = [(new.argument, new.slash, new.morph)]
        functor = self.functor

        self.functor = self.addArgs(xCat, [(functor.argument, functor.slash, functor.morph)])
        self.argument = self.addArgs(functor.argument, dollarCats)
        return (self.left, self.right)

    @staticmethod
    def _getResultArgs(category):
        """
        Pair a result with the arguments up to that point

        Returns a dict mapping each result of ``category`` to the list of
        ``(argument, slash, morph)`` triples peeled off to reach it.
        """
        results = {}
        args = []
        for result, argument, slash, morph in category.deconstruct():
            args.append((argument, slash, morph))
            # NOTE(review): an earlier comment said featureless results are
            # used as keys, but no features are stripped here -- confirm.
            results[result] = list(args)

        return results

    def _getHead(self):
        # Follow the C&C parser's wrong policy of always left heading
        return self.left
267 | 
268 | 
class Determination(Application, Composition):
    """
    Purely for the infuriating NP -> NP[nb]/N N rule
    """
    label = 't'

    def findFunctor(left, right, parent):
        """
        Return 0 if left acts as a determiner over an N on the right, else -1

        Two shapes are accepted: the literal NP -> NP[nb]/N N production, and
        any complex, non-adjunct left category whose argument is N and whose
        inner result is not N, paired with a right child identical to N.
        """
        if str(left) == 'NP[nb]/N' and str(right) == 'N' and str(parent) == 'NP':
            return 0
        if not left.isComplex() or left.isAdjunct():
            return -1
        if ccg.isIdentical(left.innerResult(), ccg.N):
            return -1
        if ccg.isIdentical(left.argument, ccg.N) and ccg.isIdentical(ccg.N, right):
            return 0
        return -1

    findFunctor = staticmethod(findFunctor)

    def replace(self, new):
        """
        Relabel the children so that the production yields new

        Returns the (left, right) pair on every path. Previously only the
        plain-NP branch returned the pair and the other branches returned None.
        """
        if ccg.isIdentical(new, ccg.NP):
            self.functor = ccg.category.from_string('NP[nb]/N')
        elif new.innerResult() == 'NP':
            # If possible, prefer to use composition than to stick args onto the NP
            self._x = self.functor.result
            Composition.replace(self, new)
        else:
            Application.replace(self, new)
        return self.left, self.right

    def _getHead(self):
        """
        The head is the argument child
        """
        return self.argument
297 | 
class AdjunctDetermination(Application):
    r"""
    Handle cases like ((S\NP)\(S\NP))/N
    """
    label = 'jt'

    def findFunctor(left, right, parent):
        """
        Return 0 if left maps an N on the right onto an adjunct parent, else -1
        """
        if parent.isAdjunct() and left.result == parent and right == ccg.N:
            return 0
        return -1

    findFunctor = staticmethod(findFunctor)

    def replace(self, new):
        """
        Relabel the functor so that the production yields new

        A plain NP label turns the functor into NP[nb]/N. Any other label
        becomes the functor's result, with the current right child added as a
        forward argument. Returns the (left, right) pair on every path;
        previously only the NP branch returned it.
        """
        if ccg.isIdentical(new, ccg.NP):
            self.functor = ccg.category.from_string(r'NP[nb]/N')
        else:
            # Adjunct and non-adjunct labels are handled identically for now.
            # The non-adjunct case still needs a proper definition once there
            # is a clean example (wsj_0029.13 is noisy and requires fixing).
            self.functor = ccg.addArg(new, self.right, '/')
        return self.left, self.right

    def _getHead(self):
        """
        The head is the argument child
        """
        return self.argument
328 | 
329 | 
330 | 
class TRaiseComp(Composition):
    """
    Composition between a type raised category and an argument
    """
    label = 'r'

    def findFunctor(left, right, parent):
        """
        Return the functor position found by Composition, provided that child is type raised

        A left functor must be forward type raised and a right functor
        backward type raised; anything else gives -1.
        """
        position = Composition.findFunctor(left, right, parent)
        if position == 0 and TRaiseComp.isTypeRaise(left, '/', '\\'):
            return 0
        if position == 1 and TRaiseComp.isTypeRaise(right, '\\', '/'):
            return 1
        return -1

    findFunctor = staticmethod(findFunctor)

    def isTypeRaise(cat, slash1, slash2):
        """
        Test whether cat has the type raised shape T slash1 (T slash2 X)

        The outer slash must be slash1, the argument must itself be complex
        with slash slash2, and the outer result must be identical to the
        argument's result.
        """
        if not cat.isComplex():
            return False
        if cat.slash == slash1 and cat.argument.isComplex() and cat.argument.slash == slash2:
            if ccg.isIdentical(cat.result, cat.argument.result):
                return True
        return False

    isTypeRaise = staticmethod(isTypeRaise)

    def replace(self, new):
        """
        Compose as usual, then strip features from the type raised functor
        """
        # The pair returned by Composition.replace is unpacked but not used
        newLeft, newRight = Composition.replace(self, new)
        self._removeFeatures(self.functor, True)
        # To produce an adjunct we can't have features on the argument
        if new.isAdjunct():
            self._removeFeatures(self.argument, True)

    def _getHead(self):
        """
        Head on the argument, unless both children are type raised
        """
        # Cover cases like N/(N\N) (N\N)/N
        bothRaised = self.left.isTypeRaise() and self.right.isTypeRaise()
        if bothRaised:
            return self.left
        return self.argument
368 |     
369 |         
class Adjunction(_Production):
    r"""
    Adjunction is of the form X/X X$ -> X$ or X$ X\X -> X$
    """
    label = 'd'

    def findFunctor(left, right, parent):
        """
        Return the position of the adjunct child (0 for left, 1 for right), or -1

        A child qualifies when it is an adjunct, its sibling equals the parent,
        and its argument equals either the sibling itself or one of the
        sibling's intermediate results. Productions with a conj child never
        qualify.
        """
        if left.conj or right.conj:
            return -1
        for adjunct, head, position in ((left, right, 0), (right, left, 1)):
            if not (adjunct.isAdjunct() and head == parent):
                continue
            # Test that adjunct applies
            # Guards against composition cases like (S[b]\NP)/NP NP/NP
            adjArg = adjunct.argument
            if adjArg == head:
                return position
            for result, argument, slash, morph in head.deconstruct():
                if adjArg == result:
                    return position
        return -1

    findFunctor = staticmethod(findFunctor)

    def replace(self, new, forceApp = False):
        """
        Relabel the adjunct (functor) and head (argument) so the production yields new

        new is the category the parent should now carry. With forceApp set,
        composition is disallowed: the argument becomes a copy of new and the
        functor becomes new|new. Otherwise a category X inside new is chosen
        for the adjunct to modify, the functor becomes X|X, and the argument
        becomes X plus whatever arguments of new lie outside X.

        Returns None.
        """
        # Under forceApp, composition is disallowed -- so the argument,
        # functor components and parent must all be identical
        if forceApp:
            self.argument = dcopy(new)
            self.functor = ccg.ComplexCategory(dcopy(new), dcopy(new), self.functor.slash, False)
            return None
        args = []
        if not new.isComplex():
            x = new
        elif new.isAdjunct():
            x = new
        # The complexAdj flag indicates whether to replicate CCGbank analysis and use
        # (S\NP)|(S\NP) adjuncts. Otherwise the natural thing is S\S adjuncts
        elif complexAdj and ccg.VP == new:
            x = new
        # For parser compatibility, don't backwards compose into NP
        elif self._functor == 1 and new.innerResult() != 'S':
            x = new
        else:
            lastCat = new
            lastArgs = []
            # Select either: an adjunct, S\NP, or an atom
            for result, argument, slash, morph in new.deconstruct():
                args.append((argument, slash, morph))
                # Ensure the slash directions work
                # If the functor's to the left, cannot cross-compose into a
                # backslash -- unless not complexAdj
                if complexAdj and self._functor == 0 and slash == '\\':
                    continue
                # Don't back-cross compose into non-S
                if self._functor == 1 and slash == '/' and result.innerResult() != 'S':
                    continue
                # The adjunct test was once taken out for the
                # (S[dcl]\S[dcl])\NP S\S test case; it is active here
                if (result.isAdjunct()
                        or (complexAdj and result == r'S\NP')
                        or not result.isComplex()):
                    x = result
                    break
                # In case the slashes don't work out for a while (ie cross
                # composition), store the last valid place to compose into,
                # and its arg set
                lastCat = result
                lastArgs = list(args)
            else:
                # Nothing suitable found: fall back to the last valid place
                x = lastCat
                args = lastArgs
        x = dcopy(x)
        newArg = self.addArgs(x, args)
        # If the new label has a morph category, add it to the argument, as per wsj_1057.57
        if new == newArg:
            newArg.morph = dcopy(new.morph)
        # If we currently have features on the old X, don't remove them
        # This can cause entropy reduction, as per wsj_1824.28
        oldX = self.functor.argument
        removeFeat = not (x.morphLess() == oldX.morphLess())
        self._removeFeatures(x, removeFeat)
        # The same x object serves as both result and argument of the functor
        newFunctor = ccg.ComplexCategory(x, x, self.functor.slash, False)
        self.functor = newFunctor
        self.argument = newArg

    def _getHead(self):
        """
        The head is the argument child, ie the category being modified
        """
        return self.argument
464 | 
class AdjComp(_Production):
    """
    Composition of adjuncts of the form X|X X|X -> X|X
    """
    label = 'o'

    def findFunctor(left, right, parent):
        """
        Return the functor position for a composition of two adjuncts, or -1

        Both children and the parent must be adjuncts over the same argument
        and none may be marked conj. A backward-slashed right child makes the
        right child the functor (1); failing that, a forward-slashed left
        child makes the left child the functor (0).
        """
        if left.conj or right.conj or parent.conj:
            return -1
        allAdjuncts = left.isAdjunct() and right.isAdjunct() and parent.isAdjunct()
        if not allAdjuncts:
            return -1
        if not (left.argument == right.argument == parent.argument):
            return -1
        if right.slash == '\\':
            return 1
        if left.slash == '/':
            return 0
        return -1

    findFunctor = staticmethod(findFunctor)

    def replace(self, new):
        """
        Relabel both children so that the production yields new

        When new is an adjunct X|X, both children become adjuncts over X,
        each keeping its own slash. Otherwise the functor becomes new|new and
        the argument becomes a copy of new.
        """
        adjunctTarget = new.isAdjunct()
        if adjunctTarget:
            x = new.result
        else:
            x = new
        # Build both replacements before assigning either one
        newFunctor = ccg.ComplexCategory(dcopy(x), dcopy(x), self.functor.slash, False)
        if adjunctTarget:
            newArgument = ccg.ComplexCategory(dcopy(x), dcopy(x), self.argument.slash, False)
        else:
            newArgument = dcopy(x)
        self.functor = newFunctor
        self.argument = newArgument

    def _getHead(self):
        """
        Head to the left
        """
        # Doesn't matter, but follow C&C parser's incorrect left head policy
        return self.left
500 | 
class AddConj(_Production):
    """
    A conjunction word or punctuation mark joining a conjunct to give X[conj]
    """
    label = 'n'

    def findFunctor(left, right, parent):
        """
        Return the position of the conj/punctuation child, or -1

        The parent must be marked conj and neither child may be. A child equal
        to ccg.conj is preferred over a punctuation child, and left over right
        within each kind. The other child must print the same as the parent
        with its '[conj]' marker removed.
        """
        if not parent.conj:
            return -1
        if left.conj or right.conj:
            return -1
        if left == ccg.conj:
            position, conjunct = 0, right
        elif right == ccg.conj:
            position, conjunct = 1, left
        elif left.isPunct():
            position, conjunct = 0, right
        elif right.isPunct():
            position, conjunct = 1, left
        else:
            return -1
        expected = str(parent).replace('[conj]', '')
        if str(conjunct) == expected:
            return position
        return -1

    findFunctor = staticmethod(findFunctor)

    def replace(self, new):
        """
        Make the argument a copy of new with its conj flag cleared
        """
        self.argument = dcopy(new)
        self.argument.conj = False

    def _getHead(self):
        """
        The head is the argument child
        """
        return self.argument
536 | 
class Conjunction(_Production):
    """
    Coordination in which one child is already marked conj
    """
    label = 'j'

    def findFunctor(left, right, parent):
        """
        Return the position of the conj-marked child, or -1

        A parent that is itself marked conj never matches; the left child is
        checked before the right.
        """
        if parent.conj:
            return -1
        if left.conj:
            return 0
        if right.conj:
            return 1
        return -1

    findFunctor = staticmethod(findFunctor)

    def replace(self, new):
        """
        Make the functor a conj-marked copy of new and the argument a plain copy
        """
        marked = dcopy(new)
        marked.conj = True
        self.functor = marked
        self.argument = dcopy(new)

    def _getHead(self):
        """
        Head to the left
        """
        # Follow CCGbank in heading to the left
        return self.left
560 | 
class Auxiliary(_Production):
    """
    Production in which one child is a true auxiliary
    """
    label = 'x'

    def findFunctor(left, right, parent):
        """
        Return the position of the first child that is a true auxiliary, or -1
        """
        # Recent change, might break something
        for position, child in enumerate((left, right)):
            if child.isTrueAux():
                return position
        return -1

    findFunctor = staticmethod(findFunctor)

    def replace(self, new):
        """
        Make the argument a copy of new
        """
        # Wrong: need feature from functor result
        self.argument = dcopy(new)

    def _getHead(self):
        """
        The head is the argument child
        """
        # Depart from CCGbank in taking the argument
        return self.argument
581 |     
class Punctuation(_Production):
    """
    A punctuation symbol attached to a category that the parent repeats unchanged
    """
    label = 'p'

    def findFunctor(left, right, parent):
        """
        Functor is the punctuation symbol. Ensure this isn't conjunctive punctuation.

        Returns the position of the punctuation child when the other child is
        identical to the parent, otherwise -1. Only the left child is
        considered when it is punctuation.
        """
        if left.isPunct():
            return 0 if ccg.isIdentical(right, parent) else -1
        if right.isPunct():
            return 1 if ccg.isIdentical(left, parent) else -1
        return -1

    findFunctor = staticmethod(findFunctor)

    def replace(self, new):
        """
        Make the argument a copy of new
        """
        self.argument = dcopy(new)

    def _getHead(self):
        """
        The head is the argument child
        """
        return self.argument
606 | 
class UnHat(_Production):
    """
    Unary production whose parent is identical to the child's morph category
    """
    label = 'h'

    def findFunctor(left, right, parent):
        """
        Return 0 for a unary production whose parent is identical to left.morph, else -1
        """
        if not right and ccg.isIdentical(parent, left.morph):
            return 0
        return -1

    findFunctor = staticmethod(findFunctor)

    def replace(self, new):
        """
        Set the functor's morph category to a copy of new
        """
        # NOTE(review): the self-assignment is kept from the original; it
        # presumably re-runs a functor setter -- confirm before removing
        self.functor = self.functor
        self.functor.morph = dcopy(new)
622 |         
623 |     
624 |     
625 | 
class Unary(_Production):
    """
    Any production with a single child
    """
    label = 'u'

    def findFunctor(left, right, parent):
        """
        Return 0 when there is no right child, else -1
        """
        return -1 if right else 0

    findFunctor = staticmethod(findFunctor)

    def replace(self, new):
        """
        Do nothing: a new parent label is not passed down to the child
        """
        return None

    def _getHead(self):
        """
        The head is the only (left) child
        """
        return self.left
642 | 
643 | 
644 | 
class Invalid(_Production):
    """
    Catch-all for binary productions that no other rule claims
    """
    label = 'i'

    def findFunctor(left, right, parent):
        """
        Functor is arbitrarily the left side, unless involves conj or punct

        This is order dependent on the other rules, as we do not want to check that the production is
        in fact invalid.
        """
        rightIsFunctor = right == ccg.conj or right.isPunct()
        return 1 if rightIsFunctor else 0

    findFunctor = staticmethod(findFunctor)

    def replace(self, new):
        """
        Conj and punct cases are particularly common for invalid, so
        percolate the new label down for these

        Any other functor leaves the production untouched. Returns None.
        """
        if not (self.functor == ccg.conj or self.functor.isPunct()):
            return None
        # Provide exception case for transformation punctuation
        if self.functor == ',' and new.isAdjunct():
            return None
        self.argument = dcopy(new)
        self.argument.conj = None

    def _getHead(self):
        """
        The head is the functor child
        """
        return self.functor
678 | 
def Production(left, right, parent):
    """
    Allocate a Production

    Each candidate class is asked in priority order, through
    findFunctor(left, right, parent), where its functor sits. The first class
    that does not answer -1 is instantiated with that position and returned.
    A falsy right child selects the unary candidates; otherwise the binary
    candidates are tried.
    """
    binaryClasses = [
        AdjunctDetermination,
        Determination,
        TRaiseApplication,
        Application,
        AdjComp,
        Adjunction,
        TRaiseComp,
        Composition,
        Conjunction,
        AddConj,
        Punctuation,
        Invalid,
    ]
    unaryClasses = [UnHat, Unary]
    if right:
        candidates = binaryClasses
    else:
        candidates = unaryClasses
    for candidate in candidates:
        position = candidate.findFunctor(left, right, parent)
        if position == -1:
            continue
        return candidate(left, right, parent, position)
701 |             
702 |         
703 |     
def testLabels(
        location='/home/mhonn/code/mhonn/Treebank/CCGBank/productionTypes.txt'):
    """
    Check production type assignment against manually annotated production rules

    location is the path of the annotated productions file, in the format
    read by readProductions. It defaults to the path that used to be
    hard-coded here.

    Rows labelled 'x' and rows without a label are skipped. Prints a line for
    every correctly labelled binary production. Raises StandardError on the
    first mismatch, or on a labelled unary production whose label is not 'u'.
    """
    for label, parent, left, right, line in readProductions(location):
        if label == 'x' or not label:
            continue
        if not right:
            # A labelled unary production must carry the unary label
            if label != 'u':
                print(line)
                raise StandardError
            continue
        production = Production(left, right, parent)
        if production.label != label:
            print("Incorrect: %s" % production.label)
            print(line)
            raise StandardError
        print("Correct! %s" % line)
725 |         
def testReplacements(
        location='/home/mhonn/code/mhonn/Treebank/CCGBank/productionTypes.txt'):
    """
    Check that replacing a production's parent with itself leaves it unchanged

    location is the path of the annotated productions file, in the format
    read by readProductions. It defaults to the path that used to be
    hard-coded here.

    For every binary production, the parent label is replaced by itself. The
    line and the production are printed when the assigned label differs from
    the annotated one, and again when either child is no longer identical to
    the original. Reading stops at the first row without a label.
    """
    for label, parent, left, right, line in readProductions(location):
        if not label:
            break
        if not right:
            continue
        production = Production(left, right, parent)
        production.replace(parent)
        if label != production.label:
            print(line)
            print(production)
        if not ccg.isIdentical(production.left, left) or not ccg.isIdentical(production.right, right):
            print(line)
            print(production)
743 |              
def readProductions(location):
    """
    Yield (label, parent, left, right, line) for each production in a file

    Each useful line looks like '[label] freq # parent --> left [right]'.
    Blank lines and lines starting with '#' are skipped. label is None when
    the line has no label, and right is None for unary productions. parent,
    left and right are parsed with ccg.category.from_string; line is the
    stripped source line.

    The file is closed once the generator is exhausted or closed.
    """
    with open(location) as productionFile:
        for line in productionFile:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            front, production = line.split(' # ')
            pieces = front.split()
            # The frequency is the last field before the ' # ' separator. It
            # is checked to be an integer but is not otherwise used.
            int(pieces.pop())
            label = pieces[0] if pieces else None
            parent, children = production.split(' --> ')
            parent = ccg.category.from_string(parent)
            pieces = children.split()
            left = ccg.category.from_string(pieces.pop(0))
            if pieces:
                right = ccg.category.from_string(pieces[0])
            else:
                right = None
            yield label, parent, left, right, line
767 | 
def setComplexAdj(value):
    """
    Set the module-level complexAdj flag to value

    Adjunction.replace reads this flag when choosing which category an
    adjunct should modify.
    """
    global complexAdj
    complexAdj = value
771 | 
# Module-wide switch read by Adjunction.replace; change it with setComplexAdj
complexAdj = True
if __name__ == '__main__':
    # Scratch checks, switched on and off by hand with the if 1 / if 0 guards
    if 1:
        parent = ccg.category.from_string(r'NP\NP')
        cat1 = ccg.category.from_string(r'(S[dc]\NP)^(NP\NP)')
        cat2 = None
        production = Production(cat1, cat2, parent)
        print(production)
        production.replace(ccg.category.from_string(r'N\N'))
        print(production.label)
        print(production.left)
        print(production.right)
    if 0:
        cat1 = ccg.category.from_string('NP[nb]/N')
        cat2 = ccg.category.from_string('N')
        result = ccg.category.from_string('NP')
        production = Production(cat1, cat2, result)
        # Raw literal: the value is unchanged, but a bare backslash-N inside a
        # normal string is a syntax error on Python 3
        production.replace(ccg.category.from_string(r'NP/(S\NP)'))
        print(production.label)
        print(production.left)
        print(production.right)
    if 0:
        testReplacements()
795 | 


--------------------------------------------------------------------------------