├── _config.yml ├── LICENSE_notes.txt ├── _includes └── head-custom-google-analytics.html ├── LICENSE.txt ├── trivial.py ├── span_dict.py ├── README.md ├── sample.gold_ptb ├── category.py ├── convert.py ├── sample.candc ├── analysis.py ├── sample.ccgbank ├── rule.py ├── trees.py └── markup_convert.py /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman 2 | title: CCG to PST conversion 3 | description: Convert Combinatory Categorial Grammar derivations to Phrase Structure Trees. 4 | show_downloads: true 5 | google_analytics: G-JVV2VPL5CX 6 | -------------------------------------------------------------------------------- /LICENSE_notes.txt: -------------------------------------------------------------------------------- 1 | The license on this software is the ISC license. 2 | 3 | "The ISC copyright is functionally equivalent to a two-term BSD copyright with 4 | language removed that is made unnecessary by the Berne convention. This is the 5 | preferred license for new code incorporated into OpenBSD." 6 | from http://www.openbsd.org/policy.html 7 | -------------------------------------------------------------------------------- /_includes/head-custom-google-analytics.html: -------------------------------------------------------------------------------- 1 | {% if site.google_analytics %} 2 | 3 | 4 | 11 | {% endif %} 12 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Jonathan K Kummerfeld 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | PERFORMANCE OF THIS SOFTWARE. 14 | 15 | -------------------------------------------------------------------------------- /trivial.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import trees, category 5 | 6 | # The trivial method reproduces the bracket structure exactly. 
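# When convert() is given an argv list (as convert.py does) it returns the
# (use, tree, schema) triple that script unpacks; called directly, it returns
# just the converted PTB_Tree.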
Labels are either 7 | # the atomic category, or a VP 8 | def convert(source, argv=None, log=sys.stdout): 9 | ans = trees.PTB_Tree() 10 | if '\\' in source.category or '/' in source.category: 11 | ans.label = "VP" 12 | else: 13 | ans.label = category.strip_square_brackets(source.category) 14 | if source.word is not None: 15 | ans.word = source.word 16 | ans.pos = source.pos 17 | ans.label = source.pos 18 | for subtree in source.subtrees: 19 | ans.subtrees.append(convert(subtree)) 20 | if argv is None: 21 | return ans 22 | else: 23 | return True, ans, None 24 | 25 | if __name__ == '__main__': 26 | print("Please enter CCG trees:") 27 | for line in sys.stdin: 28 | print(convert(trees.CCG_Tree(line.strip()))) 29 | -------------------------------------------------------------------------------- /span_dict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import trees 4 | 5 | labels_to_ignore = set(["-NONE-", "TOP", "."]) 6 | words_to_ignore = set(["'","`","''", "``", "--",":",";","-",",","..."]) 7 | def span_dict(tree, ans, pos=0): 8 | start = pos 9 | label = tree.label 10 | word = tree.word 11 | if len(tree.subtrees) == 0: 12 | if label in labels_to_ignore or word in words_to_ignore: 13 | return pos 14 | return pos + 1 15 | for subtree in tree.subtrees: 16 | pos = span_dict(subtree, ans, pos) 17 | end = pos 18 | if start == end: 19 | return start 20 | if (start, end) not in ans: 21 | ans[(start, end)] = set() 22 | if not label[0] == '-': 23 | label = label.split('-')[0] 24 | label = label.split('=')[0] 25 | if label == 'PRT': 26 | label = 'ADVP' # another collins change 27 | if label != '' and label != 'TOP': 28 | ans[(start, end)].add(label) 29 | return pos 30 | 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This software converts Combinatory Categorial Grammar (CCG) derivations to Phrase Structure Trees (PST). For a full description of the method, and discussion of results, see: 2 | 3 | [Robust Conversion of CCG Derivations to Phrase Structure Trees](https://aclweb.org/anthology/P/P12/P12-2021.pdf), 4 | Jonathan K. Kummerfeld, James R. Curran and Dan Klein, 5 | ACL (short) 2012 6 | 7 | To use the system, download it one of these ways, and run as shown below: 8 | 9 | - [Download .zip](https://github.com/jkkummerfeld/berkeley-ccg2pst/zipball/master) 10 | - [Download .tar.gz](https://github.com/jkkummerfeld/berkeley-ccg2pst/tarball/master) 11 | - `git clone https://github.com/jkkummerfeld/berkeley-ccg2pst.git` 12 | 13 | If you use my code in your own work, please cite the paper: 14 | 15 | ``` 16 | @InProceedings{Kummerfeld-Klein-Curran:2012:ACL, 17 | author = {Jonathan K. Kummerfeld and Dan Klein and James R. 
Curran}, 18 | title = {Robust Conversion of {CCG} Derivations to Phrase Structure Trees}, 19 | booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, 20 | month = {July}, 21 | year = {2012}, 22 | address = {Jeju Island, Korea}, 23 | pages = {105--109}, 24 | software = {http://github.com/jkkummerfeld/berkeley-ccg2pst/}, 25 | url = {http://www.aclweb.org/anthology/P12-2021}, 26 | } 27 | ``` 28 | 29 | ## Running the code 30 | 31 | On a sample of CCGbank: 32 | ``` 33 | ./convert.py sample.gold_ptb sample.ccgbank -print_comparison -prefix=sample.ccgbank -verbose -method=markedup ./markedup 34 | ``` 35 | 36 | On a sample of C&C Parser output: 37 | ``` 38 | ./convert.py sample.gold_ptb sample.candc -print_comparison -prefix=sample.candc -verbose -method=markedup ./markedup 39 | ``` 40 | 41 | Conversion output will be in: 42 | ``` 43 | sample.ccgbank.auto 44 | sample.candc.auto 45 | ``` 46 | 47 | The code also comes with a sample of parses from the Penn Treebank section 00, 48 | the corresponding parses from CCGbank section 00, and the C&C parser output on 49 | the same sentences. 50 | -------------------------------------------------------------------------------- /sample.gold_ptb: -------------------------------------------------------------------------------- 1 | (ROOT (S (NP-SBJ (NP (NNP Pierre) (NNP Vinken)) (, ,) (ADJP (NP (CD 61) (NNS years)) (JJ old)) (, ,)) (VP (MD will) (VP (VB join) (NP (DT the) (NN board)) (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director))) (NP-TMP (NNP Nov.) (CD 29)))) (. .))) 2 | (ROOT (S (NP-SBJ (NNP Mr.) (NNP Vinken)) (VP (VBZ is) (NP-PRD (NP (NN chairman)) (PP (IN of) (NP (NP (NNP Elsevier) (NNP N.V.)) (, ,) (NP (DT the) (NNP Dutch) (VBG publishing) (NN group)))))) (. .))) 3 | (ROOT (S (NP-SBJ-1 (NP (NNP Rudolph) (NNP Agnew)) (, ,) (UCP (ADJP (NP (CD 55) (NNS years)) (JJ old)) (CC and) (NP (NP (JJ former) (NN chairman)) (PP (IN of) (NP (NNP Consolidated) (NNP Gold) (NNP Fields) (NNP PLC))))) (, ,)) (VP (VBD was) (VP (VBN named) (S (NP-SBJ (-NONE- *-1)) (NP-PRD (NP (DT a) (JJ nonexecutive) (NN director)) (PP (IN of) (NP (DT this) (JJ British) (JJ industrial) (NN conglomerate))))))) (. .))) 4 | (ROOT (S (S-TPC-1 (NP-SBJ (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos)))) (RRC (ADVP-TMP (RB once)) (VP (VBN used) (NP (-NONE- *)) (S-CLR (NP-SBJ (-NONE- *)) (VP (TO to) (VP (VB make) (NP (NNP Kent) (NN cigarette) (NNS filters)))))))) (VP (VBZ has) (VP (VBN caused) (NP (NP (DT a) (JJ high) (NN percentage)) (PP (IN of) (NP (NN cancer) (NNS deaths))) (PP-LOC (IN among) (NP (NP (DT a) (NN group)) (PP (IN of) (NP (NP (NNS workers)) (RRC (VP (VBN exposed) (NP (-NONE- *)) (PP-CLR (TO to) (NP (PRP it))) (ADVP-TMP (NP (QP (RBR more) (IN than) (CD 30)) (NNS years)) (IN ago)))))))))))) (, ,) (NP-SBJ (NNS researchers)) (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .))) 5 | (ROOT (S (S-TPC-2 (NP-SBJ (NP (DT The) (NN asbestos) (NN fiber)) (, ,) (NP (NN crocidolite)) (, ,)) (VP (VBZ is) (ADJP-PRD (RB unusually) (JJ resilient)) (SBAR-TMP (IN once) (S (NP-SBJ (PRP it)) (VP (VBZ enters) (NP (DT the) (NNS lungs))))) (, ,) (PP (IN with) (S-NOM (NP-SBJ (NP (RB even) (JJ brief) (NNS exposures)) (PP (TO to) (NP (PRP it)))) (VP (VBG causing) (NP (NP (NNS symptoms)) (SBAR (WHNP-1 (WDT that)) (S (NP-SBJ (-NONE- *T*-1)) (VP (VBP show) (PRT (RP up)) (ADVP-TMP (NP (NNS decades)) (JJ later))))))))))) (, ,) (NP-SBJ (NNS researchers)) (VP (VBD said) (SBAR (-NONE- 0) (S (-NONE- *T*-2)))) (. 
.))) 6 | (ROOT (S (NP-SBJ (NP (NNP Lorillard) (NNP Inc.)) (, ,) (NP (NP (DT the) (NN unit)) (PP (IN of) (NP (ADJP (JJ New) (JJ York-based)) (NNP Loews) (NNP Corp.))) (SBAR (WHNP-2 (WDT that)) (S (NP-SBJ (-NONE- *T*-2)) (VP (VBZ makes) (NP (NNP Kent) (NNS cigarettes)))))) (, ,)) (VP (VBD stopped) (VP (VBG using) (NP (NN crocidolite)) (PP-LOC-CLR (IN in) (NP (PRP$ its) (NN Micronite) (NN cigarette) (NNS filters))) (PP-TMP (IN in) (NP (CD 1956))))) (. .))) 7 | (ROOT (S (SBAR-ADV (IN Although) (S (NP-SBJ-2 (JJ preliminary) (NNS findings)) (VP (VBD were) (VP (VBN reported) (NP (-NONE- *-2)) (ADVP-TMP (NP (QP (RBR more) (IN than) (DT a)) (NN year)) (IN ago)))))) (, ,) (NP-SBJ (DT the) (JJS latest) (NNS results)) (VP (VBP appear) (PP-LOC (IN in) (NP (NP (NP (NP (NN today) (POS 's)) (NNP New) (NNP England) (NNP Journal)) (PP (IN of) (NP (NNP Medicine)))) (, ,) (NP (NP (DT a) (NN forum)) (ADJP (JJ likely) (S (NP-SBJ (-NONE- *)) (VP (TO to) (VP (VB bring) (NP (JJ new) (NN attention)) (PP-DIR (TO to) (NP (DT the) (NN problem))))))))))) (. .))) 8 | (ROOT (S (NP-SBJ (DT A) (NNP Lorillard) (NN spokewoman)) (VP (VBD said) (, ,) (`` ``) (S (NP-SBJ (DT This)) (VP (VBZ is) (NP-PRD (DT an) (JJ old) (NN story))))) (. .))) 9 | (ROOT (S (NP-SBJ (PRP We)) (VP (VBP 're) (VP (VBG talking) (PP-CLR (IN about) (ADVP-TMP (ADVP (NP (NNS years)) (IN ago)) (SBAR (IN before) (S (NP-SBJ (NN anyone)) (VP (VBD heard) (PP-CLR (IN of) (S-NOM (NP-SBJ (NN asbestos)) (VP (VBG having) (NP (DT any) (JJ questionable) (NNS properties)))))))))))) (. .))) 10 | (ROOT (S (NP-SBJ (EX There)) (VP (VBZ is) (NP-PRD (DT no) (NN asbestos)) (PP-LOC (IN in) (NP (PRP$ our) (NNS products))) (ADVP-TMP (RB now))) (. .) ('' ''))) 11 | -------------------------------------------------------------------------------- /category.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Just handle unary rules, working out when one is being used 3 | 4 | import re 5 | 6 | CURLY_BRACES_RE = re.compile('{[^}]*}') 7 | def strip_braces(category): 8 | return CURLY_BRACES_RE.sub('', category) 9 | 10 | SQUARE_BRACKETS_RE = re.compile('\[[^\]]*\]') 11 | def strip_square_brackets(category): 12 | if category is not None: 13 | return SQUARE_BRACKETS_RE.sub('', category) 14 | else: 15 | return None 16 | 17 | def remove_extra_brackets(category): 18 | if category[0] != '(' or category[-1] != ')': 19 | return category 20 | if not ('\\' in category or '/' in category): 21 | return category[1:-1] 22 | depth = 0 23 | hit_zero = False 24 | for i in range(len(category)): 25 | if category[i] == '(': 26 | depth += 1 27 | elif category[i] == ')': 28 | depth -= 1 29 | elif depth == 0: 30 | hit_zero = True 31 | break 32 | if not hit_zero: 33 | return category[1:-1] 34 | return category 35 | 36 | def divide(category): 37 | if '\\' not in category and '/' not in category: 38 | return [category, None, None] 39 | category = remove_extra_brackets(category) 40 | depth = 0 41 | sep = None 42 | for i in range(len(category)): 43 | if category[i] == '(': 44 | depth += 1 45 | elif category[i] == ')': 46 | depth -= 1 47 | elif category[i] in '/\\' and depth == 0: 48 | sep = i 49 | break 50 | if sep is None: 51 | return [category, None, None] 52 | parts = [category[:sep], category[sep:sep+1], category[sep+1:]] 53 | for i in [0, 2]: 54 | while True: 55 | if parts[i][0] != '(' or parts[i][-1] != ')': 56 | break 57 | stripped_version = parts[i][1:-1] 58 | depth = 0 59 | use = True 60 | for char in stripped_version: 61 | if char == '(': 62 
| depth += 1 63 | elif char == ')': 64 | depth -= 1 65 | if depth < 0: 66 | use = False 67 | if use: 68 | parts[i] = stripped_version 69 | else: 70 | break 71 | return parts 72 | 73 | def compare(cat0, cat1): 74 | if cat0 is None or cat1 is None: 75 | return False 76 | # Check the general structure matches 77 | if strip_square_brackets(cat0) != strip_square_brackets(cat1): 78 | return False 79 | # remove [conj], which is present temporarily at the end 80 | cat0 = cat0.split('[conj]')[0] 81 | cat1 = cat1.split('[conj]')[0] 82 | 83 | cat0 = 'NP[X]'.join(cat0.split('NP')) 84 | cat0 = 'NP['.join(cat0.split('NP[X][')) 85 | cat1 = 'NP[X]'.join(cat1.split('NP')) 86 | cat1 = 'NP['.join(cat1.split('NP[X][')) 87 | 88 | cat0 = 'S[X]'.join(cat0.split('S')) 89 | cat0 = 'S['.join(cat0.split('S[X][')) 90 | cat1 = 'S[X]'.join(cat1.split('S')) 91 | cat1 = 'S['.join(cat1.split('S[X][')) 92 | 93 | pairs0 = SQUARE_BRACKETS_RE.findall(cat0) 94 | pairs1 = SQUARE_BRACKETS_RE.findall(cat1) 95 | # Having no brackets indicates no S, so it's fine 96 | if len(pairs0) == 0 or len(pairs1) == 0: 97 | return True 98 | # For debugging 99 | if len(pairs0) != len(pairs1): 100 | print('confused by:') 101 | print(cat0, cat1) 102 | # Make sure they all match (with X as a wildcard) 103 | for i in range(len(pairs0)): 104 | if pairs0[i] == '[X]' or pairs1[i] == '[X]' or pairs0[i] == pairs1[i]: 105 | continue 106 | return False 107 | return True 108 | 109 | if __name__ == '__main__': 110 | pass 111 | -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys, re 4 | import trees, category, rule 5 | import analysis 6 | import span_dict 7 | import trivial, markup_convert 8 | 9 | tree_out = None 10 | gold_out = None 11 | log_out = sys.stdout 12 | colour_out = None 13 | analysis_out = sys.stdout 14 | 15 | def score_count(target, auto): 16 | gold_nodes = 0 17 | auta_nodeo = 0 18 | match_brackets = 0 19 | match_labels = 0 20 | 21 | target_spans = {} 22 | span_dict.span_dict(target, target_spans) 23 | auto_spans = {} 24 | span_dict.span_dict(auto, auto_spans) 25 | gold_nodes = 0 26 | auto_nodes = 0 27 | print(target_spans.keys(), file=log_out) 28 | print(auto_spans.keys(), file=log_out) 29 | for key in target_spans: 30 | gold_nodes += len(target_spans[key]) 31 | if key in auto_spans: 32 | match_brackets += min(len(auto_spans[key]), len(target_spans[key])) 33 | match_labels += len(auto_spans[key].intersection(target_spans[key])) 34 | if len(target_spans[key].symmetric_difference(auto_spans[key])) != 0: 35 | print('different label sets: ', key, target_spans[key], auto_spans[key], target.word_yield(key)[1], file=log_out) 36 | print('different label sets: ', key, target_spans[key], auto_spans[key], target.word_yield(key)[1], file=colour_out) 37 | else: 38 | # Check for crossing brackets 39 | crossing = False 40 | for akey in auto_spans: 41 | if key[0] < akey[0] < key[1] < akey[1]: 42 | crossing = True 43 | break 44 | if akey[0] < key[0] < akey[1] < key[1]: 45 | crossing = True 46 | break 47 | if crossing: 48 | print('crossing', end=" ", file=log_out) 49 | print('\033[01;31mcrossing\033[00m', end=" ", file=colour_out) 50 | print('missing span: ', key, target_spans[key], target.word_yield(key)[1], file=log_out) 51 | print('missing span: ', key, target_spans[key], target.word_yield(key)[1], file=colour_out) 52 | for key in auto_spans: 53 | auto_nodes += len(auto_spans[key]) 54 | if key 
not in target_spans: 55 | crossing = False 56 | for tkey in target_spans: 57 | if key[0] < tkey[0] < key[1] < tkey[1]: 58 | crossing = True 59 | break 60 | if tkey[0] < key[0] < tkey[1] < key[1]: 61 | crossing = True 62 | break 63 | if crossing: 64 | print('crossing', end=" ", file=log_out) 65 | print('\033[01;31mcrossing\033[00m', end=" ", file=colour_out) 66 | # Check for crossing brackets 67 | print('extra span: ', key, auto_spans[key], target.word_yield(key)[1], file=log_out) 68 | print('extra span: ', key, auto_spans[key], target.word_yield(key)[1], file=colour_out) 69 | return gold_nodes, auto_nodes, match_brackets, match_labels 70 | 71 | def calc_prf(overlap, auto, gold): 72 | if gold == 0: 73 | return 1.0, 1.0, 1.0 74 | if auto == 0: 75 | return 0.0, 0.0, 0.0 76 | p = float(overlap) / auto 77 | r = float(overlap) / gold 78 | f = 0 79 | if p + r > 1e-5: 80 | f = 2 * p * r / (p + r) 81 | return p, r, f 82 | 83 | def compare_words(pwords, cwords): 84 | i = 0 85 | match = 0 86 | for word in cwords: 87 | while word != pwords[i]: 88 | if i == len(pwords) - 1: 89 | break 90 | i += 1 91 | if word == pwords[i]: 92 | match += 1 93 | return float(match) / len(cwords) 94 | 95 | def print_stats(stats_name, gold_nodes, auto_nodes, match_brackets, match_labels, correct_sentences, correct_sentences_brackets, total_sentences): 96 | p_brac, r_brac, f_brac = calc_prf(match_brackets, auto_nodes, gold_nodes) 97 | p_labe, r_labe, f_labe = calc_prf(match_labels, auto_nodes, gold_nodes) 98 | print(stats_name, "counts: ", gold_nodes, auto_nodes, ' ', match_brackets, match_labels, file=log_out) 99 | print(stats_name, "brackets: %.2f %.2f %.2f" % (p_brac * 100, r_brac * 100, f_brac * 100), file=log_out) 100 | print(stats_name, "labels: %.2f %.2f %.2f" % (p_labe * 100, r_labe * 100, f_labe * 100), file=log_out) 101 | print(stats_name, "sentences: %d of %d (i.e. %.2f), just brackets %d of %d (i.e. 
%.2f)" % (correct_sentences, total_sentences, correct_sentences * 100.0 / total_sentences, correct_sentences_brackets, total_sentences, correct_sentences_brackets * 102.0 / total_sentences), file=log_out) 102 | 103 | if __name__ == '__main__': 104 | args = ' '.join(sys.argv) 105 | methods = { 106 | 'trivial': trivial.convert, 107 | 'markedup': markup_convert.convert 108 | } 109 | if len(sys.argv) < 3: 110 | print("Usage:\n%s " % sys.argv[0]) 111 | print("Options:") 112 | print("\t-method=[%s]" % (','.join(methods.keys()))) 113 | print("\t-print_comparison") 114 | print("\t-sents=") 115 | print("\t-max_length=") 116 | print("\t-prefix=") 117 | print("\t-exclude_no_parse") 118 | sys.exit(1) 119 | 120 | only_parsed = '-exclude_no_parse' in ' '.join(sys.argv) 121 | if '-prefix=' in args: 122 | prefix = args.split('-prefix=')[1].split(' ')[0] 123 | tree_out = open(prefix + '.auto', 'w') 124 | gold_out = open(prefix + '.gold', 'w') 125 | log_out = open(prefix + '.log', 'w') 126 | colour_out = open(prefix + '.colour', 'w') 127 | analysis_out = open(prefix + '.analysis', 'w') 128 | for output in [log_out, colour_out, analysis_out]: 129 | print("# this file was generated by the following command(s):", file=output) 130 | print("# " + args, file=output) 131 | print('', file=output) 132 | else: 133 | print("# this file was generated by the following command(s):") 134 | print("# " + args) 135 | print 136 | 137 | total_sentences = 1000000 if "-sents" not in args else int(args.split('-sents=')[1].split(' ')[0]) 138 | max_sent_length = -1 if "-max_length" not in args else int(args.split('-sents=')[1].split(' ')[0]) 139 | 140 | gold_nodes = 0 141 | auto_nodes = 0 142 | match_brackets = 0 143 | match_labels = 0 144 | correct_sentences = 0 145 | correct_sentences_brackets = 0 146 | print_trees = "-print_comparison" in args 147 | ptb_source = open(sys.argv[1]) 148 | ccg_source = open(sys.argv[2]) 149 | for i in range(total_sentences): 150 | source = trees.read_CCG_tree(ccg_source) 151 | target = trees.read_PTB_tree(ptb_source) 152 | ### print(source) 153 | if source is None or target is None: 154 | total_sentences = i 155 | break 156 | 157 | if source.category is None: 158 | if not only_parsed: 159 | if gold_out is not None: 160 | print(target.one_line_repr(), file=gold_out) 161 | print("", file=tree_out) 162 | # only evaluate on sentences that receive a parse 163 | continue 164 | 165 | pwords = target.get_words() 166 | cwords = source.get_words() 167 | if len(cwords) != 0: 168 | while compare_words(pwords, cwords) < 0.7: 169 | if not only_parsed: 170 | if gold_out is not None: 171 | print(target.one_line_repr(), file=gold_out) 172 | print("", file=tree_out) 173 | target = trees.read_PTB_tree(ptb_source) 174 | if target is None: 175 | print("Ran out of sentences trying to find a match", file=sys.stderr) 176 | sys.exit(2) 177 | pwords = target.get_words() 178 | 179 | if max_sent_length > 0 and len(pwords) > max_sent_length: 180 | continue 181 | 182 | if target.label == '': 183 | target.label = 'ROOT' 184 | 185 | if print_trees: 186 | print(source, file=log_out) 187 | print(target, file=log_out) 188 | use, auto_ptb, auto_schema = (False, None, None) 189 | if 'method' in args: 190 | method_name = args.split('method=')[1].split()[0] 191 | ans = methods[method_name](source, sys.argv, log_out) 192 | use, auto_ptb, auto_schema = ans 193 | else: 194 | ans = trivial.convert(source, sys.argv, log_out) 195 | use, auto_ptb, auto_schema = ans 196 | 197 | if not use: 198 | print("Not being included", file=log_out) 199 | if 
auto_schema is not None: 200 | analysis.analyse(source, target, auto_ptb, auto_schema, analysis_out) 201 | if tree_out is not None: 202 | if use: 203 | print(target.one_line_repr(), file=gold_out) 204 | print(auto_ptb.one_line_repr(), file=tree_out) 205 | elif not only_parsed: 206 | print(target.one_line_repr(), file=gold_out) 207 | print("", file=tree_out) 208 | 209 | if print_trees: 210 | print(auto_ptb, file=log_out) 211 | if colour_out is not None: 212 | print(source, file=colour_out) 213 | print(auto_ptb.repr_with_corrections(target), file=colour_out) 214 | 215 | scores = score_count(target, auto_ptb) 216 | gold_nodes += scores[0] 217 | auto_nodes += scores[1] 218 | match_brackets += scores[2] 219 | match_labels += scores[3] 220 | if scores[0] == scores[1] == scores[2]: 221 | correct_sentences_brackets += 1 222 | if scores[0] == scores[1] == scores[3]: 223 | correct_sentences += 1 224 | print_stats('', scores[0], scores[1], scores[2], scores[3], correct_sentences, correct_sentences_brackets, i + 1) 225 | print_stats('cumulative', gold_nodes, auto_nodes, match_brackets, match_labels, correct_sentences, correct_sentences_brackets, i + 1) 226 | print_stats('final', gold_nodes, auto_nodes, match_brackets, match_labels, correct_sentences, correct_sentences_brackets, total_sentences) 227 | -------------------------------------------------------------------------------- /sample.candc: -------------------------------------------------------------------------------- 1 | ID=1 PARSER=GOLD NUMPARSE=0 2 | ( ( ( ( () ())) ( () ( ( ( ( () ())) ())))) ( () ( () ( ( ( () ( () ())) ( () ( () ( () ())))) ( () ( () ())))))) 3 | ID=2 PARSER=GOLD NUMPARSE=0 4 | ( ( ( () ())) ( () ( ( ()) ( () ( ( ( () ())) ( () ( () ( () ( () ( () ())))))))))) 5 | ID=3 PARSER=GOLD NUMPARSE=0 6 | ( ( ( ( ( () ())) ( () ( ( ( ( () ())) ())))) ( () ( ( ( () ())) ( () ( ( () ( () ( () ())))))))) ( () ( () ( () ( ( () ( () ())) ( () ( () ( () ( () ( () ())))))))))) 7 | ID=4 PARSER=GOLD NUMPARSE=0 8 | ( ( ( ( () ()) ( () ( ( ()) ( ( () ( () ( () ( () ( ( () ( () ()))))))))))) ( () ( () ( ( () ( () ())) ( () ( ( ( () ())) ( () ( ( () ()) ( () ( ( ()) ( ( ( () ( () ())) ( ( ( ( ( () ()) ()) ())) ()))))))))))))) ( () ( ( ()) ( () ())))) 9 | ID=5 PARSER=GOLD NUMPARSE=0 10 | ( ( ( ( () ( () ())) ( () ( ()))) ( () ( ( () ( () ())) ( () ( () ( ( () ( () ())) ( () ( ( () ( ( ( () ( () ()))) ( () ()))) ( () ( ( ()) ( () ( ( ( () ()) ( ())) ())))))))))))) ( () ( ( ()) ( () ())))) 11 | ID=6 PARSER=GOLD NUMPARSE=0 12 | ( ( ( ( () ())) ( () ( ( ( () ()) ( () ( ( ( () ( () ())) ())))) ( () ( () ( ( () ()))))))) ( () ( ( () ( ( () ( ())) ( () ( () ( () ( () ())))))) ( () ( ( ()) ()))))) 13 | ID=7 PARSER=GOLD NUMPARSE=0 14 | ( ( () ( ( ( () ())) ( () ( ( () ()) ( () ( ( () ()) ())))))) ( () ( ( () ( () ())) ( () ( () ( ( ( ( ()) ()) ( () ( () ()))) ( () ( ( ()) ( () ( ( () ()) ( ( ( () ( () ( ( () ( ( () ()))) ( () ( () ())))))) ()))))))))))) 15 | ID=8 PARSER=GOLD NUMPARSE=0 16 | ( ( () ( () ())) ( () ( () ( () ( () ( () ( () ( () ())))))))) 17 | ID=9 PARSER=GOLD NUMPARSE=0 18 | ( () ( () ( ( ( () ( ( ( () ())) ())) ( () ( () ( ())))) ( () ( ( ()) ( ( ( () ( () ( () ())))) ())))))) 19 | ID=10 PARSER=GOLD NUMPARSE=0 20 | ( () ( ( () ( ( () ()) ( () ( () ())))) ( () ()))) 21 | -------------------------------------------------------------------------------- /analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import trees 5 | import span_dict 6 | import 
markup_convert 7 | 8 | analysis_out = sys.stdout 9 | 10 | def side_by_side(tree0, tree1): 11 | text0 = tree0.__repr__() 12 | text1 = tree1.__repr__() 13 | lines0 = ' '.join(text0.split('\t')).split('\n') 14 | lines1 = ' '.join(text1.split('\t')).split('\n') 15 | longest = 0 16 | for line in lines0: 17 | longest = max(longest, len(line)) 18 | longest += 3 19 | longest = max(longest, 40) 20 | lines = [] 21 | i = 0 22 | while i < len(lines0) or i < len(lines1): 23 | start = ' ' * longest 24 | if i < len(lines0): 25 | start = lines0[i] + ((longest - len(lines0[i])) * ' ') 26 | rest = '' 27 | if i < len(lines1): 28 | rest = lines1[i] 29 | lines.append(start + rest) 30 | i += 1 31 | return '\n'.join(lines) 32 | 33 | def strip_label(label): 34 | if not label[0] == '-': 35 | label = label.split('-')[0] 36 | label = label.split('=')[0] 37 | if label == 'PRT': 38 | label = 'ADVP' # another collins change 39 | return label 40 | 41 | labels_to_ignore = set(["-NONE-", "TOP", "."]) 42 | words_to_ignore = set(["'","`","''", "``", "--",":",";","-",",","..."]) 43 | def spans3(tree, ans, pos): 44 | start = pos 45 | label = None 46 | not_a_schema = False 47 | try: 48 | a = tree.subtrees 49 | not_a_schema = True 50 | except: 51 | pass 52 | if type(tree) == type(''): 53 | if tree == '' or tree[0] != '(': 54 | return pos 55 | label, word = tree[1:-1].split() 56 | if label in labels_to_ignore or word in words_to_ignore: 57 | return pos 58 | return pos + 1 59 | else: 60 | label = tree.label 61 | if not_a_schema: 62 | if len(tree.subtrees) == 0: 63 | if tree.label in labels_to_ignore or tree.word in words_to_ignore: 64 | return pos 65 | return pos + 1 66 | if not_a_schema: 67 | for subtree in tree.subtrees: 68 | pos = spans3(subtree, ans, pos) 69 | else: 70 | for child in tree.children: 71 | pos = spans3(child, ans, pos) 72 | end = pos 73 | if start == end: 74 | return start 75 | if (start, end) not in ans: 76 | ans[(start, end)] = {} 77 | label = strip_label(label) 78 | if label != '' and label != 'TOP': 79 | if label in ans[(start, end)]: 80 | # keep the higher one 81 | ctree = ans[(start, end)][label] 82 | if not_a_schema: 83 | while len(ctree.subtrees) == 1: 84 | ctree = ctree.subtrees[0] 85 | if ctree == tree: 86 | return pos 87 | else: 88 | while len(ctree.children) == 1: 89 | ctree = ctree.children[0] 90 | if ctree == tree: 91 | return pos 92 | if type(ctree) == type(''): 93 | break 94 | ans[(start, end)][label] = tree 95 | return pos 96 | 97 | def spans(tree): 98 | ans = {} 99 | spans3(tree, ans, 0) 100 | return ans 101 | 102 | def tree_repr(tree, depth): 103 | not_a_schema = False 104 | try: 105 | a = tree.subtrees 106 | not_a_schema = True 107 | except: 108 | pass 109 | if not_a_schema: 110 | if tree.word is not None: 111 | return '(%s)' % (strip_label(tree.label)) 112 | text = '(' + strip_label(tree.label) 113 | if depth > 0: 114 | for subtree in tree.subtrees: 115 | text += ' ' + tree_repr(subtree, depth - 1) 116 | text += ')' 117 | return text 118 | else: 119 | return 'TODO' 120 | 121 | def get_word_info(schema_spans, key, label): 122 | word_info = 'unk-cat\tunk-pos\tunk-word' 123 | for label in schema_spans[key]: 124 | span = schema_spans[key][label] 125 | if span.label == label: 126 | cat = span.source.category 127 | pos = span.source.pos 128 | word = span.source.word 129 | if span.source.rule == 'unary': 130 | pos = "unary-rule" 131 | word = span.source.subtrees[0].category 132 | elif span.source.rule == 'binary': 133 | pos = "binary-rule" 134 | word = "%s_%s" % (span.source.subtrees[0].category, 
span.source.subtrees[1].category) 135 | elif span.source.rule == 'type': 136 | pos = "type-raising" 137 | word = span.source.subtrees[0].category 138 | return "%s\t%s\t%s" % (cat, pos, word) 139 | 140 | def get_cat(source, key): 141 | node = source.get_node(key) 142 | if node.pos is not None: 143 | return '\t'.join([node.category, node.pos, node.word]) 144 | else: 145 | return '\t'.join([node.category, 'unk-pos', 'unk-word']) 146 | 147 | def lowest_span(spans): 148 | fallback = [s for s in spans][0] 149 | for span in spans: 150 | if len(span.subtrees) != 1: 151 | return span 152 | return fallback 153 | 154 | def log(fields): 155 | print('\t'.join(fields), file=analysis_out) 156 | 157 | 158 | 159 | def analyse(source, target, auto_ptb, auto_schema, out): 160 | global analysis_out 161 | analysis_out = out 162 | if auto_schema.source is None: 163 | print("Missing schema source") 164 | print("Missing schema source", file=out) 165 | return 166 | 167 | target_spans = spans(target) 168 | auto_spans = spans(auto_ptb) 169 | schema_spans = spans(auto_schema) 170 | 171 | errors = False 172 | 173 | # 174 | # Missing brackets 175 | # 176 | for target_key in target_spans: 177 | if target_key not in auto_spans: 178 | errors = True 179 | # find the set of brackets that are as small as possible, while still covering key 180 | best = None 181 | for akey in auto_spans: 182 | if akey[0] <= target_key[0] and target_key[1] <= akey[1]: 183 | if best is None or best[0] < akey[0] or akey[1] < best[1]: 184 | best = akey 185 | auto_key = best 186 | 187 | for tlabel in target_spans[target_key]: 188 | ttree = target_spans[target_key][tlabel] 189 | atree = lowest_span(auto_spans[auto_key].values()) 190 | cat_info = get_word_info(schema_spans, auto_key, atree.label) 191 | for adepth in [1, 2]: 192 | for tdepth in [1, 2]: 193 | adesc = tree_repr(atree, adepth) 194 | tdesc = tree_repr(ttree, tdepth) 195 | log(['==miss %d %d ==' % (tdepth, adepth), tlabel, cat_info, tdesc, adesc]) 196 | 197 | 198 | # 199 | # Extra brackets 200 | # 201 | for auto_key in auto_spans: 202 | if auto_key not in target_spans: 203 | errors = True 204 | # find the set of brackets that are as small as possible, while still covering key 205 | best = None 206 | for tkey in target_spans: 207 | if tkey[0] <= auto_key[0] and auto_key[1] <= tkey[1]: 208 | if best is None or best[0] < tkey[0] or tkey[1] < best[1]: 209 | best = tkey 210 | target_key = best 211 | if target_key is None: 212 | log(["None target key!", auto_key.__repr__()]) 213 | else: 214 | for alabel in auto_spans[auto_key]: 215 | atree = auto_spans[auto_key][alabel] 216 | cat_info = get_word_info(schema_spans, auto_key, alabel) 217 | ttree = lowest_span(target_spans[target_key].values()) 218 | for adepth in [1, 2]: 219 | for tdepth in [1, 2]: 220 | adesc = tree_repr(atree, adepth) 221 | tdesc = tree_repr(ttree, tdepth) 222 | log(['==extra %d %d ==' % (tdepth, adepth), alabel, cat_info, tdesc, adesc]) 223 | 224 | 225 | # 226 | # Span present in both, but with different labels 227 | # 228 | for key in target_spans: 229 | if key in auto_spans: 230 | target_labels = set(target_spans[key].keys()) 231 | auto_labels = set(auto_spans[key].keys()) 232 | diff = target_labels.symmetric_difference(auto_labels) 233 | if len(diff) != 0: 234 | errors = True 235 | textra = target_labels.difference(auto_labels) 236 | aextra = auto_labels.difference(target_labels) 237 | 238 | # A single label that is wrong 239 | if len(diff) == 2 and len(textra) == 1 and len(aextra) == 1: 240 | tlabel = textra.pop() 241 | 
ttree = target_spans[key][tlabel] 242 | alabel = aextra.pop() 243 | atree = auto_spans[key][alabel] 244 | cat_info = get_word_info(schema_spans, key, alabel) 245 | for adepth in [1, 2]: 246 | for tdepth in [1, 2]: 247 | adesc = tree_repr(atree, adepth) 248 | tdesc = tree_repr(ttree, tdepth) 249 | log(['==diff-c %d %d ==' % (tdepth, adepth), tlabel + '_' + alabel, cat_info, tdesc, adesc]) 250 | 251 | elif len(aextra) == 0: # ie, these are actually missing 252 | for tlabel in textra: 253 | ttree = target_spans[key][tlabel] 254 | atree = lowest_span(auto_spans[key].values()) 255 | cat_info = get_word_info(schema_spans, key, atree.label) 256 | for adepth in [1, 2]: 257 | for tdepth in [1, 2]: 258 | adesc = tree_repr(atree, adepth) 259 | tdesc = tree_repr(ttree, tdepth) 260 | log(['==miss %d %d ==' % (tdepth, adepth), tlabel, cat_info, tdesc, adesc]) 261 | 262 | elif len(textra) == 0: # ie, these are actually extra 263 | for alabel in aextra: 264 | atree = auto_spans[key][alabel] 265 | cat_info = get_word_info(schema_spans, key, alabel) 266 | ttree = lowest_span(target_spans[key].values()) 267 | for adepth in [1, 2]: 268 | for tdepth in [1, 2]: 269 | adesc = tree_repr(atree, adepth) 270 | tdesc = tree_repr(ttree, tdepth) 271 | log(['==extra %d %d ==' % (tdepth, adepth), alabel, cat_info, tdesc, adesc]) 272 | 273 | else: # more complicated difference 274 | for tlabel in textra: 275 | ttree = target_spans[key][tlabel] 276 | atree = lowest_span(auto_spans[key].values()) 277 | cat_info = get_word_info(schema_spans, key, atree.label) 278 | for adepth in [1, 2]: 279 | for tdepth in [1, 2]: 280 | adesc = tree_repr(atree, adepth) 281 | tdesc = tree_repr(ttree, tdepth) 282 | log(['==diff-m %d %d ==' % (tdepth, adepth), tlabel, cat_info, tdesc, adesc]) 283 | for alabel in aextra: 284 | atree = auto_spans[key][alabel] 285 | cat_info = get_word_info(schema_spans, key, alabel) 286 | ttree = lowest_span(target_spans[key].values()) 287 | for adepth in [1, 2]: 288 | for tdepth in [1, 2]: 289 | adesc = tree_repr(atree, adepth) 290 | tdesc = tree_repr(ttree, tdepth) 291 | log(['==diff-e %d %d ==' % (tdepth, adepth), alabel, cat_info, tdesc, adesc]) 292 | 293 | # 294 | # Correct brackets 295 | # 296 | for key in target_spans: 297 | if key in auto_spans: 298 | target_labels = set(target_spans[key].keys()) 299 | auto_labels = set(auto_spans[key].keys()) 300 | same = target_labels.intersection(auto_labels) 301 | for label in same: 302 | ttree = target_spans[key][label] 303 | atree = auto_spans[key][label] 304 | cat_info = get_word_info(schema_spans, key, label) 305 | for adepth in [1, 2]: 306 | for tdepth in [1, 2]: 307 | adesc = tree_repr(atree, adepth) 308 | tdesc = tree_repr(ttree, tdepth) 309 | log(['==same %d %d ==' % (tdepth, adepth), label, cat_info, tdesc, adesc]) 310 | # 311 | # General sentence info 312 | # 313 | if errors: 314 | print(target, file=out) 315 | print(auto_ptb, file=out) 316 | print("", file=out) 317 | 318 | if __name__ == '__main__': 319 | pass 320 | -------------------------------------------------------------------------------- /sample.ccgbank: -------------------------------------------------------------------------------- 1 | ID=wsj_0001.1 PARSER=GOLD NUMPARSE=1 2 | ( ( ( ( ( ( ( () () ) ) () ) ( ( ( ( () () ) ) () ) ) ) () ) ( () ( ( ( () ( () () ) ) ( () ( () ( () () ) ) ) ) ( () () ) ) ) ) () ) 3 | ID=wsj_0001.2 PARSER=GOLD NUMPARSE=1 4 | ( ( ( ( () () ) ) ( () ( ( () ) ( () ( ( ( () () ) ) ( () ( () ( () ( () () ) ) ) ) ) ) ) ) ) () ) 5 | ID=wsj_0002.1 PARSER=GOLD NUMPARSE=1 6 | ( ( 
( ( ( ( ( () () ) ) () ) ( ( ( ( ( () () ) ) () ) ( () ( ( ( () () ) ) ( () ( ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) () ) ( () ( () ( ( () ( () () ) ) ( () ( () ( () ( () () ) ) ) ) ) ) ) ) () ) 7 | ID=wsj_0003.1 PARSER=GOLD NUMPARSE=1 8 | ( ( ( ( ( ( () () ) ( () ( () ) ) ) ( ( () ( () ( () ( () ( ( () ( () () ) ) ) ) ) ) ) ) ) ( () ( () ( ( ( () ( () () ) ) ( () ( ( () () ) ) ) ) ( () ( ( () () ) ( () ( ( () ) ( ( ( () ( () () ) ) ( ( ( ( ( () () ) () ) () ) ) () ) ) ) ) ) ) ) ) ) ) ) ( () ( ( () ) () ) ) ) () ) 9 | ID=wsj_0003.2 PARSER=GOLD NUMPARSE=1 10 | ( ( ( ( ( ( () ( () () ) ) ( () ( () ) ) ) () ) ( ( ( ( () ( () () ) ) ( () ( () ( () ( () () ) ) ) ) ) () ) ( ( () ( ( ( () ( () () ) ) ) ( () () ) ) ) ( () ( ( () ) ( () ( ( () () ) ( ( () ) () ) ) ) ) ) ) ) ) ( () ( ( () ) () ) ) ) () ) 11 | ID=wsj_0003.3 PARSER=GOLD NUMPARSE=1 12 | ( ( ( ( ( ( () () ) ) ( () ( ( ( () () ) ( () ( ( ( () () ) ( () () ) ) ) ) ) ( () ( () ( ( () () ) ) ) ) ) ) ) () ) ( () ( ( ( () ( () ) ) ( () ( () ( () ( () () ) ) ) ) ) ( () ( () ) ) ) ) ) () ) 13 | ID=wsj_0003.4 PARSER=GOLD NUMPARSE=1 14 | ( ( ( () ( ( ( () () ) ) ( () ( () ( ( ( ( () () ) () ) () ) () ) ) ) ) ) ( () ( ( () ( () () ) ) ( () ( () ( ( ( ( ( () ) () ) ( () ( () () ) ) ) ( () ( () ) ) ) ( () ( ( () () ) ( ( () ( () ( ( () ( ( () () ) ) ) ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) ) () ) 15 | ID=wsj_0003.5 PARSER=GOLD NUMPARSE=1 16 | ( ( ( () ( () () ) ) ( ( () () ) ( () ( () ( () ( () () ) ) ) ) ) ) () ) 17 | ID=wsj_0003.6 PARSER=GOLD NUMPARSE=1 18 | ( ( () ( () ( () ( () ( ( ( () ) () ) ( () ( () ( () ( ( () ( () ) ) ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) () ) 19 | ID=wsj_0003.7 PARSER=GOLD NUMPARSE=1 20 | ( ( () ( ( ( () ( () () ) ) ( () ( () () ) ) ) () ) ) () ) 21 | -------------------------------------------------------------------------------- /rule.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Just handle unary rules, working out when one is being used 3 | 4 | import re 5 | import category 6 | 7 | # from, to, keep original dependencies, activated by extra flag 8 | UNARIES = [ 9 | ('S[adj]\\NP','NP\\NP',False,False,[ 10 | '(ADJP 0)', 11 | '(NP {1} 0) arg:(NP PP ...):', 12 | '(NP 1 0) arg:default:']), 13 | ('S[to]\\NP','NP\\NP',True,False,[ 14 | '{(TEMP 0)}', 15 | '(NP {1} (SBAR 0)) arg:(NP PP ...):', 16 | '(NP 1 (SBAR 0)) arg:default:']), 17 | ('S[dcl]/NP','NP\\NP',True,False,[ 18 | '(SBAR 0)', 19 | '(NP {1} 0) arg:(NP PP ...):', 20 | '(NP 1 0) arg:default:']), 21 | ('(S[to]\\NP)/NP','NP\\NP',True,True,[]), 22 | ('S[dcl]','NP\\NP',False,True,[]), 23 | 24 | ('S[pss]\\NP','S/S',False,True,[]), 25 | ('S[ng]\\NP','S/S',False,False,[ 26 | '(S 0)', 27 | '(S* 0 {1})']), 28 | ('S[adj]\\NP','S/S',False,True,[]), 29 | ('S[ng]\\NP','S\\S',False,True,[]), 30 | ('S[dcl]','S\\S',False,True,[]), 31 | ('S/S','S\\S',False,False,[]), 32 | ('S[to]\\NP','S/S',False,True,[]), 33 | 34 | ('S[pss]\\NP','(S\\NP)\\(S\\NP)',False,True,[]), 35 | ('S[ng]\\NP','(S\\NP)\\(S\\NP)',False,False,[ 36 | '(S 0)', 37 | '(VP {1} 0)', 38 | '(S 1 0)']), 39 | ('S[adj]\\NP','(S\\NP)\\(S\\NP)',False,False,[ 40 | '(S (ADJP 0))', 41 | '(VP {1} 0)', 42 | '(S 1 0)']), 43 | ('S[to]\\NP','(S\\NP)\\(S\\NP)',False,False,[ 44 | '(S 0)', 45 | '(VP {1} 0)', 46 | '(S 1 0)']), 47 | 48 | ('S[ng]\\NP','NP',False,True,[ 49 | '(S {0})']), 50 | ('N','NP',False,False,[ 51 | '(NP {0})']), 52 | # Need to implement filtering based on self... 53 | ### ('N','NP',False,False,[ 54 | ### '{(TEMP 0)}', 55 | ### '(QP {0}) self:(... 
QP):', 56 | ### '(QP 0) self:(... CD):', 57 | ### '(NP {0}) self:default:']), 58 | 59 | ('S[ng]\\NP','(S\\NP)/(S\\NP)',False,True,[]), 60 | ('S[to]\\NP','N\\N',True,False,[]), 61 | ('NP','NP/(NP\\NP)',False,True,[]), 62 | ('S[dcl][conj]','S[dcl]',False,False,[ 63 | '{(TEMP 0)}']), 64 | ('PP','(S\\NP)\\((S\\NP)/PP)',False,False,[]), 65 | ('S[to]\\NP','(S\\NP)\\((S\\NP)/(S[to]\\NP))',False,False,[]), 66 | ('S[adj]\\NP','(S\\NP)\\((S\\NP)/(S[adj]\\NP))',False,False,[]), 67 | ('NP','S/(S\\NP)',False,False,[]), 68 | ('NP','(S\\NP)\\((S\\NP)/NP)',False,False,[]), 69 | ('NP','((S\\NP)/NP)\\(((S\\NP)/NP)/NP)',False,False,[]), 70 | ('NP','((S\\NP)/(S[to]\\NP))\\(((S\\NP)/(S[to]\\NP))/NP)',False,False,[]), 71 | ('NP','((S\\NP)/PP)\\(((S\\NP)/PP)/NP)',False,False,[]), 72 | ('NP','((S\\NP)/(S[adj]\\NP))\\(((S\\NP)/(S[adj]\\NP))/NP)',False,False,[]), 73 | ('NP','S/(S/NP)',False,False,[ 74 | '{(TEMP 0)}', 75 | '(S 0 {1})']), 76 | 77 | ('S[dcl]','((S\\NP)\\(S\\NP))\\((S\\NP)\\(S\\NP))',False,False,[ 78 | '(SBAR 0)', 79 | '(NP 1 0)', 80 | '(VP {1} 0)', 81 | '(S 1 0)']), 82 | 83 | ('S[X]\\NP','NP\\NP',True,False,[]) 84 | ] 85 | 86 | def get_unary(start_cat, end_cat, markedup=None): 87 | # Note: PP_qus - for questions only, ignored for now 88 | for unary in UNARIES: 89 | start = unary[0] 90 | end_markup = unary[1] 91 | end = category.strip_braces(end_markup) 92 | keep_deps = unary[2] 93 | extra = unary[3] 94 | rules = unary[4] 95 | if category.compare(start_cat, start): 96 | if category.compare(end_cat, end): 97 | if len(rules) > 0: 98 | return rules 99 | elif markedup is not None: 100 | if end in markedup: 101 | return markedup[end][1:] 102 | end_no_brac = category.strip_square_brackets(end) 103 | if end_no_brac in markedup: 104 | return markedup[end_no_brac][1:] 105 | else: 106 | return [] 107 | return None 108 | 109 | BINARIES = [ 110 | (',','NP','(S\\NP)\\(S\\NP)',False,[ 111 | '(ADVP {0} 1)', 112 | '(VP {1} {0})', 113 | '(S 1 0)']), 114 | ('NP',',','S/S',False,[ 115 | '(S (S 0) 1)', 116 | '(S* {0} 1)']), 117 | ('S[dcl]\\S[dcl]',',','S/S',False,[ 118 | '(PRN (SINV 0) 1)', 119 | '(S* 0 1)']), 120 | ('S[dcl]/S[dcl]',',','(S\\NP)/(S\\NP)',False,[ 121 | '(S 0 1)', 122 | '(S {0} 1)', 123 | '(S 1 {0})']), 124 | ('S[dcl]/S[dcl]',',','(S\\NP)\\(S\\NP)',False,[ 125 | '(S 0 1)', 126 | '(S 1 {0})', 127 | '(S 1 {0})']), 128 | ('S[dcl]/S[dcl]',',','S/S',False,[ 129 | '(S 0 1)', 130 | '(S* {0} {1})']), 131 | ('S[dcl]/S[dcl]',',','S\\S',False,[ 132 | '(S 0 1)', 133 | '(S* {1} {0})']), 134 | 135 | # not generated by C&C 136 | ('S[dcl]',',','S/S',False,[ 137 | '(S {0} 1)', 138 | '(S* 0 {1})']), 139 | ('S[dcl]',',','S\\S',False,[ 140 | '(S (PRN 0) 1)', 141 | '(S* {1} {0})']), 142 | ('S[dcl]',',','NP\\NP',False,[ 143 | '(S {0} 1)', 144 | '(NP 1 0)']), 145 | ('S[adj]\\NP',',','NP\\NP',False,[ 146 | '(S {0} 1)', 147 | '(NP 1 0)']), 148 | ('S[dcl]',',','(S\\NP)\\(S\\NP)',False,[ 149 | '(S 0 1)', 150 | '(VP {1} 0)', 151 | '(S 1 0)']), 152 | ('((S[pss]\\NP)/PP)/NP','(S\\NP)\\(S\\NP)','((S[pss]\\NP)/PP)/NP',False,[ 153 | '(VP {0} 1)', 154 | '(VP {0} 3)', 155 | '(VP {0} 2)', 156 | '(S 1 0)']), 157 | ('S[dcl]/S[dcl]',',','NP\\NP',False,[ 158 | '(S {0} 1)', 159 | '(NP 1 0)']), 160 | ('S[dcl]\\S[dcl]',',','(S\\NP)\\(S\\NP)',False,[ 161 | '{(TEMP 0 1)}', 162 | '(VP {1} 0)', 163 | '(S 1 0)']), 164 | ('S[dcl]\\S[dcl]',',','(S\\NP)/(S\\NP)',False,[ 165 | '(PRN (SINV 0) 1)', 166 | '(S 0 1)', 167 | '(S 1 {0})']) 168 | ### ('S[dcl]\\S[dcl]',',','S\\S',False,[]) 169 | ### 
('((S[dcl]\\NP)/PP)/NP','(S\\NP)\\(S\\NP)','((S[dcl]\\NP)/PP)/NP',False,[]) 170 | ### ('((S[dcl]\\NP[expl])/(S[to]\\NP))/(S[adj]\\NP)','(S\\NP)\\(S\\NP)','((S[dcl]\\NP[expl])/','(S[to]\\NP))/(S[adj]\\NP)',False,[]) 171 | ### ('((S[dcl]\\NP[expl])/(S[to]\\NP))/NP','(S\\NP)\\(S\\NP)','((S[dcl]\\NP[expl])/(S[to]\\NP))/NP',False,[]) 172 | ### ('((S[dcl]\\NP[expl])/S[dcl])/(S[adj]\\NP)','(S\\NP)\\(S\\NP)','((S[dcl]\\NP[expl])/S[dcl])/','(S[adj]\\NP)',False,[]) 173 | ### ('((S[dcl]\\NP[expl])/S[dcl])/NP','(S\\NP)\\(S\\NP)','((S[dcl]\\NP[expl])/S[dcl])/NP',False,[]) 174 | ### ('((S[dcl]\\NP[expl])/S[qem])/(S[adj]\\NP)','(S\\NP)\\(S\\NP)','((S[dcl]\\NP[expl])/S[qem])/','(S[adj]\\NP)',False,[]) 175 | ### ('((S[ng]\\NP)/PP)/NP','(S\\NP)\\(S\\NP)','((S[ng]\\NP)/PP)/NP',False,[]) 176 | ### ('(S[dcl]\\(S[to]\\NP))/(S[b]\\NP)','S\\S','(S[dcl]\\(S[to]\\NP))/(S[b]\\NP)',False,[]) 177 | ### ('(S[dcl]\\S[dcl])\\NP','S\\S','(S[dcl]\\S[dcl])\\NP',False,[]) 178 | ### ('(S[q]/(S[b]\\NP))/NP','S\\S','(S[q]/(S[b]\\NP))/NP',False,[]) 179 | 180 | ### ('(S\\NP)/(S\\NP)','(S[ng]\\NP)\\(S[adj]\\NP)','(S[ng]\\NP)\\(S[adj]\\NP)',False,['(VP 0 1)','(ADJP 1 0)','(S 1 0)']) 181 | ] 182 | 183 | def get_binary_for_markedup(left, right, result, markedup=None, flexible=False): 184 | for binary in BINARIES: 185 | if category.compare(left, binary[0]): 186 | if category.compare(right, binary[1]): 187 | if category.compare(result, binary[2]): 188 | keep_deps = binary[3] 189 | rules = binary[4] 190 | if len(rules) > 0: 191 | return rules 192 | elif markedup is not None: 193 | return ['(S 0 1)'] + markedup[result][1:] 194 | else: 195 | return [] 196 | if flexible: 197 | for binary in BINARIES: 198 | if category.compare(result, binary[2]): 199 | rules = binary[4] 200 | if len(rules) > 0: 201 | return rules 202 | elif markedup is not None: 203 | return ['(S 0 1)'] + markedup[result][1:] 204 | else: 205 | return [] 206 | if markedup is not None: 207 | return ['(S 0 1)'] + markedup[result][1:] 208 | return None 209 | 210 | def get_binary(left, right, result, markedup=None): 211 | for binary in BINARIES: 212 | if category.compare(left, binary[0]): 213 | if category.compare(right, binary[1]): 214 | if category.compare(result, binary[2]): 215 | keep_deps = binary[3] 216 | rules = binary[4] 217 | if len(rules) > 0: 218 | return rules 219 | elif markedup is not None: 220 | return ['(S 0 1)'] + markedup[result][1:] 221 | else: 222 | return [] 223 | return None 224 | 225 | def determine_combinator(source, result): 226 | ### print(len(source)) 227 | ### print(' '.join(source), result) 228 | if len(source) == 0: 229 | return 'lex' 230 | if len(source) == 1: 231 | if get_unary(source[0].category, result) is not None: 232 | return 'unary' 233 | return 'type' 234 | if len(source) == 2: 235 | left = source[0].category 236 | right = source[1].category 237 | result_parts = category.divide(result) 238 | left_parts = category.divide(left) 239 | right_parts = category.divide(right) 240 | 241 | if get_binary(left, right, result) is not None: 242 | return 'binary' 243 | 244 | # Coordination 245 | # X = X CONJ X 246 | if left == 'conj' or (result.endswith('[conj]') and not '[conj]' in right): 247 | if right == 'conj\\conj': 248 | return 'fa.b' 249 | return 'conj1' 250 | elif 'conj' in source[1].rule or '[conj]' in right: 251 | if category.compare(left, right): 252 | return 'conj2' 253 | if category.compare(category.divide(left)[2], right) and category.divide(left)[1] == '/': 254 | return 'fa.f' 255 | if category.compare(category.divide(right)[0], left) and 
category.divide(right)[1] is not None: 256 | if 'conj2' in source[1].rule or '[conj]' in right and category.compare(category.divide(right)[2], left): 257 | return 'fa.b' 258 | else: 259 | return 'conj1' 260 | if category.compare(category.divide(right)[2], left): 261 | return 'fa.b' 262 | if (category.compare(left_parts[2], result_parts[2]) and 263 | category.compare(left_parts[0], right_parts[2]) and 264 | category.compare(right_parts[0], result_parts[0]) and 265 | left_parts[1] == result_parts[1] == '/' and 266 | right_parts[1] == '\\'): 267 | return 'cc.b' 268 | if (category.compare(left_parts[2], right_parts[0]) and 269 | category.compare(left_parts[0], result_parts[0]) and 270 | category.compare(right_parts[2], result_parts[2]) and 271 | left_parts[1] == right_parts[1] == result_parts[1] == '/'): 272 | return 'fc.f' 273 | if (category.compare(left_parts[2], result_parts[2]) and 274 | category.compare(left_parts[0], right_parts[2]) and 275 | category.compare(right_parts[0], result_parts[0]) and 276 | left_parts[1] == right_parts[1] == result_parts[1] == '\\'): 277 | return 'fc.b' 278 | if category.compare(result, left): 279 | if '[conj]' in result: 280 | return 'conj2' 281 | raw_right = right 282 | if '[conj]' in right: 283 | raw_right = right[:-6] 284 | if category.compare(result, raw_right): 285 | return 'conj2' 286 | else: 287 | return 'conj2' 288 | elif 'conj1' in source[0].rule or '[conj]' in left: 289 | return 'conj2' 290 | # consider conj3, to handle , separated lists 291 | 292 | # Function application 293 | # X = X/Y + Y 294 | if (left_parts[1] == '/' and 295 | category.compare(left_parts[2], right) and 296 | category.compare(left_parts[0], result)): 297 | return 'fa.f' 298 | # X = Y + X\\Y 299 | if (right_parts[1] == '\\' and 300 | category.compare(right_parts[2], left) and 301 | category.compare(right_parts[0], result)): 302 | return 'fa.b' 303 | 304 | # Function composition 305 | # X/Z = X/Y + Y/Z 306 | if (category.compare(left_parts[2], right_parts[0]) and 307 | category.compare(left_parts[0], result_parts[0]) and 308 | category.compare(right_parts[2], result_parts[2]) and 309 | left_parts[1] == right_parts[1] == result_parts[1] == '/'): 310 | return 'fc.f' 311 | # X\\Z = Y\\Z + X\\Y 312 | if (category.compare(left_parts[2], result_parts[2]) and 313 | category.compare(left_parts[0], right_parts[2]) and 314 | category.compare(right_parts[0], result_parts[0]) and 315 | left_parts[1] == right_parts[1] == result_parts[1] == '\\'): 316 | return 'fc.b' 317 | 318 | # Crossed composition 319 | # X/Z = Y/Z + X\\Y 320 | # For example: 321 | # (S\\NP)/(S\\NP) = (S\\NP)/(S\\NP) + (S\\NP)\\(S\\NP) 322 | if (category.compare(left_parts[2], result_parts[2]) and 323 | category.compare(left_parts[0], right_parts[2]) and 324 | category.compare(right_parts[0], result_parts[0]) and 325 | left_parts[1] == result_parts[1] == '/' and 326 | right_parts[1] == '\\'): 327 | return 'cc.b' 328 | # Z\\X = Z/Y + Y\\X 329 | # ((S\\NP)/S)/(S\\NP) = ((S\\NP)/S)/(S\\NP) + (S\\NP)\\(S\\NP) 330 | 331 | # Backward crossed substitution 332 | # X/Z = B/Z + (X\\B)/Z 333 | if (left_parts[1] == right_parts[1] == result_parts[1] == '/' and 334 | category.compare(left_parts[2], result_parts[2]) and 335 | category.compare(right_parts[2], result_parts[2])): 336 | sub_parts = category.divide(right_parts[0]) 337 | if (category.compare(sub_parts[0], result_parts[0]) and 338 | category.compare(sub_parts[2], left_parts[0]) and 339 | sub_parts[1] != left_parts[1]): 340 | return 'bs.f' 341 | # X\\Z = (X/B)\\Z + B\\Z 342 | if 
(left_parts[1] == right_parts[1] == result_parts[1] == '\\' and 343 | category.compare(left_parts[2], result_parts[2]) and 344 | category.compare(right_parts[2], result_parts[2])): 345 | sub_parts = category.divide(left_parts[0]) 346 | if (sub_parts[0] == result_parts[0] and 347 | sub_parts[2] == right_parts[0] and 348 | sub_parts[1] != right_parts[1]): 349 | return 'bs.b' 350 | # There are restrictions on what B can be, but since this is a parse, and 351 | # all other options have been exhausted, this must be what is going on 352 | 353 | # Uncomment to see what is misc: 354 | ### if left == result and '/' not in right and '\\' not in right: 355 | ### pass 356 | ### elif right == result and '/' not in left and '\\' not in left: 357 | ### pass 358 | ### elif '[conj]' in left or '[conj]' in right or '[conj]' in result: 359 | ### pass 360 | ### else: 361 | ### print('misc rule:', left, right, result) 362 | ### print(' ', left_parts) 363 | ### print(' ', right_parts) 364 | ### print(' ', result_parts) 365 | if category.divide(result)[0] == right and category.divide(result)[1] is not None: 366 | return 'conj1' 367 | return 'misc' 368 | 369 | if __name__ == '__main__': 370 | pass 371 | -------------------------------------------------------------------------------- /trees.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys, re 4 | import category, rule 5 | 6 | class Tree: 7 | def __init__(self, text): 8 | self.text = text 9 | self.subtrees = [] 10 | self.word = None 11 | 12 | def get_words(self): 13 | if self.word is not None: 14 | return [self.word] 15 | words = [] 16 | for tree in self.subtrees: 17 | words += tree.get_words() 18 | return words 19 | 20 | # switching back to PTB scheme 21 | word_to_word_mapping = { 22 | '{': '-LCB-', 23 | '}': '-RCB-' 24 | } 25 | word_to_POS_mapping = { 26 | '--': ':', 27 | '-': ':', 28 | ';': ':', 29 | ':': ':', 30 | '-LRB-': '-LRB-', 31 | '-RRB-': '-RRB-', 32 | '-LCB-': '-LRB-', 33 | '-RCB-': '-RRB-', 34 | '{': '-LRB-', 35 | '}': '-RRB-', 36 | 'Wa': 'NNP' 37 | } 38 | def get_PTB_word(word): 39 | global word_to_word_mapping 40 | if word in word_to_word_mapping: 41 | word = word_to_word_mapping[word] 42 | return word 43 | def get_PTB_label(label, word): 44 | global word_to_POS_mapping 45 | if word in word_to_POS_mapping: 46 | label = word_to_POS_mapping[word] 47 | return label 48 | 49 | class CCG_Tree(Tree): 50 | # Convert line of CCGBank to a tree. 
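# Each node in CCGbank's .auto format carries a tag between angle brackets:
# internal nodes are written <T category head-index daughter-count> and leaves
# <L category mod-POS orig-POS word pred-arg-category>. The code below reads
# the category (dropping any [X] placeholder feature), selects the head
# daughter by index, and for leaves also records the POS tag and the word.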
This is an example line: 51 | # ( ( ( ( () () ) ) ( () ( () ) ) ) () ) 52 | # This expands to: 53 | # ( 54 | # ( 55 | # ( 56 | # ( 57 | # () 58 | # () ) ) 59 | # ( 60 | # () 61 | # ( 62 | # () ) ) ) 63 | # () ) 64 | def __init__(self, text='', pos=0): 65 | Tree.__init__(self, text) 66 | self.label = '' 67 | self.category = None 68 | self.orig_category = None 69 | self.pos = None 70 | self.word = None 71 | self.head = None 72 | self.rule = None 73 | if text == '': 74 | return 75 | if '' and self.label == '': 89 | self.label = text[pos + 2:i] 90 | # we've reached the end of the scope for this bracket 91 | if depth < 0: 92 | break 93 | parts = self.label.split() 94 | self.category = ''.join(parts[1].split('[X]')) 95 | self.orig_category = self.category 96 | # Fix a sentence with two broken categories in CCGBank (0595.15) 97 | if self.category[-1] in '\\/': 98 | self.category = self.category + 'NP' 99 | self.rule = rule.determine_combinator(self.subtrees, self.category) 100 | if 'conj' in self.rule: 101 | if not self.category.endswith('[conj]') and not category.compare(self.category, self.subtrees[1].category): 102 | if self.subtrees[1].category.endswith('[conj]'): 103 | self.category = self.subtrees[1].category 104 | else: 105 | self.category = self.subtrees[1].category + '[conj]' 106 | if len(parts) == 4: 107 | if len(self.subtrees) > 0: 108 | self.head = self.subtrees[0] 109 | if parts[2] == '1' and len(self.subtrees) == 2: 110 | self.head = self.subtrees[1] 111 | elif len(parts) == 6: 112 | self.pos = parts[3] 113 | self.word = parts[4] 114 | else: 115 | # Handle fowler input 116 | self.label = text[pos:].split()[0][1:] 117 | self.category = ')'.join('('.join(self.label.split('{')).split('}')) 118 | self.orig_category = self.category 119 | 120 | depth = 0 121 | for i in range(pos + 1, len(text)): 122 | if depth < 0: 123 | break 124 | char = text[i] 125 | # update the depth 126 | if char == '(': 127 | depth += 1 128 | if depth == 1: 129 | self.subtrees.append(CCG_Tree(text, i)) 130 | elif char == ')': 131 | depth -= 1 132 | if len(self.subtrees) == 0: 133 | pos = i 134 | for j in range(i, 0, -1): 135 | if text[j] == ' ': 136 | pos = j 137 | break 138 | self.word = text[pos + 1:i] 139 | break 140 | 141 | self.rule = rule.determine_combinator(self.subtrees, self.category) 142 | if 'conj' in self.rule: 143 | if not self.category.endswith('[conj]') and not category.compare(self.category, self.subtrees[1].category): 144 | if self.subtrees[1].category.endswith('[conj]'): 145 | self.category = self.subtrees[1].category 146 | else: 147 | self.category = self.subtrees[1].category + '[conj]' 148 | if self.word is not None: 149 | self.pos = "UNK" 150 | if self.word == '.': 151 | self.pos = '.' 152 | if self.word == ',': 153 | self.pos = ',' 154 | if self.word == '...': 155 | self.pos = ':' 156 | if self.word == '?': 157 | self.pos = '.' 158 | if self.word == '!': 159 | self.pos = '.' 
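    # A minimal usage sketch of this module together with the converters
    # ("derivations.auto" is a hypothetical file name; convert.py does the
    # same thing via trees.read_CCG_tree / trees.read_PTB_tree and then scores
    # the output against the gold PTB trees):
    #
    #   import trees, trivial
    #   with open("derivations.auto") as ccg_file:
    #       for line in ccg_file:
    #           if line.startswith("("):
    #               ccg = trees.CCG_Tree(line.strip())
    #               ptb = trivial.convert(ccg)
    #               print(ptb.one_line_repr())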
160 | 161 | 162 | def get_node(self, span, pos=None, min_enclosing=False): 163 | return_ans_only = False 164 | if pos is None: 165 | pos = 0 166 | return_ans_only = True 167 | start = pos 168 | ans = None 169 | if self.word is not None: 170 | labels_to_ignore = set(["-NONE-","TOP"]) 171 | words_to_ignore = set(["'","`","''","``",".","--",":",";","-",",","..."]) 172 | if self.word not in words_to_ignore and self.label not in labels_to_ignore: 173 | pos += 1 174 | else: 175 | for subtree in self.subtrees: 176 | pos, sub_ans = subtree.get_node(span, pos, min_enclosing) 177 | if sub_ans is not None: 178 | ans = sub_ans 179 | end = pos 180 | if min_enclosing: 181 | if ans is None and start <= span[0] and end >= span[1]: 182 | ans = self 183 | else: 184 | if start == span[0] and end == span[1]: 185 | ans = self 186 | if return_ans_only: 187 | return ans 188 | else: 189 | return end, ans 190 | 191 | def contains_rule(self, text): 192 | if text in self.rule: 193 | return True 194 | for subtree in self.subtrees: 195 | if subtree.contains_rule(text): 196 | return True 197 | return False 198 | 199 | def all_word_yield(self, span=None, pos=0): 200 | if self.word is not None: 201 | if span is None or span[0] <= pos < span[1]: 202 | return (pos + 1, self.word) 203 | else: 204 | return (pos + 1, '') 205 | else: 206 | text = [] 207 | for subtree in self.subtrees: 208 | pos, words = subtree.all_word_yield(span, pos) 209 | if words != '': 210 | text.append(words) 211 | return (pos, ' '.join(text)) 212 | 213 | def __repr__(self, depth=0): 214 | ans = '\n' + depth * '\t' 215 | ans += '(' 216 | if self.category is None: 217 | ans += 'EMPTY EMPTY)' 218 | return ans 219 | if self.rule is not None: 220 | ans += self.rule + ' ' 221 | ans += self.category 222 | if self.pos is not None: 223 | pos = get_PTB_label(self.pos, self.word) 224 | ans += ' ' + pos 225 | if self.word is not None: 226 | ans += ' ' + get_PTB_word(self.word) 227 | for subtree in self.subtrees: 228 | ans += subtree.__repr__(depth + 1) 229 | ans += ')' 230 | return ans 231 | 232 | 233 | 234 | class PTB_Tree(Tree): 235 | # Convert text from the PTB to a tree. For example: 236 | # ( (S (NP-SBJ (NNP Ms.) (NNP Haag) ) (VP (VBZ plays) (NP (NNP Elianti) )) (. .) )) 237 | # This is a compressed form of: 238 | # ( (S 239 | # (NP-SBJ (NNP Ms.) (NNP Haag) ) 240 | # (VP (VBZ plays) 241 | # (NP (NNP Elianti) )) 242 | # (. .) 
)) 243 | def __init__(self, text='', pos=0): 244 | Tree.__init__(self, text) 245 | self.label = '' 246 | self.pos = None 247 | depth = 0 248 | for i in range(pos + 1, len(text)): 249 | char = text[i] 250 | # update the depth 251 | if char == '(': 252 | depth += 1 253 | if depth == 1: 254 | self.subtrees.append(PTB_Tree(text, i)) 255 | elif char == ')': 256 | depth -= 1 257 | if len(self.subtrees) == 0: 258 | pos = i 259 | for j in range(i, 0, -1): 260 | if text[j] == ' ': 261 | pos = j 262 | break 263 | self.word = text[pos + 1:i] 264 | 265 | # we've reached the end of the category that is the root of this subtree 266 | if depth == 0 and char == ' ' and self.label == '': 267 | self.label = text[pos + 1:i] 268 | # we've reached the end of the scope for this bracket 269 | if depth < 0: 270 | break 271 | if self.word is not None: 272 | self.pos = self.label 273 | 274 | def word_yield(self, span=None, pos=0): 275 | labels_to_ignore = set([",", "-NONE-", "TOP", ":", "."]) 276 | words_to_ignore = set(["'","`","''","``"]) 277 | # ignore quotes as they won't always be present 278 | if self.label in labels_to_ignore: 279 | return (pos, '') 280 | if self.word is not None: 281 | if self.word in words_to_ignore: 282 | return (pos, '') 283 | if span is None or span[0] <= pos < span[1]: 284 | return (pos + 1, self.word) 285 | else: 286 | return (pos + 1, '') 287 | else: 288 | text = [] 289 | for subtree in self.subtrees: 290 | pos, words = subtree.word_yield(span, pos) 291 | if words != '': 292 | text.append(words) 293 | return (pos, ' '.join(text)) 294 | 295 | def __repr__(self, depth=0): 296 | ans = '' 297 | if depth > 0: 298 | ans += '\n' 299 | ans += depth * '\t' 300 | ans += '(' + get_PTB_label(self.label, self.word) 301 | if self.word is not None: 302 | ans += ' ' + get_PTB_word(self.word) 303 | for subtree in self.subtrees: 304 | ans += subtree.__repr__(depth + 1) 305 | ans += ')' 306 | return ans 307 | 308 | def one_line_repr(self): 309 | ans = '(' + get_PTB_label(self.label, self.word) 310 | if self.word is not None: 311 | return ans + ' ' + get_PTB_word(self.word) + ')' 312 | for subtree in self.subtrees: 313 | ans += ' ' + subtree.one_line_repr() 314 | ans += ')' 315 | return ans 316 | 317 | def repr_with_corrections(self, gold_spans, depth=0, pos=0,parent=None): 318 | return_str = False 319 | if type(gold_spans) != type({}): 320 | return_str = True 321 | span_dict = {} 322 | gold_spans.get_spans(span_dict) 323 | gold_spans = span_dict 324 | 325 | # note - does not print missing spans that cover parts of present spans 326 | start_missing = "\033[01;36m" 327 | start_extra = "\033[01;31m" 328 | start_wrong_label = "\033[01;33m" 329 | end_colour = "\033[00m" 330 | 331 | start = '' 332 | if depth > 0: 333 | start += '\n' 334 | start += depth * '\t' 335 | # Handle the POS-word case 336 | labels_to_ignore = set(["-NONE-", "TOP", ":", "."]) 337 | words_to_ignore = set(["'","`","''", "``", "--",":",";","-",","]) 338 | if self.word is not None: 339 | text = '' 340 | if self.label not in labels_to_ignore and (self.word is None or self.word not in words_to_ignore): 341 | if self.label not in gold_spans[(pos, pos+1)] and self.word not in gold_spans[(pos, pos+1)]: 342 | text = '%s(%s%s %s%s)' % (start, start_extra, self.label, self.word, end_colour) 343 | text += ' BROKEN WORD' 344 | elif len(gold_spans[(pos, pos+1)]) > 1 and parent is not None and len(parent.subtrees) > 1: 345 | punc_count = 0 346 | for subtree in parent.subtrees: 347 | if subtree.label in labels_to_ignore: 348 | punc_count += 1 349 | if 
punc_count != len(parent.subtrees) - 1: 350 | to_cover = gold_spans[(pos, pos+1)] 351 | covered = set() 352 | covered.add(self.word) 353 | missed = to_cover.difference(covered) 354 | text = '%s%s(%s%s' % (start, start_missing, ' '.join(missed), end_colour) 355 | text += '%s(%s %s)' % (start + '\t', self.label, self.word) 356 | text += '%s)%s' % (start_missing, end_colour) 357 | pos += 1 358 | if text == '': 359 | text = '%s(%s %s)' % (start, self.label, self.word) 360 | if return_str: 361 | return text 362 | else: 363 | return (pos, text) 364 | # Handle when constituents are present 365 | init = pos 366 | children = [(pos, '')] 367 | for subtree in self.subtrees: 368 | pos, text = subtree.repr_with_corrections(gold_spans, depth + 1, pos, self) 369 | children.append((pos, text)) 370 | final = pos 371 | text = start 372 | extra = (init, final) not in gold_spans 373 | wrong_label = False 374 | if extra: 375 | text += start_extra + '(' + self.label + end_colour 376 | elif self.label not in gold_spans[(init, final)]: 377 | if len(gold_spans[(init, final)]) == 1 and final - init == 1: 378 | # actually an extra bracket, just confused by POS 379 | text += start_extra + '(' + self.label + end_colour 380 | extra = True 381 | elif parent is not None and len(parent.subtrees) > 1: 382 | # check if all but one subtree is punctuation 383 | punc_count = 0 384 | for subtree in parent.subtrees: 385 | if subtree.label in labels_to_ignore: 386 | punc_count += 1 387 | if punc_count != len(parent.subtrees) - 1: 388 | to_cover = gold_spans[(init, final)] 389 | covered = set() 390 | covered.add(self.label) 391 | subtree = self 392 | while len(subtree.subtrees) == 1: 393 | subtree = subtree.subtrees[0] 394 | covered.add(subtree.label) 395 | covered.add(subtree.word) 396 | missed = to_cover.difference(covered) 397 | text += start_wrong_label + '(' + self.label + end_colour 398 | wrong_label = True 399 | text += ' ' + start_missing + '_'.join(missed) + end_colour 400 | else: 401 | text += start_wrong_label + '(' + self.label + end_colour 402 | wrong_label = True 403 | else: 404 | text += start_wrong_label + '(' + self.label + end_colour 405 | wrong_label = True 406 | elif len(gold_spans[(init, final)]) > 1 and (parent is None or len(parent.subtrees) > 1): 407 | # check if all but one subtree is punctuation 408 | punc_count = 0 409 | if parent is not None: 410 | for subtree in parent.subtrees: 411 | if subtree.label in labels_to_ignore: 412 | punc_count += 1 413 | if parent is None or punc_count != len(parent.subtrees) - 1: 414 | # this is right, but there are other that should be here too 415 | to_cover = gold_spans[(init, final)] 416 | covered = set() 417 | covered.add(self.label) 418 | subtrees = self.subtrees 419 | punc_count = 0 420 | for subtree in subtrees: 421 | if subtree.label in labels_to_ignore: 422 | punc_count += 1 423 | while len(subtrees) - punc_count == 1: 424 | cur = subtrees[0] 425 | for subtree in subtrees: 426 | if subtree.label not in labels_to_ignore: 427 | cur = subtree 428 | break 429 | covered.add(cur.label) 430 | subtrees = cur.subtrees 431 | if len(subtrees) == 0: 432 | covered.add(cur.word) 433 | punc_count = 0 434 | for subtree in subtrees: 435 | if subtree.label in labels_to_ignore: 436 | punc_count += 1 437 | missed = to_cover.difference(covered) 438 | text += '(' + self.label 439 | if len(missed) > 0: 440 | text += ' ' + start_missing + '_'.join(missed) + end_colour 441 | else: 442 | text += '(' + self.label 443 | else: 444 | # it's correct 445 | text += '(' + self.label 446 | 447 | # 
now consider groupings of the children 448 | for length in range(2, len(children) - 1): 449 | for i in range(len(children)): 450 | if i + length >= len(children): 451 | continue 452 | if children[i][0] == children[i+1][0]: 453 | continue 454 | if children[i+length][0] == children[i + length-1][0]: 455 | continue 456 | if length == len(children) - 2 and i == 1 and children[0][0] == children[1][0]: 457 | continue 458 | if length == len(children) - 2 and i == 0 and children[-1][0] == children[-2][0]: 459 | continue 460 | if (children[i][0], children[i + length][0]) in gold_spans: 461 | # this is a missing span 462 | # 1 - indent 463 | for k in range(i+1, i+length+1): 464 | cpos, ctext = children[k] 465 | ctext = '\n\t'.join(ctext.split('\n')) 466 | children[k] = (cpos, ctext) 467 | # 2 - add open bracket and label(s) to first entry 468 | cpos, ctext = children[i+1] 469 | pretext = '\n' 470 | pretext += (depth + 1) * '\t' + start_missing + '(' 471 | pretext += '/'.join(gold_spans[(children[i][0], children[i + length][0])]) 472 | pretext += end_colour 473 | children[i+1] = (cpos, pretext + ctext) 474 | # 3 - add end bracket to last entry 475 | cpos, ctext = children[i+length] 476 | ctext += start_missing + ')' + end_colour 477 | children[i+length] = (cpos, ctext) 478 | for child in children: 479 | text += child[1] 480 | if extra: 481 | text += start_extra + ')' + end_colour 482 | elif wrong_label: 483 | text += start_wrong_label + ')' + end_colour 484 | else: 485 | text += ')' 486 | if return_str: 487 | return text 488 | else: 489 | return (final, text) 490 | 491 | def get_spans(self, span_dict, pos=0): 492 | labels_to_ignore = set(["-NONE-", "TOP", ":", "."]) 493 | words_to_ignore = set(["'","`","''", "``", "--",":",";","-",",","."]) 494 | label = self.label 495 | # ignore quotes as they won't always be present 496 | if label in labels_to_ignore or self.word in words_to_ignore: 497 | return pos 498 | init = pos 499 | if len(self.subtrees) == 0: 500 | pos += 1 501 | else: 502 | for subtree in self.subtrees: 503 | pos = subtree.get_spans(span_dict, pos) 504 | if init != pos: 505 | if (init, pos) not in span_dict: 506 | span_dict[(init, pos)] = set() 507 | if not label[0] == '-': 508 | label = label.split('-')[0] 509 | label = label.split('=')[0] 510 | if label == 'PRT': 511 | label = 'ADVP' # another collins change 512 | if self.word is not None: 513 | label = self.word 514 | span_dict[(init, pos)].add(label) 515 | return pos 516 | 517 | def read_PTB_tree(source): 518 | cur_text = '' 519 | depth = 0 520 | while True: 521 | line = source.readline() 522 | if line == '': 523 | return None 524 | line = line.strip() 525 | if line == '': 526 | continue 527 | if cur_text != '': 528 | cur_text += ' ' 529 | cur_text += line 530 | for char in line: 531 | if char == '(': 532 | depth += 1 533 | elif char == ')': 534 | depth -= 1 535 | if depth == 0: 536 | return PTB_Tree(cur_text) 537 | return trees 538 | 539 | def read_PTB_trees(source, max_sents=-1): 540 | if type(source) == type(''): 541 | source = open(source) 542 | trees = [] 543 | while True: 544 | tree = read_PTB_tree(source) 545 | if tree is None: 546 | break 547 | trees.append(tree) 548 | if len(trees) >= max_sents > 0: 549 | break 550 | return trees 551 | 552 | def read_CCG_tree(source): 553 | while True: 554 | line = source.readline() 555 | if line == '': 556 | return None 557 | else: 558 | line = line.strip() 559 | if line != '' and not line.startswith("ID"): 560 | line = '-LRB- -LCB-'.join(line.split('LRB {')) 561 | line = '-RRB- 
-RCB-'.join(line.split('RRB }')) 562 | line = '-LRB- -LRB-'.join(line.split('LRB (')) 563 | line = '-RRB- -RRB-'.join(line.split('RRB )')) 564 | tree = None 565 | if '= max_sents > 0: 583 | break 584 | return trees 585 | 586 | if __name__ == '__main__': 587 | if len(sys.argv) != 3: 588 | print("Usage:\n%s [PTB,CCG] " % sys.argv[0]) 589 | sys.exit(1) 590 | filename = sys.argv[2] 591 | trees = None 592 | if sys.argv[1] == 'PTB': 593 | trees = read_PTB_trees(filename) 594 | elif sys.argv[1] == 'CCG': 595 | trees = read_CCG_trees(filename) 596 | print(len(trees), "trees read") 597 | for tree in trees: 598 | print(tree) 599 | -------------------------------------------------------------------------------- /markup_convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Convert using a markedup-style file 4 | import sys, re 5 | import trees, category, rule 6 | 7 | log_out = sys.stdout 8 | contains_bs = False 9 | 10 | VERBOSE = False 11 | VERBOSE = True 12 | def verbose_print(text): 13 | if VERBOSE: 14 | print(text, file=log_out) 15 | 16 | markup_info = {} 17 | def read_markup(markup_file): 18 | global markup_info 19 | # Only read the markup info once 20 | if len(markup_info) == 0: 21 | # (NP\NP)/NP 22 | # 2 ((NP{Y}\NP{Y}<1>){_}/NP{Z}<2>){_} 23 | # (PP 0 2) 24 | # (NP 1 0) 25 | # 26 | cur = [] 27 | for line in markup_file: 28 | line = line.strip() 29 | if len(line) > 0: 30 | if line[0] == '#': 31 | continue 32 | cur.append(line) 33 | else: 34 | if len(cur) > 0: 35 | label = cur.pop(0) 36 | markup_info[label] = cur 37 | cur = [] 38 | 39 | # Find the bracket that matches the one at text[start_index] 40 | def get_balanced_point(text, start_index, deeper, shallower): 41 | depth = 0 42 | for i in range(start_index, len(text)): 43 | if text[i] == deeper: 44 | depth += 1 45 | if text[i] == shallower: 46 | depth -= 1 47 | if depth == 0: 48 | return i 49 | return -1 50 | 51 | UNIQUE_ID = 100 52 | class Schema: 53 | def __init__(self, lines, uniqued=False, argument=None, source_node=None): 54 | global UNIQUE_ID 55 | self.source = source_node 56 | text = '(TEMP 0)' 57 | self.parent = [] 58 | self.children = [] 59 | self.rule = 'unk' 60 | self.get_label_from_argument = False 61 | if type(lines) == type(''): 62 | text = lines 63 | self.parent = [] 64 | elif type(lines) == type([]): 65 | text = lines[0] 66 | # only one parent, which is the schema this will insert into 67 | self.parent = lines[1:] 68 | if 'arg' in text: 69 | # check rules 70 | text = None 71 | self.parent = [] 72 | to_parent = False 73 | for line in lines: 74 | if to_parent: 75 | self.parent.append(line) 76 | else: 77 | if 'arg:default' in line: 78 | if text is None: 79 | text = line 80 | to_parent = True 81 | elif argument is not None: 82 | constraint = line.split('arg:')[1].split(':')[0] 83 | if '(' not in constraint: 84 | if type(argument) == type(self) and constraint == argument.label: 85 | text = line 86 | elif type(argument) == type('') and argument[1:].split()[0] == constraint: 87 | text = line 88 | elif type(argument) == type(self): 89 | labels = constraint[1:-1].split() 90 | children = [] 91 | for child in argument.children: 92 | if type(child) == type(self) and child.label in ":,.;": 93 | continue 94 | elif type(child) == type('') and child[1] in ":,.;": 95 | continue 96 | elif type(child) == type(''): 97 | children.append(child[1:].split()[0]) 98 | else: 99 | children.append(child.label) 100 | if '...' 
in labels: 101 | if len(labels)-1 <= len(children): 102 | use = True 103 | if labels[0] == '...': 104 | for i in range(len(labels)-1): 105 | if labels[-1-i] != children[-1-i]: 106 | use = False 107 | elif labels[-1] == '...': 108 | for i in range(len(labels)-1): 109 | if labels[i] != children[i]: 110 | use = False 111 | else: 112 | print('... in the middle of arguments is not yet supported') 113 | use = False 114 | if use: 115 | text = line 116 | elif len(labels) == len(children): 117 | use = True 118 | for i in range(len(labels)): 119 | if labels[i] != children[i]: 120 | use = False 121 | if use: 122 | text = line 123 | if text[-1] not in ')}': 124 | text = ' '.join(text.split(':')[0].split()[:-1]) 125 | 126 | # change numbers in text to be a unique ID 127 | text = text.strip() 128 | self.zero = None 129 | if not uniqued: 130 | mapping = {} 131 | ntext = '' 132 | pos = 0 133 | while pos < len(text): 134 | if text[pos] in '1234567890': 135 | start = pos 136 | end = pos 137 | while text[end] in '1234567890': 138 | end += 1 139 | end -= 1 140 | num = int(text[start:end+1]) 141 | if num not in mapping: 142 | mapping[num] = UNIQUE_ID 143 | UNIQUE_ID += 1 144 | ntext += str(mapping[num]) 145 | if num == 0: 146 | self.zero = mapping[num] 147 | pos = end 148 | else: 149 | ntext += text[pos] 150 | pos += 1 151 | text = ntext 152 | self.schema = text 153 | # determine if this node is to be deleted 154 | self.delete_on_adoption = self.schema.startswith('{(') and self.schema.endswith(')}') 155 | self.label = self.schema.split()[0].strip('{(') 156 | if '*' in self.label: 157 | self.get_label_from_argument = True 158 | self.label = self.label[:-1] 159 | self.children = [] # the tree 160 | self.incomplete = {} # elements somewhere in the tree that are to be filled 161 | tschema = ')'.join('('.join(self.schema.split('(')[1:]).split(')')[:-1]) 162 | pos = len(tschema.split()[0]) # jump to after the label 163 | while pos < len(tschema): 164 | if tschema[pos] == '(': 165 | # Create a subtree for this bracket set 166 | balance = get_balanced_point(tschema, pos, '(', ')') 167 | subschema = Schema(tschema[pos:balance+1], uniqued=True, source_node=self.source) 168 | self.children.append(subschema) 169 | for key in self.children[-1].incomplete: 170 | if key not in self.incomplete: 171 | self.incomplete[key] = [] 172 | self.incomplete[key] += self.children[-1].incomplete[key] 173 | pos = balance 174 | elif tschema[pos] == ' ': 175 | if tschema[pos + 1] != '(': 176 | left = pos + 1 177 | right = left 178 | while right < len(tschema) and tschema[right] in '1234567890{}<>': 179 | right += 1 180 | right -= 1 181 | text = tschema[left:right+1] 182 | self.children.append(text) 183 | num = int(text.strip('{}<>')) 184 | if num not in self.incomplete: 185 | self.incomplete[num] = [] 186 | self.incomplete[num].append((text, self)) 187 | pos = right 188 | pos += 1 189 | 190 | def PTB_tree(self): 191 | text = '(' 192 | text += self.label + ' ' 193 | child_texts = [] 194 | for child in self.children: 195 | if type(child) != type(''): 196 | child_texts.append(child.PTB_tree()) 197 | elif '(' in child: 198 | child_texts.append(child) 199 | if len(child_texts) == 0: 200 | return '' 201 | text += ' '.join(child_texts) 202 | if self.delete_on_adoption: 203 | return ' '.join(child_texts) 204 | text += ')' 205 | return text 206 | 207 | def __repr__(self): 208 | child_ans = [] 209 | for child in self.children: 210 | if type(child) == type(''): 211 | child_ans.append(child) 212 | else: 213 | child_ans.append('obj') 214 | ans = ' 
schema: ' + self.schema + ' cur: ' 215 | if self.delete_on_adoption: 216 | ans += '{' 217 | ans += '(' + self.label + ' ' + ' '.join(child_ans) + ')' 218 | if self.delete_on_adoption: 219 | ans += '}' 220 | ans += ' incomplete:' 221 | for thing in self.incomplete: 222 | ans += ' (' 223 | ans += str(self.incomplete[thing][0][0]) 224 | if self.incomplete[thing][0][1] == self: 225 | ans += ', self)' 226 | else: 227 | ans += ', other)' 228 | for schema in self.parent: 229 | ans += '\n' + schema 230 | return ans 231 | 232 | def insert(self, ID, value): 233 | if ID is None: 234 | print("Insert with None ID requested", file=log_out) 235 | print("Insert with None ID requested", file=sys.stderr) 236 | return 237 | if ID != self.zero and self.get_label_from_argument: 238 | try: 239 | if type(value) != type(''): 240 | if not value.delete_on_adoption: 241 | self.label = value.label 242 | except: 243 | pass 244 | original = value 245 | keep_left = False 246 | delete_left = False 247 | keep_right = False 248 | delete_right = False 249 | stop = False 250 | entries = self.incomplete.pop(ID) 251 | for entry in entries: 252 | value = original 253 | text = entry[0] 254 | parent = entry[1] 255 | # find the position 256 | index = 0 257 | while index < len(parent.children): 258 | if parent.children[index] == text: 259 | break 260 | index += 1 261 | del parent.children[index] 262 | if text[0] == '>': 263 | if not keep_left: 264 | keep_left = True 265 | delete_left = False 266 | else: 267 | keep_left = False 268 | delete_left = True 269 | text = text[1:] 270 | if text[-1] == '<': 271 | if not delete_right: 272 | delete_right = True 273 | keep_right = False 274 | else: 275 | delete_right = False 276 | keep_right = True 277 | text = text[:-1] 278 | if text[0] == '{' and text[-1] == '}': 279 | try: 280 | if len(value.children) > 0: 281 | value = value.children 282 | except: 283 | # doesn't have sub=parts, ignore deletion {} 284 | # can happen if we have a list, or a string 285 | pass 286 | text = text[1:-1] 287 | if type(value) == type(self) and value.delete_on_adoption: 288 | value = value.children 289 | if stop: 290 | parent.children.insert(index, '') 291 | else: 292 | if type(value) != type([]): 293 | parent.children.insert(index, value) 294 | if keep_left or delete_left or keep_right or delete_right: 295 | stop = True 296 | else: 297 | if keep_left: 298 | parent.children.insert(index, value[0]) 299 | elif delete_left: 300 | parent.children = parent.children[:index] + value[1:] + parent.children[index:] 301 | elif keep_right: 302 | parent.children.insert(index, value[-1]) 303 | elif delete_right: 304 | parent.children = parent.children[:index] + value[:-1] + parent.children[index:] 305 | else: 306 | parent.children = parent.children[:index] + value + parent.children[index:] 307 | # When complete pass self to parent 308 | return self 309 | 310 | def set_zero(self, thing): 311 | self.insert(self.zero, thing) 312 | return self 313 | 314 | def get_argument_key(self, key_no=0): 315 | if len(self.incomplete) == 0: 316 | print("Trying to insert into a complete schema!", file=log_out) 317 | print("Trying to insert into a complete schema!", file=sys.stderr) 318 | else: 319 | for val in self.incomplete: 320 | if key_no == 0: 321 | return val 322 | else: 323 | key_no -= 1 324 | return None 325 | 326 | # fa.f and fa.b - Function application 327 | def fa(self, argument, combinator): 328 | # fill the incomplete argument with the argument 329 | key = self.get_argument_key() 330 | if key is not None: 331 | self.insert(key, 
argument) 332 | if 'conj1' == argument.rule: 333 | pos = 0 334 | while pos < len(self.children): 335 | if type(self.children[pos]) == type(self) and self.children[pos].label == 'NX': 336 | child = self.children[pos] 337 | self.children = self.children[:pos] + child.children + self.children[pos+1:] 338 | pos += len(child.children) - 1 339 | pos += 1 340 | else: 341 | if combinator == 'fa.f': 342 | return self.glom(argument) 343 | else: 344 | return argument.glom(self) 345 | return self 346 | 347 | # fc.f and fc.b - Function composition 348 | def fc(self, argument): 349 | # fill the incomplete argument with the argument 350 | self.insert(self.get_argument_key(), argument) 351 | # add the unfilled arguments of the argument to the incomplete arguments of 352 | # self 353 | for key in argument.incomplete: 354 | self.incomplete[key] = [] 355 | for entry in argument.incomplete[key]: 356 | used = False 357 | for child in self.children: 358 | if child == entry[0]: 359 | used = True 360 | self.incomplete[key].append((entry[0], self)) 361 | break 362 | if not used: 363 | self.incomplete[key].append((entry[0], entry[1])) 364 | ### if category.divide(self.source.category)[1] == '/': 365 | ### self.children.append(entry[0]) 366 | ### else: 367 | ### self.children.insert(0, entry[0]) 368 | argument.incomplete = {} 369 | return self 370 | 371 | # bs.f and bs.b - Crossed substitution 372 | def bs(self, argument): 373 | print('bs is not implemented - this should not have been called') 374 | print('bs is not implemented - this should not have been called', file=sys.stderr) 375 | return nlevel 376 | 377 | def is_empty(self): 378 | for child in self.children: 379 | if type(child) == type(self): 380 | if not child.is_empty(): 381 | return False 382 | elif child[0] == '(': 383 | return False 384 | return True 385 | 386 | # cc.b - Backwards crossed composition 387 | def back_cross(self, argument): 388 | left = get_next_incomplete_schema(self, argument) 389 | pos, children = left.get_last_partial_subtree() 390 | if pos < 0: 391 | pos = 0 392 | children = left.children 393 | argument = get_next_incomplete_schema(argument, None) 394 | left.parent = argument.parent 395 | non_empty_children = [] 396 | for child in argument.children: 397 | if type(child) == type(left): 398 | if not child.is_empty(): 399 | non_empty_children.append(child) 400 | elif child[0] == '(': 401 | non_empty_children.append(child) 402 | if len(non_empty_children) == 1: 403 | argument = non_empty_children[0] 404 | children.insert(pos, argument) 405 | return left 406 | 407 | # Type raising 408 | def tr(self, child): 409 | if self.label[0] == child.label[0] and not self.delete_on_adoption: 410 | child.delete_on_adoption = True 411 | self.set_zero(child) 412 | return self 413 | 414 | # one of the special binary combination rules defined in rule.py 415 | def special_binary(self, right, new_schemas): 416 | new_schemas.set_zero(self) 417 | new_schemas.insert(new_schemas.get_argument_key(), right) 418 | return new_schemas 419 | 420 | # one of the special unary combination rules defined in rule.py 421 | def special_unary(self, unary_schema): 422 | unary_schema.set_zero(self) 423 | return unary_schema 424 | 425 | def conj_part1(self, right): 426 | # create a new node, with these two as children 427 | if right.label in ['Nslash', 'Nnum']: 428 | right.delete_on_adoption = True 429 | if right.label == 'N': 430 | if len(right.children) > 1: 431 | right.label = 'NX' 432 | else: 433 | right.delete_on_adoption = True 434 | left = self 435 | if len(left.children) 
== 1: 436 | left = left.children[0] 437 | 438 | # detect a list and set right to be deleted 439 | is_list = False 440 | if len(right.children) > 2: 441 | if type(right.children[1]) == type(left): 442 | if right.children[1] == left or (left == '(, ,)' and 'CC' in right.children[1]): 443 | if type(right.children[0]) == type(self) == type(right.children[2]): 444 | if right.children[0].label == right.children[2].label: 445 | is_list = True 446 | if is_list: 447 | right.delete_on_adoption = True 448 | 449 | nlevel = Schema(['(%s 0 1)' % right.label] + right.parent, source_node=right.source) 450 | nlevel.set_zero(left) 451 | nlevel.insert(nlevel.get_argument_key(), right) 452 | 453 | if nlevel.label == 'TEMP': 454 | nlevel.delete_on_adoption = True 455 | # move unfilled arguments 456 | for key in right.incomplete: 457 | nlevel.incomplete[key] = [] 458 | for entry in right.incomplete[key]: 459 | text = entry[0] 460 | parent = entry[1] 461 | if text == parent.children[-1]: 462 | if text in nlevel.children: 463 | nlevel.children.remove(text) 464 | nlevel.children.append(text) 465 | else: 466 | if text in nlevel.children: 467 | nlevel.children.remove(text) 468 | nlevel.children.insert(0, text) 469 | nlevel.incomplete[key].append((text, nlevel)) 470 | nlevel.rule = 'conj1' 471 | return nlevel 472 | 473 | def conj_part2(self, right): 474 | if self.label in "~!@#$%^&*()_+{}|:<>?,./;'[]\=-`" or self.label in ['LRB', 'RRB']: 475 | # glom self on instead 476 | return self.glom(right) 477 | # check labels 478 | if self.label in ['Nslash', 'Nnum']: 479 | self.delete_on_adoption = True 480 | if self.label == 'N': 481 | if len(self.children) > 1: 482 | self.label = 'NX' 483 | else: 484 | self.delete_on_adoption = True 485 | if self.label != 'NX': 486 | pos = 0 487 | while pos < len(right.children): 488 | if type(right.children[pos]) == type(right) and right.children[pos].label == 'NX': 489 | child = right.children[pos] 490 | right.children = right.children[:pos] + child.children + right.children[pos+1:] 491 | pos += len(child.children) - 1 492 | pos += 1 493 | nlabel = self.label 494 | if nlabel != right.label: 495 | nlabel = 'UCP' 496 | 497 | # check for VPs that are being conjed 498 | try: 499 | remove_VPs = False 500 | print(self.label, self.children[0], file=log_out) 501 | if self.label == 'VP' and 'VB' in self.children[0]: 502 | all_empty = True 503 | print(self.children[1:], file=log_out) 504 | for child in self.children[1:]: 505 | if type(child) != type('') or child[0] == '(': 506 | all_empty = False 507 | if all_empty: 508 | print(right.label, right.children[1].label, right.children[1].children[0], file=log_out) 509 | if right.label == 'VP' and right.children[1].label == 'VP' and 'VB' in right.children[1].children[0]: 510 | all_empty = True 511 | print(right.children[1].children[1:], file=log_out) 512 | for child in right.children[1].children[1:]: 513 | if type(child) != type('') or child[0] == '(': 514 | all_empty = False 515 | if all_empty: 516 | remove_VPs = True 517 | if remove_VPs: 518 | self.delete_on_adoption = True 519 | right.children[1] = right.children[1].children[0] 520 | except: 521 | pass 522 | 523 | nlevel = Schema(['(%s 0 {1})' % nlabel] + self.parent, source_node=self.source) 524 | nlevel.set_zero(self) 525 | nlevel.insert(nlevel.get_argument_key(), right) 526 | if nlevel.label == 'TEMP': 527 | nlevel.delete_on_adoption = True 528 | # move unfilled arguments 529 | for key in self.incomplete: 530 | nlevel.incomplete[key] = [] 531 | for entry in self.incomplete[key]: 532 | text = 
entry[0] 533 | parent = entry[1] 534 | if text == parent.children[-1]: 535 | if text in nlevel.children: 536 | nlevel.children.remove(text) 537 | nlevel.children.append(text) 538 | elif text == parent.children[0]: 539 | if text in nlevel.children: 540 | nlevel.children.remove(text) 541 | nlevel.children.insert(0, text) 542 | else: 543 | if text in nlevel.children: 544 | nlevel.children.remove(text) 545 | continue 546 | nlevel.incomplete[key].append((text, nlevel)) 547 | nlevel.rule = 'conj2' 548 | return nlevel 549 | 550 | def get_first_partial_subtree(self): 551 | if len(self.children) == 0: 552 | return (0, []) 553 | if type(self.children[0]) == type('') and self.children[0][0] == '(': 554 | return (0, self.children) 555 | for i in range(len(self.children)): 556 | child = self.children[i] 557 | if type(child) == type(self): 558 | pos, children = child.get_first_partial_subtree() 559 | if pos > 0: 560 | return (pos, children) 561 | elif pos == 0: 562 | return (i, self.children) 563 | elif type(child) == type('') and child[0] == '(': 564 | return (i, self.children) 565 | return (-1, []) 566 | 567 | def get_last_partial_subtree(self): 568 | if len(self.children) == 0: 569 | return (0, []) 570 | if type(self.children[-1]) == type('') and self.children[-1][0] == '(': 571 | return (len(self.children), self.children) 572 | for i in range(len(self.children) - 1, -1, -1): 573 | child = self.children[i] 574 | if type(child) == type(self): 575 | pos, children = child.get_last_partial_subtree() 576 | if 0 < pos < len(children): 577 | return (pos, children) 578 | elif pos == len(children): 579 | return (i+1, self.children) 580 | elif type(child) == type('') and len(child) > 0 and child[0] == '(': 581 | return (i+1, self.children) 582 | return (-1, []) 583 | 584 | # misc - Just glom on the random stuff 585 | def glom(self, right, keep_right=None): 586 | left = self 587 | if keep_right is None: 588 | keep_right = left.label in "~!@#$%^&*()_+{}|:<>?,./;'[]\=-`" or left.label in ['LRB', 'RRB'] 589 | if keep_right: 590 | # glom left on to left of right 591 | if len(left.children) == 1: 592 | left = left.children[0] 593 | pos, children = right.get_first_partial_subtree() 594 | if pos < 0: 595 | pos = 0 596 | children = right.children 597 | children.insert(pos, left) 598 | return right 599 | else: 600 | # glom right on to right of left 601 | if len(right.children) == 1: 602 | right = right.children[0] 603 | if len(left.incomplete) != 0: 604 | pos, children = left.get_last_partial_subtree() 605 | if pos < 0: 606 | pos = 0 607 | children = left.children 608 | children.insert(pos, right) 609 | else: 610 | left.children.append(right) 611 | ### nlevel = Schema(['{(TEMP 0 1)}'] + left.parent, source_node=left.source) 612 | ### nlevel.set_zero(left) 613 | ### key = nlevel.get_argument_key() 614 | ### nlevel.insert(key, right) 615 | ### return nlevel 616 | return left 617 | 618 | def fallback_schema(cat): 619 | rules = ['{(TEMP 0)}'] 620 | while '/' in cat or '\\' in cat: 621 | parts = category.divide(cat) 622 | if parts[1] == '/': 623 | rules.append("(NP 0 1)") 624 | else: 625 | rules.append("(NP 1 0)") 626 | cat = parts[0] 627 | plain_cat = cat 628 | if plain_cat not in markup_info: 629 | plain_cat = category.strip_square_brackets(cat) 630 | if plain_cat in markup_info: 631 | markup_lines = markup_info[plain_cat][1:] 632 | if '/' not in markup_lines[0] and '\\' not in markup_lines[0]: 633 | rules += markup_lines 634 | return rules 635 | return rules 636 | 637 | ANGLE_RE = re.compile('<[^>]*>') 638 | def 
markup_to_schemas(lines, cat=None, source=None): 639 | unannotated = False 640 | if lines == []: 641 | unannotated = True 642 | else: 643 | for line in lines[1:]: 644 | if '\\' in line or '/' in line: 645 | cat_to_print = lines[0].strip().split()[1] 646 | cat_to_print = category.strip_braces(cat_to_print) 647 | cat_to_print = ''.join(cat_to_print.split('[X]')) 648 | cat_to_print = ANGLE_RE.sub('', cat_to_print) 649 | cat_to_print = category.remove_extra_brackets(cat_to_print) 650 | print('Unannotated category:', cat_to_print, file=log_out) 651 | print('Unannotated category:', cat_to_print, file=sys.stderr) 652 | unannotated = True 653 | break 654 | if unannotated: 655 | lines = fallback_schema(cat) 656 | pos = None 657 | word = None 658 | if source is not None: 659 | pos = source.pos 660 | word = source.word 661 | used = False 662 | nlines = [] 663 | for i in range(1, len(lines)): 664 | line = lines[i].strip() 665 | if line[-1] not in ')}': 666 | use = True 667 | if 'POS' in line: 668 | if pos is None or pos not in line.split('POS:')[1].split()[0].split(','): 669 | use = False 670 | if not used and 'POS:default' in line: 671 | use = True 672 | if 'Word' in line: 673 | if word is None or word not in line.split('Word:')[1].split()[0].split(','): 674 | use = False 675 | if not used and 'Word:default' in line: 676 | use = True 677 | if use: 678 | nlines.append(line) 679 | if 'arg' not in line or 'arg:default:' in line: 680 | used = True 681 | else: 682 | nlines.append(line) 683 | used = False 684 | if 'POS:default' in line or 'Word:default' in line: 685 | if 'arg' not in line or 'arg:default:' in line: 686 | used = False 687 | return Schema(nlines, source_node=source) 688 | 689 | def get_next_incomplete_schema(schema, arg): 690 | while len(schema.incomplete) == 0 and len(schema.parent) > 0: 691 | parent = Schema(schema.parent, argument=arg, source_node=schema.source) 692 | parent.set_zero(schema) 693 | schema = parent 694 | return schema 695 | 696 | def apply_markup(source, markup, top=True): 697 | global contains_bs 698 | # Bottom up, so get the results from below 699 | children = [] 700 | for subtree in source.subtrees: 701 | children.append(apply_markup(subtree, markup, False)) 702 | combinator = source.rule 703 | result = None 704 | verbose_print('using %s combiantor rule' % combinator) 705 | for child in children: 706 | verbose_print('%s' % child.PTB_tree()) 707 | verbose_print(child.__repr__()) 708 | if combinator == 'lex' or combinator == 'type': 709 | source_category = source.category 710 | if source_category not in markup_info: 711 | source_category = category.strip_square_brackets(source.category) 712 | schema_text = [] 713 | if source_category not in markup_info: 714 | print("Missing category:", source.category, "asked for by", combinator, file=log_out) 715 | print("Missing category:", source.category, "asked for by", combinator, file=sys.stderr) 716 | else: 717 | schema_text = markup_info[source_category] 718 | schema = markup_to_schemas(schema_text, source.category, source) 719 | if combinator == 'lex': 720 | result = schema.set_zero("(%s %s)" % (source.pos, source.word)) 721 | elif combinator == 'type': 722 | verbose_print("Type schema:") 723 | verbose_print(schema.__repr__()) 724 | result = schema.tr(children[0]) 725 | elif combinator == 'conj1': 726 | result = children[0].conj_part1(children[1]) 727 | elif combinator == 'conj2': 728 | result = children[0].conj_part2(children[1]) 729 | elif combinator == 'unary': 730 | unary_rule = rule.get_unary(source.subtrees[0].category, 
source.category, markup_info) 731 | if unary_rule is None: 732 | unary_rule = fallback_schema(source.category) 733 | schemas = markup_to_schemas(['None'] + unary_rule, source=source) 734 | verbose_print("Unary schema:") 735 | verbose_print(schemas.__repr__()) 736 | result = children[0].special_unary(schemas) 737 | elif combinator in ['binary', 'bs.f', 'bs.b']: 738 | binary_rule = rule.get_binary_for_markedup(source.subtrees[0].category, source.subtrees[1].category, source.category, markup_info) 739 | if binary_rule is None: 740 | binary_rule = ['(VP 0 1)'] + fallback_schema(source.category) 741 | schemas = markup_to_schemas(['None'] + binary_rule, source=source) 742 | verbose_print("Binary schema:") 743 | verbose_print(schemas.__repr__()) 744 | control = get_next_incomplete_schema(children[0], children[1]) 745 | result = control.special_binary(children[1], schemas) 746 | elif combinator == 'fa.f': 747 | control = get_next_incomplete_schema(children[0], children[1]) 748 | result = control.fa(children[1], combinator) 749 | elif combinator == 'fa.b': 750 | control = get_next_incomplete_schema(children[1], children[0]) 751 | result = control.fa(children[0], combinator) 752 | elif combinator == 'fc.f': 753 | control = get_next_incomplete_schema(children[0], children[1]) 754 | argument = get_next_incomplete_schema(children[1], None) 755 | result = control.fc(argument) 756 | elif combinator == 'fc.b': 757 | control = get_next_incomplete_schema(children[1], children[0]) 758 | argument = get_next_incomplete_schema(children[0], None) 759 | result = control.fc(argument) 760 | elif combinator == 'cc.b': 761 | control = get_next_incomplete_schema(children[0], children[1]) 762 | result = control.back_cross(children[1]) 763 | elif combinator == 'misc': 764 | if len(source.subtrees) == 2: 765 | cur = category.strip_square_brackets(source.category) 766 | left = category.strip_square_brackets(source.subtrees[0].category) 767 | right = category.strip_square_brackets(source.subtrees[1].category) 768 | if cur != left and cur != right: 769 | print("miscing an unknown category:", source.category, end=" ", file=log_out) 770 | print("from", source.subtrees[0].category, "and", source.subtrees[1].category, file=log_out) 771 | print("miscing an unknown category:", source.category, end=" ", file=sys.stderr) 772 | print("from", source.subtrees[0].category, "and", source.subtrees[1].category, file=sys.stderr) 773 | binary_rule = fallback_schema(source.category) 774 | schemas = markup_to_schemas(['None','(NP 0 1)'] + binary_rule, source=source) 775 | verbose_print("Misc Binary schema:") 776 | verbose_print(schemas.__repr__()) 777 | result = children[0].special_binary(children[1], schemas) 778 | else: 779 | # check if this forms a PRN 780 | words = source.all_word_yield()[1].split() 781 | left_word = words[0] 782 | right_word = words[-1] 783 | verbose_print(left_word + ' ' + right_word) 784 | use_PRN = False 785 | if not top: 786 | if left_word == ',' and right_word == ',': 787 | use_PRN = True 788 | elif left_word == '--' and right_word == '--': 789 | use_PRN = True 790 | elif left_word == '-LRB-' and right_word == '-RRB-': 791 | use_PRN = True 792 | result = children[0].glom(children[1], cur == right) 793 | if use_PRN: 794 | old_label = result.label 795 | result.label = 'PRN' 796 | result.delete_on_adoption = False 797 | nlevel = Schema(['(%s 0)' % old_label] + result.parent, source_node=source) 798 | if old_label == 'TEMP': 799 | nlevel = Schema(['{(%s 0)}' % old_label] + result.parent, source_node=source) 800 | 
nlevel.set_zero(result) 801 | nlevel.incomplete = result.incomplete 802 | result = nlevel 803 | else: 804 | print('misc combinator is not handled', file=sys.stderr) 805 | verbose_print('resolved: %s' % result.PTB_tree()) 806 | verbose_print(result.__repr__()) 807 | verbose_print('') 808 | return result 809 | 810 | def remove_N(tree): 811 | nsubtrees = [] 812 | for subtree in tree.subtrees: 813 | sub = remove_N(subtree) 814 | if type(sub) == type([]): 815 | nsubtrees += sub 816 | else: 817 | nsubtrees.append(sub) 818 | tree.subtrees = nsubtrees 819 | if tree.label == 'N' or tree.label == 'Nslash' or tree.label == 'Nnum': 820 | return tree.subtrees 821 | else: 822 | return tree 823 | 824 | def remove_repetition(tree): 825 | # recurse and update subtrees 826 | if len(tree.subtrees) > 0: 827 | nsubtrees = [] 828 | for subtree in tree.subtrees: 829 | nsubtrees.append(remove_repetition(subtree)) 830 | tree.subtrees = nsubtrees 831 | 832 | # look down and remove this if it is repeated 833 | repeats = False 834 | cur = tree 835 | label = cur.label 836 | while len(cur.subtrees) == 1: 837 | cur = cur.subtrees[0] 838 | if cur.label == label: 839 | repeats = True 840 | break 841 | if repeats: 842 | print('duplicate!', file=log_out) 843 | print(tree.one_line_repr(), file=log_out) 844 | print(cur.one_line_repr(), file=log_out) 845 | tree = tree.subtrees[0] 846 | 847 | return tree 848 | 849 | def convert(source, argv, log=sys.stdout): 850 | global markup_info, contains_bs, log_out, VERBOSE 851 | log_out = log 852 | VERBOSE = '-verbose' in ' '.join(argv) 853 | filename = ' '.join(argv).split(' -method')[1].split()[1] 854 | read_markup(open(filename)) 855 | 856 | contains_bs = False 857 | auto_schema = apply_markup(source, markup_info) 858 | 859 | ################### 860 | # Extra cleanup 861 | # i.e. hacks that don't fit within the main architecture 862 | ################### 863 | auto_ptb = trees.PTB_Tree('(ROOT ' + auto_schema.PTB_tree() + ')') 864 | verbose_print('before cleaning: %s' % auto_ptb) 865 | 866 | # remove remaining N 867 | auto_ptb = remove_N(auto_ptb) 868 | 869 | # collapse repetitions 870 | auto_ptb = remove_repetition(auto_ptb) 871 | 872 | verbose_print('cleaned: %s' % auto_ptb) 873 | verbose_print('') 874 | return (not contains_bs, auto_ptb, auto_schema) 875 | 876 | if __name__ == '__main__': 877 | if len(sys.argv) < 2: 878 | print("Usage:\n%s -method_info " % sys.argv[0]) 879 | sys.exit(1) 880 | print("Please enter CCG trees:") 881 | for line in sys.stdin: 882 | print(convert(trees.CCG_Tree(line.strip()), sys.argv)) 883 | --------------------------------------------------------------------------------
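Usage sketch (not part of the repository): a minimal example of how the modules listed above could be driven programmatically. The paths "my.ccgbank", "my.gold_ptb" and "my.markedup" are placeholders, and the "-method markup <markedup file>" argv convention is only inferred from how markup_convert.convert() extracts the markedup file name from argv; the repository's own command-line driver may differ.

#!/usr/bin/env python3
# Hedged usage sketch -- file names and the argv convention below are
# assumptions, not part of the repository.

import sys
import trees, markup_convert

# Placeholder input files: CCG derivations, gold PTB trees, and a C&C-style
# markedup file describing per-category schemas.
ccg_trees = trees.read_CCG_trees("my.ccgbank")
gold_trees = trees.read_PTB_trees("my.gold_ptb")

# markup_convert.convert() pulls the markedup file name out of argv
# (the second token after "-method"), so mimic that convention here.
argv = ["ccg2pst", "-method", "markup", "my.markedup"]

for ccg, gold in zip(ccg_trees, gold_trees):
    ok, auto_ptb, schema = markup_convert.convert(ccg, argv, log=sys.stdout)
    # Print the converted tree with missing / extra / mislabelled brackets
    # colour-coded against the gold tree (repr_with_corrections accepts a
    # PTB_Tree directly and collects its spans internally).
    print(auto_ptb.repr_with_corrections(gold))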