├── _config.yml ├── LICENSE_notes.txt ├── _includes └── head-custom-google-analytics.html ├── LICENSE.txt ├── trivial.py ├── span_dict.py ├── README.md ├── sample.gold_ptb ├── category.py ├── convert.py ├── sample.candc ├── analysis.py ├── sample.ccgbank ├── rule.py ├── trees.py └── markup_convert.py /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman 2 | title: CCG to PST conversion 3 | description: Convert Combinatory Categorial Grammar derivations to Phrase Structure Trees. 4 | show_downloads: true 5 | google_analytics: G-JVV2VPL5CX 6 | -------------------------------------------------------------------------------- /LICENSE_notes.txt: -------------------------------------------------------------------------------- 1 | The license on this software is the ISC license. 2 | 3 | "The ISC copyright is functionally equivalent to a two-term BSD copyright with 4 | language removed that is made unnecessary by the Berne convention. This is the 5 | preferred license for new code incorporated into OpenBSD." 6 | from http://www.openbsd.org/policy.html 7 | -------------------------------------------------------------------------------- /_includes/head-custom-google-analytics.html: -------------------------------------------------------------------------------- 1 | {% if site.google_analytics %} 2 | 3 | 4 | 11 | {% endif %} 12 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Jonathan K Kummerfeld 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 9 | FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | PERFORMANCE OF THIS SOFTWARE. 14 | 15 | -------------------------------------------------------------------------------- /trivial.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import trees, category 5 | 6 | # The trivial method reproduces the bracket structure exactly. 
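# When convert() is given an argv list (as convert.py does) it returns the
# (use, tree, schema) triple that script unpacks; called directly, it returns
# just the converted PTB_Tree.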
Labels are either 7 | # the atomic category, or a VP 8 | def convert(source, argv=None, log=sys.stdout): 9 | ans = trees.PTB_Tree() 10 | if '\\' in source.category or '/' in source.category: 11 | ans.label = "VP" 12 | else: 13 | ans.label = category.strip_square_brackets(source.category) 14 | if source.word is not None: 15 | ans.word = source.word 16 | ans.pos = source.pos 17 | ans.label = source.pos 18 | for subtree in source.subtrees: 19 | ans.subtrees.append(convert(subtree)) 20 | if argv is None: 21 | return ans 22 | else: 23 | return True, ans, None 24 | 25 | if __name__ == '__main__': 26 | print("Please enter CCG trees:") 27 | for line in sys.stdin: 28 | print(convert(trees.CCG_Tree(line.strip()))) 29 | -------------------------------------------------------------------------------- /span_dict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import trees 4 | 5 | labels_to_ignore = set(["-NONE-", "TOP", "."]) 6 | words_to_ignore = set(["'","`","''", "``", "--",":",";","-",",","..."]) 7 | def span_dict(tree, ans, pos=0): 8 | start = pos 9 | label = tree.label 10 | word = tree.word 11 | if len(tree.subtrees) == 0: 12 | if label in labels_to_ignore or word in words_to_ignore: 13 | return pos 14 | return pos + 1 15 | for subtree in tree.subtrees: 16 | pos = span_dict(subtree, ans, pos) 17 | end = pos 18 | if start == end: 19 | return start 20 | if (start, end) not in ans: 21 | ans[(start, end)] = set() 22 | if not label[0] == '-': 23 | label = label.split('-')[0] 24 | label = label.split('=')[0] 25 | if label == 'PRT': 26 | label = 'ADVP' # another collins change 27 | if label != '' and label != 'TOP': 28 | ans[(start, end)].add(label) 29 | return pos 30 | 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This software converts Combinatory Categorial Grammar (CCG) derivations to Phrase Structure Trees (PST). For a full description of the method, and discussion of results, see: 2 | 3 | [Robust Conversion of CCG Derivations to Phrase Structure Trees](https://aclweb.org/anthology/P/P12/P12-2021.pdf), 4 | Jonathan K. Kummerfeld, James R. Curran and Dan Klein, 5 | ACL (short) 2012 6 | 7 | To use the system, download it one of these ways, and run as shown below: 8 | 9 | - [Download .zip](https://github.com/jkkummerfeld/berkeley-ccg2pst/zipball/master) 10 | - [Download .tar.gz](https://github.com/jkkummerfeld/berkeley-ccg2pst/tarball/master) 11 | - `git clone https://github.com/jkkummerfeld/berkeley-ccg2pst.git` 12 | 13 | If you use my code in your own work, please cite the paper: 14 | 15 | ``` 16 | @InProceedings{Kummerfeld-Klein-Curran:2012:ACL, 17 | author = {Jonathan K. Kummerfeld and Dan Klein and James R. 
Curran}, 18 | title = {Robust Conversion of {CCG} Derivations to Phrase Structure Trees}, 19 | booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, 20 | month = {July}, 21 | year = {2012}, 22 | address = {Jeju Island, Korea}, 23 | pages = {105--109}, 24 | software = {http://github.com/jkkummerfeld/berkeley-ccg2pst/}, 25 | url = {http://www.aclweb.org/anthology/P12-2021}, 26 | } 27 | ``` 28 | 29 | ## Running the code 30 | 31 | On a sample of CCGbank: 32 | ``` 33 | ./convert.py sample.gold_ptb sample.ccgbank -print_comparison -prefix=sample.ccgbank -verbose -method=markedup ./markedup 34 | ``` 35 | 36 | On a sample of C&C Parser output: 37 | ``` 38 | ./convert.py sample.gold_ptb sample.candc -print_comparison -prefix=sample.candc -verbose -method=markedup ./markedup 39 | ``` 40 | 41 | Conversion output will be in: 42 | ``` 43 | sample.ccgbank.auto 44 | sample.candc.auto 45 | ``` 46 | 47 | The code also comes with a sample of parses from the Penn Treebank section 00, 48 | the corresponding parses from CCGbank section 00, and the C&C parser output on 49 | the same sentences. 50 | -------------------------------------------------------------------------------- /sample.gold_ptb: -------------------------------------------------------------------------------- 1 | (ROOT (S (NP-SBJ (NP (NNP Pierre) (NNP Vinken)) (, ,) (ADJP (NP (CD 61) (NNS years)) (JJ old)) (, ,)) (VP (MD will) (VP (VB join) (NP (DT the) (NN board)) (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director))) (NP-TMP (NNP Nov.) (CD 29)))) (. .))) 2 | (ROOT (S (NP-SBJ (NNP Mr.) (NNP Vinken)) (VP (VBZ is) (NP-PRD (NP (NN chairman)) (PP (IN of) (NP (NP (NNP Elsevier) (NNP N.V.)) (, ,) (NP (DT the) (NNP Dutch) (VBG publishing) (NN group)))))) (. .))) 3 | (ROOT (S (NP-SBJ-1 (NP (NNP Rudolph) (NNP Agnew)) (, ,) (UCP (ADJP (NP (CD 55) (NNS years)) (JJ old)) (CC and) (NP (NP (JJ former) (NN chairman)) (PP (IN of) (NP (NNP Consolidated) (NNP Gold) (NNP Fields) (NNP PLC))))) (, ,)) (VP (VBD was) (VP (VBN named) (S (NP-SBJ (-NONE- *-1)) (NP-PRD (NP (DT a) (JJ nonexecutive) (NN director)) (PP (IN of) (NP (DT this) (JJ British) (JJ industrial) (NN conglomerate))))))) (. .))) 4 | (ROOT (S (S-TPC-1 (NP-SBJ (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos)))) (RRC (ADVP-TMP (RB once)) (VP (VBN used) (NP (-NONE- *)) (S-CLR (NP-SBJ (-NONE- *)) (VP (TO to) (VP (VB make) (NP (NNP Kent) (NN cigarette) (NNS filters)))))))) (VP (VBZ has) (VP (VBN caused) (NP (NP (DT a) (JJ high) (NN percentage)) (PP (IN of) (NP (NN cancer) (NNS deaths))) (PP-LOC (IN among) (NP (NP (DT a) (NN group)) (PP (IN of) (NP (NP (NNS workers)) (RRC (VP (VBN exposed) (NP (-NONE- *)) (PP-CLR (TO to) (NP (PRP it))) (ADVP-TMP (NP (QP (RBR more) (IN than) (CD 30)) (NNS years)) (IN ago)))))))))))) (, ,) (NP-SBJ (NNS researchers)) (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .))) 5 | (ROOT (S (S-TPC-2 (NP-SBJ (NP (DT The) (NN asbestos) (NN fiber)) (, ,) (NP (NN crocidolite)) (, ,)) (VP (VBZ is) (ADJP-PRD (RB unusually) (JJ resilient)) (SBAR-TMP (IN once) (S (NP-SBJ (PRP it)) (VP (VBZ enters) (NP (DT the) (NNS lungs))))) (, ,) (PP (IN with) (S-NOM (NP-SBJ (NP (RB even) (JJ brief) (NNS exposures)) (PP (TO to) (NP (PRP it)))) (VP (VBG causing) (NP (NP (NNS symptoms)) (SBAR (WHNP-1 (WDT that)) (S (NP-SBJ (-NONE- *T*-1)) (VP (VBP show) (PRT (RP up)) (ADVP-TMP (NP (NNS decades)) (JJ later))))))))))) (, ,) (NP-SBJ (NNS researchers)) (VP (VBD said) (SBAR (-NONE- 0) (S (-NONE- *T*-2)))) (. 
.))) 6 | (ROOT (S (NP-SBJ (NP (NNP Lorillard) (NNP Inc.)) (, ,) (NP (NP (DT the) (NN unit)) (PP (IN of) (NP (ADJP (JJ New) (JJ York-based)) (NNP Loews) (NNP Corp.))) (SBAR (WHNP-2 (WDT that)) (S (NP-SBJ (-NONE- *T*-2)) (VP (VBZ makes) (NP (NNP Kent) (NNS cigarettes)))))) (, ,)) (VP (VBD stopped) (VP (VBG using) (NP (NN crocidolite)) (PP-LOC-CLR (IN in) (NP (PRP$ its) (NN Micronite) (NN cigarette) (NNS filters))) (PP-TMP (IN in) (NP (CD 1956))))) (. .))) 7 | (ROOT (S (SBAR-ADV (IN Although) (S (NP-SBJ-2 (JJ preliminary) (NNS findings)) (VP (VBD were) (VP (VBN reported) (NP (-NONE- *-2)) (ADVP-TMP (NP (QP (RBR more) (IN than) (DT a)) (NN year)) (IN ago)))))) (, ,) (NP-SBJ (DT the) (JJS latest) (NNS results)) (VP (VBP appear) (PP-LOC (IN in) (NP (NP (NP (NP (NN today) (POS 's)) (NNP New) (NNP England) (NNP Journal)) (PP (IN of) (NP (NNP Medicine)))) (, ,) (NP (NP (DT a) (NN forum)) (ADJP (JJ likely) (S (NP-SBJ (-NONE- *)) (VP (TO to) (VP (VB bring) (NP (JJ new) (NN attention)) (PP-DIR (TO to) (NP (DT the) (NN problem))))))))))) (. .))) 8 | (ROOT (S (NP-SBJ (DT A) (NNP Lorillard) (NN spokewoman)) (VP (VBD said) (, ,) (`` ``) (S (NP-SBJ (DT This)) (VP (VBZ is) (NP-PRD (DT an) (JJ old) (NN story))))) (. .))) 9 | (ROOT (S (NP-SBJ (PRP We)) (VP (VBP 're) (VP (VBG talking) (PP-CLR (IN about) (ADVP-TMP (ADVP (NP (NNS years)) (IN ago)) (SBAR (IN before) (S (NP-SBJ (NN anyone)) (VP (VBD heard) (PP-CLR (IN of) (S-NOM (NP-SBJ (NN asbestos)) (VP (VBG having) (NP (DT any) (JJ questionable) (NNS properties)))))))))))) (. .))) 10 | (ROOT (S (NP-SBJ (EX There)) (VP (VBZ is) (NP-PRD (DT no) (NN asbestos)) (PP-LOC (IN in) (NP (PRP$ our) (NNS products))) (ADVP-TMP (RB now))) (. .) ('' ''))) 11 | -------------------------------------------------------------------------------- /category.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Just handle unary rules, working out when one is being used 3 | 4 | import re 5 | 6 | CURLY_BRACES_RE = re.compile('{[^}]*}') 7 | def strip_braces(category): 8 | return CURLY_BRACES_RE.sub('', category) 9 | 10 | SQUARE_BRACKETS_RE = re.compile('\[[^\]]*\]') 11 | def strip_square_brackets(category): 12 | if category is not None: 13 | return SQUARE_BRACKETS_RE.sub('', category) 14 | else: 15 | return None 16 | 17 | def remove_extra_brackets(category): 18 | if category[0] != '(' or category[-1] != ')': 19 | return category 20 | if not ('\\' in category or '/' in category): 21 | return category[1:-1] 22 | depth = 0 23 | hit_zero = False 24 | for i in range(len(category)): 25 | if category[i] == '(': 26 | depth += 1 27 | elif category[i] == ')': 28 | depth -= 1 29 | elif depth == 0: 30 | hit_zero = True 31 | break 32 | if not hit_zero: 33 | return category[1:-1] 34 | return category 35 | 36 | def divide(category): 37 | if '\\' not in category and '/' not in category: 38 | return [category, None, None] 39 | category = remove_extra_brackets(category) 40 | depth = 0 41 | sep = None 42 | for i in range(len(category)): 43 | if category[i] == '(': 44 | depth += 1 45 | elif category[i] == ')': 46 | depth -= 1 47 | elif category[i] in '/\\' and depth == 0: 48 | sep = i 49 | break 50 | if sep is None: 51 | return [category, None, None] 52 | parts = [category[:sep], category[sep:sep+1], category[sep+1:]] 53 | for i in [0, 2]: 54 | while True: 55 | if parts[i][0] != '(' or parts[i][-1] != ')': 56 | break 57 | stripped_version = parts[i][1:-1] 58 | depth = 0 59 | use = True 60 | for char in stripped_version: 61 | if char == '(': 62 
| depth += 1 63 | elif char == ')': 64 | depth -= 1 65 | if depth < 0: 66 | use = False 67 | if use: 68 | parts[i] = stripped_version 69 | else: 70 | break 71 | return parts 72 | 73 | def compare(cat0, cat1): 74 | if cat0 is None or cat1 is None: 75 | return False 76 | # Check the general structure matches 77 | if strip_square_brackets(cat0) != strip_square_brackets(cat1): 78 | return False 79 | # remove [conj], which is present temporarily at the end 80 | cat0 = cat0.split('[conj]')[0] 81 | cat1 = cat1.split('[conj]')[0] 82 | 83 | cat0 = 'NP[X]'.join(cat0.split('NP')) 84 | cat0 = 'NP['.join(cat0.split('NP[X][')) 85 | cat1 = 'NP[X]'.join(cat1.split('NP')) 86 | cat1 = 'NP['.join(cat1.split('NP[X][')) 87 | 88 | cat0 = 'S[X]'.join(cat0.split('S')) 89 | cat0 = 'S['.join(cat0.split('S[X][')) 90 | cat1 = 'S[X]'.join(cat1.split('S')) 91 | cat1 = 'S['.join(cat1.split('S[X][')) 92 | 93 | pairs0 = SQUARE_BRACKETS_RE.findall(cat0) 94 | pairs1 = SQUARE_BRACKETS_RE.findall(cat1) 95 | # Having no brackets indicates no S, so it's fine 96 | if len(pairs0) == 0 or len(pairs1) == 0: 97 | return True 98 | # For debugging 99 | if len(pairs0) != len(pairs1): 100 | print('confused by:') 101 | print(cat0, cat1) 102 | # Make sure they all match (with X as a wildcard) 103 | for i in range(len(pairs0)): 104 | if pairs0[i] == '[X]' or pairs1[i] == '[X]' or pairs0[i] == pairs1[i]: 105 | continue 106 | return False 107 | return True 108 | 109 | if __name__ == '__main__': 110 | pass 111 | -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys, re 4 | import trees, category, rule 5 | import analysis 6 | import span_dict 7 | import trivial, markup_convert 8 | 9 | tree_out = None 10 | gold_out = None 11 | log_out = sys.stdout 12 | colour_out = None 13 | analysis_out = sys.stdout 14 | 15 | def score_count(target, auto): 16 | gold_nodes = 0 17 | auta_nodeo = 0 18 | match_brackets = 0 19 | match_labels = 0 20 | 21 | target_spans = {} 22 | span_dict.span_dict(target, target_spans) 23 | auto_spans = {} 24 | span_dict.span_dict(auto, auto_spans) 25 | gold_nodes = 0 26 | auto_nodes = 0 27 | print(target_spans.keys(), file=log_out) 28 | print(auto_spans.keys(), file=log_out) 29 | for key in target_spans: 30 | gold_nodes += len(target_spans[key]) 31 | if key in auto_spans: 32 | match_brackets += min(len(auto_spans[key]), len(target_spans[key])) 33 | match_labels += len(auto_spans[key].intersection(target_spans[key])) 34 | if len(target_spans[key].symmetric_difference(auto_spans[key])) != 0: 35 | print('different label sets: ', key, target_spans[key], auto_spans[key], target.word_yield(key)[1], file=log_out) 36 | print('different label sets: ', key, target_spans[key], auto_spans[key], target.word_yield(key)[1], file=colour_out) 37 | else: 38 | # Check for crossing brackets 39 | crossing = False 40 | for akey in auto_spans: 41 | if key[0] < akey[0] < key[1] < akey[1]: 42 | crossing = True 43 | break 44 | if akey[0] < key[0] < akey[1] < key[1]: 45 | crossing = True 46 | break 47 | if crossing: 48 | print('crossing', end=" ", file=log_out) 49 | print('\033[01;31mcrossing\033[00m', end=" ", file=colour_out) 50 | print('missing span: ', key, target_spans[key], target.word_yield(key)[1], file=log_out) 51 | print('missing span: ', key, target_spans[key], target.word_yield(key)[1], file=colour_out) 52 | for key in auto_spans: 53 | auto_nodes += len(auto_spans[key]) 54 | if key 
not in target_spans: 55 | crossing = False 56 | for tkey in target_spans: 57 | if key[0] < tkey[0] < key[1] < tkey[1]: 58 | crossing = True 59 | break 60 | if tkey[0] < key[0] < tkey[1] < key[1]: 61 | crossing = True 62 | break 63 | if crossing: 64 | print('crossing', end=" ", file=log_out) 65 | print('\033[01;31mcrossing\033[00m', end=" ", file=colour_out) 66 | # Check for crossing brackets 67 | print('extra span: ', key, auto_spans[key], target.word_yield(key)[1], file=log_out) 68 | print('extra span: ', key, auto_spans[key], target.word_yield(key)[1], file=colour_out) 69 | return gold_nodes, auto_nodes, match_brackets, match_labels 70 | 71 | def calc_prf(overlap, auto, gold): 72 | if gold == 0: 73 | return 1.0, 1.0, 1.0 74 | if auto == 0: 75 | return 0.0, 0.0, 0.0 76 | p = float(overlap) / auto 77 | r = float(overlap) / gold 78 | f = 0 79 | if p + r > 1e-5: 80 | f = 2 * p * r / (p + r) 81 | return p, r, f 82 | 83 | def compare_words(pwords, cwords): 84 | i = 0 85 | match = 0 86 | for word in cwords: 87 | while word != pwords[i]: 88 | if i == len(pwords) - 1: 89 | break 90 | i += 1 91 | if word == pwords[i]: 92 | match += 1 93 | return float(match) / len(cwords) 94 | 95 | def print_stats(stats_name, gold_nodes, auto_nodes, match_brackets, match_labels, correct_sentences, correct_sentences_brackets, total_sentences): 96 | p_brac, r_brac, f_brac = calc_prf(match_brackets, auto_nodes, gold_nodes) 97 | p_labe, r_labe, f_labe = calc_prf(match_labels, auto_nodes, gold_nodes) 98 | print(stats_name, "counts: ", gold_nodes, auto_nodes, ' ', match_brackets, match_labels, file=log_out) 99 | print(stats_name, "brackets: %.2f %.2f %.2f" % (p_brac * 100, r_brac * 100, f_brac * 100), file=log_out) 100 | print(stats_name, "labels: %.2f %.2f %.2f" % (p_labe * 100, r_labe * 100, f_labe * 100), file=log_out) 101 | print(stats_name, "sentences: %d of %d (i.e. %.2f), just brackets %d of %d (i.e. 
%.2f)" % (correct_sentences, total_sentences, correct_sentences * 100.0 / total_sentences, correct_sentences_brackets, total_sentences, correct_sentences_brackets * 102.0 / total_sentences), file=log_out) 102 | 103 | if __name__ == '__main__': 104 | args = ' '.join(sys.argv) 105 | methods = { 106 | 'trivial': trivial.convert, 107 | 'markedup': markup_convert.convert 108 | } 109 | if len(sys.argv) < 3: 110 | print("Usage:\n%s " % sys.argv[0]) 111 | print("Options:") 112 | print("\t-method=[%s]" % (','.join(methods.keys()))) 113 | print("\t-print_comparison") 114 | print("\t-sents=") 115 | print("\t-max_length=") 116 | print("\t-prefix=") 117 | print("\t-exclude_no_parse") 118 | sys.exit(1) 119 | 120 | only_parsed = '-exclude_no_parse' in ' '.join(sys.argv) 121 | if '-prefix=' in args: 122 | prefix = args.split('-prefix=')[1].split(' ')[0] 123 | tree_out = open(prefix + '.auto', 'w') 124 | gold_out = open(prefix + '.gold', 'w') 125 | log_out = open(prefix + '.log', 'w') 126 | colour_out = open(prefix + '.colour', 'w') 127 | analysis_out = open(prefix + '.analysis', 'w') 128 | for output in [log_out, colour_out, analysis_out]: 129 | print("# this file was generated by the following command(s):", file=output) 130 | print("# " + args, file=output) 131 | print('', file=output) 132 | else: 133 | print("# this file was generated by the following command(s):") 134 | print("# " + args) 135 | print 136 | 137 | total_sentences = 1000000 if "-sents" not in args else int(args.split('-sents=')[1].split(' ')[0]) 138 | max_sent_length = -1 if "-max_length" not in args else int(args.split('-sents=')[1].split(' ')[0]) 139 | 140 | gold_nodes = 0 141 | auto_nodes = 0 142 | match_brackets = 0 143 | match_labels = 0 144 | correct_sentences = 0 145 | correct_sentences_brackets = 0 146 | print_trees = "-print_comparison" in args 147 | ptb_source = open(sys.argv[1]) 148 | ccg_source = open(sys.argv[2]) 149 | for i in range(total_sentences): 150 | source = trees.read_CCG_tree(ccg_source) 151 | target = trees.read_PTB_tree(ptb_source) 152 | ### print(source) 153 | if source is None or target is None: 154 | total_sentences = i 155 | break 156 | 157 | if source.category is None: 158 | if not only_parsed: 159 | if gold_out is not None: 160 | print(target.one_line_repr(), file=gold_out) 161 | print("", file=tree_out) 162 | # only evaluate on sentences that receive a parse 163 | continue 164 | 165 | pwords = target.get_words() 166 | cwords = source.get_words() 167 | if len(cwords) != 0: 168 | while compare_words(pwords, cwords) < 0.7: 169 | if not only_parsed: 170 | if gold_out is not None: 171 | print(target.one_line_repr(), file=gold_out) 172 | print("", file=tree_out) 173 | target = trees.read_PTB_tree(ptb_source) 174 | if target is None: 175 | print("Ran out of sentences trying to find a match", file=sys.stderr) 176 | sys.exit(2) 177 | pwords = target.get_words() 178 | 179 | if max_sent_length > 0 and len(pwords) > max_sent_length: 180 | continue 181 | 182 | if target.label == '': 183 | target.label = 'ROOT' 184 | 185 | if print_trees: 186 | print(source, file=log_out) 187 | print(target, file=log_out) 188 | use, auto_ptb, auto_schema = (False, None, None) 189 | if 'method' in args: 190 | method_name = args.split('method=')[1].split()[0] 191 | ans = methods[method_name](source, sys.argv, log_out) 192 | use, auto_ptb, auto_schema = ans 193 | else: 194 | ans = trivial.convert(source, sys.argv, log_out) 195 | use, auto_ptb, auto_schema = ans 196 | 197 | if not use: 198 | print("Not being included", file=log_out) 199 | if 
auto_schema is not None: 200 | analysis.analyse(source, target, auto_ptb, auto_schema, analysis_out) 201 | if tree_out is not None: 202 | if use: 203 | print(target.one_line_repr(), file=gold_out) 204 | print(auto_ptb.one_line_repr(), file=tree_out) 205 | elif not only_parsed: 206 | print(target.one_line_repr(), file=gold_out) 207 | print("", file=tree_out) 208 | 209 | if print_trees: 210 | print(auto_ptb, file=log_out) 211 | if colour_out is not None: 212 | print(source, file=colour_out) 213 | print(auto_ptb.repr_with_corrections(target), file=colour_out) 214 | 215 | scores = score_count(target, auto_ptb) 216 | gold_nodes += scores[0] 217 | auto_nodes += scores[1] 218 | match_brackets += scores[2] 219 | match_labels += scores[3] 220 | if scores[0] == scores[1] == scores[2]: 221 | correct_sentences_brackets += 1 222 | if scores[0] == scores[1] == scores[3]: 223 | correct_sentences += 1 224 | print_stats('', scores[0], scores[1], scores[2], scores[3], correct_sentences, correct_sentences_brackets, i + 1) 225 | print_stats('cumulative', gold_nodes, auto_nodes, match_brackets, match_labels, correct_sentences, correct_sentences_brackets, i + 1) 226 | print_stats('final', gold_nodes, auto_nodes, match_brackets, match_labels, correct_sentences, correct_sentences_brackets, total_sentences) 227 | -------------------------------------------------------------------------------- /sample.candc: -------------------------------------------------------------------------------- 1 | ID=1 PARSER=GOLD NUMPARSE=0 2 | ( ( ( ( () ())) ( () ( ( ( ( () ())) ())))) ( () ( () ( ( ( () ( () ())) ( () ( () ( () ())))) ( () ( () ())))))) 3 | ID=2 PARSER=GOLD NUMPARSE=0 4 | ( ( ( () ())) ( () ( ( ()) ( () ( ( ( () ())) ( () ( () ( () ( () ( () ())))))))))) 5 | ID=3 PARSER=GOLD NUMPARSE=0 6 | ( ( ( ( ( () ())) ( () ( ( ( ( () ())) ())))) ( () ( ( ( () ())) ( () ( ( () ( () ( () ())))))))) ( () ( () ( () ( ( () ( () ())) ( () ( () ( () ( () ( () ())))))))))) 7 | ID=4 PARSER=GOLD NUMPARSE=0 8 | ( ( ( ( () ()) ( () ( ( ()) ( ( () ( () ( () ( () ( ( () ( () ()))))))))))) ( () ( () ( ( () ( () ())) ( () ( ( ( () ())) ( () ( ( () ()) ( () ( ( ()) ( ( ( () ( () ())) ( ( ( ( ( () ()) ()) ())) ()))))))))))))) ( () ( ( ()) ( () ())))) 9 | ID=5 PARSER=GOLD NUMPARSE=0 10 | ( ( ( ( () ( () ())) ( () ( ()))) ( () ( ( () ( () ())) ( () ( () ( ( () ( () ())) ( () ( ( () ( ( ( () ( () ()))) ( () ()))) ( () ( ( ()) ( () ( ( ( () ()) ( ())) ())))))))))))) ( () ( ( ()) ( () ())))) 11 | ID=6 PARSER=GOLD NUMPARSE=0 12 | ( ( ( ( () ())) ( () ( ( ( () ()) ( () ( ( ( () ( () ())) ())))) ( () ( () ( ( () ()))))))) ( () ( ( () ( ( () ( ())) ( () ( () ( () ( () ())))))) ( () ( ( ()) ()))))) 13 | ID=7 PARSER=GOLD NUMPARSE=0 14 | ( ( () ( ( ( () ())) ( () ( ( () ()) ( () ( ( () ()) ())))))) ( () ( ( () ( () ())) ( () ( () ( ( ( ( ()) ()) ( () ( () ()))) ( () ( ( ()) ( () ( ( () ()) ( ( ( () ( () ( ( () ( ( () ()))) ( () ( () ())))))) ()))))))))))) 15 | ID=8 PARSER=GOLD NUMPARSE=0 16 | ( ( () ( () ())) ( () ( () ( () ( () ( () ( () ( () ())))))))) 17 | ID=9 PARSER=GOLD NUMPARSE=0 18 | ( () ( () ( ( ( () ( ( ( () ())) ())) ( () ( () ( ())))) ( () ( ( ()) ( ( ( () ( () ( () ())))) ())))))) 19 | ID=10 PARSER=GOLD NUMPARSE=0 20 | ( () ( ( () ( ( () ()) ( () ( () ())))) ( () ()))) 21 | -------------------------------------------------------------------------------- /analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import trees 5 | import span_dict 6 | import 
markup_convert 7 | 8 | analysis_out = sys.stdout 9 | 10 | def side_by_side(tree0, tree1): 11 | text0 = tree0.__repr__() 12 | text1 = tree1.__repr__() 13 | lines0 = ' '.join(text0.split('\t')).split('\n') 14 | lines1 = ' '.join(text1.split('\t')).split('\n') 15 | longest = 0 16 | for line in lines0: 17 | longest = max(longest, len(line)) 18 | longest += 3 19 | longest = max(longest, 40) 20 | lines = [] 21 | i = 0 22 | while i < len(lines0) or i < len(lines1): 23 | start = ' ' * longest 24 | if i < len(lines0): 25 | start = lines0[i] + ((longest - len(lines0[i])) * ' ') 26 | rest = '' 27 | if i < len(lines1): 28 | rest = lines1[i] 29 | lines.append(start + rest) 30 | i += 1 31 | return '\n'.join(lines) 32 | 33 | def strip_label(label): 34 | if not label[0] == '-': 35 | label = label.split('-')[0] 36 | label = label.split('=')[0] 37 | if label == 'PRT': 38 | label = 'ADVP' # another collins change 39 | return label 40 | 41 | labels_to_ignore = set(["-NONE-", "TOP", "."]) 42 | words_to_ignore = set(["'","`","''", "``", "--",":",";","-",",","..."]) 43 | def spans3(tree, ans, pos): 44 | start = pos 45 | label = None 46 | not_a_schema = False 47 | try: 48 | a = tree.subtrees 49 | not_a_schema = True 50 | except: 51 | pass 52 | if type(tree) == type(''): 53 | if tree == '' or tree[0] != '(': 54 | return pos 55 | label, word = tree[1:-1].split() 56 | if label in labels_to_ignore or word in words_to_ignore: 57 | return pos 58 | return pos + 1 59 | else: 60 | label = tree.label 61 | if not_a_schema: 62 | if len(tree.subtrees) == 0: 63 | if tree.label in labels_to_ignore or tree.word in words_to_ignore: 64 | return pos 65 | return pos + 1 66 | if not_a_schema: 67 | for subtree in tree.subtrees: 68 | pos = spans3(subtree, ans, pos) 69 | else: 70 | for child in tree.children: 71 | pos = spans3(child, ans, pos) 72 | end = pos 73 | if start == end: 74 | return start 75 | if (start, end) not in ans: 76 | ans[(start, end)] = {} 77 | label = strip_label(label) 78 | if label != '' and label != 'TOP': 79 | if label in ans[(start, end)]: 80 | # keep the higher one 81 | ctree = ans[(start, end)][label] 82 | if not_a_schema: 83 | while len(ctree.subtrees) == 1: 84 | ctree = ctree.subtrees[0] 85 | if ctree == tree: 86 | return pos 87 | else: 88 | while len(ctree.children) == 1: 89 | ctree = ctree.children[0] 90 | if ctree == tree: 91 | return pos 92 | if type(ctree) == type(''): 93 | break 94 | ans[(start, end)][label] = tree 95 | return pos 96 | 97 | def spans(tree): 98 | ans = {} 99 | spans3(tree, ans, 0) 100 | return ans 101 | 102 | def tree_repr(tree, depth): 103 | not_a_schema = False 104 | try: 105 | a = tree.subtrees 106 | not_a_schema = True 107 | except: 108 | pass 109 | if not_a_schema: 110 | if tree.word is not None: 111 | return '(%s)' % (strip_label(tree.label)) 112 | text = '(' + strip_label(tree.label) 113 | if depth > 0: 114 | for subtree in tree.subtrees: 115 | text += ' ' + tree_repr(subtree, depth - 1) 116 | text += ')' 117 | return text 118 | else: 119 | return 'TODO' 120 | 121 | def get_word_info(schema_spans, key, label): 122 | word_info = 'unk-cat\tunk-pos\tunk-word' 123 | for label in schema_spans[key]: 124 | span = schema_spans[key][label] 125 | if span.label == label: 126 | cat = span.source.category 127 | pos = span.source.pos 128 | word = span.source.word 129 | if span.source.rule == 'unary': 130 | pos = "unary-rule" 131 | word = span.source.subtrees[0].category 132 | elif span.source.rule == 'binary': 133 | pos = "binary-rule" 134 | word = "%s_%s" % (span.source.subtrees[0].category, 
span.source.subtrees[1].category) 135 | elif span.source.rule == 'type': 136 | pos = "type-raising" 137 | word = span.source.subtrees[0].category 138 | return "%s\t%s\t%s" % (cat, pos, word) 139 | 140 | def get_cat(source, key): 141 | node = source.get_node(key) 142 | if node.pos is not None: 143 | return '\t'.join([node.category, node.pos, node.word]) 144 | else: 145 | return '\t'.join([node.category, 'unk-pos', 'unk-word']) 146 | 147 | def lowest_span(spans): 148 | fallback = [s for s in spans][0] 149 | for span in spans: 150 | if len(span.subtrees) != 1: 151 | return span 152 | return fallback 153 | 154 | def log(fields): 155 | print('\t'.join(fields), file=analysis_out) 156 | 157 | 158 | 159 | def analyse(source, target, auto_ptb, auto_schema, out): 160 | global analysis_out 161 | analysis_out = out 162 | if auto_schema.source is None: 163 | print("Missing schema source") 164 | print("Missing schema source", file=out) 165 | return 166 | 167 | target_spans = spans(target) 168 | auto_spans = spans(auto_ptb) 169 | schema_spans = spans(auto_schema) 170 | 171 | errors = False 172 | 173 | # 174 | # Missing brackets 175 | # 176 | for target_key in target_spans: 177 | if target_key not in auto_spans: 178 | errors = True 179 | # find the set of brackets that are as small as possible, while still covering key 180 | best = None 181 | for akey in auto_spans: 182 | if akey[0] <= target_key[0] and target_key[1] <= akey[1]: 183 | if best is None or best[0] < akey[0] or akey[1] < best[1]: 184 | best = akey 185 | auto_key = best 186 | 187 | for tlabel in target_spans[target_key]: 188 | ttree = target_spans[target_key][tlabel] 189 | atree = lowest_span(auto_spans[auto_key].values()) 190 | cat_info = get_word_info(schema_spans, auto_key, atree.label) 191 | for adepth in [1, 2]: 192 | for tdepth in [1, 2]: 193 | adesc = tree_repr(atree, adepth) 194 | tdesc = tree_repr(ttree, tdepth) 195 | log(['==miss %d %d ==' % (tdepth, adepth), tlabel, cat_info, tdesc, adesc]) 196 | 197 | 198 | # 199 | # Extra brackets 200 | # 201 | for auto_key in auto_spans: 202 | if auto_key not in target_spans: 203 | errors = True 204 | # find the set of brackets that are as small as possible, while still covering key 205 | best = None 206 | for tkey in target_spans: 207 | if tkey[0] <= auto_key[0] and auto_key[1] <= tkey[1]: 208 | if best is None or best[0] < tkey[0] or tkey[1] < best[1]: 209 | best = tkey 210 | target_key = best 211 | if target_key is None: 212 | log(["None target key!", auto_key.__repr__()]) 213 | else: 214 | for alabel in auto_spans[auto_key]: 215 | atree = auto_spans[auto_key][alabel] 216 | cat_info = get_word_info(schema_spans, auto_key, alabel) 217 | ttree = lowest_span(target_spans[target_key].values()) 218 | for adepth in [1, 2]: 219 | for tdepth in [1, 2]: 220 | adesc = tree_repr(atree, adepth) 221 | tdesc = tree_repr(ttree, tdepth) 222 | log(['==extra %d %d ==' % (tdepth, adepth), alabel, cat_info, tdesc, adesc]) 223 | 224 | 225 | # 226 | # Span present in both, but with different labels 227 | # 228 | for key in target_spans: 229 | if key in auto_spans: 230 | target_labels = set(target_spans[key].keys()) 231 | auto_labels = set(auto_spans[key].keys()) 232 | diff = target_labels.symmetric_difference(auto_labels) 233 | if len(diff) != 0: 234 | errors = True 235 | textra = target_labels.difference(auto_labels) 236 | aextra = auto_labels.difference(target_labels) 237 | 238 | # A single label that is wrong 239 | if len(diff) == 2 and len(textra) == 1 and len(aextra) == 1: 240 | tlabel = textra.pop() 241 | 
ttree = target_spans[key][tlabel] 242 | alabel = aextra.pop() 243 | atree = auto_spans[key][alabel] 244 | cat_info = get_word_info(schema_spans, key, alabel) 245 | for adepth in [1, 2]: 246 | for tdepth in [1, 2]: 247 | adesc = tree_repr(atree, adepth) 248 | tdesc = tree_repr(ttree, tdepth) 249 | log(['==diff-c %d %d ==' % (tdepth, adepth), tlabel + '_' + alabel, cat_info, tdesc, adesc]) 250 | 251 | elif len(aextra) == 0: # ie, these are actually missing 252 | for tlabel in textra: 253 | ttree = target_spans[key][tlabel] 254 | atree = lowest_span(auto_spans[key].values()) 255 | cat_info = get_word_info(schema_spans, key, atree.label) 256 | for adepth in [1, 2]: 257 | for tdepth in [1, 2]: 258 | adesc = tree_repr(atree, adepth) 259 | tdesc = tree_repr(ttree, tdepth) 260 | log(['==miss %d %d ==' % (tdepth, adepth), tlabel, cat_info, tdesc, adesc]) 261 | 262 | elif len(textra) == 0: # ie, these are actually extra 263 | for alabel in aextra: 264 | atree = auto_spans[key][alabel] 265 | cat_info = get_word_info(schema_spans, key, alabel) 266 | ttree = lowest_span(target_spans[key].values()) 267 | for adepth in [1, 2]: 268 | for tdepth in [1, 2]: 269 | adesc = tree_repr(atree, adepth) 270 | tdesc = tree_repr(ttree, tdepth) 271 | log(['==extra %d %d ==' % (tdepth, adepth), alabel, cat_info, tdesc, adesc]) 272 | 273 | else: # more complicated difference 274 | for tlabel in textra: 275 | ttree = target_spans[key][tlabel] 276 | atree = lowest_span(auto_spans[key].values()) 277 | cat_info = get_word_info(schema_spans, key, atree.label) 278 | for adepth in [1, 2]: 279 | for tdepth in [1, 2]: 280 | adesc = tree_repr(atree, adepth) 281 | tdesc = tree_repr(ttree, tdepth) 282 | log(['==diff-m %d %d ==' % (tdepth, adepth), tlabel, cat_info, tdesc, adesc]) 283 | for alabel in aextra: 284 | atree = auto_spans[key][alabel] 285 | cat_info = get_word_info(schema_spans, key, alabel) 286 | ttree = lowest_span(target_spans[key].values()) 287 | for adepth in [1, 2]: 288 | for tdepth in [1, 2]: 289 | adesc = tree_repr(atree, adepth) 290 | tdesc = tree_repr(ttree, tdepth) 291 | log(['==diff-e %d %d ==' % (tdepth, adepth), alabel, cat_info, tdesc, adesc]) 292 | 293 | # 294 | # Correct brackets 295 | # 296 | for key in target_spans: 297 | if key in auto_spans: 298 | target_labels = set(target_spans[key].keys()) 299 | auto_labels = set(auto_spans[key].keys()) 300 | same = target_labels.intersection(auto_labels) 301 | for label in same: 302 | ttree = target_spans[key][label] 303 | atree = auto_spans[key][label] 304 | cat_info = get_word_info(schema_spans, key, label) 305 | for adepth in [1, 2]: 306 | for tdepth in [1, 2]: 307 | adesc = tree_repr(atree, adepth) 308 | tdesc = tree_repr(ttree, tdepth) 309 | log(['==same %d %d ==' % (tdepth, adepth), label, cat_info, tdesc, adesc]) 310 | # 311 | # General sentence info 312 | # 313 | if errors: 314 | print(target, file=out) 315 | print(auto_ptb, file=out) 316 | print("", file=out) 317 | 318 | if __name__ == '__main__': 319 | pass 320 | -------------------------------------------------------------------------------- /sample.ccgbank: -------------------------------------------------------------------------------- 1 | ID=wsj_0001.1 PARSER=GOLD NUMPARSE=1 2 | ( ( ( ( ( ( ( () () ) ) () ) ( ( ( ( () () ) ) () ) ) ) () ) ( () ( ( ( () ( () () ) ) ( () ( () ( () () ) ) ) ) ( () () ) ) ) ) () ) 3 | ID=wsj_0001.2 PARSER=GOLD NUMPARSE=1 4 | ( ( ( ( () () ) ) ( () ( ( () ) ( () ( ( ( () () ) ) ( () ( () ( () ( () () ) ) ) ) ) ) ) ) ) () ) 5 | ID=wsj_0002.1 PARSER=GOLD NUMPARSE=1 6 | ( ( 
( ( ( ( ( () () ) ) () ) ( ( ( ( ( () () ) ) () ) ( () ( ( ( () () ) ) ( () ( ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) () ) ( () ( () ( ( () ( () () ) ) ( () ( () ( () ( () () ) ) ) ) ) ) ) ) () ) 7 | ID=wsj_0003.1 PARSER=GOLD NUMPARSE=1 8 | ( ( ( ( ( ( () () ) ( () ( () ) ) ) ( ( () ( () ( () ( () ( ( () ( () () ) ) ) ) ) ) ) ) ) ( () ( () ( ( ( () ( () () ) ) ( () ( ( () () ) ) ) ) ( () ( ( () () ) ( () ( ( () ) ( ( ( () ( () () ) ) ( ( ( ( ( () () ) () ) () ) ) () ) ) ) ) ) ) ) ) ) ) ) ( () ( ( () ) () ) ) ) () ) 9 | ID=wsj_0003.2 PARSER=GOLD NUMPARSE=1 10 | ( ( ( ( ( ( () ( () () ) ) ( () ( () ) ) ) () ) ( ( ( ( () ( () () ) ) ( () ( () ( () ( () () ) ) ) ) ) () ) ( ( () ( ( ( () ( () () ) ) ) ( () () ) ) ) ( () ( ( () ) ( () ( ( () () ) ( ( () ) () ) ) ) ) ) ) ) ) ( () ( ( () ) () ) ) ) () ) 11 | ID=wsj_0003.3 PARSER=GOLD NUMPARSE=1 12 | ( ( ( ( ( ( () () ) ) ( () ( ( ( () () ) ( () ( ( ( () () ) ( () () ) ) ) ) ) ( () ( () ( ( () () ) ) ) ) ) ) ) () ) ( () ( ( ( () ( () ) ) ( () ( () ( () ( () () ) ) ) ) ) ( () ( () ) ) ) ) ) () ) 13 | ID=wsj_0003.4 PARSER=GOLD NUMPARSE=1 14 | ( ( ( () ( ( ( () () ) ) ( () ( () ( ( ( ( () () ) () ) () ) () ) ) ) ) ) ( () ( ( () ( () () ) ) ( () ( () ( ( ( ( ( () ) () ) ( () ( () () ) ) ) ( () ( () ) ) ) ( () ( ( () () ) ( ( () ( () ( ( () ( ( () () ) ) ) ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) ) () ) 15 | ID=wsj_0003.5 PARSER=GOLD NUMPARSE=1 16 | ( ( ( () ( () () ) ) ( ( () () ) ( () ( () ( () ( () () ) ) ) ) ) ) () ) 17 | ID=wsj_0003.6 PARSER=GOLD NUMPARSE=1 18 | ( ( () ( () ( () ( () ( ( ( () ) () ) ( () ( () ( () ( ( () ( () ) ) ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) () ) 19 | ID=wsj_0003.7 PARSER=GOLD NUMPARSE=1 20 | ( ( () ( ( ( () ( () () ) ) ( () ( () () ) ) ) () ) ) () ) 21 | -------------------------------------------------------------------------------- /rule.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Just handle unary rules, working out when one is being used 3 | 4 | import re 5 | import category 6 | 7 | # from, to, keep original dependencies, activated by extra flag 8 | UNARIES = [ 9 | ('S[adj]\\NP','NP\\NP',False,False,[ 10 | '(ADJP 0)', 11 | '(NP {1} 0) arg:(NP PP ...):', 12 | '(NP 1 0) arg:default:']), 13 | ('S[to]\\NP','NP\\NP',True,False,[ 14 | '{(TEMP 0)}', 15 | '(NP {1} (SBAR 0)) arg:(NP PP ...):', 16 | '(NP 1 (SBAR 0)) arg:default:']), 17 | ('S[dcl]/NP','NP\\NP',True,False,[ 18 | '(SBAR 0)', 19 | '(NP {1} 0) arg:(NP PP ...):', 20 | '(NP 1 0) arg:default:']), 21 | ('(S[to]\\NP)/NP','NP\\NP',True,True,[]), 22 | ('S[dcl]','NP\\NP',False,True,[]), 23 | 24 | ('S[pss]\\NP','S/S',False,True,[]), 25 | ('S[ng]\\NP','S/S',False,False,[ 26 | '(S 0)', 27 | '(S* 0 {1})']), 28 | ('S[adj]\\NP','S/S',False,True,[]), 29 | ('S[ng]\\NP','S\\S',False,True,[]), 30 | ('S[dcl]','S\\S',False,True,[]), 31 | ('S/S','S\\S',False,False,[]), 32 | ('S[to]\\NP','S/S',False,True,[]), 33 | 34 | ('S[pss]\\NP','(S\\NP)\\(S\\NP)',False,True,[]), 35 | ('S[ng]\\NP','(S\\NP)\\(S\\NP)',False,False,[ 36 | '(S 0)', 37 | '(VP {1} 0)', 38 | '(S 1 0)']), 39 | ('S[adj]\\NP','(S\\NP)\\(S\\NP)',False,False,[ 40 | '(S (ADJP 0))', 41 | '(VP {1} 0)', 42 | '(S 1 0)']), 43 | ('S[to]\\NP','(S\\NP)\\(S\\NP)',False,False,[ 44 | '(S 0)', 45 | '(VP {1} 0)', 46 | '(S 1 0)']), 47 | 48 | ('S[ng]\\NP','NP',False,True,[ 49 | '(S {0})']), 50 | ('N','NP',False,False,[ 51 | '(NP {0})']), 52 | # Need to implement filtering based on self... 53 | ### ('N','NP',False,False,[ 54 | ### '{(TEMP 0)}', 55 | ### '(QP {0}) self:(... 
QP):', 56 | ### '(QP 0) self:(... CD):', 57 | ### '(NP {0}) self:default:']), 58 | 59 | ('S[ng]\\NP','(S\\NP)/(S\\NP)',False,True,[]), 60 | ('S[to]\\NP','N\\N',True,False,[]), 61 | ('NP','NP/(NP\\NP)',False,True,[]), 62 | ('S[dcl][conj]','S[dcl]',False,False,[ 63 | '{(TEMP 0)}']), 64 | ('PP','(S\\NP)\\((S\\NP)/PP)',False,False,[]), 65 | ('S[to]\\NP','(S\\NP)\\((S\\NP)/(S[to]\\NP))',False,False,[]), 66 | ('S[adj]\\NP','(S\\NP)\\((S\\NP)/(S[adj]\\NP))',False,False,[]), 67 | ('NP','S/(S\\NP)',False,False,[]), 68 | ('NP','(S\\NP)\\((S\\NP)/NP)',False,False,[]), 69 | ('NP','((S\\NP)/NP)\\(((S\\NP)/NP)/NP)',False,False,[]), 70 | ('NP','((S\\NP)/(S[to]\\NP))\\(((S\\NP)/(S[to]\\NP))/NP)',False,False,[]), 71 | ('NP','((S\\NP)/PP)\\(((S\\NP)/PP)/NP)',False,False,[]), 72 | ('NP','((S\\NP)/(S[adj]\\NP))\\(((S\\NP)/(S[adj]\\NP))/NP)',False,False,[]), 73 | ('NP','S/(S/NP)',False,False,[ 74 | '{(TEMP 0)}', 75 | '(S 0 {1})']), 76 | 77 | ('S[dcl]','((S\\NP)\\(S\\NP))\\((S\\NP)\\(S\\NP))',False,False,[ 78 | '(SBAR 0)', 79 | '(NP 1 0)', 80 | '(VP {1} 0)', 81 | '(S 1 0)']), 82 | 83 | ('S[X]\\NP','NP\\NP',True,False,[]) 84 | ] 85 | 86 | def get_unary(start_cat, end_cat, markedup=None): 87 | # Note: PP_qus - for questions only, ignored for now 88 | for unary in UNARIES: 89 | start = unary[0] 90 | end_markup = unary[1] 91 | end = category.strip_braces(end_markup) 92 | keep_deps = unary[2] 93 | extra = unary[3] 94 | rules = unary[4] 95 | if category.compare(start_cat, start): 96 | if category.compare(end_cat, end): 97 | if len(rules) > 0: 98 | return rules 99 | elif markedup is not None: 100 | if end in markedup: 101 | return markedup[end][1:] 102 | end_no_brac = category.strip_square_brackets(end) 103 | if end_no_brac in markedup: 104 | return markedup[end_no_brac][1:] 105 | else: 106 | return [] 107 | return None 108 | 109 | BINARIES = [ 110 | (',','NP','(S\\NP)\\(S\\NP)',False,[ 111 | '(ADVP {0} 1)', 112 | '(VP {1} {0})', 113 | '(S 1 0)']), 114 | ('NP',',','S/S',False,[ 115 | '(S (S 0) 1)', 116 | '(S* {0} 1)']), 117 | ('S[dcl]\\S[dcl]',',','S/S',False,[ 118 | '(PRN (SINV 0) 1)', 119 | '(S* 0 1)']), 120 | ('S[dcl]/S[dcl]',',','(S\\NP)/(S\\NP)',False,[ 121 | '(S 0 1)', 122 | '(S {0} 1)', 123 | '(S 1 {0})']), 124 | ('S[dcl]/S[dcl]',',','(S\\NP)\\(S\\NP)',False,[ 125 | '(S 0 1)', 126 | '(S 1 {0})', 127 | '(S 1 {0})']), 128 | ('S[dcl]/S[dcl]',',','S/S',False,[ 129 | '(S 0 1)', 130 | '(S* {0} {1})']), 131 | ('S[dcl]/S[dcl]',',','S\\S',False,[ 132 | '(S 0 1)', 133 | '(S* {1} {0})']), 134 | 135 | # not generated by C&C 136 | ('S[dcl]',',','S/S',False,[ 137 | '(S {0} 1)', 138 | '(S* 0 {1})']), 139 | ('S[dcl]',',','S\\S',False,[ 140 | '(S (PRN 0) 1)', 141 | '(S* {1} {0})']), 142 | ('S[dcl]',',','NP\\NP',False,[ 143 | '(S {0} 1)', 144 | '(NP 1 0)']), 145 | ('S[adj]\\NP',',','NP\\NP',False,[ 146 | '(S {0} 1)', 147 | '(NP 1 0)']), 148 | ('S[dcl]',',','(S\\NP)\\(S\\NP)',False,[ 149 | '(S 0 1)', 150 | '(VP {1} 0)', 151 | '(S 1 0)']), 152 | ('((S[pss]\\NP)/PP)/NP','(S\\NP)\\(S\\NP)','((S[pss]\\NP)/PP)/NP',False,[ 153 | '(VP {0} 1)', 154 | '(VP {0} 3)', 155 | '(VP {0} 2)', 156 | '(S 1 0)']), 157 | ('S[dcl]/S[dcl]',',','NP\\NP',False,[ 158 | '(S {0} 1)', 159 | '(NP 1 0)']), 160 | ('S[dcl]\\S[dcl]',',','(S\\NP)\\(S\\NP)',False,[ 161 | '{(TEMP 0 1)}', 162 | '(VP {1} 0)', 163 | '(S 1 0)']), 164 | ('S[dcl]\\S[dcl]',',','(S\\NP)/(S\\NP)',False,[ 165 | '(PRN (SINV 0) 1)', 166 | '(S 0 1)', 167 | '(S 1 {0})']) 168 | ### ('S[dcl]\\S[dcl]',',','S\\S',False,[]) 169 | ### 
('((S[dcl]\\NP)/PP)/NP','(S\\NP)\\(S\\NP)','((S[dcl]\\NP)/PP)/NP',False,[]) 170 | ### ('((S[dcl]\\NP[expl])/(S[to]\\NP))/(S[adj]\\NP)','(S\\NP)\\(S\\NP)','((S[dcl]\\NP[expl])/','(S[to]\\NP))/(S[adj]\\NP)',False,[]) 171 | ### ('((S[dcl]\\NP[expl])/(S[to]\\NP))/NP','(S\\NP)\\(S\\NP)','((S[dcl]\\NP[expl])/(S[to]\\NP))/NP',False,[]) 172 | ### ('((S[dcl]\\NP[expl])/S[dcl])/(S[adj]\\NP)','(S\\NP)\\(S\\NP)','((S[dcl]\\NP[expl])/S[dcl])/','(S[adj]\\NP)',False,[]) 173 | ### ('((S[dcl]\\NP[expl])/S[dcl])/NP','(S\\NP)\\(S\\NP)','((S[dcl]\\NP[expl])/S[dcl])/NP',False,[]) 174 | ### ('((S[dcl]\\NP[expl])/S[qem])/(S[adj]\\NP)','(S\\NP)\\(S\\NP)','((S[dcl]\\NP[expl])/S[qem])/','(S[adj]\\NP)',False,[]) 175 | ### ('((S[ng]\\NP)/PP)/NP','(S\\NP)\\(S\\NP)','((S[ng]\\NP)/PP)/NP',False,[]) 176 | ### ('(S[dcl]\\(S[to]\\NP))/(S[b]\\NP)','S\\S','(S[dcl]\\(S[to]\\NP))/(S[b]\\NP)',False,[]) 177 | ### ('(S[dcl]\\S[dcl])\\NP','S\\S','(S[dcl]\\S[dcl])\\NP',False,[]) 178 | ### ('(S[q]/(S[b]\\NP))/NP','S\\S','(S[q]/(S[b]\\NP))/NP',False,[]) 179 | 180 | ### ('(S\\NP)/(S\\NP)','(S[ng]\\NP)\\(S[adj]\\NP)','(S[ng]\\NP)\\(S[adj]\\NP)',False,['(VP 0 1)','(ADJP 1 0)','(S 1 0)']) 181 | ] 182 | 183 | def get_binary_for_markedup(left, right, result, markedup=None, flexible=False): 184 | for binary in BINARIES: 185 | if category.compare(left, binary[0]): 186 | if category.compare(right, binary[1]): 187 | if category.compare(result, binary[2]): 188 | keep_deps = binary[3] 189 | rules = binary[4] 190 | if len(rules) > 0: 191 | return rules 192 | elif markedup is not None: 193 | return ['(S 0 1)'] + markedup[result][1:] 194 | else: 195 | return [] 196 | if flexible: 197 | for binary in BINARIES: 198 | if category.compare(result, binary[2]): 199 | rules = binary[4] 200 | if len(rules) > 0: 201 | return rules 202 | elif markedup is not None: 203 | return ['(S 0 1)'] + markedup[result][1:] 204 | else: 205 | return [] 206 | if markedup is not None: 207 | return ['(S 0 1)'] + markedup[result][1:] 208 | return None 209 | 210 | def get_binary(left, right, result, markedup=None): 211 | for binary in BINARIES: 212 | if category.compare(left, binary[0]): 213 | if category.compare(right, binary[1]): 214 | if category.compare(result, binary[2]): 215 | keep_deps = binary[3] 216 | rules = binary[4] 217 | if len(rules) > 0: 218 | return rules 219 | elif markedup is not None: 220 | return ['(S 0 1)'] + markedup[result][1:] 221 | else: 222 | return [] 223 | return None 224 | 225 | def determine_combinator(source, result): 226 | ### print(len(source)) 227 | ### print(' '.join(source), result) 228 | if len(source) == 0: 229 | return 'lex' 230 | if len(source) == 1: 231 | if get_unary(source[0].category, result) is not None: 232 | return 'unary' 233 | return 'type' 234 | if len(source) == 2: 235 | left = source[0].category 236 | right = source[1].category 237 | result_parts = category.divide(result) 238 | left_parts = category.divide(left) 239 | right_parts = category.divide(right) 240 | 241 | if get_binary(left, right, result) is not None: 242 | return 'binary' 243 | 244 | # Coordination 245 | # X = X CONJ X 246 | if left == 'conj' or (result.endswith('[conj]') and not '[conj]' in right): 247 | if right == 'conj\\conj': 248 | return 'fa.b' 249 | return 'conj1' 250 | elif 'conj' in source[1].rule or '[conj]' in right: 251 | if category.compare(left, right): 252 | return 'conj2' 253 | if category.compare(category.divide(left)[2], right) and category.divide(left)[1] == '/': 254 | return 'fa.f' 255 | if category.compare(category.divide(right)[0], left) and 
category.divide(right)[1] is not None: 256 | if 'conj2' in source[1].rule or '[conj]' in right and category.compare(category.divide(right)[2], left): 257 | return 'fa.b' 258 | else: 259 | return 'conj1' 260 | if category.compare(category.divide(right)[2], left): 261 | return 'fa.b' 262 | if (category.compare(left_parts[2], result_parts[2]) and 263 | category.compare(left_parts[0], right_parts[2]) and 264 | category.compare(right_parts[0], result_parts[0]) and 265 | left_parts[1] == result_parts[1] == '/' and 266 | right_parts[1] == '\\'): 267 | return 'cc.b' 268 | if (category.compare(left_parts[2], right_parts[0]) and 269 | category.compare(left_parts[0], result_parts[0]) and 270 | category.compare(right_parts[2], result_parts[2]) and 271 | left_parts[1] == right_parts[1] == result_parts[1] == '/'): 272 | return 'fc.f' 273 | if (category.compare(left_parts[2], result_parts[2]) and 274 | category.compare(left_parts[0], right_parts[2]) and 275 | category.compare(right_parts[0], result_parts[0]) and 276 | left_parts[1] == right_parts[1] == result_parts[1] == '\\'): 277 | return 'fc.b' 278 | if category.compare(result, left): 279 | if '[conj]' in result: 280 | return 'conj2' 281 | raw_right = right 282 | if '[conj]' in right: 283 | raw_right = right[:-6] 284 | if category.compare(result, raw_right): 285 | return 'conj2' 286 | else: 287 | return 'conj2' 288 | elif 'conj1' in source[0].rule or '[conj]' in left: 289 | return 'conj2' 290 | # consider conj3, to handle , separated lists 291 | 292 | # Function application 293 | # X = X/Y + Y 294 | if (left_parts[1] == '/' and 295 | category.compare(left_parts[2], right) and 296 | category.compare(left_parts[0], result)): 297 | return 'fa.f' 298 | # X = Y + X\\Y 299 | if (right_parts[1] == '\\' and 300 | category.compare(right_parts[2], left) and 301 | category.compare(right_parts[0], result)): 302 | return 'fa.b' 303 | 304 | # Function composition 305 | # X/Z = X/Y + Y/Z 306 | if (category.compare(left_parts[2], right_parts[0]) and 307 | category.compare(left_parts[0], result_parts[0]) and 308 | category.compare(right_parts[2], result_parts[2]) and 309 | left_parts[1] == right_parts[1] == result_parts[1] == '/'): 310 | return 'fc.f' 311 | # X\\Z = Y\\Z + X\\Y 312 | if (category.compare(left_parts[2], result_parts[2]) and 313 | category.compare(left_parts[0], right_parts[2]) and 314 | category.compare(right_parts[0], result_parts[0]) and 315 | left_parts[1] == right_parts[1] == result_parts[1] == '\\'): 316 | return 'fc.b' 317 | 318 | # Crossed composition 319 | # X/Z = Y/Z + X\\Y 320 | # For example: 321 | # (S\\NP)/(S\\NP) = (S\\NP)/(S\\NP) + (S\\NP)\\(S\\NP) 322 | if (category.compare(left_parts[2], result_parts[2]) and 323 | category.compare(left_parts[0], right_parts[2]) and 324 | category.compare(right_parts[0], result_parts[0]) and 325 | left_parts[1] == result_parts[1] == '/' and 326 | right_parts[1] == '\\'): 327 | return 'cc.b' 328 | # Z\\X = Z/Y + Y\\X 329 | # ((S\\NP)/S)/(S\\NP) = ((S\\NP)/S)/(S\\NP) + (S\\NP)\\(S\\NP) 330 | 331 | # Backward crossed substitution 332 | # X/Z = B/Z + (X\\B)/Z 333 | if (left_parts[1] == right_parts[1] == result_parts[1] == '/' and 334 | category.compare(left_parts[2], result_parts[2]) and 335 | category.compare(right_parts[2], result_parts[2])): 336 | sub_parts = category.divide(right_parts[0]) 337 | if (category.compare(sub_parts[0], result_parts[0]) and 338 | category.compare(sub_parts[2], left_parts[0]) and 339 | sub_parts[1] != left_parts[1]): 340 | return 'bs.f' 341 | # X\\Z = (X/B)\\Z + B\\Z 342 | if 
(left_parts[1] == right_parts[1] == result_parts[1] == '\\' and 343 | category.compare(left_parts[2], result_parts[2]) and 344 | category.compare(right_parts[2], result_parts[2])): 345 | sub_parts = category.divide(left_parts[0]) 346 | if (sub_parts[0] == result_parts[0] and 347 | sub_parts[2] == right_parts[0] and 348 | sub_parts[1] != right_parts[1]): 349 | return 'bs.b' 350 | # There are restrictions on what B can be, but since this is a parse, and 351 | # all other options have been exhausted, this must be what is going on 352 | 353 | # Uncomment to see what is misc: 354 | ### if left == result and '/' not in right and '\\' not in right: 355 | ### pass 356 | ### elif right == result and '/' not in left and '\\' not in left: 357 | ### pass 358 | ### elif '[conj]' in left or '[conj]' in right or '[conj]' in result: 359 | ### pass 360 | ### else: 361 | ### print('misc rule:', left, right, result) 362 | ### print(' ', left_parts) 363 | ### print(' ', right_parts) 364 | ### print(' ', result_parts) 365 | if category.divide(result)[0] == right and category.divide(result)[1] is not None: 366 | return 'conj1' 367 | return 'misc' 368 | 369 | if __name__ == '__main__': 370 | pass 371 | -------------------------------------------------------------------------------- /trees.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys, re 4 | import category, rule 5 | 6 | class Tree: 7 | def __init__(self, text): 8 | self.text = text 9 | self.subtrees = [] 10 | self.word = None 11 | 12 | def get_words(self): 13 | if self.word is not None: 14 | return [self.word] 15 | words = [] 16 | for tree in self.subtrees: 17 | words += tree.get_words() 18 | return words 19 | 20 | # switching back to PTB scheme 21 | word_to_word_mapping = { 22 | '{': '-LCB-', 23 | '}': '-RCB-' 24 | } 25 | word_to_POS_mapping = { 26 | '--': ':', 27 | '-': ':', 28 | ';': ':', 29 | ':': ':', 30 | '-LRB-': '-LRB-', 31 | '-RRB-': '-RRB-', 32 | '-LCB-': '-LRB-', 33 | '-RCB-': '-RRB-', 34 | '{': '-LRB-', 35 | '}': '-RRB-', 36 | 'Wa': 'NNP' 37 | } 38 | def get_PTB_word(word): 39 | global word_to_word_mapping 40 | if word in word_to_word_mapping: 41 | word = word_to_word_mapping[word] 42 | return word 43 | def get_PTB_label(label, word): 44 | global word_to_POS_mapping 45 | if word in word_to_POS_mapping: 46 | label = word_to_POS_mapping[word] 47 | return label 48 | 49 | class CCG_Tree(Tree): 50 | # Convert line of CCGBank to a tree. 
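# Each node in CCGbank's .auto format carries a tag between angle brackets:
# internal nodes are written <T category head-index daughter-count> and leaves
# <L category mod-POS orig-POS word pred-arg-category>. The code below reads
# the category (dropping any [X] placeholder feature), selects the head
# daughter by index, and for leaves also records the POS tag and the word.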
This is an example line: 51 | # ( ( ( ( () () ) ) ( () ( () ) ) ) () ) 52 | # This expands to: 53 | # ( 54 | # ( 55 | # ( 56 | # ( 57 | # () 58 | # () ) ) 59 | # ( 60 | # () 61 | # ( 62 | # () ) ) ) 63 | # () ) 64 | def __init__(self, text='', pos=0): 65 | Tree.__init__(self, text) 66 | self.label = '' 67 | self.category = None 68 | self.orig_category = None 69 | self.pos = None 70 | self.word = None 71 | self.head = None 72 | self.rule = None 73 | if text == '': 74 | return 75 | if '' and self.label == '': 89 | self.label = text[pos + 2:i] 90 | # we've reached the end of the scope for this bracket 91 | if depth < 0: 92 | break 93 | parts = self.label.split() 94 | self.category = ''.join(parts[1].split('[X]')) 95 | self.orig_category = self.category 96 | # Fix a sentence with two broken categories in CCGBank (0595.15) 97 | if self.category[-1] in '\\/': 98 | self.category = self.category + 'NP' 99 | self.rule = rule.determine_combinator(self.subtrees, self.category) 100 | if 'conj' in self.rule: 101 | if not self.category.endswith('[conj]') and not category.compare(self.category, self.subtrees[1].category): 102 | if self.subtrees[1].category.endswith('[conj]'): 103 | self.category = self.subtrees[1].category 104 | else: 105 | self.category = self.subtrees[1].category + '[conj]' 106 | if len(parts) == 4: 107 | if len(self.subtrees) > 0: 108 | self.head = self.subtrees[0] 109 | if parts[2] == '1' and len(self.subtrees) == 2: 110 | self.head = self.subtrees[1] 111 | elif len(parts) == 6: 112 | self.pos = parts[3] 113 | self.word = parts[4] 114 | else: 115 | # Handle fowler input 116 | self.label = text[pos:].split()[0][1:] 117 | self.category = ')'.join('('.join(self.label.split('{')).split('}')) 118 | self.orig_category = self.category 119 | 120 | depth = 0 121 | for i in range(pos + 1, len(text)): 122 | if depth < 0: 123 | break 124 | char = text[i] 125 | # update the depth 126 | if char == '(': 127 | depth += 1 128 | if depth == 1: 129 | self.subtrees.append(CCG_Tree(text, i)) 130 | elif char == ')': 131 | depth -= 1 132 | if len(self.subtrees) == 0: 133 | pos = i 134 | for j in range(i, 0, -1): 135 | if text[j] == ' ': 136 | pos = j 137 | break 138 | self.word = text[pos + 1:i] 139 | break 140 | 141 | self.rule = rule.determine_combinator(self.subtrees, self.category) 142 | if 'conj' in self.rule: 143 | if not self.category.endswith('[conj]') and not category.compare(self.category, self.subtrees[1].category): 144 | if self.subtrees[1].category.endswith('[conj]'): 145 | self.category = self.subtrees[1].category 146 | else: 147 | self.category = self.subtrees[1].category + '[conj]' 148 | if self.word is not None: 149 | self.pos = "UNK" 150 | if self.word == '.': 151 | self.pos = '.' 152 | if self.word == ',': 153 | self.pos = ',' 154 | if self.word == '...': 155 | self.pos = ':' 156 | if self.word == '?': 157 | self.pos = '.' 158 | if self.word == '!': 159 | self.pos = '.' 
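    # A minimal usage sketch of this module together with the converters
    # ("derivations.auto" is a hypothetical file name; convert.py does the
    # same thing via trees.read_CCG_tree / trees.read_PTB_tree and then scores
    # the output against the gold PTB trees):
    #
    #   import trees, trivial
    #   with open("derivations.auto") as ccg_file:
    #       for line in ccg_file:
    #           if line.startswith("("):
    #               ccg = trees.CCG_Tree(line.strip())
    #               ptb = trivial.convert(ccg)
    #               print(ptb.one_line_repr())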
160 | 161 | 162 | def get_node(self, span, pos=None, min_enclosing=False): 163 | return_ans_only = False 164 | if pos is None: 165 | pos = 0 166 | return_ans_only = True 167 | start = pos 168 | ans = None 169 | if self.word is not None: 170 | labels_to_ignore = set(["-NONE-","TOP"]) 171 | words_to_ignore = set(["'","`","''","``",".","--",":",";","-",",","..."]) 172 | if self.word not in words_to_ignore and self.label not in labels_to_ignore: 173 | pos += 1 174 | else: 175 | for subtree in self.subtrees: 176 | pos, sub_ans = subtree.get_node(span, pos, min_enclosing) 177 | if sub_ans is not None: 178 | ans = sub_ans 179 | end = pos 180 | if min_enclosing: 181 | if ans is None and start <= span[0] and end >= span[1]: 182 | ans = self 183 | else: 184 | if start == span[0] and end == span[1]: 185 | ans = self 186 | if return_ans_only: 187 | return ans 188 | else: 189 | return end, ans 190 | 191 | def contains_rule(self, text): 192 | if text in self.rule: 193 | return True 194 | for subtree in self.subtrees: 195 | if subtree.contains_rule(text): 196 | return True 197 | return False 198 | 199 | def all_word_yield(self, span=None, pos=0): 200 | if self.word is not None: 201 | if span is None or span[0] <= pos < span[1]: 202 | return (pos + 1, self.word) 203 | else: 204 | return (pos + 1, '') 205 | else: 206 | text = [] 207 | for subtree in self.subtrees: 208 | pos, words = subtree.all_word_yield(span, pos) 209 | if words != '': 210 | text.append(words) 211 | return (pos, ' '.join(text)) 212 | 213 | def __repr__(self, depth=0): 214 | ans = '\n' + depth * '\t' 215 | ans += '(' 216 | if self.category is None: 217 | ans += 'EMPTY EMPTY)' 218 | return ans 219 | if self.rule is not None: 220 | ans += self.rule + ' ' 221 | ans += self.category 222 | if self.pos is not None: 223 | pos = get_PTB_label(self.pos, self.word) 224 | ans += ' ' + pos 225 | if self.word is not None: 226 | ans += ' ' + get_PTB_word(self.word) 227 | for subtree in self.subtrees: 228 | ans += subtree.__repr__(depth + 1) 229 | ans += ')' 230 | return ans 231 | 232 | 233 | 234 | class PTB_Tree(Tree): 235 | # Convert text from the PTB to a tree. For example: 236 | # ( (S (NP-SBJ (NNP Ms.) (NNP Haag) ) (VP (VBZ plays) (NP (NNP Elianti) )) (. .) )) 237 | # This is a compressed form of: 238 | # ( (S 239 | # (NP-SBJ (NNP Ms.) (NNP Haag) ) 240 | # (VP (VBZ plays) 241 | # (NP (NNP Elianti) )) 242 | # (. .) 
)) 243 | def __init__(self, text='', pos=0): 244 | Tree.__init__(self, text) 245 | self.label = '' 246 | self.pos = None 247 | depth = 0 248 | for i in range(pos + 1, len(text)): 249 | char = text[i] 250 | # update the depth 251 | if char == '(': 252 | depth += 1 253 | if depth == 1: 254 | self.subtrees.append(PTB_Tree(text, i)) 255 | elif char == ')': 256 | depth -= 1 257 | if len(self.subtrees) == 0: 258 | pos = i 259 | for j in range(i, 0, -1): 260 | if text[j] == ' ': 261 | pos = j 262 | break 263 | self.word = text[pos + 1:i] 264 | 265 | # we've reached the end of the category that is the root of this subtree 266 | if depth == 0 and char == ' ' and self.label == '': 267 | self.label = text[pos + 1:i] 268 | # we've reached the end of the scope for this bracket 269 | if depth < 0: 270 | break 271 | if self.word is not None: 272 | self.pos = self.label 273 | 274 | def word_yield(self, span=None, pos=0): 275 | labels_to_ignore = set([",", "-NONE-", "TOP", ":", "."]) 276 | words_to_ignore = set(["'","`","''","``"]) 277 | # ignore quotes as they won't always be present 278 | if self.label in labels_to_ignore: 279 | return (pos, '') 280 | if self.word is not None: 281 | if self.word in words_to_ignore: 282 | return (pos, '') 283 | if span is None or span[0] <= pos < span[1]: 284 | return (pos + 1, self.word) 285 | else: 286 | return (pos + 1, '') 287 | else: 288 | text = [] 289 | for subtree in self.subtrees: 290 | pos, words = subtree.word_yield(span, pos) 291 | if words != '': 292 | text.append(words) 293 | return (pos, ' '.join(text)) 294 | 295 | def __repr__(self, depth=0): 296 | ans = '' 297 | if depth > 0: 298 | ans += '\n' 299 | ans += depth * '\t' 300 | ans += '(' + get_PTB_label(self.label, self.word) 301 | if self.word is not None: 302 | ans += ' ' + get_PTB_word(self.word) 303 | for subtree in self.subtrees: 304 | ans += subtree.__repr__(depth + 1) 305 | ans += ')' 306 | return ans 307 | 308 | def one_line_repr(self): 309 | ans = '(' + get_PTB_label(self.label, self.word) 310 | if self.word is not None: 311 | return ans + ' ' + get_PTB_word(self.word) + ')' 312 | for subtree in self.subtrees: 313 | ans += ' ' + subtree.one_line_repr() 314 | ans += ')' 315 | return ans 316 | 317 | def repr_with_corrections(self, gold_spans, depth=0, pos=0,parent=None): 318 | return_str = False 319 | if type(gold_spans) != type({}): 320 | return_str = True 321 | span_dict = {} 322 | gold_spans.get_spans(span_dict) 323 | gold_spans = span_dict 324 | 325 | # note - does not print missing spans that cover parts of present spans 326 | start_missing = "\033[01;36m" 327 | start_extra = "\033[01;31m" 328 | start_wrong_label = "\033[01;33m" 329 | end_colour = "\033[00m" 330 | 331 | start = '' 332 | if depth > 0: 333 | start += '\n' 334 | start += depth * '\t' 335 | # Handle the POS-word case 336 | labels_to_ignore = set(["-NONE-", "TOP", ":", "."]) 337 | words_to_ignore = set(["'","`","''", "``", "--",":",";","-",","]) 338 | if self.word is not None: 339 | text = '' 340 | if self.label not in labels_to_ignore and (self.word is None or self.word not in words_to_ignore): 341 | if self.label not in gold_spans[(pos, pos+1)] and self.word not in gold_spans[(pos, pos+1)]: 342 | text = '%s(%s%s %s%s)' % (start, start_extra, self.label, self.word, end_colour) 343 | text += ' BROKEN WORD' 344 | elif len(gold_spans[(pos, pos+1)]) > 1 and parent is not None and len(parent.subtrees) > 1: 345 | punc_count = 0 346 | for subtree in parent.subtrees: 347 | if subtree.label in labels_to_ignore: 348 | punc_count += 1 349 | if 
punc_count != len(parent.subtrees) - 1: 350 | to_cover = gold_spans[(pos, pos+1)] 351 | covered = set() 352 | covered.add(self.word) 353 | missed = to_cover.difference(covered) 354 | text = '%s%s(%s%s' % (start, start_missing, ' '.join(missed), end_colour) 355 | text += '%s(%s %s)' % (start + '\t', self.label, self.word) 356 | text += '%s)%s' % (start_missing, end_colour) 357 | pos += 1 358 | if text == '': 359 | text = '%s(%s %s)' % (start, self.label, self.word) 360 | if return_str: 361 | return text 362 | else: 363 | return (pos, text) 364 | # Handle when constituents are present 365 | init = pos 366 | children = [(pos, '')] 367 | for subtree in self.subtrees: 368 | pos, text = subtree.repr_with_corrections(gold_spans, depth + 1, pos, self) 369 | children.append((pos, text)) 370 | final = pos 371 | text = start 372 | extra = (init, final) not in gold_spans 373 | wrong_label = False 374 | if extra: 375 | text += start_extra + '(' + self.label + end_colour 376 | elif self.label not in gold_spans[(init, final)]: 377 | if len(gold_spans[(init, final)]) == 1 and final - init == 1: 378 | # actually an extra bracket, just confused by POS 379 | text += start_extra + '(' + self.label + end_colour 380 | extra = True 381 | elif parent is not None and len(parent.subtrees) > 1: 382 | # check if all but one subtree is punctuation 383 | punc_count = 0 384 | for subtree in parent.subtrees: 385 | if subtree.label in labels_to_ignore: 386 | punc_count += 1 387 | if punc_count != len(parent.subtrees) - 1: 388 | to_cover = gold_spans[(init, final)] 389 | covered = set() 390 | covered.add(self.label) 391 | subtree = self 392 | while len(subtree.subtrees) == 1: 393 | subtree = subtree.subtrees[0] 394 | covered.add(subtree.label) 395 | covered.add(subtree.word) 396 | missed = to_cover.difference(covered) 397 | text += start_wrong_label + '(' + self.label + end_colour 398 | wrong_label = True 399 | text += ' ' + start_missing + '_'.join(missed) + end_colour 400 | else: 401 | text += start_wrong_label + '(' + self.label + end_colour 402 | wrong_label = True 403 | else: 404 | text += start_wrong_label + '(' + self.label + end_colour 405 | wrong_label = True 406 | elif len(gold_spans[(init, final)]) > 1 and (parent is None or len(parent.subtrees) > 1): 407 | # check if all but one subtree is punctuation 408 | punc_count = 0 409 | if parent is not None: 410 | for subtree in parent.subtrees: 411 | if subtree.label in labels_to_ignore: 412 | punc_count += 1 413 | if parent is None or punc_count != len(parent.subtrees) - 1: 414 | # this is right, but there are other that should be here too 415 | to_cover = gold_spans[(init, final)] 416 | covered = set() 417 | covered.add(self.label) 418 | subtrees = self.subtrees 419 | punc_count = 0 420 | for subtree in subtrees: 421 | if subtree.label in labels_to_ignore: 422 | punc_count += 1 423 | while len(subtrees) - punc_count == 1: 424 | cur = subtrees[0] 425 | for subtree in subtrees: 426 | if subtree.label not in labels_to_ignore: 427 | cur = subtree 428 | break 429 | covered.add(cur.label) 430 | subtrees = cur.subtrees 431 | if len(subtrees) == 0: 432 | covered.add(cur.word) 433 | punc_count = 0 434 | for subtree in subtrees: 435 | if subtree.label in labels_to_ignore: 436 | punc_count += 1 437 | missed = to_cover.difference(covered) 438 | text += '(' + self.label 439 | if len(missed) > 0: 440 | text += ' ' + start_missing + '_'.join(missed) + end_colour 441 | else: 442 | text += '(' + self.label 443 | else: 444 | # it's correct 445 | text += '(' + self.label 446 | 447 | # 
now consider groupings of the children 448 | for length in range(2, len(children) - 1): 449 | for i in range(len(children)): 450 | if i + length >= len(children): 451 | continue 452 | if children[i][0] == children[i+1][0]: 453 | continue 454 | if children[i+length][0] == children[i + length-1][0]: 455 | continue 456 | if length == len(children) - 2 and i == 1 and children[0][0] == children[1][0]: 457 | continue 458 | if length == len(children) - 2 and i == 0 and children[-1][0] == children[-2][0]: 459 | continue 460 | if (children[i][0], children[i + length][0]) in gold_spans: 461 | # this is a missing span 462 | # 1 - indent 463 | for k in range(i+1, i+length+1): 464 | cpos, ctext = children[k] 465 | ctext = '\n\t'.join(ctext.split('\n')) 466 | children[k] = (cpos, ctext) 467 | # 2 - add open bracket and label(s) to first entry 468 | cpos, ctext = children[i+1] 469 | pretext = '\n' 470 | pretext += (depth + 1) * '\t' + start_missing + '(' 471 | pretext += '/'.join(gold_spans[(children[i][0], children[i + length][0])]) 472 | pretext += end_colour 473 | children[i+1] = (cpos, pretext + ctext) 474 | # 3 - add end bracket to last entry 475 | cpos, ctext = children[i+length] 476 | ctext += start_missing + ')' + end_colour 477 | children[i+length] = (cpos, ctext) 478 | for child in children: 479 | text += child[1] 480 | if extra: 481 | text += start_extra + ')' + end_colour 482 | elif wrong_label: 483 | text += start_wrong_label + ')' + end_colour 484 | else: 485 | text += ')' 486 | if return_str: 487 | return text 488 | else: 489 | return (final, text) 490 | 491 | def get_spans(self, span_dict, pos=0): 492 | labels_to_ignore = set(["-NONE-", "TOP", ":", "."]) 493 | words_to_ignore = set(["'","`","''", "``", "--",":",";","-",",","."]) 494 | label = self.label 495 | # ignore quotes as they won't always be present 496 | if label in labels_to_ignore or self.word in words_to_ignore: 497 | return pos 498 | init = pos 499 | if len(self.subtrees) == 0: 500 | pos += 1 501 | else: 502 | for subtree in self.subtrees: 503 | pos = subtree.get_spans(span_dict, pos) 504 | if init != pos: 505 | if (init, pos) not in span_dict: 506 | span_dict[(init, pos)] = set() 507 | if not label[0] == '-': 508 | label = label.split('-')[0] 509 | label = label.split('=')[0] 510 | if label == 'PRT': 511 | label = 'ADVP' # another collins change 512 | if self.word is not None: 513 | label = self.word 514 | span_dict[(init, pos)].add(label) 515 | return pos 516 | 517 | def read_PTB_tree(source): 518 | cur_text = '' 519 | depth = 0 520 | while True: 521 | line = source.readline() 522 | if line == '': 523 | return None 524 | line = line.strip() 525 | if line == '': 526 | continue 527 | if cur_text != '': 528 | cur_text += ' ' 529 | cur_text += line 530 | for char in line: 531 | if char == '(': 532 | depth += 1 533 | elif char == ')': 534 | depth -= 1 535 | if depth == 0: 536 | return PTB_Tree(cur_text) 537 | return trees 538 | 539 | def read_PTB_trees(source, max_sents=-1): 540 | if type(source) == type(''): 541 | source = open(source) 542 | trees = [] 543 | while True: 544 | tree = read_PTB_tree(source) 545 | if tree is None: 546 | break 547 | trees.append(tree) 548 | if len(trees) >= max_sents > 0: 549 | break 550 | return trees 551 | 552 | def read_CCG_tree(source): 553 | while True: 554 | line = source.readline() 555 | if line == '': 556 | return None 557 | else: 558 | line = line.strip() 559 | if line != '' and not line.startswith("ID"): 560 | line = '-LRB- -LCB-'.join(line.split('LRB {')) 561 | line = '-RRB- 
-RCB-'.join(line.split('RRB }')) 562 | line = '-LRB- -LRB-'.join(line.split('LRB (')) 563 | line = '-RRB- -RRB-'.join(line.split('RRB )')) 564 | tree = None 565 | if '= max_sents > 0: 583 | break 584 | return trees 585 | 586 | if __name__ == '__main__': 587 | if len(sys.argv) != 3: 588 | print("Usage:\n%s [PTB,CCG] " % sys.argv[0]) 589 | sys.exit(1) 590 | filename = sys.argv[2] 591 | trees = None 592 | if sys.argv[1] == 'PTB': 593 | trees = read_PTB_trees(filename) 594 | elif sys.argv[1] == 'CCG': 595 | trees = read_CCG_trees(filename) 596 | print(len(trees), "trees read") 597 | for tree in trees: 598 | print(tree) 599 | -------------------------------------------------------------------------------- /markup_convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Convert using a markedup-style file 4 | import sys, re 5 | import trees, category, rule 6 | 7 | log_out = sys.stdout 8 | contains_bs = False 9 | 10 | VERBOSE = False 11 | VERBOSE = True 12 | def verbose_print(text): 13 | if VERBOSE: 14 | print(text, file=log_out) 15 | 16 | markup_info = {} 17 | def read_markup(markup_file): 18 | global markup_info 19 | # Only read the markup info once 20 | if len(markup_info) == 0: 21 | # (NP\NP)/NP 22 | # 2 ((NP{Y}\NP{Y}<1>){_}/NP{Z}<2>){_} 23 | # (PP 0 2) 24 | # (NP 1 0) 25 | # 26 | cur = [] 27 | for line in markup_file: 28 | line = line.strip() 29 | if len(line) > 0: 30 | if line[0] == '#': 31 | continue 32 | cur.append(line) 33 | else: 34 | if len(cur) > 0: 35 | label = cur.pop(0) 36 | markup_info[label] = cur 37 | cur = [] 38 | 39 | # Find the bracket that matches the one at text[start_index] 40 | def get_balanced_point(text, start_index, deeper, shallower): 41 | depth = 0 42 | for i in range(start_index, len(text)): 43 | if text[i] == deeper: 44 | depth += 1 45 | if text[i] == shallower: 46 | depth -= 1 47 | if depth == 0: 48 | return i 49 | return -1 50 | 51 | UNIQUE_ID = 100 52 | class Schema: 53 | def __init__(self, lines, uniqued=False, argument=None, source_node=None): 54 | global UNIQUE_ID 55 | self.source = source_node 56 | text = '(TEMP 0)' 57 | self.parent = [] 58 | self.children = [] 59 | self.rule = 'unk' 60 | self.get_label_from_argument = False 61 | if type(lines) == type(''): 62 | text = lines 63 | self.parent = [] 64 | elif type(lines) == type([]): 65 | text = lines[0] 66 | # only one parent, which is the schema this will insert into 67 | self.parent = lines[1:] 68 | if 'arg' in text: 69 | # check rules 70 | text = None 71 | self.parent = [] 72 | to_parent = False 73 | for line in lines: 74 | if to_parent: 75 | self.parent.append(line) 76 | else: 77 | if 'arg:default' in line: 78 | if text is None: 79 | text = line 80 | to_parent = True 81 | elif argument is not None: 82 | constraint = line.split('arg:')[1].split(':')[0] 83 | if '(' not in constraint: 84 | if type(argument) == type(self) and constraint == argument.label: 85 | text = line 86 | elif type(argument) == type('') and argument[1:].split()[0] == constraint: 87 | text = line 88 | elif type(argument) == type(self): 89 | labels = constraint[1:-1].split() 90 | children = [] 91 | for child in argument.children: 92 | if type(child) == type(self) and child.label in ":,.;": 93 | continue 94 | elif type(child) == type('') and child[1] in ":,.;": 95 | continue 96 | elif type(child) == type(''): 97 | children.append(child[1:].split()[0]) 98 | else: 99 | children.append(child.label) 100 | if '...' 
in labels: 101 | if len(labels)-1 <= len(children): 102 | use = True 103 | if labels[0] == '...': 104 | for i in range(len(labels)-1): 105 | if labels[-1-i] != children[-1-i]: 106 | use = False 107 | elif labels[-1] == '...': 108 | for i in range(len(labels)-1): 109 | if labels[i] != children[i]: 110 | use = False 111 | else: 112 | print('... in the middle of arguments is not yet supported') 113 | use = False 114 | if use: 115 | text = line 116 | elif len(labels) == len(children): 117 | use = True 118 | for i in range(len(labels)): 119 | if labels[i] != children[i]: 120 | use = False 121 | if use: 122 | text = line 123 | if text[-1] not in ')}': 124 | text = ' '.join(text.split(':')[0].split()[:-1]) 125 | 126 | # change numbers in text to be a unique ID 127 | text = text.strip() 128 | self.zero = None 129 | if not uniqued: 130 | mapping = {} 131 | ntext = '' 132 | pos = 0 133 | while pos < len(text): 134 | if text[pos] in '1234567890': 135 | start = pos 136 | end = pos 137 | while text[end] in '1234567890': 138 | end += 1 139 | end -= 1 140 | num = int(text[start:end+1]) 141 | if num not in mapping: 142 | mapping[num] = UNIQUE_ID 143 | UNIQUE_ID += 1 144 | ntext += str(mapping[num]) 145 | if num == 0: 146 | self.zero = mapping[num] 147 | pos = end 148 | else: 149 | ntext += text[pos] 150 | pos += 1 151 | text = ntext 152 | self.schema = text 153 | # determine if this node is to be deleted 154 | self.delete_on_adoption = self.schema.startswith('{(') and self.schema.endswith(')}') 155 | self.label = self.schema.split()[0].strip('{(') 156 | if '*' in self.label: 157 | self.get_label_from_argument = True 158 | self.label = self.label[:-1] 159 | self.children = [] # the tree 160 | self.incomplete = {} # elements somewhere in the tree that are to be filled 161 | tschema = ')'.join('('.join(self.schema.split('(')[1:]).split(')')[:-1]) 162 | pos = len(tschema.split()[0]) # jump to after the label 163 | while pos < len(tschema): 164 | if tschema[pos] == '(': 165 | # Create a subtree for this bracket set 166 | balance = get_balanced_point(tschema, pos, '(', ')') 167 | subschema = Schema(tschema[pos:balance+1], uniqued=True, source_node=self.source) 168 | self.children.append(subschema) 169 | for key in self.children[-1].incomplete: 170 | if key not in self.incomplete: 171 | self.incomplete[key] = [] 172 | self.incomplete[key] += self.children[-1].incomplete[key] 173 | pos = balance 174 | elif tschema[pos] == ' ': 175 | if tschema[pos + 1] != '(': 176 | left = pos + 1 177 | right = left 178 | while right < len(tschema) and tschema[right] in '1234567890{}<>': 179 | right += 1 180 | right -= 1 181 | text = tschema[left:right+1] 182 | self.children.append(text) 183 | num = int(text.strip('{}<>')) 184 | if num not in self.incomplete: 185 | self.incomplete[num] = [] 186 | self.incomplete[num].append((text, self)) 187 | pos = right 188 | pos += 1 189 | 190 | def PTB_tree(self): 191 | text = '(' 192 | text += self.label + ' ' 193 | child_texts = [] 194 | for child in self.children: 195 | if type(child) != type(''): 196 | child_texts.append(child.PTB_tree()) 197 | elif '(' in child: 198 | child_texts.append(child) 199 | if len(child_texts) == 0: 200 | return '' 201 | text += ' '.join(child_texts) 202 | if self.delete_on_adoption: 203 | return ' '.join(child_texts) 204 | text += ')' 205 | return text 206 | 207 | def __repr__(self): 208 | child_ans = [] 209 | for child in self.children: 210 | if type(child) == type(''): 211 | child_ans.append(child) 212 | else: 213 | child_ans.append('obj') 214 | ans = ' 
schema: ' + self.schema + ' cur: ' 215 | if self.delete_on_adoption: 216 | ans += '{' 217 | ans += '(' + self.label + ' ' + ' '.join(child_ans) + ')' 218 | if self.delete_on_adoption: 219 | ans += '}' 220 | ans += ' incomplete:' 221 | for thing in self.incomplete: 222 | ans += ' (' 223 | ans += str(self.incomplete[thing][0][0]) 224 | if self.incomplete[thing][0][1] == self: 225 | ans += ', self)' 226 | else: 227 | ans += ', other)' 228 | for schema in self.parent: 229 | ans += '\n' + schema 230 | return ans 231 | 232 | def insert(self, ID, value): 233 | if ID is None: 234 | print("Insert with None ID requested", file=log_out) 235 | print("Insert with None ID requested", file=sys.stderr) 236 | return 237 | if ID != self.zero and self.get_label_from_argument: 238 | try: 239 | if type(value) != type(''): 240 | if not value.delete_on_adoption: 241 | self.label = value.label 242 | except: 243 | pass 244 | original = value 245 | keep_left = False 246 | delete_left = False 247 | keep_right = False 248 | delete_right = False 249 | stop = False 250 | entries = self.incomplete.pop(ID) 251 | for entry in entries: 252 | value = original 253 | text = entry[0] 254 | parent = entry[1] 255 | # find the position 256 | index = 0 257 | while index < len(parent.children): 258 | if parent.children[index] == text: 259 | break 260 | index += 1 261 | del parent.children[index] 262 | if text[0] == '>': 263 | if not keep_left: 264 | keep_left = True 265 | delete_left = False 266 | else: 267 | keep_left = False 268 | delete_left = True 269 | text = text[1:] 270 | if text[-1] == '<': 271 | if not delete_right: 272 | delete_right = True 273 | keep_right = False 274 | else: 275 | delete_right = False 276 | keep_right = True 277 | text = text[:-1] 278 | if text[0] == '{' and text[-1] == '}': 279 | try: 280 | if len(value.children) > 0: 281 | value = value.children 282 | except: 283 | # doesn't have sub=parts, ignore deletion {} 284 | # can happen if we have a list, or a string 285 | pass 286 | text = text[1:-1] 287 | if type(value) == type(self) and value.delete_on_adoption: 288 | value = value.children 289 | if stop: 290 | parent.children.insert(index, '') 291 | else: 292 | if type(value) != type([]): 293 | parent.children.insert(index, value) 294 | if keep_left or delete_left or keep_right or delete_right: 295 | stop = True 296 | else: 297 | if keep_left: 298 | parent.children.insert(index, value[0]) 299 | elif delete_left: 300 | parent.children = parent.children[:index] + value[1:] + parent.children[index:] 301 | elif keep_right: 302 | parent.children.insert(index, value[-1]) 303 | elif delete_right: 304 | parent.children = parent.children[:index] + value[:-1] + parent.children[index:] 305 | else: 306 | parent.children = parent.children[:index] + value + parent.children[index:] 307 | # When complete pass self to parent 308 | return self 309 | 310 | def set_zero(self, thing): 311 | self.insert(self.zero, thing) 312 | return self 313 | 314 | def get_argument_key(self, key_no=0): 315 | if len(self.incomplete) == 0: 316 | print("Trying to insert into a complete schema!", file=log_out) 317 | print("Trying to insert into a complete schema!", file=sys.stderr) 318 | else: 319 | for val in self.incomplete: 320 | if key_no == 0: 321 | return val 322 | else: 323 | key_no -= 1 324 | return None 325 | 326 | # fa.f and fa.b - Function application 327 | def fa(self, argument, combinator): 328 | # fill the incomplete argument with the argument 329 | key = self.get_argument_key() 330 | if key is not None: 331 | self.insert(key, 
argument) 332 | if 'conj1' == argument.rule: 333 | pos = 0 334 | while pos < len(self.children): 335 | if type(self.children[pos]) == type(self) and self.children[pos].label == 'NX': 336 | child = self.children[pos] 337 | self.children = self.children[:pos] + child.children + self.children[pos+1:] 338 | pos += len(child.children) - 1 339 | pos += 1 340 | else: 341 | if combinator == 'fa.f': 342 | return self.glom(argument) 343 | else: 344 | return argument.glom(self) 345 | return self 346 | 347 | # fc.f and fc.b - Function composition 348 | def fc(self, argument): 349 | # fill the incomplete argument with the argument 350 | self.insert(self.get_argument_key(), argument) 351 | # add the unfilled arguments of the argument to the incomplete arguments of 352 | # self 353 | for key in argument.incomplete: 354 | self.incomplete[key] = [] 355 | for entry in argument.incomplete[key]: 356 | used = False 357 | for child in self.children: 358 | if child == entry[0]: 359 | used = True 360 | self.incomplete[key].append((entry[0], self)) 361 | break 362 | if not used: 363 | self.incomplete[key].append((entry[0], entry[1])) 364 | ### if category.divide(self.source.category)[1] == '/': 365 | ### self.children.append(entry[0]) 366 | ### else: 367 | ### self.children.insert(0, entry[0]) 368 | argument.incomplete = {} 369 | return self 370 | 371 | # bs.f and bs.b - Crossed substitution 372 | def bs(self, argument): 373 | print('bs is not implemented - this should not have been called') 374 | print('bs is not implemented - this should not have been called', file=sys.stderr) 375 | return nlevel 376 | 377 | def is_empty(self): 378 | for child in self.children: 379 | if type(child) == type(self): 380 | if not child.is_empty(): 381 | return False 382 | elif child[0] == '(': 383 | return False 384 | return True 385 | 386 | # cc.b - Backwards crossed composition 387 | def back_cross(self, argument): 388 | left = get_next_incomplete_schema(self, argument) 389 | pos, children = left.get_last_partial_subtree() 390 | if pos < 0: 391 | pos = 0 392 | children = left.children 393 | argument = get_next_incomplete_schema(argument, None) 394 | left.parent = argument.parent 395 | non_empty_children = [] 396 | for child in argument.children: 397 | if type(child) == type(left): 398 | if not child.is_empty(): 399 | non_empty_children.append(child) 400 | elif child[0] == '(': 401 | non_empty_children.append(child) 402 | if len(non_empty_children) == 1: 403 | argument = non_empty_children[0] 404 | children.insert(pos, argument) 405 | return left 406 | 407 | # Type raising 408 | def tr(self, child): 409 | if self.label[0] == child.label[0] and not self.delete_on_adoption: 410 | child.delete_on_adoption = True 411 | self.set_zero(child) 412 | return self 413 | 414 | # one of the special binary combination rules defined in rule.py 415 | def special_binary(self, right, new_schemas): 416 | new_schemas.set_zero(self) 417 | new_schemas.insert(new_schemas.get_argument_key(), right) 418 | return new_schemas 419 | 420 | # one of the special unary combination rules defined in rule.py 421 | def special_unary(self, unary_schema): 422 | unary_schema.set_zero(self) 423 | return unary_schema 424 | 425 | def conj_part1(self, right): 426 | # create a new node, with these two as children 427 | if right.label in ['Nslash', 'Nnum']: 428 | right.delete_on_adoption = True 429 | if right.label == 'N': 430 | if len(right.children) > 1: 431 | right.label = 'NX' 432 | else: 433 | right.delete_on_adoption = True 434 | left = self 435 | if len(left.children) 
== 1: 436 | left = left.children[0] 437 | 438 | # detect a list and set right to be deleted 439 | is_list = False 440 | if len(right.children) > 2: 441 | if type(right.children[1]) == type(left): 442 | if right.children[1] == left or (left == '(, ,)' and 'CC' in right.children[1]): 443 | if type(right.children[0]) == type(self) == type(right.children[2]): 444 | if right.children[0].label == right.children[2].label: 445 | is_list = True 446 | if is_list: 447 | right.delete_on_adoption = True 448 | 449 | nlevel = Schema(['(%s 0 1)' % right.label] + right.parent, source_node=right.source) 450 | nlevel.set_zero(left) 451 | nlevel.insert(nlevel.get_argument_key(), right) 452 | 453 | if nlevel.label == 'TEMP': 454 | nlevel.delete_on_adoption = True 455 | # move unfilled arguments 456 | for key in right.incomplete: 457 | nlevel.incomplete[key] = [] 458 | for entry in right.incomplete[key]: 459 | text = entry[0] 460 | parent = entry[1] 461 | if text == parent.children[-1]: 462 | if text in nlevel.children: 463 | nlevel.children.remove(text) 464 | nlevel.children.append(text) 465 | else: 466 | if text in nlevel.children: 467 | nlevel.children.remove(text) 468 | nlevel.children.insert(0, text) 469 | nlevel.incomplete[key].append((text, nlevel)) 470 | nlevel.rule = 'conj1' 471 | return nlevel 472 | 473 | def conj_part2(self, right): 474 | if self.label in "~!@#$%^&*()_+{}|:<>?,./;'[]\=-`" or self.label in ['LRB', 'RRB']: 475 | # glom self on instead 476 | return self.glom(right) 477 | # check labels 478 | if self.label in ['Nslash', 'Nnum']: 479 | self.delete_on_adoption = True 480 | if self.label == 'N': 481 | if len(self.children) > 1: 482 | self.label = 'NX' 483 | else: 484 | self.delete_on_adoption = True 485 | if self.label != 'NX': 486 | pos = 0 487 | while pos < len(right.children): 488 | if type(right.children[pos]) == type(right) and right.children[pos].label == 'NX': 489 | child = right.children[pos] 490 | right.children = right.children[:pos] + child.children + right.children[pos+1:] 491 | pos += len(child.children) - 1 492 | pos += 1 493 | nlabel = self.label 494 | if nlabel != right.label: 495 | nlabel = 'UCP' 496 | 497 | # check for VPs that are being conjed 498 | try: 499 | remove_VPs = False 500 | print(self.label, self.children[0], file=log_out) 501 | if self.label == 'VP' and 'VB' in self.children[0]: 502 | all_empty = True 503 | print(self.children[1:], file=log_out) 504 | for child in self.children[1:]: 505 | if type(child) != type('') or child[0] == '(': 506 | all_empty = False 507 | if all_empty: 508 | print(right.label, right.children[1].label, right.children[1].children[0], file=log_out) 509 | if right.label == 'VP' and right.children[1].label == 'VP' and 'VB' in right.children[1].children[0]: 510 | all_empty = True 511 | print(right.children[1].children[1:], file=log_out) 512 | for child in right.children[1].children[1:]: 513 | if type(child) != type('') or child[0] == '(': 514 | all_empty = False 515 | if all_empty: 516 | remove_VPs = True 517 | if remove_VPs: 518 | self.delete_on_adoption = True 519 | right.children[1] = right.children[1].children[0] 520 | except: 521 | pass 522 | 523 | nlevel = Schema(['(%s 0 {1})' % nlabel] + self.parent, source_node=self.source) 524 | nlevel.set_zero(self) 525 | nlevel.insert(nlevel.get_argument_key(), right) 526 | if nlevel.label == 'TEMP': 527 | nlevel.delete_on_adoption = True 528 | # move unfilled arguments 529 | for key in self.incomplete: 530 | nlevel.incomplete[key] = [] 531 | for entry in self.incomplete[key]: 532 | text = 
entry[0] 533 | parent = entry[1] 534 | if text == parent.children[-1]: 535 | if text in nlevel.children: 536 | nlevel.children.remove(text) 537 | nlevel.children.append(text) 538 | elif text == parent.children[0]: 539 | if text in nlevel.children: 540 | nlevel.children.remove(text) 541 | nlevel.children.insert(0, text) 542 | else: 543 | if text in nlevel.children: 544 | nlevel.children.remove(text) 545 | continue 546 | nlevel.incomplete[key].append((text, nlevel)) 547 | nlevel.rule = 'conj2' 548 | return nlevel 549 | 550 | def get_first_partial_subtree(self): 551 | if len(self.children) == 0: 552 | return (0, []) 553 | if type(self.children[0]) == type('') and self.children[0][0] == '(': 554 | return (0, self.children) 555 | for i in range(len(self.children)): 556 | child = self.children[i] 557 | if type(child) == type(self): 558 | pos, children = child.get_first_partial_subtree() 559 | if pos > 0: 560 | return (pos, children) 561 | elif pos == 0: 562 | return (i, self.children) 563 | elif type(child) == type('') and child[0] == '(': 564 | return (i, self.children) 565 | return (-1, []) 566 | 567 | def get_last_partial_subtree(self): 568 | if len(self.children) == 0: 569 | return (0, []) 570 | if type(self.children[-1]) == type('') and self.children[-1][0] == '(': 571 | return (len(self.children), self.children) 572 | for i in range(len(self.children) - 1, -1, -1): 573 | child = self.children[i] 574 | if type(child) == type(self): 575 | pos, children = child.get_last_partial_subtree() 576 | if 0 < pos < len(children): 577 | return (pos, children) 578 | elif pos == len(children): 579 | return (i+1, self.children) 580 | elif type(child) == type('') and len(child) > 0 and child[0] == '(': 581 | return (i+1, self.children) 582 | return (-1, []) 583 | 584 | # misc - Just glom on the random stuff 585 | def glom(self, right, keep_right=None): 586 | left = self 587 | if keep_right is None: 588 | keep_right = left.label in "~!@#$%^&*()_+{}|:<>?,./;'[]\=-`" or left.label in ['LRB', 'RRB'] 589 | if keep_right: 590 | # glom left on to left of right 591 | if len(left.children) == 1: 592 | left = left.children[0] 593 | pos, children = right.get_first_partial_subtree() 594 | if pos < 0: 595 | pos = 0 596 | children = right.children 597 | children.insert(pos, left) 598 | return right 599 | else: 600 | # glom right on to right of left 601 | if len(right.children) == 1: 602 | right = right.children[0] 603 | if len(left.incomplete) != 0: 604 | pos, children = left.get_last_partial_subtree() 605 | if pos < 0: 606 | pos = 0 607 | children = left.children 608 | children.insert(pos, right) 609 | else: 610 | left.children.append(right) 611 | ### nlevel = Schema(['{(TEMP 0 1)}'] + left.parent, source_node=left.source) 612 | ### nlevel.set_zero(left) 613 | ### key = nlevel.get_argument_key() 614 | ### nlevel.insert(key, right) 615 | ### return nlevel 616 | return left 617 | 618 | def fallback_schema(cat): 619 | rules = ['{(TEMP 0)}'] 620 | while '/' in cat or '\\' in cat: 621 | parts = category.divide(cat) 622 | if parts[1] == '/': 623 | rules.append("(NP 0 1)") 624 | else: 625 | rules.append("(NP 1 0)") 626 | cat = parts[0] 627 | plain_cat = cat 628 | if plain_cat not in markup_info: 629 | plain_cat = category.strip_square_brackets(cat) 630 | if plain_cat in markup_info: 631 | markup_lines = markup_info[plain_cat][1:] 632 | if '/' not in markup_lines[0] and '\\' not in markup_lines[0]: 633 | rules += markup_lines 634 | return rules 635 | return rules 636 | 637 | ANGLE_RE = re.compile('<[^>]*>') 638 | def 
markup_to_schemas(lines, cat=None, source=None): 639 | unannotated = False 640 | if lines == []: 641 | unannotated = True 642 | else: 643 | for line in lines[1:]: 644 | if '\\' in line or '/' in line: 645 | cat_to_print = lines[0].strip().split()[1] 646 | cat_to_print = category.strip_braces(cat_to_print) 647 | cat_to_print = ''.join(cat_to_print.split('[X]')) 648 | cat_to_print = ANGLE_RE.sub('', cat_to_print) 649 | cat_to_print = category.remove_extra_brackets(cat_to_print) 650 | print('Unannotated category:', cat_to_print, file=log_out) 651 | print('Unannotated category:', cat_to_print, file=sys.stderr) 652 | unannotated = True 653 | break 654 | if unannotated: 655 | lines = fallback_schema(cat) 656 | pos = None 657 | word = None 658 | if source is not None: 659 | pos = source.pos 660 | word = source.word 661 | used = False 662 | nlines = [] 663 | for i in range(1, len(lines)): 664 | line = lines[i].strip() 665 | if line[-1] not in ')}': 666 | use = True 667 | if 'POS' in line: 668 | if pos is None or pos not in line.split('POS:')[1].split()[0].split(','): 669 | use = False 670 | if not used and 'POS:default' in line: 671 | use = True 672 | if 'Word' in line: 673 | if word is None or word not in line.split('Word:')[1].split()[0].split(','): 674 | use = False 675 | if not used and 'Word:default' in line: 676 | use = True 677 | if use: 678 | nlines.append(line) 679 | if 'arg' not in line or 'arg:default:' in line: 680 | used = True 681 | else: 682 | nlines.append(line) 683 | used = False 684 | if 'POS:default' in line or 'Word:default' in line: 685 | if 'arg' not in line or 'arg:default:' in line: 686 | used = False 687 | return Schema(nlines, source_node=source) 688 | 689 | def get_next_incomplete_schema(schema, arg): 690 | while len(schema.incomplete) == 0 and len(schema.parent) > 0: 691 | parent = Schema(schema.parent, argument=arg, source_node=schema.source) 692 | parent.set_zero(schema) 693 | schema = parent 694 | return schema 695 | 696 | def apply_markup(source, markup, top=True): 697 | global contains_bs 698 | # Bottom up, so get the results from below 699 | children = [] 700 | for subtree in source.subtrees: 701 | children.append(apply_markup(subtree, markup, False)) 702 | combinator = source.rule 703 | result = None 704 | verbose_print('using %s combiantor rule' % combinator) 705 | for child in children: 706 | verbose_print('%s' % child.PTB_tree()) 707 | verbose_print(child.__repr__()) 708 | if combinator == 'lex' or combinator == 'type': 709 | source_category = source.category 710 | if source_category not in markup_info: 711 | source_category = category.strip_square_brackets(source.category) 712 | schema_text = [] 713 | if source_category not in markup_info: 714 | print("Missing category:", source.category, "asked for by", combinator, file=log_out) 715 | print("Missing category:", source.category, "asked for by", combinator, file=sys.stderr) 716 | else: 717 | schema_text = markup_info[source_category] 718 | schema = markup_to_schemas(schema_text, source.category, source) 719 | if combinator == 'lex': 720 | result = schema.set_zero("(%s %s)" % (source.pos, source.word)) 721 | elif combinator == 'type': 722 | verbose_print("Type schema:") 723 | verbose_print(schema.__repr__()) 724 | result = schema.tr(children[0]) 725 | elif combinator == 'conj1': 726 | result = children[0].conj_part1(children[1]) 727 | elif combinator == 'conj2': 728 | result = children[0].conj_part2(children[1]) 729 | elif combinator == 'unary': 730 | unary_rule = rule.get_unary(source.subtrees[0].category, 
source.category, markup_info) 731 | if unary_rule is None: 732 | unary_rule = fallback_schema(source.category) 733 | schemas = markup_to_schemas(['None'] + unary_rule, source=source) 734 | verbose_print("Unary schema:") 735 | verbose_print(schemas.__repr__()) 736 | result = children[0].special_unary(schemas) 737 | elif combinator in ['binary', 'bs.f', 'bs.b']: 738 | binary_rule = rule.get_binary_for_markedup(source.subtrees[0].category, source.subtrees[1].category, source.category, markup_info) 739 | if binary_rule is None: 740 | binary_rule = ['(VP 0 1)'] + fallback_schema(source.category) 741 | schemas = markup_to_schemas(['None'] + binary_rule, source=source) 742 | verbose_print("Binary schema:") 743 | verbose_print(schemas.__repr__()) 744 | control = get_next_incomplete_schema(children[0], children[1]) 745 | result = control.special_binary(children[1], schemas) 746 | elif combinator == 'fa.f': 747 | control = get_next_incomplete_schema(children[0], children[1]) 748 | result = control.fa(children[1], combinator) 749 | elif combinator == 'fa.b': 750 | control = get_next_incomplete_schema(children[1], children[0]) 751 | result = control.fa(children[0], combinator) 752 | elif combinator == 'fc.f': 753 | control = get_next_incomplete_schema(children[0], children[1]) 754 | argument = get_next_incomplete_schema(children[1], None) 755 | result = control.fc(argument) 756 | elif combinator == 'fc.b': 757 | control = get_next_incomplete_schema(children[1], children[0]) 758 | argument = get_next_incomplete_schema(children[0], None) 759 | result = control.fc(argument) 760 | elif combinator == 'cc.b': 761 | control = get_next_incomplete_schema(children[0], children[1]) 762 | result = control.back_cross(children[1]) 763 | elif combinator == 'misc': 764 | if len(source.subtrees) == 2: 765 | cur = category.strip_square_brackets(source.category) 766 | left = category.strip_square_brackets(source.subtrees[0].category) 767 | right = category.strip_square_brackets(source.subtrees[1].category) 768 | if cur != left and cur != right: 769 | print("miscing an unknown category:", source.category, end=" ", file=log_out) 770 | print("from", source.subtrees[0].category, "and", source.subtrees[1].category, file=log_out) 771 | print("miscing an unknown category:", source.category, end=" ", file=sys.stderr) 772 | print("from", source.subtrees[0].category, "and", source.subtrees[1].category, file=sys.stderr) 773 | binary_rule = fallback_schema(source.category) 774 | schemas = markup_to_schemas(['None','(NP 0 1)'] + binary_rule, source=source) 775 | verbose_print("Misc Binary schema:") 776 | verbose_print(schemas.__repr__()) 777 | result = children[0].special_binary(children[1], schemas) 778 | else: 779 | # check if this forms a PRN 780 | words = source.all_word_yield()[1].split() 781 | left_word = words[0] 782 | right_word = words[-1] 783 | verbose_print(left_word + ' ' + right_word) 784 | use_PRN = False 785 | if not top: 786 | if left_word == ',' and right_word == ',': 787 | use_PRN = True 788 | elif left_word == '--' and right_word == '--': 789 | use_PRN = True 790 | elif left_word == '-LRB-' and right_word == '-RRB-': 791 | use_PRN = True 792 | result = children[0].glom(children[1], cur == right) 793 | if use_PRN: 794 | old_label = result.label 795 | result.label = 'PRN' 796 | result.delete_on_adoption = False 797 | nlevel = Schema(['(%s 0)' % old_label] + result.parent, source_node=source) 798 | if old_label == 'TEMP': 799 | nlevel = Schema(['{(%s 0)}' % old_label] + result.parent, source_node=source) 800 | 
nlevel.set_zero(result) 801 | nlevel.incomplete = result.incomplete 802 | result = nlevel 803 | else: 804 | print('misc combinator is not handled', file=sys.stderr) 805 | verbose_print('resolved: %s' % result.PTB_tree()) 806 | verbose_print(result.__repr__()) 807 | verbose_print('') 808 | return result 809 | 810 | def remove_N(tree): 811 | nsubtrees = [] 812 | for subtree in tree.subtrees: 813 | sub = remove_N(subtree) 814 | if type(sub) == type([]): 815 | nsubtrees += sub 816 | else: 817 | nsubtrees.append(sub) 818 | tree.subtrees = nsubtrees 819 | if tree.label == 'N' or tree.label == 'Nslash' or tree.label == 'Nnum': 820 | return tree.subtrees 821 | else: 822 | return tree 823 | 824 | def remove_repetition(tree): 825 | # recurse and update subtrees 826 | if len(tree.subtrees) > 0: 827 | nsubtrees = [] 828 | for subtree in tree.subtrees: 829 | nsubtrees.append(remove_repetition(subtree)) 830 | tree.subtrees = nsubtrees 831 | 832 | # look down and remove this if it is repeated 833 | repeats = False 834 | cur = tree 835 | label = cur.label 836 | while len(cur.subtrees) == 1: 837 | cur = cur.subtrees[0] 838 | if cur.label == label: 839 | repeats = True 840 | break 841 | if repeats: 842 | print('duplicate!', file=log_out) 843 | print(tree.one_line_repr(), file=log_out) 844 | print(cur.one_line_repr(), file=log_out) 845 | tree = tree.subtrees[0] 846 | 847 | return tree 848 | 849 | def convert(source, argv, log=sys.stdout): 850 | global markup_info, contains_bs, log_out, VERBOSE 851 | log_out = log 852 | VERBOSE = '-verbose' in ' '.join(argv) 853 | filename = ' '.join(argv).split(' -method')[1].split()[1] 854 | read_markup(open(filename)) 855 | 856 | contains_bs = False 857 | auto_schema = apply_markup(source, markup_info) 858 | 859 | ################### 860 | # Extra cleanup 861 | # i.e. hacks that don't fit within the main architecture 862 | ################### 863 | auto_ptb = trees.PTB_Tree('(ROOT ' + auto_schema.PTB_tree() + ')') 864 | verbose_print('before cleaning: %s' % auto_ptb) 865 | 866 | # remove remaining N 867 | auto_ptb = remove_N(auto_ptb) 868 | 869 | # collapse repetitions 870 | auto_ptb = remove_repetition(auto_ptb) 871 | 872 | verbose_print('cleaned: %s' % auto_ptb) 873 | verbose_print('') 874 | return (not contains_bs, auto_ptb, auto_schema) 875 | 876 | if __name__ == '__main__': 877 | if len(sys.argv) < 2: 878 | print("Usage:\n%s -method_info " % sys.argv[0]) 879 | sys.exit(1) 880 | print("Please enter CCG trees:") 881 | for line in sys.stdin: 882 | print(convert(trees.CCG_Tree(line.strip()), sys.argv)) 883 | --------------------------------------------------------------------------------
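Usage sketch (not part of the repository): a minimal example of how the modules listed above could be driven programmatically. The paths "my.ccgbank", "my.gold_ptb" and "my.markedup" are placeholders, and the "-method markup <markedup file>" argv convention is only inferred from how markup_convert.convert() extracts the markedup file name from argv; the repository's own command-line driver may differ.

#!/usr/bin/env python3
# Hedged usage sketch -- file names and the argv convention below are
# assumptions, not part of the repository.

import sys
import trees, markup_convert

# Placeholder input files: CCG derivations, gold PTB trees, and a C&C-style
# markedup file describing per-category schemas.
ccg_trees = trees.read_CCG_trees("my.ccgbank")
gold_trees = trees.read_PTB_trees("my.gold_ptb")

# markup_convert.convert() pulls the markedup file name out of argv
# (the second token after "-method"), so mimic that convention here.
argv = ["ccg2pst", "-method", "markup", "my.markedup"]

for ccg, gold in zip(ccg_trees, gold_trees):
    ok, auto_ptb, schema = markup_convert.convert(ccg, argv, log=sys.stdout)
    # Print the converted tree with missing / extra / mislabelled brackets
    # colour-coded against the gold tree (repr_with_corrections accepts a
    # PTB_Tree directly and collects its spans internally).
    print(auto_ptb.repr_with_corrections(gold))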