├── README.md └── chunklinkctb.pl /README.md: -------------------------------------------------------------------------------- 1 | ChunkLinkCTB (Chunklink for CTB) 2 | ============ 3 | 4 | A tool for extracting chunks from Penn Chinese Treebank (Chen et. al 2006, ACL-poster), that is A CTB version of chunklink. 5 | 6 | This tool is designed for the old versions of CTB. For CTB7.0 or higher, it needs some changes. 7 | 8 | 宾州中文树库抽取中文短语(组块)工具,适用于CTB5.0及以下。 9 | 10 | ============= 11 | Related Publications 12 | 13 | If you would like to check more details about this tool, please check the following papers 14 | 15 | -Wenliang Chen, Yujie Zhang and Hitoshi Isahara. An Empirical Study of Chinese Chunking, The joint conference of the International Committee on Computational Linguistics and the Association for Computational Linguistics (Coling-ACL2006) (Poster Session), Sydney, Australia, pp. 97-104, Jul. 2006 16 | -------------------------------------------------------------------------------- /chunklinkctb.pl: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/perl -w 2 | 3 | ### 4 | ### Produces a convenient list of words, chunks, functions, and links from Penn Chinese TreeBank 4 5 | ### 6 | ### 7 | ### Modified by Wenliang Chen, NiCT, chenwl@nict.go.jp or chenwenliang@gmail.com, 2005 8 | ### 9 | 10 | ### Produces a convenient list of words, chunks, functions, and links from Penn TreeBank II 11 | ### 12 | ### Written by Sabine Buchholz, ILK, Tilburg University, S.Buchholz@kub.nl, 1998 13 | ### 14 | ### Modifications by Yuval Krymolowski, Bar-Ilan University, yuvalk@macs.biu.ac.il, 1999 15 | ### 16 | 17 | sub help { 18 | print "call as: \n"; 19 | print " chunklink_ctb.pl /cdrom/ctb4/data/bracketed/chtb_????.fid | more \n"; 20 | print " \n"; 21 | print "options: \n"; 22 | print " -s : Place a '# sentence ID' line before the word-list of each sentence \n"; 23 | print " instead of at the lines of the individual words. \n"; 24 | print " The sentence ID is file/number, e.g., 0001/01. \n"; 25 | print " \n"; 26 | print " -ns : Enumerate the words inside a sentence, instead of number in the file \n"; 27 | print " \n"; 28 | print " -B : which sort of IOB tags to output; I tags are always inside a chunk, O tags are outside \n"; 29 | print " possible values are: Begin (the default): B tags for word at the beginning of a chunk \n"; 30 | print " End: E tags for word at the end of a chunk \n"; 31 | print " BeginEndCombined: B tags for word at the beginning of a chunk \n"; 32 | print " E tags for word at the end of a chunk \n"; 33 | print " C tags for single word chunks \n"; 34 | print " Between: B tags for words that are at the beginning \n"; 35 | print " of a chunk and the previous chunk had the \n"; 36 | print " same syntactic category \n"; 37 | print " Attention! The last option applies only to the simple \n"; 38 | print " IOB tag column (e.g. 'I-NP'), not to the IOB chain column \n"; 39 | print " (e.g. 'I-S/I-S/I-NP/I-NP/I-PP/B-NP'). If 'Between', the \n"; 40 | print " latter column gets the default representation 'Begin'. \n"; 41 | print " \n"; 42 | print " -N : suppress word number in output \n"; 43 | print " -p : ... POS tag ... \n"; 44 | print " -f : function \n"; 45 | print " -h : head word \n"; 46 | print " -H : head number \n"; 47 | print " -i : IOB tag \n"; 48 | print " -c : IOB tag chain \n"; 49 | print " -t : trace information \n"; 50 | } 51 | 52 | ###################################################################### 53 | ###################################################################### 54 | ############################# initialize ############################# 55 | ###################################################################### 56 | ###################################################################### 57 | 58 | ###################################################################### 59 | ### load definition of object data structures to represent parse tree 60 | ###################################################################### 61 | 62 | # use nodes; 63 | 64 | # start of nodes.pm 65 | 66 | package terminal; 67 | sub new { 68 | my $type=shift; 69 | my $self={}; 70 | $self->{number}=shift; 71 | $self->{iob_tag}=shift; 72 | $self->{iob_tag_inner}=$self->{iob_tag}; 73 | $self->{pos_tag}=shift; 74 | $self->{word}=shift; 75 | $self->{function}=shift; 76 | $self->{prep}='-'; # for PNP feature: not used by chunklink.pl 77 | $self->{adv}='-'; # for ADV feature: not used by chunklink.pl 78 | $self->{head_comp}='c'; 79 | $self->{trace}=undef; 80 | my $obj=bless $self, $type; 81 | $self->{lex_head}=[$obj]; # head is reference to itself 82 | return $obj; 83 | } 84 | 85 | package non_terminal; 86 | sub new { 87 | my $type=shift; 88 | my $self={}; 89 | my @d=() ; 90 | $self->{function}=shift; 91 | $self->{lex_head}=undef; 92 | $self->{head_comp}='c'; 93 | $self->{daughters}=shift; 94 | $self->{trace}=undef; 95 | $self->{iob_tag}=''; 96 | return bless $self, $type; 97 | } 98 | 99 | package trace; 100 | sub new { 101 | my $type=shift; 102 | my $self={}; 103 | $self->{function}=shift; 104 | $self->{kind}=shift; 105 | $self->{lex_head}=shift; 106 | $self->{reference}=shift; # trace points to filler 107 | $self->{head_comp}='c'; 108 | my $obj=bless $self, $type; 109 | if (defined($obj->{reference})) { 110 | if (defined($obj->{reference}->{trace}) && ref($obj->{reference}->{trace}) eq 'ARRAY') { 111 | $obj->{reference}->{trace}=[$obj,@{$obj->{reference}->{trace}}]; # filler points to trace 112 | } 113 | else { 114 | $obj->{reference}->{trace}=[$obj]; # filler points to trace 115 | } 116 | } 117 | return $obj; 118 | } 119 | 120 | package main; 121 | 122 | # return 1; 123 | 124 | # end of nodes.pm 125 | 126 | if (@ARGV==0) { 127 | help(); 128 | exit; 129 | } 130 | 131 | ###################################################################### 132 | ### check options 133 | ###################################################################### 134 | 135 | require "getopts.pl" ; 136 | $opt_B = 'Begin' ; 137 | $opt_n = '' ; 138 | $opt_s = '' ; 139 | $opt_N = 0 ; # word number 140 | $opt_p = 0 ; # POS tag 141 | $opt_f = 0 ; # function 142 | $opt_h = 0 ; # head 143 | $opt_H = 0 ; # head number 144 | $opt_i = 0 ; # IOB tag 145 | $opt_c = 0 ; # IOB tag chain 146 | $opt_t = 0 ; # trace information 147 | &Getopts("B:n:sNpfhHict") ; 148 | 149 | $sent_each_word = 1 ; # default - print the sentence for each word 150 | if ($opt_s) 151 | { 152 | $sent_each_word = 0 ; 153 | } 154 | 155 | $word_enumerate = 'file' ; # default - word number runs through all the file 156 | 157 | if ($opt_n =~ /^s/) 158 | { 159 | $word_enumerate = 'sent' ; # word number runs in current sentence 160 | } 161 | 162 | printf("#arguments: IOB tag: $opt_B, word numbering: $word_enumerate\n") ; 163 | printf("#columns:") ; 164 | 165 | if ($sent_each_word) 166 | { 167 | printf(" file_id sent_id") ; 168 | } 169 | if ($opt_N==0) 170 | { 171 | printf(" word_id") ; 172 | } 173 | if ($opt_i==0) 174 | { 175 | printf(" iob_inner") ; 176 | } 177 | if ($opt_p==0) 178 | { 179 | printf(" pos") ; 180 | } 181 | printf(" word") ; 182 | if ($opt_f==0) 183 | { 184 | printf(" function") ; 185 | } 186 | if ($opt_h==0) 187 | { 188 | printf(" heads") ; 189 | } 190 | if ($opt_H==0) 191 | { 192 | printf(" head_ids") ; 193 | } 194 | if ($opt_c==0) 195 | { 196 | printf(" iob_chain") ; 197 | } 198 | if ($opt_t==0) 199 | { 200 | printf(" trace-function trace-type trace-head_ids") ; 201 | } 202 | printf ("\n") ; 203 | 204 | ###################################################################### 205 | ### load definition of constituent head 206 | ###################################################################### 207 | 208 | head_medium(); 209 | 210 | ###################################################################### 211 | ### what to prune 212 | ###################################################################### 213 | 214 | ### comment in if any such constituent should be pruned 215 | $prune_always{'NAC'}=1; 216 | $prune_always{'NX'}=1; 217 | $prune_always{'X'}=1; 218 | ##CTB4 for verb phrases 219 | $prune_always{'VCD'}=1; 220 | $prune_always{'VRD'}=1; 221 | $prune_always{'VSB'}=1; 222 | $prune_always{'VCP'}=1; 223 | $prune_always{'VNV'}=1; 224 | $prune_always{'VPT'}=1; 225 | 226 | 227 | ### comment in if the second constituent should be pruned 228 | ### when appearing as daughter of the first 229 | ### in front of any potential head 230 | $prune_if_infrontof_head{'NP'}{'ADJP'}=1; 231 | $prune_if_infrontof_head{'NP'}{'UCP'}=1; 232 | $prune_if_infrontof_head{'WHNP'}{'WHADJP'}=1; 233 | $prune_if_infrontof_head{'ADJP'}{'ADVP'}=1; 234 | ##CTB4 CLP will be pruned after QP 235 | $prune_if_infrontof_head{'QP'}{'CLP'}=1; 236 | $prune_if_infrontof_head{'DP'}{'CLP'}=1; 237 | 238 | ### set to 1 if these should be pruned 239 | ### 0 otherwise 240 | $prune_s_in_vp_predicative_flag=1; 241 | $prune_s_in_vp_empty_subject_flag=1; 242 | $prune_vp_in_vp_flag=1; 243 | $prune_advp_in_vp_flag=1; 244 | 245 | ###################################################################### 246 | ###################################################################### 247 | ################################ main ################################ 248 | ###################################################################### 249 | ###################################################################### 250 | 251 | $word_number=0; # initialize: for unique word number over whole corpus 252 | foreach $file (@ARGV) { # loop: for each treebank file 253 | if (defined($file)) { # open for reading 254 | open(INPUT,"<$file") or die "Cannot open $file!\n"; 255 | } 256 | else { 257 | die "file not defined!\n"; 258 | } 259 | if ($file =~ /\/chtb_([0-9]+)\.fid$/) { # extract file number 260 | $filenumber=$1; 261 | } 262 | else { 263 | $filenumber=1;#die "$file does not match!\n"; 264 | } 265 | print STDERR "$filenumber "; 266 | $sentence=''; # initialize sentence string 267 | $sentence_number=0; # initialize sentence counter 268 | while (defined($sentence=)) { # loop: until end of file 269 | $sent_flag = 1; 270 | $result=start_read(); # read sentence into $result 271 | last if($sent_flag == 0); 272 | 273 | if (defined($result->{function}) # delete outermost parentheses 274 | && $result->{function} eq 'NOLABEL' 275 | && @{$result->{daughters}}==1) 276 | { 277 | $result=$result->{daughters}->[0]; 278 | } 279 | 280 | # set the word counter within file or within sentence 281 | if ($word_enumerate eq 'file') 282 | { 283 | $word_num = $word_number ; 284 | } 285 | else # 'sent' 286 | { 287 | $word_num = 0 ; 288 | } 289 | #print "\n################## start_read ####################\n\n"; print_parse_tree(0,$result); print "\n\n"; 290 | prune($result); # prune parse tree and determine heads of constituents 291 | #print "################## prune ####################\n\n"; print_parse_tree(0,$result); print "\n\n"; 292 | lexicalize($result); # lexicalize parse tree 293 | #print "################## lexicalize ####################\n\n"; print_parse_tree(0,$result); print "\n\n"; 294 | @flattened=flatten($result); # flatten tree into sequence of chunks 295 | #print "################## flatten ####################\n\n"; print_list(@flattened); print "\n\n"; 296 | if ($opt_c==0) { # if {iob_tag} feature must be computed 297 | cut_traces($result); # remove traces from the tree 298 | iob_chain($result); # compute the value of the {iob_tag} feature 299 | } 300 | chunks(); # compute IOB-tags 301 | #print "################## chunks ####################\n\n"; print_list(@flattened); print "\n\n"; 302 | #print "################## result ####################\n\n"; 303 | print_flatten(); # print output 304 | } 305 | } 306 | print STDERR "\n$word_number words processed\n"; 307 | 308 | ###################################################################### 309 | ###################################################################### 310 | ################################ subs ################################ 311 | ###################################################################### 312 | ###################################################################### 313 | 314 | ###################################################################### 315 | ### start_read : called by main, calls read_sentence(_,_) 316 | ###################################################################### 317 | 318 | sub start_read { 319 | # skip all lines not of the form " (..." (e.g. blank lines) 320 | while ($sentence!~/^\s*\((.*)$/ 321 | && defined($sentence=)) {}; 322 | if(!defined($sentence)){ 323 | $sent_flag = 0; 324 | return \''; 325 | } 326 | chop($sentence); 327 | # consumes first opening bracket of sentence 328 | # calls read_sentence to read input until corresponding closing bracket is found 329 | if ($sentence=~/^\s*\((.*)$/) 330 | { 331 | $sentence=$1; 332 | $depth=1; 333 | $sentence_number++; 334 | $chunk_number=0; 335 | undef %corefs; 336 | undef %tracerefs; 337 | return read_sentence('NOLABEL',$depth-1); 338 | } 339 | else 340 | { 341 | return \''; # ' 342 | } 343 | } 344 | 345 | ###################################################################### 346 | ### read_sentence : called by start_read(), calls itself, 347 | ### 'trace'->new(_,_,_,_), 'terminal'->new(_,_,_,_,_), 348 | ### non_terminal->new(_,_) 349 | ###################################################################### 350 | 351 | sub read_sentence { 352 | my $label=shift(@_); # the label of the constituent that is to be read 353 | my $mydepth=shift(@_); # at which bracket depth the constituent starts/ends 354 | my @store=(); # references to the daughters of the constituent will go in here 355 | 356 | while ($depth>$mydepth) { # while closing bracket of constituent is not yet found 357 | if (length($sentence)==0) { # if necessary: 358 | if (defined($sentence=)) { # read in new line 359 | chop($sentence); 360 | } 361 | else { 362 | die "$filenumber $sentence_number ERROR (input finished before sentence was complete)!\n"; 363 | } 364 | } 365 | if ($sentence=~/^\(([^ ()]+) ([^ ()]+)\)(.*)$/) { # (NN man) # terminal node 366 | $tag=$1; 367 | $word=$2; 368 | $sentence=$3; 369 | if ($tag eq '-NONE-') { # null element 370 | if ($word=~/^(\*T\*|\*U\*|\*NOT\*|\*RNR\*|\*ICH\*|\*EXP\*|\*PPA\*|\*\?\*|\*)(.*)$/) { # trace 371 | my $kind=$1; 372 | my $rest=$2; 373 | if (defined($rest) && $rest=~/^-([0-9]+)$/) { # co-referenced 374 | if (defined($corefs{$1})) { # co-reference already defined (backward reference) 375 | push(@store,'trace'->new('NOFUNC',$kind,undef,$corefs{$1})); 376 | } 377 | else { # co-reference not yet defined (forward reference) 378 | my $ref='trace'->new('NOFUNC',$kind,undef,undef); 379 | push(@store,$ref); 380 | $tracerefs{$1}=$ref; # store in hash for later processing 381 | } 382 | } 383 | else { # not coreferenced 384 | push(@store,'trace'->new('NOFUNC',$kind,undef,undef)); 385 | } 386 | } # no else: 387 | # if $tag is '-NONE-', but $word is none of the above, 388 | # it is probably the empty complementizer: (-NONE- 0) 389 | # -> just ignore 390 | } 391 | else { # no null element 392 | $tag=~s/,/COMMA/g; # replace some special characters 393 | $tag=~s/-LRB-/(/g; 394 | $tag=~s/-RRB-/)/g; 395 | $word=~s/,/COMMA/g; 396 | $word=~s/-LRB-/(/g; 397 | $word=~s/-RRB-/)/g; 398 | $word=~s/-LCB-/{/g; 399 | $word=~s/-RCB-/}/g; 400 | push(@store,'terminal'->new($word_num++,'O',$tag,$word,'NOFUNC')); 401 | $word_number++ ; 402 | } 403 | } 404 | elsif ($sentence=~/^\((\S+)(.*)$/) { # (XP # beginning of non-terminal node 405 | my $index; 406 | my $sublabel=$1; 407 | $sentence=$2; 408 | $depth++; 409 | if ($sublabel=~/^(.+)-([0-9]+)$/) { # special: co-references 410 | $sublabel=$1; 411 | $index=$2; 412 | } 413 | if ($sublabel=~/^(.+)=[0-9]+$/) { # delete gapping-references 414 | $sublabel=$1; 415 | } 416 | my $ref=read_sentence($sublabel,$depth-1); # read constituent 417 | push(@store,$ref); # store constituent as daughter 418 | if (defined($index)) { # was coreferenced 419 | if (defined($tracerefs{$index})) { # trace had been found before filler (forward reference) 420 | $tracerefs{$index}->{reference}=$ref; # trace points to filler 421 | $ref->{trace}=[$tracerefs{$index}]; # filler points to trace 422 | } 423 | else { # filler had been found before trace (backward reference) 424 | $corefs{$index}=$ref; # store in hash for later processing 425 | } 426 | } 427 | } 428 | elsif ($sentence=~/^\((.*)$/) { # ( # same, without label 429 | $sentence=$1; 430 | $depth++; 431 | push(@store,read_sentence('NOLABEL',$depth-1)); 432 | } 433 | elsif ($sentence=~/^\)(.*)$/) { # ) # end of non-terminal node 434 | $depth--; 435 | $sentence=$1; 436 | } 437 | elsif ($sentence=~/^\s+(.*)/) { # blanks 438 | $sentence=$1; 439 | } 440 | else { 441 | print die "$filenumber $sentence_number ERROR: No match(2) $sentence!\n"; 442 | } 443 | } 444 | if ($depth==$mydepth) { # if corresponding closing bracket has been found 445 | return non_terminal->new($label,\@store); # return reference to constituent 446 | } 447 | else { 448 | print die "$filenumber $sentence_number ERROR: $depth $mydepth\n"; 449 | } 450 | } 451 | 452 | ###################################################################### 453 | ### prune : called by main, calls head_of(_,_) 454 | ###################################################################### 455 | 456 | sub verbs_or_adverbs_in_front { 457 | if (@lastnonref+@advps>0 # at least one verb or adverb in front 458 | && $i==@lastnonref+@advps) { # only verbs and adverbs in front 459 | return 1; 460 | } 461 | else { 462 | return 0; 463 | } 464 | } 465 | 466 | sub simple_prune { 467 | splice(@$daughters,$i,1,@{$daughters->[$i]->{daughters}}); # insert contents of ADJP etc. 468 | $i--; 469 | } 470 | 471 | sub prune_drop_first { 472 | splice(@$daughters,$i,1,@{$s_daughters}[1..$#{$s_daughters}]); # insert all but first daughter of S 473 | $i--; 474 | } 475 | 476 | # prune S with (non-)empty subject inside VP 477 | # so that "expected to take" is ONE VP chunk 478 | # so that objects of predicative verbs are indeed recognized as objects 479 | # possibly: so that objects of controll verbs are indeed recognized as objects 480 | sub prune_s_in_vp_condition { 481 | if ( $xp eq 'VP' # && $sub_labelxp eq 'S' # S(-...) in VP 482 | && $sub_label eq 'S' # S in VP: no S-CLR, S-ADV etc. 483 | && verbs_or_adverbs_in_front() # verbs or adverbs in front 484 | && defined($daughters->[$i]->{daughters}) 485 | && @{$daughters->[$i]->{daughters}}>=2 # has at least 2 daughters: 486 | && $daughters->[$i]->{daughters}->[0]->{function} =~ /-SBJ/ # 1) subject: include S-NOM-SBJ 487 | ) { 488 | return 1; 489 | } 490 | else { 491 | return 0; 492 | } 493 | } 494 | 495 | sub prune_s_in_vp { 496 | local $s_daughters=$daughters->[$i]->{daughters}; 497 | if (prune_s_in_vp_predicative_condition()) { 498 | $s_daughters->[0]->{function} =~ s/-SBJ//; # change function to "direct object": also for S-NOM-SBJ 499 | simple_prune(); 500 | } 501 | elsif (prune_s_in_vp_empty_subject_condition()) { 502 | if (defined($s_daughters->[0]->{daughters}->[0]->{reference})) { # trace no longer exists 503 | $s_daughters->[0]->{daughters}->[0]->{reference}->{trace}=undef; 504 | } 505 | prune_drop_first(); 506 | } 507 | } 508 | 509 | # predicative: 510 | # (VP (VBP make) 511 | # (S 512 | # (NP-SBJ (PRP them) ) 513 | # (ADJP-PRD (JJ fearful) ))) 514 | ###################################################### 515 | ### can be changed so as to also apply to cases like: 516 | # (VP (VBP permit) 517 | # (S 518 | # (NP-SBJ (NN portfolio) (NNS managers) ) 519 | # (VP (TO to) 520 | # (VP (VB retain) 521 | sub prune_s_in_vp_predicative_condition { 522 | if ($prune_s_in_vp_predicative_flag 523 | # subject may be trace if verb verb is passivized 524 | && $s_daughters->[1]->{function}=~/-PRD$/ # 2) predicative 525 | ) { 526 | return 1; 527 | } 528 | else { 529 | return 0; 530 | } 531 | } 532 | 533 | # empty subject and VP is infinitive or gerund: 534 | # (VP (VBN expected) 535 | # (S 536 | # (NP-SBJ (-NONE- *-1) ) 537 | # (VP (TO to) 538 | # (VP (VB take) 539 | # (NP (DT another) (JJ sharp) (NN dive) ) 540 | # (VP (VBD evaluated) 541 | # (S 542 | # (NP-SBJ (-NONE- *-2) ) 543 | # (VP (VBG raising) 544 | # (NP (PRP$ our) (NN bid) ))) 545 | sub prune_s_in_vp_empty_subject_condition { 546 | if ($prune_s_in_vp_empty_subject_flag 547 | && @{$s_daughters->[0]->{daughters}}==1 # subject has only one daughter 548 | && ref($s_daughters->[0]->{daughters}->[0]) eq 'trace' # which is trace 549 | && $s_daughters->[1]->{function} eq 'VP' # 2) infinitive/gerund 550 | ) { 551 | return 1; 552 | } 553 | else { 554 | return 0; 555 | } 556 | } 557 | 558 | sub np_condition { 559 | if ($xp eq 'NP' # special case of NPs 560 | && $i!=@$daughters-1 561 | && defined($daughters->[$i+1]->{function}) 562 | && $daughters->[$i+1]->{function}=~/^NP/ # no directly following NP 563 | && defined($daughters->[$i]->{daughters}) 564 | && defined($daughters->[$i]->{daughters}->[-1]) 565 | && defined($daughters->[$i]->{daughters}->[-1]->{pos_tag}) 566 | && $daughters->[$i]->{daughters}->[-1]->{pos_tag} eq 'POS') { # no possessive NP 567 | return 1; 568 | } 569 | else { 570 | return 0; 571 | } 572 | } 573 | 574 | sub check_non_terminal { 575 | if (exists($prune_always{$sub_labelxp})) { 576 | simple_prune(); 577 | } 578 | elsif (exists($prune_if_infrontof_head{$xp}{$sub_labelxp}) 579 | # UCP or ADJP in NP 580 | # WHADJP in WHNP 581 | # ADVP in ADJP 582 | # && @lastnonref==0 # no possible head-word found to date: would result in "a [UCP state and local] [NP utility]" 583 | && @sub_xps==0) { # no possible head-constituent found to date: don't prune "P.V., 61 years old, ..." 584 | simple_prune(); 585 | } 586 | elsif ($prune_vp_in_vp_flag 587 | && $xp eq 'VP' && $sub_labelxp eq 'VP' # VP in VP 588 | && verbs_or_adverbs_in_front()) { # verbs or adverbs in front 589 | simple_prune(); 590 | } 591 | elsif (prune_s_in_vp_condition()) { 592 | prune_s_in_vp(); 593 | } 594 | # special case of ADVP in VP: remember position of ADVP for later pruning 595 | elsif ($xp eq 'VP' && $sub_labelxp eq 'ADVP') { # ADVP in VP 596 | push(@advps,$i); 597 | } 598 | # for determining the head daughter later: remember position of non-terminal daughter 599 | elsif ( defined($headcat{$xp}) 600 | && $sub_label=~/^($headcat{$xp})/) { 601 | if (not(np_condition())) { 602 | push(@sub_xps,$i); # remember position of sub-XP 603 | } 604 | } 605 | # for determining the head daughter later: record if coordinating conjunction found 606 | elsif ($sub_labelxp eq 'CONJP') { 607 | $cc=1; 608 | } 609 | } 610 | 611 | sub check_terminal { 612 | my $postag=$daughters->[$i]->{pos_tag}; 613 | # for determining the head daughter later: remember position of terminal daughter 614 | if (head_of($postag,$xp)) { # restrictions on heads of XPs 615 | push(@lastnonref,$i); # remember position of last word 616 | } 617 | # for determining the head daughter later: record if coordinating conjunction found 618 | if ($postag=~/^CC/ # coordinating conjunction 619 | && $i>0) { # not as first word in phrase 620 | $cc=1; 621 | } 622 | # see above: VP in VP is deleted if only verbs or adverbs in front 623 | elsif ($postag=~/^RB/) { 624 | push(@advps,$i); 625 | } 626 | } 627 | 628 | # determine and mark head(s): default: last word or first XP is head 629 | sub mark_head { 630 | if (@lastnonref>0) { # possible lexical head (none for NOLABEL, ...) 631 | $daughters->[$lastnonref[-1]]->{head_comp}='h'; 632 | } 633 | elsif (@sub_xps>0) { # possible non-lexical head 634 | if ($cc==1) { # coordinated structure 635 | for (my $i=0; $i<@sub_xps; $i++) { 636 | $daughters->[$sub_xps[$i]]->{head_comp}='h'; 637 | } 638 | } 639 | else { # no coordination 640 | $daughters->[$sub_xps[0]]->{head_comp}='h'; 641 | } 642 | } 643 | } 644 | 645 | # prune ADVP inside a VP 646 | sub prune_advp_in_vp { 647 | if ($prune_advp_in_vp_flag 648 | && $xp eq 'VP' && @advps>0 && @lastnonref>0) { # there is an ADVP in a VP 649 | my $i=0; 650 | my $add=0; 651 | my $index; 652 | while ($i<@advps && $advps[$i]<$lastnonref[-1]) { # ADVP is in front of last verb 653 | $index=$advps[$i]+$add; 654 | if (ref($daughters->[$index]) eq 'non_terminal') { # prune only ADVPs not pure RBs 655 | $add+=@{$daughters->[$index]->{daughters}}-1; # -1: original reference is replaced 656 | splice(@$daughters,$index,1,@{$daughters->[$index]->{daughters}}); # insert contents of ADVP 657 | } 658 | $i++; 659 | } 660 | } 661 | } 662 | 663 | sub prune { 664 | local $res=shift(@_); # the actual node 665 | local $i; 666 | local $daughters; # the daughters of the constituent 667 | local $xp; # its syntactic category 668 | local @sub_xps=(); # those daughters that are non-terminals 669 | local @lastnonref=(); # ... terminals 670 | local @advps=(); # ADVPs in VPs 671 | local $cc=0; # boolean: whether coordinating conjunction was found 672 | local $sub_label; # the label of the daughter (e.g. XP-FUNC) 673 | local $sub_labelxp; # its syntactic category (e.g. XP) 674 | if (ref($res) eq 'terminal' || ref($res) eq 'trace') { # stop at word or trace level 675 | return 1; 676 | } 677 | if (ref($res) ne 'non_terminal') { 678 | die "(prune) No match $res ".ref($res)."!\n"; 679 | } 680 | 681 | if ($res->{function}=~/^([A-Z]+)/) { # XP 682 | $xp=$1; # the syntactic category 683 | } 684 | else { 685 | die "(prune) No match $res->{function}=~/^([A-Z]+)/!\n"; 686 | } 687 | 688 | $daughters=$res->{daughters}; # the daughters of the constituent 689 | for ($i=0; $i<@$daughters; $i++) { 690 | $sub_label=$daughters->[$i]->{function}; # the label of the daughter 691 | if ($sub_label=~/^([A-Z]+)/) { # XP 692 | $sub_labelxp=$1; # its syntactic category 693 | } 694 | else { 695 | $sub_labelxp=$sub_label; 696 | } 697 | 698 | if (ref($daughters->[$i]) eq 'non_terminal') { 699 | check_non_terminal(); 700 | } 701 | elsif (ref($daughters->[$i]) eq 'terminal') { 702 | check_terminal(); 703 | } 704 | elsif (@$daughters==1) { # trace must be only daughter 705 | # for determining the head daughter later: remember position of trace daughter 706 | push(@lastnonref,$i); 707 | } 708 | } # end of processing the daughters: for ($i=0; $i<@$daughters; $i++) 709 | 710 | mark_head(); 711 | 712 | prune_advp_in_vp(); 713 | 714 | # recursive call 715 | for ($i=0; $i<@$daughters; $i++) { 716 | prune($daughters->[$i]); 717 | } 718 | } 719 | 720 | 721 | ###################################################################### 722 | ### head_of : called by prune(_), calls nothing 723 | ###################################################################### 724 | 725 | sub head_of { 726 | ($postag,$xp)=@_; 727 | if (defined($head{$xp})) { 728 | if ($postag=~/^($head{$xp})/) { 729 | return 1; 730 | } 731 | else { 732 | return 0; 733 | } 734 | } 735 | else { 736 | return 0; 737 | } 738 | } 739 | 740 | ###################################################################### 741 | ### lexicalize : called by main, calls itself 742 | ###################################################################### 743 | 744 | sub lexicalize { 745 | my $res=shift(@_); 746 | my $i; 747 | if (ref($res) eq 'terminal' || ref($res) eq 'trace') { 748 | # stop at word or trace level 749 | return 1; 750 | } 751 | elsif (ref($res) eq 'non_terminal') { # non_terminal 752 | my $daughters=$res->{daughters}; 753 | for ($i=0; $i<@$daughters; $i++) { 754 | lexicalize($daughters->[$i]); 755 | } 756 | my $headword=[]; 757 | for ($i=0; $i<@$daughters; $i++) { # find the headword(s) 758 | if ($daughters->[$i]->{head_comp} eq 'h' && defined($daughters->[$i]->{lex_head})) { 759 | $headword=[@$headword,@{$daughters->[$i]->{lex_head}}]; 760 | } 761 | } 762 | for ($i=0; $i<@$daughters; $i++) { # copy the headword(s) 763 | $daughters->[$i]->{lex_head}=$headword; 764 | } 765 | $res->{lex_head}=$headword; # copy to mother 766 | } 767 | else { 768 | die "(lexicalize) No match $res ".ref($res)."!\n"; 769 | } 770 | } 771 | 772 | ###################################################################### 773 | ### flatten : called by main, calls itself 774 | ###################################################################### 775 | 776 | sub flatten { 777 | my $res=shift(@_); 778 | my $i; 779 | 780 | if (ref($res) eq 'terminal') 781 | { # stop at word level 782 | return $res; 783 | } 784 | elsif (ref($res) eq 'trace') 785 | { # ??? 786 | return (); 787 | } 788 | elsif (ref($res) eq 'non_terminal' && $res->{function}=~/^([A-Z]+)/) 789 | { # XP 790 | my $xp=$1; 791 | my $daughters=$res->{daughters}; 792 | 793 | $chunk_number++; 794 | 795 | for ($i=0; $i<@$daughters; $i++) 796 | { #loop on daughters 797 | 798 | # For terminals, initialize the IOB tag in a special field 799 | if (ref($daughters->[$i]) eq 'terminal') 800 | { # terminal 801 | $daughters->[$i]->{iob_tag_inner} = 802 | "I-$xp-$chunk_number"; 803 | } 804 | 805 | # for head words 806 | if ($daughters->[$i]->{head_comp} eq 'h') 807 | { 808 | 809 | # find the function: inherit from above, if the upper level 810 | # had already a different function - do not replace it but 811 | # add the current function. 812 | 813 | if ($daughters->[$i]->{function}=~/^$xp/ || 814 | $daughters->[$i]->{function} eq 'NOFUNC') 815 | { # NOFUNC 816 | $daughters->[$i]->{function}=$res->{function}; # copy function to head(s) 817 | } 818 | else 819 | { 820 | $daughters->[$i]->{function} = 821 | $daughters->[$i]->{function} .'/' . $res->{function} ; 822 | } 823 | $daughters->[$i]->{lex_head}=$res->{lex_head}; 824 | if (ref($daughters->[$i]) ne 'trace' 825 | && ref($res->{trace}) eq 'ARRAY') 826 | { # copy trace information 827 | if (ref($daughters->[$i]->{trace}) eq 'ARRAY') { 828 | $daughters->[$i]->{trace}=[@{$res->{trace}},@{$daughters->[$i]->{trace}}]; 829 | } 830 | else { 831 | $daughters->[$i]->{trace}=$res->{trace}; 832 | } 833 | } 834 | } 835 | } # end: loop on daughters 836 | my @output=(); 837 | for ($i=0; $i<@$daughters; $i++) { 838 | push(@output,flatten($daughters->[$i])); 839 | } 840 | return @output; 841 | } 842 | else { 843 | die "(flatten) No match $res ".ref($res)." $res->{function}!\n"; 844 | } 845 | } 846 | 847 | ###################################################################### 848 | ### cut_traces : called by main, calls itself 849 | ###################################################################### 850 | 851 | sub cut_traces { 852 | my $res=shift(@_); 853 | my $i; 854 | 855 | if (ref($res) eq 'terminal' || ref($res) eq 'trace') 856 | { # stop at word or trace level 857 | return 1; 858 | } 859 | elsif (ref($res) eq 'non_terminal') 860 | { # XP 861 | my $daughters=$res->{daughters}; 862 | for ($i=0; $i<@$daughters; $i++) { 863 | cut_traces($daughters->[$i]); 864 | } 865 | for ($i=0; $i<@$daughters; $i++) { 866 | if (ref($daughters->[$i]) eq 'trace' # trace 867 | || (ref($daughters->[$i]) eq 'non_terminal' # or empty constituent 868 | && @{$daughters->[$i]->{daughters}}==0)) { 869 | splice(@$daughters,$i,1); 870 | $i--; 871 | } 872 | } 873 | 874 | } 875 | else { 876 | die "(cut_traces) No match $res ".ref($res)."!\n"; 877 | } 878 | } 879 | 880 | ###################################################################### 881 | ### iob_chain : called by main, calls itself 882 | ###################################################################### 883 | 884 | sub iob_chain { 885 | my $res=shift(@_); 886 | my $i; 887 | 888 | if (ref($res) eq 'terminal' || ref($res) eq 'trace') 889 | { # stop at word or trace level 890 | return 1; 891 | } 892 | elsif (ref($res) eq 'non_terminal' && $res->{function}=~/^([A-Z]+)/) 893 | { # XP 894 | my $xp=$1; 895 | my $daughters=$res->{daughters}; 896 | 897 | for ($i=0; $i<@$daughters; $i++) 898 | { #loop on daughters 899 | 900 | $daughters->[$i]->{iob_tag} = $res->{iob_tag}; 901 | 902 | # If there is only one daughter, use the 'combined' tag 903 | if (@$daughters==1) { 904 | $current_iob_tag = 'C' ; 905 | } 906 | # If we are in the first daughter, use the 'begin' tag 907 | elsif ($i==0) { 908 | $current_iob_tag = 'B' ; 909 | $daughters->[$i]->{iob_tag} =~ s!E-!I-!g ; 910 | $daughters->[$i]->{iob_tag} =~ s!C-!B-!g ; 911 | } 912 | # If we are in the last daughter, use the 'end' tag 913 | elsif ($i==@$daughters-1) { 914 | $current_iob_tag = 'E' ; 915 | $daughters->[$i]->{iob_tag} =~ s!B-!I-!g ; 916 | $daughters->[$i]->{iob_tag} =~ s!C-!E-!g ; 917 | } 918 | # If we are somewhere in the middle, use the 'inside' tag 919 | else { 920 | $current_iob_tag = 'I' ; 921 | $daughters->[$i]->{iob_tag} =~ s!(B|E|C)-!I-!g ; 922 | } 923 | 924 | # Add the IOB tag to the chain 925 | $daughters->[$i]->{iob_tag} .= "$current_iob_tag-$xp/" ; 926 | } # end: loop on daughters 927 | 928 | for ($i=0; $i<@$daughters; $i++) { 929 | iob_chain($daughters->[$i]); 930 | } 931 | } 932 | else { 933 | die "(iob_chain) No match $res ".ref($res)." $res->{function}!\n"; 934 | } 935 | } 936 | 937 | ###################################################################### 938 | ### chunks : called by main, calls nothing 939 | ###################################################################### 940 | 941 | sub chunks { 942 | my $old_nr=-1; 943 | my $head_xp=''; 944 | my $head_nr=-1; 945 | my $xp; 946 | my $ref; 947 | my $prev_ref; 948 | my $i; 949 | for ($i=@flattened-1; $i>=0; $i--) { # start at end 950 | $ref=$flattened[$i]; # this word 951 | if ($i<@flattened-1) { # previous word 952 | $prev_ref=$flattened[$i+1]; 953 | } 954 | if ($ref->{iob_tag_inner}=~/^I-([A-Z]+)-([0-9]+)$/) { # inside 955 | $xp=$1; 956 | $nr=$2; 957 | if ($xp=~/^WH([A-Z]+)$/) { # WHNP, WHPP, WHADVP, WHADJP 958 | $xp=$1; 959 | } 960 | if ($ref->{function}!~/^NOFUNC/) { # word is a head 961 | $head_xp=$xp; 962 | $head_nr=$nr; 963 | $ref->{iob_tag_inner}="I-$xp"; # head is inside a chunk 964 | # } 965 | } 966 | else { # word is no head 967 | if ($xp eq $head_xp && $nr eq $head_nr) { # there has been a head 968 | $ref->{iob_tag_inner}="I-$xp"; # word is inside a chunk 969 | } 970 | elsif ($ref->{pos_tag} eq 'POS') { # special case "'s" "'" 971 | $ref->{iob_tag_inner}="I-NP"; # inside an NP 972 | if (defined($prev_ref) 973 | && ($prev_ref->{iob_tag_inner} =~ /^.-NP/)) { # there is a following NP 974 | $nr=$old_nr; # copy number 975 | if (defined($flattened[$i-1])) { 976 | $ref->{lex_head}=$flattened[$i-1]->{lex_head}; 977 | # lex_head is head of following NP (same as lex_head of previous word) 978 | } 979 | } 980 | else { 981 | $ref->{function}='NP'; # 's is NP of its own ' 982 | $nr=-2; # gets number of its own 983 | } 984 | } 985 | else { 986 | $ref->{iob_tag_inner}='O'; # word is outside a chunk 987 | } 988 | } 989 | if ($old_nr!=$nr) { # chunk boundary 990 | $ref->{iob_tag_inner} =~ s!^I-!E-!; # end of chunk 991 | if (defined($prev_ref)) { 992 | $prev_ref->{iob_tag_inner} =~ s!^I-!B-!; # beginning of chunk 993 | $prev_ref->{iob_tag_inner} =~ s!^E-!C-!; 994 | } 995 | } 996 | $old_nr=$nr; 997 | } 998 | else { 999 | die "File $filenumber: Error: no match $ref->{iob_tag_inner}, $ref->{word}!\n"; 1000 | } 1001 | } 1002 | # first word in sentence must have B- or C-tag (or O-) 1003 | $ref->{iob_tag_inner} =~ s!^I-!B-!; 1004 | $ref->{iob_tag_inner} =~ s!^E-!C-!; 1005 | } 1006 | 1007 | ###################################################################### 1008 | ### print_flatten : called by main, calls nothing 1009 | ###################################################################### 1010 | 1011 | sub print_flatten { 1012 | my $i; 1013 | my $j; 1014 | my $l; 1015 | my $trace; 1016 | my $trace_array; 1017 | my $headword; 1018 | 1019 | if ($sent_each_word == 0) # argument '-s' was given 1020 | { 1021 | # print file name and sentence number before word-list 1022 | printf("\# Sentence %s/%02d\n", $filenumber,$sentence_number) ; 1023 | } 1024 | 1025 | for ($i=0; $i<@flattened; $i++) 1026 | { # loop on words 1027 | if ($sent_each_word) 1028 | { 1029 | printf(" %4s %2d",$filenumber,$sentence_number); 1030 | } 1031 | 1032 | if ($opt_N==0) { 1033 | printf(" %2d",$flattened[$i]->{number}); 1034 | } 1035 | 1036 | if ($opt_i==0) { 1037 | if ($opt_B eq 'Begin') { 1038 | $flattened[$i]->{iob_tag_inner} =~ s!^E-!I-!g ; 1039 | $flattened[$i]->{iob_tag_inner} =~ s!^C-!B-!g ; 1040 | } 1041 | elsif ($opt_B eq 'End') { 1042 | $flattened[$i]->{iob_tag_inner} =~ s!^B-!I-!g ; 1043 | $flattened[$i]->{iob_tag_inner} =~ s!^C-!E-!g ; 1044 | } 1045 | elsif ($opt_B eq 'Between') { 1046 | $flattened[$i]->{iob_tag_inner} =~ s!^E-!I-!g ; 1047 | $flattened[$i]->{iob_tag_inner} =~ s!^C-!B-!g ; 1048 | if ($flattened[$i]->{iob_tag_inner} =~ /^B-(.*)$/ 1049 | && ($i==0 # first word in sentence 1050 | || $flattened[$i-1]->{iob_tag_inner} eq 'O' 1051 | || substr($flattened[$i-1]->{iob_tag_inner},2) ne $1)) { # different category 1052 | $flattened[$i]->{iob_tag_inner}="I-$1"; # is not 'Between' 1053 | } 1054 | } 1055 | # else: $opt_B eq 'BeginEndCombined') : nothing needs to be changed 1056 | 1057 | # ++ new case 1058 | # PU as B-NP or B-VP but the next tag is B-XP 1059 | # --> O 1060 | if( $flattened[$i]->{pos_tag} eq "PU" 1061 | && $flattened[$i]->{iob_tag_inner}=~ /^B-(.*)$/ 1062 | && ($i + 1 == @flattened || $flattened[$i +1]->{iob_tag_inner}=~ /^B-(.*)$|^C-(.*)$|^O$/)){ 1063 | $flattened[$i]->{iob_tag_inner}="O"; 1064 | } 1065 | # ++ new case 1066 | # PU + PU as B-XP + I-XP, but the next tag is O 1067 | # --> O + O 1068 | if( $flattened[$i]->{pos_tag} eq "PU" 1069 | && $flattened[$i]->{iob_tag_inner}=~ /^B-(.*)$/ 1070 | && $i + 1 < @flattened 1071 | && $flattened[$i+1]->{pos_tag} eq "PU" 1072 | && $flattened[$i+1]->{iob_tag_inner}=~ /^I-(.*)$|^E-(.*)$/){ 1073 | $flattened[$i]->{iob_tag_inner}="O"; 1074 | $flattened[$i+1]->{iob_tag_inner}="O"; 1075 | } 1076 | printf (" %-7s", $flattened[$i]->{iob_tag_inner}) ; 1077 | } 1078 | 1079 | if ($opt_p==0) { 1080 | printf(" %-5s",$flattened[$i]->{pos_tag}); 1081 | } 1082 | printf(" %-15s",$flattened[$i]->{word}); 1083 | if ($opt_f==0) { 1084 | printf(" %-15s",$flattened[$i]->{function}); 1085 | } 1086 | 1087 | $headword=$flattened[$i]->{lex_head}; 1088 | if (defined($headword) && @$headword>0) { # has heads 1089 | $printed_head = $headword->[0]->{word} ; 1090 | if ($opt_h==0) { 1091 | for ($j=1; $j<@$headword; $j++) { 1092 | $printed_head .= '/'.$headword->[$j]->{word} ; 1093 | } 1094 | printf(" %-15.30s", $printed_head) ; 1095 | } 1096 | if ($opt_H==0) { 1097 | printf(" %3d",$headword->[0]->{number}); 1098 | for ($j=1; $j<@$headword; $j++) { 1099 | printf("/%d",$headword->[$j]->{number}); 1100 | } 1101 | } 1102 | } 1103 | else { 1104 | if ($opt_h==0) { 1105 | printf(" %-15s",'???'); 1106 | } 1107 | if ($opt_H==0) { 1108 | printf(" %-10s",'???'); 1109 | } 1110 | } 1111 | if ($opt_c==0) { 1112 | chop $flattened[$i]->{iob_tag} ; 1113 | if ($opt_B eq 'Begin' 1114 | || $opt_B eq 'Between') { 1115 | # 'Between' doesn't apply to {iob_tag_inner}: take 'Begin' instead 1116 | $flattened[$i]->{iob_tag} =~ s!E-!I-!g ; 1117 | $flattened[$i]->{iob_tag} =~ s!C-!B-!g ; 1118 | } 1119 | elsif ($opt_B eq 'End') { 1120 | $flattened[$i]->{iob_tag} =~ s!B-!I-!g ; 1121 | $flattened[$i]->{iob_tag} =~ s!C-!E-!g ; 1122 | } 1123 | # else: $opt_B eq 'BeginEndCombined') : nothing needs to be changed 1124 | printf (" %s", $flattened[$i]->{iob_tag}) ; 1125 | } 1126 | 1127 | if (defined($flattened[$i]->{trace})) { 1128 | $trace_array=$flattened[$i]->{trace}; 1129 | for ($l=0; $l<@$trace_array; $l++) { 1130 | $trace=$trace_array->[$l]; 1131 | if ($opt_t==0) { 1132 | printf(" %10s %4s",$trace->{function},$trace->{kind}); 1133 | $headword=$trace->{lex_head}; 1134 | if (defined($headword) && @$headword>0) { # has heads 1135 | printf(" %5d",$headword->[0]->{number}); 1136 | for ($j=1; $j<@$headword; $j++) 1137 | { 1138 | printf("/%d",$headword->[$j]->{number}); 1139 | } 1140 | } 1141 | else { 1142 | printf(" %5s",'???'); 1143 | } 1144 | } 1145 | # break circular references to allow for proper garbage collection 1146 | undef $trace->{reference}; # delete reference from trace to filler 1147 | } 1148 | undef $flattened[$i]->{trace}; # delete references from filler to traces 1149 | } 1150 | 1151 | print "\n"; 1152 | } 1153 | print "\n"; 1154 | } 1155 | 1156 | ###################################################################### 1157 | ### head_medium : called by main, calls nothing 1158 | ###################################################################### 1159 | ### CTB4 1160 | ###################################################################### 1161 | sub head_medium { 1162 | 1163 | # (ADJP-PRD (UH OK)) 1164 | #$head{'ADJP'}='JJ|RB|VB|IN|UH|FW|RP|\$|#|DT|NN'; 1165 | #RP=DEC|DEG|DER|DEV|SP|AS|ETC|SP|MSP 1166 | $head{'ADJP'}='JJ|AD|VA|VC|VE|VV|CS|P|LC|IJ|FW|DEC|DEG|DER|DEV|SP|AS|ETC|SP|MSP|DT|NN|NR|NT'; 1167 | # (ADVP (UH Indeed)) 1168 | #$head{'ADVP'}='RB|IN|TO|DT|PDT|JJ|RP|FW|LS|UH|CC|NN|CD|VB'; 1169 | $head{'ADVP'}='AD|CS|P|LC|DT|JJ|DEC|DEG|DER|DEV|SP|AS|ETC|SP|MSP|FW|IJ|CC|NN|NR|NT|CD|OD|VA|VC|VE|VV'; 1170 | $head{'CLP'}='M'; 1171 | $head{'DP'}='DT|M'; 1172 | $head{'DNP'}='DEG'; 1173 | $head{'DVP'}='DEV'; 1174 | 1175 | #$head{'FRAG'}='[A-Z]'; 1176 | $head{'LCP'}='LC'; 1177 | $head{'LST'}='CD|PU|FW|OD'; 1178 | #$head{'NP'}='NN|CD|PRP|JJ|DT|EX|IN|RB|VB|FW|SYM|UH|WP|WDT'; 1179 | $head{'NP'}='NN|NR|NT|CD|OD|JJ|OD|DT|CS|P|LC|AD|VA|VC|VE|VV|FW|PU|ETC|M|P'; 1180 | #$head{'PP'}='IN|TO|RB|VBG|VBN|JJ|RP|CC|FW'; 1181 | $head{'PP'}='P|CS|P|LC|AD|VA|VC|VE|VV|JJ|OD|DEC|DEG|DER|DEV|SP|AS|ETC|SP|MSP|CC|FW|'; 1182 | #$head{'PRN'}= '[A-Z]'; 1183 | # "第一" 1184 | $head{'QP'}='CD|DT|OD|JJ|M|NN|NR|NT|PU'; 1185 | #$head{'UCP'}='JJ|NN|VB|CD|NR|NT|VA|VC|VE|VV'; 1186 | # AS了|DER得|SP了|BA把|LBlong Bei被|SBShort Bei被 1187 | #$head{'VP'}='VB|MD|TO|JJ|NN|POS|FW|SYM'; 1188 | $head{'VP'}='VA|VC|VE|VV|AS|DER|SP|BA|LB|SB|JJ|OD|FW|NN|NR|NT'; 1189 | ## 1190 | $head{'WHADJP'}='JJ'; 1191 | $head{'WHADVP'}='WRB|IN|RB|WDT'; 1192 | $head{'WHNP'}='NN|NR|NT|PU|ETC|VA|VC|VE|VV|CD|JJ|OD|DT|FW|M|P'; # including WP$ 1193 | $head{'WHPP'}='P'; 1194 | 1195 | 1196 | $headcat{'ADJP'}='ADJP'; 1197 | $headcat{'ADVP'}='ADVP|.*-ADV'; 1198 | 1199 | $headcat{'FRAG'}='FRAG|INTJ|S|VP'; 1200 | $headcat{'CLP'}='CLP'; 1201 | $headcat{'DP'}='DP|CLP'; 1202 | $headcat{'DNP'}='DNP'; 1203 | $headcat{'DVP'}='DVP'; 1204 | $headcat{'LCP'}='LCP'; 1205 | $headcat{'LST'}='LST'; 1206 | $headcat{'NP'}='NP|NX|.*-NOM'; 1207 | $headcat{'PP'}='PP'; 1208 | $headcat{'PRN'}='S|VP'; # not |PRN 1209 | $headcat{'QP'}='CLP|QP'; 1210 | $headcat{'UCP'}='[A-Z]+P(-[-A-Z]+)?$|S'; # not |UCP 1211 | $headcat{'VP'}='VP'; 1212 | 1213 | $headcat{'S'}='S$|VP|.*-PRD'; # special: not every S 1214 | $headcat{'NOLABEL'}='[A-Z]'; 1215 | $headcat{'X'}='S|[A-Z]+P(-[-A-Z]+)?$'; # normally pruned # not |X #' 1216 | 1217 | $headcat{'WHADJP'}='WHADJP|ADJP'; 1218 | $headcat{'WHADVP'}='WHADVP'; 1219 | $headcat{'WHNP'}='WHNP|NP'; 1220 | $headcat{'WHPP'}='WHPP'; 1221 | 1222 | } 1223 | 1224 | ###################################################################### 1225 | ###################################################################### 1226 | ############################# visualize ############################## 1227 | ###################################################################### 1228 | ###################################################################### 1229 | 1230 | sub print_lex_head { 1231 | my $r=shift(@_); # reference to list of objects (terminals) 1232 | my $i; 1233 | if (not(defined($r))) { 1234 | print 'undef'; 1235 | } 1236 | elsif (@$r==0) { 1237 | print "[]"; 1238 | } 1239 | else { 1240 | print "[$r->[0]->{word}"; 1241 | for ($i=1; $i<@$r; $i++) 1242 | { 1243 | print ",$r->[$i]->{word}"; 1244 | } 1245 | print ']'; 1246 | } 1247 | } 1248 | 1249 | sub print_trace { 1250 | my $r=shift(@_); # reference to list of objects (traces) 1251 | my $i; 1252 | for ($i=0; $i<@$r; $i++) { 1253 | print "\n\ttrace->{function}=$r->[$i]->{function}, {kind}=$r->[$i]->{kind}, {lex_head}="; 1254 | print_lex_head($r->[$i]->{lex_head}); 1255 | } 1256 | } 1257 | 1258 | sub print_list { 1259 | my @r=@_; # list of objects (terminals) 1260 | my $i; 1261 | foreach $r (@r) { 1262 | print "terminal->{number}=$r->{number}, {iob_tag}=$r->{iob_tag}, {pos_tag}=$r->{pos_tag}, {word}=$r->{word}, {function}=$r->{function}, {head_comp}=$r->{head_comp}, {lex_head}="; 1263 | print_lex_head($r->{lex_head}); 1264 | if (defined($r->{trace})) { 1265 | print ",{trace}=["; 1266 | print_trace($r->{trace}); 1267 | print "\n\t]"; 1268 | } 1269 | print "\n"; 1270 | } 1271 | } 1272 | sub print_parse_tree { 1273 | my $s=shift(@_); # number of spaces (=indentation for output) 1274 | my $r=shift(@_); # reference to object (parse tree) 1275 | my $i; 1276 | for ($i=0; $i<$s; $i++) { 1277 | print ' '; 1278 | } 1279 | if (ref($r) eq 'terminal') { 1280 | print "terminal->{number}=$r->{number}, {iob_tag}=$r->{iob_tag}, {pos_tag}=$r->{pos_tag}, {word}=$r->{word}, {function}=$r->{function}, {head_comp}=$r->{head_comp}, {lex_head}="; 1281 | print_lex_head($r->{lex_head}); 1282 | if (defined($r->{trace})) { 1283 | print ",{trace}=$r->{trace}"; 1284 | } 1285 | print "\n"; 1286 | } 1287 | elsif (ref($r) eq 'trace') { 1288 | print "trace->{function}=$r->{function}, {kind}=$r->{kind}, {lex_head}="; 1289 | print_lex_head($r->{lex_head}); 1290 | print ", {reference}="; 1291 | if (defined($r->{reference})) { 1292 | print "$r->{reference}"; 1293 | } 1294 | else { 1295 | print 'undef'; 1296 | } 1297 | print ", {head_comp}=$r->{head_comp}\n"; 1298 | } 1299 | elsif (ref($r) eq 'non_terminal') { 1300 | print "non_terminal->{function}=$r->{function}, {lex_head}="; 1301 | print_lex_head($r->{lex_head}); 1302 | if (defined($r->{trace})) { 1303 | print ",{trace}=$r->{trace}"; 1304 | } 1305 | print ", {head_comp}=$r->{head_comp}, {daughters}=[\n"; 1306 | my $daughters=$r->{daughters}; 1307 | for ($i=0; $i<@$daughters; $i++) { 1308 | print_parse_tree($s+1,$daughters->[$i]); 1309 | } 1310 | for ($i=0; $i<$s; $i++) 1311 | { 1312 | print ' '; 1313 | } 1314 | print "]\n"; 1315 | } 1316 | } 1317 | --------------------------------------------------------------------------------