├── README.md
└── chunklinkctb.pl


/README.md:
--------------------------------------------------------------------------------
 1 | ChunkLinkCTB (Chunklink for CTB)
 2 | ============
 3 | 
 4 | A tool for extracting chunks from Penn Chinese Treebank (Chen et. al 2006, ACL-poster), that is A CTB version of chunklink.
 5 | 
 6 | This tool is designed for the old versions of CTB. For CTB7.0 or higher, it needs some changes.
 7 | 
 8 | 宾州中文树库抽取中文短语（组块）工具，适用于CTB5.0及以下。
 9 | 
10 | =============
11 | Related Publications
12 | 
13 | If you would like to check more details about this tool, please check the following papers
14 | 
15 | -Wenliang Chen, Yujie Zhang and Hitoshi Isahara. An Empirical Study of Chinese Chunking, The joint conference of the International Committee on Computational Linguistics and the Association for Computational Linguistics (Coling-ACL2006) (Poster Session), Sydney, Australia, pp. 97-104, Jul. 2006
16 | 


--------------------------------------------------------------------------------
/chunklinkctb.pl:
--------------------------------------------------------------------------------
   1 | #!/usr/local/bin/perl -w
   2 | 
   3 | ###
   4 | ### Produces a convenient list of words, chunks, functions, and links from Penn Chinese TreeBank 4
   5 | ###
   6 | ### 
   7 | ### Modified by Wenliang Chen, NiCT, chenwl@nict.go.jp or chenwenliang@gmail.com, 2005
   8 | ###
   9 | 
  10 | ### Produces a convenient list of words, chunks, functions, and links from Penn TreeBank II
  11 | ###
  12 | ### Written by Sabine Buchholz, ILK, Tilburg University, S.Buchholz@kub.nl, 1998
  13 | ###
  14 | ### Modifications by Yuval Krymolowski, Bar-Ilan University, yuvalk@macs.biu.ac.il, 1999
  15 | ###
  16 | 
  17 | sub help {
  18 | print "call as:                                                                                      \n";
  19 | print "  chunklink_ctb.pl <options> /cdrom/ctb4/data/bracketed/chtb_????.fid | more                  \n";
  20 | print "                                                                                              \n";
  21 | print "options:                                                                                      \n";
  22 | print " -s : Place a '# sentence ID' line before the word-list of each sentence                      \n";
  23 | print "      instead of at the lines of the individual words.                                        \n";
  24 | print "      The sentence ID is file/number, e.g., 0001/01.                                          \n";
  25 | print "                                                                                              \n";
  26 | print " -ns : Enumerate the words inside a sentence, instead of number in the file                   \n";
  27 | print "                                                                                              \n";
  28 | print " -B  : which sort of IOB tags to output; I tags are always inside a chunk, O tags are outside \n";
  29 | print "       possible values are: Begin (the default): B tags for word at the beginning of a chunk  \n";
  30 | print "                            End:                 E tags for word at the end of a chunk        \n";
  31 | print "                            BeginEndCombined:    B tags for word at the beginning of a chunk  \n";
  32 | print "                                                 E tags for word at the end of a chunk        \n";
  33 | print "                                                 C tags for single word chunks                \n";
  34 | print "                            Between:             B tags for words that are at the beginning   \n";
  35 | print "                                                 of a chunk and the previous chunk had the    \n";
  36 | print "                                                 same syntactic category                      \n";
  37 | print "                                    Attention! The last option applies only to the simple     \n";
  38 | print "                                    IOB tag column (e.g. 'I-NP'), not to the IOB chain column \n";
  39 | print "                                    (e.g. 'I-S/I-S/I-NP/I-NP/I-PP/B-NP'). If 'Between', the   \n";
  40 | print "                                    latter column gets the default representation 'Begin'.    \n";
  41 | print "                                                                                              \n";
  42 | print " -N  : suppress word number in output                                                         \n";
  43 | print " -p  :     ...  POS tag ...                                                                   \n";
  44 | print " -f  :          function                                                                      \n";
  45 | print " -h  :          head word                                                                     \n";
  46 | print " -H  :          head number                                                                   \n";
  47 | print " -i  :          IOB tag                                                                       \n";
  48 | print " -c  :          IOB tag chain                                                                 \n";
  49 | print " -t  :          trace information                                                             \n";
  50 | }
  51 | 
  52 | ######################################################################
  53 | ######################################################################
  54 | ############################# initialize #############################
  55 | ######################################################################
  56 | ######################################################################
  57 | 
  58 | ######################################################################
  59 | ### load definition of object data structures to represent parse tree
  60 | ######################################################################
  61 | 
  62 | # use nodes;
  63 | 
  64 | # start of nodes.pm
  65 | 
  66 | package terminal;
  67 | sub new {
  68 |     my $type=shift;
  69 |     my $self={};
  70 |     $self->{number}=shift;
  71 |     $self->{iob_tag}=shift;
  72 |     $self->{iob_tag_inner}=$self->{iob_tag};
  73 |     $self->{pos_tag}=shift;
  74 |     $self->{word}=shift;
  75 |     $self->{function}=shift;
  76 |     $self->{prep}='-';         # for PNP feature: not used by chunklink.pl
  77 |     $self->{adv}='-';          # for ADV feature: not used by chunklink.pl
  78 |     $self->{head_comp}='c';
  79 |     $self->{trace}=undef;
  80 |     my $obj=bless $self, $type;
  81 |     $self->{lex_head}=[$obj];  # head is reference to itself
  82 |     return $obj;
  83 | }
  84 | 
  85 | package non_terminal;
  86 | sub new {
  87 |     my $type=shift;
  88 |     my $self={};
  89 |     my @d=() ;
  90 |     $self->{function}=shift;
  91 |     $self->{lex_head}=undef;
  92 |     $self->{head_comp}='c';
  93 |     $self->{daughters}=shift;
  94 |     $self->{trace}=undef;    
  95 |     $self->{iob_tag}='';
  96 |     return bless $self, $type;
  97 | }
  98 | 
  99 | package trace;
 100 | sub new {
 101 |     my $type=shift;
 102 |     my $self={};
 103 |     $self->{function}=shift;
 104 |     $self->{kind}=shift;
 105 |     $self->{lex_head}=shift;
 106 |     $self->{reference}=shift; # trace points to filler
 107 |     $self->{head_comp}='c';
 108 |     my $obj=bless $self, $type;
 109 |     if (defined($obj->{reference})) {
 110 | 	if (defined($obj->{reference}->{trace}) && ref($obj->{reference}->{trace})  eq 'ARRAY') {
 111 | 	    $obj->{reference}->{trace}=[$obj,@{$obj->{reference}->{trace}}]; # filler points to trace
 112 | 	}
 113 | 	else {
 114 | 	    $obj->{reference}->{trace}=[$obj]; # filler points to trace
 115 | 	}
 116 |     }
 117 |     return $obj;
 118 | }
 119 | 
 120 | package main;
 121 | 
 122 | # return 1;
 123 | 
 124 | # end of nodes.pm
 125 | 
 126 | if (@ARGV==0) {
 127 |     help();
 128 |     exit;
 129 | }
 130 | 
 131 | ######################################################################
 132 | ### check options
 133 | ######################################################################
 134 | 
 135 | require "getopts.pl" ;
 136 | $opt_B = 'Begin' ;
 137 | $opt_n = '' ;
 138 | $opt_s = '' ;
 139 | $opt_N = 0 ; # word number
 140 | $opt_p = 0 ; # POS tag
 141 | $opt_f = 0 ; # function
 142 | $opt_h = 0 ; # head
 143 | $opt_H = 0 ; # head number
 144 | $opt_i = 0 ; # IOB tag
 145 | $opt_c = 0 ; # IOB tag chain
 146 | $opt_t = 0 ; # trace information
 147 | &Getopts("B:n:sNpfhHict") ;
 148 | 
 149 | $sent_each_word = 1 ;       # default - print the sentence for each word
 150 | if ($opt_s)
 151 | {
 152 |     $sent_each_word = 0 ;
 153 | }
 154 | 
 155 | $word_enumerate = 'file' ;  # default - word number runs through all the file
 156 | 
 157 | if ($opt_n =~ /^s/)
 158 | {
 159 |     $word_enumerate = 'sent' ; # word number runs in current sentence
 160 | }
 161 | 
 162 | printf("#arguments: IOB tag: $opt_B, word numbering: $word_enumerate\n") ;
 163 | printf("#columns:") ;
 164 | 
 165 | if ($sent_each_word)
 166 | {
 167 |   printf(" file_id sent_id") ;
 168 | }
 169 | if ($opt_N==0)
 170 | { 
 171 |    printf(" word_id") ;
 172 | }
 173 | if ($opt_i==0)
 174 | { 
 175 |   printf(" iob_inner") ;
 176 | }
 177 | if ($opt_p==0)
 178 | { 
 179 |   printf(" pos") ;
 180 | }
 181 | printf(" word") ;
 182 | if ($opt_f==0)
 183 | { 
 184 |   printf(" function") ;
 185 | }
 186 | if ($opt_h==0)
 187 | { 
 188 |   printf(" heads") ;
 189 | }
 190 | if ($opt_H==0)
 191 | { 
 192 |   printf(" head_ids") ;
 193 | }
 194 | if ($opt_c==0)
 195 | { 
 196 |   printf(" iob_chain") ;
 197 | }
 198 | if ($opt_t==0)
 199 | { 
 200 |   printf(" trace-function trace-type trace-head_ids") ;
 201 | }
 202 | printf ("\n") ;
 203 | 
 204 | ######################################################################
 205 | ### load definition of constituent head
 206 | ######################################################################
 207 | 
 208 | head_medium();
 209 | 
 210 | ######################################################################
 211 | ### what to prune
 212 | ######################################################################
 213 | 
 214 | ### comment in if any such constituent should be pruned
 215 | $prune_always{'NAC'}=1; 
 216 | $prune_always{'NX'}=1;
 217 | $prune_always{'X'}=1;
 218 | ##CTB4 for verb phrases
 219 | $prune_always{'VCD'}=1;
 220 | $prune_always{'VRD'}=1;
 221 | $prune_always{'VSB'}=1;
 222 | $prune_always{'VCP'}=1;
 223 | $prune_always{'VNV'}=1;
 224 | $prune_always{'VPT'}=1;
 225 | 
 226 | 
 227 | ### comment in if the second constituent should be pruned 
 228 | ### when appearing as daughter of the first
 229 | ### in front of any potential head
 230 | $prune_if_infrontof_head{'NP'}{'ADJP'}=1;
 231 | $prune_if_infrontof_head{'NP'}{'UCP'}=1;
 232 | $prune_if_infrontof_head{'WHNP'}{'WHADJP'}=1;
 233 | $prune_if_infrontof_head{'ADJP'}{'ADVP'}=1;
 234 | ##CTB4 CLP will be pruned after QP
 235 | $prune_if_infrontof_head{'QP'}{'CLP'}=1;
 236 | $prune_if_infrontof_head{'DP'}{'CLP'}=1;
 237 | 
 238 | ### set to 1 if these should be pruned
 239 | ###        0 otherwise
 240 | $prune_s_in_vp_predicative_flag=1;
 241 | $prune_s_in_vp_empty_subject_flag=1;
 242 | $prune_vp_in_vp_flag=1;
 243 | $prune_advp_in_vp_flag=1;
 244 | 	   
 245 | ######################################################################
 246 | ######################################################################
 247 | ################################ main ################################
 248 | ######################################################################
 249 | ######################################################################
 250 | 
 251 | $word_number=0;   # initialize: for unique word number over whole corpus
 252 | foreach $file (@ARGV) {                     # loop: for each treebank file
 253 |     if (defined($file)) {                   # open for reading
 254 | 	open(INPUT,"<$file") or die "Cannot open $file!\n";
 255 |     }
 256 |     else {
 257 | 	die "file not defined!\n";
 258 |     }
 259 |     if ($file =~ /\/chtb_([0-9]+)\.fid$/) {  # extract file number
 260 | 	$filenumber=$1;
 261 |     }
 262 |     else {
 263 | 	$filenumber=1;#die "$file does not match!\n";
 264 |     }
 265 |     print STDERR "$filenumber ";
 266 |     $sentence='';                           # initialize sentence string
 267 |     $sentence_number=0;                     # initialize sentence counter
 268 |     while (defined($sentence=<INPUT>)) {    # loop: until end of file
 269 | 	$sent_flag = 1;
 270 | 	$result=start_read();               # read sentence into $result
 271 | 	last if($sent_flag == 0);
 272 | 	
 273 | 	if (defined($result->{function})    # delete outermost parentheses
 274 | 	    && $result->{function} eq 'NOLABEL' 
 275 | 	    && @{$result->{daughters}}==1)
 276 | 	    {
 277 | 		$result=$result->{daughters}->[0];
 278 | 	    }
 279 | 
 280 | # set the word counter within file or within sentence
 281 | 	if ($word_enumerate eq 'file')
 282 | 	{
 283 | 	    $word_num = $word_number ;
 284 | 	}
 285 | 	else # 'sent'
 286 | 	{
 287 | 	    $word_num = 0 ;
 288 | 	}
 289 | #print "\n################## start_read ####################\n\n"; print_parse_tree(0,$result); print "\n\n";
 290 | 	prune($result);                     # prune parse tree and determine heads of constituents
 291 | #print "################## prune ####################\n\n"; print_parse_tree(0,$result); print "\n\n";
 292 | 	lexicalize($result);                # lexicalize parse tree
 293 | #print "################## lexicalize ####################\n\n"; print_parse_tree(0,$result); print "\n\n";
 294 | 	@flattened=flatten($result);        # flatten tree into sequence of chunks
 295 | #print "################## flatten ####################\n\n"; print_list(@flattened); print "\n\n";
 296 | 	if ($opt_c==0) {                    # if {iob_tag} feature must be computed
 297 | 	    cut_traces($result);            # remove traces from the tree
 298 | 	    iob_chain($result);             # compute the value of the {iob_tag} feature
 299 | 	}
 300 | 	chunks();                           # compute IOB-tags
 301 | #print "################## chunks ####################\n\n"; print_list(@flattened); print "\n\n";
 302 | #print "################## result ####################\n\n"; 
 303 | 	print_flatten();                    # print output
 304 |     }
 305 | }
 306 | print STDERR "\n$word_number words processed\n";
 307 | 
 308 | ######################################################################
 309 | ######################################################################
 310 | ################################ subs ################################
 311 | ######################################################################
 312 | ######################################################################
 313 | 
 314 | ######################################################################
 315 | ### start_read : called by main, calls read_sentence(_,_)
 316 | ######################################################################
 317 | 
 318 | sub start_read {
 319 | # skip all lines not of the form "   (..." (e.g. blank lines)
 320 |     while ($sentence!~/^\s*\((.*)$/ 
 321 | 	   && defined($sentence=<INPUT>)) {};
 322 |     if(!defined($sentence)){
 323 | 	$sent_flag = 0;
 324 | 	return \'';
 325 | 	}
 326 |     chop($sentence);
 327 | # consumes first opening bracket of sentence
 328 | # calls read_sentence to read input until corresponding closing bracket is found
 329 |     if ($sentence=~/^\s*\((.*)$/)
 330 |     { 
 331 | 	$sentence=$1;
 332 | 	$depth=1;
 333 | 	$sentence_number++;
 334 | 	$chunk_number=0; 
 335 | 	undef %corefs;
 336 | 	undef %tracerefs;
 337 | 	return read_sentence('NOLABEL',$depth-1);
 338 |     }
 339 |     else
 340 |     {
 341 | 	return \'';   # '
 342 |     }
 343 | }
 344 | 
 345 | ######################################################################
 346 | ### read_sentence : called by start_read(), calls itself, 
 347 | ###                 'trace'->new(_,_,_,_), 'terminal'->new(_,_,_,_,_), 
 348 | ###                 non_terminal->new(_,_)
 349 | ######################################################################
 350 | 
 351 | sub read_sentence {
 352 |     my $label=shift(@_);             # the label of the constituent that is to be read
 353 |     my $mydepth=shift(@_);           # at which bracket depth the constituent starts/ends
 354 |     my @store=();                    # references to the daughters of the constituent will go in here
 355 | 
 356 |     while ($depth>$mydepth) {        # while closing bracket of constituent is not yet found
 357 | 	if (length($sentence)==0) {      # if necessary:
 358 | 	    if (defined($sentence=<INPUT>)) { # read in new line
 359 | 		chop($sentence);
 360 | 	    }
 361 | 	    else {
 362 | 		die "$filenumber $sentence_number ERROR (input finished before sentence was complete)!\n";
 363 | 	    }
 364 | 	}
 365 | 	if ($sentence=~/^\(([^ ()]+) ([^ ()]+)\)(.*)$/) { # (NN man)           # terminal node
 366 | 	    $tag=$1;
 367 | 	    $word=$2;
 368 | 	    $sentence=$3;
 369 | 	    if ($tag eq '-NONE-') {                                            # null element
 370 | 		if ($word=~/^(\*T\*|\*U\*|\*NOT\*|\*RNR\*|\*ICH\*|\*EXP\*|\*PPA\*|\*\?\*|\*)(.*)$/) { # trace
 371 | 		    my $kind=$1;
 372 | 		    my $rest=$2;
 373 | 		    if (defined($rest) && $rest=~/^-([0-9]+)$/) {              # co-referenced
 374 | 			if (defined($corefs{$1})) {                            # co-reference already defined (backward reference)
 375 | 			    push(@store,'trace'->new('NOFUNC',$kind,undef,$corefs{$1}));
 376 | 			}
 377 | 			else {                                                 # co-reference not yet defined (forward reference)
 378 | 			    my $ref='trace'->new('NOFUNC',$kind,undef,undef);
 379 | 			    push(@store,$ref);			
 380 | 			    $tracerefs{$1}=$ref;                               # store in hash for later processing
 381 | 			}
 382 | 		    }
 383 | 		    else {                                                     # not coreferenced
 384 | 			push(@store,'trace'->new('NOFUNC',$kind,undef,undef));			
 385 | 		    }
 386 | 		} # no else:
 387 |                   # if $tag is '-NONE-', but $word is none of the above, 
 388 |                   # it is probably the empty complementizer: (-NONE- 0)
 389 |                   # -> just ignore
 390 | 	    }
 391 | 	    else {                                                             # no null element
 392 | 		$tag=~s/,/COMMA/g;   # replace some special characters
 393 | 		$tag=~s/-LRB-/(/g;
 394 | 		$tag=~s/-RRB-/)/g;
 395 | 		$word=~s/,/COMMA/g;
 396 | 		$word=~s/-LRB-/(/g;
 397 | 		$word=~s/-RRB-/)/g;
 398 | 		$word=~s/-LCB-/{/g;
 399 | 		$word=~s/-RCB-/}/g;
 400 | 		push(@store,'terminal'->new($word_num++,'O',$tag,$word,'NOFUNC'));
 401 | 		$word_number++ ;
 402 | 	    }
 403 | 	}
 404 | 	elsif ($sentence=~/^\((\S+)(.*)$/) { # (XP                             # beginning of non-terminal node
 405 | 	    my $index;
 406 | 	    my $sublabel=$1;
 407 | 	    $sentence=$2;
 408 | 	    $depth++;
 409 | 	    if ($sublabel=~/^(.+)-([0-9]+)$/) {                                # special: co-references
 410 | 		$sublabel=$1;
 411 | 		$index=$2;
 412 | 	    }
 413 | 	    if ($sublabel=~/^(.+)=[0-9]+$/) {                                  # delete gapping-references
 414 | 		$sublabel=$1;
 415 | 	    }
 416 | 	    my $ref=read_sentence($sublabel,$depth-1);    # read constituent
 417 | 	    push(@store,$ref);                            # store constituent as daughter
 418 | 	    if (defined($index)) {                                             # was coreferenced
 419 | 		if (defined($tracerefs{$index})) {                             # trace had been found before filler (forward reference)
 420 | 		    $tracerefs{$index}->{reference}=$ref;                      # trace points to filler
 421 | 		    $ref->{trace}=[$tracerefs{$index}];                        # filler points to trace
 422 | 		}
 423 | 		else {                                                         # filler had been found before trace (backward reference)
 424 | 		    $corefs{$index}=$ref;                                      # store in hash for later processing
 425 | 		}
 426 | 	    }
 427 | 	}
 428 | 	elsif ($sentence=~/^\((.*)$/) { # (                                    # same, without label
 429 | 	    $sentence=$1;
 430 | 	    $depth++;
 431 | 	    push(@store,read_sentence('NOLABEL',$depth-1));
 432 | 	}
 433 | 	elsif ($sentence=~/^\)(.*)$/) { # )                                    # end of non-terminal node
 434 | 	    $depth--;
 435 | 	    $sentence=$1;
 436 | 	}	
 437 | 	elsif ($sentence=~/^\s+(.*)/) {                                        # blanks
 438 | 	    $sentence=$1;
 439 | 	}
 440 | 	else {
 441 | 	    print die "$filenumber $sentence_number ERROR: No match(2) $sentence!\n";
 442 | 	}	
 443 |     }
 444 |     if ($depth==$mydepth) {                             # if corresponding closing bracket has been found
 445 | 	return non_terminal->new($label,\@store);       # return reference to constituent
 446 |     }
 447 |     else {
 448 | 	print die "$filenumber $sentence_number ERROR: $depth $mydepth\n";
 449 |     }
 450 | }
 451 | 
 452 | ######################################################################
 453 | ### prune : called by main, calls head_of(_,_)
 454 | ######################################################################
 455 | 
 456 | sub verbs_or_adverbs_in_front {
 457 |     if (@lastnonref+@advps>0         # at least one verb or adverb in front
 458 | 	&& $i==@lastnonref+@advps) { # only verbs and adverbs in front
 459 | 	return 1;
 460 |     }
 461 |     else {
 462 | 	return 0;
 463 |     }
 464 | }
 465 | 
 466 | sub simple_prune {
 467 |     splice(@$daughters,$i,1,@{$daughters->[$i]->{daughters}});        # insert contents of ADJP etc.
 468 |     $i--;
 469 | }
 470 | 
 471 | sub prune_drop_first {
 472 |     splice(@$daughters,$i,1,@{$s_daughters}[1..$#{$s_daughters}]); # insert all but first daughter of S
 473 |     $i--;		    
 474 | }
 475 | 
 476 | # prune S with (non-)empty subject inside VP
 477 | #    so that "expected to take" is ONE VP chunk
 478 | #    so that objects of predicative verbs are indeed recognized as objects
 479 | #    possibly: so that objects of controll verbs are indeed recognized as objects
 480 | sub prune_s_in_vp_condition {
 481 |     if ( $xp eq 'VP' # && $sub_labelxp eq 'S'                          # S(-...) in VP 
 482 |                       && $sub_label eq 'S'                             # S in VP: no S-CLR, S-ADV etc.
 483 | 	&& verbs_or_adverbs_in_front()                                 # verbs or adverbs in front
 484 | 	&& defined($daughters->[$i]->{daughters})
 485 | 	&& @{$daughters->[$i]->{daughters}}>=2                         # has at least 2 daughters:
 486 | 	&& $daughters->[$i]->{daughters}->[0]->{function} =~ /-SBJ/    # 1) subject: include S-NOM-SBJ
 487 | 	) {
 488 | 	return 1;
 489 |     }
 490 |     else {
 491 | 	return 0;
 492 |     }
 493 | }
 494 | 	
 495 | sub prune_s_in_vp {
 496 |     local $s_daughters=$daughters->[$i]->{daughters};
 497 |     if (prune_s_in_vp_predicative_condition()) {
 498 | 	$s_daughters->[0]->{function} =~ s/-SBJ//;       # change function to "direct object": also for S-NOM-SBJ
 499 | 	simple_prune();
 500 |     }
 501 |     elsif (prune_s_in_vp_empty_subject_condition()) {
 502 | 	if (defined($s_daughters->[0]->{daughters}->[0]->{reference})) { # trace no longer exists
 503 | 	    $s_daughters->[0]->{daughters}->[0]->{reference}->{trace}=undef;
 504 | 	}
 505 | 	prune_drop_first();
 506 |     }
 507 | }
 508 | 
 509 | #    predicative:
 510 | #    (VP (VBP make) 
 511 | #      (S 
 512 | #        (NP-SBJ (PRP them) )
 513 | #        (ADJP-PRD (JJ fearful) )))
 514 | ######################################################
 515 | ### can be changed so as to also apply to cases like:
 516 | #    (VP (VBP permit) 
 517 | #      (S 
 518 | #        (NP-SBJ (NN portfolio) (NNS managers) )
 519 | #        (VP (TO to) 
 520 | #          (VP (VB retain) 
 521 | sub prune_s_in_vp_predicative_condition {
 522 |     if ($prune_s_in_vp_predicative_flag
 523 | # subject may be trace if verb verb is passivized
 524 | 	&& $s_daughters->[1]->{function}=~/-PRD$/                         # 2) predicative
 525 | 	) {
 526 | 	return 1;
 527 |     }
 528 |     else {
 529 | 	return 0;
 530 |     }
 531 | }
 532 | 
 533 | #    empty subject and VP is infinitive or gerund:
 534 | #    (VP (VBN expected) 
 535 | #      (S 
 536 | #        (NP-SBJ (-NONE- *-1) )
 537 | #        (VP (TO to) 
 538 | #          (VP (VB take) 
 539 | #            (NP (DT another) (JJ sharp) (NN dive) )
 540 | #    (VP (VBD evaluated) 
 541 | #      (S 
 542 | #        (NP-SBJ (-NONE- *-2) )
 543 | #        (VP (VBG raising) 
 544 | #          (NP (PRP$ our) (NN bid) )))
 545 | sub prune_s_in_vp_empty_subject_condition {
 546 |     if ($prune_s_in_vp_empty_subject_flag
 547 | 	&& @{$s_daughters->[0]->{daughters}}==1                        #    subject has only one daughter
 548 | 	&& ref($s_daughters->[0]->{daughters}->[0]) eq 'trace'         #    which is trace
 549 | 	&& $s_daughters->[1]->{function} eq 'VP'                       # 2) infinitive/gerund
 550 | 	) {
 551 | 	return 1;
 552 |     }
 553 |     else {
 554 | 	return 0;
 555 |     }
 556 | }
 557 | 
 558 | sub np_condition {
 559 |     if ($xp eq 'NP'                                                  # special case of NPs
 560 | 	&& $i!=@$daughters-1 
 561 | 	   && defined($daughters->[$i+1]->{function})
 562 | 	&& $daughters->[$i+1]->{function}=~/^NP/                      # no directly following NP
 563 | 	   && defined($daughters->[$i]->{daughters})
 564 | 	   && defined($daughters->[$i]->{daughters}->[-1])
 565 | 	   && defined($daughters->[$i]->{daughters}->[-1]->{pos_tag})
 566 | 	&& $daughters->[$i]->{daughters}->[-1]->{pos_tag} eq 'POS') { # no possessive NP
 567 | 	return 1;
 568 |     }
 569 |     else {
 570 | 	return 0;
 571 |     }
 572 | }
 573 | 
 574 | sub check_non_terminal {
 575 |     if (exists($prune_always{$sub_labelxp})) {
 576 | 	simple_prune();
 577 |     }
 578 |     elsif (exists($prune_if_infrontof_head{$xp}{$sub_labelxp})
 579 | # UCP or ADJP in NP
 580 | # WHADJP in WHNP
 581 | # ADVP in ADJP
 582 | #	   && @lastnonref==0  # no possible head-word found to date: would result in "a [UCP state and local] [NP utility]"
 583 | 	   && @sub_xps==0) {  # no possible head-constituent found to date: don't prune "P.V., 61 years old, ..."
 584 | 	simple_prune();
 585 |     }
 586 |     elsif ($prune_vp_in_vp_flag
 587 | 	   && $xp eq 'VP' && $sub_labelxp eq 'VP'    # VP in VP
 588 | 	   && verbs_or_adverbs_in_front()) {      #    verbs or adverbs in front
 589 | 	simple_prune();
 590 |     }
 591 |     elsif (prune_s_in_vp_condition()) {
 592 | 	prune_s_in_vp();
 593 |     }
 594 | # special case of ADVP in VP: remember position of ADVP for later pruning
 595 |     elsif ($xp eq 'VP' && $sub_labelxp eq 'ADVP') { # ADVP in VP
 596 | 	push(@advps,$i);
 597 |     }
 598 | # for determining the head daughter later: remember position of non-terminal daughter
 599 |     elsif ( defined($headcat{$xp}) 
 600 | 	   && $sub_label=~/^($headcat{$xp})/) {
 601 | 	if (not(np_condition())) {
 602 | 	    push(@sub_xps,$i);                  # remember position of sub-XP
 603 | 	}
 604 |     }
 605 | # for determining the head daughter later: record if coordinating conjunction found
 606 |     elsif ($sub_labelxp eq 'CONJP') { 
 607 | 	$cc=1;
 608 |     }
 609 | }
 610 | 
 611 | sub check_terminal {
 612 |     my $postag=$daughters->[$i]->{pos_tag};
 613 | # for determining the head daughter later: remember position of terminal daughter
 614 |     if (head_of($postag,$xp)) {    # restrictions on heads of XPs
 615 | 	push(@lastnonref,$i);      # remember position of last word
 616 |     }
 617 | # for determining the head daughter later: record if coordinating conjunction found
 618 |     if ($postag=~/^CC/             # coordinating conjunction
 619 | 	&& $i>0) {                 # not as first word in phrase
 620 | 	$cc=1;
 621 |     }
 622 | # see above: VP in VP is deleted if only verbs or adverbs in front
 623 |     elsif ($postag=~/^RB/) {       
 624 | 	push(@advps,$i);
 625 |     }
 626 | }
 627 | 
 628 | # determine and mark head(s): default: last word or first XP is head
 629 | sub mark_head {
 630 |     if (@lastnonref>0) {                                # possible lexical head (none for NOLABEL, ...)
 631 | 	$daughters->[$lastnonref[-1]]->{head_comp}='h';
 632 |     }
 633 |     elsif (@sub_xps>0) {                                # possible non-lexical head
 634 | 	if ($cc==1) {                                   # coordinated structure
 635 | 	    for (my $i=0; $i<@sub_xps; $i++) {
 636 | 		$daughters->[$sub_xps[$i]]->{head_comp}='h';
 637 | 	    }
 638 | 	}
 639 | 	else {                                          # no coordination
 640 | 	    $daughters->[$sub_xps[0]]->{head_comp}='h';
 641 | 	}
 642 |     }
 643 | }
 644 | 
 645 | # prune ADVP inside a VP
 646 | sub prune_advp_in_vp {
 647 |     if ($prune_advp_in_vp_flag
 648 | 	&& $xp eq 'VP' && @advps>0 && @lastnonref>0) {                     # there is an ADVP in a VP
 649 | 	my $i=0; 
 650 | 	my $add=0; 
 651 | 	my $index;
 652 | 	while ($i<@advps && $advps[$i]<$lastnonref[-1]) {               # ADVP is in front of last verb
 653 | 	    $index=$advps[$i]+$add;
 654 | 	    if (ref($daughters->[$index]) eq 'non_terminal') {          # prune only ADVPs not pure RBs
 655 | 		$add+=@{$daughters->[$index]->{daughters}}-1;           # -1: original reference is replaced
 656 | 		splice(@$daughters,$index,1,@{$daughters->[$index]->{daughters}});   # insert contents of ADVP
 657 | 	    }
 658 | 	    $i++;
 659 | 	}
 660 |     }    
 661 | }
 662 | 
 663 | sub prune {
 664 |     local $res=shift(@_); # the actual node
 665 |     local $i;
 666 |     local $daughters;     # the daughters of the constituent
 667 |     local $xp;            # its syntactic category
 668 |     local @sub_xps=();    # those daughters that are non-terminals
 669 |     local @lastnonref=(); #                      ... terminals
 670 |     local @advps=();      # ADVPs in VPs
 671 |     local $cc=0;          # boolean: whether coordinating conjunction was found
 672 |     local $sub_label;     # the label of the daughter (e.g. XP-FUNC)
 673 |     local $sub_labelxp;   # its syntactic category    (e.g. XP)
 674 |     if (ref($res) eq 'terminal' || ref($res) eq 'trace') { # stop at word or trace level
 675 | 	return 1;
 676 |     }
 677 |     if (ref($res) ne 'non_terminal') {
 678 | 	die "(prune) No match $res ".ref($res)."!\n";
 679 |     }
 680 | 
 681 |     if ($res->{function}=~/^([A-Z]+)/) {  # XP
 682 | 	$xp=$1;                    # the syntactic category
 683 |     }
 684 |     else {
 685 | 	die "(prune) No match $res->{function}=~/^([A-Z]+)/!\n";
 686 |     }
 687 | 
 688 |     $daughters=$res->{daughters};  # the daughters of the constituent
 689 |     for ($i=0; $i<@$daughters; $i++) {
 690 | 	$sub_label=$daughters->[$i]->{function}; # the label of the daughter
 691 | 	if ($sub_label=~/^([A-Z]+)/) { # XP
 692 | 	    $sub_labelxp=$1;                     # its syntactic category
 693 | 	}
 694 | 	else {		
 695 | 	    $sub_labelxp=$sub_label;
 696 | 	}
 697 | 
 698 | 	if (ref($daughters->[$i]) eq 'non_terminal') {
 699 | 	    check_non_terminal();
 700 | 	}
 701 | 	elsif (ref($daughters->[$i]) eq 'terminal') {
 702 | 	    check_terminal();
 703 | 	}
 704 | 	elsif (@$daughters==1) {               # trace must be only daughter
 705 | # for determining the head daughter later: remember position of trace daughter
 706 | 	    push(@lastnonref,$i);      
 707 | 	}
 708 |     } # end of processing the daughters: for ($i=0; $i<@$daughters; $i++) 
 709 |     
 710 |     mark_head();
 711 |     
 712 |     prune_advp_in_vp();
 713 |     
 714 | # recursive call
 715 |     for ($i=0; $i<@$daughters; $i++) {
 716 | 	prune($daughters->[$i]);
 717 |     }
 718 | }
 719 | 
 720 | 
 721 | ######################################################################
 722 | ### head_of : called by prune(_), calls nothing
 723 | ######################################################################
 724 | 
 725 | sub head_of {
 726 |     ($postag,$xp)=@_;
 727 |     if (defined($head{$xp})) {
 728 | 	if ($postag=~/^($head{$xp})/) {
 729 | 	    return 1;
 730 | 	}
 731 | 	else {
 732 | 	    return 0;
 733 | 	}
 734 |     }
 735 |     else {
 736 | 	return 0;
 737 |     }
 738 | }
 739 | 
 740 | ######################################################################
 741 | ### lexicalize : called by main, calls itself
 742 | ######################################################################
 743 | 
 744 | sub lexicalize {
 745 |     my $res=shift(@_);
 746 |     my $i;
 747 |     if (ref($res) eq 'terminal' || ref($res) eq 'trace') {  
 748 | # stop at word or trace level
 749 | 	return 1;
 750 |     }
 751 |     elsif (ref($res) eq 'non_terminal') { # non_terminal
 752 | 	my $daughters=$res->{daughters};
 753 | 	for ($i=0; $i<@$daughters; $i++) {
 754 | 	    lexicalize($daughters->[$i]);
 755 | 	}
 756 | 	my $headword=[];
 757 | 	for ($i=0; $i<@$daughters; $i++) { # find the headword(s)
 758 | 	    if ($daughters->[$i]->{head_comp} eq 'h' && defined($daughters->[$i]->{lex_head})) {
 759 | 		$headword=[@$headword,@{$daughters->[$i]->{lex_head}}];
 760 | 	    }
 761 | 	}
 762 | 	for ($i=0; $i<@$daughters; $i++) { # copy the headword(s)
 763 | 	    $daughters->[$i]->{lex_head}=$headword;
 764 | 	}
 765 | 	$res->{lex_head}=$headword; # copy to mother
 766 |     }
 767 |     else {
 768 | 	die "(lexicalize) No match $res ".ref($res)."!\n";
 769 |     }
 770 | }
 771 | 
 772 | ######################################################################
 773 | ### flatten : called by main, calls itself
 774 | ######################################################################
 775 | 
 776 | sub flatten {
 777 |     my $res=shift(@_);
 778 |     my $i;
 779 |     
 780 |     if (ref($res) eq 'terminal')
 781 |     { # stop at word level
 782 | 	return $res;
 783 |     }
 784 |     elsif (ref($res) eq 'trace')
 785 |     { # ???
 786 | 	return ();
 787 |     }
 788 |     elsif (ref($res) eq 'non_terminal' && $res->{function}=~/^([A-Z]+)/)
 789 |     { # XP
 790 | 	my $xp=$1;
 791 | 	my $daughters=$res->{daughters};
 792 | 	
 793 | 	$chunk_number++;
 794 | 	
 795 | 	for ($i=0; $i<@$daughters; $i++)
 796 | 	{ #loop on daughters
 797 | 	    
 798 | 	    # For terminals, initialize the IOB tag in a special field
 799 | 	    if (ref($daughters->[$i]) eq 'terminal')
 800 | 	    { # terminal
 801 | 		$daughters->[$i]->{iob_tag_inner} =
 802 | 		    "I-$xp-$chunk_number";
 803 | 	    }
 804 | 	    
 805 | 	    # for head words
 806 | 	    if ($daughters->[$i]->{head_comp} eq 'h')
 807 | 	    {
 808 | 		
 809 | 		# find the function: inherit from above, if the upper level
 810 | 		# had already a different function - do not replace it but
 811 | 		# add the current function.
 812 | 		
 813 | 		if ($daughters->[$i]->{function}=~/^$xp/ ||
 814 | 		    $daughters->[$i]->{function} eq 'NOFUNC')
 815 | 		{ # NOFUNC
 816 | 		    $daughters->[$i]->{function}=$res->{function}; # copy function to head(s)
 817 | 		}
 818 | 		else
 819 | 		{
 820 | 		    $daughters->[$i]->{function} =
 821 | 			$daughters->[$i]->{function} .'/' . $res->{function} ;
 822 | 		}
 823 | 		$daughters->[$i]->{lex_head}=$res->{lex_head};
 824 | 		if (ref($daughters->[$i]) ne 'trace'
 825 | 		    && ref($res->{trace}) eq 'ARRAY')
 826 | 		{ # copy trace information
 827 | 		    if (ref($daughters->[$i]->{trace}) eq 'ARRAY') {
 828 | 			$daughters->[$i]->{trace}=[@{$res->{trace}},@{$daughters->[$i]->{trace}}];
 829 | 		    }
 830 | 		    else {
 831 | 			$daughters->[$i]->{trace}=$res->{trace};
 832 | 		    }
 833 | 		}
 834 | 	    }
 835 | 	} # end: loop on daughters
 836 | 	my @output=();
 837 | 	for ($i=0; $i<@$daughters; $i++) {
 838 | 	    push(@output,flatten($daughters->[$i]));
 839 | 	}
 840 | 	return @output;
 841 |     }
 842 |     else {
 843 | 	die "(flatten) No match $res ".ref($res)." $res->{function}!\n";
 844 |     }
 845 | }
 846 | 
 847 | ######################################################################
 848 | ### cut_traces : called by main, calls itself
 849 | ######################################################################
 850 | 
 851 | sub cut_traces {
 852 |     my $res=shift(@_);
 853 |     my $i;
 854 |     
 855 |     if (ref($res) eq 'terminal' || ref($res) eq 'trace')
 856 |     { # stop at word or trace level
 857 | 	return 1;
 858 |     }
 859 |     elsif (ref($res) eq 'non_terminal')
 860 |     { # XP
 861 | 	my $daughters=$res->{daughters};
 862 | 	for ($i=0; $i<@$daughters; $i++) {
 863 | 	    cut_traces($daughters->[$i]);
 864 | 	}
 865 | 	for ($i=0; $i<@$daughters; $i++) {
 866 | 	    if (ref($daughters->[$i]) eq 'trace'               # trace
 867 | 		|| (ref($daughters->[$i]) eq 'non_terminal'    # or empty constituent
 868 | 		    && @{$daughters->[$i]->{daughters}}==0)) {
 869 | 		splice(@$daughters,$i,1);                     
 870 | 		$i--;                                         
 871 | 	    }
 872 | 	}
 873 | 	
 874 |     }
 875 |     else {
 876 | 	die "(cut_traces) No match $res ".ref($res)."!\n";
 877 |     }
 878 | }
 879 | 
 880 | ######################################################################
 881 | ### iob_chain : called by main, calls itself
 882 | ######################################################################
 883 | 
 884 | sub iob_chain {
 885 |     my $res=shift(@_);
 886 |     my $i;
 887 |     
 888 |     if (ref($res) eq 'terminal' || ref($res) eq 'trace')
 889 |     { # stop at word or trace level
 890 | 	return 1;
 891 |     }
 892 |     elsif (ref($res) eq 'non_terminal' && $res->{function}=~/^([A-Z]+)/)
 893 |     { # XP
 894 |       my $xp=$1;
 895 |       my $daughters=$res->{daughters};
 896 | 
 897 |       for ($i=0; $i<@$daughters; $i++)
 898 |       { #loop on daughters
 899 | 
 900 | 	  $daughters->[$i]->{iob_tag} = $res->{iob_tag};
 901 | 
 902 |         # If there is only one daughter, use the 'combined' tag
 903 | 	  if (@$daughters==1) {
 904 | 	      $current_iob_tag = 'C' ;
 905 | 	  }
 906 |         # If we are in the first daughter, use the 'begin' tag
 907 | 	  elsif ($i==0) {
 908 | 	      $current_iob_tag = 'B' ;
 909 | 	      $daughters->[$i]->{iob_tag} =~ s!E-!I-!g ;
 910 | 	      $daughters->[$i]->{iob_tag} =~ s!C-!B-!g ;
 911 | 	  }
 912 |         # If we are in the last daughter, use the 'end' tag
 913 | 	  elsif ($i==@$daughters-1) {
 914 | 	      $current_iob_tag = 'E' ;
 915 | 	      $daughters->[$i]->{iob_tag} =~ s!B-!I-!g ;
 916 | 	      $daughters->[$i]->{iob_tag} =~ s!C-!E-!g ;
 917 | 	  }
 918 | 	# If we are somewhere in the middle, use the 'inside' tag
 919 | 	  else {
 920 | 	      $current_iob_tag = 'I' ;
 921 | 	      $daughters->[$i]->{iob_tag} =~ s!(B|E|C)-!I-!g ;
 922 | 	  }
 923 | 
 924 |         # Add the IOB tag to the chain
 925 | 	$daughters->[$i]->{iob_tag} .= "$current_iob_tag-$xp/" ;
 926 |       } # end: loop on daughters
 927 |       
 928 |       for ($i=0; $i<@$daughters; $i++) {
 929 | 	  iob_chain($daughters->[$i]);
 930 |       }
 931 |     }
 932 |     else {
 933 | 	die "(iob_chain) No match $res ".ref($res)." $res->{function}!\n";
 934 |     }
 935 | }
 936 | 
 937 | ######################################################################
 938 | ### chunks : called by main, calls nothing
 939 | ######################################################################
 940 | 
 941 | sub chunks {
 942 |     my $old_nr=-1;
 943 |     my $head_xp='';
 944 |     my $head_nr=-1;
 945 |     my $xp;
 946 |     my $ref;
 947 |     my $prev_ref;
 948 |     my $i;
 949 |     for ($i=@flattened-1; $i>=0; $i--) { # start at end
 950 | 	$ref=$flattened[$i]; # this word
 951 | 	if ($i<@flattened-1) { # previous word
 952 | 	    $prev_ref=$flattened[$i+1];
 953 | 	}
 954 | 	if ($ref->{iob_tag_inner}=~/^I-([A-Z]+)-([0-9]+)$/) { # inside
 955 | 	    $xp=$1;
 956 | 	    $nr=$2;
 957 | 	    if ($xp=~/^WH([A-Z]+)$/) { # WHNP, WHPP, WHADVP, WHADJP
 958 | 		$xp=$1;
 959 | 	    }
 960 | 	    if ($ref->{function}!~/^NOFUNC/) { # word is a head
 961 | 		    $head_xp=$xp;
 962 | 		    $head_nr=$nr;
 963 | 		    $ref->{iob_tag_inner}="I-$xp"; # head is inside a chunk
 964 | #		}
 965 | 	    }
 966 | 	    else { # word is no head
 967 | 		if ($xp eq $head_xp && $nr eq $head_nr) { # there has been a head
 968 | 		    $ref->{iob_tag_inner}="I-$xp"; # word is inside a chunk
 969 | 		}
 970 | 		elsif ($ref->{pos_tag} eq 'POS') { # special case "'s" "'"
 971 | 		    $ref->{iob_tag_inner}="I-NP"; # inside an NP
 972 | 		    if (defined($prev_ref) 
 973 | 			&& ($prev_ref->{iob_tag_inner} =~ /^.-NP/)) { # there is a following NP
 974 | 			$nr=$old_nr; # copy number
 975 | 			if (defined($flattened[$i-1])) {
 976 | 			    $ref->{lex_head}=$flattened[$i-1]->{lex_head}; 
 977 | # lex_head is head of following NP (same as lex_head of previous word)
 978 | 			}
 979 | 		    }
 980 | 		    else {
 981 | 			$ref->{function}='NP'; # 's is NP of its own '
 982 | 			$nr=-2;                # gets number of its own
 983 | 		    }
 984 | 		}
 985 | 		else {
 986 | 		    $ref->{iob_tag_inner}='O'; # word is outside a chunk
 987 | 		}
 988 | 	    }
 989 | 	    if ($old_nr!=$nr) { # chunk boundary
 990 | 		$ref->{iob_tag_inner} =~ s!^I-!E-!;          # end of chunk
 991 | 		if (defined($prev_ref)) {
 992 | 		    $prev_ref->{iob_tag_inner} =~ s!^I-!B-!; # beginning of chunk
 993 | 		    $prev_ref->{iob_tag_inner} =~ s!^E-!C-!;
 994 | 		}
 995 | 	    }		
 996 | 	    $old_nr=$nr;
 997 | 	}
 998 | 	else {
 999 | 	    die "File $filenumber: Error: no match $ref->{iob_tag_inner}, $ref->{word}!\n";
1000 | 	}
1001 |     }
1002 |     # first word in sentence must have B- or C-tag (or O-)
1003 |     $ref->{iob_tag_inner} =~ s!^I-!B-!; 
1004 |     $ref->{iob_tag_inner} =~ s!^E-!C-!;
1005 | }
1006 | 
1007 | ######################################################################
1008 | ### print_flatten : called by main, calls nothing
1009 | ######################################################################
1010 | 
1011 | sub print_flatten {
1012 |     my $i;
1013 |     my $j;
1014 |     my $l;
1015 |     my $trace;
1016 |     my $trace_array;
1017 |     my $headword;
1018 | 
1019 |     if ($sent_each_word == 0)      # argument '-s' was given
1020 |     {
1021 | 	# print file name and sentence number before word-list
1022 | 	printf("\# Sentence %s/%02d\n", $filenumber,$sentence_number) ;
1023 |     }
1024 |     
1025 |     for ($i=0; $i<@flattened; $i++)
1026 |     { # loop on words
1027 | 	if ($sent_each_word)
1028 | 	{
1029 | 	    printf(" %4s %2d",$filenumber,$sentence_number);
1030 | 	}
1031 | 
1032 | 	if ($opt_N==0) { 
1033 | 	    printf(" %2d",$flattened[$i]->{number}); 
1034 | 	}
1035 | 
1036 | 	if ($opt_i==0) { 
1037 | 	    if ($opt_B eq 'Begin') {   
1038 | 		$flattened[$i]->{iob_tag_inner} =~ s!^E-!I-!g ;
1039 | 		$flattened[$i]->{iob_tag_inner} =~ s!^C-!B-!g ;
1040 | 	    }
1041 | 	    elsif ($opt_B eq 'End') {
1042 | 		$flattened[$i]->{iob_tag_inner} =~ s!^B-!I-!g ;
1043 | 		$flattened[$i]->{iob_tag_inner} =~ s!^C-!E-!g ;
1044 | 	    }
1045 | 	    elsif ($opt_B eq 'Between') { 
1046 | 		$flattened[$i]->{iob_tag_inner} =~ s!^E-!I-!g ;
1047 | 		$flattened[$i]->{iob_tag_inner} =~ s!^C-!B-!g ;
1048 | 		if ($flattened[$i]->{iob_tag_inner} =~ /^B-(.*)$/
1049 | 		    && ($i==0                               # first word in sentence
1050 | 			|| $flattened[$i-1]->{iob_tag_inner} eq 'O'
1051 | 			|| substr($flattened[$i-1]->{iob_tag_inner},2) ne $1)) { # different category
1052 | 		    $flattened[$i]->{iob_tag_inner}="I-$1"; # is not 'Between'
1053 | 		}
1054 | 	    }
1055 | 	    # else: $opt_B eq 'BeginEndCombined') : nothing needs to be changed
1056 | 	    
1057 | 	    # ++ new case
1058 | 	    # PU as B-NP or B-VP but the next tag is B-XP
1059 | 	    # --> O
1060 | 	    if(    $flattened[$i]->{pos_tag} eq "PU" 
1061 | 		&& $flattened[$i]->{iob_tag_inner}=~ /^B-(.*)$/
1062 | 		&& ($i + 1 == @flattened || $flattened[$i +1]->{iob_tag_inner}=~ /^B-(.*)$|^C-(.*)$|^O$/)){
1063 | 	  	$flattened[$i]->{iob_tag_inner}="O";
1064 | 	    }
1065 | 	    # ++ new case
1066 | 	    # PU + PU as B-XP + I-XP, but the next tag is O		
1067 | 	    # --> O + O
1068 | 	    if(    $flattened[$i]->{pos_tag} eq "PU" 
1069 | 		&& $flattened[$i]->{iob_tag_inner}=~ /^B-(.*)$/
1070 | 		&& $i + 1 < @flattened
1071 | 		&& $flattened[$i+1]->{pos_tag} eq "PU" 
1072 | 		&& $flattened[$i+1]->{iob_tag_inner}=~ /^I-(.*)$|^E-(.*)$/){
1073 | 		$flattened[$i]->{iob_tag_inner}="O";
1074 | 		$flattened[$i+1]->{iob_tag_inner}="O";
1075 | 	    }	
1076 | 	    printf (" %-7s", $flattened[$i]->{iob_tag_inner}) ;
1077 | 	}
1078 | 
1079 | 	if ($opt_p==0) { 
1080 | 	    printf(" %-5s",$flattened[$i]->{pos_tag}); 
1081 | 	}
1082 | 	printf(" %-15s",$flattened[$i]->{word});
1083 | 	if ($opt_f==0) { 
1084 | 	    printf(" %-15s",$flattened[$i]->{function}); 
1085 | 	}
1086 | 
1087 | 	$headword=$flattened[$i]->{lex_head};
1088 | 	if (defined($headword) && @$headword>0) { # has heads
1089 | 	    $printed_head = $headword->[0]->{word} ;
1090 | 	    if ($opt_h==0) { 
1091 | 		for ($j=1; $j<@$headword; $j++) {
1092 | 		    $printed_head .= '/'.$headword->[$j]->{word} ;
1093 | 		}
1094 | 		printf(" %-15.30s", $printed_head) ;
1095 | 	    }
1096 | 	    if ($opt_H==0) { 
1097 | 		printf(" %3d",$headword->[0]->{number});
1098 | 		for ($j=1; $j<@$headword; $j++) {
1099 | 		    printf("/%d",$headword->[$j]->{number});
1100 | 		}
1101 | 	    }
1102 | 	}
1103 | 	else {
1104 | 	    if ($opt_h==0) { 
1105 | 		printf(" %-15s",'???');
1106 | 	    }
1107 | 	    if ($opt_H==0) { 
1108 | 		printf(" %-10s",'???');
1109 | 	    }
1110 | 	}
1111 | 	if ($opt_c==0) { 
1112 | 	    chop $flattened[$i]->{iob_tag} ;
1113 | 	    if ($opt_B eq 'Begin'     
1114 | 		|| $opt_B eq 'Between') { 
1115 | # 'Between' doesn't apply to {iob_tag_inner}: take 'Begin' instead
1116 | 		$flattened[$i]->{iob_tag} =~ s!E-!I-!g ;
1117 | 		$flattened[$i]->{iob_tag} =~ s!C-!B-!g ;
1118 | 	    }
1119 | 	    elsif ($opt_B eq 'End') {
1120 | 		$flattened[$i]->{iob_tag} =~ s!B-!I-!g ;
1121 | 		$flattened[$i]->{iob_tag} =~ s!C-!E-!g ;
1122 | 	    }
1123 |             # else: $opt_B eq 'BeginEndCombined') : nothing needs to be changed
1124 | 	    printf (" %s", $flattened[$i]->{iob_tag}) ;
1125 | 	}
1126 | 
1127 | 	if (defined($flattened[$i]->{trace})) { 
1128 | 	    $trace_array=$flattened[$i]->{trace};
1129 | 	    for ($l=0; $l<@$trace_array; $l++) {
1130 | 		$trace=$trace_array->[$l];
1131 | 		if ($opt_t==0) { 
1132 | 		    printf(" %10s %4s",$trace->{function},$trace->{kind});
1133 | 		    $headword=$trace->{lex_head};
1134 | 		    if (defined($headword) && @$headword>0) { # has heads
1135 | 			printf(" %5d",$headword->[0]->{number});
1136 | 			for ($j=1; $j<@$headword; $j++)
1137 | 			{
1138 | 			    printf("/%d",$headword->[$j]->{number});
1139 | 			}
1140 | 		    }
1141 | 		    else {
1142 | 			printf(" %5s",'???');
1143 | 		    }
1144 | 		}
1145 | # break circular references to allow for proper garbage collection
1146 | 		undef $trace->{reference}; # delete reference from trace to filler
1147 | 	    }
1148 | 	    undef $flattened[$i]->{trace}; # delete references from filler to traces
1149 | 	}
1150 | 
1151 |         print "\n";
1152 |     }
1153 |     print "\n";
1154 | }
1155 | 
1156 | ######################################################################
1157 | ### head_medium : called by main, calls nothing
1158 | ######################################################################
1159 | ### CTB4
1160 | ######################################################################
1161 | sub head_medium {
1162 | 
1163 | # (ADJP-PRD (UH OK))
1164 | #$head{'ADJP'}='JJ|RB|VB|IN|UH|FW|RP|\$|#|DT|NN';
1165 | #RP=DEC|DEG|DER|DEV|SP|AS|ETC|SP|MSP
1166 | $head{'ADJP'}='JJ|AD|VA|VC|VE|VV|CS|P|LC|IJ|FW|DEC|DEG|DER|DEV|SP|AS|ETC|SP|MSP|DT|NN|NR|NT';
1167 | # (ADVP (UH Indeed))
1168 | #$head{'ADVP'}='RB|IN|TO|DT|PDT|JJ|RP|FW|LS|UH|CC|NN|CD|VB';
1169 | $head{'ADVP'}='AD|CS|P|LC|DT|JJ|DEC|DEG|DER|DEV|SP|AS|ETC|SP|MSP|FW|IJ|CC|NN|NR|NT|CD|OD|VA|VC|VE|VV';
1170 | $head{'CLP'}='M'; 
1171 | $head{'DP'}='DT|M'; 
1172 | $head{'DNP'}='DEG'; 
1173 | $head{'DVP'}='DEV'; 
1174 | 
1175 | #$head{'FRAG'}='[A-Z]'; 
1176 | $head{'LCP'}='LC'; 
1177 | $head{'LST'}='CD|PU|FW|OD';
1178 | #$head{'NP'}='NN|CD|PRP|JJ|DT|EX|IN|RB|VB|FW|SYM|UH|WP|WDT';
1179 | $head{'NP'}='NN|NR|NT|CD|OD|JJ|OD|DT|CS|P|LC|AD|VA|VC|VE|VV|FW|PU|ETC|M|P';
1180 | #$head{'PP'}='IN|TO|RB|VBG|VBN|JJ|RP|CC|FW';
1181 | $head{'PP'}='P|CS|P|LC|AD|VA|VC|VE|VV|JJ|OD|DEC|DEG|DER|DEV|SP|AS|ETC|SP|MSP|CC|FW|';
1182 | #$head{'PRN'}= '[A-Z]';
1183 | # "第一" 
1184 | $head{'QP'}='CD|DT|OD|JJ|M|NN|NR|NT|PU';
1185 | #$head{'UCP'}='JJ|NN|VB|CD|NR|NT|VA|VC|VE|VV';
1186 | # AS了|DER得|SP了|BA把|LBlong Bei被|SBShort Bei被
1187 | #$head{'VP'}='VB|MD|TO|JJ|NN|POS|FW|SYM';
1188 | $head{'VP'}='VA|VC|VE|VV|AS|DER|SP|BA|LB|SB|JJ|OD|FW|NN|NR|NT';
1189 | ##
1190 | $head{'WHADJP'}='JJ';
1191 | $head{'WHADVP'}='WRB|IN|RB|WDT';
1192 | $head{'WHNP'}='NN|NR|NT|PU|ETC|VA|VC|VE|VV|CD|JJ|OD|DT|FW|M|P'; # including WP$
1193 | $head{'WHPP'}='P';
1194 | 
1195 | 
1196 | $headcat{'ADJP'}='ADJP'; 
1197 | $headcat{'ADVP'}='ADVP|.*-ADV';
1198 | 
1199 | $headcat{'FRAG'}='FRAG|INTJ|S|VP'; 
1200 | $headcat{'CLP'}='CLP';
1201 | $headcat{'DP'}='DP|CLP';
1202 | $headcat{'DNP'}='DNP';
1203 | $headcat{'DVP'}='DVP';
1204 | $headcat{'LCP'}='LCP';
1205 | $headcat{'LST'}='LST';
1206 | $headcat{'NP'}='NP|NX|.*-NOM';
1207 | $headcat{'PP'}='PP'; 
1208 | $headcat{'PRN'}='S|VP'; # not |PRN
1209 | $headcat{'QP'}='CLP|QP'; 
1210 | $headcat{'UCP'}='[A-Z]+P(-[-A-Z]+)?$|S'; # not |UCP
1211 | $headcat{'VP'}='VP';
1212 | 
1213 | $headcat{'S'}='S$|VP|.*-PRD'; # special: not every S
1214 | $headcat{'NOLABEL'}='[A-Z]';
1215 | $headcat{'X'}='S|[A-Z]+P(-[-A-Z]+)?$'; # normally pruned # not |X #'
1216 | 
1217 | $headcat{'WHADJP'}='WHADJP|ADJP';
1218 | $headcat{'WHADVP'}='WHADVP';
1219 | $headcat{'WHNP'}='WHNP|NP';
1220 | $headcat{'WHPP'}='WHPP';
1221 | 
1222 | }
1223 | 
1224 | ######################################################################
1225 | ######################################################################
1226 | ############################# visualize ##############################
1227 | ######################################################################
1228 | ######################################################################
1229 | 
1230 | sub print_lex_head {
1231 |     my $r=shift(@_); # reference to list of objects (terminals)
1232 |     my $i;
1233 |     if (not(defined($r))) {
1234 | 	print 'undef';
1235 |     }
1236 |     elsif (@$r==0) {
1237 | 	print "[]";
1238 |     }
1239 |     else {
1240 | 	print "[$r->[0]->{word}";
1241 | 	for ($i=1; $i<@$r; $i++)
1242 | 	{
1243 | 	    print ",$r->[$i]->{word}";
1244 | 	}	    
1245 | 	print ']';
1246 |     }
1247 | }
1248 | 
1249 | sub print_trace {
1250 |     my $r=shift(@_); # reference to list of objects (traces)
1251 |     my $i;
1252 |     for ($i=0; $i<@$r; $i++) {
1253 | 	print "\n\ttrace->{function}=$r->[$i]->{function}, {kind}=$r->[$i]->{kind}, {lex_head}=";
1254 | 	print_lex_head($r->[$i]->{lex_head});
1255 |     }	    
1256 | }
1257 | 
1258 | sub print_list {
1259 |     my @r=@_; # list of objects (terminals)
1260 |     my $i;
1261 |     foreach $r (@r) {
1262 | 	print "terminal->{number}=$r->{number}, {iob_tag}=$r->{iob_tag}, {pos_tag}=$r->{pos_tag}, {word}=$r->{word}, {function}=$r->{function}, {head_comp}=$r->{head_comp}, {lex_head}=";
1263 | 	print_lex_head($r->{lex_head});
1264 | 	if (defined($r->{trace})) {
1265 | 	    print ",{trace}=[";
1266 | 	    print_trace($r->{trace});
1267 | 	    print "\n\t]";
1268 | 	}
1269 | 	print "\n";
1270 |     }
1271 | }
1272 | sub print_parse_tree {
1273 |     my $s=shift(@_); # number of spaces (=indentation for output)
1274 |     my $r=shift(@_); # reference to object (parse tree)
1275 |     my $i;
1276 |     for ($i=0; $i<$s; $i++) {
1277 | 	print '  ';
1278 |     }
1279 |     if (ref($r) eq 'terminal') {
1280 | 	print "terminal->{number}=$r->{number}, {iob_tag}=$r->{iob_tag}, {pos_tag}=$r->{pos_tag}, {word}=$r->{word}, {function}=$r->{function}, {head_comp}=$r->{head_comp}, {lex_head}=";
1281 | 	print_lex_head($r->{lex_head});
1282 | 	if (defined($r->{trace})) {
1283 | 	    print ",{trace}=$r->{trace}";
1284 | 	}
1285 | 	print "\n";
1286 |     }
1287 |     elsif (ref($r) eq 'trace') {
1288 | 	print "trace->{function}=$r->{function}, {kind}=$r->{kind}, {lex_head}=";
1289 | 	print_lex_head($r->{lex_head});
1290 | 	print ", {reference}=";
1291 | 	if (defined($r->{reference})) {
1292 | 	    print "$r->{reference}";
1293 | 	}
1294 | 	else {
1295 | 	    print 'undef';
1296 | 	}
1297 | 	print ", {head_comp}=$r->{head_comp}\n";
1298 |     }
1299 |     elsif (ref($r) eq 'non_terminal') {
1300 | 	print "non_terminal->{function}=$r->{function}, {lex_head}=";
1301 | 	print_lex_head($r->{lex_head});
1302 | 	if (defined($r->{trace})) {
1303 | 	    print ",{trace}=$r->{trace}";
1304 | 	}
1305 | 	print ", {head_comp}=$r->{head_comp}, {daughters}=[\n";
1306 | 	my $daughters=$r->{daughters};
1307 | 	for ($i=0; $i<@$daughters; $i++) {
1308 | 	    print_parse_tree($s+1,$daughters->[$i]);
1309 | 	}
1310 | 	for ($i=0; $i<$s; $i++)
1311 | 	{
1312 | 	    print '  ';
1313 | 	}
1314 | 	print "]\n";
1315 |     }
1316 | }
1317 | 


--------------------------------------------------------------------------------