├── .gitignore ├── LICENSE ├── README ├── morphology_categories ├── README ├── ReadDotGraphs.pm ├── morphology.pl └── noun │ ├── animacy.dot │ ├── arbitrary_noun_class.dot │ ├── associativity.dot │ ├── case.dot │ ├── definiteness.dot │ ├── deictic_distance.dot │ ├── number.dot │ ├── sex.dot │ └── spatial_relation.dot └── phonology ├── CXS.yml ├── FeatureSystem.pm ├── IPA_HTML.yml ├── PhoneSet.pm ├── PhonologicalRule.pm ├── Phonology.pm ├── PhonologyDescriber.pm ├── PhonologySynchronicState.pm ├── README ├── Transcription.pm ├── features.yml ├── gleb.cgi ├── gleb.pl └── phon_descr.yml /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.tgz 3 | old/ 4 | old-notes/ 5 | discards/ 6 | temp.html 7 | jimhenry/ 8 | nytprof* 9 | random-language-xcode/ 10 | 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2011--2019 Alex Fink 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | The (unattainable, in its full scope) aim of this project is to procedurally generate an entire naturalistic conlang. So far the following two components exist. 2 | 3 | - morphology_categories/morphology.pl: systems of morphological contrasts (but not their forms). Data files have been written for nouns. 4 | 5 | - phonology/gleb.pl: phonology, including allophony and other such processes. 6 | An nearly current version of gleb is running at https://gleb.000024.org/ . 7 | 8 | Raise comments of general interest on the CONLANG list, http://listserv.brown.edu/archives/conlang.html , or write me at 000024@gmail.com. 9 | 10 | See also the list of other software conlanging tools at http://www.frathwiki.com/Software_tools_for_conlanging . 11 | -------------------------------------------------------------------------------- /morphology_categories/README: -------------------------------------------------------------------------------- 1 | morphology.pl is a script for generateing systems of morphological contrasts: case systems, TAM systems, and all that. The included DOT data files in the directory noun/ describe a number of common systems of contrasts for nouns (verbs and perhaps other classes are coming, eventually). Each contrast is a single hypothetical morphological slot; it is output as a list of values grouped into several lines, where each line is a single form and the elements of that line are its functions. For example, a case contrast starting 2 | 3 | intransitive_subject agent experiencer [...] 4 | focus patient [...] 5 | [possibly more lines here] 6 | 7 | describes a nominative-accusative case system. 8 | 9 | Invoking morphology.pl with a directory argument as in 10 | perl morphology.pl noun 11 | generates a series of contrasts choosing among all categories in that directory. Invoking it with a list of filenames, 12 | perl morphology.pl noun/case.dot noun/number.dot 13 | generates a single contrast fusionally expressing all the named categories. 14 | -------------------------------------------------------------------------------- /morphology_categories/ReadDotGraphs.pm: -------------------------------------------------------------------------------- 1 | # ReadDotGraphs module for morphology.pl 2 | # Read dot graphs using Graph::Reader::Dot, and convert them to how 3 | # morphology.pl prefers them. 4 | 5 | use strict; 6 | use Graph::Reader::Dot; 7 | use Graph; 8 | use YAML::Any; 9 | 10 | package ReadDotGraphs; 11 | 12 | BEGIN { 13 | use Exporter(); 14 | our @ISA = qw(Exporter); 15 | our @EXPORT = qw( &read ); 16 | } 17 | 18 | my $debug = 0; 19 | 20 | sub read { 21 | $Graph::Reader::Dot::UseNodeAttr = $Graph::Reader::Dot::UseEdgeAttr = 1; 22 | my $reader = Graph::Reader::Dot->new(); 23 | my $g = $reader->read_graph(shift); 24 | 25 | my $cat = {graph => {}}; 26 | # name is a graph attribute 27 | while ((my $att, $_) = each %{$g->get_graph_attributes()}) { 28 | # as below 29 | $cat->{$att} = /^\s*[\{\[](.*)[\}\]]\s*$/ ? YAML::Any::Load($_) : $_; 30 | } 31 | # TODO graph attributes 32 | for my $v ($g->vertices) { 33 | $cat->{graph}{$v} = {edges => []}; 34 | if (defined $g->get_vertex_attributes($v)) { 35 | while ((my $att, $_) = each %{$g->get_vertex_attributes($v)}) { 36 | # If the value is delimited in [] or {}, take it as an inline-type YAML object. 37 | $cat->{graph}{$v}{$att} = /^\s*[\{\[](.*)[\}\]]\s*$/ ? YAML::Any::Load($_) : $_; 38 | } 39 | } 40 | } 41 | 42 | for my $e ($g->edges) { 43 | my $modifier = ''; 44 | my $twoway; 45 | if (defined $g->get_edge_attributes($e->[0], $e->[1])) { 46 | while ((my $att, $_) = each %{$g->get_edge_attributes($e->[0], $e->[1])}) { 47 | # The only edge attribute that the morphology graph code can handle at present is weight. 48 | $modifier .= ":$_" if ($att eq 'weight'); 49 | $twoway = $_ if ($att eq 'twoway'); 50 | } 51 | } 52 | push @{$cat->{graph}{$e->[0]}{edges}}, $e->[1] . $modifier; 53 | push @{$cat->{graph}{$e->[1]}{edges}}, $e->[0] . $modifier if $twoway; 54 | } 55 | 56 | if ($debug) { 57 | $, = ", "; 58 | $\ = "\n"; 59 | print $cat->{odds}; 60 | for my $x (keys %{$cat->{graph}}) { 61 | print "***"; 62 | print $x; 63 | print @{$cat->{graph}{$x}{edges}}; 64 | print $cat->{graph}{$x}{seed_weight}; 65 | # one thing of each type to see if it's working 66 | print @{$cat->{graph}{$x}{product_with}} if defined $cat->{graph}{$x}{product_with}; 67 | print %{$cat->{graph}{$x}{syncretism}} if defined $cat->{graph}{$x}{syncretism}; 68 | } 69 | } 70 | 71 | return $cat; 72 | } 73 | 74 | 1; 75 | 76 | =pod 77 | 78 | The special edge attribute 'twoway' causes normally directed edges 79 | to be interpreted as undirected. 80 | 81 | =cut 82 | 83 | -------------------------------------------------------------------------------- /morphology_categories/morphology.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Generate random systems of morphological contrasts, such as case. 3 | # Alex Fink, July 2009, after an algorithm of Jim Henry 4 | # http://listserv.brown.edu/archives/cgi-bin/wa?A2=ind0907b&L=conlang&P=11401 5 | # Thanks also to Jim Henry for suggestions. 6 | 7 | use strict; 8 | use ReadDotGraphs; 9 | 10 | my $debug = 0; 11 | 12 | # Later: Add unmarkedness to nodes, incl. special unmarkedness in products, 13 | # for use in the morphological form generation. 14 | # (It seems to me that the unmarkedness should be high if the seed_weight is high, 15 | # but not likewise for low; and the unmarkedness should be low if the 16 | # probability of retention is low.) 17 | 18 | # Also probably boundness for contrasts (what things are usually clitics, 19 | # what tightly woven in?) 20 | 21 | # Things that were on my whiteboard: 22 | # - what should be done about that e.g. inanimate agents aren't likely? 23 | # - sporadically having a value have multiple expressions 24 | # - how to handle optionality? here? in syntax? both? 25 | 26 | srand; 27 | 28 | my $categories = {}; 29 | 30 | =pod 31 | 32 | C randomly chooses a string (without its weight suffix) from the given list, 33 | with items suffixed by ":I" for a number n weighted by n and others weighted by 1. 34 | C parses this notation and returns (weight, token with weight stripped). 35 | I've used the colon syntax for conformance with John Fisher's words.pl. 36 | 37 | =cut 38 | 39 | sub weighted_one_of { 40 | my $sum = 0; 41 | $sum += /:(.*)$/ ? $1 : 1 for @_; 42 | $sum = rand $sum; 43 | for (@_) { 44 | # an assumption here about number formats 45 | if (/^(.*):([.0-9e-]*)$/) { 46 | return $1 if ($sum -= $2) < 0; 47 | } else { 48 | return $_ if ($sum -= 1) < 0; 49 | } 50 | } 51 | } 52 | 53 | sub decolonize { 54 | local $_ = shift; 55 | /^(.*):(.*)$/ ? ($2, $1) : (1, $_); 56 | } 57 | 58 | =pod 59 | 60 | The graph describing any given category is specified as a hash 61 | from nodes to hashes with the following keys. 62 | These graphs are read from DOT files, with their edges expressed in 63 | the usual DOT way and the other items below expressed as node attributes. 64 | 65 | =over 66 | 67 | =item C 68 | 69 | The edges from this node. Weights can be specified by appending ":I" as usual. 70 | Graphs are taken as directed, so undirected edges should be specified from both ends. 71 | 72 | =item C 73 | 74 | The weight of this node for seeding the subgraphs initially. 75 | 76 | =item C 77 | 78 | A measure of how likely this node is to be retained after doing the subgraphing, 79 | ranging from 0 for never to 1 for always -- but not quite a probability. 80 | The node is retained if the product of a C<1 + rand> for the category 81 | and a C for the node exceeds its retention. This is a cheap 82 | and not especially good 83 | way of having all the retentions for one category be correlated. 84 | This is the behaviour we want in the general case: languages tend 85 | to systemically vary in how many contrasts of a given type they show. 86 | 87 | =item C 88 | 89 | Like C, but without the correlation to anything else. 90 | The value is just a probability. 91 | 92 | =item C 93 | 94 | A probability of dropping (initial) seeds that are placed here, default 1. 95 | Set less than 1 for values that don't usually form contrastive units on their own. 96 | 97 | =item C 98 | 99 | A list of names of other categories, controlling the behaviour in products 100 | (i.e. conflationary expressions of several categories). If some nodes 101 | of the graph for category A mention category B in C, 102 | only these nodes will be split apart in the product; but as the default behaviour, 103 | if no nodes of category A mention category B they I will be split. 104 | 105 | This is used e.g. for spatial relations in the case graph: it only makes sense 106 | to split the local cases. 107 | 108 | =item C 109 | 110 | A list of names of other categories. If any of the named categories 111 | appear in a product with this category, this value will not appear at all. 112 | (Misleadingly not just the negative of C.) 113 | 114 | =item C 115 | 116 | A hash from categories to numbers. If the product is taken with a category 117 | appearing as a key, the edges which lie over this node in the factor 118 | have their weight multiplied by the specified value. A value exceeding 1 119 | means that this node is likely to show syncretism, less than 1 that 120 | this node is likelier than usual to make the distinction. 121 | 122 | If the empty string appears among the keys, its value is taken as 123 | the default syncretism for all categories not otherwise specified. 124 | 125 | =item C 126 | 127 | A list of names of values in any categories. If this value exists in 128 | the ultimate generated language, the values listed in its C 129 | will always be dropped. This is meant to be used for cases where an interaction 130 | of categories would produce two differently-labelled values with the same meaning 131 | (e.g. "state_entered" and "goal state_relation"). 132 | 133 | =item C 134 | 135 | The opposite semantics to C: if this value exists, the 136 | values listed in C will never be dropped. 137 | 138 | =back 139 | 140 | =cut 141 | 142 | sub equivalence_classes { 143 | my $head = shift; 144 | my @subgraphs; 145 | my %indices; 146 | # not the best shuffle ever, but it'll do 147 | for my $x (sort {rand >= 0.5} keys %$head) { 148 | my @g; 149 | if (@g = grep {$head->{$_} eq $x} keys %$head) { 150 | $indices{$x} = @subgraphs; 151 | push @subgraphs, \@g; 152 | } 153 | } 154 | \%indices, \@subgraphs; 155 | } 156 | 157 | =pod 158 | 159 | C splits a graph (passed by reference) into a randomly generated number of subgraphs, 160 | at least 2. (Why at least 2? I'm assuming we only call this if we've already decided 161 | our language will mark the contrast represented in the graph passed. Postprocessing 162 | might reduce the number of contrasts to one, though.) 163 | 164 | If passed only one argument, C returns a list of lists; 165 | if passed two, it returns a hash instead (whose values happen to be node names, 166 | though that shouldn't be depended upon). 167 | 168 | =cut 169 | 170 | sub subgraphize { 171 | my $graph = shift; 172 | # number of subgraphs: 1 + exp of a nonnegative number >= 2; upper bound okay 173 | my $assigned = 1 + int exp rand log keys %$graph; 174 | my %head; # head{$x} is the node representing the subgraph $x is in 175 | my $head_only = shift; 176 | 177 | for (1..$assigned) { 178 | my $seed; 179 | do { 180 | $seed = weighted_one_of map {"$_:".($graph->{$_}{seed_weight} || 1)} keys %$graph 181 | } while defined $head{$seed}; 182 | $head{$seed} = $seed; 183 | } 184 | for (keys %head) { 185 | delete $head{$_} if (defined $graph->{$_}{seed_retention} and rand >= $graph->{$_}{seed_retention}); 186 | } 187 | $assigned = keys %head; 188 | 189 | # Choosing from among all available edges with their weights is the most intuitive (and correct) thing. 190 | while ($assigned != keys %$graph) { 191 | my @fringe; 192 | for my $x (keys %$graph) { 193 | next if !defined $head{$x}; 194 | push @fringe, "$x:$_" for (grep {!defined $head{decolonize $_}} @{$graph->{$x}{edges}}); 195 | } 196 | my ($x, $y) = split /:/, weighted_one_of @fringe; 197 | if ($x) { 198 | $head{$y} = $head{$x}; 199 | } else { 200 | # The graph must be disconnected; start a new category in a connected component. 201 | do { 202 | $x = weighted_one_of map {"$_:".($graph->{$_}{seed_weight} || 1)} keys %$graph 203 | } while defined $head{$x}; 204 | $head{$x} = $x; 205 | } 206 | $assigned++; 207 | } 208 | 209 | return %head if $head_only; 210 | equivalence_classes \%head; 211 | } 212 | 213 | =pod 214 | 215 | C(I, I) 216 | recasts the subgraphed graph, provided as a hash output by C, 217 | into a graph, so that the reduced set of categories can be used for products etc. 218 | The second graph is the one that the first is a subgraphing of. 219 | 220 | The new node names are the values of the hash. 221 | 222 | The behaviour correct from the point of view of emulating how products behaved 223 | before this says that 224 | edge weights add, and syncretisms also add, I. 225 | The former says that ragged subdivisions which are bad cuts 226 | have a greater chance of being unified across in subsequent products; 227 | I think that's okay, so we do it. 228 | The latter says that large categories are more likely to syncretise; 229 | I think that's actually undesired. So we take the mean syncretism instead. 230 | 231 | I don't think seed_weight carries over well here; I'll also make it the mean. 232 | 233 | seed_retention I don't model at all. Using the average or something would 234 | punish whatever value a node with low seed_retention gets lumped in with, 235 | and we don't want that. 236 | 237 | no_product_with appears only if everything in this node has it. 238 | product_with appears if something in this node has it, but that's a poor 239 | solution, as it means some categories lacking the product_with might still appear 240 | in two different places; accordingly the expansion has to take care. 241 | 242 | =cut 243 | 244 | sub remake_graph { 245 | my ($graph, $head) = @_; 246 | my %new_graph; 247 | 248 | my %edges; 249 | for my $x (keys %$graph) { 250 | for my $y (@{$graph->{$x}{edges}}) { 251 | my($y_weight, $y_name) = decolonize $y; 252 | next if $head->{$y_name} eq $head->{$x}; 253 | if (defined $edges{$head->{$x}}{$head->{$y_name}}) { 254 | $edges{$head->{$x}}{$head->{$y_name}} += $y_weight; 255 | } else { 256 | $edges{$head->{$x}}{$head->{$y_name}} = $y_weight; 257 | } 258 | } 259 | } 260 | for my $x (keys %edges) { 261 | while ((my $y_name, my $y_weight) = each %{$edges{$x}}) { 262 | push @{$new_graph{$x}{edges}}, "$y_name:$y_weight"; 263 | } 264 | } 265 | 266 | my %total; 267 | $total{$head->{$_}}++ for keys %$graph; 268 | 269 | my %num_no_product_with; 270 | 271 | for my $x (keys %$graph) { 272 | for my $c (keys %{$graph->{$x}{syncretism}}) { 273 | $new_graph{$head->{$x}}{syncretism}{$c} = 1 274 | if !defined $new_graph{$head->{$x}}{syncretism}{$c}; 275 | $new_graph{$head->{$x}}{syncretism}{$c} += ($graph->{$x}{syncretism}{$c} - 1) / $total{$head->{$x}} 276 | if defined $graph->{$x}{syncretism}{$c}; 277 | } 278 | 279 | $new_graph{$head->{$x}}{seed_weight} = 1 280 | if !defined $new_graph{$head->{$x}}{seed_weight}; 281 | $new_graph{$head->{$x}}{seed_weight} += ($graph->{$x}{seed_weight} - 1) / $total{$head->{$x}} 282 | if defined $graph->{$x}{seed_weight}; 283 | 284 | for my $c (@{$graph->{$x}{product_with}}) { 285 | push @{$new_graph{$head->{$x}}{product_with}}, $c 286 | if !grep $_ eq $c, @{$new_graph{$head->{$x}}{product_with}}; 287 | } 288 | 289 | $num_no_product_with{$head->{$x}}{$_}++ for @{$graph->{$x}{no_product_with}}; 290 | } 291 | 292 | for my $k (keys %new_graph) { 293 | @{$new_graph{$k}{no_product_with}} = 294 | grep $num_no_product_with{$k}{$_} == $total{$k}, keys %{$num_no_product_with{$k}}; 295 | } 296 | 297 | \%new_graph; 298 | } 299 | 300 | =pod 301 | 302 | C(I, I, I, I) 303 | constructs graph products. The names are necessary since graphs refer to each other by name. 304 | 305 | =cut 306 | 307 | sub product { 308 | my($cat0name, $cat0, $cat1name, $cat1) = @_; 309 | my %graph; 310 | 311 | # Whether any (well, how many) nodes in each graph say product_with the other. 312 | # This is necessary for unfurling the default behaviour, and ugly. 313 | # Also grr scoping. 314 | # (It's also redundant with annotate_with_product_with below, but if it ain't broke.) 315 | my(%prod01, %prod10); 316 | for my $c (split / /, $cat1name) { 317 | $prod01{$c} = grep {grep $_ eq $c, @{$cat0->{$_}{product_with}};} keys %$cat0; 318 | } 319 | for my $c (split / /, $cat0name) { 320 | $prod10{$c} = grep {grep $_ eq $c, @{$cat1->{$_}{product_with}};} keys %$cat1; 321 | } 322 | sub use0 { 323 | my($x, $cat0, $cat1name) = (shift, shift, shift); 324 | for my $c (split / /, $cat1name) { 325 | return 0 if (defined $cat0->{$x}{no_product_with} and 326 | grep $_ eq $c, @{$cat0->{$x}{no_product_with}}); 327 | } 328 | return 1; 329 | } 330 | sub use1 { 331 | my($y, $cat1, $cat0name) = (shift, shift, shift); 332 | for my $c (split / /, $cat0name) { 333 | return 0 if (defined $cat1->{$y}{no_product_with} and 334 | grep $_ eq $c, @{$cat1->{$y}{no_product_with}}); 335 | } 336 | return 1; 337 | } 338 | sub product_with0 { 339 | my($x, $p, $cat0, $cat1name) = (shift, shift, shift, shift); 340 | for my $c (split / /, $cat1name) { 341 | return 0 if ($p->{$c} and (not defined($cat0->{$x}{product_with}) or 342 | not grep $_ eq $c, @{$cat0->{$x}{product_with}})); 343 | } 344 | return 1; 345 | } 346 | sub product_with1 { 347 | my($y, $p, $cat1, $cat0name) = (shift, shift, shift, shift); 348 | for my $c (split / /, $cat0name) { 349 | return 0 if ($p->{$c} and (not defined($cat1->{$y}{product_with}) or 350 | not grep $_ eq $c, @{$cat1->{$y}{product_with}})); 351 | } 352 | return 1; 353 | } 354 | 355 | # Ick, so much code duplication. 356 | for my $x (keys %$cat0) { 357 | next unless use0($x,$cat0,$cat1name) and product_with0($x,\%prod01,$cat0,$cat1name); 358 | for my $y (keys %$cat1) { 359 | next unless use1($y,$cat1,$cat0name) and product_with1($y,\%prod10,$cat1,$cat0name); 360 | 361 | # Edges are as in the cartesian product, except that if this product isn't 362 | # allowed for some of the nodes we have to catch that. 363 | $graph{"$x $y"}{edges} = []; 364 | for my $xx (@{$cat0->{$x}{edges}}) { 365 | my($xx_weight, $xx_name) = decolonize $xx; 366 | next unless use0($xx_name,$cat0,$cat1name); 367 | $xx_weight *= defined($cat1->{$y}{syncretism}{$_}) ? 368 | $cat1->{$y}{syncretism}{$_} : 369 | (defined($cat1->{$y}{syncretism}{""}) ? 370 | $cat1->{$y}{syncretism}{""} : 1) for split / /, $cat0name; 371 | push @{$graph{"$x $y"}{edges}}, product_with0($xx_name,\%prod01,$cat0,$cat1name) 372 | ? "$xx_name $y:$xx_weight" 373 | : "$xx_name _:$xx_weight"; 374 | } 375 | for my $yy (@{$cat1->{$y}{edges}}) { 376 | my($yy_weight, $yy_name) = decolonize $yy; 377 | next unless use1($yy_name,$cat1,$cat0name); 378 | $yy_weight *= defined($cat0->{$x}{syncretism}{$_}) ? 379 | $cat0->{$x}{syncretism}{$_} : 380 | (defined($cat0->{$x}{syncretism}{""}) ? 381 | $cat0->{$x}{syncretism}{""} : 1) for split / /, $cat1name; 382 | push @{$graph{"$x $y"}{edges}}, product_with1($yy_name,\%prod10,$cat1,$cat0name) 383 | ? "$x $yy_name:$yy_weight" 384 | : "_ $yy_name:$yy_weight"; 385 | } 386 | # Seed_weight multiplies. 387 | $graph{"$x $y"}{seed_weight} = ($cat0->{$x}{seed_weight} || 1) * 388 | ($cat1->{$y}{seed_weight} || 1); 389 | # Seed_retention multiplies too. 390 | $graph{"$x $y"}{seed_retention} = ($cat0->{$x}{seed_retention} || 1) * 391 | ($cat1->{$y}{seed_retention} || 1); 392 | 393 | # For product_with, this treatment is likely to be nonassociative. 394 | # I haven't tried to figure out whether it's sensible. 395 | # In any event, the whole implementation of product_with behaves ungracefully 396 | # in situations where we're multiplying G by H\times I, H and I being unrelated, 397 | # and G has product_by H. 398 | $graph{"$x $y"}{product_with} = 399 | [defined($cat0->{$x}{product_with}) ? @{$cat0->{$x}{product_with}} : (), 400 | defined($cat1->{$y}{product_with}) ? @{$cat1->{$y}{product_with}} : ()]; 401 | # Similarly for no_product_with. 402 | $graph{"$x $y"}{no_product_with} = 403 | [defined($cat0->{$x}{no_product_with}) ? @{$cat0->{$x}{no_product_with}} : (), 404 | defined($cat1->{$y}{no_product_with}) ? @{$cat1->{$y}{no_product_with}} : ()]; 405 | # Syncretism multiplies. 406 | $graph{"$x $y"}{syncretism} = defined($cat0->{$x}{syncretism}) ? 407 | {%{$cat0->{$x}{syncretism}}} : {}; # deep copy 408 | if (defined $cat1->{$y}{syncretism}{""}) { 409 | for (keys %{$graph{"$x $y"}{syncretism}}) { 410 | $graph{"$x $y"}{syncretism}{$_} *= $cat1->{$y}{syncretism}{""} 411 | if !defined($cat1->{$y}{syncretism}{$_}); 412 | } 413 | } 414 | $graph{"$x $y"}{syncretism}{$_} = 415 | defined($graph{"$x $y"}{syncretism}{$_}) ? 416 | $graph{"$x $y"}{syncretism}{$_} * $cat1->{$y}{syncretism}{$_} : 417 | $cat1->{$y}{syncretism}{$_} 418 | for keys %{$cat1->{$y}{syncretism}}; 419 | # No retention or uncorrelated_retention or replaces or necessitates, 420 | # those are done on the factors. 421 | } #for $y 422 | } #for $x 423 | 424 | for my $x (keys %$cat0) { 425 | next unless use0($x,$cat0,$cat1name) and not product_with0($x,\%prod01,$cat0,$cat1name); 426 | 427 | $graph{"$x _"}{edges} = []; 428 | for my $xx (@{$cat0->{$x}{edges}}) { 429 | my($xx_weight, $xx_name) = decolonize $xx; 430 | next unless use0($xx_name,$cat0,$cat1name); 431 | # we lie over too many categories for me to understand what syncretism would do 432 | if (product_with0($xx_name,\%prod01,$cat0,$cat1name)) { 433 | $xx_weight /= keys %$cat1; 434 | for (keys %$cat1) { 435 | push @{$graph{"$x _"}{edges}}, "$xx_name $_:$xx_weight" if use1($_,$cat1,$cat0name); 436 | } 437 | } else { 438 | push @{$graph{"$x _"}{edges}}, "$xx_name _:$xx_weight"; 439 | } 440 | } 441 | $graph{"$x _"}{seed_weight} = $cat0->{$x}{seed_weight} || 1; 442 | $graph{"$x _"}{seed_retention} = $cat0->{$x}{seed_retention} || 1; 443 | $graph{"$x _"}{product_with} = 444 | defined($cat0->{$x}{product_with}) ? [@{$cat0->{$x}{product_with}}] : []; 445 | $graph{"$x _"}{no_product_with} = 446 | defined($cat0->{$x}{no_product_with}) ? [@{$cat0->{$x}{no_product_with}}] : []; 447 | $graph{"$x _"}{syncretism} = $cat0->{$x}{syncretism}; 448 | } 449 | 450 | for my $y (keys %$cat1) { 451 | next unless use1($y,$cat1,$cat0name) and not product_with1($y,\%prod10,$cat1,$cat0name); 452 | 453 | $graph{"_ $y"}{edges} = []; 454 | for my $yy (@{$cat1->{$y}{edges}}) { 455 | my($yy_weight, $yy_name) = decolonize $yy; 456 | next unless use1($yy_name,$cat1,$cat0name); 457 | # nothing to syncretise 458 | if (product_with1($yy_name,\%prod10,$cat1,$cat0name)) { 459 | $yy_weight /= keys %$cat0; 460 | for (keys %$cat0) { 461 | push @{$graph{"_ $y"}{edges}}, "$_ $yy_name:$yy_weight" if use0($_,$cat0,$cat1name); 462 | } 463 | } else { 464 | push @{$graph{"_ $y"}{edges}}, "_ $yy_name:$yy_weight"; 465 | } 466 | } 467 | $graph{"_ $y"}{seed_weight} = $cat1->{$y}{seed_weight} || 1; 468 | $graph{"_ $y"}{seed_retention} = $cat1->{$y}{seed_retention} || 1; 469 | $graph{"_ $y"}{product_with} = 470 | defined($cat1->{$y}{product_with}) ? [@{$cat1->{$y}{product_with}}] : []; 471 | $graph{"_ $y"}{no_product_with} = 472 | defined($cat1->{$y}{no_product_with}) ? [@{$cat1->{$y}{no_product_with}}] : []; 473 | $graph{"_ $y"}{syncretism} = $cat0->{$y}{syncretism}; 474 | } 475 | 476 | return ("$cat0name $cat1name", \%graph); 477 | } 478 | 479 | sub show_graph { 480 | my $graph = shift; 481 | $, = ", "; 482 | $\ = "\n"; 483 | print "--- GRAPH"; 484 | for my $x (keys %$graph) { 485 | print $x; 486 | print @{$graph->{$x}{edges}} if defined $graph->{$x}{edges}; 487 | my $attr; 488 | for $attr qw(seed_weight, retention, uncorrelated_retention) { 489 | if (defined $graph->{$x}{$attr}) { 490 | print $attr, $graph->{$x}{$attr}; 491 | } 492 | } 493 | for $attr qw(product_with no_product_with replaces necessitates) { 494 | if (defined $graph->{$x}{$attr}) { 495 | print $attr, @{$graph->{$x}{$attr}}; 496 | } 497 | } 498 | for $attr qw(syncretism) { 499 | if (defined $graph->{$x}{$attr}) { 500 | print $attr, %{$graph->{$x}{$attr}}; 501 | } 502 | } 503 | print; 504 | } 505 | } 506 | 507 | 508 | sub joined_cartesian_product { 509 | my ($sep, $a, $b) = @_; 510 | my @l; 511 | for my $x (@$a) { 512 | push @l, (map $x . $sep . $_, @$b); 513 | } 514 | @l; 515 | } 516 | 517 | =pod 518 | 519 | C takes a graph and puts a top-level 520 | property C in, naming all the categories for which 521 | it has C in some node. 522 | 523 | =cut 524 | 525 | sub product_annotation { 526 | my $graph = shift; 527 | my %h; 528 | for my $x (keys %$graph) { 529 | $h{$_} = 1 for @{$graph->{$x}{product_with}}; 530 | } 531 | keys %h; 532 | } 533 | 534 | =pod 535 | 536 | If C is given a list of category names, it returns a list 537 | of forms in the product category. E.g. 538 | choose_contrasts qw(tense aspect mode) 539 | would make a tense-aspect-mode system (as long as tense and aspect and mode are 540 | names of category graphs). Each form is a list of the situations it is used for. 541 | 542 | C takes the product of the argument graphs in the order 543 | they are listed. Listing them in a bad order may yield bad results 544 | for reasons of the sloppy implementation of C. 545 | 546 | If given no arguments, C will produce a list of lists of 547 | forms, choosing a random set of categories to mark in each. 548 | 549 | C will screw up if there are values of different categories 550 | that share a name. -- Actually, this may not be true anymore. 551 | But one should certainly avoid having a value of a category named '_'. 552 | 553 | =cut 554 | 555 | sub choose_contrasts { 556 | my %odds = ('' => 1, map(($_ => $categories->{$_}{odds}), keys %$categories)); # mustn't modify the global structure 557 | my %all_cats; # which categories have been used? 558 | my @sg; # list of lists of forms 559 | 560 | while(1) { 561 | my @cats; 562 | if (@_) { 563 | @cats = @_; 564 | } else { 565 | my $x = weighted_one_of map "$_:$odds{$_}", keys %odds; 566 | last unless $x; 567 | # Propagate along conflations for conflated things. 568 | # Order is significant here, because of the sketchy behaviour of product_with. 569 | my (@newcats, @tempcats); 570 | @cats = (); 571 | @newcats = $x; 572 | while (@newcats) { 573 | push @cats, @newcats; 574 | @tempcats = @newcats; 575 | @newcats = (); 576 | for $x (@tempcats) { 577 | push @newcats, (grep {rand() < $categories->{$x}{conflation}{$_}} keys %{$categories->{$x}{conflation}}); 578 | } 579 | } 580 | for (my $i=$#cats; $i>=0; --$i) { 581 | splice @cats, $i, 1 if grep $_ eq $cats[$i], @cats[0..$i-1]; 582 | } 583 | } 584 | 585 | print @cats if $debug; 586 | 587 | $all_cats{$_} = 1 for @cats; 588 | my $cat = $cats[0]; 589 | my $graph = $categories->{$cat}{graph}; 590 | my $cat1; 591 | my (@left_prod_anns, @right_prod_anns); 592 | my (@left_graphs, @left_heads, @left_indices, @left_members, 593 | @right_graphs, @right_heads, @right_indices, @right_members); 594 | for (my $i = 1; $cat1 = $cats[$i]; $i++) { 595 | my $graph1 = $categories->{$cat1}{graph}; 596 | 597 | @{$left_prod_anns[$i-1]} = product_annotation $graph; 598 | $left_graphs[$i-1] = $graph; 599 | %{$left_heads[$i-1]} = subgraphize $graph, 1; 600 | ($left_indices[$i-1], $left_members[$i-1]) = equivalence_classes $left_heads[$i-1]; 601 | $graph = remake_graph $graph, $left_heads[$i-1]; 602 | 603 | @{$right_prod_anns[$i-1]} = product_annotation $graph1; 604 | $right_graphs[$i-1] = $graph1; 605 | %{$right_heads[$i-1]} = subgraphize $graph1, 1; 606 | ($right_indices[$i-1], $right_members[$i-1]) = equivalence_classes $right_heads[$i-1]; 607 | $graph1 = remake_graph $graph1, $right_heads[$i-1]; 608 | 609 | ($cat, $graph) = product $cat, $graph, $cat1, $graph1; 610 | if ($debug) { 611 | print "--- HASHES"; 612 | print %{$left_heads[$i-1]}; 613 | print %{$right_heads[$i-1]}; 614 | print; 615 | show_graph $left_graphs[$i-1]; 616 | } 617 | } 618 | 619 | my @slot = @{subgraphize $graph}; 620 | 621 | if ($debug) { 622 | print @$_ for @slot; 623 | print; 624 | } 625 | 626 | # An older version of this code without $record_pw had the following problem 627 | # in restoring product_with: 628 | # for example, say dual masculine gets retained in the left_graph 629 | # but trial masculine doesn't. Then, after the recursive call, @$lefts 630 | # will contain both, but 'trial masculine' simply isn't a node name 631 | # and so we can't tell at this stage that it has product_with associativity! 632 | # Hence $record_pw, which says which product_withs we shuold track. 633 | sub expand_product_names { 634 | my ($x, $cats, $left_prod_anns, $right_prod_anns, 635 | $left_graphs, $left_indices, $left_members, 636 | $right_graphs, $right_indices, $right_members, 637 | $i, $record_pw) = @_; 638 | 639 | if ($x !~ / /) { # base case 640 | for my $cat (@$cats) { 641 | return {} if (grep $_ eq $cat, @{$left_graphs->[$i+1]{$x}{no_product_with}}); 642 | } 643 | my %prods = ($x => {map { 644 | my $cat = $_; 645 | $cat => ! (grep $_ eq $cat, @{$left_prod_anns->[$i+1]} 646 | and !grep $_ eq $cat, @{$left_graphs->[$i+1]{$x}{product_with}}) 647 | } @$record_pw}); 648 | return %prods; 649 | } 650 | $x =~ /^(.*) ([^ ]*)$/; 651 | my ($my1, $my2) = ($1, $2); 652 | 653 | # For product_with, I'm just doing the direction where 654 | # the left node has them for the right category, since that's the only 655 | # direction for which the model really works well to begin with. 656 | 657 | my %prods; 658 | my $rights; 659 | my %lefts = map expand_product_names($_, $cats, $left_prod_anns, $right_prod_anns, 660 | $left_graphs, $left_indices, $left_members, 661 | $right_graphs, $right_indices, $right_members, 662 | $i-1, [@$record_pw, $cats->[$i+1]]), 663 | @{$left_members->[$i][ $left_indices->[$i]{$my1}]}; 664 | @$rights = @{$right_members->[$i][$right_indices->[$i]{$my2}]}; 665 | for my $cat (@$cats) { 666 | @$rights = grep {my $x = $_; 667 | !grep $_ eq $cat, @{$left_graphs->[$i+1]{$x}{no_product_with}}; 668 | } @$rights; 669 | } 670 | 671 | for my $l (keys %lefts) { 672 | if ($lefts{$l}{$cats->[$i+1]}) { 673 | for my $r (@$rights) { 674 | $prods{"$l $r"} = {%{$lefts{$l}}}; # some deep copy idiom 675 | for my $cat (keys %{$prods{"$l $r"}}) { 676 | $prods{"$l $r"}{$cat} = 0 677 | if (grep $_ eq $cat, @{$right_prod_anns->[$i]} 678 | and !grep $_ eq $cat, @{$right_graphs->[$i]{$r}{product_with}}); 679 | } 680 | } 681 | } else { 682 | $prods{$l} = {%{$lefts{$l}}} 683 | if $right_indices->[$i]{$my2} == 0; # conveniently subsuming the case $my2 eq '_' 684 | } 685 | } 686 | 687 | %prods; 688 | } 689 | 690 | for my $s (@slot) { 691 | @$s = map { 692 | my %h = expand_product_names($_, \@cats, \@left_prod_anns, \@right_prod_anns, 693 | \@left_graphs, \@left_indices, \@left_members, 694 | \@right_graphs, \@right_indices, \@right_members, 695 | $#left_heads, []); 696 | keys %h; 697 | } @$s; 698 | } 699 | 700 | push @sg, [@slot]; 701 | 702 | last if @_; 703 | # adjust odds for repetition: if not many subgraphs came out, we're likelier 704 | # to give the contrast encoded in these subgraphs another chance 705 | for (@cats) { 706 | $odds{$_} = 1/(4*@slot-1) if $odds{$_} > 1/(4*@slot-1); 707 | } 708 | } #while(1) 709 | 710 | if ($debug) { 711 | for my $x (@sg) { 712 | print @$_ for @$x; 713 | print; 714 | } 715 | } 716 | 717 | my %drop; 718 | for my $cat (keys %all_cats) { 719 | my $extent = 1 + rand; # correlation for dropping these contrasts 720 | $drop{$_} = ( 721 | # usual correlated retention 722 | ((defined $categories->{$cat}{graph}{$_}{retention}) and 723 | (rand() > $extent * $categories->{$cat}{graph}{$_}{retention})) or 724 | # uncorrelated retention 725 | ((defined $categories->{$cat}{graph}{$_}{uncorrelated_retention}) and 726 | (rand() > $categories->{$cat}{graph}{$_}{uncorrelated_retention})) 727 | ) for keys %{$categories->{$cat}{graph}}; 728 | } 729 | 730 | for my $cat (keys %all_cats) { 731 | for my $x (keys %{$categories->{$cat}{graph}}) { 732 | unless ($drop{$x}) { 733 | delete $drop{$_} for @{$categories->{$cat}{graph}{$x}{necessitates}}; 734 | $drop{$_} = 1 for @{$categories->{$cat}{graph}{$x}{replaces}}; 735 | } 736 | } 737 | } 738 | 739 | for my $slot (@sg) { 740 | for my $form (@$slot) { 741 | SIT: for (my $i = $#$form; $i >= 0; $i--) { 742 | for (split / /, $form->[$i]) { 743 | splice(@$form, $i, 1), next SIT if $drop{$_}; 744 | } 745 | } 746 | } 747 | @$slot = grep {@$_} @$slot; 748 | } 749 | 750 | @sg = grep {@$_ >= 2} @sg; 751 | 752 | return @sg; 753 | } 754 | 755 | 756 | $, = ", "; 757 | $\ = "\n"; 758 | 759 | my $whole_system = 0; 760 | my @catnames = (); 761 | if (@ARGV == 1 and -d $ARGV[0]) { 762 | $whole_system = 1; 763 | my $dir = $ARGV[0]; 764 | @ARGV = (); 765 | opendir(my $dh, $dir) || die "can't opendir $dir: $!"; 766 | for (readdir($dh)) { 767 | push @ARGV, "$dir/$_" if (/\.dot$/ and -f "$dir/$_"); 768 | } 769 | closedir $dh; 770 | } 771 | for (@ARGV) { 772 | my $cat = ReadDotGraphs::read($_); 773 | $categories->{$cat->{name}} = $cat; 774 | push @catnames, $cat->{name}; 775 | } 776 | 777 | my @sg = $whole_system ? choose_contrasts : choose_contrasts @catnames; 778 | print "--- FINAL SYSTEM" if $debug; 779 | for my $x (@sg) { 780 | print @$_ for @$x; 781 | print; 782 | } 783 | 784 | 785 | 786 | 787 | -------------------------------------------------------------------------------- /morphology_categories/noun/animacy.dot: -------------------------------------------------------------------------------- 1 | digraph animacy { 2 | /* 3 | This is a semantic feature of the noun so I'm proceeding for now with the 4 | assumption it wouldn't be marked by itself in inflection. Maybe I should do otherwise. 5 | */ 6 | graph [odds = 0]; 7 | graph [conflation = "{sex: 0.25}"]; 8 | edge [twoway = 1]; 9 | /* 10 | WALS says inanimates never take associativity productively. 11 | (This doesn't quite achieve that; both ends might be seeds.) 12 | */ 13 | inanimate [syncretism = "{associativity: 40}"]; 14 | inanimate -> animate; 15 | // Often there is no three-way distinction. 16 | animate [seed_retention = 0.5]; 17 | animate [syncretism = "{number: 0.1}"]; 18 | animate [product_with = "[sex]"]; 19 | animate -> human; 20 | human [syncretism = "{number: 0.01}"]; // this doesn't _quite_ behave as a strict universal, eh 21 | human [product_with = "[sex]"]; 22 | // more, incl. 'is a pronoun'? see for inst WALS ch. 34 23 | } 24 | -------------------------------------------------------------------------------- /morphology_categories/noun/arbitrary_noun_class.dot: -------------------------------------------------------------------------------- 1 | digraph arbitrary_noun_class { 2 | /* 3 | This is a semantic feature of the noun so I'm proceeding for now with the 4 | assumption it wouldn't be marked by itself in inflection. Maybe I should do otherwise. 5 | */ 6 | graph [odds = 0]; 7 | edge [twoway = 1]; 8 | /* 9 | I'm just making the graph a path, so that in products clumps are likely to comprise similar classes. 10 | The ordering on the path is as it is so that small numbers are mostly far apart, so that when we retain only small numbers we aren't overlikely to all be the same value. 11 | */ 12 | noun_class2 -> noun_class6; 13 | noun_class6 -> noun_class0; 14 | noun_class0 -> noun_class4; 15 | noun_class4 -> noun_class3; 16 | noun_class3 -> noun_class7; 17 | noun_class7 -> noun_class1; 18 | noun_class1 -> noun_class5; 19 | noun_class2 [necessitates = "[noun_class1]"]; 20 | noun_class2 [uncorrelated_retention = 0.3]; // pretty arbitrary 21 | noun_class3 [necessitates = "[noun_class1, noun_class2]"]; 22 | noun_class3 [uncorrelated_retention = 0.125]; 23 | noun_class4 [necessitates = "[noun_class1, noun_class2, noun_class3]"]; 24 | noun_class4 [uncorrelated_retention = 0.05]; 25 | noun_class5 [necessitates = "[noun_class1, noun_class2, noun_class3, noun_class4]"]; 26 | noun_class5 [uncorrelated_retention = 0.033333]; 27 | noun_class6 [necessitates = "[noun_class1, noun_class2, noun_class3, noun_class4, noun_class5]"]; 28 | noun_class6 [uncorrelated_retention = 0.023809]; 29 | noun_class7 [necessitates = "[noun_class1, noun_class2, noun_class3, noun_class4, noun_class5, noun_class6]"]; 30 | noun_class7 [uncorrelated_retention = 0.017857]; 31 | /* 32 | I've stopped at eight classes just 'cause I have to stop somewhere. 33 | Maybe a generation system like the ones we envisioned for tense would also do here. 34 | */ 35 | } 36 | -------------------------------------------------------------------------------- /morphology_categories/noun/associativity.dot: -------------------------------------------------------------------------------- 1 | digraph associativity { 2 | graph [odds = 0]; // let it only occur with the plural 3 | edge [twoway = 1]; 4 | nonassociative -> associative [weight = 1.5, label = 1.5]; 5 | associative [syncretism = "{animacy: 0.2}"]; // this is to bias in favour of systems where the associative on animates is the more distinct member 6 | } 7 | -------------------------------------------------------------------------------- /morphology_categories/noun/case.dot: -------------------------------------------------------------------------------- 1 | digraph case { 2 | graph [odds = 1.6]; // WALS 3 | graph [conflation = "{spatial_relation: 0.333333, number: 0.05, animacy: 0.05, definiteness: 0.025, arbitrary_noun_class: 0.025}"]; 4 | // Core roles up top. 5 | intransitive_subject [seed_weight = 0.0625]; // anti-tripartite bias: we don't want a correlation between large case systems and tripartiteness 6 | intransitive_subject [uncorrelated_retention = 0.95]; // for active-stative. WALS ch. 98 says .979; I'm generous 7 | intransitive_subject -> agent [weight = .5, label = .5]; 8 | intransitive_subject -> experiencer [weight = .25, label = .25]; 9 | intransitive_subject -> focus [weight = .25, label = .25]; 10 | intransitive_subject -> patient [weight = .25, label = .25]; 11 | intransitive_subject -> form_of_address [weight = .25, label = .25]; 12 | agent [seed_weight = 2]; // the agent -- patient contrast is fundamental 13 | agent -> intransitive_subject [weight = 2, label = 2]; 14 | agent -> force [weight = .25, label = .25]; // they say agent -> force is rare 15 | agent -> experiencer [weight = 1.5, label = 1.5]; 16 | agent -> cause [weight = .5, label = .5]; 17 | experiencer [seed_retention = 0.25]; 18 | experiencer -> intransitive_subject; 19 | experiencer -> agent [weight = 1.5, label = 1.5]; 20 | experiencer -> focus [weight = 1.5, label = 1.5]; 21 | experiencer -> recipient; 22 | focus [seed_retention = 0.25]; 23 | focus -> intransitive_subject; 24 | focus -> experiencer [weight = 1.5, label = 1.5]; 25 | focus -> patient [weight = 1.5, label = 1.5]; 26 | focus -> goal [weight = .5, label = .5]; 27 | focus -> recipient [weight = .5, label = .5]; 28 | patient [seed_weight = 2]; 29 | patient -> intransitive_subject; 30 | patient -> focus [weight = 1.5, label = 1.5]; 31 | patient -> goal [weight = .5, label = .5]; 32 | patient -> recipient [weight = .5, label = .5]; 33 | force [seed_weight = 0.025]; 34 | force [retention = 0.5]; 35 | force -> agent [weight = .25, label = .25]; 36 | force -> instrument; 37 | force -> cause; 38 | source [retention = 0.4]; 39 | source [product_with = "[spatial_relation]"]; 40 | source -> agent [weight = .5, label = .5]; 41 | source -> instrument; 42 | source -> possesser; 43 | source -> possession_source; 44 | source -> thing_avoided; 45 | time [retention = 0.1]; 46 | time -> location; 47 | purpose [retention = 0.1]; 48 | purpose -> goal; 49 | purpose -> cause; 50 | purpose -> beneficiary; 51 | purpose -> maleficiary; 52 | thing_per_which [retention = 0.05]; // distributive. what am I supposed to call the role? 53 | thing_per_which -> location; 54 | thing_per_which -> beneficiary; 55 | state [retention = 0.25]; // essive; but this might be better handled among the local cases 56 | state -> location; 57 | state -> similar_thing; 58 | state -> state_entered; 59 | /* 60 | form_of_address is for the vocative. 61 | If it's a case it's probably fairly extrasystematic. Nonetheless we connect it in the graph 62 | so that it can be special in one number but not another, etc. */ 63 | form_of_address [retention = 0.15]; 64 | form_of_address -> intransitive_subject [weight = .25, label = .25]; 65 | beneficiary [retention = 0.25]; 66 | beneficiary -> recipient; 67 | beneficiary -> goal; 68 | beneficiary -> possesser; 69 | beneficiary -> purpose; 70 | beneficiary -> maleficiary; 71 | beneficiary -> cause; 72 | beneficiary -> thing_per_which; 73 | cause [retention = 0.1]; // these three are probably little used on nouns 74 | cause -> agent [weight = .5, label = .5]; 75 | cause -> force; 76 | cause -> beneficiary; 77 | cause -> purpose; 78 | manner [retention = 0.1]; 79 | manner -> similar_thing; 80 | manner -> instrument; 81 | manner -> path; 82 | possession_source [retention = 0.1]; // "I borrowed the book _from Cecil_" 83 | possession_source -> source; 84 | possession_source -> possesser; 85 | possession_source -> thing_lacked; 86 | possesser [retention = 0.666667]; 87 | possesser -> source; 88 | possesser -> possession_source; 89 | possesser -> location; 90 | possesser -> recipient; 91 | possesser -> beneficiary; 92 | possesser -> maleficiary; 93 | possesser -> accompanier; 94 | thing_lacked [retention = 0.1]; // I have no good idea for an edge, except for from local cases 95 | thing_lacked -> possession_source; 96 | similar_thing [retention = 0.1]; 97 | similar_thing -> manner; 98 | similar_thing -> state; 99 | recipient [retention = 0.666667]; 100 | recipient -> experiencer; 101 | recipient -> goal; 102 | recipient -> beneficiary; 103 | recipient -> maleficiary; 104 | recipient -> possesser; 105 | recipient -> focus [weight = .5, label = .5]; 106 | recipient -> patient [weight = .5, label = .5]; 107 | location [retention = 0.5]; 108 | location [product_with = "[spatial_relation]"]; 109 | location -> goal; 110 | location -> instrument; 111 | location -> possesser; 112 | location -> path; 113 | location -> time; 114 | location -> state; 115 | location -> accompanier; 116 | location -> thing_per_which; 117 | instrument [retention = 0.5]; 118 | instrument -> force; 119 | instrument -> location; 120 | instrument -> source; 121 | instrument -> accompanier; 122 | instrument -> manner; 123 | path [retention = 0.2]; 124 | path [product_with = "[spatial_relation]"]; 125 | path -> location; 126 | path -> beneficiary; 127 | path -> manner; 128 | thing_avoided [retention = 0.05]; 129 | thing_avoided -> source; 130 | goal [retention = 0.4]; 131 | goal [product_with = "[spatial_relation]"]; 132 | goal -> recipient; 133 | goal -> beneficiary; 134 | goal -> maleficiary; 135 | goal -> focus; 136 | goal -> patient; 137 | goal -> location; 138 | goal -> purpose; 139 | goal -> state_entered; 140 | state_entered [retention = 0.1]; 141 | state_entered -> goal; 142 | state_entered -> state; 143 | accompanier [retention = 0.333333]; // what about 'with' -- 'and'? 144 | accompanier -> instrument; 145 | accompanier -> possesser; 146 | accompanier -> location; 147 | maleficiary [retention = 0.05]; 148 | maleficiary -> recipient; 149 | maleficiary -> goal; 150 | maleficiary -> possesser; 151 | maleficiary -> purpose; 152 | maleficiary -> beneficiary; 153 | } 154 | 155 | -------------------------------------------------------------------------------- /morphology_categories/noun/definiteness.dot: -------------------------------------------------------------------------------- 1 | digraph definiteness { 2 | graph [odds = 0.2]; // WALS 3 | // conflation with deictic_distance smallish, because deictic_distance by itself gives much the same effect. 4 | graph [conflation = "{deictic_distance: 0.075}"]; 5 | edge [twoway = 1]; 6 | indefinite -> specific [weight = .666667, label = .666667]; 7 | specific [seed_retention = 0.075]; 8 | specific [product_with = "[deictic_distance]"]; 9 | specific [syncretism = "{deictic_distance: 2}"]; 10 | specific -> definite [weight = .666667, label = .666667]; 11 | definite [product_with = "[deictic_distance]"]; 12 | definite [syncretism = "{deictic_distance: 2}"]; 13 | } 14 | -------------------------------------------------------------------------------- /morphology_categories/noun/deictic_distance.dot: -------------------------------------------------------------------------------- 1 | digraph deictic_distance { 2 | graph [odds = 0.05]; 3 | edge [twoway = 1]; 4 | no_deictic_distance [no_product_with = "[definiteness]"]; 5 | proximal -> medial [weight = .5, label = .5]; 6 | medial -> distal; 7 | medial -> near_listener; 8 | } 9 | -------------------------------------------------------------------------------- /morphology_categories/noun/number.dot: -------------------------------------------------------------------------------- 1 | digraph number { 2 | graph [odds = 7]; // WALS ch. 34 says 9, including systems where using the plural marking is optional. Lessened because I assume that in some of those cases the system is synthetic rather than morphological. 3 | /* 4 | animacy is WALS, under a generous interpretation of the optionals 5 | associativity is WALS too, the bound ones 6 | */ 7 | graph [conflation = "{animacy: 0.25, associativity: 0.25, definiteness: 0.05, arbitrary_noun_class: 0.05}"]; 8 | edge [twoway = 1]; 9 | singular -> dual [weight = .333333, label = .333333]; 10 | 11 | // These are for all non-singular numbers. 12 | node [syncretism = "{case: 2}"]; 13 | node [product_with = "[associativity]"]; 14 | dual [syncretism = "{case: 2}"]; 15 | dual [product_with = "[associativity]"]; 16 | /* 17 | The dual is perhaps too frequent. 18 | DM says 'rare' on nominals, less rare on pronouns. 19 | Of course sometimes it won't be distinctive in our implementation, even if it's kept. 20 | */ 21 | dual [retention = 0.05]; 22 | dual -> trial; 23 | trial [retention = 0.005]; 24 | trial [necessitates = "[dual]"]; 25 | trial -> quadrual; 26 | quadrual [retention = 0.0005]; 27 | quadrual [necessitates = "[dual, trial]"]; 28 | quadrual -> paucal; 29 | paucal [retention = 0.025]; 30 | paucal -> plural; 31 | plural -> large_plural; 32 | large_plural [retention = 0.015]; 33 | // what else? distributivity is a separate contrast. 34 | } 35 | 36 | -------------------------------------------------------------------------------- /morphology_categories/noun/sex.dot: -------------------------------------------------------------------------------- 1 | digraph sex { 2 | graph [odds = 0]; 3 | edge [twoway = 1]; 4 | masculine -> feminine; 5 | } 6 | -------------------------------------------------------------------------------- /morphology_categories/noun/spatial_relation.dot: -------------------------------------------------------------------------------- 1 | digraph spatial_relation { 2 | graph [odds = 0.1]; // TODO: this should be separable from case but quite infrequently occur without case. do we support that? 3 | under [retention = 0.05]; 4 | under -> at; 5 | under -> in_contact; 6 | nearby [retention = 0.15]; 7 | nearby -> at; 8 | on_top [retention = 0.2]; 9 | on_top -> at; 10 | on_top -> above; 11 | on_top -> in_contact; 12 | in_contact [retention = 0.1]; 13 | in_contact -> at; 14 | in_contact -> inside; 15 | in_contact -> on_top; 16 | in_contact -> under; 17 | possession_relation [retention = 0.025]; 18 | possession_relation [replaces = "[possesser, recipient, possession_source]"]; 19 | possession_relation -> at; 20 | among [retention = 0.05]; 21 | among -> at; 22 | among -> inside; 23 | no_spatial_relation [no_product_with = "[case]"]; 24 | state_relation [retention = 0.1]; 25 | state_relation [replaces = "[state, state_entered]"]; 26 | state_relation -> at; 27 | above [retention = 0.05]; 28 | above -> at; 29 | above -> on_top; 30 | time_relation [retention = 0.05]; 31 | time_relation [replaces = "[time]"]; 32 | time_relation -> at; 33 | at [necessitates = "[location]"]; 34 | at -> in_contact; 35 | at -> above; 36 | at -> on_top; 37 | at -> under; 38 | at -> among; 39 | at -> nearby; 40 | at -> state_relation; 41 | at -> time_relation; 42 | at -> possession_relation; 43 | inside [retention = 0.666667]; 44 | inside -> in_contact; 45 | inside -> among; 46 | } 47 | 48 | -------------------------------------------------------------------------------- /phonology/CXS.yml: -------------------------------------------------------------------------------- 1 | --- 2 | null: '0' 3 | 4 | characters: 5 | "+syllabic +resonant dorsal +high +front -round +ATR": 'i' 6 | "+syllabic +resonant dorsal +high +front +round +ATR": 'y' 7 | "+syllabic +resonant dorsal +high -front -back -round +ATR": '1' 8 | "+syllabic +resonant dorsal +high -front -back +round +ATR": 'u\' 9 | "+syllabic +resonant dorsal +high +back -round +ATR": 'M' 10 | "+syllabic +resonant dorsal +high +back +round +ATR": 'u' 11 | "+resonant dorsal +high +front -round -ATR": 'I' 12 | "+resonant dorsal +high +front +round -ATR": 'Y' 13 | "+resonant dorsal +high -front -back -round -ATR": 'I\' 14 | "+resonant dorsal +high -front -back +round -ATR": 'U\' 15 | "+resonant dorsal +high +back -round -ATR": 'U_c' 16 | "+resonant dorsal +high +back +round -ATR": 'U' 17 | "+resonant dorsal -high -low +front -round +ATR -tap_or_trill": 'e' 18 | "+resonant dorsal -high -low +front +round +ATR -tap_or_trill": '2' 19 | "+resonant dorsal -high -low -front -back -round +ATR -tap_or_trill": '@\' 20 | "+resonant dorsal -high -low -front -back +round +ATR -tap_or_trill": '8' 21 | "+resonant dorsal -high -low +back -round +ATR -tap_or_trill": '7' 22 | "+resonant dorsal -high -low +back +round +ATR -tap_or_trill": 'o' 23 | "+resonant dorsal -high -low +front -round -ATR -tap_or_trill": 'E' 24 | "+resonant dorsal -high -low +front +round -ATR -tap_or_trill": '9' 25 | "+resonant dorsal -high -low -front -back -round -ATR -tap_or_trill": '3' 26 | "+resonant dorsal -high -low -front -back +round -ATR -tap_or_trill": '3\' 27 | "+resonant dorsal -high -low +back -round -ATR -tap_or_trill": 'V' 28 | "+resonant dorsal -high -low +back +round -ATR -tap_or_trill": 'O' 29 | # We use /&/ for a front and /a/ a central vowel. 30 | # I'm accordingly not sure where to put /&\/, so I do without. 31 | "+resonant dorsal +low +front -round -ATR": '&' 32 | "+resonant dorsal +low +front +round -ATR": '&_O' 33 | "+resonant dorsal +low -front -back -round -ATR": 'a' 34 | "+resonant dorsal +low -front -back +round -ATR": 'a_O' 35 | "+resonant dorsal +low +back -round -ATR": 'A' 36 | "+resonant dorsal +low +back +round -ATR": 'Q' 37 | 38 | "-syllabic +resonant dorsal -lateral -tap_or_trill +high +front -round +ATR": 'j' 39 | "-syllabic +resonant dorsal -lateral -tap_or_trill +high +front +round +ATR": 'H' 40 | "-syllabic +resonant dorsal -lateral -tap_or_trill +high -front -round +ATR": 'M\' 41 | "-syllabic +resonant dorsal -lateral -tap_or_trill +high -front +round +ATR": 'w' 42 | "+resonant dorsal +lateral -tap_or_trill +high +front": 'L' 43 | "+resonant dorsal +lateral -tap_or_trill +high -front": 'L\' 44 | # taps can be notated as extra-short trills I guess 45 | "+resonant dorsal -lateral +tap_or_trill -trill -high": 'R\_X' 46 | "+resonant dorsal -lateral +tap_or_trill +trill -high": 'R\' 47 | "+resonant coronal +lateral -tap_or_trill -back": 'l' 48 | "+resonant coronal +lateral -tap_or_trill +back": '5' 49 | "+resonant coronal +lateral +tap_or_trill -trill": 'l\' 50 | "+resonant coronal -lateral -tap_or_trill": 'r\' 51 | # We don't respect the /r`/ anomaly; this generates /4`/. 52 | "+resonant coronal -lateral +tap_or_trill -trill": '4' 53 | "+resonant coronal -lateral +tap_or_trill +trill": 'r' 54 | "+resonant labial -labiodental -lateral -tap_or_trill": 'B_o' 55 | "+resonant labial +labiodental -lateral -tap_or_trill": 'v\' 56 | "+resonant labial -lateral +tap_or_trill -trill": 'B\_X' 57 | "+resonant labial -lateral +tap_or_trill +trill": 'B\' 58 | "-resonant +nasal -fricative -affricate dorsal -high": 'N\' 59 | "-resonant +nasal -fricative -affricate dorsal +high -front": 'N' 60 | "-resonant +nasal -fricative -affricate dorsal +high +front +palatalised_velar": 'N_j' 61 | "-resonant +nasal -fricative -affricate dorsal +high +front -palatalised_velar": 'J' 62 | "-resonant +nasal -fricative -affricate coronal": 'n' 63 | "-resonant +nasal -fricative -affricate labial -labiodental": 'm' 64 | "-resonant +nasal -fricative -affricate labial +labiodental": 'F' 65 | "-resonant -nasal -fricative -affricate -voice dorsal -high": 'q' 66 | "-resonant -nasal -fricative -affricate +voice dorsal -high": 'G\' 67 | "-resonant -nasal -fricative -affricate -voice dorsal +high -front": 'k' 68 | "-resonant -nasal -fricative -affricate +voice dorsal +high -front": 'g' 69 | "-resonant -nasal -fricative -affricate -voice dorsal +high +front +palatalised_velar": 'k_j' 70 | "-resonant -nasal -fricative -affricate +voice dorsal +high +front +palatalised_velar": 'g_j' 71 | "-resonant -nasal -fricative -affricate -voice dorsal +high +front -palatalised_velar": 'c' 72 | "-resonant -nasal -fricative -affricate +voice dorsal +high +front -palatalised_velar": 'J\' 73 | "-resonant -nasal -fricative -affricate -voice coronal": 't' 74 | "-resonant -nasal -fricative -affricate +voice coronal": 'd' 75 | "-resonant -nasal -fricative -affricate -voice labial": 'p' 76 | "-resonant -nasal -fricative -affricate +voice labial": 'b' 77 | "-resonant +fricative -voice dorsal -high": 'X' 78 | "-resonant +fricative +voice dorsal -high": 'R' 79 | "-resonant +fricative -voice dorsal +high -front": 'x' 80 | "-resonant +fricative +voice dorsal +high -front": 'G' 81 | "-resonant +fricative -voice dorsal +high +front +palatalised_velar": 'x_j' 82 | "-resonant +fricative +voice dorsal +high +front +palatalised_velar": 'G_j' 83 | "-resonant +fricative -voice dorsal +high +front -palatalised_velar": 'C' 84 | "-resonant +fricative +voice dorsal +high +front -palatalised_velar": 'j\' 85 | "-resonant +fricative -voice coronal -lateral -anterior -retroflex +front +laminal +sibilant": 's\' 86 | "-resonant +fricative +voice coronal -lateral -anterior -retroflex +front +laminal +sibilant": 'z\' 87 | "-resonant +fricative -voice coronal -lateral -anterior -retroflex -front +laminal +sibilant": 'S' 88 | "-resonant +fricative +voice coronal -lateral -anterior -retroflex -front +laminal +sibilant": 'Z' 89 | "-resonant +fricative -voice coronal -lateral -anterior -retroflex +front -laminal +sibilant": 's\_a' 90 | "-resonant +fricative +voice coronal -lateral -anterior -retroflex +front -laminal +sibilant": 'z\_a' 91 | "-resonant +fricative -voice coronal -lateral -anterior -retroflex -front -laminal +sibilant": 'S_a' 92 | "-resonant +fricative +voice coronal -lateral -anterior -retroflex -front -laminal +sibilant": 'Z_a' 93 | "-resonant +fricative -voice coronal -lateral -anterior +retroflex +sibilant": 's`' 94 | "-resonant +fricative +voice coronal -lateral -anterior +retroflex +sibilant": 'z`' 95 | "-resonant +fricative -voice coronal -lateral +anterior +sibilant": 's' 96 | "-resonant +fricative +voice coronal -lateral +anterior +sibilant": 'z' 97 | "-resonant +fricative -voice coronal -lateral -sibilant +laminal": 'T' 98 | "-resonant +fricative +voice coronal -lateral -sibilant +laminal": 'D' 99 | "-resonant +fricative -voice coronal -lateral -sibilant -laminal": 'T_a' # less distracting than the Z-SAMPA 100 | "-resonant +fricative +voice coronal -lateral -sibilant -laminal": 'D_a' 101 | "-resonant +fricative -voice coronal +lateral": 'K' 102 | "-resonant +fricative +voice coronal +lateral": 'K\' 103 | "-resonant +fricative -voice labial -labiodental": 'p\' 104 | "-resonant +fricative +voice labial -labiodental": 'B' 105 | "-resonant +fricative -voice labial +labiodental": 'f' 106 | "-resonant +fricative +voice labial +labiodental": 'v' 107 | "-dorsal -coronal -labial -voice +constricted_glottis -pharyngealised": '?' 108 | "-dorsal -coronal -labial -voice +constricted_glottis +pharyngealised": '?_?\' 109 | "-dorsal -coronal -labial -voice +spread_glottis": 'h' 110 | "-dorsal -coronal -labial +voice +spread_glottis": 'h\' 111 | "-dorsal -coronal -labial +pharyngealised -constricted_glottis -voice": 'X\' # what if there's [+sg] here? 112 | "-dorsal -coronal -labial +pharyngealised +voice": '?\' 113 | 114 | ligations: 115 | # the "-lateral"s here are cheats, especially the last 116 | "labial coronal": ["-labial", "-coronal -lateral", "[][])"] 117 | "labial dorsal": ["-labial", "-dorsal -lateral", "[][])"] 118 | "coronal dorsal": ["-coronal -lateral", "-dorsal", "[][])"] 119 | "+long +affricate": ["-affricate", "+fricative -affricate -prenasalised", "[]:[])"] # David's hack 120 | "+affricate": ["-affricate", "+fricative -affricate -prenasalised", "[][])"] 121 | "+prenasalised": ["+nasal -affricate -fricative -prenasalised", "-prenasalised", "[][])"] 122 | 123 | modifiers: 124 | # oddly, the string '=' gets stabbed 125 | "+syllabic -dorsal": '[]=' 126 | "+syllabic +lateral": '[]=' 127 | "+syllabic +tap_or_trill": '[]=' 128 | "+syllabic -resonant": '[]=' 129 | "-syllabic +resonant dorsal -lateral -tap_or_trill": '[]_^' 130 | "+spread_glottis -voice": '[]_h' 131 | "+spread_glottis +voice": '[]_t' 132 | "+constricted_glottis -voice": '[]_>' 133 | "+constricted_glottis +voice": '[]_k' 134 | "+implosive": '[]_<' 135 | # various releases have symbols 136 | "-voice": '[]_0' 137 | #"+voice": '[]_v' # let's just not specify any voiceless unpaired Cs 138 | "+labiodental": '[]_d' # Z-SAMPA 139 | #"-laminal -anterior -retroflex": '[]_a' 140 | "+laminal +anterior": '[]_m' 141 | "+laminal +retroflex": '[]_m' 142 | "+retroflex": '[]`' 143 | "-anterior -retroflex +laminal": '[]_-' # a common use of IPA underline 144 | "-anterior -retroflex -laminal": '[]_-_a' 145 | "+lateral": '[]_l' # that's meant for release, enh 146 | "+round": '[]_w' 147 | "+front": '[]_j' 148 | "-back dorsal -front": '[]_+' 149 | "-ATR": '[]_q' 150 | "+back -dorsal": '[]_G' 151 | "+pharyngealised": '[]_?\' 152 | "+nasal": '[]~' 153 | "+long": '[]:' 154 | -------------------------------------------------------------------------------- /phonology/FeatureSystem.pm: -------------------------------------------------------------------------------- 1 | package FeatureSystem; 2 | use strict; 3 | use Carp;#gd? 4 | 5 | # There is no package for phones, since they are just strings. The FeatureSystem object 6 | # is the one that knows how to handle its phones. 7 | 8 | # Phones are specified as strings, with one character for each feature in the order 9 | # in which they occur in the feature definition file. The principal values of characters: 10 | # '.' is unspecified, 'u' undefined, '0' off, '1' on. 11 | # Only [u01] appear in actual phones; '.' appears in classes (where it matches everything) 12 | # and sound change outcomes and the like. 13 | 14 | # There are various other characters used. 15 | # In one place the syllable structure uses 'U', which gets converted to 'u' only after the generator runs. 16 | # Effects of sound changes can have '<' and '>', which do progressive and regressive assimilation. 17 | 18 | # Univalent features are treated by the code bivalently, just like all the others. 19 | # Their univalence manifests in other ways: 20 | # - their complement cannot be selected as the conditioning environment of a rule (not in yet); 21 | # - they can't be inserted as part of a repair rule. 22 | 23 | # As essentially a hack to achieve reasonable behaviour of certain 3-place continua, 24 | # a feature can be marked as antithetical to another. Then, whenever the former is 25 | # set +, the latter is automatically set -. 26 | # TODO: this likely needs generalisation for e.g. tone. 27 | 28 | 29 | # Uses dots for unspecified values, unless $args{undefined} is true when it uses 'u'. 30 | sub parse { 31 | my ($self, $fs, %args) = (shift, shift, @_); 32 | my $re = !defined $args{undefined}; 33 | my $phone = ($re ? '.' : 'u') x @{$self->{features}}; 34 | return $phone if !defined $fs; 35 | my @a = split / /, $fs; 36 | for my $f (@a) { 37 | if ($f =~ /^([^\w\s])(.*)/) { 38 | my ($a, $b) = ($1, $2); 39 | $a =~ tr/-+?/01u/; 40 | substr($phone, $self->{feature_index}{$b}, 1) = $a; # $1 eq '+' ? '1' : $1 eq '-' ? '0' : $1; 41 | } else { 42 | substr($phone, $self->{feature_index}{$f}, 1) = '1'; 43 | } 44 | } 45 | $phone; 46 | } 47 | 48 | # Called with two args, displays undef things. 49 | sub feature_string { 50 | my $self = shift; 51 | my $phone = shift; 52 | my $fs = ''; 53 | my $c; 54 | for my $i (0..(length $phone)-1) { 55 | $fs .= ($fs ? ' ' : '') . 56 | ($c eq '1' ? (defined $self->{features}[$i]{univalent} ? '' : '+') : 57 | ($c eq 'u' ? '?' : ($c eq '0' ? '-' : $c))) . 58 | $self->{features}[$i]{name} 59 | unless ($c = substr($phone, $i, 1)) eq '.' or ($c eq 'u' and !@_); 60 | } 61 | $fs; 62 | } 63 | 64 | sub load_file { 65 | my $filename = shift; 66 | my $FS = YAML::Any::LoadFile($filename); 67 | bless $FS; 68 | 69 | $FS->{feature_index}{$FS->{features}[$_]{name}} = $_ for (0..@{$FS->{features}}-1); 70 | # {features_requiring}[v][f] is the list of features which are only defined if feature f takes value v 71 | for my $i (0..@{$FS->{features}}-1) { 72 | if (defined $FS->{features}[$i]{requires}) { 73 | my $s = $FS->parse($FS->{features}[$i]{requires}, undefined => 1); 74 | for (0..length($s)-1) { 75 | push @{$FS->{features_requiring}[substr($s, $_, 1)][$_]}, $i if (substr($s, $_, 1) =~ /[01]/); 76 | } 77 | } 78 | } 79 | for my $str (@{$FS->{strippings}}) { 80 | $str->{condition_parsed} = $FS->parse($str->{condition}, 1); 81 | } 82 | my @otherway_relations; 83 | for my $rel (@{$FS->{relations}}) { 84 | if (defined $rel->{twoway}) { 85 | my %flipped = %$rel; 86 | $flipped{from} = $rel->{to}; 87 | $flipped{to} = $rel->{from}; 88 | push @otherway_relations, \%flipped; 89 | } 90 | } 91 | push @{$FS->{relations}}, @otherway_relations; 92 | 93 | return $FS; 94 | } 95 | 96 | # Phones ought to be objects themselves (then they could carry around their excepts and tiers &c), 97 | # but as they're just strings right now this will be a large rewrite. 98 | # So for now operations on phones, like the four below, will live in the feature system. 99 | 100 | # Takes a phone and a phone with dots. Replaces features in the first with non-dots in the second. 101 | sub overwrite { 102 | my ($self, $a, $b) = @_; 103 | carp "length mismatch, '$a' vs. '$b'" if length($a) != length($b); #gd 104 | for my $i (0..(length $b)-1) { 105 | substr($a, $i, 1) = substr($b, $i, 1) if substr($b, $i, 1) ne '.'; 106 | } 107 | $a; 108 | } 109 | 110 | sub compatible { 111 | my ($self, $a, $b) = @_; 112 | for my $i (0..(length $b)-1) { 113 | return undef unless substr($a, $i, 1) eq '.' or 114 | substr($b, $i, 1) eq '.' or 115 | substr($a, $i, 1) eq substr($b, $i, 1); 116 | } 117 | return 1; 118 | } 119 | 120 | # Takes the setwise intersection of two phones, returning the 'x' thing if it's empty. 121 | sub intersect { 122 | my ($self, $a, $b) = @_; 123 | if ($self->compatible($a, $b)) { 124 | return $self->overwrite($a, $b); 125 | } else { 126 | return 'x' x @{$self->{features}}; 127 | } 128 | } 129 | 130 | # Returns the features that $a has but $b doesn't. 131 | sub subtract_features { 132 | my ($self, $a, $b) = @_; 133 | for my $i (0..(length $b)-1) { 134 | substr($a, $i, 1) = '.' if substr($a, $i, 1) eq substr($b, $i, 1); 135 | } 136 | $a; 137 | } 138 | 139 | sub add_requirements { 140 | my ($self, $reqd) = @_; 141 | for my $i (0..length($reqd)-1) { 142 | $reqd = $self->overwrite($reqd, $self->parse($self->{features}[$i]{requires})) 143 | if substr($reqd, $i, 1) =~ /[01]/ and defined $self->{features}[$i]{requires}; 144 | } 145 | $reqd; 146 | } 147 | 148 | sub add_entailments { 149 | my ($self, $phone) = @_; 150 | for my $i (0..length($phone)-1) { 151 | substr($phone, $self->{feature_index}{$self->{features}[$i]{antithetical}}, 1) = '0' 152 | if substr($phone, $i, 1) eq '1' and defined $self->{features}[$i]{antithetical}; 153 | } 154 | for my $i (0..length($phone)-1) { 155 | if (substr($phone, $i, 1) =~ /[01]/) { 156 | substr($phone, $_, 1) = 'u' for (@{$self->{features_requiring}[1 - substr($phone, $i, 1)][$i]}); 157 | } 158 | } 159 | $phone; 160 | } 161 | 162 | # Record the changes between from $phone0 to $phone1, as described below. 163 | sub change_record { 164 | my ($phone0, $phone1) = (shift, shift); 165 | my @changes; 166 | for my $i (0..length($phone0)-1) { 167 | push @changes, "c " . substr($phone1, $i, 1) . " $i" if substr($phone0, $i, 1) ne substr($phone1, $i, 1); 168 | } 169 | @changes; 170 | } 171 | 172 | 173 | 1; 174 | -------------------------------------------------------------------------------- /phonology/IPA_HTML.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # There is one little bit of _actual_ HTML in here: I've used for prenasalisation. 3 | null: '∅' 4 | 5 | characters: 6 | "+syllabic +resonant dorsal +high +front -round +ATR": 'i' 7 | "+syllabic +resonant dorsal +high +front +round +ATR": 'y' 8 | "+syllabic +resonant dorsal +high -front -back -round +ATR": 'ɨ' 9 | "+syllabic +resonant dorsal +high -front -back +round +ATR": 'ʉ' 10 | "+syllabic +resonant dorsal +high +back -round +ATR": 'ɯ' 11 | "+syllabic +resonant dorsal +high +back +round +ATR": 'u' 12 | "+resonant dorsal +high +front -round -ATR": 'ɪ' 13 | "+resonant dorsal +high +front +round -ATR": 'ʏ' 14 | "+resonant dorsal +high -front -back -round -ATR": 'ɪ̈' 15 | "+resonant dorsal +high -front -back +round -ATR": 'ʊ̈' 16 | "+resonant dorsal +high +back -round -ATR": 'ʊ̜' 17 | "+resonant dorsal +high +back +round -ATR": 'ʊ' 18 | "+resonant dorsal -high -low +front -round +ATR -tap_or_trill": 'e' 19 | "+resonant dorsal -high -low +front +round +ATR -tap_or_trill": 'ø' 20 | "+resonant dorsal -high -low -front -back -round +ATR -tap_or_trill": 'ɘ' 21 | "+resonant dorsal -high -low -front -back +round +ATR -tap_or_trill": 'ɵ' 22 | "+resonant dorsal -high -low +back -round +ATR -tap_or_trill": 'ɤ' 23 | "+resonant dorsal -high -low +back +round +ATR -tap_or_trill": 'o' 24 | "+resonant dorsal -high -low +front -round -ATR -tap_or_trill": 'ɛ' 25 | "+resonant dorsal -high -low +front +round -ATR -tap_or_trill": 'œ' 26 | "+resonant dorsal -high -low -front -back -round -ATR -tap_or_trill": 'ɜ' 27 | "+resonant dorsal -high -low -front -back +round -ATR -tap_or_trill": 'ɞ' 28 | "+resonant dorsal -high -low +back -round -ATR -tap_or_trill": 'ʌ' 29 | "+resonant dorsal -high -low +back +round -ATR -tap_or_trill": 'ɔ' 30 | # We use /&/ for a front and /a/ a central vowel. 31 | # I'm accordingly not sure where to put /&\/, so I do without. 32 | "+resonant dorsal +low +front -round -ATR": 'æ' 33 | "+resonant dorsal +low +front +round -ATR": 'æ̹' 34 | "+resonant dorsal +low -front -back -round -ATR": 'a' 35 | "+resonant dorsal +low -front -back +round -ATR": 'a̹' 36 | "+resonant dorsal +low +back -round -ATR": 'ɑ' 37 | "+resonant dorsal +low +back +round -ATR": 'ɒ' 38 | 39 | "-syllabic +resonant dorsal -lateral -tap_or_trill +high +front -round +ATR": 'j' 40 | "-syllabic +resonant dorsal -lateral -tap_or_trill +high +front +round +ATR": 'ɥ' 41 | "-syllabic +resonant dorsal -lateral -tap_or_trill +high -front -round +ATR": 'ɰ' 42 | "-syllabic +resonant dorsal -lateral -tap_or_trill +high -front +round +ATR": 'w' 43 | "+resonant dorsal +lateral -tap_or_trill +high +front": 'ʎ' 44 | "+resonant dorsal +lateral -tap_or_trill +high -front": 'ʟ' 45 | # taps can be notated as extra-short trills I guess 46 | "+resonant dorsal -lateral +tap_or_trill -trill -high": 'ʀ̆' 47 | "+resonant dorsal -lateral +tap_or_trill +trill -high": 'ʀ' 48 | "+resonant coronal +lateral -tap_or_trill -back -retroflex": 'l' 49 | "+resonant coronal +lateral -tap_or_trill +retroflex": 'ɭ' 50 | "+resonant coronal +lateral -tap_or_trill +back": 'ɫ' 51 | "+resonant coronal +lateral +tap_or_trill -trill": 'ɺ' 52 | "+resonant coronal -lateral -tap_or_trill -retroflex": 'ɹ' 53 | "+resonant coronal -lateral -tap_or_trill +retroflex": 'ɻ' 54 | "+resonant coronal -lateral +tap_or_trill -trill -retroflex": 'ɾ' 55 | "+resonant coronal -lateral +tap_or_trill -trill +retroflex": 'ɽ' 56 | "+resonant coronal -lateral +tap_or_trill +trill": 'r' 57 | "+resonant labial -labiodental -lateral -tap_or_trill": 'β̞' 58 | "+resonant labial +labiodental -lateral -tap_or_trill": 'ʋ' 59 | "+resonant labial -lateral +tap_or_trill -trill": 'ʙ̆' 60 | "+resonant labial -lateral +tap_or_trill +trill": 'ʙ' 61 | "-resonant +nasal -fricative -affricate dorsal -high": 'ɴ' 62 | "-resonant +nasal -fricative -affricate dorsal +high -front": 'ŋ' 63 | "-resonant +nasal -fricative -affricate dorsal +high +front +palatalised_velar": 'ŋʲ' 64 | "-resonant +nasal -fricative -affricate dorsal +high +front -palatalised_velar": 'ɲ' 65 | "-resonant +nasal -fricative -affricate coronal -retroflex": 'n' 66 | "-resonant +nasal -fricative -affricate coronal +retroflex": 'ɳ' 67 | "-resonant +nasal -fricative -affricate labial -labiodental": 'm' 68 | "-resonant +nasal -fricative -affricate labial +labiodental": 'ɱ' 69 | "-resonant -nasal -fricative -affricate -voice dorsal -high": 'q' 70 | "-resonant -nasal -fricative -affricate +voice dorsal -high -implosive": 'ɢ' 71 | "-resonant -nasal -fricative -affricate dorsal -high +implosive": 'ʛ' 72 | "-resonant -nasal -fricative -affricate -voice dorsal +high -front -implosive": 'k' 73 | "-resonant -nasal -fricative -affricate +voice dorsal +high -front -implosive": 'g' 74 | "-resonant -nasal -fricative -affricate dorsal +high -front +implosive": 'ɠ' 75 | "-resonant -nasal -fricative -affricate -voice dorsal +high +front +palatalised_velar -implosive": 'kʲ' 76 | "-resonant -nasal -fricative -affricate +voice dorsal +high +front +palatalised_velar -implosive": 'gʲ' 77 | "-resonant -nasal -fricative -affricate dorsal +high +front +palatalised_velar +implosive": 'ɠʲ' 78 | "-resonant -nasal -fricative -affricate -voice dorsal +high +front -palatalised_velar -implosive": 'c' 79 | "-resonant -nasal -fricative -affricate +voice dorsal +high +front -palatalised_velar -implosive": 'ɟ' 80 | "-resonant -nasal -fricative -affricate dorsal +high +front -palatalised_velar +implosive": 'ʄ' 81 | "-resonant -nasal -fricative -affricate -voice coronal -implosive -retroflex": 't' 82 | "-resonant -nasal -fricative -affricate -voice coronal -implosive +retroflex": 'ʈ' 83 | "-resonant -nasal -fricative -affricate +voice coronal -implosive -retroflex": 'd' 84 | "-resonant -nasal -fricative -affricate +voice coronal -implosive +retroflex": 'ɖ' 85 | "-resonant -nasal -fricative -affricate coronal +implosive": 'ɗ' 86 | "-resonant -nasal -fricative -affricate -voice labial -implosive": 'p' 87 | "-resonant -nasal -fricative -affricate +voice labial -implosive": 'b' 88 | "-resonant -nasal -fricative -affricate labial +implosive": 'ɓ' 89 | "-resonant +fricative -voice dorsal -high": 'χ' 90 | "-resonant +fricative +voice dorsal -high": 'ʁ' 91 | "-resonant +fricative -voice dorsal +high -front": 'x' 92 | "-resonant +fricative +voice dorsal +high -front": 'ɣ' 93 | "-resonant +fricative -voice dorsal +high +front +palatalised_velar": 'xʲ' 94 | "-resonant +fricative +voice dorsal +high +front +palatalised_velar": 'ɣʲ' 95 | "-resonant +fricative -voice dorsal +high +front -palatalised_velar": 'ç' 96 | "-resonant +fricative +voice dorsal +high +front -palatalised_velar": 'ʝ' 97 | "-resonant +fricative -voice coronal -lateral -anterior -retroflex +front +laminal +sibilant": 'ɕ' 98 | "-resonant +fricative +voice coronal -lateral -anterior -retroflex +front +laminal +sibilant": 'ʑ' 99 | "-resonant +fricative -voice coronal -lateral -anterior -retroflex -front +laminal +sibilant": 'ʃ' 100 | "-resonant +fricative +voice coronal -lateral -anterior -retroflex -front +laminal +sibilant": 'ʒ' 101 | "-resonant +fricative -voice coronal -lateral -anterior -retroflex +front -laminal +sibilant": 'ɕ̺' 102 | "-resonant +fricative +voice coronal -lateral -anterior -retroflex +front -laminal +sibilant": 'ʑ̺' 103 | "-resonant +fricative -voice coronal -lateral -anterior -retroflex -front -laminal +sibilant": 'ʃ̺' 104 | "-resonant +fricative +voice coronal -lateral -anterior -retroflex -front -laminal +sibilant": 'ʒ̺' 105 | "-resonant +fricative -voice coronal -lateral -anterior +retroflex +sibilant": 'ʂ' 106 | "-resonant +fricative +voice coronal -lateral -anterior +retroflex +sibilant": 'ʐ' 107 | "-resonant +fricative -voice coronal -lateral +anterior +sibilant": 's' 108 | "-resonant +fricative +voice coronal -lateral +anterior +sibilant": 'z' 109 | "-resonant +fricative -voice coronal -lateral -sibilant +laminal": 'θ' 110 | "-resonant +fricative +voice coronal -lateral -sibilant +laminal": 'ð' 111 | "-resonant +fricative -voice coronal -lateral -sibilant -laminal": 'θ̺' 112 | "-resonant +fricative +voice coronal -lateral -sibilant -laminal": 'ð̺' 113 | "-resonant +fricative -voice coronal +lateral": 'ɬ' 114 | "-resonant +fricative +voice coronal +lateral": 'ɮ' 115 | "-resonant +fricative -voice labial -labiodental": 'ɸ' 116 | "-resonant +fricative +voice labial -labiodental": 'β' 117 | "-resonant +fricative -voice labial +labiodental": 'f' 118 | "-resonant +fricative +voice labial +labiodental": 'v' 119 | "-dorsal -coronal -labial -voice +constricted_glottis -pharyngealised": 'ʔ' 120 | "-dorsal -coronal -labial -voice +constricted_glottis +pharyngealised": 'ʔˁ' 121 | "-dorsal -coronal -labial -voice +spread_glottis": 'h' 122 | "-dorsal -coronal -labial +voice +spread_glottis": 'ɦ' 123 | "-dorsal -coronal -labial +pharyngealised -constricted_glottis -voice": 'ħ' # what if there's [+sg] here? 124 | "-dorsal -coronal -labial +pharyngealised +voice": 'ʕ' 125 | 126 | ligations: 127 | # the "-lateral"s here are cheats, especially the last 128 | "labial coronal": ["-labial", "-coronal -lateral", "[]͡[]"] 129 | "labial dorsal": ["-labial", "-dorsal -lateral", "[]͡[]"] 130 | "coronal dorsal": ["-coronal -lateral", "-dorsal", "[]͡[]"] 131 | "+long +affricate": ["-affricate", "+fricative -affricate -prenasalised", "[]ː͡[]"] # David's hack 132 | "+affricate": ["-affricate", "+fricative -affricate -prenasalised", "[]͡[]"] 133 | "+prenasalised": ["+nasal -affricate -fricative -prenasalised", "-prenasalised", "[][]"] 134 | 135 | modifiers: 136 | "+syllabic -dorsal": '[]̩' 137 | "+syllabic +lateral": '[]̩' 138 | "+syllabic +tap_or_trill": '[]̩' 139 | "+syllabic -resonant": '[]̩' 140 | "-syllabic +resonant dorsal -lateral -tap_or_trill": '[]̯' 141 | "+spread_glottis -voice": '[]ʰ' 142 | "+spread_glottis +voice": '[]̤' 143 | "+constricted_glottis -voice": '[]ʼ' 144 | "+constricted_glottis +voice": '[]̰' 145 | "+implosive": "[]˂" 146 | # various releases have symbols 147 | "-voice": '[]̥' 148 | #"+voice": '[]̬' # let's just not specify any voiceless unpaired Cs 149 | "+labiodental": '[]̪' # apparently IPA-sanctioned? also in Z-SAMPA. alternatively ȸ and ȹ 150 | "+laminal +anterior": '[]̻' 151 | "+laminal +retroflex": '[]̻' 152 | "+retroflex -syllabic": '[]̢' 153 | "+retroflex +syllabic": '[]˞' 154 | "-anterior -retroflex +laminal": '[]̱' # a common use of IPA underline 155 | "-anterior -retroflex -laminal": '[]̱̺' 156 | "+lateral": '[]ˡ' # that's meant for release, enh 157 | "+round": '[]ʷ' 158 | "+front": '[]ʲ' 159 | "-back dorsal -front": '[]̟' 160 | "-ATR": '[]̙' 161 | "+back -dorsal": '[]ˠ' 162 | "+pharyngealised": '[]ˤ' 163 | "+nasal": '[]̃' 164 | "+long": '[]ː' 165 | -------------------------------------------------------------------------------- /phonology/PhoneSet.pm: -------------------------------------------------------------------------------- 1 | package PhoneSet; 2 | use strict; 3 | 4 | # If $multi is 0, parse into $s a description of a phone set given in $d by {condition}, {except}, {extras}. 5 | # If $multi is 1, parse into $s a hash of the same for multiple indexed phones. 6 | # 7 | # If passed a hash in {base}, overwrite that. 8 | sub parse { 9 | my ($d, $multi, %args) = (shift, shift, shift, @_); 10 | my $FS = $args{FS}; 11 | my $s = {}; 12 | $s = $args{base} if (defined $args{base}); 13 | 14 | # If e.g. offset = -1, then the phone at the beginning of the commaed list of conditions is phone -1. 15 | # This is necessary e.g. for split rules that want to add a left environment. 16 | # Beware: rules may not work if none of their indices is 0. 17 | my $offset = 0; 18 | $offset = $d->{offset} if defined $d->{offset}; 19 | 20 | my @phones = map $FS->parse($_), split /, */, $d->{condition}, -1; 21 | for (0..$#phones) { 22 | if (defined $s->{$_+$offset}) { 23 | $s->{$_+$offset}{condition} = $FS->intersect($s->{$_+$offset}{condition}, $phones[$_]); 24 | } else { 25 | $s->{$_+$offset}{condition} = $phones[$_]; 26 | } 27 | bless $s->{$_+$offset}; 28 | } 29 | 30 | if (defined $d->{except}) { 31 | # two styles: hash for backward compatibility, string so that specifying single phones is sane 32 | if (ref $d->{except} eq 'HASH') { 33 | for my $displ (keys %{$d->{except}}) { 34 | $s->{$displ}{except} .= ' ' if defined $s->{$displ}{except}; 35 | $s->{$displ}{except} .= join ' ', map $FS->parse($_), split / *\| */, $d->{except}{$displ}; 36 | } 37 | } else { 38 | my @exceptions = map $FS->parse($_), split /, */, $d->{except}, -1; 39 | for my $displ (0..$#exceptions) { 40 | $s->{$displ+$offset}{except} .= ' ' if defined $s->{$displ}{except}; 41 | $s->{$displ+$offset}{except} .= join ' ', map $FS->parse($_), split / *\| */, $d->{except}{$displ}; 42 | } 43 | } 44 | } 45 | 46 | my $pause_phone; 47 | @_ = split / +([0-9.]+) */, $d->{pause_phone}; 48 | if (scalar @_ == 1) { 49 | $pause_phone = $FS->parse($d->{pause_phone}, undefined => 1); 50 | } elsif (scalar @_ > 1) { 51 | $pause_phone = $FS->parse(PhonologicalRule::weighted_one_of(@_), undefined => 1); 52 | } 53 | # As a corollary of the sort here, '-' assignments follow '+' ones. TODO: make this saner? 54 | for my $e (sort keys %{$d->{extras}}) { 55 | if (rand() < $d->{extras}{$e}) { 56 | my ($e0, $e1); 57 | if ($e =~ /^(.*) ([^ ]*)$/) { 58 | ($e0, $e1) = ($1, $2); 59 | } else { 60 | ($e0, $e1) = ($e, 0); 61 | } 62 | if ($e0 eq '##') { # ad hoc notation for _only_ at extremum of word 63 | $s->{$e1}{or_pause} = $pause_phone; 64 | substr($s->{$e1}{condition}, 0, 1) = 'x'; # ad hoc match prevention 65 | } elsif ($e0 eq '#') { # end of word _allowed_ 66 | $s->{$e1}{or_pause} = $pause_phone; 67 | } elsif ($e0 =~ /^!/) { 68 | $s->{$e1}{except} .= ' ' if defined $s->{$e1}{except}; 69 | $s->{$e1}{except} .= $FS->parse(substr($e0,1)); 70 | } else { 71 | $s->{$e1}{condition} = $FS->overwrite($s->{$e1}{condition}, $FS->parse($e0)); 72 | } 73 | } 74 | } 75 | 76 | # If this wasn't supposed to be multiple phones, lift everything up a level. 77 | unless ($multi) { 78 | $s->{$_} = $s->{0}{$_} for keys %{$s->{0}}; 79 | delete $s->{0}; 80 | bless $s; 81 | } 82 | 83 | $s; 84 | } 85 | 86 | # Test a phoneset against a single phone. 87 | sub matches { 88 | my ($self, $phone) = (shift, shift); 89 | if (defined $self->{condition}) { 90 | return 0 unless $phone =~ /^$self->{condition}$/; 91 | } 92 | if (defined $self->{except}) { 93 | for my $exception (split / /, $self->{except}) { 94 | return 0 if $phone =~ /^$exception$/; 95 | } 96 | } 97 | return 1; 98 | } 99 | 100 | # Simplify a phoneset. 101 | # This could use more testing. 102 | # Possible improvements: consolidate pairs of excepts; handle 'u' values. 103 | sub simplify { 104 | my ($self, $FS) = (shift, shift); 105 | 106 | if (defined $self->{condition} and $self->{except}) { 107 | my $condition = $FS->add_entailments($FS->add_requirements($self->{condition})); 108 | $condition = $FS->intersect($condition, $self->{enriched_condition}) if defined $self->{enriched_condition}; 109 | my @except = split / /, $self->{except}; 110 | { # loop if the condition has changed, so we need to retest the excepts 111 | my $must_redo = 0; 112 | for (my $i = 0; $i < @except; ++$i) { 113 | # x out the condition, and return, if everything's excepted. 114 | if ($condition =~ /^$except[$i]$/) { 115 | $self->{condition} = 'x' x @{$FS->{features}}; 116 | delete $self->{except}; 117 | return; 118 | } 119 | 120 | # Remove excepts that don't matter. 121 | if (!$FS->compatible($condition, $FS->add_entailments($FS->add_requirements($except[$i])) )) { 122 | splice @except, $i, 1; 123 | redo; 124 | } 125 | 126 | # For excepts that only set one feature further to the condition, remove them and set a feature 127 | # in the condition if the prerequisites are all there, and a stripping doesn't preclude that. 128 | my $excess_feature = undef; 129 | my $just_one = 1; 130 | for (0..length($condition)-1) { 131 | if (substr($except[$i], $_, 1) =~ /[01]/ and substr($condition, $_, 1) eq '.') { 132 | if (defined $excess_feature) { 133 | $just_one = 0; last; 134 | } else { 135 | $excess_feature = $_; 136 | } 137 | } 138 | } 139 | if ($just_one and defined $excess_feature) { 140 | $_ = $FS->parse($FS->{features}[$excess_feature]{requires}); 141 | if ($condition =~ /^$_$/) { 142 | my $cant_be_stripped = 1; 143 | for my $str (@{$FS->{strippings}}) { 144 | $_ = $FS->parse($str->{strip}); 145 | if ($FS->compatible($condition, $str->{condition_parsed})) { 146 | $cant_be_stripped = 0; last; 147 | } 148 | } 149 | if ($cant_be_stripped) { 150 | # now we know that things matching the condition will not be 'u' in this position. 151 | substr($self->{condition}, $excess_feature, 1) = 1 - substr($except[$i], $excess_feature, 1); 152 | $must_redo = 1; 153 | splice @except, $i, 1; 154 | redo; 155 | } 156 | } 157 | } 158 | } # $i 159 | 160 | redo if $must_redo; 161 | } 162 | $self->{except} = join ' ', @except; 163 | delete $self->{except} unless @except; 164 | } 165 | } 166 | 167 | 1; 168 | -------------------------------------------------------------------------------- /phonology/PhonologicalRule.pm: -------------------------------------------------------------------------------- 1 | package PhonologicalRule; 2 | use strict; 3 | use constant INF => 9**9**9; # is there really nothing sensible better? 4 | 5 | # Each rule is a hash. In the simplest case, it has hashes that 6 | # contain hashes with keys including {condition} and {effects}, 7 | # giving the before and after of the rule in this position. 8 | # The top-level keys are relative indices into the word, and these are always integers 9 | # starting from 0 and sequentially increasing. (So e.g. $rule->{0}{condition} exists.) 10 | # 11 | # Hashes with keys {condition} and {except} are used elsewhere to specify phone classes too. 12 | # 13 | # Other kinds of effects include {deletions}. 14 | 15 | # Dump this rule without the feature system. 16 | sub debug_dump { 17 | my $self = shift; 18 | my $a = { %$self }; 19 | delete $a->{FS}; 20 | YAML::Any::Dump($a); 21 | } 22 | 23 | # Return the indices for this rule. If passed an argument, return the indices that have that datum. 24 | sub indices { 25 | my $self = shift; 26 | if (@_) { 27 | return grep((/^-?[0-9]*$/ and defined $self->{$_}{$_[0]}), keys %$self); 28 | } else { 29 | return grep /^-?[0-9]*$/, keys %$self; 30 | } 31 | } 32 | 33 | # Make a copy of this rule which deeply copies the indexed parts. 34 | sub deep_copy_indexed { 35 | my $self = shift; 36 | my $a = { %$self }; 37 | bless $a; 38 | for my $i (grep /^-?[0-9]*$/, keys %$self) { 39 | $a->{$i} = { %{$self->{$i}} }; 40 | bless $a->{$i}, 'PhoneSet'; 41 | delete $a->{$i}{condition_ar}; 42 | delete $a->{$i}{outcome}; 43 | } 44 | if (defined $self->{filter}) { 45 | $a->{filter} = { %{$self->{filter}} }; 46 | bless $a->{filter}, 'PhoneSet'; 47 | } 48 | delete $a->{broken_tags}; 49 | $a; 50 | } 51 | 52 | # Choose randomly from a hash giving weight distribution. (Not called everywhere it might be, yet.) 53 | # Class method. 54 | sub weighted_one_of { 55 | my $sum = 0; 56 | $sum += $_[2*$_+1] for 0..@_/2-1; 57 | $sum = rand $sum; 58 | # Sort again, just in case the argument really was a hash 59 | # (because it'll be randomly ordered, breaking determinism when the seed is constant). 60 | my %distribution = @_; 61 | for my $a (sort keys %distribution) { 62 | return $a if ($sum - $distribution{$a}) < 0; 63 | } 64 | } 65 | 66 | 67 | 68 | # Return a skeletal rule with essentially nothing in it. Class method. 69 | sub skeletal_rule { 70 | my $FS = shift; 71 | my $rule = { 72 | 0 => {condition => $FS->parse(''), effects => $FS->parse('')}, 73 | FS => $FS, 74 | }; 75 | bless $rule; 76 | } 77 | 78 | # Memoise a rule for the computations performed in feeds(!). 79 | # These things can be totally stripped out once the phonology is finalised. 80 | sub feed_annotate { 81 | my $rule = shift; 82 | my $FS = $rule->{FS}; 83 | for my $displ ($rule->indices('condition')) { 84 | $rule->{$displ}{condition_ar} = $FS->add_requirements($rule->{$displ}{condition}); 85 | if (defined $rule->{$displ}{effects}) { 86 | $rule->{$displ}{outcome} = $FS->overwrite($FS->add_requirements($rule->{$displ}{condition}), $rule->{$displ}{effects}); 87 | $rule->{$displ}{outcome} =~ s/[<>]/./g; 88 | } 89 | } 90 | } 91 | 92 | sub strip_feed_annotation { 93 | my $rule = shift; 94 | for my $i ($rule->indices()) { 95 | delete $rule->{$i}{condition_ar}; 96 | delete $rule->{$i}{outcome}; 97 | } 98 | } 99 | 100 | # Given two rules ri, rj, can the execution of ri cause rj to be applicable 101 | # where it wasn't before? This can yield false positives. 102 | 103 | # This had been somewhat of a bottleneck, but I haven't checked for a long time. 104 | 105 | # TODO: account for except. 106 | 107 | sub feeds { 108 | my ($ri, $rj, %args) = (shift, shift, @_); 109 | my $FS = $ri->{FS}; # shd check that they're the same 110 | 111 | # TODO: update this as we get new rule types 112 | # an insertion means we have to look at everything (whereas a fission might be okay with less); etc. 113 | return 1 if scalar $ri->indices('deletions') and 114 | (scalar $ri->indices('condition') > 1) or scalar $ri->indices('or_pause'); 115 | 116 | for my $i_displ ($ri->indices('effects')) { 117 | for my $j_displ ($rj->indices('condition')) { 118 | # this is costly enough that it's slightly worth putting it in here. klugily, assume index 0 has a condition 119 | $ri->feed_annotate() if !defined $ri->{0}{condition_ar}; 120 | $rj->feed_annotate() if !defined $rj->{0}{condition_ar}; 121 | next if !$FS->compatible($ri->{$i_displ}{outcome}, $rj->{$j_displ}{condition_ar}); 122 | 123 | # We might have rules which unnecessarily set features identically 124 | # to their precondition (antithetical, I'm thinking of you); this can't feed, of course. 125 | for my $f (0..@{$FS->{features}}-1) { 126 | if (substr($ri->{$i_displ}{effects}, $f, 1) =~ /[<>]/ and 127 | substr($rj->{$j_displ}{condition}, $f, 1) ne '.') { 128 | # Don't count this assimilation as feeding another rule if it can't create the precondition 129 | # _in a word that lacked it before_. 130 | my $effects = $ri->{$i_displ}{effects}; 131 | if ($effects =~ />/ and $effects =~ //) ? 1 : -1; 136 | my $condition = $rj->{$j_displ}{condition}; 137 | my $trigger = '.' x @{$FS->{features}}; 138 | for (0..$#{$FS->{features}}) { 139 | substr($trigger, $_, 1) = substr($condition, $_, 1) if substr($effects, $_, 1) =~ /[<>]/; 140 | } 141 | $trigger = $FS->add_entailments($FS->add_requirements($FS->intersect($trigger, $ri->{$i_displ+$direction}{condition}))); 142 | next if $trigger =~ /^$condition$/; 143 | 144 | return 1 unless defined $args{pairs}; 145 | push @{$args{pairs}}, [$i_displ, $j_displ]; next; 146 | } 147 | # or, this rule could force the assimilation to apply again. kluge out undefineds 148 | if (substr($rj->{$j_displ}{effects}, $f, 1) =~ /[<>]/ and 149 | substr($ri->{$i_displ}{condition}, $f, 1) !~ /[.u]/) { 150 | return 1 unless defined $args{pairs}; 151 | push @{$args{pairs}}, [$i_displ, $j_displ]; next; 152 | } 153 | if (substr($ri->{$i_displ}{effects}, $f, 1) eq 154 | substr($rj->{$j_displ}{condition}, $f, 1) and 155 | substr($ri->{$i_displ}{effects}, $f, 1) ne 156 | substr($ri->{$i_displ}{condition}, $f, 1) and 157 | substr($ri->{$i_displ}{effects}, $f, 1) ne '.') { 158 | return 1 unless defined $args{pairs}; 159 | push @{$args{pairs}}, [$i_displ, $j_displ]; next; 160 | } 161 | 162 | } 163 | } 164 | } 165 | 166 | return 0 unless defined $args{pairs}; 167 | return @{$args{pairs}} ? 1 : 0; 168 | } 169 | 170 | # Two rules conflict if they feed each other at the same displacement and their 171 | # outcomes are incompatible. 172 | 173 | sub conflicts_with { 174 | my ($ri, $rj, %args) = (shift, shift, @_); 175 | my $FS = $ri->{FS}; # shd check if they're the same 176 | my (@pij, @pji); 177 | return 0 unless $ri->feeds($rj, pairs => \@pij) and $rj->feeds($ri, pairs => \@pji); 178 | for my $dij (@pij) { 179 | for my $dji (@pji) { 180 | my ($i, $j) = ($dij->[0], $dji->[0]); 181 | next unless $i eq $dji->[1] and $j eq $dij->[1]; 182 | @{$args{indices}} = ($i, $j) if defined $args{indices}; 183 | return 1 if !$FS->compatible($ri->{$i}{effects}, $rj->{$j}{effects}); 184 | } 185 | } 186 | return 0; 187 | } 188 | 189 | 190 | 191 | # Test a hash of phonesets against a word, at given displacement $i. 192 | # 193 | # If $args{context_dependent} is defined (assumed to be a hash), 194 | # and this word could come to match (or fail to) if it were prefixed 195 | # or suffixed, record this fact there, under the key ''. 196 | # (Assumed to be use with nopause.) 197 | 198 | sub matches_word { 199 | my ($self, $word, $i, %args) = (shift, shift, shift, @_); 200 | my $context_dependent = undef; 201 | for my $displ ($self->indices()) { 202 | if ($i + $displ < 0 or $i + $displ >= @$word) { 203 | next if !$args{nopause} and defined $self->{$displ}{or_pause}; 204 | if (defined $self->{$displ}{condition}) { 205 | return 0 unless defined $args{context_dependent} and 206 | !(defined $self->{$displ}{effects} or defined $self->{$displ}{deletions}); 207 | $context_dependent = 1; 208 | } 209 | next; 210 | } 211 | return 0 unless PhoneSet::matches($self->{$displ}, $word->[$i+$displ]); 212 | } 213 | if ($context_dependent) { # we haven't already returned thanks to a mismatch... 214 | $args{context_dependent}{''} = [@$word] if scalar $self->indices() >= 2; 215 | return 0; 216 | } 217 | return 1; 218 | } 219 | 220 | 221 | 222 | # Return a left-right reversed copy of this rule. 223 | # TODO: this should reverse outcomes internally. 224 | sub reverse_rule { 225 | my $self = shift; 226 | my $a = { %$self }; 227 | bless $a; 228 | my @indices = $self->indices(); 229 | my $max = -1 * INF; 230 | for (@indices) { 231 | delete $a->{$_}; 232 | $max = $_ if $_ > $max; 233 | } 234 | for (@indices) { 235 | $a->{$max - $_} = $self->{$_}; 236 | } 237 | for ($a->indices()) { 238 | if (defined $a->{$_}{effects}) { 239 | $a->{$_}{effects} =~ y/<>/>{direction}) { 243 | $a->{direction} = -$self->{direction}; 244 | } else { 245 | $a->{direction} = -1; 246 | } 247 | $a; 248 | } 249 | 250 | 251 | 252 | # If %args includes a list {changes}, tags describing the particular changes caused 253 | # will be pushed. These tags are: 254 | # "c $v $f" -- feature $f was changed to value $v 255 | # "d" -- a segment was deleted 256 | # "r" -- a segment was replicated (as prelude to some kind of breaking or metathesis) 257 | 258 | # HERE: the current big thing in progress is implementing changes which epenthesise. 259 | # I currently envision doing these entirely by multiplying the outcomes of a single phone. 260 | # After this is finished, don't forget the various collision-detecting machinery! 261 | # Furrther down the line, can deletions be eliminated as a special kind of thing? 262 | 263 | sub run { 264 | my ($rule, $unfiltered_word, %args) = (shift, shift, @_); 265 | my $changed = 0; 266 | 267 | my $original_word = [@$unfiltered_word]; 268 | my $reversed = $rule->{bidirectional} ? 1 : 0; 269 | { 270 | my $word = $unfiltered_word; 271 | my $operating_on_subword = 0; 272 | my @inverse_filter = 0..@$unfiltered_word-1; 273 | if ($reversed) { 274 | $operating_on_subword = 1; 275 | @inverse_filter = reverse @inverse_filter; 276 | $word = [reverse @$word]; 277 | } 278 | if (defined $rule->{filter}) { 279 | $operating_on_subword = 1; 280 | @inverse_filter = grep $rule->{filter}->matches($unfiltered_word->[$_]), @inverse_filter; 281 | $word = [grep $rule->{filter}->matches($_), @$word]; 282 | } 283 | my @surviving = (1,) x @$word; # actually a count 284 | 285 | # iterate in the direction specified 286 | my @displs = -1..@$word; # start at -1 for assimilations to word-initial pause, and end at @$word for word-final with {-1,0} 287 | # will need to be changed if there can be rules whose indices are all strictly of the same sign 288 | @displs = reverse @displs if (defined $rule->{direction} and $rule->{direction} < 0); 289 | for my $i (@displs) { 290 | next unless $rule->matches_word($word, $i, 291 | nopause => $args{nopause}, 292 | context_dependent => $args{context_dependent}); 293 | 294 | for my $displ ($rule->indices('effects')) { 295 | next if ($i + $displ < 0 or $i + $displ >= @$word); 296 | my @effects = split / /, $rule->{$displ}{effects}; 297 | if (defined $args{alternate_effects}) { 298 | @effects = $rule->{$displ}{alternate_effects} if $args{alternate_effects}; # for generation only? 299 | } 300 | if (@effects != 1) { 301 | $changed = 1; 302 | push @{$args{changes}}, ('r') x (@effects - 1); 303 | } 304 | 305 | for my $j (0..$#effects) { 306 | my $effects = $effects[$j]; 307 | # Handle the assimilation characters. This is still okay for multi-phone resolutions; 308 | # there is no reason to use an assimilation character except out across an edge. 309 | if ($effects =~ /[<>]/) { 310 | my ($next_before, $next_after) = (undef, undef); 311 | for ($rule->indices('condition')) { 312 | $next_before = $_ if (!defined $next_before or $next_before < $_) and $_ < $displ; 313 | $next_after = $_ if (!defined $next_after or $next_after > $_) and $_ > $displ; 314 | } 315 | while ($effects =~ /= 0 ? $word->[$i+$next_before] : $rule->{$next_before}{or_pause}, $c, 1); 319 | } 320 | while ($effects =~ />/) { 321 | my $c = index($effects, '>'); 322 | substr($effects, $c, 1) = 323 | substr($i+$next_after < @$word ? $word->[$i+$next_after] : $rule->{$next_after}{or_pause}, $c, 1); 324 | } 325 | # We must entail the effects, not just the overwritten phone, since otherwise 326 | # jumps over the middle point on an antithetical scale won't always work. 327 | $effects = $rule->{FS}->add_entailments($effects); 328 | } 329 | my $newphone = $rule->{FS}->overwrite($word->[$i+$displ], $effects); 330 | $effects[$j] = $newphone; 331 | 332 | if ($word->[$i+$displ] ne $newphone) { 333 | $changed = 1; 334 | push @{$args{changes}}, FeatureSystem::change_record($word->[$i+$displ], $newphone) if defined $args{changes}; 335 | } 336 | } # $j 337 | 338 | #TODO: (proximal) reverse internally if necessary. 339 | splice @$word, $i+$displ, 1, @effects; 340 | @surviving[$i+$displ] = scalar @effects; 341 | } 342 | 343 | if (scalar $rule->indices('deletions')) { 344 | $changed = 1; 345 | push @{$args{changes}}, 'd' if defined $args{changes}; 346 | splice @$word, $i+$_, 1 for sort {$b <=> $a} $rule->indices('deletions'); 347 | if ($operating_on_subword) { 348 | $surviving[$i+$_] = 0 for grep(($i+$_ >= 0 and $i+$_ < @surviving), sort {$b <=> $a} $rule->indices('deletions')); 349 | } else { 350 | # can only do this if it's not a subword 351 | if (defined $args{sources}) { 352 | splice @{$args{sources}}, $i+$_, 1 for sort {$b <=> $a} $rule->indices('deletions'); 353 | } 354 | } 355 | $i -= scalar $rule->indices('deletions'); 356 | } 357 | } 358 | 359 | if ($operating_on_subword) { 360 | # Assume @inverse_filter is either ordered or reverse-ordered. 361 | my @indices = (0..$#inverse_filter); 362 | @indices = reverse @indices if (@inverse_filter >= 2 and $inverse_filter[0] < $inverse_filter[1]); 363 | my $a = 0; 364 | my @splice_ranges = (0, map $a += $_, @surviving); 365 | my @word = @$word; 366 | 367 | for my $i (@indices) { 368 | splice @$unfiltered_word, $inverse_filter[$i], 1, (@word[($splice_ranges[$i])..($splice_ranges[$i+1]-1)]); 369 | if (defined $args{sources}) { 370 | splice @{$args{sources}}, $inverse_filter[$i], 1, (($args{sources}[$inverse_filter[$i]]) x $surviving[$i]); 371 | } 372 | } # i 373 | } 374 | 375 | if ($rule->{bidirectional} and $reversed) { 376 | $reversed = 0; 377 | redo; 378 | } 379 | } # block for reverse iteration 380 | 381 | # Cases in which we might have found changes that weren't actually changes. 382 | if ($rule->{bidirectional}) { 383 | $changed = 0 if join(' ', @$original_word) eq join(' ', @$unfiltered_word); #kluge 384 | } 385 | 386 | $changed; 387 | } 388 | 389 | # Record on this rule that rule $rj, whose number is $j, should be inactivated if this one is chosen. 390 | sub mark_to_inactivate { 391 | my ($self, $rj, $j) = (shift, shift, shift); 392 | push @{$self->{inactivate}}, $j; 393 | # For now, don't regenerate split-off pieces of split rules, since they'll come back without the base part of their condition. 394 | # this is an ugly kluge, but few rules have more than one effect 395 | if (defined $rj->{tag} and $rj->{tag} !~ /_split/) { 396 | my $evitands = join '|', map($rj->{$_}{effects}, $rj->indices('effects')); 397 | push @{$self->{broken_tags}}, $rj->{tag} . ' ' . $evitands; 398 | } 399 | } 400 | 401 | # Create persistent and impersistent variants of this rule. Weight appropriately. 402 | # In fact, there are two kinds of persistent variants; one tries to redo every rule it conflicts with, 403 | # while one takes on their conditions as excepts in many cases (if this rule is recastable enough). 404 | sub persistence_variants { 405 | my ($self, $base_weight, $pd, $persistence_weight, $no_persist, $generable_val) = 406 | (shift, shift, shift, shift, shift, shift); 407 | my $phonology = $pd->{phonology}; 408 | my @makings; 409 | 410 | my $impersistent = $self->deep_copy_indexed(); 411 | $impersistent->{inactive} = scalar @$phonology; 412 | push @makings, [$impersistent, $base_weight * (1 - $persistence_weight)]; 413 | 414 | unless ($no_persist) { 415 | my $redo = $self->deep_copy_indexed(); 416 | my $loopbreak_penalty = 1 - $self->{recastability}; 417 | my $redo_and_except = $self->deep_copy_indexed(); 418 | my $loopbreak_penalty_and_except = $self->{recastability}; 419 | 420 | # The test for looping we do here was at one point the most expensive thing 421 | # in the phonology generation. By way of cutting down, only check rules 422 | # which set something the (potential) opposite of this rule. 423 | my @potential_conflicts; 424 | for my $displ ($redo->indices('effects')) { 425 | for my $i (0..@{$pd->{FS}{features}}-1) { 426 | push @potential_conflicts, @{$generable_val->[1-substr($redo->{$displ}{effects}, $i, 1)][$i]} 427 | if substr($redo->{$displ}{effects}, $i, 1) =~ /[01]/ 428 | and defined($generable_val->[1-substr($redo->{$displ}{effects}, $i, 1)][$i]); 429 | if (substr($redo->{$displ}{effects}, $i, 1) =~ /[<>]/) { 430 | push @potential_conflicts, @{$generable_val->[0][$i]} if defined($generable_val->[0][$i]); 431 | push @potential_conflicts, @{$generable_val->[1][$i]} if defined($generable_val->[1][$i]); 432 | } 433 | } 434 | } 435 | my %pch = map(($_ => 1), @potential_conflicts); 436 | @potential_conflicts = sort keys %pch; # uniq 437 | 438 | my @conflict_indices; 439 | for my $j (@potential_conflicts) { 440 | next if defined $phonology->[$j]{inactive} and $phonology->[$j]{inactive} < @$phonology; 441 | if ($self->conflicts_with($phonology->[$j], indices => \@conflict_indices)) { 442 | #print STDERR "clash of $self->{tag} with $j\n"; #debug 443 | my $recastability = 1; 444 | $recastability = $phonology->[$j]{recastability} if defined $phonology->[$j]{recastability}; 445 | 446 | $redo->mark_to_inactivate($phonology->[$j], $j); 447 | $loopbreak_penalty *= $recastability; 448 | 449 | if (1) { 450 | my $clash = $pd->{FS}->overwrite($phonology->[$j]{$conflict_indices[1]}{condition}, $phonology->[$j]{$conflict_indices[1]}{effects}); 451 | $clash =~ s/u/./g; # in case e.g. of forcing undefined 452 | $redo_and_except->{$conflict_indices[0]}{except} .= ' ' if defined $redo_and_except->{$conflict_indices[0]}{except}; 453 | $redo_and_except->{$conflict_indices[0]}{except} .= $clash; 454 | } else { 455 | $redo_and_except->mark_to_inactivate($phonology->[$j], $j); 456 | $loopbreak_penalty_and_except *= $recastability; 457 | } 458 | } 459 | } 460 | push @makings, [$redo, $base_weight * $persistence_weight * $loopbreak_penalty]; 461 | push @makings, [$redo_and_except, $base_weight * $persistence_weight * $loopbreak_penalty_and_except]; 462 | } # unless ($no_persist) 463 | 464 | @makings; 465 | } 466 | 467 | # Generate an extra condition for this rule. 468 | 469 | sub gen_extra_condition { 470 | my ($self, %args) = (shift, @_); 471 | my $FS = $self->{FS}; 472 | my (%resolution_keys, %resolutions); 473 | my $global_res_count = 0; 474 | 475 | for my $locus ($self->indices('or_pause')) { 476 | # Restriction to word-extremal, and away from it. 477 | my $rule1 = $self->deep_copy_indexed(); 478 | substr($rule1->{$locus}{condition}, 0, 1) = 'x'; 479 | $resolution_keys{$global_res_count} = 0.5; # magic weight 480 | $resolutions{$global_res_count++} = $rule1; 481 | 482 | my $rule2 = $self->deep_copy_indexed(); 483 | delete $rule2->{$locus}{or_pause}; 484 | $resolution_keys{$global_res_count} = 0.5; # magic weight 485 | $resolutions{$global_res_count++} = $rule2; 486 | } 487 | 488 | for my $locus ($self->indices('effects')) { 489 | my $effect = $self->{$locus}{effects}; 490 | 491 | # Conditions of the same family as the effects (which we don't have stored in a special structure). 492 | %_ = map(($_ => 1), map split(/ /, $FS->{features}[$_]{families}), 493 | grep substr($effect, $_, 1) ne '.', 0..length($effect)-1); 494 | my @families = grep $FS->compatible($FS->parse($FS->{families}{$_}), $self->{$locus}{condition}), 495 | grep $_, keys %_; 496 | my @family_features = grep { 497 | my $i = $_; 498 | grep $FS->{features}[$i]{families} =~ /\b$_\b/, @families; 499 | } grep((defined $args{generable_val}[0][$_] && @{$args{generable_val}[0][$_]} && 500 | defined $args{generable_val}[1][$_] && @{$args{generable_val}[1][$_]}), 501 | 0..length($effect)-1); 502 | # TODO: handle this when there's no generable_val. also, a more uniform way of dropping ungenerables for the later types 503 | for my $f (@family_features) { 504 | next if substr($self->{$locus}{condition}, $f, 1) ne '.'; 505 | for my $v (0..1) { 506 | next if $v == 0 and $FS->{features}[$f]{univalent}; 507 | my $rule1 = $self->deep_copy_indexed(); 508 | substr($rule1->{$locus}{condition}, $f, 1) = $v; 509 | $resolution_keys{$global_res_count} = $FS->{features}[$f]{univalent} ? 1.0 : 0.5; # magic factor 510 | # equiprobable on features, aot on their values 511 | $resolutions{$global_res_count++} = $rule1; 512 | } 513 | } 514 | 515 | # Conditions related to the outcome. 516 | my $outcome = $FS->overwrite($self->{$locus}{condition}, $effect); 517 | $outcome =~ s/[<>]/./; 518 | for my $rel (@{$FS->{relations}}) { 519 | next if $rel->{spread_only}; 520 | $_ = $FS->parse($rel->{to}); 521 | next unless $outcome =~ /^$_$/; 522 | 523 | my $rule1 = $self->deep_copy_indexed(); 524 | my $extra = $FS->parse($rel->{from}); 525 | next unless $FS->compatible($rule1->{$locus}{condition}, $extra); 526 | $rule1->{$locus}{condition} = $FS->overwrite($rule1->{$locus}{condition}, $extra); 527 | next if $rule1->{$locus}{condition} == $self->{$locus}{condition}; 528 | $resolution_keys{$global_res_count} = $rel->{weight}; # magic factor 529 | $resolutions{$global_res_count++} = $rule1; 530 | } 531 | 532 | # Conditions to which the outcome is a (possibly related) assimilation. 533 | # TODO: once preconditions can have equality w/out fixed values, allow it here in special cases (like homorganicity). 534 | # Also once assimilations can be long distance, allow that in those cases here -- but that should be automatic. 535 | unless ($args{bar_sequences}) { 536 | for my $f (0..$#{$FS->{features}}) { 537 | next if substr($effect, $f, 1) eq '.'; 538 | 539 | EF_ASSIM: for my $d (@{$FS->{features}[$f]{assimilation}}) { 540 | my @condition = map $FS->parse($_), split /, */, $d->{condition}, -1; 541 | next unless $outcome =~ /^$condition[$d->{target}]$/; 542 | 543 | my $rule1 = $self->deep_copy_indexed(); 544 | for my $displ (0..$#condition) { 545 | my $l = $locus + $displ - $d->{target}; 546 | if (!defined $rule1->{$l}{condition}) { 547 | $_ = $FS->parse($FS->{generic_pause_phone}); 548 | $rule1->{$l}{or_pause} = $_ if /^$condition[$displ]$/; 549 | $rule1->{$l}{condition} = '.' x length($effect); 550 | } 551 | next EF_ASSIM unless $FS->compatible($rule1->{$l}{condition}, $condition[$displ]); 552 | $rule1->{$l}{condition} = $FS->overwrite($rule1->{$l}{condition}, $condition[$displ]); 553 | } 554 | $_ = '.' x length($effect); 555 | substr($_, $f, 1) = substr($rule1->{$locus}{condition}, $f, 1); 556 | next unless $FS->compatible(substr($rule1->{$locus + 1 - 2*$d->{target}{condition}}, $f, 1), $_); 557 | substr($rule1->{$locus + 1 - 2*$d->{target}{condition}}, $f, 1) = 558 | $FS->overwrite(substr($rule1->{$locus + 1 - 2*$d->{target}{condition}}, $f, 1), $_); # impose the actual assimilation 559 | 560 | $resolution_keys{$global_res_count} = ($d->{prob} >= 1/24.0 ? 1/24.0 : $d->{prob}) * 48; # magic factor 561 | $resolutions{$global_res_count++} = $rule1; 562 | } 563 | } 564 | 565 | # pretty duplicative :/ 566 | for my $r (@{$FS->{relations}}) { 567 | $_ = $FS->parse($r->{to}); 568 | next if $effect !~ /^$_$/; 569 | 570 | EF_ASSIMR: for my $d (@{$r->{assimilation}}) { 571 | my @condition = map $FS->parse($_), split /, */, $d->{condition}, -1; 572 | next unless $outcome =~ /^$condition[$d->{target}]$/; 573 | 574 | my $rule1 = $self->deep_copy_indexed(); 575 | for my $displ (0..$#condition) { 576 | my $l = $locus + $displ - $d->{target}; 577 | if (!defined $rule1->{$l}->{condition}) { 578 | $_ = $FS->parse($FS->{generic_pause_phone}); 579 | $rule1->{$l}{or_pause} = $_ if /^$condition[$displ]$/; 580 | $rule1->{$l}{condition} = '.' x length($effect); 581 | } 582 | next EF_ASSIMR unless $FS->compatible($rule1->{$l}{condition}, $condition[$displ]); 583 | $rule1->{$l}{condition} = $FS->overwrite($rule1->{$l}{condition}, $condition[$displ]); 584 | } 585 | $_ = $FS->parse($r->{from}); 586 | next unless $FS->compatible($rule1->{$locus + 1 - 2*$d->{target}{condition}}, $_); 587 | $rule1->{$locus + 1 - 2*$d->{target}{condition}} = 588 | $FS->overwrite($rule1->{$locus + 1 - 2*$d->{target}{condition}}, $_); # impose the actual assimilation 589 | 590 | $resolution_keys{$global_res_count} = ($d->{prob} >= 1/24.0 ? 1/24.0 : $d->{prob}) * 48; # magic factor 591 | $resolutions{$global_res_count++} = $rule1; 592 | } 593 | } 594 | } 595 | 596 | # Conditions that avoid a marked situation changed by a previous rule. 597 | for my $old_rule (@{$args{phonology}{phonology}}) { 598 | next if defined $old_rule->{inactive}; 599 | next if scalar $old_rule->indices('condition') >= 2 and $args{bar_sequences}; 600 | for my $old_locus ($old_rule->indices('effects')) { 601 | my $old_precondition = $old_rule->{$old_locus}{condition}; 602 | next if $old_precondition =~ /u/; 603 | next unless $FS->compatible($old_precondition, $outcome); 604 | my $old_effect = $old_rule->{$old_locus}{effects}; 605 | # We take a rule to avoid markedness if its effect 606 | # is incompatible with its precondition. 607 | for (0..length($old_effect)-1) { 608 | substr($old_effect, $_, 1) = substr($old_rule->{$old_locus+1}{effects}, $_, 1) 609 | if substr($old_effect, $_, 1) eq '>'; 610 | substr($old_effect, $_, 1) = substr($old_rule->{$old_locus-1}{effects}, $_, 1) 611 | if substr($old_effect, $_, 1) eq '<'; 612 | } 613 | next if $FS->compatible($old_effect, $old_precondition); 614 | 615 | # We let how good this thing is as a condition to avoid depend on how many features have to be added. 616 | # (We just perform this subtraction on the old precondition, direcly.) 617 | my $rule1 = $self->deep_copy_indexed(); 618 | 619 | my $num_convergences = scalar grep substr($old_precondition, $_, 1) ne '.', 0..length($old_precondition)-1; 620 | for (0..length($old_precondition)-1) { 621 | substr($old_precondition, $_, 1) = '.' 622 | if substr($old_precondition, $_, 1) eq substr($outcome, $_, 1); 623 | } 624 | my $num_divergences = scalar grep substr($old_precondition, $_, 1) ne '.', 0..length($old_precondition)-1; 625 | $num_convergences -= $num_divergences; # num_convergences is for magic weights 626 | if ($num_divergences <= 0) { 627 | substr($rule1->{$locus}{condition}, 0, 1) = 'x'; # nothing is left to match! 628 | } elsif ($num_divergences <= 1) { 629 | $old_precondition =~ y/01/10/; 630 | $rule1->{$locus}{condition} = $FS->overwrite($rule1->{$locus}{condition}, $old_precondition); 631 | } else { 632 | $rule1->{$locus}{except} .= ' ' if defined $rule1->{$locus}{except}; 633 | $rule1->{$locus}{except} .= $old_precondition; 634 | } 635 | $resolution_keys{$global_res_count} = 636 | $num_convergences / ($num_divergences * ($num_divergences - 1) / 2 + 1); # much magic :/ 637 | $resolutions{$global_res_count++} = $rule1; 638 | } 639 | } 640 | } # locus 641 | 642 | # print STDERR YAML::Any::Dump(\%resolutions), "\n\n"; 643 | if (keys %resolutions) { 644 | my $i = weighted_one_of(%resolution_keys); 645 | return $resolutions{$i}; 646 | } 647 | } 648 | 649 | 650 | # To expand a rule tag: 651 | # - make all the resolutions, incl. related features, incl. loop-preserving and -breaking forms. 652 | # - make the rules (retaining the tag, for later remaking). When flipping a feature between 0 and 1, 653 | # clear features formerly requiring it. 654 | # - do the chance of extra conditions thing. 655 | # - repeat to make any necessary new rules for loopbreaks. 656 | 657 | # The format of rule tags is "$kind $list_index", where $kind is one of the values that 658 | # appear herein several times. 659 | 660 | sub generate { 661 | my ($tag, %args) = (shift, @_); 662 | my ($kind, $k, $rest) = split / /, $tag; 663 | my $FS = $args{phonology}{FS}; 664 | #print STDERR "[" . scalar @{$args{phonology}{phonology}} . "] tag is $tag\n"; # debug 665 | 666 | # Not doing assimilation rules (or strippings) since they can't much come out differently. 667 | my $threshold = 1; 668 | if ($kind eq 'default') { 669 | $threshold = $FS->{features}[$k]{default}[$rest]{value}; 670 | } elsif ($kind eq 'repair') { 671 | $threshold = $FS->{marked}[$k]{prob}; 672 | if (defined $FS->{marked}[$k]{contrast_probs}) { 673 | while (my ($cp, $prob) = each %{$FS->{marked}[$k]{contrast_probs}}) { 674 | my ($cp_pre, $cp_outcome) = split /; */, $cp; 675 | $cp_pre = $FS->parse($cp_pre); 676 | $cp_outcome = $FS->parse($cp_outcome); 677 | $cp_outcome = $FS->add_entailments($FS->overwrite($cp_pre, $cp_outcome)); 678 | if ($args{initial} and $args{phonology}->generatedly_contrast($cp_pre, $cp_outcome)) { 679 | $threshold = $prob; 680 | last; 681 | } 682 | } 683 | } 684 | } 685 | 686 | my $initial_threshold = $threshold; # e.g. for things which are more unlikely than marked, in a way that feature choice can't handle 687 | if ($kind eq 'repair' and $args{initial} and defined $FS->{marked}[$k]{initial_prob}) { 688 | $initial_threshold = $FS->{marked}[$k]{initial_prob}; 689 | } 690 | 691 | # Catch the default rules which are really repair rules. 692 | if ($kind eq 'default' and defined $FS->{features}[$k]{default}[$rest]{repair}) { 693 | $kind = 'repair_default'; 694 | $threshold = 1; 695 | $initial_threshold = $FS->{features}[$k]{default}[$rest]{repair}{prob}; 696 | } 697 | 698 | my $skip_me = 0; 699 | if (!$args{dont_skip} and ($kind eq 'repair' or $kind eq 'repair_default')) { 700 | $skip_me = (rand() > $initial_threshold); 701 | } 702 | my $add_a_condition = 0; 703 | if ($kind ne 'stripping') { 704 | $add_a_condition = (rand() < ($skip_me ? $threshold * 2/5.0 : 1/15.0)); # magic constants 705 | } 706 | return if $skip_me and !$add_a_condition; 707 | 708 | my (@resolutions, @weights); 709 | my $total_base_weight = 0; 710 | 711 | if ($kind eq 'default') { 712 | my $precondition = $FS->parse($FS->{features}[$k]{default}[$rest]{condition}); 713 | $precondition = $FS->overwrite($precondition, $FS->parse($FS->{features}[$k]{requires})) 714 | if defined $FS->{features}[$k]{requires}; 715 | substr($precondition, $k, 1) = 'u'; 716 | 717 | # Check for the case where defaults are different if a contrast exists. 718 | my $expected_value = $FS->{features}[$k]{default}[$rest]{value}; 719 | my ($precondition0, $precondition1) = ($precondition, $precondition); 720 | substr($precondition0, $k, 1) = '0'; substr($precondition1, $k, 1) = '1'; 721 | if (defined $FS->{features}[$k]{default}[$rest]{contrast_value} and 722 | $args{phonology}->generatedly_contrast($precondition0, $precondition1)) { 723 | $expected_value = $FS->{features}[$k]{default}[$rest]{contrast_value}; 724 | } 725 | 726 | for (0..1) { 727 | my $effects = '.' x @{$FS->{features}}; 728 | substr($effects, $k, 1) = $_; 729 | my $weight = $_ ? $expected_value : 1 - $expected_value; 730 | my $rule = { 731 | 0 => {condition => $precondition, effects => $effects}, 732 | recastability => 1 - $weight, 733 | }; 734 | bless $rule->{0}, 'PhoneSet'; 735 | # Default-provision rules shouldn't run where a stripping exists. 736 | for (@{$FS->{strippings}}) { 737 | if ($_->{strip} =~ /(^| )$FS->{features}[$k]{name}( |$)/) { 738 | $rule->{0}{except} = $FS->parse($_->{condition}); 739 | last; 740 | } 741 | } 742 | push @resolutions, $rule; 743 | push @weights, $weight; 744 | } 745 | } 746 | 747 | # this short-circuits a bunch of stuff. 748 | elsif ($kind eq 'stripping') { 749 | my $precondition = $FS->{strippings}[$k]{condition_parsed}; 750 | 751 | if (defined $rest) { 752 | my $s = $FS->{strippings}[$k]{substitute}[$rest]; 753 | $s =~ /^(.*) *: *(.*)$/; 754 | my $rule = { 755 | 0 => {condition => $FS->overwrite($precondition, $FS->parse($1)), effects => $FS->parse($2)}, 756 | recastability => 0, 757 | FS => $FS, 758 | }; 759 | return bless $rule; 760 | } 761 | 762 | my $effects = $FS->parse($FS->{strippings}[$k]{strip}); 763 | $effects =~ s/1/u/g; 764 | my $rule = { 765 | 0 => {condition => $precondition, effects => $effects}, 766 | recastability => 0, 767 | tag => $tag, 768 | FS => $FS, 769 | }; 770 | return bless $rule; 771 | } 772 | 773 | elsif ($kind =~ /^repair/) { 774 | my ($unsplit_d, $d); 775 | if ($kind eq 'repair_default') { 776 | $d = $unsplit_d = $FS->{features}[$k]{default}[$rest]{repair}; 777 | 778 | } else { 779 | $unsplit_d = $FS->{marked}[$k]; 780 | if ($kind =~ /_split$/) { 781 | $d = $unsplit_d->{split}[$rest]; 782 | } else { 783 | $d = $unsplit_d; 784 | } 785 | } 786 | 787 | my $base_rule = PhoneSet::parse($d, 1, FS => $FS, 788 | base => defined $args{unsplit_rule} ? $args{unsplit_rule}->deep_copy_indexed() : undef); 789 | bless $base_rule; 790 | $base_rule->{recastability} = 1 - $d->{prob}; 791 | $base_rule->{tag} = $tag; 792 | $base_rule->{cede} = 1 - $threshold; 793 | $base_rule->{FS} = $FS; 794 | 795 | if (defined $d->{filter}) { 796 | $base_rule->{filter} = PhoneSet::parse($d->{filter}, 0, FS => $FS); 797 | } 798 | 799 | # Invalidate previous resolutions if directed. This is intended to be used for cases 800 | # where typical epenthetic segments differ from typical phonemic segments. 801 | if ($kind eq 'repair_default' and defined $FS->{features}[$k]{default}[$rest]{inactivate_previous}) { 802 | for my $i (0..$#{$args{phonology}{phonology}}) { 803 | if ($args{phonology}{phonology}[$i]{tag} =~ /^default $k\W/) { 804 | my $extant_cd = $args{phonology}{phonology}[$i]{0}{condition}; 805 | $extant_cd =~ y/u/./; 806 | my $new_cd = $base_rule->{0}{condition}; 807 | $new_cd =~ y/u/./; 808 | push @{$base_rule->{inactivate}}, $i if $extant_cd =~ /^$new_cd$/; 809 | } 810 | } 811 | $threshold = 1; 812 | } 813 | 814 | my @unsplit_phones = map $FS->parse($_), split /, */, $unsplit_d->{condition}, -1; 815 | 816 | # Do choice of direction outside of the resolution selection for now, for laziness. 817 | if (defined $d->{direction}) { 818 | my $direction = weighted_one_of %{$d->{direction}}; 819 | if ($direction eq 'reverse') { 820 | $base_rule->{reverse} = 1; 821 | } elsif ($direction eq 'both') { 822 | $base_rule->{bidirectional} = 1; 823 | } 824 | } 825 | 826 | # {resolve} is a weight-hash of possible resolutions, whose keys are of the form "$operation $argument". 827 | # 828 | # If the resolution part isn't written, we will resolve phone 0 freely. 829 | # (This is intended for marked single phoneme rules. In particular, the last-resort deletion 830 | # that these rules once had is now no more.) 831 | my %resolutions; 832 | %resolutions = %{$d->{resolve}} if defined $d->{resolve}; 833 | $resolutions{'free 0'} = 1 unless keys %resolutions; 834 | 835 | my @resolution_keys = sort keys %resolutions; 836 | for (@resolution_keys) { 837 | my $weight = $resolutions{$_}; 838 | /^([^ ]*) +(.*)$/; 839 | my ($reskind, $arg) = ($1, $2); 840 | 841 | my $rule = $base_rule->deep_copy_indexed(); 842 | my @variants = (); # where to put the generated rules 843 | 844 | # resolve as specified 845 | if ($reskind eq 'r') { 846 | my @effects_strings = split /, +/, $arg; 847 | my %effects = (); 848 | for (@effects_strings) { 849 | /^(.*) +(-?[0-9]*)$/; 850 | my ($effect, $target) = ($1, $2); 851 | my $parsed_effect = $FS->parse($effect); 852 | 853 | for (0..length($effects{$target})-1) { 854 | if (substr($parsed_effect, $_, 1) =~ /[{}]/) { 855 | my $restriction = rand(2.0 + 4.0/(1-$threshold)); # 4 is a magic factor 856 | if ($restriction < 2.0) { 857 | substr($rule->{$target}{condition}, $_, 1) = int($restriction) 858 | if substr($rule->{$target}{condition}, $_, 1) eq 'u'; 859 | } 860 | } 861 | } 862 | $parsed_effect =~ y/{}/<>/; 863 | 864 | # In case of assimilation, both the things being spread from and to need to support the feature, 865 | # unless assimilating or setting that feature as well assures that this is unnecessary. 866 | for (0..length($parsed_effect)-1) { 867 | if (substr($parsed_effect, $_, 1) ne '.') { 868 | my $requirements = $FS->parse($FS->{features}[$_]{requires}); 869 | for my $i (0..length($requirements)-1) { 870 | substr($requirements, $i, 1) = '.' 871 | if substr($parsed_effect, $_, 1) =~ /[<>]/ 872 | and substr($parsed_effect, $i, 1) eq substr($parsed_effect, $_, 1); 873 | substr($requirements, $i, 1) = '.' if substr($parsed_effect, $i, 1) eq substr($requirements, $i, 1); 874 | } 875 | $rule->{$target}{condition} = '.' x @{$FS->{features}} unless defined $rule->{$target}{condition}; 876 | $rule->{$target}{condition} = $FS->intersect($rule->{$target}{condition}, $requirements); 877 | if (substr($parsed_effect, $_, 1) =~ /[<>]/) { 878 | my $source = (substr($parsed_effect, $_, 1) eq '>') ? $target + 1 : $target - 1; 879 | $rule->{$source}{condition} = $FS->intersect($rule->{$source}{condition}, $requirements); 880 | } 881 | } 882 | } 883 | 884 | $effects{$target} = $parsed_effect; 885 | } 886 | 887 | # But not if it's stripped off. 888 | for my $str (@{$FS->{strippings}}) { 889 | for my $displ ($rule->indices('condition')) { 890 | if ($rule->{$displ}{condition} =~ /^$str->{condition_parsed}$/) { 891 | my $effect = $FS->parse($str->{strip}); 892 | $effect =~ s/1/a/g; # temporary char 893 | $rule->{$displ}{condition} = $FS->overwrite($rule->{$displ}{condition}, $effect); 894 | $rule->{$displ}{condition} =~ s/a/./g; 895 | } 896 | } 897 | } 898 | 899 | $rule->{$_}{effects} = $effects{$_} for keys %effects; 900 | push @variants, $rule->persistence_variants(1, $args{phonology}, $threshold, 901 | 0, $args{generable_val}); 902 | } #r 903 | 904 | elsif ($reskind eq 'delete') { 905 | $rule->{$arg}{deletions} = 1; 906 | push @variants, [$rule, 1]; 907 | } #delete 908 | 909 | # Resolve the named phone in the ways listed in {flip} and {related_weight}. 910 | # In {flip} is a hash of single features to be flipped, with multiplicative weights; 911 | # in {related_weight} is a hash of multiplicative weights applying to resolutions via related features. 912 | # For essentially historical reasons, {flip} and {related_weight} belong to the whole constraint, 913 | # not the resolution. If entries in {flip} or keys in {related_weight} are followed by 914 | # a number, they apply only to the phone of that index, else they apply to all phones. 915 | elsif ($reskind eq 'free') { 916 | my $resolvend = $base_rule->{$arg}{condition}; 917 | my $reqd = $FS->add_requirements($resolvend); 918 | my $i = 0; 919 | my $resolution_type = 0; 920 | RESOLUTION_TYPE: while ($resolution_type <= 1) { 921 | my $effects; 922 | my $base_weight = 0; 923 | my $no_persist = 0; 924 | $no_persist = 1 if defined $d->{phonemic_only}; 925 | 926 | my $rule = $base_rule->deep_copy_indexed(); 927 | 928 | if ($resolution_type == 0) { 929 | $i = 0, $resolution_type++, next if $i >= length($resolvend); 930 | unless (defined $d->{flip}{$FS->{features}[$i]{name}}) { 931 | $i++, redo if substr($unsplit_phones[$arg], $i, 1) !~ /[01]/; # only flip actual things in the *base* situation 932 | $i++, redo if defined $FS->{features}[$i]{structural}; 933 | } 934 | $effects = '.' x length($resolvend); 935 | substr($effects, $i, 1) = (substr($reqd, $i, 1) eq '1' ? '0' : '1'); 936 | # don't turn univalents on (unless specially allowed) 937 | if (substr($reqd, $i, 1) eq '0' and defined $FS->{features}[$i]{univalent}) { 938 | $i++, redo if !defined $d->{flip}{$FS->{features}[$i]{name}}; 939 | $effects = $FS->overwrite($effects, $FS->parse($d->{univalent_addition})); 940 | # this still needs to have multiple phones enabled on it 941 | } 942 | # Weights for flipping individual features: given in {flip}. 943 | $base_weight = (defined $d->{flip}{$FS->{features}[$i]{name} . " $arg"} ? 944 | $d->{flip}{$FS->{features}[$i]{name} . " $arg"} : 945 | (defined $d->{flip}{$FS->{features}[$i]{name}} ? 946 | $d->{flip}{$FS->{features}[$i]{name}} : 1)); 947 | } 948 | 949 | elsif ($resolution_type == 1) { 950 | $i = 0, $resolution_type++, next if $i >= @{$FS->{relations}}; 951 | # just bail if we're in a stripping condition. --- why did I do this? 952 | for my $str (@{$FS->{strippings}}) { 953 | my $strip_condition = $str->{condition_parsed}; 954 | $i = 0, $resolution_type++, next RESOLUTION_TYPE if $resolvend =~ /^$strip_condition$/; 955 | } 956 | 957 | $i++, redo if defined $FS->{relations}[$i]{spread_only}; 958 | 959 | my $from = $FS->parse($FS->{relations}[$i]{from}); 960 | $i++, redo if $resolvend !~ /^$from$/; 961 | $effects = $FS->add_requirements($FS->parse($FS->{relations}[$i]{to})); 962 | if ($FS->compatible($FS->add_entailments($effects), $resolvend)) { 963 | # This is the place where we get the first word. That's problematic. 964 | $FS->{relations}[$i]{from} =~ /^([^ ]*)/; 965 | $_ = $FS->parse($1); 966 | y/01/10/; 967 | $effects = $FS->overwrite($effects, $FS->add_requirements($_)); 968 | } 969 | # Weights for doing any complicated feature change: given in {relate_weight}, 970 | # which apply to anything they match. 971 | $base_weight = $FS->{relations}[$i]{weight}; 972 | if(defined $d->{related_weight}) { 973 | for my $outcome (keys %{$d->{related_weight}}) { 974 | if ($outcome =~ /^(.*) (-?[0-9]*)$/) { 975 | next unless $arg == $2; 976 | $outcome = $1; 977 | } 978 | my $f = $FS->parse($outcome); 979 | $base_weight *= $d->{related_weight}{$outcome} if $effects =~ /^$f$/; 980 | } 981 | } 982 | } 983 | 984 | $total_base_weight += $base_weight; 985 | 986 | $rule->{$arg}{effects} = $effects; 987 | # This base_weight is used to fill out recastability, below. 988 | $rule->{base_weight} = $base_weight; 989 | 990 | my $persistence_weight = defined $d->{persist} ? $d->{persist} : $threshold; 991 | push @variants, $rule->persistence_variants($base_weight, $args{phonology}, $persistence_weight, 992 | $no_persist, $args{generable_val}); 993 | for (@variants) { 994 | push @resolutions, $_->[0]; 995 | push @weights, $_->[1]; 996 | } 997 | $i++; 998 | } # resolution type 999 | } # free 1000 | 1001 | my $total_weight = 0; 1002 | $total_weight += $_->[1] for @variants; 1003 | for (@variants) { 1004 | push @resolutions, $_->[0]; 1005 | push @weights, $_->[1] * $weight / $total_weight; 1006 | } 1007 | } # @resolution_keys 1008 | 1009 | # Record which split resolutions we will need to do. 1010 | # Recursive splits don't in fact work, as this is currently implemented. 1011 | if (defined $d->{split}) { 1012 | for my $rule (@resolutions) { 1013 | for my $i (0..$#{$d->{split}}) { 1014 | next unless rand() < $d->{split}[$i]{prob}; 1015 | if (defined $d->{split}[$i]{if}) { 1016 | $d->{split}[$i]{if} =~ /^(.*) +(-?[0-9]*)$/; 1017 | my ($condition, $target) = ($1, $2); 1018 | $condition = $FS->parse($condition); 1019 | next unless $rule->{$target}{effects} =~ /^$condition$/; 1020 | } 1021 | if (defined $d->{split}[$i]{unless_deletion}) { # code duplication... 1022 | next if $rule->{$d->{split}[$i]{unless_deletion}}{deletions}; 1023 | } 1024 | next if grep $_ eq "${kind}_split $k $i", @{$rule->{splits}}; #kluge? 1025 | push @{$rule->{splits}}, "${kind}_split $k $i"; 1026 | } 1027 | } 1028 | } 1029 | } # repair 1030 | 1031 | else { 1032 | warn "unknown rule tag: $tag"; 1033 | return; 1034 | } 1035 | 1036 | my $selected_rule; 1037 | RESOLVE: { 1038 | $selected_rule = undef; 1039 | my $total_weight = 0; 1040 | $total_weight += $_ for @weights; 1041 | my $w = rand $total_weight; 1042 | my $j; 1043 | for (0..$#weights) { 1044 | $j = $_, $selected_rule = $resolutions[$_], last if (($w -= $weights[$_]) < 0); 1045 | } 1046 | 1047 | return unless $selected_rule; 1048 | bless $selected_rule; 1049 | 1050 | # Decorate the selected resolution by clearing features that now lack their requirements. 1051 | # Do antithetical features. 1052 | for my $displ ($selected_rule->indices('effects')) { 1053 | $selected_rule->{$displ}{effects} = $FS->add_entailments($selected_rule->{$displ}{effects}); 1054 | 1055 | # If this resolution is to be avoided, try again. 1056 | for my $avoid (@{$args{avoid}}) { 1057 | if ($selected_rule->{$displ}{effects} eq $avoid) { 1058 | splice @resolutions, $j, 1; 1059 | splice @weights, $j, 1; 1060 | redo RESOLVE; 1061 | } 1062 | } 1063 | } # $displ 1064 | } # RESOLVE 1065 | 1066 | $selected_rule->{FS} = $FS; 1067 | 1068 | # Adorn the rule with extra conditions, if we decided to before. 1069 | if ($add_a_condition) { 1070 | $selected_rule->gen_extra_condition(%args); # {phonology} is passed through 1071 | } 1072 | 1073 | # If any of the preconditions of this rule are not generable by anything coming before, 1074 | # and it's a one-time rule, it's never triggerable; just drop it and don't write it down. 1075 | for my $displ ($selected_rule->indices('condition')) { 1076 | for my $i (0..@{$FS->{features}}-1) { 1077 | return if substr($selected_rule->{$displ}{condition}, $i, 1) =~ /[01]/ 1078 | and !defined($args{generable_val}[substr($selected_rule->{$displ}{condition}, $i, 1)][$i]) 1079 | and defined $selected_rule->{inactive}; 1080 | } 1081 | } 1082 | 1083 | # Heed forcibly_unmark: take out changes setting a certain feature 1084 | # to a value other than undefined, except on default rules. 1085 | # Moreover, if a rule changes a feature on which the defaults of that feature continge, 1086 | # explicitly force it back to undefined. 1087 | # (This is for before start_sequences, and is meant to be a mechanism by which e.g. 1088 | # we can avoid the stupid /n_a/ : /n_m/ contrasts.) 1089 | # 1090 | # Get rid of effectses if we can. 1091 | if (defined $args{forcibly_unmark}) { 1092 | for my $i (keys %{$args{forcibly_unmark}}) { 1093 | for my $displ ($selected_rule->indices('effects')) { 1094 | if ($kind ne 'default' and $kind ne 'stripping') { 1095 | substr($selected_rule->{$displ}{effects}, $i, 1) = '.' 1096 | if substr($selected_rule->{$displ}{effects}, $i, 1) =~ /[01]/; 1097 | delete $selected_rule->{$displ}{effects}, next unless $selected_rule->{$displ}{effects} =~ /[^.]/; 1098 | } 1099 | for (@{$args{forcibly_unmark}{$i}}) { 1100 | substr($selected_rule->{$displ}{effects}, $i, 1) = 'u', last 1101 | if substr($selected_rule->{$displ}{effects}, $_, 1) ne '.'; 1102 | } 1103 | } 1104 | } 1105 | } 1106 | 1107 | # Abandon this rule if it does nothing now. 1108 | # TODO: update these tests for rules that do nothing as needed 1109 | return unless scalar $selected_rule->indices('effects') or scalar $selected_rule->indices('deletions'); 1110 | 1111 | # Reverse this rule if it needs that. 1112 | if ($selected_rule->{reverse}) { 1113 | delete $selected_rule->{reverse}; 1114 | $selected_rule = $selected_rule->reverse_rule(); 1115 | } 1116 | 1117 | # Choose an application direction. 1118 | $selected_rule->{direction} = rand(2) >= 1.0 ? -1 : 1; 1119 | 1120 | # It's correct for extra condition rules to have no tag, so that they 1121 | # just drop out when regenerated. 1122 | $selected_rule->{tag} = $tag unless $add_a_condition; 1123 | $selected_rule->{run_again} = 1 if ($add_a_condition and !$skip_me); 1124 | if (defined $selected_rule->{base_weight}) { 1125 | $selected_rule->{recastability} = (1 - $selected_rule->{base_weight} / $total_base_weight); 1126 | $selected_rule->{recastability} = 0 if $selected_rule->{recastability} < 0; 1127 | delete $selected_rule->{base_weight}; 1128 | } 1129 | 1130 | $selected_rule; 1131 | } 1132 | 1133 | 1134 | 1; 1135 | -------------------------------------------------------------------------------- /phonology/Phonology.pm: -------------------------------------------------------------------------------- 1 | package Phonology; 2 | use strict; 3 | use constant INF => 9**9**9; # is there really nothing sensible better? 4 | 5 | our $debug_alphabet; # used for printing phones for debugging only 6 | our $verbose = 0; 7 | our $debug = 0; 8 | our $noprune = 0; 9 | 10 | # Go from a prototypical prob to an actual one. Now twice as gentle! 11 | sub fuzz { 12 | my $p = shift; 13 | return 0 if $p <= 0; 14 | return 1 if $p >= 1; 15 | my $q = rand($p / (1 - $p)) + rand($p / (1 - $p)); 16 | return $q / ($q + rand(1) + rand(1)); 17 | } 18 | 19 | # Box-Muller. I wonder whether this is faster than sum of 12 uniforms. 20 | use constant TWOPI => 6.28318530717958647688; 21 | sub std_normal { 22 | return sqrt(-2*log rand(1)) * cos(rand(TWOPI)); 23 | } 24 | 25 | # To save space, we currently trim the feature system out of a phonology before saving it. 26 | sub load_file { 27 | my ($infile, $FS) = (shift, shift); 28 | my $pd = YAML::Any::LoadFile($infile); 29 | $pd->{FS} = $FS; 30 | bless $pd, 'Phonology'; 31 | for my $rule (@{$pd->{phonology}}) { 32 | $rule->{FS} = $FS; 33 | bless $rule, 'PhonologicalRule'; 34 | } 35 | bless $pd; 36 | } 37 | 38 | sub dump_file { 39 | my ($self, $outfile, $annotate) = (shift, shift, shift); 40 | 41 | my $FS = $self->{FS}; 42 | my $pd = YAML::Any::Load(YAML::Any::Dump($self)); # kluge deep copy 43 | delete $pd->{FS}; 44 | for my $rule (@{$pd->{phonology}}) { 45 | delete $rule->{FS}; 46 | } 47 | if ($annotate) { 48 | for my $rule (@{$pd->{phonology}}) { 49 | for my $displ ($rule->indices('condition')) { 50 | $rule->{$displ}{condition_humane} = $FS->feature_string($rule->{$displ}{condition}, 1); 51 | } 52 | for my $displ ($rule->indices('effects')) { 53 | $rule->{$displ}{effects_humane} = join ' | ', map $FS->feature_string($_, 1), split / /, $rule->{$displ}{effects}; 54 | } 55 | } 56 | $pd->{phonology}[$_]{number} = $_ for 0..@{$pd->{phonology}}-1; 57 | } 58 | YAML::Any::DumpFile($outfile, $pd); 59 | } 60 | 61 | # Return annotations regarding which rules have which preconditions or excepts. 62 | # Used to optimise which rules we consider rerunning in running the phonology. 63 | # The resulting array is indexed as {$value}[$feature], where $value is '0' or '1' or 'u', 64 | # We also use {seq} for those rules whose preconditions include a sequence; 65 | # these are those which can be newly triggered after a deletion. 66 | 67 | sub annotate_with_preconditions { 68 | my $self = shift; 69 | my $FS = $self->{FS}; 70 | my %which; 71 | for my $i (0..@{$self->{phonology}}-1) { 72 | my $rule = $self->{phonology}[$i]; 73 | # Strippings need to be special-cased: the features they strip out shouldn't be allowed 74 | # to be turned on. 75 | if (defined $rule->{tag} and $rule->{tag} =~ /^stripping/) { 76 | for my $displ ($rule->indices('effects')) { 77 | for my $j (0..@{$FS->{features}}) { 78 | if (substr($rule->{$displ}{effects}, $j, 1) eq 'u') { 79 | push @{$which{0}[$j]}, $i; 80 | push @{$which{1}[$j]}, $i; 81 | } 82 | } 83 | } 84 | } 85 | 86 | for my $displ ($rule->indices('condition')) { 87 | for my $j (0..@{$FS->{features}}) { 88 | push @{$which{substr($rule->{$displ}{condition}, $j, 1)}[$j]}, $i 89 | if substr($rule->{$displ}{condition}, $j, 1) =~ /[01]/; 90 | # Doing undefined features is unnecessary, given the restricted circumstances 91 | # in which we set features undefined. 92 | # Not so! We now use this for forcibly_unmark. 93 | push @{$which{u}[$j]}, $i 94 | if substr($rule->{$displ}{condition}, $j, 1) eq 'u'; 95 | } 96 | } 97 | 98 | # Assimilations (to a feature undefined in the target) can be triggered by any change. 99 | for my $displ ($rule->indices('effects')) { 100 | for my $j (0..@{$FS->{features}}) { 101 | if ((substr($rule->{$displ}{effects}, $j, 1) eq '<' and 102 | substr($rule->{$displ-1}{condition}, $j, 1) eq '.') or 103 | (substr($rule->{$displ}{effects}, $j, 1) eq '>' and 104 | substr($rule->{$displ+1}{condition}, $j, 1) eq '.')) { 105 | push @{$which{0}[$j]}, $i; 106 | push @{$which{1}[$j]}, $i; 107 | } 108 | } 109 | } 110 | 111 | # Again for excepts. 112 | for my $displ ($rule->indices('except')) { 113 | my @exceptions = split / /, $rule->{$displ}{except}; 114 | for my $phone (@exceptions) { 115 | for my $j (0..@{$FS->{features}}) { 116 | if (substr($rule->{$displ}{except}, $j, 1) =~ /[01]/) { 117 | push @{$which{1-substr($rule->{$displ}{except}, $j, 1)}[$j]}, $i; 118 | push @{$which{'u'}[$j]}, $i; 119 | } 120 | } 121 | } 122 | } 123 | push @{$which{seq}}, $i if scalar($rule->indices('condition')) >= 2; 124 | } 125 | 126 | $self->{which_preconditions} = \%which; 127 | } 128 | 129 | # Drop, from a completed phonology, rules that go inactive too early to ever run. 130 | 131 | sub trim_inactive { 132 | my $self = shift; 133 | my @new_indices; 134 | my $deleted = 0; 135 | 136 | for (my $i = 0; $i < $self->{start_sequences}-$deleted; $i++) { 137 | if (defined $self->{phonology}[$i]{inactive} and $self->{phonology}[$i]{inactive} <= $self->{start_sequences}) { 138 | splice @{$self->{phonology}}, $i, 1; 139 | $i--; 140 | $deleted++; 141 | } 142 | push @new_indices, $i; 143 | } 144 | # I don't suppose it actually matters when a rule is inactivated if that time is before it runs. 145 | for my $rule (@{$self->{phonology}}) { 146 | $rule->{inactive} = $rule->{inactive} >= $self->{start_sequences} 147 | ? $rule->{inactive} - $deleted 148 | : $new_indices[$rule->{inactive}] 149 | if defined $rule->{inactive}; 150 | } 151 | $self->{start_sequences} -= $deleted; 152 | } 153 | 154 | # Persistent rules implement so-called surface filters. 155 | 156 | # Persistence is the default state of affairs for a non-generator rule. 157 | # The {inactive} property on a rule is a rule number N, at which point this one 158 | # becomes inactive (it won't run as a resolution when the current rule is >= N). 159 | # A rule can also inactivate itself (they commonly do); 160 | # these still run once. 161 | 162 | # If passed a list in sources, it will overwrite it with a list of 163 | # positions of source phones of the phones in the result. 164 | # -1 is used for epenthesis, and for the smaller fragment in breakings. 165 | # (It seems a bad idea to use the same source label twice.) 166 | 167 | # Regardless of persistence, always run a _single_ rule repeatedly until 168 | # it makes no more changes. This way things like assimilation will work 169 | # across groups of more than two phones. It also means we must disallow 170 | # certain rule types (e.g. a single rule to achieve l...l, r...r > l...r). 171 | # TODO: whether LtR or RtL needs to be an option here. 172 | 173 | # Only do rules start..end-1, if these are provided. 174 | # If passed cleanup => $i, don't even do a single rule but rather just 175 | # fix up the word using persistent rules in force after rule $i runs. 176 | 177 | # If passed a list in track_expiry, it puts the expiry time of the derivation 178 | # in the first element (with the same semantics as inactive, i.e. it expires 179 | # just before the given rule number). 180 | 181 | use constant STEPS_TO_LOOP => 10; 182 | use constant STEPS_TO_DIE => 30; 183 | 184 | sub run { 185 | my ($self, $word, %args) = (shift, shift, @_); 186 | my $phonology = $self->{phonology}; 187 | 188 | my $start = defined $args{start} ? $args{start} : 0; 189 | my $end = defined $args{end} ? $args{end} : @$phonology; 190 | ($start, $end) = ($args{cleanup}, $args{cleanup}+1) if defined $args{cleanup}; 191 | my $first_time = 1; 192 | @{$args{sources}} = 0..@$word-1 if (defined $args{sources} and !@{$args{sources}}); 193 | my $track_expiry; 194 | $track_expiry = INF if defined $args{track_expiry}; 195 | 196 | my @loop_rules; 197 | my @loop_cessions; 198 | my %ceders; 199 | print STDERR "@$word (initially)\n" if $debug >= 1; 200 | for my $k ($start..$end-1) { 201 | my %agenda = ($k => 1); 202 | my $iterations = 0; 203 | while (keys %agenda) { 204 | my %new_agenda; 205 | for my $i (sort {$a <=> $b} keys %agenda) { 206 | next if $i > $k; 207 | # if this is the first time through, let rules run even if they've marked themselves inactive 208 | next if $iterations and (defined $phonology->[$i]{inactive} and $k >= $phonology->[$i]{inactive}); 209 | next if ($iterations >= STEPS_TO_LOOP) and defined $ceders{$i}; 210 | 211 | my @changes; 212 | if (($first_time and defined $args{cleanup}) or 213 | $phonology->[$i]->run($word, %args, changes => \@changes)) { 214 | my $length_after_one = @$word; 215 | if (scalar($phonology->[$i]->indices()) > 1) { # an optimization. helpful? 216 | while ($phonology->[$i]->run($word, %args)) { #gdgd 217 | $length_after_one = INF, warn "possible epenthesis loop, rule $i" if @$word >= $length_after_one + 8; 218 | } 219 | } 220 | print STDERR "@$word (after $i)\n" if $debug >= 1; 221 | 222 | @changes = @{$args{change_record}} if ($first_time and defined $args{change_record}); 223 | $first_time = undef; 224 | $track_expiry = $phonology->[$i]{inactive} if defined $track_expiry and 225 | $phonology->[$i]{inactive} < $track_expiry; 226 | 227 | # This is vile. We should not be negotiating with terrorists loops. 228 | # At least it's a respectable algorithm. 229 | if ($iterations >= STEPS_TO_LOOP and defined $phonology->[$i]{cede}) { 230 | while (@loop_rules and 231 | (($loop_cessions[-1] < $phonology->[$i]{cede}) or 232 | ($loop_cessions[-1] == $phonology->[$i]{cede}) and ($loop_rules[-1] < $i))) { 233 | pop @loop_rules; 234 | pop @loop_cessions; 235 | } 236 | $ceders{$i} = 1 if (@loop_rules and $loop_rules[-1] == $i); 237 | push @loop_rules, $i; 238 | push @loop_cessions, $phonology->[$i]{cede}; 239 | } 240 | if (defined $self->{which_preconditions}) { 241 | # We might need to rerun the rules which have as a precondition a feature 242 | # this rule has newly acquired. 243 | my @new_agenda; 244 | for my $change (@changes) { 245 | if ($change =~ /^c (.*) (.*)$/) { 246 | push @new_agenda, @{$self->{which_preconditions}{$1}[$2]} 247 | if defined $self->{which_preconditions}{$1}[$2]; 248 | } 249 | elsif ($change eq 'd') { 250 | push @new_agenda, @{$self->{which_preconditions}{seq}} if defined $self->{which_preconditions}{seq}; 251 | } 252 | } 253 | %new_agenda = (%new_agenda, map(($_ => 1), @new_agenda)); 254 | } else { 255 | %new_agenda = map(($_ => 1), (0..$k)); 256 | } 257 | } 258 | 259 | # stick the ordinal on to label the context dependency, if it's being looked out for 260 | if (defined $args{context_dependent} and defined $args{context_dependent}{''}) { 261 | $args{context_dependent}{$i} = $args{context_dependent}{''}; # Does this work without being a multimap? 262 | delete $args{context_dependent}{''}; 263 | } 264 | } 265 | %agenda = %new_agenda; 266 | # if this rule is supposed to run with the next as a block, don't check previous rules now 267 | %agenda = () if $phonology->[$k]{inseparable}; 268 | 269 | # fwiw I saw this tripped wrongly once when the bound was 8. 270 | (print STDERR "*** unceded loop!\n"), last if (++$iterations >= STEPS_TO_DIE); 271 | } # while (keys %agenda) 272 | } 273 | 274 | if (defined $args{generator}) { 275 | $_ = $self->{FS}->add_entailments($_) for @$word; 276 | } 277 | $args{track_expiry}[0] = $track_expiry if defined $track_expiry; 278 | } 279 | 280 | 281 | 282 | # Is there a generated contrast between phones matching $phone0 and $phone1? 283 | sub generatedly_contrast { 284 | my ($self, $phone0, $phone1) = (shift, shift, shift); 285 | 286 | # If the contrast isn't in just one feature, default to false for now. 287 | my $f = -1; 288 | for(my $i = 0; $i < scalar @{$self->{FS}{features}}; ++$i) { 289 | if (substr($phone0, $i, 1) ne substr($phone1, $i, 1) and substr($phone0, $i, 1) ne '.' and substr($phone1, $i, 1) ne '.') { 290 | if ($f == -1) { 291 | $f = $i; 292 | } else { 293 | warn "can't test for a contrast on generation"; 294 | return 0; 295 | } 296 | } 297 | } 298 | return 1 if ($f == -1); 299 | 300 | my $base_phone = $phone0; 301 | substr($base_phone, $f, 1) = '.'; 302 | $base_phone = $self->{FS}->add_requirements($base_phone); 303 | for my $rule (@{$self->{phone_generator}}) { 304 | if (substr($rule->{0}{condition}, $f, 1) eq 'u') { # safer, because of antitheticals 305 | return 1 if $self->{FS}->compatible($base_phone, $rule->{0}{condition}); 306 | } 307 | } 308 | return 0; 309 | } 310 | 311 | # Generates a new rule with tag $tag, and appends it to the phonology, 312 | # making the other changes that this may entail. 313 | sub generate_new_rule { 314 | my ($self, $tag, %args) = (shift, shift, @_); 315 | my $phonology = $self->{phonology}; 316 | 317 | # Strippings and defaults are clusters of rules to be generated all at once. 318 | # Other tags, just generate once. 319 | my @tag_suffixes = (''); 320 | if ($tag =~ /^default /) { 321 | my ($kind, $k) = split / /, $tag; 322 | @tag_suffixes = map " $_", 0..@{$self->{FS}{features}[$k]{default}}-1; 323 | } elsif ($tag =~ /^stripping /) { 324 | my ($kind, $k) = split / /, $tag; 325 | @tag_suffixes = map " $_", 0..@{$self->{FS}{strippings}[$k]{substitute}}-1; 326 | push @tag_suffixes, ''; 327 | } 328 | 329 | for my $tag_suffix (@tag_suffixes) { 330 | my $rule = PhonologicalRule::generate($tag.$tag_suffix, %args, phonology => $self); 331 | next unless $rule; 332 | 333 | if ($tag =~ /^stripping /) { 334 | push @$phonology, $rule; 335 | next; 336 | } 337 | 338 | for (@{$rule->{inactivate}}) { 339 | $phonology->[$_]{inactive} = scalar @$phonology unless defined $phonology->[$_]{inactive} 340 | and $phonology->[$_]{inactive} < scalar @$phonology; 341 | } 342 | delete $rule->{inactivate}; 343 | 344 | if (defined $rule->{splits}) { 345 | my @splits = @{$rule->{splits}}; 346 | delete $rule->{splits}; 347 | delete $rule->{run_again}; 348 | $self->generate_new_rule($_, %args, unsplit_rule => $rule, dont_skip => 1) for @splits; 349 | } 350 | 351 | # Since extra conditions added to this rule may have come out the same way, delete redundant ones, 352 | # i.e. rules made since invoking this function whose conditions are narrower than 353 | # the one about to be inserted, and which have the same effect, unless they're persistent 354 | # and we're not. 355 | # (If we cut a few too many things, though, not the end of the world.) 356 | # Only do this to immediately preceding ones, since we might get A in a doubly-special case, 357 | # B in a special case, A in the general case. 358 | my $former_length = @$phonology; 359 | DROP_REDUNDANT: while(1) { 360 | my $rule1 = $phonology->[-1]; 361 | for my $displ ($rule->indices('condition')) { 362 | last DROP_REDUNDANT unless defined $rule1->{$displ}{condition} 363 | and $rule1->{$displ}{condition} =~ /^$rule->{$displ}{condition}$/; 364 | } 365 | last DROP_REDUNDANT unless $rule1->indices('effects') == $rule->indices('effects'); 366 | for my $displ ($rule->indices('effects')) { 367 | last DROP_REDUNDANT unless defined $rule1->{$displ}{effects} 368 | and $rule1->{$displ}{effects} eq $rule->{$displ}{effects}; 369 | } 370 | last DROP_REDUNDANT if scalar $rule->indices('deletions') == 0; 371 | last DROP_REDUNDANT unless scalar $rule1->indices('deletions') == scalar $rule->indices('deletions'); 372 | for my $displ ($rule1->indices('deletions')) { 373 | last DROP_REDUNDANT unless grep $_ == $displ, $rule->indices('deletions'); 374 | } 375 | last DROP_REDUNDANT if !defined $rule1->{inactive} and defined $rule->{inactive}; 376 | #print STDERR YAML::Any::Dump($rule1) . "redounds with\n" . YAML::Any::Dump($rule) . "\n"; # debug 377 | pop @$phonology; 378 | } 379 | if (@$phonology < $former_length) { 380 | for my $rule1 (@$phonology) { 381 | $rule1->{inactive} = @$phonology if $rule1->{inactive} > @$phonology; 382 | } 383 | } 384 | 385 | # This must be done after all mucking about with deleting rules. 386 | for my $displ ($rule->indices('effects')) { 387 | for my $i (0..@{$self->{FS}{features}}-1) { 388 | if (substr($rule->{$displ}{effects}, $i, 1) =~ /[01]/) { 389 | push @{$args{generable_val}[substr($rule->{$displ}{effects}, $i, 1)][$i]}, scalar @$phonology; 390 | } elsif (substr($rule->{$displ}{effects}, $i, 1) eq '<') { 391 | 392 | } elsif (substr($rule->{$displ}{effects}, $i, 1) eq '>') { 393 | 394 | } 395 | } 396 | } 397 | 398 | push @$phonology, $rule; 399 | 400 | # Recurse to replace any other rule which we deactivated; make sure these don't resolve 401 | # the same as the bad rule. 402 | # (Incidentally, we couldn't recurse before the push; it would break rule referencing by number.) 403 | # 404 | # Watch out: this mechanism will cause infinite recursions if there are two conflicting rules of prob 1 (=> recastability 0). 405 | for my $bt (@{$rule->{broken_tags}}) { 406 | $bt =~ /^(.*) ([01u.]*)$/; 407 | my ($tag, $avoid) = ($1, $2); 408 | my %otherargs = %args; 409 | delete $otherargs{avoid}; 410 | delete $otherargs{dont_skip}; 411 | delete $otherargs{unsplit_rule}; 412 | #print "{\n"; # debug 413 | $args{avoid} = [] if !defined $args{avoid}; 414 | $self->generate_new_rule($tag, avoid => [(split /\|/, $avoid), @{$args{avoid}}], %otherargs); 415 | #print "}\n"; # debug 416 | } 417 | delete $rule->{broken_tags}; 418 | 419 | # If this is the added-condition version of a rule which we also wanted to generate unadorned, 420 | # recurse to do the unadorned form. 421 | if (defined $rule->{run_again}) { 422 | delete $rule->{run_again}; 423 | $self->generate_new_rule($tag, %args, dont_skip => 1); 424 | } 425 | } 426 | } 427 | 428 | 429 | 430 | # A complete generated phonology has the following layers. 431 | # 432 | # (1) General single segment repair and default feature insertion rules. 433 | # [It is planned that a few of the default insertion rules may be harmonic in nature. 434 | # Aside from this exception:] These are context-independent. 435 | # (2) General cluster resolution, allophony, and the like. Any time from 436 | # the start of this block onward is a sensible affix attachment time. 437 | # 438 | # The start of (2) is called start_sequences. Phonemes are regarded as being those phones 439 | # which can be extant at the start of (2). 440 | 441 | # Alternations (whose implementaiton is probably not to be soon, comparatively...) 442 | # will not be implemented by the resolutions of features in 443 | # different contexts alone. Instead, we'll eventually generate a thing for them: 444 | # perhaps a table with several small dimensions, and for each value of each dimension 445 | # one (or a few?) feature-values from among the contrastive features, 446 | # 447 | # Things, vowels included, have to be able to alternate with zero! 448 | # In allophony mode (sound change mode is different) 449 | # I won't actually ever generate a new syllabification rule; therefore, 450 | # syllable structure things have to be handled fully in the alternations. 451 | # This means that, to the extent the forms don't fit within the generated syllable structure, 452 | # we'll have to list allowable deletions carefully, and specify epenthetic vowels for the rest. 453 | # (Sonority mistroughs I can handle though.) 454 | # 455 | # To generate forms in an alternation, then, we generate the phone(s) in question 456 | # as normal and overstamp them with the feature values from the alternation somehow. 457 | # This still needs some thought on what to do about features in an alternation 458 | # that don't fulfill their requisites for generation (think also about sound change). 459 | # 460 | # To make good deeper alternations probably requires using related features and stuff 461 | # to retcon some extra history. But that seems hard. 462 | 463 | sub generate { 464 | print STDERR "generating phonology...\n" if $verbose; 465 | my $pd = Phonology::generate_preliminary(shift); 466 | $pd->annotate_with_preconditions(); 467 | print STDERR "computing inventory...\n" if $verbose; 468 | $pd->compute_inventory(); # base inventory for generation 469 | $pd->postprocess(); 470 | delete $pd->{phone_generator}; # now this is needless 471 | if ($debug < 1 and !$noprune) { 472 | $pd->trim_inactive(); 473 | } else { 474 | print STDERR "pruning of inactive rules skipped\n"; 475 | } 476 | $pd->annotate_with_preconditions(); # since the numbers are changed 477 | for (@{$pd->{phonology}}) { 478 | $_->strip_feed_annotation(); 479 | } 480 | return $pd; 481 | } 482 | 483 | sub generate_preliminary { 484 | my $FS = shift; 485 | my (@phone_generator, @phonology); 486 | my @syllable_structure; 487 | 488 | # The most general allowable form of a syllable position features specification 489 | # consists of feature strings alternated with weights. Each probability 490 | # associates to the feature string before it; a missing final weight will be chosen 491 | # to make the sum 1. 492 | for my $slot (@{$FS->{syllable_template}}) { 493 | next if rand() >= $slot->{prob}; 494 | do { 495 | # Prepare syllable structure for the cases where there are alternates. 496 | # Fuzz the probabilities. 497 | my @featureses = split / +([0-9.]+) */, $slot->{features}; 498 | my %featureses; 499 | my $remaining_weight = 1; # pre-fuzz weight 500 | my $fuzzed_weight = 0; # post-fuzz weight 501 | while (@featureses) { 502 | my $phone = $FS->parse(shift @featureses, undefined => 1); 503 | $remaining_weight -= @featureses[0] if @featureses; 504 | $_ = (@featureses ? shift @featureses : $remaining_weight) * (rand() + rand()); 505 | $fuzzed_weight += $_; 506 | $featureses{$phone} = $_; 507 | } 508 | $featureses{$_} /= $fuzzed_weight for (keys %featureses); 509 | 510 | my $rslot = { 511 | prob => fuzz($slot->{presence}), 512 | features => \%featureses, 513 | tag => $slot->{tag}, 514 | }; 515 | $rslot->{bend} = $slot->{bend} if defined $slot->{bend}; 516 | $rslot->{lump} = $slot->{lump} if defined $slot->{lump}; 517 | $rslot->{reprune} = 1 if defined $slot->{reprune} and rand() < $slot->{reprune}; 518 | if (defined $slot->{except}) { 519 | while (my ($k, $v) = each %{$slot->{except}}) { 520 | push @{$rslot->{except}}, $FS->parse($k) if rand() < $v; 521 | } 522 | } 523 | push @syllable_structure, $rslot; 524 | } while (defined $slot->{prob_more} and rand() < $slot->{prob_more}); 525 | } 526 | 527 | my @generable; # is this feature generated as contrastive in any situation? 528 | my @generable_val; # defined($generable_val[$v][$f]) iff the $f-th feature can take value $v \in 0,1. 529 | # If it's an empty list, that's ok; that just means the feature can only come up in phone generation. 530 | my %family_inventories; 531 | $family_inventories{$_} = { $FS->parse($FS->{families}{$_}) => 1 } 532 | for (keys %{$FS->{families}}); 533 | my %special_filling; # which features we're using a U in the syllable structure in 534 | my %prevent_marked; # when we look through the markeds, which ones we don't do 535 | 536 | # Sometimes we generate things which we never generate a prerequisite for. Not a problem though. 537 | for my $fi (0..@{$FS->{features}}-1) { 538 | my $f = $FS->{features}[$fi]; 539 | for my $sit (@{$f->{generated}}) { 540 | if (rand() < $sit->{contrast}) { 541 | my $requires; 542 | $requires = $FS->parse($f->{requires}) if defined $f->{requires}; 543 | 544 | my @by_families; 545 | if (defined $sit->{by_family} and rand() < $sit->{by_family_prob}) { 546 | for my $phone (keys %{$family_inventories{$sit->{by_family}}}) { 547 | next if defined $requires and !$FS->compatible($phone, $requires); 548 | push @by_families, $phone if rand() < $sit->{each_family_prob} * 549 | sqrt($family_inventories{$sit->{by_family}}{$phone}); 550 | } 551 | # If we didn't make the contrast anywhere on the first pass, introduce it on the 552 | # most frequent family category. I want a rule like this so that the 553 | # feature doesn't just fail to appear. Choosing the 554 | # most frequent category isn't really motivated, but it serves to spread out 555 | # probability peaks, and has the right effect for some particular cases 556 | # (e.g. lateral). 557 | if (!@by_families) { 558 | my ($max_phone, $max_value) = ('.' x @{$FS->{features}}, 0); 559 | for (my ($phone, $value) = each %{$family_inventories{$sit->{by_family}}}) { 560 | next if defined $f->{requires} and $phone !~ /^$requires$/; 561 | ($max_phone, $max_value) = ($phone, $value) if $value > $max_value; 562 | } 563 | push @by_families, $max_phone; 564 | } 565 | } 566 | 567 | my $precondition = $FS->parse($sit->{condition}); 568 | substr($precondition, $FS->{feature_index}{$f->{name}}, 1) = 'u'; 569 | $precondition = $FS->overwrite($precondition, $requires) if defined $f->{requires}; 570 | my %rule = ( 571 | 0 => {condition => $precondition, effects => $FS->parse($f->{name}), 572 | alternate_effects => $FS->parse('-' . $f->{name})}, 573 | prob => [map fuzz($sit->{prob}), @syllable_structure], 574 | FS => $FS, 575 | ); 576 | substr($rule{0}{effects}, $FS->{feature_index}{$f->{antithetical}}, 1) = '0' if (defined $f->{antithetical}); 577 | if (@by_families) { 578 | for (@by_families) { 579 | my %rule1 = %rule; 580 | $rule1{0}{condition} = $FS->overwrite($precondition, $_); 581 | # Don't allow a rule inserting f in families to be sensitive to f. 582 | # (It confuses the inventory-taker.) 583 | next if index($rule1{0}{condition}, 'u') == -1; 584 | push @phone_generator, \%rule1; 585 | } 586 | } else { 587 | push @phone_generator, \%rule; 588 | } 589 | bless $_, 'PhonologicalRule' for @phone_generator; 590 | 591 | $generable[$fi] = 1; 592 | $generable_val[0][$fi] = []; 593 | $generable_val[1][$fi] = []; 594 | 595 | for my $slot (@syllable_structure) { 596 | my $r = rand(); 597 | my %static_features = %{$slot->{features}}; 598 | while (my ($phone, $weight) = each %static_features) { 599 | $_ = $phone; 600 | s/u/./g; 601 | next unless $FS->compatible($_, $precondition); 602 | 603 | delete $slot->{features}{$phone}; 604 | if (defined $f->{slots}{$slot->{tag}}) { 605 | if ($r < $f->{slots}{$slot->{tag}}[0]) { 606 | $rule{antieffects}{0} = '.' x @{$FS->{features}} unless defined $rule{antieffects}{0}; 607 | $phone = $FS->overwrite($phone, $rule{antieffects}{0}); 608 | } elsif ($r < $f->{slots}{$slot->{tag}}[0] + $f->{slots}{$slot->{tag}}[1]) { 609 | $phone = $FS->overwrite($phone, $rule{0}{effects}); 610 | $phone = $FS->overwrite($phone, $FS->parse($f->{slot_if_on})) 611 | if defined $f->{slot_if_on}; 612 | } elsif ($r < $f->{slots}{$slot->{tag}}[0] + $f->{slots}{$slot->{tag}}[1] + $f->{slots}{$slot->{tag}}[2]) { 613 | substr($phone, $FS->{feature_index}{$f->{name}}, 1) = 'U'; 614 | $special_filling{$FS->{feature_index}{$f->{name}}} = 1; 615 | } 616 | } 617 | if (defined $slot->{features}{$phone}) { 618 | $slot->{features}{$phone} += $weight; 619 | } else { 620 | $slot->{features}{$phone} = $weight; 621 | } 622 | } # each %static_features 623 | } # @syllable_structure 624 | 625 | for my $fam (split / /, $f->{families}) { 626 | $_ = $precondition; 627 | for my $phone (keys %{$family_inventories{$fam}}) { 628 | my $s = $_; 629 | $s =~ s/u/./g; 630 | next if $phone !~ /^$s$/; 631 | # Using $rule{prob}[0] here of course isn't especially correct, but it'll do. 632 | $family_inventories{$fam}{$FS->overwrite($phone, $rule{0}{effects})} += 633 | $family_inventories{$fam}{$phone} * $rule{prob}[0] if ($rule{prob}[0] > 0); 634 | $rule{antieffects}{0} = '.' x @{$FS->{features}} unless defined $rule{antieffects}{0}; 635 | $family_inventories{$fam}{$FS->overwrite($phone, $rule{antieffects}{0})} += 636 | $family_inventories{$fam}{$phone} * (1 - $rule{prob}[0]) if ($rule{prob}[0] < 1); 637 | delete $family_inventories{$fam}{$phone}; 638 | } 639 | } 640 | 641 | # If the same string appears as the value of key 'prevent_marked' on both a 642 | # generable situation and a marked, then -- if the generable situation is chosen, 643 | # the marked will never be. 644 | if (defined $sit->{prevent_marked}) { 645 | $prevent_marked{$sit->{prevent_marked}} = 1; 646 | } 647 | } # rand() < $sit->{contrast} 648 | } # situations for generation 649 | 650 | if (defined $f->{structural}) { 651 | $generable[$fi] = 1; 652 | $generable_val[0][$fi] = [-1]; # -1 to not trip the never triggerable check. 653 | $generable_val[1][$fi] = [-1]; # hope it doesn't screw up other things 654 | } 655 | } # features in the phone generator 656 | 657 | # Map features to the things on which their defaults depend, including stripping situations. 658 | my %forcibly_unmark; 659 | for my $i (0..@{$FS->{features}}-1) { 660 | if (!$generable[$i] and defined $FS->{features}[$i]{forcibly_unmark} 661 | and rand() < $FS->{features}[$i]{forcibly_unmark}) { 662 | my @l = (); 663 | for my $default (@{$FS->{features}[$i]{default}}) { 664 | my $phone = $FS->parse($default->{condition}); 665 | for (0..@{$FS->{features}}-1) { 666 | push @l, $_ if substr($phone, $_, 1) ne '.'; 667 | } 668 | } 669 | FUSTRIP: for my $stripping (@{$FS->{strippings}}) { 670 | my $trigger = $FS->parse($stripping->{strip}); 671 | for my $i (@l) { 672 | if (substr($trigger, $i, 1) ne '.') { 673 | my $phone = $FS->parse($stripping->{condition}); 674 | for (0..@{$FS->{features}}-1) { 675 | push @l, $_ if substr($phone, $_, 1) ne '.'; 676 | } 677 | next FUSTRIP; 678 | } 679 | } 680 | } 681 | $forcibly_unmark{$i} = \@l; 682 | } 683 | } 684 | 685 | # Choose the order the rules are going to appear in, and write down a list of rule tag strings. 686 | 687 | # Marked single phones and sequences are handled by rules of the same type. 688 | # If the constraint is against a sequence of length one, the rule is placed before 689 | # the point defining what the phonemes ar. Sequences of greater length are placed after, 690 | # and correspond to allophony. 691 | 692 | # Default provision rules come in a random order; contrastive features are more likely to 693 | # come early; among uncontrastive features the unlikely to have been contrastive are biased to come late. 694 | 695 | # Subject to that, repair rules come as soon as they can; we have taken occasional 696 | # advantage of the fact that they are not further randomized. 697 | 698 | my @feature_at_position; # do first feature in this list first, etc. 699 | my @position_of_feature; # the inverse of this, plus 1 (so do the $i such that $p_o_f[$i] is 1 first) 700 | my @sortkey; 701 | for my $i (0..@{$FS->{features}}-1) { 702 | $sortkey[$i] = std_normal(); 703 | unless ($generable[$i]) { 704 | my $max_generation = 1e-6; # zero is scary 705 | for (@{$FS->{features}[$i]{generated}}) { 706 | $max_generation = $_->{contrast} if $max_generation < $_->{contrast}; 707 | } 708 | $sortkey[$i] += log($max_generation); # there is a hidden multiplicative magic constant of 1 here 709 | } 710 | } 711 | @feature_at_position = sort {$sortkey[$b] <=> $sortkey[$a]} (0..@{$FS->{features}}-1); 712 | $position_of_feature[$feature_at_position[$_]] = 1 + $_ for (0..@{$FS->{features}}-1); 713 | 714 | my @single_repair_indices = grep $FS->{marked}[$_]{condition} !~ /,/, 0..$#{$FS->{marked}}; 715 | my @sequence_repair_indices = grep $FS->{marked}[$_]{condition} =~ /,/, 0..$#{$FS->{marked}}; 716 | 717 | my @repair_rule_tags; 718 | for my $k (@single_repair_indices) { 719 | # How should {prevented_by} be generalised? 720 | next if defined $FS->{marked}[$k]{prevented_by} and $prevent_marked{$FS->{marked}[$k]{prevented_by}}; 721 | my $f = $FS->parse($FS->{marked}[$k]{condition}, undefined => 1); 722 | my $when = 0; 723 | for (0..length($f)-1) { 724 | $when = $position_of_feature[$_] 725 | if substr($f, $_, 1) ne 'u' and !defined $generable[$_] and $position_of_feature[$_] > $when; 726 | } 727 | push @{$repair_rule_tags[$when]}, "repair $k" unless defined $FS->{marked}[$k]{phonemic_only}; 728 | } 729 | my @assim_tags = ((map "repair $_", (@sequence_repair_indices))); 730 | for my $i (0..@assim_tags-1) { 731 | my $j = $i + int rand(@assim_tags - $i); 732 | $_ = $assim_tags[$i]; 733 | $assim_tags[$i] = $assim_tags[$j]; 734 | $assim_tags[$j] = $_; 735 | } 736 | 737 | my @rule_tags; 738 | push @rule_tags, "stripping $_" for (0..@{$FS->{strippings}}-1); 739 | push @rule_tags, "default $_" for sort keys %special_filling; 740 | for my $i (0..@{$FS->{features}}) { 741 | push @rule_tags, "default $feature_at_position[$i-1]" unless $i <= 0 or defined $special_filling{$feature_at_position[$i-1]}; 742 | push @rule_tags, @{$repair_rule_tags[$i]} if defined $repair_rule_tags[$i]; 743 | } 744 | for my $k (@single_repair_indices) { 745 | next if defined $FS->{marked}[$k]{prevented_by} and $prevent_marked{$FS->{marked}[$k]{prevented_by}}; 746 | push @rule_tags, "repair $k" if defined $FS->{marked}[$k]{phonemic_only}; 747 | } 748 | push @rule_tags, '#'; # false tag for end of phoneme straightening-out 749 | push @rule_tags, @assim_tags; 750 | 751 | my $self = { 752 | syllable_structure => \@syllable_structure, 753 | phone_generator => \@phone_generator, 754 | phonology => \@phonology, 755 | FS => $FS, 756 | }; 757 | bless $self; 758 | 759 | for my $tag (@rule_tags) { 760 | if ($tag eq '#') { 761 | print STDERR "on to allophony...\n" if $verbose; 762 | $self->{start_sequences} = @phonology; # end of rules that pertain only to individual segments 763 | next; 764 | } 765 | # We pass the generator as a way of specifying what contrasts are available. 766 | # For sound change purposes we'll need an alternate way to pass this information. 767 | $self->generate_new_rule($tag, 768 | generator => \@phone_generator, 769 | generable_val => \@generable_val, 770 | initial => 1, 771 | syllable_structure => \@syllable_structure, # used only by extra conditions, presently 772 | bar_sequences => defined $self->{start_sequences} ? undef : 1, 773 | forcibly_unmark => defined $self->{start_sequences} ? undef : \%forcibly_unmark); 774 | } 775 | 776 | $self; 777 | } 778 | 779 | # vectors, whee. Class method. 780 | sub add_in { 781 | my ($inventory, $x, $v) = @_; 782 | return unless grep $_, @$v; 783 | if (defined $inventory->{$x}) { 784 | for my $i (0..@$v-1) { 785 | $inventory->{$x}[$i] += $v->[$i]; 786 | } 787 | } else { 788 | $inventory->{$x} = $v; # shallow copy okay? 789 | } 790 | } 791 | 792 | # The inventory this returns is raw, and needs a post-processing stage. 793 | 794 | sub compute_inventory { 795 | my $self = shift; 796 | my ($syllable_structure, $phone_generator, $phonology, $which_preconditions) = 797 | @$self{qw/syllable_structure phone_generator phonology which_preconditions/}; 798 | # This is a hash from phones to lists of probabilities in the various syllable positions. 799 | # We use these for generation and to calculate the single-phone entropy. 800 | my %inventory; 801 | 802 | for my $i (0..@$syllable_structure-1) { 803 | for my $phone (keys %{$syllable_structure->[$i]{features}}) { 804 | add_in \%inventory, $phone, 805 | [map(($_ == $i ? $syllable_structure->[$i]{features}{$phone} : 0), 806 | (0..@$syllable_structure-1))]; 807 | } 808 | } 809 | 810 | # TODO: Revise this if ever resolvent rules before start_sequences can cause breakings or whatever. 811 | for my $rule (@$phone_generator) { 812 | my %inventory2; 813 | for my $phone (keys %inventory) { 814 | my @v = @{$inventory{$phone}}; 815 | my @word; 816 | @word = ($phone); 817 | $rule->run(\@word, alternate_effects => 0); 818 | add_in \%inventory2, $word[0], [map $v[$_] * $rule->{prob}[$_], 0..@v-1]; 819 | @word = ($phone); 820 | $rule->run(\@word, alternate_effects => 1); 821 | add_in \%inventory2, $word[0], [map $v[$_] * (1 - $rule->{prob}[$_]), 0..@v-1]; 822 | } 823 | %inventory = %inventory2; 824 | } 825 | 826 | # Strip unsupported features at the end, in case the syllable structure put them in. 827 | for my $phone (keys %inventory) { 828 | my $stripped = $self->{FS}->add_entailments($phone); 829 | $stripped =~ s/U/u/g; 830 | if ($stripped ne $phone) { 831 | add_in \%inventory, $stripped, $inventory{$phone}; 832 | delete $inventory{$phone}; 833 | } 834 | } 835 | 836 | my %prinv; 837 | my %resolver; 838 | 839 | for my $phone (keys %inventory) { 840 | my @word = ($phone); 841 | print STDERR "in: $phone\n" if $debug >= 1; 842 | $self->run(\@word, end => $self->{start_sequences}); 843 | my $outcome = join(' ', @word); 844 | print STDERR "out: $outcome /" . (@word ? $debug_alphabet->name_phone($word[0]) : '') . "/\n" if $debug >= 1; 845 | $resolver{$phone} = $outcome; 846 | add_in \%prinv, $outcome, $inventory{$phone}; 847 | } 848 | 849 | # Handle zero specially: its likelihood should not be given by resolutions 850 | # but should be the likelihood we picked by fiat earlier. 851 | # In addition, a syllable position where nothing can show up 852 | # should just be thrown away. 853 | my @old_zero_probs = defined $prinv{''} ? @{$prinv{''}} : ((0) x @$syllable_structure); 854 | $prinv{''} = [map 1 - $syllable_structure->[$_]{prob}, 0..@$syllable_structure-1]; 855 | for (my $i = @$syllable_structure-1; $i >= 0; $i--) { 856 | if ($old_zero_probs[$i] >= 1 - 1e-8) { # numerical things going on? 857 | splice @old_zero_probs, $i, 1; 858 | splice @$syllable_structure, $i, 1; 859 | splice @$_, $i, 1 for values %prinv; 860 | } 861 | } 862 | for my $phone (keys %prinv) { 863 | next if $phone eq ''; 864 | for my $i (0..@$syllable_structure-1) { 865 | $prinv{$phone}[$i] = $prinv{$phone}[$i] / (1 - $old_zero_probs[$i]) * $syllable_structure->[$i]{prob}; 866 | } 867 | } 868 | 869 | # We will need to use the resolver when it comes to generating alternations. 870 | # It's not necessary to for ordinary stem generation, though; for that the inventory suffices. 871 | $self->{gen_inventory} = \%prinv; 872 | } 873 | 874 | # Do some artificial thing meant to stop a lot of the frequency mass from being concentrated 875 | # in a few phones. Fairly harsh. 876 | # Class method. 877 | sub bend_frequencies { 878 | my ($gi, $i, $threshold) = (shift, shift, shift); 879 | my $n = scalar keys %$gi; 880 | my $sum = 0; 881 | for my $phone (keys %$gi) { 882 | if ($gi->{$phone}[$i] > $threshold / $n) { 883 | $gi->{$phone}[$i] = (log($gi->{$phone}[$i] * $n / $threshold) + 1) * $threshold / $n; 884 | } 885 | $sum += $gi->{$phone}[$i]; 886 | } 887 | for my $phone (keys %$gi) { 888 | $gi->{$phone}[$i] /= $sum; 889 | } 890 | } 891 | 892 | # Make some tweaks to the inventory of the sort that're problematic to do in initial generation. 893 | # Prominent among these are the forcing of the phoneme frequencies in the main slots not to have 894 | # certain phonemes anomalously common. 895 | 896 | # There is some icky duplication in here. 897 | sub postprocess { 898 | my $self = shift; 899 | 900 | for (my $i = $#{$self->{syllable_structure}}; $i >= 0; --$i) { 901 | # bend frequencies 902 | if (defined $self->{syllable_structure}[$i]{bend}) { 903 | bend_frequencies $self->{gen_inventory}, $i, $self->{syllable_structure}[$i]{bend}; 904 | delete $self->{syllable_structure}[$i]{bend}; 905 | } 906 | 907 | # drop nonmatches if we want to force matches 908 | if (defined $self->{syllable_structure}[$i]{reprune}) { 909 | my $sum = 0; 910 | for my $phone (keys %{$self->{gen_inventory}}) { 911 | $self->{gen_inventory}{$phone}[$i] = 0 if $phone and !grep {s/u/./; $phone =~ /^$_$/;} keys %{$self->{syllable_structure}[$i]{features}}; 912 | $sum += $self->{gen_inventory}{$phone}[$i]; 913 | unless (grep $_ > 0, @{$self->{gen_inventory}{$phone}}) { 914 | delete $self->{gen_inventory}{$phone}; 915 | } 916 | } 917 | for my $phone (keys %{$self->{gen_inventory}}) { 918 | $self->{gen_inventory}{$phone}[$i] /= $sum; 919 | } 920 | delete $self->{syllable_structure}[$i]{reprune}; 921 | } 922 | 923 | # drop excepted things 924 | if (defined $self->{syllable_structure}[$i]{except}) { 925 | my $sum = 0; 926 | for my $phone (keys %{$self->{gen_inventory}}) { 927 | $self->{gen_inventory}{$phone}[$i] = 0 if grep $phone =~ /^$_$/, @{$self->{syllable_structure}[$i]{except}}; 928 | $sum += $self->{gen_inventory}{$phone}[$i]; 929 | unless (grep $_ > 0, @{$self->{gen_inventory}{$phone}}) { 930 | delete $self->{gen_inventory}{$phone}; 931 | } 932 | } 933 | for my $phone (keys %{$self->{gen_inventory}}) { 934 | $self->{gen_inventory}{$phone}[$i] /= $sum; 935 | } 936 | delete $self->{syllable_structure}[$i]{except}; 937 | } 938 | 939 | # eliminate positions with nothing in them 940 | if ($self->{gen_inventory}{''}[$i] >= 1) { 941 | for my $phone (keys %{$self->{gen_inventory}}) { 942 | splice @{$self->{gen_inventory}{$phone}}, $i, 1; 943 | } 944 | splice @{$self->{syllable_structure}}, $i, 1; 945 | } 946 | } 947 | 948 | # Second pass, for some adjacency things sensitive to it. 949 | for (my $i = $#{$self->{syllable_structure}}; $i >= 0; --$i) { 950 | # Check for lumping together of syllable positions. 951 | # We lump if this position contains at least as much as the one it's next to, or roughly so. 952 | # If lump = -1, compare to the previous; if lump = +1, to the next. 953 | if (defined $self->{syllable_structure}[$i]{lump}) { 954 | my $j = $i + $self->{syllable_structure}[$i]{lump}; 955 | my $unique_to_j = 0; 956 | for my $phone (keys %{$self->{gen_inventory}}) { 957 | $unique_to_j += $self->{gen_inventory}{$phone}[$j] if $self->{gen_inventory}{$phone}[$i] <= 0; 958 | } 959 | $unique_to_j /= (1 - ($self->{gen_inventory}{''}[$j] + 2.2250738585072014e-308)); # smallest float 960 | 961 | # do we proceed with lumping? 962 | if (rand() < 1 - 2*$unique_to_j) { # magic function 963 | my $i0 = $self->{gen_inventory}{''}[$i]; 964 | my $j0 = $self->{gen_inventory}{''}[$j]; 965 | my $d = $i0 + $j0 - $i0 * $j0; 966 | for my $phone (keys %{$self->{gen_inventory}}) { 967 | if ($phone ne '') { 968 | $self->{gen_inventory}{$phone}[$j] = 969 | ($self->{gen_inventory}{$phone}[$j] * $i0 + $self->{gen_inventory}{$phone}[$i] * $j0) / $d; 970 | } else { 971 | $self->{gen_inventory}{$phone}[$j] = $i0 * $j0 / $d; 972 | } 973 | } 974 | 975 | for my $phone (keys %{$self->{gen_inventory}}) { 976 | splice @{$self->{gen_inventory}{$phone}}, $i, 1; 977 | } 978 | splice @{$self->{syllable_structure}}, $i, 1; 979 | } # proceed with lumping 980 | } 981 | } 982 | 983 | # Now that we're done playing with it, record entropies in bits on the syllable structure. 984 | for my $i (0..@{$self->{syllable_structure}}-1) { 985 | my $entropy = 0; 986 | while (my ($phone, $v) = each %{$self->{gen_inventory}}) { 987 | $entropy += $v->[$i] * log($v->[$i]) / log(0.5) if $v->[$i] > 0; 988 | } 989 | $self->{syllable_structure}[$i]{entropy} = $entropy; 990 | } 991 | } 992 | 993 | sub generate_form { 994 | my ($self, $target_entropy) = (shift, shift); 995 | 996 | my $normal = 0; 997 | $normal += rand(1/4.0) for 1..8; # std dev sqrt(2/3) eh 998 | my $entropy = $normal * $target_entropy; 999 | 1000 | # Form generation was once done by rules with probabilistic effects. But that is long obsolete. 1001 | 1002 | my @form; 1003 | # The form of this loop will very much be changing when we start asking for 1004 | # forms that aren't made of whole syllables. 1005 | my $total_entropy = 0; 1006 | while ($total_entropy < $entropy) { 1007 | for my $i (0..@{$self->{syllable_structure}}-1) { 1008 | # next if $self->{syllable_structure}[$i]{nonzero_prob} == 0; # these cases are eliminated. 1009 | 1010 | $total_entropy += $self->{syllable_structure}[$i]{entropy}; 1011 | next if rand() >= $self->{syllable_structure}[$i]{prob}; 1012 | 1013 | my $rand = rand (1 - $self->{gen_inventory}{''}[$i]); 1014 | my $selected_phone; 1015 | # only generate structural zeroes in a form, not resolvent zeroes 1016 | # (though we've corrected the probabilities anyhow) 1017 | for (keys %{$self->{gen_inventory}}) { 1018 | $selected_phone = $_, last if $_ ne '' and ($rand -= $self->{gen_inventory}{$_}[$i]) < 0; 1019 | } 1020 | push @form, split / /, $selected_phone; 1021 | } 1022 | } 1023 | 1024 | \@form; 1025 | } 1026 | 1027 | # Simplify the phonemic presentation of a form. 1028 | # E.g. if we generate /agsa/ but there's compulsory regressive voice assimilation in that situation, 1029 | # and /aksa/ consists of extant phonemes and has the same outcome, we may as well present it as /aksa/. 1030 | # Returns (outcome, canonicalised); we may as well, since we end up with both. 1031 | 1032 | # FIXME: broken, haven't tested why. 1033 | # Distant future TODO: when morphology gets here, respect it. 1034 | 1035 | sub canonicalise_phonemic_form { 1036 | my ($self, $word) = (shift, shift); 1037 | my @canonical_word = @$word; 1038 | my @current_word = @$word; 1039 | my @sources = 0..@$word-1; 1040 | 1041 | for my $k ($self->{start_sequences}..@{$self->{phonology}}-1) { 1042 | # print "before $k /" . $debug_alphabet->spell(\@canonical_word) . "/ [" . $debug_alphabet->spell(\@current_word) . "]\n"; # debug 1043 | my @old_sources = @sources; 1044 | my @old_word = @current_word; 1045 | my (@prov_canonical_word, @prov_current_word); 1046 | my $changed; 1047 | $self->run(\@current_word, 1048 | start => $k, 1049 | end => $k+1, 1050 | sources => \@sources); 1051 | # print "target [" . $debug_alphabet->spell(\@current_word) . "]\n"; # debug 1052 | 1053 | { # block for redo 1054 | $changed = 0; 1055 | 1056 | # check for new deletions 1057 | for my $source (@old_sources) { 1058 | unless (grep $_ == $source, @sources) { 1059 | @prov_canonical_word = @canonical_word; 1060 | splice @prov_canonical_word, $source, 1; 1061 | @prov_current_word = @prov_canonical_word; 1062 | $self->run(\@prov_current_word, 1063 | start => $self->{start_sequences}, 1064 | end => $k+1); 1065 | if (scalar @prov_current_word == scalar @current_word and 1066 | !grep $prov_current_word[$_] != $current_word[$_], 0..$#current_word) { 1067 | $changed = 1; 1068 | @canonical_word = @prov_canonical_word; 1069 | @sources = map $_ > $source ? $_-1 : $_, @sources; 1070 | } 1071 | } 1072 | } # deletions 1073 | 1074 | # check for featural changes. 1075 | # Try out every underlying phoneme that is Hamming-between this phone in the old and the new words. 1076 | # (Maybe special-case place, i.e.\ features that are bound, for e.g. /mk/ vs. /nk/ when there's no /N/.) 1077 | # What about syllable position restrictions? I think it's more conventional to ignore them. 1078 | CHANGE_SOURCE: for my $i (0..$#sources) { 1079 | next if $sources[$i] == -1; # insertions 1080 | my $old_i; 1081 | for (0..$#old_sources) { 1082 | $old_i = $_, last if $old_sources[$_] == $sources[$i]; 1083 | } 1084 | # no need to continue unless there was a change in this sound 1085 | next if $old_word[$old_i] eq $current_word[$i]; 1086 | 1087 | my @varying_features = grep substr($canonical_word[$sources[$i]], $_, 1) ne substr($current_word[$i], $_, 1), 1088 | 0..length($current_word[$i])-1; 1089 | # print "@varying_features vary\n" if @varying_features; # debug 1090 | my @prov_varied_features = @varying_features; 1091 | while (@prov_varied_features) { 1092 | @prov_canonical_word = @canonical_word; 1093 | substr($prov_canonical_word[$sources[$i]], $_, 1) = substr($current_word[$i], $_, 1) 1094 | for @prov_varied_features; 1095 | if (defined $self->{gen_inventory}{$prov_canonical_word[$sources[$i]]}) { 1096 | # print "trying out " . $debug_alphabet->name_phone($prov_canonical_word[$sources[$i]]) . " at $i\n"; # debug 1097 | @prov_current_word = @prov_canonical_word; 1098 | $self->run(\@prov_current_word, 1099 | start => $self->{start_sequences}, 1100 | end => $k+1); 1101 | if (scalar @prov_current_word == scalar @current_word and 1102 | !grep $prov_current_word[$_] != $current_word[$_], 0..$#current_word) { 1103 | $changed = 1; 1104 | @canonical_word = @prov_canonical_word; 1105 | next CHANGE_SOURCE; 1106 | } 1107 | } 1108 | 1109 | # loop increment 1110 | my $was_last = pop @prov_varied_features; 1111 | my $t; 1112 | for ($t = -1; $varying_features[$t] != $was_last; $t--) { } 1113 | push @prov_varied_features, @varying_features[$t+1..-1] if $t < -1; # stupid negatives convention 1114 | } # prov_varied_features 1115 | } 1116 | 1117 | # In order not to miss cases of form /0a 0b 1a 1b/ [1b 1b 1a 1b], we need to iterate 1118 | # the description simplification, *not* the sound changes. 1119 | redo if $changed; 1120 | } 1121 | } 1122 | (\@current_word, \@canonical_word); 1123 | } 1124 | 1125 | 1; 1126 | -------------------------------------------------------------------------------- /phonology/PhonologySynchronicState.pm: -------------------------------------------------------------------------------- 1 | package PhonologySynchronicState; 2 | use strict; 3 | use constant INF => 9**9**9; # is there really nothing sensible better? 4 | 5 | # This keeps track of certain data in a phonology in a more accessible way than the list of rules 6 | # which define it. To wit, it currently keeps a list of phones and records which phones resolve 7 | # to others. 8 | # In a future version, this is where the frequency table of bigrams will go. 9 | 10 | # If there is a conditioned outcome of one of the single phones that are supposed to be resolved, 11 | # that will be stored in {conditional_resolutions}, which is a hash from rule numbers to 12 | # resolutions that have applied up to before when that rule does. 13 | 14 | sub initialise { 15 | my ($pd, $FS, $start, %args) = @_; 16 | 17 | my @inventory; 18 | my %phone_resolutions; 19 | my %resolution_expiries; 20 | if (defined $args{inventory}) { 21 | @inventory = @{$args{inventory}}; 22 | } elsif ($args{start} == $pd->{start_sequences}) { 23 | @inventory = keys %{$pd->{gen_inventory}}; 24 | } else { 25 | # FIXME: else we have to run to update the inventory to the starting point 26 | warn "unimplemented start time"; 27 | } 28 | @inventory = grep $_, @inventory; 29 | %phone_resolutions = (map(($_ => $_), @inventory)); 30 | 31 | my $s = { 32 | pd => $pd, 33 | FS => $FS, 34 | start => $start, 35 | inventory => \@inventory, 36 | resolutions => \%phone_resolutions, 37 | resolution_expiries => \%resolution_expiries, 38 | conditional_resolutions => {}, 39 | conditional_resolution_expiries => {}, 40 | 41 | # These next things are data used by the describer, and are updated here on running one rule. 42 | # By "frame" we mean a phone in the effect 43 | # with its assimilation characters replaced by things they might assimilate to. 44 | # relevant_frames{$locus}[$h] is the frames relevant to the $h-th phone in the effects 45 | relevant_frames => {}, 46 | # outcomes{$frame}{$phone} is what $phone turns into in $frame 47 | outcomes => {}, 48 | # matcheds{$displ} is a list of phones which can be at $displ when this is triggered 49 | matcheds => {}, 50 | }; 51 | bless $s; 52 | } 53 | 54 | 55 | 56 | # Methods that deal with a specific rule follow. Many of these methods can cause the rule 57 | # to be marked with $rule->{pointless} = 1, meaning that the rule can never cause a change 58 | # given the current state of the phonology. 59 | 60 | 61 | 62 | sub clear_rule_data { 63 | my $self = shift; 64 | $self->{relevant_frames} = {}; 65 | $self->{outcomes} = {}; 66 | $self->{matcheds} = {}; 67 | } 68 | 69 | # Given a rule, produce the sets of phones in the current inventory that can match each of its positions. 70 | # If this position can match word boundary, then include the pause_phone. 71 | # 72 | # TODO: put pointless persistent rules into some kind of holding tank, 73 | # to check on creation of new phones. 74 | # 75 | # Actually, quite a lot of stuff for persistent rules potentially needs rewriting 76 | # when new phones are around. Ick. I'm happy to just ignore this for now. 77 | sub find_matches { 78 | my ($self, $rule) = @_; 79 | my %matcheds; 80 | for my $displ ($rule->indices('condition')) { 81 | $matcheds{$displ} = [grep $rule->{$displ}->matches($_), @{$self->{inventory}}]; 82 | @{$matcheds{$displ}} = grep $rule->{filter}->matches($_), @{$matcheds{$displ}} if defined $rule->{filter}; 83 | push @{$matcheds{$displ}}, $rule->{$displ}{or_pause} if defined $rule->{$displ}{or_pause}; 84 | unless (@{$matcheds{$displ}}) { 85 | $rule->{pointless} = 1; 86 | } 87 | } 88 | $self->{matcheds} = \%matcheds; 89 | return %matcheds; 90 | } 91 | 92 | # Simplify the rule assuming correctness of the current state. 93 | sub simplify { 94 | my ($self, $rule, %args) = @_; 95 | my %matcheds; 96 | if ($self->{matcheds}) { 97 | %matcheds = %{$self->{matcheds}}; 98 | } else { 99 | %matcheds = $self->find_matches($rule); 100 | } 101 | my $simplified_rule = $rule->deep_copy_indexed(); 102 | 103 | # Drop the assimilatory parts of the rule if there aren't multiple values among the things being 104 | # assimilated to. 105 | # It would be at least as sensible to do this one assimilation at a time, in theory, 106 | # but that would throw off the naming. 107 | for my $locus ($simplified_rule->indices('effects')) { 108 | my $effect = $simplified_rule->{$locus}{effects}; 109 | if ($effect =~ //) { 123 | my $not_variable = 0; 124 | my @indices = grep substr($effect, $_, 1) eq '>', 0..length($effect)-1; 125 | for my $j (@indices) { 126 | $not_variable = 1, last if grep substr($_, $j, 1) eq '0', @{$matcheds{$locus+1}} 127 | and grep substr($_, $j, 1) eq '1', @{$matcheds{$locus+1}}; 128 | } 129 | unless ($not_variable) { 130 | for (0..length($effect)-1) { 131 | substr($effect, $_, 1) = substr($matcheds{$locus+1}[0], $_, 1) if substr($effect, $_, 1) eq '>'; 132 | } 133 | } 134 | } 135 | if ($simplified_rule->{$locus}{condition} =~ /^$effect$/ and !$simplified_rule->{$locus}{deletions}) { 136 | $simplified_rule->{pointless} = 1; 137 | } 138 | $simplified_rule->{$locus}{effects} = $effect; 139 | } # $locus 140 | 141 | $simplified_rule; 142 | } 143 | 144 | # Update this state to reflect the running of one more rule. 145 | sub update { 146 | my ($self, $rule, $i, %args) = @_; 147 | my %matcheds; 148 | if ($self->{matcheds}) { 149 | %matcheds = %{$self->{matcheds}}; 150 | } else { 151 | %matcheds = $self->find_matches($rule); 152 | } 153 | 154 | if ($args{record_old_inventory}) { 155 | $self->{old_inventory} = [@{$self->{inventory}}]; 156 | } 157 | 158 | # There should be no resolutions whose expiry is _strictly less than_ $i. 159 | # TODO: when this is being done dynamically, one will have to take a different approach to finding these. (what, why?) 160 | if (defined $self->{resolution_expiries}{$i}) { 161 | delete $self->{resolutions}{$_} for @{$self->{resolution_expiries}{$i}}; 162 | } 163 | if (defined $self->{conditional_resolution_expiries}{$i}) { 164 | delete $self->{conditional_resolutions}{$_} for @{$self->{conditional_resolution_expiries}{$i}}; 165 | } 166 | 167 | my @new_inventory = @{$self->{inventory}}; 168 | for my $locus ($rule->indices('effects'), $rule->indices('deletions')) { 169 | my $effect = defined $rule->{$locus}{effects} ? $rule->{$locus}{effects} : '.' x @{$self->{FS}{features}}; # might be many phones! 170 | my $pointless = 1; 171 | my @template_set; 172 | push @template_set, @{$matcheds{$locus-1}} if $effect =~ //; 174 | push @template_set, '.' x @{$self->{FS}{features}} unless @template_set; 175 | $self->{outcomes}{$locus} = {}; 176 | $self->{relevant_frames}{$locus} = []; 177 | 178 | my @pieces_of_frame = split / /, $effect; 179 | for my $original_frame (@pieces_of_frame) { 180 | my %outcome = (); 181 | my %frames_examined = (); 182 | for my $template (@template_set) { 183 | my $frame = $self->{FS}->add_entailments($original_frame); # why entailed? for antitheticals? 184 | my $frame_is_worthwhile = 0; 185 | for (0..length($frame)-1) { 186 | # unlikely issue: not right for assimilation of some features in each direction. and elsewhere 187 | substr($frame, $_, 1) = substr($template, $_, 1) if substr($frame, $_, 1) =~ /[<>]/; 188 | } 189 | next if defined $frames_examined{$frame}; 190 | $frames_examined{$frame} = 1; 191 | 192 | for my $phone (@{$matcheds{$locus}}) { 193 | my $outcome; 194 | if (defined $rule->{$locus}{deletions} and $rule->{$locus}{deletions}) { 195 | $outcome = ''; 196 | } else { 197 | my $changed = $self->{FS}->add_entailments($self->{FS}->overwrite($phone, $frame)); 198 | if (!defined $self->{resolutions}{$changed}) { 199 | my $word = [$changed]; 200 | my $expiry = []; 201 | my $context_dependent = {}; 202 | $self->{pd}->run 203 | ($word, 204 | cleanup => $i, 205 | change_record => [FeatureSystem::change_record($phone, $changed)], 206 | track_expiry => $expiry, 207 | nopause => 1, 208 | context_dependent => $context_dependent); 209 | $self->{resolutions}{$changed} = join ' ', @$word; 210 | push @{$self->{resolution_expiries}{$expiry->[0]}}, $changed if $expiry->[0] < INF; 211 | while (my ($k, $outcome_before) = each %$context_dependent) { 212 | next if $k >= $self->{start}; 213 | $self->{conditional_resolutions}{$k}{$changed} = join ' ', @$outcome_before; 214 | if (defined $self->{pd}{phonology}[$k]{inactive}) { 215 | push @{$self->{conditional_resolution_expiries}{$self->{pd}{phonology}[$k]{inactive}}}, $k; 216 | } 217 | } 218 | push @new_inventory, @$word; 219 | } 220 | 221 | # For description in rules we want not the ultimate outcome but the nearest one. 222 | $outcome = $self->{resolutions}{$changed}; 223 | unless ($args{no_old_conditionals}) { 224 | for my $k (sort {$a <=> $b} keys %{$self->{conditional_resolutions}}) { 225 | $outcome = $self->{conditional_resolutions}{$k}{$changed}, last 226 | if defined $self->{conditional_resolutions}{$k}{$changed}; 227 | } 228 | } 229 | } 230 | $pointless = 0 unless ($phone eq $outcome and $phone !~ /^$effect$/); 231 | $outcome{$frame}{$phone} = $outcome; 232 | $frame_is_worthwhile = 1 if $outcome ne $phone; 233 | } 234 | delete $outcome{$frame} unless $frame_is_worthwhile; 235 | } # $template 236 | 237 | push @{$self->{relevant_frames}{$locus}}, [keys %outcome]; 238 | 239 | %{$self->{outcomes}{$locus}} = (%{$self->{outcomes}{$locus}}, %outcome); 240 | } # $original_frame in @pieces_of_frame 241 | $rule->{pointless} = 1 if $pointless; 242 | } # $locus 243 | 244 | %_ = map(($_ => 1), @new_inventory); 245 | @{$self->{inventory}} = keys %_; # uniq 246 | } 247 | 248 | 1; 249 | -------------------------------------------------------------------------------- /phonology/README: -------------------------------------------------------------------------------- 1 | gleb.pl is a script for generating and describing phonologies and random forms following them. The current version, v0.3, generates a limited set of phonological processes, mostly assimilatory processes among adjacent phones; it doesn't yet know about syllables or tone or sundry other things. 2 | 3 | I like to invoke it as 4 | gleb.pl -h -d -w 30 ; 5 | that is, output in *H*TML an inventory and *d*escription of the rules and 30 sample *w*ords. (The inventory display in particular is much better in HTML than plaintext, on account of getting to use tables.) 6 | 7 | Refer to the command-line usage for more. 8 | 9 | You can try a nearly-current version online at http://gleb.000024.org . 10 | -------------------------------------------------------------------------------- /phonology/Transcription.pm: -------------------------------------------------------------------------------- 1 | package Transcription; 2 | use strict; 3 | 4 | # Right now the only Transcription objects are the standard phonetic alphabets. 5 | # It's my intent to use this also for romanizations designed to fit a particular phonology, 6 | # once creating those is done. 7 | 8 | sub load_file { 9 | my ($filename, $FS) = (shift, shift); 10 | my $alphabet = YAML::Any::LoadFile($filename); 11 | $alphabet->{FS} = $FS; 12 | 13 | for my $type (qw/characters ligations/) { 14 | for my $c (keys %{$alphabet->{$type}}) { 15 | $alphabet->{$type}{$FS->parse($c)} = $alphabet->{$type}{$c}; 16 | delete $alphabet->{$type}{$c}; 17 | } 18 | } 19 | for my $c (keys %{$alphabet->{modifiers}}) { 20 | my @fs = split / /, $c; 21 | my $s = $FS->parse($c) . ' ' . $FS->parse($fs[0]); 22 | $alphabet->{modifiers}{$s} = $alphabet->{modifiers}{$c}; 23 | delete $alphabet->{modifiers}{$c}; 24 | } 25 | 26 | bless $alphabet; 27 | } 28 | 29 | # In a modifier description, it's only the first phone that the modifier actually spells; 30 | # the rest are just conditions on its applicability. 31 | 32 | sub name_phone { 33 | my ($self, $phone, %args) = (shift, @_); 34 | my $FS = $self->{FS}; 35 | my %taken_care_of; 36 | my $s = '##'; 37 | $s = "$s"; #if $use_html; # handy for debugging 38 | 39 | for my $x (keys %{$self->{ligations}}) { 40 | next if $phone !~ /^$x$/; 41 | my $phone0 = $FS->overwrite($phone, $FS->parse($self->{ligations}{$x}[0])); 42 | my $phone1 = $FS->overwrite($phone, $FS->parse($self->{ligations}{$x}[1])); 43 | my ($tc0, $s0) = $self->name_phone($phone0, %args, no_modifiers => 1); 44 | my ($tc1, $s1) = $self->name_phone($phone1, %args, no_modifiers => 1); 45 | $s = $self->{ligations}{$x}[2]; 46 | $s =~ s/\[\]/$s0/; 47 | $s =~ s/\[\]/$s1/; 48 | %taken_care_of = (%$tc0, %$tc1, 49 | map(($_ => 1), (grep substr($x, $_, 1) ne '.', 0..@{$FS->{features}}-1)) ); 50 | last; 51 | } 52 | 53 | if ($s =~ /##/) { 54 | for my $x (keys %{$self->{characters}}) { 55 | next if $phone !~ /^$x$/; 56 | $s = $self->{characters}{$x}; 57 | %taken_care_of = map(($_ => 1), (grep substr($x, $_, 1) ne '.', 0..@{$FS->{features}}-1)); 58 | last; 59 | } 60 | } 61 | 62 | return (\%taken_care_of, $s) if $args{no_modifiers}; 63 | 64 | MODIFIER: for my $x (keys %{$self->{modifiers}}) { 65 | my ($x_all, $x_spells) = split / /, $x; 66 | next if $phone !~ /^$x_all$/; 67 | my $redundant = 1; 68 | for (0..@{$FS->{features}}-1) { 69 | $redundant = 0 if substr($x_spells, $_, 1) ne '.' and !defined $taken_care_of{$_}; 70 | } 71 | next if $redundant; 72 | my $t = $self->{modifiers}{$x}; 73 | $t =~ s/\[\]/$s/; 74 | $s = $t; 75 | %taken_care_of = (%taken_care_of, map(($_ => 1), (grep substr($x_spells, $_, 1) ne '.', 0..@{$FS->{features}}-1))); 76 | } 77 | 78 | $s; 79 | } 80 | 81 | sub spell { 82 | my ($self, $word, %args) = (shift, @_); 83 | if ($args{null} and !@$word) { 84 | return $self->{null}; 85 | } 86 | join "", map $self->name_phone($_, %args), @$word; 87 | } 88 | 89 | # duplicated for efficiency :/ 90 | sub spell_spaced_string { 91 | my ($self, $word, %args) = (shift, @_); 92 | if ($args{null} and ($word eq '')) { 93 | return $self->{null}; 94 | } 95 | join "", map $self->name_phone($_, %args), (split / /, $word); 96 | } 97 | 98 | 1; 99 | -------------------------------------------------------------------------------- /phonology/gleb.cgi: -------------------------------------------------------------------------------- 1 | gleb.pl -------------------------------------------------------------------------------- /phonology/gleb.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Generate random phonologies, featurally and via ordered rules with persistence, 3 | # with allophony and the potential for good morphophonology and the works. (Getting there!) 4 | # Alex Fink, January 2010 -- present. 5 | # Thanks to Marcus Smith for unwitting inspiration, 6 | # and Marcus and UPSID for being proximal sources for various numbers. 7 | # (A much greater proportion of the numbers are wholly fabricated, though!) 8 | 9 | # What next? Should we privilege 10 | # (a) advanced inventory tracking, with the bigram transition matrix stuff; or 11 | # (b) new phonology? (long-distance rules; syllable tracking > moraic stuff) 12 | 13 | use strict; 14 | use YAML::Any; 15 | use CGI; 16 | 17 | use lib '.'; 18 | 19 | use FeatureSystem; 20 | use PhoneSet; 21 | use PhonologicalRule; 22 | use Phonology; 23 | use PhonologySynchronicState; 24 | use Transcription; 25 | use PhonologyDescriber; 26 | 27 | my $version = '0.3.2'; 28 | my $credits = 'Gleb, a phonology generator, by Alex Fink' . 29 | (' ' x (29 - length($version))) . # for a total length of 78 30 | "version $version"; 31 | 32 | my $verbose; 33 | my $show_seed; 34 | my $use_html; 35 | my $CGI; 36 | my $seed = time ^ $$ ^ $$<<15; 37 | 38 | 39 | 40 | my $outfile; 41 | my $annotate_output; 42 | my $infile; 43 | my $show_inventory; 44 | my $show_all; 45 | my $num_words = 0; 46 | my $phone_to_interpret; 47 | my $canonicalise; 48 | 49 | sub die_with_usage { 50 | print STDERR < Phonology output file. Defaults to no output. The output is a 64 | YAML-formatted, not human-friendly, collection of the data 65 | needed to run the phonology generator. 66 | -O As above, with a little extra annotation for 67 | like translations of the internal phone notation. 68 | (Still not human-friendly.) 69 | -i Input the phonology from the named file, rather than generating 70 | a new one. 71 | 72 | -r N Use N as the random seed. 73 | -v Verbose. Show progress and a few other things. 74 | -D Show some debugging output. 75 | -p Do some conversions between phone formats. Do nothing else. 76 | 77 | USAGE 78 | exit 1; 79 | } 80 | 81 | sub parse_args { 82 | my $arg; 83 | while ($arg = shift) { 84 | if ($arg eq '-D') { 85 | $Phonology::debug++; 86 | } 87 | elsif ($arg eq '--noprune') { 88 | $Phonology::noprune++; 89 | } 90 | elsif ($arg eq '-r') { 91 | $seed = shift; 92 | die "-r expects an integer argument\n" if !defined $seed or ($seed !~ /^\-?[0-9]+$/); 93 | } 94 | elsif ($arg =~ /^-[oO]$/) { 95 | $outfile = shift; 96 | $annotate_output = 1 if $arg eq '-O'; 97 | die "$arg expects a filename argument\n" if !defined $outfile; 98 | } 99 | elsif ($arg eq '-i') { 100 | $infile = shift; 101 | die "-i expects a filename argument\n" if !defined $infile; 102 | } 103 | elsif ($arg eq '-I') { 104 | $show_inventory = 1; 105 | } 106 | elsif ($arg eq '-d') { 107 | $show_inventory = $show_all = 1; 108 | } 109 | elsif ($arg eq '-v') { 110 | $verbose = $Phonology::verbose = 1; 111 | } 112 | elsif ($arg eq '--showseed') { 113 | $show_seed = 1; 114 | } 115 | elsif ($arg eq '-w') { 116 | $num_words = shift; 117 | die "-w expects an integer argument\n" if !defined $num_words or ($num_words !~ /^\-?[0-9]+$/); 118 | } 119 | elsif ($arg eq '-p') { 120 | $phone_to_interpret = shift; 121 | die "-p expects an argument\n" if !defined $phone_to_interpret; 122 | } 123 | elsif ($arg eq '-c') { 124 | $canonicalise = 1; 125 | } 126 | elsif ($arg eq '-h') { 127 | $use_html = 1; 128 | } 129 | else { 130 | die_with_usage; 131 | } 132 | } 133 | } 134 | 135 | my $yamlimpl = YAML::Any->implementation; 136 | unless (grep $_ eq $yamlimpl, qw(YAML::XS YAML::Syck YAML::Perl)) { 137 | print STDERR <new; 145 | 146 | $use_html = 1; 147 | # lazily not taking query arguments for now, except for this one 148 | if (defined $CGI->param('r') and ($CGI->param('r') =~ /^\-?[0-9]+$/)) { 149 | $seed = $CGI->param('r'); 150 | } 151 | $show_inventory = $show_all = 1; 152 | $num_words = 30; 153 | 154 | print $CGI->header; 155 | } 156 | 157 | parse_args @ARGV; 158 | 159 | if ($use_html) { 160 | my $style = < 'Random phonology', 166 | -style => {-code => $style}); 167 | } 168 | else { 169 | $" = $, = ", "; 170 | } 171 | 172 | die 'feature system not found' unless -f 'features.yml'; 173 | my $FS = FeatureSystem::load_file('features.yml'); 174 | 175 | my $phonetic_alphabet; 176 | if ($use_html && -f 'IPA_HTML.yml') { 177 | $phonetic_alphabet = Transcription::load_file('IPA_HTML.yml', $FS); 178 | } elsif (-f 'CXS.yml') { 179 | $phonetic_alphabet = Transcription::load_file('CXS.yml', $FS); 180 | } else { 181 | die 'no suitable phonetic alphabet found'; 182 | } 183 | $Phonology::debug_alphabet = Transcription::load_file('CXS.yml', $FS) if -f 'CXS.yml'; 184 | 185 | if (defined $phone_to_interpret) { 186 | $phone_to_interpret = $FS->parse($phone_to_interpret, undefined => 1) unless $phone_to_interpret =~ /^[.01u]*$/; 187 | print '[' . $phonetic_alphabet->name_phone($phone_to_interpret) . '] ' . $FS->feature_string($phone_to_interpret); 188 | $phone_to_interpret =~ /[01]/g; 189 | print ' ' . (pos($phone_to_interpret) - 1) if defined pos($phone_to_interpret); 190 | print "\n"; 191 | exit 0; 192 | } 193 | 194 | print STDERR "seed $seed\n" if $verbose or $show_seed; 195 | srand $seed; 196 | 197 | my $pd; 198 | 199 | if (defined $infile) { 200 | $pd = Phonology::load_file($infile, $FS); 201 | } else { 202 | $pd = Phonology::generate($FS); 203 | } 204 | 205 | if (defined $outfile) { 206 | $pd->dump_file($outfile, $annotate_output); 207 | } 208 | 209 | my $pdes = PhonologyDescriber::new($phonetic_alphabet, YAML::Any::LoadFile('phon_descr.yml'), $use_html); 210 | 211 | if ($show_inventory) { 212 | print $pdes->describe_inventory($pd, html => $use_html); 213 | } 214 | 215 | if ($show_all) { 216 | $pdes->tabulate($pd, annotate_only => 1); # should this be given a name of its own? 217 | my ($template, $elaborations) = $pdes->describe_syllable_structure($pd, html => $use_html); 218 | if ($use_html) { 219 | print CGI::h2('Syllable structure'), 220 | CGI::p(join '', @$template), 221 | CGI::p(join '
', @$elaborations); 222 | } else { 223 | print "\nSyllable structure: " . join('', @$template) . "\n"; # not well formatted at present, eh 224 | print join "\n", @$elaborations; 225 | print "\n\n"; 226 | } 227 | 228 | print STDERR "describing rules...\n" if $verbose; 229 | my $rules = $pdes->describe_rules($pd); 230 | if ($use_html) { 231 | print CGI::h2('Allophony'), 232 | CGI::ul(CGI::li($rules)); 233 | } else { 234 | print "Allophony:\n"; 235 | print join "\n", @$rules; 236 | print "\n\n"; 237 | } 238 | } 239 | 240 | if ($use_html and $num_words > 0) { 241 | print CGI::h2('Some words'), 242 | CGI::start_table(); 243 | } 244 | print STDERR "generating sample words...\n" if $verbose; 245 | for (1..$num_words) { 246 | my $word = $pd->generate_form(12); # magic entropy value 247 | my $surface_word; 248 | my $generated_word = [@$word]; 249 | if (defined $canonicalise) { 250 | ($surface_word, $word) = $pd->canonicalise_phonemic_form($generated_word); 251 | } else { 252 | $surface_word = [@$word]; 253 | $pd->run($surface_word, start => $pd->{start_sequences}); 254 | } 255 | 256 | if ($use_html) { 257 | print ''; 258 | print "//", 259 | $phonetic_alphabet->spell($generated_word), 260 | "//" 261 | if defined $canonicalise; 262 | print "/", 263 | $phonetic_alphabet->spell($word), 264 | "/[", 265 | $phonetic_alphabet->spell($surface_word), 266 | "]\n"; 267 | } else { 268 | print "//" . $phonetic_alphabet->spell($generated_word). "//\t" if defined $canonicalise; 269 | print "/" . $phonetic_alphabet->spell($word) . "/\t[" . $phonetic_alphabet->spell($surface_word) . "]\n"; 270 | for my $phone (@$surface_word) { 271 | $_ = $phonetic_alphabet->name_phone($phone); 272 | print $FS->feature_string($phone), "\n" if /\#\#/; 273 | } 274 | } 275 | } 276 | 277 | if ($use_html and $num_words > 0) { 278 | print CGI::end_table(); 279 | } 280 | 281 | if ($use_html) { 282 | print CGI::p({-style => 'font-size: small;'}, 283 | "Generated by Gleb", 284 | "version $version / $FS->{version} ", 285 | $infile ? "from the file $infile." : "with seed $seed."); 286 | print CGI::end_html; 287 | } 288 | 289 | 290 | 291 | 292 | -------------------------------------------------------------------------------- /phonology/phon_descr.yml: -------------------------------------------------------------------------------- 1 | --- 2 | syllable_slots: 3 | - "+syllabic: V" 4 | - "-syllabic dorsal +resonant -lateral -tap_or_trill: G" 5 | - "-syllabic +nasal: N" 6 | - "-syllabic +resonant|-syllabic +nasal: R" 7 | - "-syllabic: C" 8 | - ": *" # isn't underlyingly generated at present, but still can show up 9 | 10 | 11 | extra_natural_classes: 12 | - "-fricative -nasal" 13 | - "-resonant -nasal" 14 | 15 | 16 | table_structure: 17 | subtables: syllabic 18 | name: phone 19 | name_first: 1 20 | dominant: 1 # set thus for groups of vowels and semivowels 21 | 0: 22 | name: consonant 23 | caption: Consonants 24 | # rfaNPiTTl LCDlARLsHflpbrpAl vcS 25 | order: "resonant fricative affricate !nasal !prenasalised implosive !tap_or_trill !trill long; !labial !coronal !dorsal labiodental !anterior !retroflex !laminal sibilant !high front lateral pharyngealised back round palatalised_velar !ATR low; voice constricted_glottis !spread_glottis" 26 | flips: 27 | "dorsal": front 28 | "-dorsal -coronal -labial": pharyngealised 29 | undefineds: 30 | "-dorsal -coronal -labial +pharyngealised -constricted_glottis": "+fricative +spread_glottis" # mostly to keep them from nestling alongside /h/ 31 | "-dorsal -coronal -labial +constricted_glottis": "-fricative -affricate" 32 | "-dorsal -coronal -labial +spread_glottis": "+fricative" 33 | "-dorsal -coronal -labial": "-resonant -prenasalised -implosive -lateral -retroflex" # TODO: if e.g. trill ever becomes valid on an obstruent, put it here 34 | "dorsal +resonant +front": "-palatalised_velar" 35 | collapse: "low ATR long palatalised_velar affricate labiodental c_with_tSj sibilant laminal trill lateral tap_or_trill implosive spread_glottis constricted_glottis back kp_with_w round pharyngealised front retroflex high anterior prenasalised voice" 36 | named_collapses: 37 | c_with_tSj: 38 | from: "dorsal +front" 39 | to: "-dorsal coronal -anterior -retroflex +sibilant +front -high" # +front for flips 40 | avoid_unless: "-dorsal coronal -anterior -retroflex" 41 | type: columns 42 | kp_with_w: 43 | from: "dorsal labial" 44 | to: "+round -labial" 45 | avoid_unless: "dorsal +round -labial" 46 | type: columns 47 | labels: 48 | rows: 49 | - "-resonant +nasal -fricative -affricate: nasal" 50 | - "-resonant -fricative +affricate: affricate" 51 | - "-resonant +implosive -fricative: implosive" 52 | - "-resonant -nasal -fricative: stop" 53 | - "-resonant +fricative: fricative" 54 | - "-resonant -fricative: stop or nasal" 55 | - "-resonant -nasal: obstruent" 56 | - "-resonant: obstruent or nasal" 57 | - "+resonant +tap_or_trill +trill: trill" 58 | - "+resonant +tap_or_trill -trill: tap" 59 | - "+resonant +tap_or_trill: tap or trill" 60 | - "+resonant -tap_or_trill: approximant" 61 | - "+resonant: resonant" 62 | rows_mod: 63 | - "+nasal: nasal []" 64 | - "+implosive: implosive []" 65 | - "+prenasalised: prenasalised []" 66 | - "+long: long []" 67 | columns: 68 | - "labial -labiodental: bilabial" 69 | - "labial +labiodental: labiodental" 70 | - "labial: labial" 71 | - "coronal +anterior: alveolar" 72 | - "coronal -anterior +retroflex: retroflex" 73 | - "coronal -anterior -retroflex +front -dorsal: alveopal{atal}" # -dorsal is a stopgap against my crude modifiers 74 | - "coronal -anterior -retroflex: palatoalv{eolar}" 75 | - "coronal -anterior: postalv{eolar}" 76 | - "coronal: coronal" 77 | # do we need all these height pairs? 78 | - "dorsal -front +high +back: velar" # so we don't see "back velar" 79 | - "dorsal -front +high: velar" 80 | - "dorsal -front -high +back: uvular" 81 | - "dorsal -high: uvular" 82 | - "dorsal -front: vel{ar} or uvul{ar}" 83 | - "dorsal +front -back -palatalised_velar +high: palatal" 84 | - "dorsal +front -back +palatalised_velar +high: palatalised velar" 85 | - "dorsal +front -back -palatalised_velar: palatal" 86 | - "dorsal +front -back +palatalised_velar: palatalised velar" 87 | - "dorsal +front -back -high: palatalised uvular" 88 | - "dorsal +front -back +high: palatal" 89 | - "dorsal +high: pal{atal} or vel{ar}" 90 | - "dorsal: dorsal" 91 | - "-dorsal -coronal -labial +pharyngealised: pharyngeal" 92 | - "-dorsal -coronal -labial -pharyngealised: glottal" 93 | - "-dorsal -coronal -labial: radical" 94 | columns_mod: # Beware of flip triggers here. 95 | # special-cased a little, so they only show when unmarked 96 | - "+laminal coronal +anterior: laminal []" 97 | - "+laminal coronal +retroflex: laminal []" 98 | - "-laminal coronal -anterior -retroflex: apical []" 99 | - "+retroflex: retroflexed []" # needed for semivowels and the like 100 | - "-back -front dorsal: fronted []" 101 | - "+back -front -dorsal: velarised []" 102 | - "+pharyngealised: pharyng{ealised} []" 103 | - "+round: rounded []" 104 | - "+front -back -dorsal: palatalised []" 105 | - "+lateral: [] lateral" 106 | - "+sibilant: [] sibilant" 107 | modificate: columns 108 | repeat_columns: "-" 109 | pre_other: 110 | - "+resonant dorsal -tap_or_trill -lateral: semivowel" 111 | # the below is a kluge of some sort against strippings 112 | - "-dorsal -coronal -labial -pharyngealised +constricted_glottis: glottal stop" 113 | - "-dorsal -coronal -labial -pharyngealised +spread_glottis: glottal fricative" 114 | - "-dorsal -coronal -labial -pharyngealised: glottal" 115 | - "-dorsal -coronal -labial +pharyngealised: pharyngeal" 116 | - "-dorsal -coronal -labial: radical" 117 | pre_other_mod: 118 | - "+high -low -ATR +resonant dorsal -tap_or_trill -lateral: near-high []" 119 | - "+high -low +resonant dorsal -tap_or_trill -lateral: high []" 120 | - "-high -low +ATR +resonant dorsal -tap_or_trill -lateral: mid-high []" 121 | - "-high -low -ATR +resonant dorsal -tap_or_trill -lateral: mid-low []" 122 | - "-high -low +resonant dorsal -tap_or_trill -lateral: mid []" 123 | - "+low -high +resonant dorsal -tap_or_trill -lateral: low []" 124 | other_mod: 125 | - "-voice +constricted_glottis: ejective []" 126 | - "-voice: voiceless []" 127 | - "+voice +spread_glottis: breathy-voiced []" 128 | - "+voice +constricted_glottis: creaky-voiced []" 129 | - "+voice: voiced []" 130 | - "+spread_glottis: aspirated []" 131 | - "+constricted_glottis: glottalised []" 132 | - "+front: front []" # I think this is clearer than "palatalised" or the like 133 | - "+back: back []" # ditto 134 | - "+laminal: laminal []" 135 | - "-laminal: apical []" 136 | - "-ATR: lax []" 137 | - "+ATR: tense []" 138 | negate: 139 | "rounded": "unrounded" 140 | "aspirated": "unaspirated" 141 | "long": "short" 142 | "sibilant": "non-sibilant coronal" # kluge, but for now for clarity 143 | special_negate: 144 | "-resonant -nasal": "sonorant" 145 | nominalised: 146 | eliminate: 147 | "dorsal coronal labial": "retroflex front back round labiodental anterior laminal high palatalised_velar" 148 | "dorsal retroflex labiodental": "dorsal coronal labial retroflex front back round labiodental anterior laminal high palatalised_velar" 149 | other: 150 | - "dorsal coronal labial: place of articulation" 151 | - "dorsal retroflex labiodental: place of articulation" # a total revolting kluge, but needed for the counterstripping rules 152 | - "dorsal: dorsality" 153 | - "coronal: coronality" 154 | - "labial: labiality" 155 | - "resonant: resonancy" 156 | - "nasal: nasality" 157 | - "prenasalised: prenasalisation" 158 | - "fricative: fricativity" 159 | - "affricate: affrication" 160 | - "tap_or_trill: vibrancy" 161 | - "trill: trill-hood" 162 | - "implosive: implosivity" 163 | - "voice: voice" 164 | - "spread_glottis: aspiration" 165 | - "constricted_glottis: glottalisation" 166 | - "labiodental: labiodentality" 167 | - "anterior: anteriority" 168 | - "laminal: laminality" 169 | - "sibilant: sibilancy" 170 | - "retroflex: retroflexion" 171 | - "high: uvularity" # which it loves to use as a term for height 172 | - "front: palatalisation" # secondary more common than primary? 173 | - "palatalised_velar: whether palatal or palatalised velar" # ick 174 | - "round: rounding" 175 | - "back: velarisation" # secondary more common than primary? 176 | - "ATR: laxity" 177 | - "lateral: laterality" 178 | - "pharyngeal: pharyngealisation" 179 | - "long: length" 180 | modificate: other # kluge. necessary for repeat_ behaviour 181 | repeat_other: " and " 182 | 1: 183 | name: vowel 184 | caption: Vowels 185 | order: "resonant fricative affricate !prenasalised implosive !tap_or_trill trill low !high !ATR; !labial !coronal !dorsal labiodental !anterior sibilant !laminal lateral palatalised_velar !front back; round retroflex pharyngealised nasal long !voice constricted_glottis spread_glottis" 186 | collapse: "palatalised_velar spread_glottis constricted_glottis affricate labiodental sibilant trill lateral tap_or_trill implosive laminal pharyngealised retroflex anterior prenasalised voice round ATR" 187 | labels: 188 | eliminate: 189 | "+high": "low" 190 | "+low": "high" 191 | "+front": "back" 192 | "+back": "front" 193 | rows: 194 | - "+high -ATR: near-high" 195 | - "+high: high" 196 | - "-high -low +ATR: mid-high" 197 | - "-high -low -ATR: mid-low" 198 | - "-high -low: mid" 199 | - "+low: low" 200 | rows_mod: ~ 201 | columns: 202 | - "+front: front" 203 | - "-front -back: central" 204 | - "+back: back" 205 | columns_mod: ~ 206 | modificate: rows columns 207 | other_mod: 208 | - "-ATR: lax []" 209 | - "+ATR: tense []" 210 | - "+round: rounded []" 211 | - "+retroflex: retroflex []" 212 | - "+pharyngealised: pharyngealised []" 213 | - "+nasal: nasal []" 214 | - "+long: long []" 215 | - "-voice +constricted_glottis: ejective []" 216 | - "-voice: voiceless []" 217 | - "+voice +spread_glottis: breathy-voiced []" 218 | - "+voice +constricted_glottis: creaky-voiced []" 219 | - "+voice: voiced []" 220 | - "+spread_glottis: aspirated []" 221 | - "+constricted_glottis: glottalised []" 222 | negate: 223 | "rounded": "unrounded" 224 | "aspirated": "unaspirated" 225 | "long": "short" 226 | nominalised: 227 | other: 228 | - "high low: height" 229 | - "high: highness" 230 | - "low: lowness" 231 | - "front: frontness" 232 | - "back: backness" 233 | - "ATR: tenseness" 234 | - "round: rounding" 235 | - "retroflex: retroflexion" 236 | - "pharyngealised: pharyngealisation" 237 | - "nasal: nasality" 238 | - "long: length" 239 | - "voice: voice" 240 | - "spread_glottis: aspiration" 241 | - "constricted_glottis: glottalisation" 242 | 243 | 244 | --------------------------------------------------------------------------------