├── .gitattributes ├── .gitignore ├── README.md ├── charmodel └── charmodel.go ├── earley └── earley.go ├── environment └── environment.go ├── grammar └── grammar.go ├── interpreter └── interpreter.go ├── lexer └── lexer.go ├── lexfuncs.go ├── logopoeist.go ├── parser ├── parsefuncs.go └── parser.go ├── test.lgp ├── types └── types.go └── wordmodel └── wordmodel.go /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Logopoeist 2 | ========== 3 | 4 | Logopoeist is a random word-generation system for conlangers (creators of constructed languages) which conforms to user-specified phonotactic rules as well as user-specified conditional probability distributions to control phoneme frequencies. 5 | 6 | Pre-compiled binaries for 64-bit Windows, Mac OS, and Linux can be downloaded on the release page: https://github.com/conlang-software-dev/Logopoeist/releases/tag/v0.1 7 | 8 | Logopoeist is a command-line program, and takes the following arguments: 9 | 10 | * `-file {string}`: the name of an input configuration file. If absent, Logopoeist will try to read configuration commands from standard input. 11 | * `-n {int}`: the number of random words to output. Defaults to 10. 12 | * `-lmin {uint}`: the minimum length of words to output. Defaults to 0. 13 | * `-lmax {uint}`: the maximum length of words to output. Defaults to unbounded. 14 | 15 | A sample configuration for a strict-CV language with vowel harmony is provided in `test.lgp`. 16 | 17 | Word Generation 18 | --------------- 19 | 20 | Logopoeist generates words by randomly selecting phoneme/grapheme tokens from a distribution that is calculated for each position by intersecting information from an n-gram character model and syllable structure model. An incremental Earley chart parser is used to keep track of all of the possible partial-parses that satisfy the syllable structure rules given whatever phonemes have been generated so far; the parser state is examined to produce a combined distribution for all possible phonemes that could be added and still produce a valid parse, weighted by the probability of each partial parse in the Earley chart at that position. The n-gram model is then used to determine what distribution of phonemes would be allowed in the same position given the previous context. These two distributions are then intersected, and a random phoneme is selected from the resulting joint distribution to fill in that slot, which then further constrains the possible parses and n-gram environments for the next position. Word boundaries are produced (thus terminating the productino of one word) by considering the relative total probability of all complete parses vs. all incomplete parses at a given position; that ratio is then used to make a weighted random choice to produce a completed word or to keep going. 21 | 22 | Several possible situations can arise that result in a failed production- a state where there are no possible phonemes that can be added, but the word also isn't complete according to the syllable structure rules. These include things like disjoint distributions produced by the n-gram and syllable structure models, production of a word that is too short or too long compared to the limits set by the user, or production of a duplicate word that's already been seen before. In any of these cases, Logopoeist will use recursive back-tracking; the parser state is rewound, the last produced phoneme is discarded and removed from the distribution so that it cannot be selected for the same environment again, and Logopoeist tries again with a different randomly selected phoneme (or, in the case of a too-short word, the word boundary is discarded and the system starts the next-phoneme selection process for the first time). This guarantees that the word generator will make progress and produce new output in finite time, without retracing failed paths that it had already explored, while still matching whatever number of output words were requested; i.e., it does not have to randomly generate possibly-colliding words for an unbounded amount of time, hoping to accumulate as many as you asked for; nor does it run a fixed number of cycles, showing you maybe as many unique words as you requested, but maybe less, after filtering duplicates. Additionally, the recursive backtracking strategy allows Logopoeist to detect when it has completely exhausted the finite number of options permitted in a certain range, and inform you of that fact, rather than freezing up while continuing to look for more options that don't exist. Unfortunately, however, it cannot detect infinite grammars- syllable structure rules that do not permit any finite words. If you're not careful, and feed it an infinite grammar with no maximum word length specified, it will loop forever (if a maximum word length is specified, it will helpfully inform you that no valid words exist in the given range). 23 | 24 | The system does start to slow down eventually, due to increased need for backtracking, after generating large numbers of unique words. For practical purposes, however, it is quite fast. For example, it can generate all 6156 possible 8-letter words allowed by the sample configuration file in about 15 seconds, and then helpfully inform you that there are no more valid words of that length. 25 | 26 | Configuration 27 | ------------- 28 | 29 | Logopoeist uses a simple domain-specific programming language to describe the allowed shapes of words in its config files. The LGP language has three kinds of statements: 30 | 31 | 1. Variable declarations 32 | 2. Word Syntax rules 33 | 3. Conditional probability rules 34 | 35 | ### Variable Declarations 36 | 37 | Variable declarations let you give names to sets (or classes) of characters, so that you can use them multiple times in the phonotactic rules. Variable declarations have the form 38 | 39 | {C-var} = {C-class} 40 | 41 | where `{C-var}` is a character class variable name, and `{C-class}` is either another variable name, or a literal character class. Character class variables always begin with a hash symbol (`#`). Literal character classes have the form 42 | 43 | <{char} *{Frequency} ...> 44 | 45 | where `{char}` is some string of characters representing a phoneme (not limiting it to a single typable character allows you to treat digraphs, trigraphs, and other sequences as single characters from the point of view of phonotactics), and `{Frequency}` is a number specifying the relative frequency of that phoneme compared to others in the same set. The `*{Frequency}` setting after each phoneme is optional, and will be automatically set to 1 if not specified. 46 | 47 | ### Word Syntax Rules 48 | 49 | Word Syntax rules describe the high-level phonotactic structure of a language in terms of a probabilistic context-free grammar. 50 | 51 | Syntax rules have the general form 52 | 53 | {S-var} -> [Replacement List] *{Frequency} 54 | 55 | Where `{S-var}` is a syntax variable, `[Replacement List]` is a space-separated list of syntax variables, character class variables, or literal character classes. As with character classes, the frequency specification is optional, and defaults to 1 if not specified; in this case, it specifies how frequently a particular substitution rule will be applied when more than one is available for the same syntax variable. Syntax variables always begin with a dollar symbol (`$`). 56 | 57 | The left-hand symbol for the _first_ syntax rule in a configuration file will be used as the starting symbol for the probabilistic grammar. 58 | 59 | ### Conditional Probability Rules 60 | 61 | Conditional Probability rules are used to manipulate the frequency of certain phonemes in particular contexts, given by the two preceding phonemes in a given word. They have the general form 62 | 63 | _ {C-class} ... -> {C-class} 64 | 65 | where each `{C-class}` is either a character class variable or a character class literal. The leading underscore is optional; if present, it indicates a word boundary. Thus, `_ -> #A` specifies a distribution of phonemes that can come at the beginning of a word, while `_ #A -> #B` specifies a distribution of phonemes that can come second in a word. 66 | 67 | Classes preceding the arrow (`->`) are known as _conditioning classes_. The frequencies of the conditioning classes are ignored, and they are used to generate _conditioning n-grams_ from all combinations of phonemes that can be selected from each class in order (i.e., the Cartesian product). The conditional distribution to the right of the arrow is assigned to all positions following any conditioning n-gram derived from the current rule. 68 | 69 | A variant on conditional probability rules can be used to indicate that certain phonemes are disallowed in specific position. These have the form 70 | 71 | _ {C-class} ... !> {C-class} 72 | 73 | Note the differently shaped arrow- `!>` instead of `->`. In these rules, frequencies are ignored in both conditioning and conditional classes; members of the conditional class are assigned zero probability after any conditioning n-grams derived from these rules, overriding whatever other distributions they might have had due to other rules. 74 | 75 | 76 | first creates a phonotactic template for that word using the probabilistic word grammar by replacing syntax variables until a list of only character classes (indicated either by variables or literal character classes) is left. 77 | 78 | At that point, the template is filled in from left-to-right by 79 | 80 | 1. Examining the distribution for the character class. 81 | 2. Intersecting the template distribution with any conditional distributions given by the previous phonemes already generated for the word. 82 | 3. Randomly selecting a character from the resulting distribution. 83 | 84 | If there are no conditional distribution rules that apply at a certain position, the conditional distribution is implicitly a uniform distribution over all phonemes in the language. Intersecting distributions means eliminating phonemes that are not present in both distributions, and the multiplying the relative frequencies for each phoneme from each distribution. 85 | 86 | TODO 87 | ---- 88 | 89 | There are plenty of ways that Logopoeist could be improved, so feel free to make suggestions and/or pull requests! -------------------------------------------------------------------------------- /charmodel/charmodel.go: -------------------------------------------------------------------------------- 1 | package charmodel 2 | 3 | import "strings" 4 | import . "github.com/conlang-software-dev/Logopoeist/types" 5 | 6 | type ngrams map[string]*CharSet 7 | 8 | type CharModel struct { 9 | conds ngrams 10 | excls ngrams 11 | } 12 | 13 | func NewModel() *CharModel { 14 | return &CharModel{ 15 | conds: make(ngrams), 16 | excls: make(ngrams), 17 | } 18 | } 19 | 20 | func (m *CharModel) AddCondition(ngram string, dist *CharSet) { 21 | if ndist, ok := m.conds[ngram]; ok { 22 | // copy the old map in case it was shared, 23 | union := make(CharSet, len(*ndist)) 24 | m.conds[ngram] = &union 25 | for k, v := range *ndist { 26 | union[k] = v 27 | } 28 | 29 | // then union with the current distribution 30 | for k, v := range *dist { 31 | if _, ok := union[k]; ok { 32 | union[k] += v 33 | } else { 34 | union[k] = v 35 | } 36 | } 37 | } else { 38 | m.conds[ngram] = dist 39 | } 40 | } 41 | 42 | func (m *CharModel) AddExclusion(ngram string, dist *CharSet) { 43 | if edist, ok := m.excls[ngram]; ok { 44 | 45 | // create a new map in case the original was shared 46 | union := make(CharSet, len(*edist)) 47 | m.excls[ngram] = &union 48 | 49 | for k := range *edist { 50 | union[k] = 0 51 | } 52 | for k := range *dist { 53 | union[k] = 0 54 | } 55 | } else { 56 | // reference a single common object as much as possible 57 | m.excls[ngram] = dist 58 | } 59 | } 60 | 61 | func (m *CharModel) CalcDistribution(base *CharSet, context []string) CharSet { 62 | ndist := make(CharSet, len(*base)) 63 | for k, v := range *base { 64 | ndist[k] = v 65 | } 66 | 67 | // iterate over conditioning ngrams 68 | order := len(context) 69 | for j := order; j > 0; j-- { 70 | ngram := strings.Join(context[order-j:order], "") 71 | 72 | // remove any exclusions 73 | if edist, ok := m.excls[ngram]; ok { 74 | for char, _ := range *edist { 75 | delete(ndist, char) 76 | } 77 | } 78 | 79 | // intersect with conditional distributions 80 | if cdist, ok := m.conds[ngram]; ok { 81 | for char, nweight := range ndist { 82 | if cweight, ok := (*cdist)[char]; ok { 83 | ndist[char] = nweight * cweight 84 | } else { 85 | delete(ndist, char) 86 | } 87 | } 88 | } 89 | } 90 | return ndist 91 | } 92 | -------------------------------------------------------------------------------- /earley/earley.go: -------------------------------------------------------------------------------- 1 | package earley 2 | 3 | import . "github.com/conlang-software-dev/Logopoeist/parser" 4 | import . "github.com/conlang-software-dev/Logopoeist/grammar" 5 | import . "github.com/conlang-software-dev/Logopoeist/types" 6 | import . "github.com/conlang-software-dev/Logopoeist/environment" 7 | 8 | type state struct { 9 | lhs string 10 | rhs []*Node 11 | dot uint 12 | start uint 13 | terminal bool 14 | weight float64 15 | } 16 | 17 | func (s *state) iscomplete() bool { 18 | return s.dot >= uint(len(s.rhs)) 19 | } 20 | 21 | func (s *state) needNonTerminal() bool { 22 | return s.rhs[s.dot].Type == SVar 23 | } 24 | 25 | func (s *state) equals(other *state) bool { 26 | if s.terminal != other.terminal || 27 | s.lhs != other.lhs || 28 | s.dot != other.dot || 29 | s.start != other.start { 30 | return false 31 | } 32 | if len(s.rhs) != len(other.rhs) { 33 | return false 34 | } 35 | for i, n := range s.rhs { 36 | o := other.rhs[i] 37 | if n.Type != o.Type || n.Value != o.Value { 38 | return false 39 | } 40 | } 41 | return true 42 | } 43 | 44 | type EarleyParser struct { 45 | parent *EarleyParser 46 | level uint 47 | synmodel Grammar 48 | env Environment 49 | root string 50 | column []*state 51 | finished bool 52 | } 53 | 54 | func NewParser(env Environment, g Grammar, root string) *EarleyParser { 55 | np := &EarleyParser{ 56 | parent: nil, 57 | level: 0, 58 | env: env, 59 | synmodel: g, 60 | root: root, 61 | column: []*state{}, 62 | finished: false, 63 | } 64 | 65 | np.init() 66 | return np 67 | } 68 | 69 | func newLevel(p *EarleyParser) *EarleyParser { 70 | return &EarleyParser{ 71 | parent: p, 72 | level: p.level + 1, 73 | env: p.env, 74 | synmodel: p.synmodel, 75 | root: p.root, 76 | column: []*state{}, 77 | finished: false, 78 | } 79 | } 80 | 81 | func (p *EarleyParser) init() { 82 | if rset, ok := p.synmodel.Rules(p.root); ok { 83 | for i, rhs := range rset.Rules { 84 | p.addToChart(&state{ 85 | lhs: p.root, 86 | rhs: rhs, 87 | dot: 0, 88 | start: 0, 89 | terminal: false, 90 | weight: rset.Weights[i], 91 | }) 92 | } 93 | } 94 | p.process() 95 | } 96 | 97 | func (p *EarleyParser) IsFinished() bool { 98 | return p.finished 99 | } 100 | 101 | func (p *EarleyParser) IsEmpty() bool { 102 | return len(p.column) == 0 103 | } 104 | 105 | func (p *EarleyParser) addToChart(s *state) { 106 | for _, old := range p.column { 107 | if s.equals(old) { 108 | old.weight += s.weight 109 | return 110 | } 111 | } 112 | p.column = append(p.column, s) 113 | } 114 | 115 | func (p *EarleyParser) getColumn(index uint) []*state { 116 | for p.level > index { 117 | p = p.parent 118 | } 119 | return p.column 120 | } 121 | 122 | func (chart *EarleyParser) scan(s *state, token string) { 123 | if s.iscomplete() { 124 | return 125 | } 126 | 127 | term := s.rhs[s.dot] 128 | if term.Type != CVar { 129 | return 130 | } 131 | 132 | chars, ok := chart.env.Lookup(term.Value) 133 | if !ok { 134 | return 135 | } 136 | 137 | if chars.Contains(token) { 138 | chart.addToChart(&state{ 139 | lhs: term.Value, 140 | rhs: []*Node{}, // could store the token here, but it's not necessary for our purposes 141 | dot: 1, // 0 would work as well, since rhs is empty; the point is to make this state "finished" 142 | start: chart.level - 1, 143 | terminal: true, 144 | weight: s.weight, 145 | }) 146 | } 147 | } 148 | 149 | func (chart *EarleyParser) predict(s *state) { 150 | g := chart.synmodel 151 | term := s.rhs[s.dot] 152 | if term.Type != SVar { 153 | return 154 | } 155 | if rset, ok := g.Rules(term.Value); ok { 156 | for i, rhs := range rset.Rules { 157 | chart.addToChart(&state{ 158 | lhs: term.Value, 159 | rhs: rhs, 160 | dot: 0, 161 | start: chart.level, 162 | terminal: false, 163 | weight: s.weight * rset.Weights[i], 164 | }) 165 | } 166 | } 167 | } 168 | 169 | func (chart *EarleyParser) complete(s *state) { 170 | for _, old := range chart.getColumn(s.start) { 171 | if old.iscomplete() { 172 | continue 173 | } 174 | term := old.rhs[old.dot] 175 | t := SVar 176 | if s.terminal { 177 | t = CVar 178 | } 179 | if term.Type == t && term.Value == s.lhs { 180 | chart.addToChart(&state{ 181 | lhs: old.lhs, 182 | rhs: old.rhs, 183 | dot: old.dot + 1, 184 | start: old.start, 185 | terminal: false, 186 | weight: s.weight, 187 | }) 188 | } 189 | } 190 | } 191 | 192 | func (p *EarleyParser) process() { 193 | //can't range because p.column is altered during the loop 194 | for i := 0; i < len(p.column); i++ { 195 | s := p.column[i] 196 | if s.iscomplete() { 197 | if s.start == 0 && s.lhs == p.root { 198 | p.finished = true 199 | } 200 | p.complete(s) 201 | } else if s.needNonTerminal() { 202 | p.predict(s) 203 | } 204 | } 205 | //optional: filter out completed states to save memory 206 | } 207 | 208 | func (p *EarleyParser) Next(token string) (*EarleyParser, bool) { 209 | np := newLevel(p) 210 | for _, s := range p.column { 211 | np.scan(s, token) 212 | } 213 | 214 | np.process() 215 | return np, len(np.column) > 0 216 | } 217 | 218 | func (p *EarleyParser) TerminationProbability() float64 { 219 | done_weight := 0.0 220 | cont_weight := 0.0 221 | for _, s := range p.column { 222 | if s.iscomplete() { 223 | if s.start == 0 && s.lhs == p.root { 224 | done_weight += s.weight 225 | } 226 | } else { 227 | cont_weight += s.weight 228 | } 229 | } 230 | return done_weight / cont_weight 231 | } 232 | 233 | func (p *EarleyParser) AllowedTokens() *CharSet { 234 | cset := make(CharSet) 235 | for _, s := range p.column { 236 | if s.iscomplete() { 237 | continue 238 | } 239 | 240 | term := s.rhs[s.dot] 241 | if term.Type != CVar { 242 | continue 243 | } 244 | 245 | if sset, ok := p.env.Lookup(term.Value); ok { 246 | for k, v := range sset.Weights { 247 | if _, ok := cset[k]; ok { 248 | cset[k] += v * s.weight 249 | } else { 250 | cset[k] = v * s.weight 251 | } 252 | } 253 | } 254 | } 255 | return &cset 256 | } 257 | -------------------------------------------------------------------------------- /environment/environment.go: -------------------------------------------------------------------------------- 1 | package environment 2 | 3 | import "fmt" 4 | import "strconv" 5 | import . "github.com/conlang-software-dev/Logopoeist/parser" 6 | import . "github.com/conlang-software-dev/Logopoeist/types" 7 | import . "github.com/conlang-software-dev/Logopoeist/interpreter" 8 | 9 | type Environment map[string]*CharClass 10 | 11 | func (e Environment) Assign(varname string, n *Node) { 12 | e[varname] = e.GetClass(n) 13 | } 14 | 15 | var nextvar = 0 16 | 17 | func (e Environment) AssignNew(n *Node) string { 18 | nextvar += 1 19 | varname := strconv.Itoa(nextvar) 20 | e[varname] = e.GetClass(n) 21 | return varname 22 | } 23 | 24 | func (e Environment) Lookup(varname string) (*CharClass, bool) { 25 | if cclass, ok := e[varname]; ok { 26 | return cclass, true 27 | } 28 | return nil, false 29 | } 30 | 31 | func (e Environment) GetClass(n *Node) *CharClass { 32 | switch n.Type { 33 | case CVar: 34 | if cclass, ok := e.Lookup(n.Value); ok { 35 | return cclass 36 | } 37 | panic(fmt.Sprintf("Variable #%s referenced before definition", n.Value)) 38 | case Class: 39 | return InterpretClass(n) 40 | default: 41 | panic(fmt.Sprintf("Invalid Node Type for Character Class: %s", n.ToString())) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /grammar/grammar.go: -------------------------------------------------------------------------------- 1 | package grammar 2 | 3 | import . "github.com/conlang-software-dev/Logopoeist/parser" 4 | 5 | type RuleSet struct { 6 | total float64 7 | Weights []float64 8 | Rules [][]*Node 9 | } 10 | 11 | type Grammar map[string]*RuleSet 12 | 13 | func (g Grammar) AddRule(v string, rule []*Node, weight float64) { 14 | if rset, ok := g[v]; ok { 15 | rset.total += weight 16 | rset.Rules = append(rset.Rules, rule) 17 | rset.Weights = append(rset.Weights, weight) 18 | } else { 19 | g[v] = &RuleSet{ 20 | total: weight, 21 | Weights: []float64{weight}, 22 | Rules: [][]*Node{rule}, 23 | } 24 | } 25 | } 26 | 27 | func (g Grammar) Rules(v string) (*RuleSet, bool) { 28 | if ruleset, ok := g[v]; ok { 29 | return ruleset, true 30 | } 31 | return &RuleSet{}, false 32 | } 33 | -------------------------------------------------------------------------------- /interpreter/interpreter.go: -------------------------------------------------------------------------------- 1 | package interpreter 2 | 3 | import "fmt" 4 | import "strconv" 5 | import . "github.com/conlang-software-dev/Logopoeist/parser" 6 | import . "github.com/conlang-software-dev/Logopoeist/types" 7 | 8 | func InterpretNumber(n *Node) float64 { 9 | freq, err := strconv.ParseFloat(n.Value, 64) 10 | if err != nil { 11 | panic(fmt.Sprintf("Invalid numeric literal: %s", n.Value)) 12 | } 13 | return freq 14 | } 15 | 16 | func InterpretClass(n *Node) *CharClass { 17 | list := make([]string, 0, 10) 18 | weights := make(CharSet, 10) 19 | for sn := n.Left; sn != nil; sn = sn.Right { 20 | fnode := sn.Left 21 | phoneme := fnode.Left.Value 22 | freq := InterpretNumber(fnode.Right) 23 | 24 | if _, ok := weights[phoneme]; ok { 25 | weights[phoneme] += freq 26 | } else { 27 | weights[phoneme] = freq 28 | list = append(list, phoneme) 29 | } 30 | } 31 | return &CharClass{ 32 | List: list, 33 | Weights: weights, 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /lexer/lexer.go: -------------------------------------------------------------------------------- 1 | package lexer 2 | 3 | import "strings" 4 | import "io" 5 | 6 | type Item struct { 7 | Type string 8 | Token string 9 | } 10 | 11 | type StateFn func(*RuneBuffer, chan *Item) StateFn 12 | 13 | type RuneBuffer struct { 14 | in io.RuneReader 15 | r rune 16 | more bool 17 | } 18 | 19 | // Peek looks at the next rune but doesn't advance the input. 20 | func (rb *RuneBuffer) Peek() (rune, bool) { 21 | return rb.r, rb.more 22 | } 23 | 24 | // Next returns the next rune in the input. 25 | func (rb *RuneBuffer) Next() (rune, bool) { 26 | if !rb.more { 27 | return 0, false 28 | } 29 | r, more := rb.r, rb.more 30 | nr, _, err := rb.in.ReadRune() 31 | rb.r, rb.more = nr, (err == nil) 32 | return r, more 33 | } 34 | 35 | // accept consumes the next rune if it's from the valid set. 36 | func (rb *RuneBuffer) Accept(valid string) (rune, bool, bool) { 37 | r, ok := rb.Peek() 38 | if !ok { 39 | return r, false, false 40 | } 41 | if strings.IndexRune(valid, r) >= 0 { 42 | rb.Next() 43 | return r, true, true 44 | } 45 | return r, false, true 46 | } 47 | 48 | // accept consumes the next rune if it's not from the invalid set. 49 | func (rb *RuneBuffer) AcceptNot(invalid string) (rune, bool, bool) { 50 | r, ok := rb.Peek() 51 | if !ok { 52 | return r, false, false 53 | } 54 | if strings.IndexRune(invalid, r) < 0 { 55 | rb.Next() 56 | return r, true, true 57 | } 58 | return r, false, true 59 | } 60 | 61 | type Lexer struct { 62 | tokens chan *Item 63 | next *Item 64 | more bool 65 | } 66 | 67 | // Peek looks at the next token but doesn't advance the input. 68 | func (l *Lexer) Peek() (*Item, bool) { 69 | return l.next, l.more 70 | } 71 | 72 | // Next returns the next token from the input. 73 | func (l *Lexer) Next() (*Item, bool) { 74 | item, ok := l.next, l.more 75 | l.next, l.more = <-l.tokens 76 | return item, ok 77 | } 78 | 79 | func Lex(input io.RuneReader, start StateFn) *Lexer { 80 | tokens := make(chan *Item) 81 | 82 | go func() { 83 | r, _, err := input.ReadRune() 84 | buf := &RuneBuffer{ 85 | in: input, 86 | r: r, 87 | more: (err == nil), 88 | } 89 | 90 | for state := start; state != nil; { 91 | state = state(buf, tokens) 92 | } 93 | 94 | close(tokens) 95 | }() 96 | 97 | first, ok := <-tokens 98 | return &Lexer{ 99 | tokens: tokens, 100 | next: first, 101 | more: ok, 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /lexfuncs.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "bytes" 4 | import "strings" 5 | import . "github.com/conlang-software-dev/Logopoeist/lexer" 6 | 7 | func commentState(in *RuneBuffer, out chan *Item) StateFn { 8 | for { 9 | r, ok := in.Next() 10 | if !ok || r == '\n' { 11 | break 12 | } 13 | } 14 | return switchState 15 | } 16 | 17 | func numberState(in *RuneBuffer, out chan *Item) StateFn { 18 | buf := new(bytes.Buffer) 19 | for { 20 | r, ok, more := in.Accept("0123456789.") 21 | if !(more && ok) { 22 | break 23 | } 24 | buf.WriteRune(r) 25 | } 26 | out <- &Item{Type: "number", Token: buf.String()} 27 | return switchState 28 | } 29 | 30 | func symbolState(in *RuneBuffer, out chan *Item) StateFn { 31 | buf := new(bytes.Buffer) 32 | for { 33 | r, ok, _ := in.AcceptNot(" \t\r\n;*<>-!=") 34 | if !ok { 35 | break 36 | } 37 | buf.WriteRune(r) 38 | } 39 | 40 | out <- &Item{Type: "symbol", Token: buf.String()} 41 | return switchState 42 | } 43 | 44 | func arrowState(in *RuneBuffer, out chan *Item) StateFn { 45 | first, _ := in.Next() 46 | second, _ := in.Next() 47 | 48 | out <- &Item{Type: "arrow", Token: string([]rune{first, second})} 49 | return switchState 50 | } 51 | 52 | func phonemeState(in *RuneBuffer, out chan *Item) StateFn { 53 | buf := new(bytes.Buffer) 54 | for { 55 | r, ok, _ := in.AcceptNot(" \t\r\n*>") 56 | if !ok { 57 | break 58 | } 59 | if r == '\\' { // escape character 60 | in.Next() 61 | r, ok = in.Next() 62 | if !ok { 63 | break 64 | } 65 | } 66 | buf.WriteRune(r) 67 | } 68 | 69 | out <- &Item{Type: "phoneme", Token: buf.String()} 70 | return setState 71 | } 72 | 73 | func setState(in *RuneBuffer, out chan *Item) StateFn { 74 | if r, ok := in.Peek(); ok { 75 | switch { 76 | case strings.IndexRune(" \t\r\n", r) >= 0: 77 | for ok { //skip whitespace 78 | _, ok, _ = in.Accept(" \t\r\n") 79 | } 80 | return setState 81 | case strings.IndexRune("*/", r) >= 0: 82 | in.Next() 83 | out <- &Item{Type: string(r), Token: string(r)} 84 | return setState 85 | case strings.IndexRune("0123456789", r) >= 0: 86 | numberState(in, out) 87 | return setState 88 | case r == ';': 89 | in.Next() 90 | out <- &Item{Type: "EOL", Token: "EOL"} 91 | commentState(in, out) 92 | return setState 93 | case r == '>': 94 | in.Next() 95 | out <- &Item{Type: ">", Token: ">"} 96 | return switchState 97 | default: 98 | return phonemeState 99 | } 100 | } else { 101 | return nil 102 | } 103 | } 104 | 105 | func switchState(in *RuneBuffer, out chan *Item) StateFn { 106 | if r, ok := in.Peek(); ok { 107 | for ok { // skip spaces 108 | _, ok, _ = in.Accept(" \t\r") 109 | } 110 | switch { 111 | case strings.IndexRune(" \t\r", r) >= 0: 112 | for ok { // skip whitespace 113 | _, ok, _ = in.Accept(" \t\r") 114 | } 115 | return switchState 116 | case r == '\n': 117 | in.Next() 118 | out <- &Item{Type: "EOL", Token: "EOL"} 119 | return switchState 120 | case r == ';': 121 | in.Next() 122 | out <- &Item{Type: "EOL", Token: "EOL"} 123 | return commentState 124 | case strings.IndexRune("#$_*/=", r) >= 0: 125 | in.Next() 126 | out <- &Item{Type: string(r), Token: string(r)} 127 | return switchState 128 | case r == '<': 129 | in.Next() 130 | out <- &Item{Type: "<", Token: "<"} 131 | return setState 132 | case strings.IndexRune("-!", r) >= 0: 133 | return arrowState 134 | case strings.IndexRune("0123456789", r) >= 0: 135 | return numberState 136 | default: 137 | return symbolState 138 | } 139 | } 140 | out <- &Item{Type: "EOF", Token: "EOF"} 141 | return nil 142 | } 143 | -------------------------------------------------------------------------------- /logopoeist.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "fmt" 4 | import "bufio" 5 | import "os" 6 | import "flag" 7 | import "strings" 8 | 9 | import "github.com/conlang-software-dev/Logopoeist/lexer" 10 | import "github.com/conlang-software-dev/Logopoeist/parser" 11 | import . "github.com/conlang-software-dev/Logopoeist/wordmodel" 12 | 13 | func main() { 14 | var file *os.File 15 | var fname string 16 | var wcount int 17 | var min int 18 | var max int 19 | 20 | flag.StringVar(&fname, "file", "", "The name of the configuration file; defaults to standard input.") 21 | flag.IntVar(&wcount, "n", 10, "The number of words to generate; defaults to 10.") 22 | flag.IntVar(&min, "lmin", 0, "The minimum length of words; defaults to 0.") 23 | flag.IntVar(&max, "lmax", 0, "The maximum length of words; defaults to unbounded.") 24 | 25 | flag.Parse() 26 | 27 | if max > 0 && min > max { 28 | fmt.Printf("lmin must be less than lmax\n") 29 | return 30 | } 31 | 32 | if fname != "" { 33 | var err error 34 | file, err = os.Open(fname) 35 | if err != nil { 36 | fmt.Printf("Error opening source file.\n") 37 | return 38 | } 39 | defer file.Close() 40 | } else { 41 | file = os.Stdin 42 | } 43 | 44 | defer func() { 45 | if err := recover(); err != nil { 46 | fmt.Printf("Error: %s\n", err) 47 | } 48 | }() 49 | 50 | lex := lexer.Lex(bufio.NewReader(file), switchState) 51 | model := WordModel() 52 | for command := range parser.Parse(lex) { 53 | model.Execute(command) 54 | } 55 | 56 | for i := 0; i < wcount; i++ { 57 | if clist, ok := model.Generate(min, max); ok { 58 | word := strings.Join(clist, "") 59 | fmt.Printf("%s\n", word) 60 | continue 61 | } 62 | 63 | if min == 0 && max == 0 { 64 | if i == 0 { 65 | fmt.Printf("No Valid Words Found. Model May Be Inconsistent.") 66 | } else { 67 | fmt.Printf("Exhausted Unique Words.") 68 | } 69 | } else { 70 | if i == 0 { 71 | fmt.Printf("No Valid Words Found in the Given Range.") 72 | } else { 73 | fmt.Printf("Exhausted Unique Words in the Given Range.") 74 | } 75 | } 76 | return 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /parser/parsefuncs.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import "fmt" 4 | import . "github.com/conlang-software-dev/Logopoeist/lexer" 5 | 6 | func parseSVar(lex *Lexer) *Node { 7 | lex.Next() // skip $ sigil 8 | symbol, ok := lex.Next() 9 | if !ok || symbol.Type != "symbol" { 10 | panic("Parse error: Missing Syntax Variable") 11 | } 12 | return &Node{ 13 | Type: SVar, 14 | Value: symbol.Token, 15 | } 16 | } 17 | 18 | func parseCVar(lex *Lexer) *Node { 19 | lex.Next() // skip # sigil 20 | symbol, ok := lex.Next() 21 | if !ok || symbol.Type != "symbol" { 22 | panic("Parse error: Missing Class Variable") 23 | } 24 | return &Node{ 25 | Type: CVar, 26 | Value: symbol.Token, 27 | } 28 | } 29 | 30 | func parsePhoneme(lex *Lexer) *Node { 31 | loop: 32 | symbol, ok := lex.Next() 33 | if !ok || symbol.Type == ">" { 34 | return nil 35 | } 36 | if symbol.Type == "EOL" { 37 | goto loop 38 | } 39 | 40 | frequency := parseFrequency(lex) 41 | rest := parsePhoneme(lex) 42 | phoneme := &Node{ 43 | Type: Phoneme, 44 | Value: symbol.Token, 45 | } 46 | 47 | return &Node{ 48 | Type: Seq, 49 | Value: "", 50 | Right: rest, 51 | Left: &Node{ 52 | Type: Freq, 53 | Left: phoneme, 54 | Right: frequency, 55 | }, 56 | } 57 | } 58 | 59 | func parseClass(lex *Lexer) *Node { 60 | lex.Next() // skip < token 61 | phonemes := parsePhoneme(lex) 62 | return &Node{ 63 | Type: Class, 64 | Left: phonemes, 65 | } 66 | } 67 | 68 | func parseClassOrCVar(lex *Lexer) *Node { 69 | item, ok := lex.Peek() 70 | if !ok { 71 | panic("Parse error: Expected Character Class or Variable") 72 | } 73 | switch item.Type { 74 | case "#": 75 | return parseCVar(lex) 76 | case "<": 77 | return parseClass(lex) 78 | default: 79 | panic(fmt.Sprintf("Parse error: Expected Character Class or Variable; saw %s", item.Token)) 80 | } 81 | } 82 | 83 | func parseSubstitutions(lex *Lexer) *Node { 84 | item, ok := lex.Peek() 85 | if !ok { 86 | return nil 87 | } 88 | 89 | var left *Node 90 | switch item.Type { 91 | case "*", "EOL": 92 | return nil 93 | case "$": 94 | left = parseSVar(lex) 95 | case "#": 96 | left = parseCVar(lex) 97 | case "<": 98 | left = parseClass(lex) 99 | default: 100 | panic(fmt.Sprintf("Parse error: Unexpected Token %s in Syntax Rule", item.Token)) 101 | } 102 | 103 | right := parseSubstitutions(lex) 104 | return &Node{ 105 | Type: Seq, 106 | Left: left, 107 | Right: right, 108 | } 109 | } 110 | 111 | func parseFrequency(lex *Lexer) *Node { 112 | item, ok := lex.Peek() 113 | if !ok || item.Type != "*" { 114 | return &Node{ 115 | Type: Num, 116 | Value: "1", 117 | } 118 | } 119 | 120 | lex.Next() // skip * token 121 | item, ok = lex.Next() 122 | if !ok || item.Type != "number" { 123 | panic("Parse error: Missing Number") 124 | } 125 | 126 | return &Node{ 127 | Type: Num, 128 | Value: item.Token, 129 | } 130 | } 131 | 132 | func parseSyntax(lex *Lexer) *Node { 133 | left := parseSVar(lex) 134 | 135 | arrow, ok := lex.Next() 136 | if !ok || arrow.Token != "->" { 137 | panic("Parse error: Expected -> in syntax definition") 138 | } 139 | 140 | substitutions := parseSubstitutions(lex) 141 | frequency := parseFrequency(lex) 142 | 143 | return &Node{ 144 | Type: Production, 145 | Value: "", 146 | Left: left, 147 | Right: &Node{ 148 | Type: Freq, 149 | Left: substitutions, 150 | Right: frequency, 151 | }, 152 | } 153 | } 154 | 155 | func parseCondList(lex *Lexer) *Node { 156 | item, ok := lex.Peek() 157 | if !ok { 158 | return nil 159 | } 160 | 161 | var left *Node 162 | switch item.Type { 163 | case "EOL", "EOF", "arrow": 164 | return nil 165 | case "#": 166 | left = parseCVar(lex) 167 | case "<": 168 | left = parseClass(lex) 169 | default: 170 | panic(fmt.Sprintf("Parse error: Unexpected Token %s in Condition Expression", item.Token)) 171 | } 172 | 173 | right := parseCondList(lex) 174 | return &Node{ 175 | Type: Seq, 176 | Left: left, 177 | Right: right, 178 | } 179 | } 180 | 181 | func parseCondOrDef(lex *Lexer) *Node { 182 | var first *Node 183 | 184 | item, ok := lex.Peek() 185 | if !ok { 186 | panic("Invalid call to parseCondOrDef") 187 | } 188 | 189 | switch item.Type { 190 | case "#": 191 | first = parseCVar(lex) 192 | case "<": 193 | first = parseClass(lex) 194 | case "_": 195 | lex.Next() 196 | first = &Node{Type: Boundary} 197 | default: 198 | panic("Invalid call to parseCondOrDef") 199 | } 200 | 201 | item, ok = lex.Peek() 202 | if !ok || item.Type == "EOL" { 203 | if first.Type == CVar { 204 | panic("Parse error: Incomplete Variable Definition") 205 | } 206 | panic("Parse error: Incomplete Condition Expression") 207 | } 208 | 209 | if item.Type == "=" { 210 | if first.Type != CVar { 211 | panic("Parse error: Unexpected _") 212 | } 213 | 214 | lex.Next() // skip = token 215 | second := parseClassOrCVar(lex) 216 | return &Node{ 217 | Type: Definition, 218 | Left: first, 219 | Right: second, 220 | } 221 | } else { 222 | rest := parseCondList(lex) 223 | 224 | arrow, ok := lex.Next() 225 | if !ok { 226 | panic("Parse error: Missing Arrow in Condition Expression") 227 | } 228 | 229 | right := parseClassOrCVar(lex) 230 | left := &Node{ 231 | Type: Seq, 232 | Left: first, 233 | Right: rest, 234 | } 235 | 236 | switch arrow.Token { 237 | case "->": 238 | return &Node{ 239 | Type: Condition, 240 | Left: left, 241 | Right: right, 242 | } 243 | case "!>": 244 | return &Node{ 245 | Type: Exclusion, 246 | Left: left, 247 | Right: right, 248 | } 249 | default: 250 | panic("Parse error: Invalid Arrow in Condition Expression") 251 | } 252 | } 253 | } 254 | 255 | func parseCommand(lex *Lexer) *Node { 256 | item, ok := lex.Peek() 257 | for ok && item.Type != "EOF" { 258 | switch item.Type { 259 | case "#", "_", "<": 260 | return parseCondOrDef(lex) 261 | case "$": 262 | return parseSyntax(lex) 263 | case "EOL": 264 | lex.Next() 265 | item, ok = lex.Peek() 266 | default: 267 | panic(fmt.Sprintf("Parse error: Unexpected Token %s", item.Token)) 268 | } 269 | } 270 | return nil 271 | } 272 | -------------------------------------------------------------------------------- /parser/parser.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | import "fmt" 4 | import . "github.com/conlang-software-dev/Logopoeist/lexer" 5 | 6 | const ( // Node Types 7 | Production = iota 8 | Definition 9 | Condition 10 | Exclusion 11 | SVar 12 | CVar 13 | Class 14 | Phoneme 15 | Seq 16 | Freq 17 | Num 18 | Boundary 19 | ) 20 | 21 | type Node struct { 22 | Type int 23 | Value string 24 | Left *Node 25 | Right *Node 26 | } 27 | 28 | func (n *Node) ToString() string { 29 | if n == nil { 30 | return "" 31 | } 32 | switch n.Type { 33 | case Production: 34 | return fmt.Sprintf("%s -> %s\n", n.Left.ToString(), n.Right.ToString()) 35 | case Definition: 36 | return fmt.Sprintf("%s = %s\n", n.Left.ToString(), n.Right.ToString()) 37 | case Condition: 38 | return fmt.Sprintf("%s -> %s\n", n.Left.ToString(), n.Right.ToString()) 39 | case Exclusion: 40 | return fmt.Sprintf("%s !> %s\n", n.Left.ToString(), n.Right.ToString()) 41 | case SVar: 42 | return fmt.Sprintf("$%s", n.Value) 43 | case CVar: 44 | return fmt.Sprintf("#%s", n.Value) 45 | case Class: 46 | return fmt.Sprintf("<%s>", n.Left.ToString()) 47 | case Seq: 48 | if n.Right == nil { 49 | return n.Left.ToString() 50 | } 51 | return fmt.Sprintf("%s %s", n.Left.ToString(), n.Right.ToString()) 52 | case Freq: 53 | if n.Right.Value == "1" { 54 | return n.Left.ToString() 55 | } 56 | return fmt.Sprintf("%s *%s", n.Left.ToString(), n.Right.ToString()) 57 | case Num: 58 | return n.Value 59 | case Phoneme: 60 | return n.Value 61 | case Boundary: 62 | return "_" 63 | default: 64 | return "{unknown}" 65 | } 66 | } 67 | 68 | func Parse(lex *Lexer) chan *Node { 69 | nodes := make(chan *Node) 70 | 71 | go func() { 72 | 73 | defer func() { 74 | if err := recover(); err != nil { 75 | close(nodes) 76 | fmt.Printf("Error: %s\n", err) 77 | } 78 | }() 79 | 80 | for { 81 | item, ok := lex.Peek() 82 | if !ok || item.Type == "EOF" { 83 | close(nodes) 84 | return 85 | } 86 | nodes <- parseCommand(lex) 87 | } 88 | }() 89 | 90 | return nodes 91 | } 92 | -------------------------------------------------------------------------------- /test.lgp: -------------------------------------------------------------------------------- 1 | $W -> $W1 *2 ; Start symbol is $W 2 | $W -> $W2 *1 ; A word can be a $W1, or a $W2, with type 1 occuring twice as often 3 | $W1 -> $W1 $S1 ; A type-1 word is a list of type-1 syllables ($S1) 4 | $W1 -> $S1 ; After every syllable, it's equally likely to add one more or stop there 5 | $W2 -> $W2 $S2 ; Type-2 words are built the same way 6 | $W2 -> $S2 ; Thus, 1/2 of all words are 1 syllable, 1/4 are 2 syllables, etc. 7 | $S1 -> #C #V1 ; A type-1 syllable consist of a consonant and a type-1 vowel 8 | $S2 -> #C #V2 ; And similarly for type-2 syllables 9 | 10 | ; This means that all syllables are strictly CV, 11 | ; and all syllables in one word have the same kind 12 | ; of vowel, which is how we enforce vowel harmony 13 | 14 | #C =
; Consonants are
, or ; After a type-1 syllable starting with or