├── __init__.py
├── src
│   ├── __init__.py
│   ├── pyle.py
│   ├── logging.conf
│   ├── syllables.py
│   ├── lang.py
│   ├── syntax.py
│   ├── gen.py
│   ├── core.py
│   ├── _pattern.py
│   └── sce.py
├── .gitignore
├── license.txt
└── readme.md
/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | langs/ 3 | trees/ 4 | examples.py 5 | correspondences.py 6 | interface.py 7 | *.dat 8 | -------------------------------------------------------------------------------- /src/pyle.py: -------------------------------------------------------------------------------- 1 | '''Collection of conlanging tools 2 | '''''' 3 | ==================================== To-do ==================================== 4 | === Bug-fixes === 5 | 6 | === Implementation === 7 | 8 | === Features === 9 | 10 | === Style === 11 | ''' 12 | 13 | -------------------------------------------------------------------------------- /src/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,sce 3 | 4 | [handlers] 5 | keys=consoleHandler 6 | 7 | [formatters] 8 | keys=sceFormatter 9 | 10 | [logger_root] 11 | level=WARNING 12 | handlers=consoleHandler 13 | 14 | [logger_sce] 15 | level=WARNING 16 | handlers=consoleHandler 17 | qualname=sce 18 | propagate=0 19 | 20 | [handler_consoleHandler] 21 | class=StreamHandler 22 | level=DEBUG 23 | formatter=sceFormatter 24 | args=(sys.stdout,) 25 | 26 | [formatter_sceFormatter] 27 | format=%(asctime)s %(levelname)s:%(message)s 28 | datefmt=%d/%m/%Y %H:%M:%S -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Kathryn Spence 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
-------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Conlanger 2 | 3 | Conlanger is a package containing several tools designed primarily to aid conlangers with tedious tasks such as word 4 | generation and diachronics. To use, simply place the package on your Python path, and `import conlanger` into an 5 | interface script. 6 | 7 | ## Syntax 8 | 9 | `conlanger.syntax` is a stand-alone module for generating syntax tree images, using either dependency or 10 | constituency trees, from a textual list-based representation of trees - labelled lists for constituency trees, and 11 | unlabelled lists as an alternative for dependency trees. Requires Pillow as a dependency. 12 | 13 | ## Gen 14 | 15 | `conlanger.gen` is a module for generating words from syllables, where the syllable types 16 | and their graphemes are both distributed according to peaked power law distributions. Additionally, restrictions (both linear 17 | and non-linear) can be placed on what outputs are considered valid. 18 | 19 | ## SCE 20 | 21 | `conlanger.sce` is a module with powerful tools for transforming words according to transformation rules. Documentation 22 | of the rules can be found [here](http://www.dragonlinguistics.com/sce/doc.html). 23 | 24 | ## Lang 25 | 26 | `conlanger.lang` is a module providing support for storing the configuration data for the other modules on a 27 | per-language basis, as well as saving this data to and loading it from file. It also provides shortcuts to utilising the 28 | other modules with a given language, automatically providing the configuration data defined for that language. 29 |
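## Example

A minimal interface script might look like the following sketch. It assumes a language file `langs/mylang.dat` already exists with a gen config named `word` defined in it, and the sound change rule is purely illustrative of SCE notation (see the documentation linked above).

```python
from conlanger import lang, syntax

# Load the per-language configuration saved by conlanger.lang
mylang = lang.load('mylang')

# Generate ten words using the gen config named 'word'
words = mylang.gen('word', num=10)

# Apply a sound change ruleset to the generated words
changed = mylang.apply_ruleset(words, ['a > e / _#'])

# Render a constituency tree to a PIL image and save it
im = syntax.drawTree('[S [NP conlangs] [VP [V are] [A fun]]]', mode='const')
im.save('tree.png')
```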
-------------------------------------------------------------------------------- /src/syllables.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class Syllabifier: 5 | rules: tuple 6 | 7 | def __init__(self, cats, onsets=(), nuclei=(), codas=(), margins=(), constraints=()): 8 | from ._pattern import parsePatterns 9 | onsets = parsePatterns(onsets) 10 | nuclei = parsePatterns(nuclei) 11 | codas = parsePatterns(codas) 12 | margins = parsePatterns(margins) 13 | constraints = parsePatterns(constraints) 14 | rules = [] 15 | rules.extend(generateNonFinals(codas, onsets, nuclei)) # Medials 16 | rules.extend(generateFinals(codas, margins)) # Finals 17 | rules.extend(generateNonFinals(margins, onsets, nuclei)) # Initials 18 | self.rules = tuple(rule for rule in rules if checkValid(rule[0], constraints)) 19 | 20 | def __call__(self, word): 21 | breaks = [] 22 | # Step through the word 23 | pos = 0 24 | while pos < len(word): 25 | for rule, _breaks in self.rules: 26 | if rule == ['_', '#'] and pos in breaks: 27 | continue 28 | match, rpos = word.matchPattern(rule, pos)[:2] 29 | if match: 30 | # Compute and add breaks for this pattern 31 | for ix in _breaks: 32 | # Syllable breaks must be within the word and unique 33 | if 0 < pos+ix < len(word) and pos+ix not in breaks: 34 | breaks.append(pos+ix) 35 | # Step past this match 36 | pos = rpos 37 | if rule[-1] == '#': 38 | pos -= 1 39 | break 40 | else: # No matches here 41 | pos += 1 42 | return tuple(breaks) 43 | 44 | def generateNonFinals(codas, onsets, nuclei): 45 | rules = [] 46 | for crank, coda in enumerate(codas): 47 | if coda[-1] == '#': 48 | continue 49 | elif coda[-1] == '_': 50 | coda = coda[:-1] 51 | for orank, onset in enumerate(onsets): 52 | if onset[0] == '#': 53 | if coda == ['#']: 54 | onset = onset[1:] 55 | else: 56 | continue 57 | if onset == ['_']: 58 | onset = [] 59 | for nrank, nucleus in enumerate(nuclei): 60 | if nucleus[0] == '#': 61 | if coda == ['#'] and onset == []: 62 | nucleus = nucleus[1:] 63 | else: 64 | continue 65 | pattern = coda + onset + nucleus 66 | breaks = [len(coda)] 67 | if pattern[-1] == '#': 68 | breaks.append(len(pattern)-1) 69 | rank = crank + orank + nrank 70 | rules.append((pattern, breaks, rank)) 71 | return (r[:2] for r in sorted(rules, key=lambda r: r[2])) 72 | 73 | def generateFinals(codas, margins): 74 | rules = [] 75 | for mrank, margin in enumerate([margin for margin in margins if margin[-1] == '#']): 76 | if margin == ['_', '#']: 77 | margin = ['#'] 78 | for crank, coda in enumerate(codas): 79 | if coda[-1] == '#': 80 | if margin == ['#']: 81 | coda = coda[:-1] 82 | else: 83 | continue 84 | pattern = coda + margin 85 | breaks = [0 if coda == ['_'] else len(coda)] 86 | rank = crank + mrank 87 | rules.append((pattern, breaks, rank)) 88 | return (r[:2] for r in sorted(rules, key=lambda r: r[2])) 89 | 90 | def checkValid(rule, constraints): 91 | from .core import Cat # Deferred import - core imports this module at load time 92 | for constraint in constraints: 93 | for rpos in range(len(rule)-len(constraint)+1): # +1 so constraints aligned with the end of the rule are checked too 94 | for cpos, ctoken in enumerate(constraint): 95 | rtoken = rule[rpos+cpos] 96 | if isinstance(rtoken, str) and isinstance(ctoken, str): 97 | if rtoken == ctoken: 98 | continue 99 | elif isinstance(rtoken, str) and isinstance(ctoken, Cat): 100 | if rtoken in ctoken: 101 | continue 102 | elif isinstance(rtoken, Cat) and isinstance(ctoken, Cat): 103 | if rtoken <= ctoken: 104 | continue 105 | break 106 | else: 107 | return False 108 | return True 109 |
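# A rough usage sketch (hypothetical categories and patterns; it assumes the pattern
# parser accepts the '_' and '#' markers, as lang.py's default phonotactics do):
#
#     from .core import Cat, Word
#     from ._pattern import parsePatterns
#     cats = {'C': Cat(['p', 't', 'k'], 'C'), 'V': Cat(['a', 'i', 'u'], 'V')}
#     phonotactics = parsePatterns({'onsets': ['_', '[C]_'], 'nuclei': ['[V]'],
#                                   'codas': ['_', '_[C]'], 'margins': ['#_', '_#']}, cats)
#     syllabify = Syllabifier(cats, **phonotactics)
#     word = Word('patka', graphs=cats['C']+cats['V'], syllabifier=syllabify)
#     word.syllables  # -> a tuple of indices where syllable breaks fall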
-------------------------------------------------------------------------------- /src/lang.py: -------------------------------------------------------------------------------- 1 | '''Create and manipulate languages 2 | 3 | Classes: 4 | Config -- collection of gen.py configuration data 5 | Language -- represents a language 6 | 7 | Functions: 8 | load -- load the data from the named language file 9 | save -- save the given language's data to file 10 | '''''' 11 | ==================================== To-do ==================================== 12 | === Bug-fixes === 13 | 14 | === Implementation === 15 | Implement confirming overwriting save data - needs UI 16 | Maybe add different modes for each positional syllable type 17 | 18 | === Features === 19 | Add generating every possible word/root 20 | Language.apply_ruleset will be replaced by calls to the diachronics module, once that exists 21 | 22 | === Style === 23 | Consider where to raise/handle exceptions 24 | ''' 25 | 26 | from dataclasses import dataclass 27 | import os 28 | import json 29 | from .core import Cat, Syllabifier, parseCats 30 | from ._pattern import parsePatterns, unparsePattern 31 | from . import gen, sce 32 | 33 | os.chdir(os.path.dirname(os.path.abspath(__file__))) # Language files are in conlanger/langs/ 34 | 35 | ## Classes 36 | 37 | @dataclass 38 | class Config: 39 | patterns: dict 40 | constraints: list 41 | sylrange: range 42 | sylmode: list 43 | patternmode: list 44 | graphmode: list 45 | 46 | @dataclass 47 | class Language: 48 | '''Class for representing a single language. 49 | 50 | Instance variables: 51 | name -- language name (str) 52 | cats -- grapheme categories (dict) 53 | configs -- gen configuration data sets (dict) 54 | syllabifier -- syllabification function (Syllabifier) 55 | 56 | Methods: 57 | gen -- generate words 58 | apply_ruleset -- apply a sound change ruleset to a wordset 59 | ''' 60 | name: str 61 | cats: dict 62 | configs: dict 63 | phonotactics: dict 64 | syllabifier: Syllabifier 65 | 66 | def __init__(self, name='', cats=None, configs=None, phonotactics=None, syllabifier=None): 67 | '''Constructor for Language. 68 | 69 | Arguments: 70 | name -- language name (str) 71 | cats -- grapheme categories (dict) 72 | configs -- configuration data sets (dict) 73 | ''' 74 | self.name = name 75 | self.cats = parseCats(cats or {}) 76 | if 'graphs' not in self.cats: # Category 'graphs' must exist 77 | self.cats['graphs'] = Cat(["'"]) 78 | self.configs = {} 79 | if configs is None: 80 | configs = {} 81 | for config in configs: 82 | _config = configs[config].copy() 83 | _config['patterns'] = parsePatterns(_config['patterns'], self.cats) 84 | _config['constraints'] = parsePatterns(_config['constraints'], self.cats) 85 | _config['sylrange'] = range(_config['sylrange'][0], _config['sylrange'][1]+1) 86 | self.configs[config] = Config(**_config) 87 | phonotactics = parsePatterns(phonotactics or {'onsets': [], 'nuclei': [], 'codas': [], 'margins': []}, self.cats) 88 | self.phonotactics = {'nuclei': phonotactics['nuclei'], 'margins': []} 89 | # Need some default phonotactics instead of empty lists 90 | self.phonotactics['onsets'] = phonotactics['onsets'] or parsePatterns('_') 91 | self.phonotactics['codas'] = phonotactics['codas'] or parsePatterns('_') 92 | for margin in phonotactics['margins']: 93 | if (margin[0] == '#') != (margin[-1] == '#'): 94 | self.phonotactics['margins'].append(margin) 95 | if not any((margin[0] == '#') for margin in self.phonotactics['margins']): 96 | self.phonotactics['margins'].extend(parsePatterns('#_')) 97 | if not any((margin[-1] == '#') for margin in self.phonotactics['margins']): 98 | self.phonotactics['margins'].extend(parsePatterns('_#')) 99 | self.syllabifier = Syllabifier(self.cats, **self.phonotactics) 100 | 101 | @property 102 | def data(self): 103 | data = {} 104 | if self.name != '': 105 | data['name'] = self.name 106 | if self.cats != {}: 107 | data['cats'] = {name: list(cat) for name, cat in self.cats.items()} 108 | if self.configs != {}: 109 | data['configs'] = self.configs 110 | data['syllabifier'] = [] 111 | for rule in self.syllabifier.rules: 112 | rule, indices = rule 113 | rule = rule.copy() 114 | for i in reversed(indices): 115 | rule.insert(i, '$') 116 | data['syllabifier'].append(unparsePattern(rule)) 117 | if self.phonotactics is not None: 118 | data['phonotactics'] = {k: [unparsePattern(pattern) for pattern in v] for k, v in self.phonotactics.items()} 119 | return data 120 | 121 | def gen(self, config, num=1): 122 | '''Generates 'num' words using 'config'. 123 | 124 | Arguments: 125 | config -- config data to use 126 | num -- number of words to generate, 0 generates every possible word (int) 127 | 128 | Returns a list 129 | ''' 130 | if config not in self.configs: 131 | return [] 132 | if num == 0: # Generate every possible word, unimplemented 133 | return [] 134 | return [gen.genFromConfig(self.configs[config], self.cats['graphs'], syllabifier=self.syllabifier) for i in range(num)] 135 | 136 | def apply_ruleset(self, wordset, ruleset, output='list'): 137 | '''Runs the sound change 'ruleset' on the 'wordset'. 138 | 139 | Arguments: 140 | wordset -- the words to be changed (str, list) 141 | ruleset -- the sound changes to apply (str, list) 142 | output -- the desired output format (str) 143 | 144 | Returns a str or list 145 | ''' 146 | return sce.run(wordset, ruleset, self.cats, self.syllabifier, output) 147 | 148 | # == Functions == # 149 | def load(name): 150 | '''Loads language data from file. 151 | 152 | Arguments: 153 | name -- the name of the language file to load from 154 | 155 | Returns a Language 156 | ''' 157 | with open('langs/{}.dat'.format(name.lower()), 'r', encoding='utf-8') as f: 158 | data = json.load(f) 159 | return Language(**data) 160 | 161 | def save(lang): 162 | '''Saves a language to file. 163 | 164 | Arguments: 165 | lang -- the Language to save 166 | ''' 167 | data = lang.data 168 | # Check for existing save data 169 | with open('langs/{}.dat'.format(lang.name.lower()), 'a+', encoding='utf-8') as f: 170 | f.seek(0) 171 | if f.read(): 172 | if True: # Check if the user wants to overwrite this data - not implemented yet 173 | f.seek(0) 174 | f.truncate() 175 | else: 176 | return 177 | json.dump(data, f) 178 | 179 | def getcwd(): 180 | print(os.getcwd()) 181 |
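# A sketch of the JSON layout load() expects in langs/<name>.dat (the field names
# follow Language.__init__ and Language.data; all values here are hypothetical):
#
#     {
#         "name": "Mylang",
#         "cats": {"graphs": ["p", "t", "k", "a", "i", "u"], "V": ["a", "i", "u"]},
#         "configs": {"word": {"patterns": ..., "constraints": ..., "sylrange": [1, 3],
#                              "sylmode": ..., "patternmode": ..., "graphmode": ...}},
#         "phonotactics": {"onsets": [...], "nuclei": [...], "codas": [...], "margins": []}
#     }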
-------------------------------------------------------------------------------- /src/syntax.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ==================================== To-do ==================================== 3 | === Bug-fixes === 4 | 5 | === Implementation === 6 | Multi-word labels require that triangle thing 7 | Need to reimplement discontinuities somehow 8 | 9 | === Features === 10 | 11 | === Style === 12 | ''' 13 | 14 | import re 15 | from dataclasses import dataclass, field 16 | from math import floor 17 | from PIL import Image, ImageDraw, ImageFont 18 | from .core import LangException, Token, CompilerError, TokenError 19 | 20 | ## Constants 21 | SCALE = 10 22 | POINT_SIZE = 16*SCALE 23 | FONT = ImageFont.truetype('calibri.ttf', POINT_SIZE) 24 | GAP_WIDTH = POINT_SIZE # minimum horizontal spacing between trees 25 | GAP_HEIGHT = POINT_SIZE # minimum vertical spacing between layers 26 | LAYER_HEIGHT = GAP_HEIGHT + POINT_SIZE 27 | PADDING = POINT_SIZE # Padding around the edge of the image 28 | 29 | ## Tokens 30 | TOKENS = { 31 | 'LBRACKET': r'\[', 32 | 'RBRACKET': r'\]', 33 | 'WHITESPACE': r' +', 34 | 'QUOTED': r'\".*?\"', 35 | 'INDEX': r'[₀-₉]+', 36 | 'STRING': r'[^\[\]₀-₉ ]+', 37 | 'UNKNOWN': r'.'
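# (Alternation order matters when these are joined into TOKEN_REGEX below: UNKNOWN must stay last so it only matches characters no earlier token type claims.)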
38 | } 39 | TOKEN_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in TOKENS.items())) 40 | 41 | ## Exceptions 42 | class TreeException(LangException): 43 | pass 44 | 45 | class TreeFormatError(TreeException): 46 | pass 47 | 48 | class UnexpectedToken(TokenError): 49 | def __init__(self, token, expected=None): 50 | type = token.type.lower() 51 | if expected is None: 52 | super().__init__(f'Unexpected {type} token', token) 53 | else: 54 | super().__init__(f'Unexpected {type} token, expected {expected}', token) 55 | 56 | ## Classes 57 | @dataclass 58 | class Tree: 59 | label: str 60 | children: list = field(default_factory=list) 61 | 62 | def __len__(self): 63 | return len(self.children) 64 | 65 | def __getitem__(self, key): 66 | return self.children[key] 67 | 68 | def __iter__(self): 69 | yield from self.children 70 | 71 | def __str__(self): 72 | children = ' '.join(str(child) for child in self) 73 | return f'[{self.label} {children}]' if children else f'[{self.label}]' 74 | 75 | def __repr__(self): 76 | return f'Tree("{self}")' 77 | 78 | @staticmethod 79 | def make(string): 80 | tokens = list(tokenise(string)) 81 | if tokens[0].type == 'LBRACKET' and tokens[-1].type == 'RBRACKET': 82 | return compileTree(tokens[1:-1]) 83 | else: 84 | raise TreeFormatError('invalid syntax') 85 | 86 | ## Tree geometry 87 | @property 88 | def isleaf(self): 89 | return self.children == [] 90 | 91 | @property 92 | def depth(self): 93 | if self.isleaf: 94 | return 0 95 | else: 96 | return max(child.depth for child in self) + 1 97 | 98 | ## Tree Size 99 | @property 100 | def childrenwidth(self): 101 | return max(0, GAP_WIDTH*(len(self)-1) + sum(child.width for child in self)) 102 | 103 | @property 104 | def width(self): 105 | return max(self.labelwidth, self.childrenwidth) 106 | 107 | @property 108 | def height(self): 109 | return POINT_SIZE + self.depth * LAYER_HEIGHT 110 | 111 | ## Label 112 | @property 113 | def labelwidth(self): 114 | return FONT.getsize(self.label)[0] 115 | 116 | @property 117 | def labelmiddle(self): 118 | if len(self) <= 1: 119 | return floor(self.width/2) 120 | else: 121 | return floor((self[0].labelmiddle + (self.width - self[-1].width + self[-1].labelmiddle))/2) 122 | 123 | @property 124 | def labelleft(self): 125 | return self.labelmiddle - floor(self.labelwidth/2) 126 | 127 | @property 128 | def deplabelmiddle(self): 129 | ix = None 130 | leaf = None 131 | for i, child in enumerate(self): 132 | if child.isleaf: 133 | if leaf is None: 134 | ix = i 135 | leaf = child 136 | else: 137 | raise TreeFormatError('dependency nodes may have at most one leaf child') 138 | if leaf is None: 139 | return self.labelmiddle 140 | else: 141 | return sum(child.width for child in self[:ix]) + GAP_WIDTH*ix + floor((self.width-self.childrenwidth)/2) + floor(leaf.labelwidth/2) 142 | 143 | @property 144 | def deplabelleft(self): 145 | return self.deplabelmiddle - floor(self.labelwidth/2) 146 | 147 | ## Compiling Functions 148 | def tokenise(string): 149 | for match in TOKEN_REGEX.finditer(string): 150 | type = match.lastgroup 151 | value = match.group() 152 | column = match.start() 153 | if type == 'WHITESPACE': 154 | continue 155 | elif type == 'QUOTED': 156 | type = 'STRING' 157 | value = value.strip('"') 158 | elif type == 'INDEX': 159 | value = value.translate(str.maketrans('₀₁₂₃₄₅₆₇₈₉', '0123456789')) 160 | elif type == 'UNKNOWN': 161 | raise CompilerError('unexpected character', value, 0, column) 162 | yield Token(type, value, 0, column) 163 | 164 | def matchBrackets(tokens, start=0): 
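'''Return the index one past the RBRACKET that closes the LBRACKET at 'start'.'''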
165 | if tokens[start].type != 'LBRACKET': 166 | raise UnexpectedToken(tokens[start], 'lbracket') 167 | depth = 0 168 | for i, token in enumerate(tokens[start:], start+1): 169 | if token.type == 'LBRACKET': 170 | depth += 1 171 | elif token.type == 'RBRACKET': 172 | depth -= 1 173 | if depth == 0: 174 | return i 175 | raise TokenError(f'unmatched bracket', tokens[start]) 176 | 177 | def compileTree(tokens): 178 | if tokens[0].type != 'STRING': 179 | raise UnexpectedToken(tokens[0], 'string') 180 | label = tokens[0].value 181 | children = [] 182 | i = 1 183 | while i < len(tokens): 184 | type, value = tokens[i] 185 | if type == 'LBRACKET': 186 | j = matchBrackets(tokens, i) 187 | children.append(compileTree(tokens[i+1:j-1])) 188 | i = j 189 | elif type == 'STRING': 190 | children.append(Tree(value)) 191 | i += 1 192 | else: 193 | raise UnexpectedToken(tokens[i]) 194 | return Tree(label, children) 195 | 196 | ## Drawing Functions 197 | def drawDependency(tree, draw, leaftop, top, left): 198 | # Draw label 199 | labelcolour = 'red' if tree.isleaf else 'blue' 200 | draw.text((left+tree.deplabelleft, top), tree.label, labelcolour, FONT) 201 | # Draw descendents 202 | linetop = (left+tree.deplabelmiddle, top+POINT_SIZE+SCALE) # We want 1px gap between label and line after rescaling 203 | top += LAYER_HEIGHT 204 | left += floor((tree.width - tree.childrenwidth)/2) 205 | for child in tree: 206 | if child.isleaf: 207 | _top = leaftop 208 | else: 209 | _top = top 210 | # Draw line 211 | linebottom = (left+child.deplabelmiddle, _top-SCALE) # Again, 1px gap between label and line after rescaling 212 | linecolour = 'darkgrey' if child.isleaf else 'black' 213 | draw.line([linetop, linebottom], linecolour, SCALE) # Similarly, 1px line width after rescaling 214 | # Draw child 215 | drawDependency(child, draw, leaftop, _top, left) 216 | # Next child 217 | left += child.width + GAP_WIDTH 218 | 219 | def drawConstituency(tree, draw, top, left): 220 | # Draw label 221 | labelcolour = 'red' if tree.isleaf else 'blue' 222 | draw.text((left+tree.labelleft, top), tree.label, labelcolour, FONT) 223 | # Draw descendents 224 | linetop = (left+tree.labelmiddle, top+POINT_SIZE+SCALE) # We want 1px gap between label and line after rescaling 225 | top += LAYER_HEIGHT 226 | left += floor((tree.width - tree.childrenwidth)/2) 227 | for child in tree: 228 | # Draw line 229 | linebottom = (left+child.labelmiddle, top-SCALE) # Again, 1px gap between label and line after rescaling 230 | linecolour = 'darkgrey' if child.isleaf else 'black' 231 | draw.line([linetop, linebottom], linecolour, SCALE) # Similarly, 1px line width after rescaling 232 | # Draw child 233 | drawConstituency(child, draw, top, left) 234 | # Next child 235 | left += child.width + GAP_WIDTH 236 | 237 | def drawTree(string, mode): 238 | tree = Tree.make(string) 239 | size = (tree.width + PADDING*2, tree.height + PADDING*2) 240 | im = Image.new('RGB', size, 'white') 241 | draw = ImageDraw.Draw(im) 242 | if mode == 'dep': 243 | leaftop = PADDING + tree.depth*LAYER_HEIGHT 244 | drawDependency(tree, draw, leaftop, PADDING, PADDING) 245 | else: 246 | drawConstituency(tree, draw, PADDING, PADDING) 247 | return im.resize((size[0]//SCALE, size[1]//SCALE), resample=Image.ANTIALIAS) 248 | -------------------------------------------------------------------------------- /src/gen.py: -------------------------------------------------------------------------------- 1 | '''Generate syllables, words, or roots 2 | 3 | ==================================== To-do 
==================================== 4 | === Bug-fixes === 5 | Doesn't seem to be checking exceptions correctly (not urgent-urgent) 6 | 7 | === Implementation === 8 | Potentially going to be overhauled in the near future 9 | 10 | === Features === 11 | 12 | === Style === 13 | Consider where to raise/handle exceptions 14 | 15 | === Mathematical model === 16 | r is the number of segments 17 | p is the 'dropoff rate' 18 | f(n) = p**n*(1-p)/(1-p**r) is the frequency of the nth most frequent segment (frequencies sum to 1) 19 | 20 | p must be determined by observing that a = (1-p)/(1-p**r) is the frequency of the most frequent segment. From this, we 21 | can estimate p ≈ 1-a, and with a first-order correction, p ≈ (1-a)+(a*(1-a)**r)/(1-a*r*(1-a)**(r-1)). 22 | 23 | P(n) = (1-p**n)/(1-p**r) is the cumulative frequency of the first n segments, and can be found by summing over f(n) 24 | 25 | A probability distribution can then be obtained by finding the inverse of P(n). Let x be a continuous random variable 26 | from 0 to 1 (say, random.random()). Then n = floor(log(1-x*(1-p**r),p)) 27 | 28 | Obtaining a variant with a peak can be done by using two distributions, one reversed, with their modes overlapping. This 29 | can be done by taking the range of x corresponding to the reversed section and rescaling it as follows, where a is the 30 | frequency of the mode, and c the cumulative frequency of the bins before the mode: x -> 1-x/(a+c). Thus, when x < c, we 31 | sample from the reversed distribution, whose mode has frequency a/(a+c); otherwise, we sample from the forward 32 | distribution, whose mode has frequency a/(1-c), rescaling x as x -> (x-c)/(1-c). Note that the mode belongs to this 33 | second distribution. 34 | ''' 35 | 36 | from .core import LangException, Cat, Word 37 | from random import random, choice 38 | from math import log, floor, ceil 39 | 40 | # == Constants == # 41 | MAX_RUNS = 10**5 # maximum number of times something can fail to be generated 42 | 43 | # == Exceptions == # 44 | class ExceededMaxRunsError(LangException): 45 | '''Exception raised when something has failed to be generated too many times.''' 46 | 47 | # == Functions == # 48 | def dist(bins, a=0, x=None): # First bin has frequency a, random variable x 49 | '''Returns an element of 'bins' according to a power law distribution. 50 | 51 | Arguments: 52 | bins -- a non-empty ordered collection of elements (str, list, tuple) 53 | a -- the frequency that the first bin should be selected (0 for equiprobable distribution) (float) 54 | x -- a random variable supplied if the default random.random() is not desired (float) 55 | ''' 56 | # See the docstring titled 'Mathematical Model' for the maths 57 | r = len(bins) 58 | if a <= 0: # Use equiprobable distribution instead 59 | return choice(bins) 60 | if r == 1 or a >= 1: # Only one bin 61 | return bins[0] 62 | if x is None: # No random variable supplied 63 | x = random() 64 | p = (1-a)+(a*(1-a)**r)/(1-a*r*(1-a)**(r-1)) 65 | return bins[floor(log(1-x*(1-p**r), p))] 66 |
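# A worked example of dist() (deterministic because x is supplied): with bins 'abc'
# and a=0.5, the corrected dropoff rate comes out as p = 0.6, giving bin frequencies
# of roughly 0.51, 0.31 and 0.18:
#
#     dist('abc', a=0.5, x=0.1)   # -> 'a'  (x below P(1) ≈ 0.51)
#     dist('abc', a=0.5, x=0.7)   # -> 'b'  (x between P(1) ≈ 0.51 and P(2) ≈ 0.82)
#     dist('abc', a=0.5, x=0.95)  # -> 'c'  (x above P(2))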
67 | def peakedDist(bins, a=0, m=0, c=0): 68 | '''Returns an element of 'bins' according to a peaked power law distribution. 69 | 70 | Arguments: 71 | bins -- an ordered collection of elements (str, list, tuple) 72 | a -- the frequency that the most frequent bin should be selected (0 for equiprobable distribution) (float) 73 | m -- the index of the most frequent bin 74 | c -- the cumulative frequency of bins 0 to m-1 75 | ''' 76 | # See the docstring titled 'Mathematical Model' for the maths 77 | if m <= 0 or c <= 0: # All bins before the mode are ignored 78 | return dist(bins[m:], a) 79 | x = random() 80 | if x < c: # In the left-hand branch 81 | return dist(bins[m::-1], a/(a+c), 1-x/(a+c)) 82 | else: 83 | return dist(bins[m:], a/(1-c), (x-c)/(1-c)) 84 | 85 | def populate(pattern, mode): 86 | '''Generate a word section according to 'pattern' 87 | 88 | Arguments: 89 | pattern -- the pattern to generate (list) 90 | mode -- representation of the mode of the grapheme distribution (list) 91 | (To generate every possible section instead, see populateAll below.) 92 | ''' 93 | result = [] 94 | for token in pattern: 95 | if token.type == 'Category': # Element.type is the class name, capitalised 96 | result.append(peakedDist(token.cat, *mode)) 97 | elif token == '"': 98 | result.append(result[-1]) 99 | else: 100 | result.append(str(token)) 101 | return result 102 | 103 | def populateAll(pattern): 104 | results = [[]] 105 | for token in pattern: 106 | if token.type == 'Category': 107 | temp = [] 108 | for result in results: 109 | for graph in token.cat: 110 | temp.append(result+[graph]) 111 | results = temp 112 | elif token == '"': 113 | for result in results: 114 | result.append(result[-1]) 115 | else: 116 | for result in results: 117 | result.append(str(token)) 118 | return results 119 | 120 | def genFromConfig(config, graphs=None, separator='', syllabifier=None): 121 | '''Generate a single word as specified by the 'config'. 122 | 123 | Arguments: 124 | config -- the config data to be used to generate this word 125 | graphs -- the set of graphemes used for this word 126 | 127 | Returns a Word 128 | 129 | Raises ExceededMaxRunsError when the word repeatedly fails to be valid 130 | ''' 131 | word = Word(['#'], graphs, separator, syllabifier) 132 | patterns, constraints, sylrange, sylmode, patternmode, graphmode = config.patterns, config.constraints, config.sylrange, config.sylmode, config.patternmode, config.graphmode # Config is a dataclass, not an iterable 133 | sylcount = peakedDist(sylrange, *sylmode) 134 | for i in range(sylcount): 135 | if sylcount == 1: # Monosyllable 136 | _patterns = patterns['mono'] or patterns['init'] or patterns['term'] or patterns['medi'] 137 | elif i == 0: # Initial syllable 138 | _patterns = patterns['init'] or patterns['medi'] 139 | elif i == sylcount-1: # Final syllable 140 | _patterns = patterns['term'] or patterns['medi'] 141 | else: # Medial syllable 142 | _patterns = patterns['medi'] 143 | for j in range(MAX_RUNS): 144 | pattern = peakedDist(_patterns, *patternmode) 145 | syl = populate(pattern, graphmode) 146 | _word = word + syl 147 | for constraint in constraints: 148 | if constraint and constraint in _word: 149 | break 150 | else: 151 | word = _word 152 | break 153 | else: 154 | raise ExceededMaxRunsError() 155 | return word + '#' 156 | 157 | def genFromPhonotactics(phonotactics, sylrange=(1,), sylmode=(), graphs=None, syllabifier=None): 158 | '''Generate a single word as specified by the 'phonotactics'.
159 | 160 | Arguments: 161 | phonotactics -- the phonotactic data to be used 162 | graphs -- the set of graphemes used for this word 163 | syllabifier -- the syllabifier used for syllabification 164 | 165 | Returns a Word 166 | ''' 167 | word = Word([], graphs, syllabifier=syllabifier) 168 | sylcount = peakedDist(sylrange, *sylmode) 169 | for i in range(sylcount): 170 | # Generate a syllable 171 | for _ in range(MAX_RUNS): 172 | # Pick an onset 173 | onset = selectPeriphery(phonotactics['onsets'], phonotactics['margins'], 'left', i) 174 | # Pick a coda 175 | coda = selectPeriphery(phonotactics['codas'], phonotactics['margins'], 'right', i-sylcount) 176 | # Pick a nucleus 177 | nuclei = phonotactics['nuclei'] 178 | if onset != ['#']: 179 | nuclei = [nucleus for nucleus in nuclei if nucleus[0] != '#'] 180 | if coda != ['#']: 181 | nuclei = [nucleus for nucleus in nuclei if nucleus[-1] != '#'] 182 | nucleus = choice(nuclei) 183 | syl = populate(onset+nucleus+coda, ()) 184 | _word = word + syl 185 | for env in phonotactics.get('constraints', ()): 186 | if env and env in _word: 187 | break 188 | else: 189 | word = _word 190 | break 191 | else: 192 | raise ExceededMaxRunsError() 193 | return word 194 | 195 | def selectPeriphery(peripheries, margins, edge, i): 196 | edge = 0 if edge == 'left' else -1 197 | if i == edge: 198 | margin = choice([margin for margin in margins if margin[edge] == '#']) 199 | if margin == (['_', '#'] if edge else ['#', '_']): 200 | margin = ['#'] 201 | peripheries = [(p+margin if edge else margin+p) if p[edge] != '#' else p for p in peripheries] 202 | else: 203 | peripheries = [p for p in peripheries if p[edge] != '#'] 204 | periphery = choice(peripheries) 205 | if edge and periphery[0] == '_': 206 | return periphery[1:] 207 | elif not edge and periphery[-1] == '_': 208 | return periphery[:-1] 209 | else: 210 | return periphery 211 | -------------------------------------------------------------------------------- /src/core.py: -------------------------------------------------------------------------------- 1 | '''Base classes and functions 2 | 3 | ==================================== To-do ==================================== 4 | === Bug-fixes === 5 | 6 | === Implementation === 7 | Perhaps adjust Cat to allow sequences of graphemes to be stored 8 | 9 | === Features === 10 | Something something punctuation 11 | Hijack global environments with no pattern to test for position in word 12 | 13 | === Style === 14 | Consider where to raise/handle exceptions 15 | Go over docstrings 16 | ''' 17 | 18 | import re 19 | from dataclasses import dataclass, field, InitVar 20 | from .syllables import Syllabifier 21 | 22 | # == Exceptions == # 23 | class LangException(Exception): 24 | '''Base class for exceptions in this package''' 25 | 26 | class FormatError(LangException): 27 | '''Exception raised for errors in formatting objects.''' 28 | 29 | class RuleError(LangException): 30 | '''Exception raised for errors when running rules.''' 31 | 32 | class CompilerError(LangException): 33 | '''Base class for errors during compilation.''' 34 | def __init__(self, error, value, linenum, column): 35 | super().__init__(f'{error}: `{value}` @ {linenum}:{column}') 36 | 37 | class TokenError(CompilerError): 38 | '''Base class for errors involving tokens.''' 39 | def __init__(self, error, token): 40 | super().__init__(error, token.value, token.linenum, token.column) 41 | 42 | # == Decorators == # 43 | # Implements a decorator we can use as a variation on @property, where the value is calculated once and then stored
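# A usage sketch (hypothetical class): the first access to `c.area` calls the
# getter and stores the result as an instance attribute, shadowing the descriptor,
# so later accesses are plain attribute lookups.
#
#     class Circle:
#         def __init__(self, r): self.r = r
#         @memoisedproperty
#         def area(self): return 3.14159 * self.r**2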
44 | class memoisedproperty(object): 45 | def __init__(self, fget): 46 | self.fget = fget 47 | self.funcname = fget.__name__ 48 | 49 | def __get__(self, obj, cls): 50 | if obj is None: 51 | return None 52 | value = self.fget(obj) 53 | setattr(obj, self.funcname, value) 54 | return value 55 | 56 | # == Classes == # 57 | @dataclass 58 | class Token: 59 | type: str 60 | value: str 61 | linenum: int 62 | column: int 63 | 64 | def __iter__(self): 65 | yield self.type 66 | yield self.value 67 | 68 | @dataclass 69 | class Cat: 70 | '''Represents a category of graphemes.''' 71 | values: list = field(default_factory=list) # Default needed so Cat() is valid (used as Word.graphs default) 72 | name: str = field(default=None, compare=False) 73 | 74 | def __str__(self): 75 | return f'[{", ".join(self)}]' 76 | 77 | def __len__(self): 78 | return len(self.values) 79 | 80 | def __getitem__(self, key): 81 | return self.values[key] 82 | 83 | def __iter__(self): 84 | yield from self.values 85 | 86 | def __contains__(self, item): 87 | return item in self.values 88 | 89 | def __and__(self, cat): 90 | return Cat([value for value in self if value in cat]) 91 | 92 | def __add__(self, cat): 93 | return Cat(self.values + list(cat)) 94 | 95 | def __iadd__(self, cat): 96 | return NotImplemented 97 | 98 | def __sub__(self, cat): 99 | return Cat([value for value in self if value not in cat]) 100 | 101 | def __le__(self, cat): 102 | return all(value in cat for value in self) 103 | 104 | def __lt__(self, cat): 105 | return self <= cat and not (self >= cat) 106 | 107 | def __ge__(self, cat): 108 | return all(value in self for value in cat) 109 | 110 | def __gt__(self, cat): 111 | return self >= cat and not (self <= cat) 112 | 113 | def index(self, item): 114 | return self.values.index(item) 115 | 116 | @staticmethod 117 | def make(string, cats=None, name=None): 118 | if not (string.startswith('[') and string.endswith(']')): 119 | raise FormatError(f'invalid category: {string}') 120 | cat = string[1:-1] 121 | if ',' in cat: # Nonce category 122 | if cat.endswith(','): 123 | if cat.count(',') == 1: 124 | cat = cat[:-1] 125 | else: 126 | raise FormatError(f'invalid category values: {cat}') 127 | values = [] 128 | for value in re.split(r', ?', cat): 129 | if not value: 130 | raise FormatError(f'invalid category values: {cat}') 131 | elif value.startswith('[') and value.endswith(']'): 132 | values.extend(Cat.make(value, cats)) 133 | elif ' ' in value or '[' in value or ']' in value: 134 | raise FormatError(f'invalid category value: {value}') 135 | else: 136 | values.append(value) 137 | return Cat(values, name) 138 | else: # Named category 139 | if cats is not None and cat in cats: 140 | return cats[cat] 141 | else: 142 | raise FormatError(f'invalid category name: {cat}') 143 |
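# Examples of category construction (hypothetical values):
#
#     cats = {'V': Cat(['a', 'i', 'u'], 'V')}
#     Cat.make('[p, t, k]')        # nonce category
#     Cat.make('[V]', cats)        # named category lookup -> cats['V']
#     Cat.make('[[V], ə]', cats)   # nested categories are flattened into the values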
144 | @dataclass 145 | class Word: 146 | '''Represents a word as a list of graphemes. 147 | 148 | Instance variables: 149 | graphs -- a category of graphemes (Cat) 150 | syllabifier -- a function that syllabifies the input word (Syllabifier) 151 | 152 | Methods: 153 | find -- find a match of a list using pattern notation to the word 154 | matchPattern -- match a list using pattern notation to the word 155 | matchEnv -- match a sound change environment to the word 156 | applyMatch -- apply a single match to the word 157 | strip -- remove leading and trailing graphemes 158 | ''' 159 | phones: list = field(init=False) 160 | lexeme: InitVar[str] = '' 161 | graphs: Cat = field(default_factory=Cat) 162 | separator: str = '' 163 | syllabifier: Syllabifier = None 164 | 165 | def __post_init__(self, lexeme): 166 | if isinstance(lexeme, str): 167 | self.phones = parseWord(f' {lexeme} ', self.graphs, self.separator) 168 | else: 169 | phones = [] 170 | for i, phone in enumerate(lexeme): 171 | if not phone: 172 | continue 173 | elif not (phone == '#' and phones and phones[-1] == '#'): 174 | phones.append(phone) 175 | self.phones = phones 176 | 177 | @memoisedproperty 178 | def syllables(self): 179 | return self.syllabifier(self) 180 | 181 | def __repr__(self): 182 | return f'Word({str(self)!r})' 183 | 184 | def __str__(self): 185 | return unparseWord(self, self.graphs, self.separator) 186 | 187 | def __len__(self): 188 | return len(self.phones) 189 | 190 | def __getitem__(self, item): 191 | if isinstance(item, slice): 192 | return Word(self.phones[item], self.graphs, self.separator, self.syllabifier) 193 | else: 194 | return self.phones[item] 195 | 196 | def __iter__(self): 197 | yield from self.phones 198 | 199 | def __contains__(self, item): 200 | if isinstance(item, (list, Word)): 201 | return self.find(item) != -1 202 | else: 203 | return item in self.phones 204 | 205 | def __add__(self, other): 206 | graphs = self.graphs 207 | separator = self.separator 208 | if isinstance(other, Word): 209 | graphs = Cat(list(set().union(graphs, other.graphs))) 210 | separator = separator or other.separator 211 | other = other.phones 212 | elif isinstance(other, str): 213 | other = parseWord(other, graphs) 214 | return Word(self.phones + other, graphs, separator, self.syllabifier) 215 | 216 | def __radd__(self, other): 217 | graphs = self.graphs 218 | separator = self.separator 219 | other = parseWord(other, graphs) 220 | return Word(other + self.phones, graphs, separator, self.syllabifier) 221 | 222 | def __mul__(self, other): 223 | return Word(self.phones * other, self.graphs, self.separator, self.syllabifier) 224 | 225 | def __rmul__(self, other): 226 | return Word(self.phones * other, self.graphs, self.separator, self.syllabifier) 227 | 228 | def __iadd__(*args): 229 | return NotImplemented 230 | 231 | def __imul__(*args): 232 | return NotImplemented 233 | 234 | def strip(self, chars=None): 235 | if chars is None: 236 | chars = '#' 237 | start = end = None 238 | for i, char in enumerate(self): 239 | if char not in chars: 240 | if start is None: 241 | start = i 242 | if i+1 == len(self) or self[i+1] in chars: # Guard against indexing past the end 243 | end = i+1 244 | return self[start:end] 245 | 246 | def find(self, sub, start=None, end=None): 247 | '''Match a sequence using pattern notation to the word.
248 | 249 | Arguments: 250 | sub -- the list to be found (list) 251 | start -- the index of the beginning of the range to check (int) 252 | end -- the index of the end of the range to check (int) 253 | 254 | Returns an int 255 | ''' 256 | from ._pattern import parsePattern 257 | start, end = sliceIndices(self, start, end) 258 | if isinstance(sub, Word): 259 | sub = parsePattern(sub) 260 | if sub and sub[-1].type == 'Comparison': # Counting 261 | matches = 0 262 | op, count = sub[-1].operation, sub[-1].value 263 | for pos in range(start, end): 264 | match = self.matchPattern(sub[:-1], pos, end)[0] 265 | if match: 266 | matches += 1 267 | if eval(f'matches {op} count'): 268 | return 1 269 | else: 270 | for pos in range(start, end): 271 | match = self.matchPattern(sub, pos, end)[0] 272 | if match: 273 | return pos 274 | return -1 275 | 276 | def matchPattern(self, pattern, start=None, end=None, step=1): 277 | '''Match a pattern sequence to the word. 278 | 279 | Return if the sequence matches the end of the given slice of the word, the far end of the match, and category indexes. 280 | 281 | Arguments: 282 | pattern -- the sequence being matched 283 | start, end, step -- determine the slice of the word to match within 284 | stack -- used to pass stack references into an optional segment 285 | 286 | Returns a tuple. 287 | ''' 288 | from ._pattern import matchPattern 289 | start, end = sliceIndices(self, start, end) 290 | return matchPattern(self, pattern, start, end, step) 291 | 292 | def matchEnv(self, environment, pos=0, rpos=0): # Test if the env matches the word 293 | '''Match a sound change environment to the word. 294 | 295 | Arguments: 296 | environment -- the environment to be matched (list) 297 | pos, rpos -- the slice of the word giving the target (int, int) 298 | 299 | Returns a bool 300 | ''' 301 | for env in environment: 302 | if env is None: # Blank environment 303 | continue 304 | env = env.resolveTargetRef(self[pos:rpos]) 305 | if not env.match(self, pos, rpos): 306 | return False 307 | return True 308 | 309 | def applyMatch(self, match, rep): 310 | '''Apply a replacement to a word 311 | 312 | Arguments: 313 | match -- the match to be used 314 | rep -- the replacement to be used 315 | word -- the word to be changed 316 | 317 | Returns a Word. 
318 | ''' 319 | from .sce import Replacement, LocalEnvironment, GlobalEnvironment 320 | pos, rpos, catixes = match[:3] 321 | if not rep: 322 | return self[:pos] + self[rpos:] 323 | target = self[pos:rpos] 324 | if isinstance(rep, Replacement): 325 | _rep = [] 326 | ix = 0 327 | for element in rep.resolveTargetRef(target).pattern: 328 | if element.type == 'Grapheme': 329 | _rep.append(element.grapheme) 330 | elif element.type == 'Category': 331 | if not catixes: 332 | raise RuleError('replacement contains a category but target did not') 333 | cat = element.cat 334 | _rep.append(cat[catixes[ix] % len(cat)]) 335 | ix = (ix + 1) % len(catixes) 336 | elif element.type == 'Ditto': 337 | _rep.append(_rep[-1] if _rep else self[pos-1]) 338 | else: 339 | _rep.append('') 340 | return self[:pos] + _rep + self[rpos:] 341 | elif isinstance(rep, tuple): # Copy/Move 342 | mode, envs = rep 343 | matches = [] 344 | for env in envs: # Each anded environment contributes destinations 345 | if isinstance(env, LocalEnvironment): 346 | env = env.resolveTargetRef(target) 347 | for wpos in range(1, len(self)): # Find all matches 348 | if env.match(self, wpos, wpos): 349 | if mode == 'move' and wpos >= rpos: # We'll need to adjust the matches down 350 | wpos -= rpos-pos 351 | matches.append(wpos) 352 | elif isinstance(env, GlobalEnvironment): # Indices 353 | if env.pattern: 354 | raise RuleError(f'global environment as destination must have no pattern: {rep}') 355 | matches.extend(env.indices) 356 | else: 357 | raise RuleError(f'unknown environment: {rep}') 358 | if mode == 'move': # Move - delete original target 359 | word = self[:pos] + self[rpos:] 360 | else: 361 | word = self 362 | for match in sorted(matches, reverse=True): 363 | word = word[:match] + target + word[match:] 364 | return word 365 | else: 366 | raise RuleError(f'invalid replacement: {rep}') 367 | 368 | # == Functions == # 369 | def resolveTargetRef(pattern, target): 370 | _pattern = [] 371 | for element in pattern: 372 | if element.type == 'TargetRef': 373 | _pattern.extend(element.resolveTarget(target)) 374 | else: 375 | _pattern.append(element) 376 | return _pattern 377 | 378 | def sliceIndices(iter, start=None, end=None): 379 | '''Calculate absolute indices from slice indices on an iterable. 380 | 381 | Arguments: 382 | iter -- the iterable being sliced 383 | start -- the index of the start of the slice 384 | end -- the index of the end of the slice 385 | 386 | Returns a tuple of 2 ints. 387 | ''' 388 | if start is None: 389 | start = 0 390 | elif start < 0: 391 | start += len(iter) 392 | if end is None: 393 | end = len(iter) 394 | elif end < 0: 395 | end += len(iter) 396 | return start, end 397 | 398 | def parseCats(cats, initialcats=None): 399 | '''Parses a set of categories. 400 | 401 | Arguments: 402 | cats -- the set of categories to be parsed (str) 403 | initialcats -- prior categories (dict) 404 | 405 | Returns a dict. 
406 | ''' 407 | if initialcats is None: 408 | _cats = {} 409 | else: 410 | _cats = initialcats.copy() 411 | for key, value in cats.items(): 412 | if key == '' or not value: 413 | pass 414 | elif isinstance(value, Cat): 415 | _cats[key] = value 416 | elif isinstance(value, list): 417 | _cats[key] = Cat(value, key) 418 | elif isinstance(value, str): 419 | _cats[key] = Cat.make(f'[{value}]', _cats, key) 420 | else: 421 | raise FormatError('invalid category values') 422 | for cat in list(_cats): # Discard blank categories 423 | if not _cats[cat]: 424 | del _cats[cat] 425 | return _cats 426 | 427 | WHITESPACE_REGEX = re.compile(r'\s+') 428 | 429 | def parseWord(string, graphs=(), separator=''): 430 | string = WHITESPACE_REGEX.sub('#', string) 431 | polygraphs = sorted(filter(lambda g: len(g) > 1, graphs), key=len, reverse=True) 432 | if not polygraphs: 433 | return list(string.replace(separator, '')) 434 | if not separator: 435 | separator = '.' 436 | word = [] 437 | string = string.lstrip(separator) 438 | while string: 439 | graph = next(filter(lambda p: string.startswith(p), polygraphs), string[0]) 440 | word.append(graph) 441 | string = string[len(graph):].lstrip(separator) 442 | return word 443 | 444 | def unparseWord(word, graphs=(), separator=''): 445 | string = '' 446 | polygraphs = list(filter(lambda g: len(g) > 1, graphs)) 447 | if not polygraphs: 448 | string = ''.join(word) 449 | word = [] 450 | if not separator: 451 | separator = '.' 452 | ambig = [] 453 | for graph in word: 454 | if ambig: 455 | ambig.append(graph) 456 | for i in range(len(ambig)): 457 | test = ''.join(ambig[i:]) 458 | minlength = len(ambig[i]) 459 | if any(test.startswith(poly) and len(poly) > minlength for poly in polygraphs): 460 | string += separator 461 | ambig = [graph] 462 | break 463 | for i in range(len(ambig)): 464 | test = ''.join(ambig[i:]) 465 | if any(poly.startswith(test) and poly != test for poly in polygraphs): 466 | ambig = ambig[i:] 467 | break 468 | else: 469 | ambig = [] 470 | elif any(poly.startswith(graph) and poly != graph for poly in polygraphs): 471 | ambig.append(graph) 472 | string += graph 473 | return string.strip(separator+'#').replace('#', ' ') 474 | 475 | def partition(sequence, *, sep=None, sepfunc=None, yieldsep=False): 476 | if (sep is None) == (sepfunc is None): 477 | raise ValueError('exactly one of sep and sepfunc must be given') 478 | if sep is not None: 479 | sepfunc = lambda item: item == sep 480 | i = 0 481 | for j, item in enumerate(sequence): 482 | if sepfunc(item): 483 | if yieldsep: 484 | yield (sequence[i:j], sequence[j]) 485 | else: 486 | yield sequence[i:j] 487 | i = j+1 488 | if yieldsep: 489 | yield sequence[i:], None 490 | else: 491 | yield sequence[i:] 492 | 493 | def partitionTokens(tokens, sep=None, yieldsep=True): 494 | yield from partition(tokens, sepfunc=(lambda element: element.type == sep), yieldsep=yieldsep) 495 |
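# An example of partition(): the sequence is split at separator items, which are
# dropped unless yieldsep is set:
#
#     list(partition([1, 0, 2, 3, 0, 4], sep=0))  # -> [[1], [2, 3], [4]]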
-------------------------------------------------------------------------------- /src/_pattern.py: -------------------------------------------------------------------------------- 1 | '''Pattern parsing and matching 2 | 3 | Classes: 4 | Token -- Class utilised by the tokeniser 5 | Element -- Base class for pattern elements 6 | Grapheme -- Element matching a specific grapheme 7 | Ditto -- Element matching the second of two identical segments 8 | SylBreak -- Element matching a syllable boundary 9 | Category -- Element matching a category of graphemes 10 | Wildcard -- Element matching one or more arbitrary segments 11 | WildcardRep -- Element matching one or more copies of the previous element 12 | Optional -- Element matching an optional sequence of elements 13 | Comparison -- Element used for indicating the number of another element 14 | TargetRef -- Element used to refer to the target 15 | 16 | Functions: 17 | escape -- processes escaped characters in a string 18 | tokenise -- returns a generator producing tokens 19 | parsePattern -- parses a string utilising pattern notation into a list of elements 20 | parsePatterns -- parses a collection of strings using pattern notation 21 | matchPattern -- matches a list of elements to a specified slice of a word 22 | '''''' 23 | ==================================== To-do ==================================== 24 | === Bug-fixes === 25 | catixes in matchPattern should be redone to cope with non-linearity 26 | 27 | === Implementation === 28 | Replace super-disgusting hacky wildcard repetition workaround in matchPattern with something better 29 | - How though 30 | Handling of optionals needs a lot of work 31 | 32 | === Features === 33 | 34 | === Style === 35 | ''' 36 | import re 37 | from dataclasses import dataclass, InitVar 38 | from typing import Dict, List 39 | from .core import FormatError, CompilerError, TokenError, Token, Cat 40 | 41 | ## Constants 42 | TOKENS = { 43 | 'COMMA': r', ?', 44 | 'NULL': r'\[\]', 45 | 'LOPT': r'\(', 46 | 'ROPT': r'\)\??', 47 | 'LCAT': r'\[', 48 | 'RCAT': r'\]', 49 | 'WILDCARDREP': r'\{\*\??\}', 50 | 'COMPARISON': r'\{(?:!=|[=<>]=?)\d+\}', 51 | 'ESCAPE': r'\\.', 52 | 'REPETITION': r'\{\d+\}', 53 | 'WILDCARD': r'\*\*?\??', 54 | 'TARGETREF': r'%|<', 55 | 'DITTO': r'\"', 56 | 'SYLBREAK': r'\$', 57 | 'TEXT': r'[^ >\/!+\-[\](){}*?\\"%<$^,&_~@]+', 58 | 'UNKNOWN': r'.', 59 | } 60 | TOKEN_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in TOKENS.items())) 61 | TOKENS = {type: re.compile(regex) for type, regex in TOKENS.items()} 62 | 63 | ## Classes 64 | @dataclass(repr=False, eq=False) 65 | class Element: 66 | def __str__(self): 67 | return '' 68 | 69 | def __repr__(self): 70 | return f'{self.type}({str(self)!r})' 71 | 72 | def __eq__(self, other): 73 | if isinstance(other, str): 74 | return str(self) == other 75 | elif type(self) == type(other): 76 | return str(self) == str(other) 77 | else: 78 | return NotImplemented 79 | 80 | @property 81 | def type(self): 82 | return self.__class__.__name__ 83 | 84 | # This method should not be called directly, as it does not check its arguments for correctness 85 | @classmethod 86 | def make(cls, string=None, cats=None): 87 | return cls() 88 | 89 | @classmethod 90 | def fromString(cls, string=None, cats=None): 91 | if TOKENS[cls.__name__.upper()].match(string) is not None: # Sanity check 92 | return cls.make(string, cats) 93 | raise FormatError(f'invalid {cls.__name__}: {string!r}') 94 | 95 | @classmethod 96 | def fromTokens(cls, tokens=None, cats=None): 97 | if len(tokens) != 1: 98 | raise CompilerError(f'too many tokens', tokens, tokens[0].linenum, tokens[0].column) 99 | type, value = tokens[0] 100 | if type == cls.__name__.upper() and TOKENS[type].match(value) is not None: # Sanity check 101 | return cls.make(value, cats) 102 | raise TokenError(f'invalid {cls.__name__}', tokens[0]) 103 | 104 | # This method must guarantee that the last two return values are [] if the first is False 105 | def match(self, word, pos, ix, step, istep): 106 | # matched, length, ilength, stack, catixes 107 | return False, 0, 0, [], [] 108 | 109 | ## Matching elements ## 110 | @dataclass(repr=False, eq=False) 111 | class Grapheme(Element):
112 | grapheme: str 113 | 114 | def __str__(self): 115 | return self.grapheme 116 | 117 | @staticmethod 118 | def make(string, cats=None): 119 | return Grapheme(grapheme=string) 120 | 121 | @staticmethod 122 | def fromString(string=None, cats=None): 123 | if TOKENS['ESCAPE'].match(string) is not None: # Sanity check 124 | return Grapheme(grapheme=string[1]) 125 | raise FormatError(f'invalid Grapheme: {string!r}') 126 | 127 | @staticmethod 128 | def fromTokens(tokens, cats=None): 129 | if len(tokens) != 1: 130 | raise CompilerError(f'too many tokens', tokens, tokens[0].linenum, tokens[0].column) 131 | type, value = tokens[0] 132 | if type == 'ESCAPE' and TOKENS['ESCAPE'].match(value) is not None: # Sanity check 133 | return Grapheme(grapheme=value[1]) 134 | raise TokenError('invalid Grapheme', tokens[0]) 135 | 136 | def match(self, word, pos, ix, step, istep): 137 | return self.grapheme == word[pos], step, istep, [], [] 138 | 139 | @dataclass(repr=False, eq=False) 140 | class Ditto(Element): 141 | def __str__(self): 142 | return '"' 143 | 144 | def match(self, word, pos, ix, step, istep): 145 | return word[pos] == word[pos-1], step, istep, [], [] 146 | 147 | @dataclass(repr=False, eq=False) 148 | class SylBreak(Element): 149 | def __str__(self): 150 | return '$' 151 | 152 | def match(self, word, pos, ix, step, istep): 153 | return (pos in word.syllables), 0, istep, [], [] 154 | 155 | @dataclass(repr=False, eq=False) 156 | class Category(Element): 157 | cat: Cat 158 | 159 | def __str__(self): 160 | if self.cat.name is None: 161 | return str(self.cat) 162 | else: 163 | return f'[{self.cat.name}]' 164 | 165 | def __eq__(self, other): 166 | if isinstance(other, Category): 167 | return self.cat == other.cat 168 | else: 169 | return self.cat == other 170 | 171 | @staticmethod 172 | def make(string, cats=None): 173 | return Category(cat=Cat.make(string, cats)) 174 | 175 | @staticmethod 176 | def fromString(string, cats=None): 177 | return Category.make(string, cats) 178 | 179 | @staticmethod 180 | def fromTokens(tokens, cats=None): 181 | string = ''.join(token.value for token in tokens) 182 | return Category.make(string, cats) 183 | 184 | def match(self, word, pos, ix, step, istep): 185 | if word[pos] in self.cat: # This might change 186 | return True, step, istep, [], [self.cat.index(word[pos])] 187 | return False, 0, 0, [], [] 188 | 189 | @dataclass(repr=False, eq=False) 190 | class Wildcard(Element): 191 | greedy: bool 192 | extended: bool 193 | 194 | def __str__(self): 195 | return ('**' if self.extended else '*') + ('' if self.greedy else '?') 196 | 197 | @staticmethod 198 | def make(string, cats=None): 199 | greedy = not string.endswith('?') 200 | extended = string.startswith('**') 201 | return Wildcard(greedy=greedy, extended=extended) 202 | 203 | def match(self, word, pos, ix, step, istep): 204 | if self.extended or word[pos] != '#': 205 | if self.greedy: 206 | stack = [(pos+step, ix+istep)] 207 | istep = 0 208 | else: 209 | stack = [(pos+step, ix)] 210 | return True, step, istep, stack, [] 211 | return False, 0, 0, [], [] 212 | 213 | @dataclass(repr=False, eq=False) 214 | class WildcardRep(Element): 215 | greedy: bool 216 | 217 | def __str__(self): 218 | return '{*}' if self.greedy else '{*?}' 219 | 220 | @staticmethod 221 | def make(string, cats=None): 222 | if string == '{*}': 223 | return WildcardRep(greedy=True) 224 | else: 225 | return WildcardRep(greedy=False) 226 | 227 | def match(self, word, pos, ix, step, istep): 228 | if self.greedy: 229 | istep *= -1 230 | return True, 0,
istep, [(pos, ix-istep)], [] 231 | 232 | ## Non-matching elements ## 233 | @dataclass(repr=False, eq=False) 234 | class Optional(Element): 235 | greedy: bool 236 | pattern: List[Token] 237 | 238 | def __str__(self): 239 | string = unparsePattern(self.pattern) 240 | return f'({string})' if self.greedy else f'({string})?' 241 | 242 | @staticmethod 243 | def make(string, cats=None): 244 | greedy = not string.endswith('?') 245 | pattern = parsePattern(string.rstrip('?')[1:-1], cats) 246 | if len(pattern) == 1 and isinstance(pattern[0], Wildcard): 247 | pattern[0].greedy = greedy 248 | return Optional(greedy=greedy, pattern=pattern) 249 | 250 | @staticmethod 251 | def fromString(string, cats=None): 252 | return Optional.make(string, cats) 253 | 254 | @staticmethod 255 | def fromTokens(tokens, cats=None): 256 | if tokens[0].type != 'LOPT' or tokens[-1].type != 'ROPT': 257 | raise FormatError(f'the given tokens are not a valid optional: {tokens}') 258 | greedy = not tokens[-1].value.endswith('?') 259 | pattern = compile(tokens[1:-1], cats) 260 | if len(pattern) == 1 and isinstance(pattern[0], Wildcard): 261 | pattern[0].greedy = greedy 262 | return Optional(greedy=greedy, pattern=pattern) 263 | 264 | # Somehow I need to adapt the special matching code for this framework - won't be easy 265 | 266 | @dataclass(repr=False, eq=False) 267 | class Comparison(Element): 268 | operation: str 269 | value: int 270 | 271 | def __str__(self): 272 | return f'{{{self.operation}{self.value}}}'.replace('==', '=') 273 | 274 | @staticmethod 275 | def make(string, cats=None): 276 | string = string[1:-1] 277 | for op in ('==', '=', '!=', '>=', '>', '<=', '<'): 278 | if string.startswith(op): 279 | value = int(string[len(op):]) 280 | if op == '=': 281 | op = '==' 282 | return Comparison(operation=op, value=value) 283 | 284 | @dataclass(repr=False, eq=False) 285 | class TargetRef(Element): 286 | direction: int 287 | 288 | def __str__(self): 289 | return '%' if self.direction == 1 else '<' 290 | 291 | @staticmethod 292 | def make(string, cats=None): 293 | if string == '%': 294 | return TargetRef(direction=1) 295 | else: 296 | return TargetRef(direction=-1) 297 | 298 | def resolveTarget(self, target): 299 | return [Grapheme(graph) for graph in (target if self.direction == 1 else reversed(target))] 300 | 301 | ELEMENT_DICT = { 302 | 'LOPT': Optional, 303 | 'LCAT': Category, 304 | 'WILDCARDREP': WildcardRep, 305 | 'COMPARISON': Comparison, 306 | 'ESCAPE': Grapheme, 307 | 'WILDCARD': Wildcard, 308 | 'TARGETREF': TargetRef, 309 | 'DITTO': Ditto, 310 | 'SYLBREAK': SylBreak, 311 | } 312 | 313 | # Don't slice the string when calling this 314 | def tokenise(string, colstart=None, linenum=0): 315 | '''Tokenise a string using pattern notation. 
316 | 317 | Arguments: 318 | string -- the input string using pattern notation (str) 319 | colstart -- the column to start token indexing at (int) 320 | 321 | Yields Token objects 322 | ''' 323 | if colstart is None: 324 | nested = False 325 | colstart = 0 326 | else: 327 | nested = True 328 | if not string: 329 | if nested: 330 | return colstart 331 | return 332 | brackets = [] 333 | for match in TOKEN_REGEX.finditer(string, colstart): 334 | type = match.lastgroup 335 | value = match.group() 336 | column = match.start() 337 | colstart = match.end() 338 | if type == 'COMMA': 339 | if not (brackets and brackets[-1] == '['): 340 | if not brackets and nested: 341 | return column 342 | raise CompilerError(f'unexpected comma', value, linenum, column) 343 | elif type in ('LOPT', 'LCAT'): # Left brackets 344 | if value == '(' and brackets and brackets[-1] == '[': 345 | raise CompilerError(f'optionals may not appear inside categories', value, linenum, column) 346 | brackets.append(value) 347 | elif type in ('ROPT', 'RCAT'): # Right brackets 348 | if not brackets: 349 | raise CompilerError(f'unexpected bracket', value, linenum, column) 350 | bracket = brackets.pop() 351 | if bracket+value[0] not in ('()', '[]'): 352 | raise CompilerError(f'mismatched brackets', value, linenum, column) 353 | elif type == 'UNKNOWN': 354 | if nested: 355 | return column 356 | else: 357 | raise CompilerError(f'unexpected character', value, linenum, column) 358 | yield Token(type, value, linenum, column) 359 | if nested: 360 | return colstart 361 | 362 | def matchBrackets(tokens, start=0): 363 | if tokens[start].type not in ('LOPT', 'LCAT'): 364 | raise TokenError(f'expected bracket', tokens[start]) 365 | else: 366 | left = tokens[start].type 367 | right = left.replace('L', 'R') 368 | depth = 0 369 | for i, token in enumerate(tokens[start:], start+1): 370 | if token.type == left: 371 | depth += 1 372 | elif token.type == right: 373 | depth -= 1 374 | if depth == 0: 375 | return i 376 | raise TokenError(f'unmatched bracket', tokens[start]) 377 | 378 | def compile(tokens, cats=None): 379 | from .core import parseWord 380 | tokens = list(tokens) 381 | if not tokens: 382 | return [] 383 | if cats is not None and 'graphs' in cats: 384 | graphs = cats['graphs'] 385 | else: 386 | graphs = () 387 | elements = [] 388 | i = 0 389 | while i < len(tokens): 390 | type, value = tokens[i] 391 | if type in ('LOPT', 'LCAT'): 392 | j = matchBrackets(tokens, i) 393 | else: 394 | j = i+1 395 | if type == 'NULL': 396 | pass 397 | elif type == 'REPETITION': 398 | elements[-1:] = elements[-1:]*int(value[1:-1]) 399 | elif type == 'TEXT': 400 | elements.extend([Grapheme(graph) for graph in parseWord(value, graphs)]) 401 | elif type in ELEMENT_DICT: 402 | cls = ELEMENT_DICT[type] 403 | elements.append(cls.fromTokens(tokens[i:j], cats)) 404 | else: 405 | raise TokenError(f'unexpected token', tokens[i]) 406 | i = j 407 | return elements 408 | 409 | def parsePattern(pattern, cats=None): 410 | '''Parse a string using pattern notation. 
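End to end, `tokenise` plus `compile` are what `parsePattern` (continuing below) wraps. A sketch of the result, with two caveats: the exact token granularity depends on `TOKEN_REGEX`, defined earlier in this file, and the simple elements are assumed to inherit a `fromTokens` that falls back to `make` from the `Element` base class.

```python
from conlanger.src._pattern import parsePattern

elements = parsePattern('(s)ka*')
print([str(e) for e in elements])  # -> ['(s)', 'k', 'a', '*']
print(type(elements[0]).__name__)  # -> 'Optional'
```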
411 | 412 | Arguments: 413 | pattern -- the input string using pattern notation (str or Word) 414 | cats -- a dictionary of categories to use for interpreting categories (dict) 415 | 416 | Returns a list 417 | ''' 418 | from .core import Word 419 | if isinstance(pattern, Word): 420 | return [Grapheme(graph) for graph in pattern] 421 | try: 422 | return compile(tokenise(pattern), cats) 423 | except CompilerError as e: 424 | raise FormatError(f'invalid pattern: {pattern!r}; {e.args[0]}') 425 | 426 | def unparsePattern(pattern, graphs=(), separator=''): 427 | from .core import unparseWord 428 | # To-do: add collapsing of repeated tokens 429 | elements = [] 430 | for element in pattern: 431 | if isinstance(element, Optional): 432 | string = unparsePattern(element.pattern, graphs, separator) 433 | elements.append(f'({string})' if element.greedy else f'({string})?') 434 | else: 435 | elements.append(str(element)) 436 | return unparseWord(elements, graphs, separator) 437 | 438 | def parsePatterns(patterns, cats=None): 439 | '''Parses generation patterns. 440 | 441 | Arguments: 442 | patterns -- set of patterns to parse (str, list, or dict) 443 | 444 | Returns a list, dict, or None 445 | ''' 446 | if isinstance(patterns, str): 447 | patterns = patterns.splitlines() 448 | if isinstance(patterns, list): 449 | _patterns = [] 450 | for pattern in patterns: 451 | # Remove comments 452 | if isinstance(pattern, str): 453 | pattern = pattern.split('//')[0] 454 | if not pattern: 455 | continue 456 | if isinstance(pattern, str): 457 | _patterns.append(parsePattern(pattern, cats)) 458 | else: 459 | _patterns.append(pattern) 460 | elif isinstance(patterns, dict): 461 | _patterns = {key: parsePatterns(patterns[key], cats) for key in patterns} 462 | else: 463 | _patterns = None 464 | return _patterns 465 | 466 | def matchPattern(word, pattern, start, end, step, stack=None): 467 | '''Match a pattern sequence to the word. 468 | 469 | Return whether the sequence matches the given slice of the word, the position of the far end of the match, and the category indexes. 470 | 471 | Arguments: 472 | word -- the word to match to 473 | pattern -- the sequence being matched 474 | start, end, step -- determine the slice of the word to match within 475 | stack -- used to pass stack references into an optional segment 476 | 477 | Returns a tuple. 478 | ''' 479 | pos = start if step > 0 else end-1 480 | ix = 0 if step > 0 else (len(pattern)-1) 481 | istep = 1 if step > 0 else -1 482 | if stack is None: 483 | stack = [] # This stores the positions in the word and sequence that we branched at 484 | _returnstack = False 485 | else: 486 | if stack: 487 | pos, ix = stack.pop() 488 | _returnstack = True 489 | catixes = [] # This records the index of each category match.
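A sketch of `matchPattern` (whose body continues below): greedy elements push the branch they did not take onto `stack`, and a failed step pops the most recent branch point instead of failing outright. A plain list again stands in for a `Word`, which works here because only indexing is used.

```python
from conlanger.src._pattern import parsePattern, matchPattern

word = ['#', 's', 't', 'a', '#']
pattern = parsePattern('(s)ta')

# Match left-to-right, skipping the initial boundary (as Rule.apply does):
matched, rpos, catixes = matchPattern(word, pattern, 1, len(word), 1)
print(matched, rpos)  # -> True 4: the match ends just before the final '#'

# Against ['#', 't', 'a', '#'] the optional '(s)' fails at position 1; the
# matcher pops the branch that skips the optional, and the match still succeeds.
```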
This needs to be redone to cope with non-linearity 490 | # Hacky thing for now to make wildcard repetitions actually work in rtl 491 | pattern = pattern.copy() 492 | if step < 0: 493 | for i, element in enumerate(pattern): 494 | if element.type == 'WildcardRep': 495 | pattern[i-1:i+1] = reversed(pattern[i-1:i+1]) 496 | matched = True 497 | while 0 <= ix < len(pattern): 498 | if start <= pos < end: # Still in the slice 499 | element = pattern[ix] 500 | if not isinstance(element, Optional): 501 | matched, length, ilength, _stack, _catixes = element.match(word, pos, ix, step, istep) 502 | stack.extend(_stack) 503 | catixes.extend(_catixes) 504 | else: # Optionals require special handling 505 | if not matched: # Jumped here via the stack, check if we've got a nested stack reference 506 | if stack and isinstance(stack[-1], list): 507 | _stack = stack.pop() 508 | else: 509 | _stack = [] 510 | if element.greedy: # Greedy 511 | if ix < len(pattern)-istep and pattern[ix+istep].type == 'WildcardRep': # We need to make sure to step past a wildcard repetition 512 | stack.append((pos, ix+istep*2)) 513 | else: 514 | stack.append((pos, ix+istep)) 515 | ilength = step 516 | elif matched: # Non-greedy, we stepped in normally 517 | stack.append((pos, ix)) 518 | if ix < len(pattern)-istep and pattern[ix+istep].type == 'WildcardRep': # We need to make sure to step past a wildcard repetition 519 | ilength = istep*2 520 | else: 521 | ilength = istep 522 | matched = True 523 | length = 0 524 | if element.greedy or not matched: 525 | _start, _end = (pos, end) if istep > 0 else (start, pos+1) 526 | matched, rpos, _catixes, _stack = matchPattern(word, element.pattern, _start, _end, step, _stack) 527 | # Merge in the stack - if a reference has an index within element, nest it and push a reference to 528 | # the element, else correct the index and push it directly 529 | for _pos, _ix in _stack: 530 | if _ix >= len(element.pattern): 531 | _ix -= len(element.pattern)-1 532 | stack.append((_pos, _ix)) 533 | else: 534 | if len(stack) >= 2 and isinstance(stack[-2], list): 535 | stack[-2].append((_pos, _ix)) 536 | else: 537 | stack.append([(_pos, _ix)]) 538 | stack.append((_pos, ix)) 539 | length = rpos-pos 540 | if matched: 541 | catixes.extend(_catixes) 542 | else: 543 | matched, length, ilength = False, 0, 0 544 | if matched: 545 | ix += ilength 546 | pos += length 547 | elif stack: # This segment failed to match, so we jump back to the next branch 548 | pos, ix = stack.pop() 549 | else: # Total match failure 550 | if _returnstack: 551 | return False, 0, [], [] # Maybe? 552 | else: 553 | return False, 0, [] 554 | if _returnstack: 555 | return True, pos, catixes, stack 556 | else: 557 | return True, pos, catixes 558 | -------------------------------------------------------------------------------- /src/sce.py: -------------------------------------------------------------------------------- 1 | '''Apply sound changes to a lexicon 2 | 3 | Exceptions: 4 | RuleFailed -- exception to mark that a rule failed 5 | 6 | Classes: 7 | Rule -- represents a sound change rule 8 | 9 | Functions: 10 | compileRuleset -- compiles a sound change ruleset 11 | compileRule -- compiles a sound change rule 12 | run -- applies a set of sound change rules to a set of words 13 | '''''' 14 | ==================================== To-do ==================================== 15 | === Bug-fixes === 16 | 17 | === Implementation === 18 | Maybe change >^ and >^? to >> and >>? 
19 | 20 | === Features === 21 | Is it possible to implement a>b>c as notation for a chain shift? 22 | Think about expanding the options for grapheme handling 23 | - diacritics 24 | Allow ~ in tar and rep 25 | Implement more category operations 26 | - intersection 27 | -- feature-style? [+A +B -C] == [A] && [B] && ~[C] 28 | More format conversion metarules? 29 | - !sca2 30 | 31 | === Style === 32 | Consider where to raise/handle exceptions 33 | Go over docstrings 34 | ''' 35 | 36 | import logging 37 | import logging.config 38 | import os.path 39 | import re 40 | from contextlib import suppress 41 | from dataclasses import dataclass, InitVar 42 | from .core import LangException, FormatError, RuleError, CompilerError, TokenError, Token, Cat, Word, resolveTargetRef, parseCats, partitionTokens 43 | from ._pattern import tokenise as tokenisePattern, compile as compilePattern 44 | 45 | # == Constants == # 46 | MAX_RUNS = 10**3 # Maximum number of times a rule may be repeated 47 | __location__ = os.path.realpath( 48 | os.path.join(os.getcwd(), os.path.dirname(__file__), 'logging.conf')) 49 | RULE_TOKENS = { 50 | 'EPENTHESIS': r'^\+ ?', 51 | 'DELETION': r'^\- ?', 52 | 'MOVE': r'>\^\?| +>\^\? ', 53 | 'COPY': r'>\^| +>\^ ', 54 | 'REPLACEMENT': r'>| +> ', 55 | 'ENVIRONMENT': r'/| +/ ', 56 | 'EXCEPTION': r'!| +! ', 57 | 'OR': r', ?', 58 | 'AND': r'&| & ', 59 | 'PLACEHOLDER': r'_', 60 | # 'ADJACENCY': r'~', 61 | 'INDICES': r'@\-?\d+(?:\|\-?\d+)*', 62 | 'SPACE': r' ', 63 | 'UNKNOWN': r'.' 64 | } 65 | RULE_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in RULE_TOKENS.items())) 66 | METARULES = [ 67 | 'block', 68 | 'def', 69 | 'rule', 70 | ] 71 | METARULE_TOKENS = { 72 | 'METARULE': fr'^!(?:{"|".join(METARULES)})', 73 | 'COLON': r': ?', 74 | 'NUMBER': r'\d+', 75 | 'IDENTIFIER': r'[a-z_]+', 76 | 'SPACE': r' ', 77 | 'UNKNOWN': r'.', 78 | } 79 | METARULE_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in METARULE_TOKENS.items())) 80 | FLAGS = [ 81 | 'ignore', 82 | 'rtl', 83 | 'ditto', 84 | 'stop', 85 | 'repeat', 86 | 'persist', 87 | 'chance', 88 | ] 89 | FLAG_TOKENS = { 90 | 'FLAG': '|'.join(FLAGS), 91 | 'COLON': r': ?', 92 | 'ARGUMENT': r'\d+', 93 | 'NEGATION': r'!', 94 | 'SEPARATOR': r'; ?', 95 | 'UNKNOWN': r'.', 96 | } 97 | FLAG_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in FLAG_TOKENS.items())) 98 | CATEGORY_TOKENS = { 99 | 'CATEGORY': r'^\w+', 100 | 'OP': r'(?:\+|\-)?=| +(?:\+|\-)?= ', 101 | 'VALUES': r'.+$', # Might make this part more precise 102 | 'UNKNOWN': r'.', 103 | } 104 | CATEGORY_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in CATEGORY_TOKENS.items())) 105 | 106 | # == Globals == # 107 | logger = None 108 | 109 | # == Exceptions == # 110 | class RuleFailed(LangException): 111 | '''Used to indicate that the rule failed to be applied.''' 112 | 113 | # == Classes == # 114 | @dataclass 115 | class IndexedPattern: 116 | pattern: list 117 | indices: list = None 118 | 119 | def __str__(self): 120 | if self.indices is None: 121 | return str(self.pattern) 122 | elif not self.pattern: 123 | return f'@{self.indices}' 124 | else: 125 | return f'{self.pattern}@{self.indices}' 126 | 127 | def __iter__(self): 128 | yield self.pattern 129 | yield self.indices 130 | 131 | def copy(self): 132 | cls = self.__class__ 133 | if self.indices is not None: 134 | return cls(self.pattern.copy(), self.indices.copy()) 135 | else: 136 | return cls(self.pattern.copy()) 137 | 138 | @dataclass 139 | class Target(IndexedPattern): 140 | pass 141 
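For orientation, `RULE_TOKENS` above encodes the surface syntax of a rule. A hedged summary of the notation as recoverable from those tokens (the linked SCE documentation is the authoritative reference):

```python
# a > b / c_d ! e_f   -- replace a by b between c and d, except between e and f
# - a / _#            -- deletion: remove word-final a
# + a / b_c           -- epenthesis: insert a between b and c
# x > y, z            -- ',' separates alternatives; '&' conjoins environments
# a > b @1|-1         -- '@' restricts a target to the given matches (1st and last)
```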
| 142 | @dataclass 143 | class Replacement: 144 | pattern: list 145 | 146 | def __str__(self): 147 | return str(self.pattern) 148 | 149 | def __iter__(self): 150 | yield self.pattern 151 | 152 | def copy(self): 153 | return Replacement(self.pattern.copy()) 154 | 155 | def resolveTargetRef(self, target): 156 | return Replacement(resolveTargetRef(self.pattern, target)) 157 | 158 | @dataclass 159 | class LocalEnvironment: 160 | left: list 161 | right: list 162 | 163 | def __str__(self): 164 | if self.left and self.right: 165 | return f'{self.left}_{self.right}' 166 | elif self.left: 167 | return f'{self.left}_' 168 | elif self.right: 169 | return f'_{self.right}' 170 | else: 171 | return '_' 172 | 173 | def __bool__(self): 174 | return bool(self.left or self.right) 175 | 176 | def __iter__(self): 177 | yield self.left 178 | yield self.right 179 | 180 | def copy(self): 181 | return LocalEnvironment(self.left.copy(), self.right.copy()) 182 | 183 | def resolveTargetRef(self, target): 184 | return LocalEnvironment(resolveTargetRef(self.left, target), resolveTargetRef(self.right, target)) 185 | 186 | def match(self, word, pos=0, rpos=0): 187 | left, right = self 188 | if pos: 189 | matchleft = word.matchPattern(left, 0, pos, -1)[0] 190 | else: # At the left edge, which can only be matched by a null env 191 | matchleft = not left 192 | matchright = word.matchPattern(right, rpos)[0] 193 | return matchleft and matchright 194 | 195 | @dataclass 196 | class GlobalEnvironment(IndexedPattern): 197 | def __bool__(self): 198 | return bool(self.pattern or self.indices) 199 | 200 | def resolveTargetRef(self, target): 201 | if self.indices is not None: 202 | return GlobalEnvironment(resolveTargetRef(self.pattern, target), self.indices.copy()) 203 | else: 204 | return GlobalEnvironment(resolveTargetRef(self.pattern, target)) 205 | 206 | def match(self, word, pos=0, rpos=0): 207 | pattern, indices = self 208 | if indices is None: 209 | return word.find(pattern) != -1 210 | else: 211 | return any(word.matchPattern(pattern, index)[0] for index in indices) 212 | 213 | @dataclass(frozen=True) # Frozen so that a Flags instance can safely serve as a field default (see RuleBlock) 214 | class Flags: 215 | ignore: int = 0 216 | ditto: int = 0 217 | stop: int = 0 218 | rtl: int = 0 219 | repeat: int = 1 220 | persist: int = 1 221 | chance: int = 100 222 | 223 | @dataclass 224 | class Rule: 225 | '''Class for representing a sound change rule. 226 | 227 | Instance variables: 228 | rule -- the rule as a string (str) 229 | tars -- target segments (list) 230 | reps -- replacement segments (list) 231 | envs -- application environments (list) 232 | excs -- exception environments (list) 233 | otherwise -- the rule to try instead if this rule's environments fail or an exception matches (Rule) 234 | flags -- flags for altering execution (Flags) 235 | 236 | Methods: 237 | apply -- apply the rule to a word 238 | checkMatch -- check if the match is valid 239 | ''' 240 | tars: list 241 | reps: list 242 | envs: list 243 | excs: list 244 | otherwise: 'Rule' 245 | flags: Flags 246 | rule: str = '' 247 | 248 | def __repr__(self): 249 | return f"Rule('{self!s}')" 250 | 251 | def __str__(self): 252 | return self.rule 253 | 254 | def __eq__(self, other): 255 | return tuple(self)[:-1] == tuple(other)[:-1] # Compare all fields except the rule string 256 | 257 | def __iter__(self): 258 | yield self.tars 259 | yield self.reps 260 | yield self.envs 261 | yield self.excs 262 | yield self.otherwise 263 | yield self.flags 264 | yield self.rule 265 | 266 | def apply(self, word): 267 | '''Apply the sound change rule to a single word.
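A sketch of what `compileRule` (defined further down in this file) produces for a simple rule, tying the string form to the fields documented above; import path per this repo's layout:

```python
from conlanger.src.sce import compileRule

rule = compileRule('a > b / c_d')
print(str(rule))   # -> 'a > b / c_d': the source text is kept on the Rule
print(rule.flags)  # -> Flags(ignore=0, ditto=0, stop=0, rtl=0, repeat=1, persist=1, chance=100)
# rule.tars, rule.reps, and rule.envs hold the compiled Target, Replacement,
# and LocalEnvironment objects respectively
```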
268 | 269 | Arguments: 270 | word -- the word to which the rule is to be applied (Word) 271 | 272 | Raises RuleFailed if the rule did not apply to the word. 273 | ''' 274 | logger.debug(f'This rule: `{self}`') 275 | # Get all target matches, filtered by given indices 276 | logger.debug('Begin matching targets') 277 | matches = [] 278 | for i, target in enumerate(self.tars): 279 | logger.debug(f'> Matching `{target}`') 280 | if target: 281 | pattern, indices = target 282 | else: 283 | pattern, indices = [], None 284 | if not pattern: # All pos's 285 | logger.debug(f'>> Null target matched all positions in range 1..{len(word)}') 286 | _matches = [(pos, pos, [], i) for pos in range(1, len(word))] 287 | else: 288 | _matches = [] 289 | for pos in range(1, len(word)): # Find all matches 290 | match, rpos, catixes = word.matchPattern(pattern, pos) 291 | if match: # pattern matches at pos 292 | logger.debug(f'>> Target matched `{word[pos:rpos]}` at {pos}') 293 | _matches.append((pos, rpos, catixes, i)) 294 | if not _matches: 295 | logger.debug('>> No matches for this target') 296 | # Filter only those matches selected by the given indices 297 | if indices is None: 298 | matches += _matches 299 | elif _matches: 300 | matches += [_matches[ix] for ix in indices if -len(_matches) <= ix < len(_matches)] 301 | matches.sort() 302 | logger.debug(f'> Final matches at positions {[match[0] for match in matches]}') 303 | if not matches: 304 | logger.debug('No matches') 305 | raise RuleFailed 306 | # Filter only those matches that fit the environment - also record the corresponding replacement 307 | logger.debug('Check matches against environments and exceptions') 308 | reps = [] 309 | for i in reversed(range(len(matches))): 310 | logger.debug(f'> Checking match at {matches[i][0]}') 311 | check = self.checkMatch(matches[i], word) 312 | if not check: 313 | logger.debug(f'>> Match at {matches[i][0]} failed') 314 | del matches[i] 315 | else: 316 | # Find the correct replacement 317 | logger.debug('>> Get replacement for this match') 318 | rule = self 319 | for j in range(check-1): 320 | rule = rule.otherwise 321 | _reps = rule.reps 322 | match = matches[i][3] 323 | if isinstance(_reps, tuple): # Copy/move 324 | reps.append((_reps[0], _reps[1][match%len(_reps[1])])) 325 | else: 326 | reps.append(_reps[match%len(_reps)]) 327 | logger.debug(f'>>> Found {reps[-1]}') 328 | if not reps: 329 | logger.debug('No matches matched environment') 330 | raise RuleFailed 331 | reps.reverse() 332 | matches = sorted(zip(matches, reps), reverse=True) 333 | # Filter overlaps 334 | logger.debug('Filter out overlapping matches') 335 | if self.flags.rtl: 336 | logger.debug('> Proceeding right-to-left') 337 | i = 1 338 | while i < len(matches): 339 | if matches[i][0][1] > matches[i-1][0][0]: # Overlap 340 | logger.debug(f'>> Match at {matches[i][0][0]} overlaps match at {matches[i-1][0][0]}') 341 | del matches[i] 342 | else: 343 | i += 1 344 | else: 345 | logger.debug('> Proceeding left-to-right') 346 | for i in reversed(range(len(matches)-1)): 347 | if matches[i][0][0] < matches[i+1][0][1]: # Overlap 348 | logger.debug(f'>> Match at {matches[i][0][0]} overlaps match at {matches[i+1][0][0]}') 349 | del matches[i] 350 | logger.debug(f'Applying matches to `{word}`') 351 | for match, rep in matches: 352 | logger.debug(f'> Changing `{list(word[match[0]:match[1]])}` to `{rep}` at {match[0]}') 353 | word = word.applyMatch(match, rep) 354 | return word 355 | 356 | def checkMatch(self, match, word): 357 | pos, rpos = match[:2] 358 | if 
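The integer returned by `checkMatch` (whose body continues below) tells `apply` which rule in the otherwise-chain supplied the match. A summary of the convention, as consumed by the replacement lookup above:

```python
# checkMatch return convention:
#   0     -- nothing matched anywhere in the chain; the match is discarded
#   1     -- this rule's own environments matched; use this rule's reps
#   n > 1 -- the (n-1)th 'otherwise' rule matched; apply() walks rule.otherwise
#            n-1 times to fetch the corresponding replacement
```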
any(word.matchEnv(exc, pos, rpos) for exc in self.excs): # If there are exceptions, does any match? 359 | logger.debug('>> Matched an exception, check the "else" rule') 360 | elif any(word.matchEnv(env, pos, rpos) for env in self.envs): # Does any environment match? 361 | logger.debug('>> Matched an environment, check succeeded') 362 | return 1 363 | elif self.excs: # Are there exceptions? 364 | logger.debug('>> Environments and exceptions both don\'t match, check failed') 365 | return 0 366 | else: 367 | logger.debug('>> Environment doesn\'t match, check "else" rule') 368 | if self.otherwise is not None: # Try checking otherwise 369 | check = self.otherwise.checkMatch(match, word) 370 | return check + (1 if check else 0) 371 | else: 372 | logger.debug('>> No "else" rule, check failed') 373 | return 0 374 | 375 | @dataclass 376 | class RuleBlock(list): 377 | '''Groups a block of sound changes together. 378 | 379 | Instance variables: 380 | flags -- flags for altering execution (Flags) 381 | ''' 382 | ruleset: InitVar[list] 383 | flags: Flags = Flags() 384 | 385 | def __post_init__(self, ruleset): 386 | list.__init__(self, ruleset) 387 | 388 | def apply(self, word): 389 | from random import randint 390 | applied = False 391 | rules = [] # We use a list to store rules, since they may be applied multiple times 392 | values = [] # We have a parallel list for storing the remaining 'persist' count per rule 393 | for _rule in self: 394 | # We want _rule to run before the stored rules, but to be placed at the end instead 395 | rules.append(_rule) 396 | values.append(_rule.flags.persist) 397 | for rule in [_rule]+rules[:-1]: 398 | flags = rule.flags 399 | if not flags.ditto or (flags.ditto != 1) ^ applied: 400 | for j in range(flags.repeat): 401 | if randint(1, 100) <= flags.chance: 402 | applied = True 403 | wordin = word 404 | try: 405 | word = rule.apply(word) 406 | except RuleFailed: 407 | applied = False 408 | logger.info(f'`{rule}` does not apply to `{word}`') 409 | break 410 | except RuleError as e: 411 | logger.warning(f'`{rule}` execution suffered an error: {e}') 412 | break 413 | if wordin == word: 414 | logger.info(f'`{rule}` does not change `{word}`') 415 | break 416 | else: 417 | logger.info(f'`{wordin}` -> `{rule}` -> `{word}`') 418 | else: 419 | applied = False 420 | logger.info(f'`{rule}` was randomly not run on `{word}`') 421 | if flags.stop and (flags.stop != 1) ^ applied: 422 | return word 423 | for i in reversed(range(len(rules))): 424 | values[i] -= 1 425 | if values[i] == 0: # If the rule has 'expired', discard it 426 | del rules[i] 427 | del values[i] 428 | return word 429 | 430 | @dataclass 431 | class Line: 432 | word: Word = None 433 | comment: str = None 434 | 435 | def __str__(self): 436 | components = [] 437 | if self.word is not None: 438 | components.append(str(self.word)) 439 | if self.comment is not None: 440 | components.append(f'//{self.comment}') 441 | return ' '.join(components) 442 | 443 | # == Functions == # 444 | def parseWordset(wordset, graphs=(), separator='', syllabifier=None): 445 | '''Parses a wordlist. 446 | 447 | Arguments: 448 | wordset -- the words to be parsed (str or list) 449 | graphs -- list of graphemes used to parse the words (list) 450 | 451 | Returns a list.
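A sketch of the `Line` values `parseWordset` (continuing below) produces for some hypothetical inputs, showing how comments stay attached to their lines:

```python
# 'kata // cliff'  -> Line(word=Word('kata '), comment=' cliff')
# '// section two' -> Line(comment=' section two')
# ''               -> Line()  (a blank line is preserved as an empty Line)
```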
452 | ''' 453 | if isinstance(wordset, str): 454 | wordset = wordset.splitlines() 455 | _wordset = [] 456 | for word in wordset: 457 | if isinstance(word, Word): 458 | line = Line(word=word) 459 | elif isinstance(word, Line): 460 | line = word 461 | elif word.startswith('//'): # Is a comment 462 | line = Line(comment=word[2:]) 463 | elif '//' in word: # Contains a comment 464 | word, comment = word.split('//', 1) 465 | line = Line(word=Word(word, graphs, separator, syllabifier), comment=comment) 466 | elif word: 467 | line = Line(word=Word(word, graphs, separator, syllabifier)) 468 | else: 469 | line = Line() 470 | _wordset.append(line) 471 | return _wordset 472 | 473 | def tokeniseCategory(line, linenum=0): 474 | for match in CATEGORY_REGEX.finditer(line): 475 | type = match.lastgroup 476 | value = match.group() 477 | column = match.start() 478 | if type == 'OP': 479 | value = value.strip() 480 | elif type == 'UNKNOWN': 481 | raise CompilerError(f'unexpected character', value, linenum, column) 482 | yield Token(type, value, linenum, column) 483 | 484 | def compileCategory(line, linenum=0, cats=None): 485 | tokens = list(tokeniseCategory(line, linenum)) 486 | if [token.type for token in tokens] != ['CATEGORY', 'OP', 'VALUES']: 487 | raise FormatError(f'{line!r} is not a category definition') 488 | name, op, values = [token.value for token in tokens] 489 | if ',' not in values: 490 | values += ',' 491 | cat = Cat.make(f'[{values}]', cats, name) 492 | if op == '=': 493 | return {name: cat} 494 | else: 495 | if cats is None or name not in cats: 496 | raise TokenError(f'category {name!r} is not defined', tokens[1]) 497 | if op == '+=': 498 | return {name: cats[name]+cat} 499 | elif op == '-=': 500 | return {name: cats[name]-cat} 501 | else: 502 | raise TokenError('invalid category operation', tokens[1]) 503 | 504 | def tokeniseFlags(line, linenum=0, colstart=None): 505 | for match in FLAG_REGEX.finditer(line, colstart): 506 | type = match.lastgroup 507 | value = match.group() 508 | column = match.start() 509 | if type == 'UNKNOWN': 510 | raise CompilerError(f'unexpected character', value, linenum, column) 511 | yield Token(type, value, linenum, column) 512 | 513 | def compileFlags(tokens): 514 | tokens = list(tokens) 515 | binaryflags = ('ignore', 'rtl') 516 | ternaryflags = ('ditto', 'stop') 517 | numericflags = {'repeat': MAX_RUNS, 'persist': MAX_RUNS, 'chance': 100} # Maximum values 518 | flags = {} 519 | for flag, token in partitionTokens(tokens, 'SEPARATOR'): 520 | if not flag: 521 | raise TokenError('expected flag', token) 522 | elif flag[0].type == 'NEGATION': 523 | name = flag[-1].value 524 | if len(flag) == 1: 525 | raise TokenError('expected flag name', token) 526 | elif flag[1].type != 'FLAG': 527 | raise TokenError('expected flag name', flag[1]) 528 | elif name not in ternaryflags: 529 | raise TokenError('invalid ternary flag name', flag[1]) 530 | elif len(flag) == 2: 531 | flags[name] = -1 532 | else: 533 | raise TokenError('expected semicolon', flag[2]) 534 | elif flag[0].type == 'FLAG': 535 | name = flag[0].value 536 | arg = flag[-1].value 537 | if name not in FLAGS: 538 | raise TokenError('invalid flag name', flag[0]) 539 | elif len(flag) == 1: 540 | if name in numericflags: 541 | flags[name] = numericflags[name] # Set to maximum value 542 | else: 543 | flags[name] = 1 544 | elif flag[1].type != 'COLON': 545 | raise TokenError('expected colon or semicolon', flag[1]) 546 | elif name not in numericflags: 547 | raise TokenError('invalid numeric flag name', flag[1]) 548 | elif 
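`compileFlags` above accepts three flag shapes: bare binary/ternary flags, `!`-negated ternary flags, and `name: N` numeric flags, separated by semicolons. A sketch (`colstart` is passed explicitly, since the flag tokeniser is normally invoked mid-line):

```python
from conlanger.src.sce import tokeniseFlags, compileFlags

flags = compileFlags(tokeniseFlags('rtl; repeat: 2; !stop', 0, 0))
print(flags)
# -> Flags(ignore=0, ditto=0, stop=-1, rtl=1, repeat=2, persist=1, chance=100)
# '!' negates a ternary flag (stored as -1); bare numeric flags take their maximum
```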
len(flag) == 2: 549 | raise TokenError('expected integer argument', token) 550 | elif flag[2].type != 'ARGUMENT': 551 | raise TokenError('expected integer argument', flag[2]) 552 | elif not (1 <= int(arg) <= numericflags[name]): 553 | raise TokenError('argument out of range', flag[2]) 554 | elif len(flag) == 3: 555 | flags[name] = int(arg) 556 | else: 557 | raise TokenError('expected semicolon', flag[3]) 558 | else: 559 | raise TokenError('invalid flag', flag[0]) 560 | return Flags(**flags) 561 | 562 | def tokeniseMetarule(line, linenum=0): 563 | for match in METARULE_REGEX.finditer(line): 564 | type = match.lastgroup 565 | value = match.group() 566 | column = match.start() 567 | if type == 'METARULE': 568 | value = value[1:] 569 | elif type == 'SPACE': 570 | yield Token(type, value, linenum, column) 571 | yield from tokeniseFlags(line, linenum, match.end()) 572 | break 573 | elif type == 'UNKNOWN': 574 | raise CompilerError('unexpected character', value, linenum, column) 575 | yield Token(type, value, linenum, column) 576 | 577 | def compileMetarule(line, linenum=0): 578 | tokens = list(tokeniseMetarule(line, linenum)) 579 | if not tokens: 580 | raise ValueError('tokens cannot be empty') 581 | name = tokens[0].value 582 | for ix, token in enumerate(tokens): 583 | if token.type == 'SPACE': # Found flags 584 | flags = compileFlags(tokens[ix+1:]) 585 | break 586 | else: 587 | ix = len(tokens) 588 | if name == 'block': 589 | flags = Flags() 590 | else: 591 | flags = None 592 | arg = tokens[ix-1].value 593 | if tokens[0].type != 'METARULE': 594 | raise TokenError('expected metarule name', tokens[0]) 595 | elif name not in METARULES: 596 | raise TokenError('invalid metarule name', tokens[0]) 597 | elif name in ('def', 'rule') and flags: 598 | raise TokenError(f'metarule !{name} cannot take flags', tokens[ix]) 599 | elif ix == 1: 600 | if name == 'block': 601 | arg = None 602 | else: 603 | if ix < len(tokens): 604 | token = tokens[ix] 605 | else: 606 | token = Token('', '', linenum, tokens[-1].column+len(tokens[-1].value)) 607 | raise TokenError(f'metarule !{name} requires an argument', token) 608 | elif tokens[1].type != 'COLON': 609 | raise TokenError('expected colon', tokens[1]) 610 | elif ix == 2: 611 | raise TokenError('colon must be followed by an argument', tokens[1]) 612 | elif tokens[2].type != 'NUMBER' and name == 'block': 613 | raise TokenError('metarule !block requires an integer argument', tokens[2]) 614 | elif tokens[2].type != 'IDENTIFIER' and name in ('def', 'rule'): 615 | raise TokenError(f'metarule !{name} requires an alphabetic argument', tokens[2]) 616 | elif ix == 3: 617 | if name == 'block': 618 | arg = int(arg) 619 | elif ix > 3: 620 | raise TokenError('expected space or newline', tokens[3]) 621 | return name, arg, flags 622 | 623 | def tokeniseRule(line, linenum=0): 624 | colstart = 0 625 | while colstart < len(line): 626 | match = RULE_REGEX.match(line, colstart) 627 | type = match.lastgroup 628 | value = match.group() 629 | column = match.start() 630 | colstart = match.end() 631 | if type == 'INDICES': 632 | yield Token(type, value[1:], linenum, column) 633 | continue 634 | elif type == 'SPACE': 635 | yield Token(type, value, linenum, column) 636 | yield from tokeniseFlags(line, linenum, colstart) 637 | break 638 | elif type == 'UNKNOWN': 639 | if column == 0: 640 | type = 'TARGET' 641 | value = '' 642 | colstart = 0 643 | else: 644 | raise CompilerError('unexpected character', value, linenum, column) 645 | yield Token(type, value, linenum, column) 646 | colstart = yield from
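A sketch of `compileMetarule`'s three-part return value `(name, arg, flags)` for the two metarule shapes (the `!def` case relies on the `ix == 3` handling as fixed above):

```python
from conlanger.src.sce import compileMetarule

print(compileMetarule('!block: 3 rtl'))
# -> ('block', 3, Flags(..., rtl=1, ...))
print(compileMetarule('!def: dropfinal'))
# -> ('def', 'dropfinal', None)
```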
tokenisePattern(line, colstart, linenum) 647 | else: 648 | yield Token('END', '', linenum, colstart) 649 | 650 | def compileIndexedPattern(pattern, cats=None, reduceindices=True): 651 | if pattern[-1].type == 'INDICES': 652 | indices = [int(index) for index in pattern[-1].value.split('|')] 653 | if reduceindices: 654 | indices = [index-(1 if index>0 else 0) for index in indices] 655 | pattern = pattern[:-1] 656 | else: 657 | indices = None 658 | return compilePattern(pattern, cats), indices 659 | 660 | def compileTarget(pattern, cats=None): 661 | return Target(*compileIndexedPattern(pattern, cats)) 662 | 663 | def compileEpenthesis(pattern, cats=None): 664 | pattern, indices = compileIndexedPattern(pattern, cats, False) 665 | return Target([], indices), Replacement(pattern) 666 | 667 | def compileReplacement(pattern, cats=None): 668 | pattern, indices = compileIndexedPattern(pattern, cats, False) 669 | if indices is not None: 670 | raise FormatError('replacement field cannot contain indices') 671 | else: 672 | return Replacement(pattern) 673 | 674 | def compileEnvironment(pattern, cats=None, reduceindices=True): 675 | patterns = [] 676 | for pattern, sep in partitionTokens(pattern, 'PLACEHOLDER'): 677 | if sep is not None and patterns: # Only one placeholder is allowed, which then follows the first pattern 678 | raise TokenError('invalid placeholder', sep) 679 | patterns.append(pattern) 680 | if len(patterns) == 2: 681 | left, right = patterns 682 | env = LocalEnvironment(compilePattern(left, cats), compilePattern(right, cats)) 683 | elif len(patterns) == 1: 684 | pattern = patterns[0] 685 | env = GlobalEnvironment(*compileIndexedPattern(pattern, cats, reduceindices)) 686 | return env or None 687 | 688 | COMPILERS = { 689 | 'EPENTHESIS': compileEpenthesis, 690 | 'DELETION': compileTarget, 691 | 'TARGET': compileTarget, 692 | 'MOVE': lambda pattern, cats: compileField(pattern, cats, 'AND', False), 693 | 'COPY': lambda pattern, cats: compileField(pattern, cats, 'AND', False), 694 | 'REPLACEMENT': compileReplacement, 695 | 'ENVIRONMENT': lambda pattern, cats: compileField(pattern, cats, 'AND'), 696 | 'EXCEPTION': lambda pattern, cats: compileField(pattern, cats, 'AND'), 697 | } 698 | 699 | def compileField(tokens, cats=None, delimiter='OR', reduceindices=True): 700 | if not tokens: 701 | return [] 702 | if tokens[-1].type == delimiter: 703 | raise TokenError('invalid delimiter', tokens[-1]) 704 | fieldmarker = tokens[0].type 705 | _compile = COMPILERS.get(fieldmarker, lambda pattern, cats: compileEnvironment(pattern, cats, reduceindices)) 706 | if fieldmarker in COMPILERS: 707 | tokens = tokens[1:] 708 | field = [] 709 | for pattern, sep in partitionTokens(tokens, delimiter): 710 | if not pattern: 711 | raise TokenError('unexpected delimiter', sep) 712 | field.append(_compile(pattern, cats)) 713 | # Final replacements field handling 714 | if fieldmarker in ('MOVE', 'COPY'): 715 | return fieldmarker.lower(), field 716 | elif fieldmarker == 'EPENTHESIS': 717 | return map(list, zip(*field)) 718 | return field 719 | 720 | FIELD_MARKERS = { 721 | 'EPENTHESIS': 'reps', 722 | 'DELETION': 'tars', 723 | 'TARGET': 'tars', 724 | 'MOVE': 'reps', 725 | 'COPY': 'reps', 726 | 'REPLACEMENT': 'reps', 727 | 'ENVIRONMENT': 'envs', 728 | 'EXCEPTION': 'excs', 729 | } 730 | 731 | def compileRule(line, linenum=0, cats=None): 732 | from math import ceil 733 | if isinstance(line, str): 734 | tokens = list(tokeniseRule(line, linenum)) 735 | else: 736 | tokens = line 737 | line = '' 738 | if tokens[0].type == 'END': 
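Index handling in `compileIndexedPattern` above, summarised: user-facing indices are 1-based (negative indices count from the end), and positive indices are shifted down by one for internal 0-based use; `reduceindices=False` (epenthesis, move/copy) keeps them as written.

```python
# 'a@1|3' -> pattern for 'a', indices [0, 2]
# 'a@-1'  -> pattern for 'a', indices [-1]
```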
739 | tokens = [] 740 | elif tokens[0].type not in FIELD_MARKERS: 741 | raise TokenError('unexpected token', tokens[0]) 742 | fields = { 743 | 'otherwise': None, 744 | 'flags': Flags(), 745 | 'rule': line 746 | } 747 | # Extract flags 748 | for ix, token in enumerate(tokens): 749 | if token.type == 'SPACE': 750 | fields['flags'] = compileFlags(tokens[ix+1:]) 751 | tokens[ix].type = 'END' 752 | break 753 | # Extract remainder of fields 754 | i = None 755 | for j, token in enumerate(tokens): 756 | type, value = token 757 | if type in FIELD_MARKERS or type == 'END': 758 | if i is not None: 759 | field = FIELD_MARKERS[tokens[i].type] 760 | if field in fields: 761 | raise TokenError('unexpected field marker', tokens[i]) 762 | fields[field] = tokens[i:j] 763 | i = j 764 | if type in ('MOVE', 'COPY', 'REPLACEMENT'): 765 | if 'reps' in fields: # Detected an otherwise 766 | fields['otherwise'] = compileRule(fields.get('tars', []) + tokens[j:ix+1], cats=cats) 767 | break 768 | elif type == 'END': 769 | break 770 | # Check for restricted field combinations 771 | if 'tars' in fields and 'reps' in fields: 772 | if fields['tars'][0].type == 'DELETION': 773 | raise TokenError('replacement field not allowed with deletion', fields['reps'][0]) 774 | if fields['reps'][0].type == 'EPENTHESIS': 775 | raise TokenError('target field not allowed with epenthesis', fields['tars'][0]) 776 | # Compile fields 777 | fields['tars'] = compileField(fields.get('tars', []), cats) or [[]] 778 | fields['reps'] = compileField(fields.get('reps', []), cats) or [[]] 779 | fields['envs'] = compileField(fields.get('envs', []), cats) or [[]] 780 | fields['excs'] = compileField(fields.get('excs', []), cats) 781 | # Handle indexed epenthesis 782 | if isinstance(fields['reps'], map): # Epenthesis 783 | fields['tars'], fields['reps'] = fields['reps'] 784 | return Rule(**fields) 785 | 786 | def compileLine(line, linenum=0, cats=None): 787 | if not line: 788 | return None 789 | # Attempt to tokenise as category 790 | with suppress(CompilerError, FormatError): 791 | return compileCategory(line, linenum, cats) 792 | # Attempt to tokenise as metarule 793 | with suppress(CompilerError, FormatError): 794 | return compileMetarule(line, linenum) 795 | # Attempt to tokenise as rule 796 | return compileRule(line, linenum, cats) 797 | 798 | def makeBlock(ruleset, start=None, num=None, defs=None): 799 | if defs is None: 800 | defs = {} 801 | else: 802 | defs = defs.copy() 803 | cats = [] 804 | block = [] 805 | if start is None: 806 | i = 0 807 | else: 808 | i = start 809 | while len(block) != num and i < len(ruleset): 810 | rule = ruleset[i] 811 | i += 1 812 | if isinstance(rule, Rule): # Rule 813 | block.append(rule) 814 | elif isinstance(rule, dict): # Category 815 | cats += rule.items() 816 | elif isinstance(rule, tuple): # Metarule 817 | name, arg, flags = rule 818 | if name == 'block': 819 | if arg is not None: 820 | _block, _cats, i, defs = makeBlock(ruleset, i, arg, defs) 821 | else: 822 | _block, _cats = makeBlock(ruleset, i, arg, defs) 823 | i = len(ruleset) 824 | block.append(RuleBlock(_block, flags)) 825 | cats += _cats 826 | elif name == 'def': 827 | _block, _cats, i, defs = makeBlock(ruleset, i, 1, defs) 828 | defs[arg] = _block 829 | cats += _cats 830 | elif name == 'rule': 831 | block.extend(defs[arg]) 832 | if start is None: 833 | return block, cats 834 | else: 835 | return block, cats, i, defs 836 | 837 | def compileRuleset(ruleset, cats=None): 838 | if isinstance(ruleset, str): 839 | ruleset = ruleset.splitlines() 840 | if cats is 
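`makeBlock` and `compileRuleset` (continuing below) evaluate the metarules: `!def` stores the next rule without running it, `!rule` replays a stored rule, and `!block` groups rules into a nested `RuleBlock`. A sketch, assuming the import path per this repo's layout:

```python
from conlanger.src.sce import compileRuleset

rules, cats = compileRuleset('''
V = a,i,u          // a category definition, visible to later rules
!def: dropfinal    // store the next rule under a name...
- [V] / _#
e > i
!rule: dropfinal   // ...and replay it here
''')
print(len(rules))  # -> 2: 'e > i', then the replayed '- [V] / _#'
```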
None: 841 | cats = {} 842 | else: 843 | cats = cats.copy() 844 | _ruleset = [] 845 | for linenum, line in enumerate(ruleset): 846 | # Remove comments 847 | line = line.split('//')[0].strip() 848 | # Compile 849 | try: 850 | rule = compileLine(line, linenum, cats) 851 | except CompilerError as e: 852 | logger.warning(f'{line!r} failed to compile due to bad formatting: {e}') 853 | except Exception as e: 854 | logger.warning(f'{line!r} failed to compile due to an unexpected error: {e}') 855 | else: 856 | if isinstance(rule, dict): # Category 857 | cats.update(rule) 858 | _ruleset.append(rule) 859 | # Evaluate meta-rules 860 | ruleset, _cats = makeBlock(_ruleset) 861 | return RuleBlock(ruleset), _cats 862 | 863 | def setupLogging(filename=__location__, loggername='sce'): 864 | global logger 865 | if filename is not None: 866 | logging.config.fileConfig(filename) 867 | logger = logging.getLogger(loggername) 868 | 869 | def run(wordset, ruleset, cats=None, syllabifier=None, output='list'): 870 | '''Applies a set of sound change rules to a set of words. 871 | 872 | Arguments: 873 | wordset -- the words to which the rules are to be applied (list) 874 | ruleset -- the rules which are to be applied to the words (RuleBlock) 875 | cats -- the initial categories to be used in ruleset compiling (dict) 876 | syllabifier -- the syllabifier function to use for syllabifying words (RulesSyllabifier) 877 | output -- what form to provide the output in - one of 'list', 'as-is', 'str' (str) 878 | 879 | Returns a str or list. 880 | ''' 881 | if not ruleset or not wordset: # One of these is blank so do nothing 882 | return wordset 883 | cats = parseCats(cats or {}) 884 | ruleset, _cats = compileRuleset(ruleset, cats) # Compile ruleset first so we can use the graphs it contains 885 | # Try to get graphs and separator from the initial categories 886 | graphs = cats.get('graphs', ()) 887 | separator = cats.get('separator', [''])[0] 888 | # Ruleset overrides externally-supplied categories 889 | for name, cat in _cats: 890 | if name == 'graphs': 891 | graphs = cat 892 | elif name == 'separator': 893 | separator = cat[0] 894 | else: 895 | break 896 | wordset = parseWordset(wordset, graphs, separator, syllabifier) 897 | for line in wordset: 898 | if line.word is not None: # There's a word 899 | logger.info(f'This word: {line.word}') 900 | logger.debug(f'Segments: {line.word.phones}') 901 | line.word = ruleset.apply(line.word) 902 | if output != 'as-is': 903 | wordset = [str(line) for line in wordset] 904 | if output == 'str': 905 | wordset = '\n'.join(wordset) 906 | return wordset 907 | 908 | apply_ruleset = run 909 | 910 | # Setup logging 911 | setupLogging() 912 | --------------------------------------------------------------------------------
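Finally, an end-to-end sketch of the public entry point `run` (also exposed as `apply_ruleset`); the expected output is hedged on `core.py`'s `Word` matching and string semantics, which live outside this file:

```python
from conlanger.src import sce

words = ['atka', 'tata']
changes = '''
t > d / a_a
tk > kk
'''
print(sce.run(words, changes))  # -> ['akka', 'tada']
```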