├── __init__.py
├── src
│   ├── __init__.py
│   ├── pyle.py
│   ├── logging.conf
│   ├── syllables.py
│   ├── lang.py
│   ├── syntax.py
│   ├── gen.py
│   ├── core.py
│   ├── _pattern.py
│   └── sce.py
├── .gitignore
├── license.txt
└── readme.md
/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | langs/ 3 | trees/ 4 | examples.py 5 | correspondences.py 6 | interface.py 7 | *.dat 8 | -------------------------------------------------------------------------------- /src/pyle.py: -------------------------------------------------------------------------------- 1 | '''Collection of conlanging tools 2 | '''''' 3 | ==================================== To-do ==================================== 4 | === Bug-fixes === 5 | 6 | === Implementation === 7 | 8 | === Features === 9 | 10 | === Style === 11 | ''' 12 | 13 | -------------------------------------------------------------------------------- /src/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,sce 3 | 4 | [handlers] 5 | keys=consoleHandler 6 | 7 | [formatters] 8 | keys=sceFormatter 9 | 10 | [logger_root] 11 | level=WARNING 12 | handlers=consoleHandler 13 | 14 | [logger_sce] 15 | level=WARNING 16 | handlers=consoleHandler 17 | qualname=sce 18 | propagate=0 19 | 20 | [handler_consoleHandler] 21 | class=StreamHandler 22 | level=DEBUG 23 | formatter=sceFormatter 24 | args=(sys.stdout,) 25 | 26 | [formatter_sceFormatter] 27 | format=%(asctime)s %(levelname)s:%(message)s 28 | datefmt=%d/%m/%Y %H:%M:%S -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Kathryn Spence 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
-------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Conlanger 2 | 3 | Conlanger is a package containing several tools designed primarily to aid conlangers with tedious tasks such as word 4 | generation and diachronics. To use, simply place the package on your Python path, and `import conlanger` into an 5 | interface script. 6 | 7 | ## Syntax 8 | 9 | `conlanger.syntax` is a stand-alone module for generating syntax tree images, using either dependency or 10 | constituency trees, from a textual list-based representation of trees - labelled lists for constituency trees, and 11 | unlabelled lists as an alternative for dependency trees. Requires Pillow as a dependency. 12 | 13 | ## Gen 14 | 15 | `conlanger.gen` is a module for generating words from syllables, where the syllable types 16 | and their graphemes are both distributed according to peaked power law distributions. Additionally, restrictions (both linear 17 | and non-linear) can be placed on what outputs are considered valid. 18 | 19 | ## SCE 20 | 21 | `conlanger.sce` is a module with powerful tools for transforming words according to transformation rules. Documentation 22 | of the rules can be found [here](http://www.dragonlinguistics.com/sce/doc.html). 23 | 24 | ## Lang 25 | 26 | `conlanger.lang` is a module providing support for storing the configuration data for the other modules on a 27 | per-language basis, as well as saving this data to and loading it from file. It also provides shortcuts to utilising the 28 | other modules with a given language, automatically providing the configuration data defined for that language. 29 |
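## Example

A minimal interface script might look like the following sketch. It assumes a language file `langs/mylang.dat` already exists with a gen config named `word` defined in it, and the sound change rule is purely illustrative of SCE notation (see the documentation linked above).

```python
from conlanger import lang, syntax

# Load the per-language configuration saved by conlanger.lang
mylang = lang.load('mylang')

# Generate ten words using the gen config named 'word'
words = mylang.gen('word', num=10)

# Apply a sound change ruleset to the generated words
changed = mylang.apply_ruleset(words, ['a > e / _#'])

# Render a constituency tree to a PIL image and save it
im = syntax.drawTree('[S [NP conlangs] [VP [V are] [A fun]]]', mode='const')
im.save('tree.png')
```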
-------------------------------------------------------------------------------- /src/syllables.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class Syllabifier: 5 | rules: tuple 6 | 7 | def __init__(self, cats, onsets=(), nuclei=(), codas=(), margins=(), constraints=()): 8 | from ._pattern import parsePatterns 9 | onsets = parsePatterns(onsets) 10 | nuclei = parsePatterns(nuclei) 11 | codas = parsePatterns(codas) 12 | margins = parsePatterns(margins) 13 | constraints = parsePatterns(constraints) 14 | rules = [] 15 | rules.extend(generateNonFinals(codas, onsets, nuclei)) # Medials 16 | rules.extend(generateFinals(codas, margins)) # Finals 17 | rules.extend(generateNonFinals(margins, onsets, nuclei)) # Initials 18 | self.rules = tuple(rule for rule in rules if checkValid(rule[0], constraints)) 19 | 20 | def __call__(self, word): 21 | breaks = [] 22 | # Step through the word 23 | pos = 0 24 | while pos < len(word): 25 | for rule, _breaks in self.rules: 26 | if rule == ['_', '#'] and pos in breaks: 27 | continue 28 | match, rpos = word.matchPattern(rule, pos)[:2] 29 | if match: 30 | # Compute and add breaks for this pattern 31 | for ix in _breaks: 32 | # Syllable breaks must be within the word and unique 33 | if 0 < pos+ix < len(word) and pos+ix not in breaks: 34 | breaks.append(pos+ix) 35 | # Step past this match 36 | pos = rpos 37 | if rule[-1] == '#': 38 | pos -= 1 39 | break 40 | else: # No matches here 41 | pos += 1 42 | return tuple(breaks) 43 | 44 | def generateNonFinals(codas, onsets, nuclei): 45 | rules = [] 46 | for crank, coda in enumerate(codas): 47 | if coda[-1] == '#': 48 | continue 49 | elif coda[-1] == '_': 50 | coda = coda[:-1] 51 | for orank, onset in enumerate(onsets): 52 | if onset[0] == '#': 53 | if coda == ['#']: 54 | onset = onset[1:] 55 | else: 56 | continue 57 | if onset == ['_']: 58 | onset = [] 59 | for nrank, nucleus in enumerate(nuclei): 60 | if nucleus[0] == '#': 61 | if coda == ['#'] and onset == []: 62 | nucleus = nucleus[1:] 63 | else: 64 | continue 65 | pattern = coda + onset + nucleus 66 | breaks = [len(coda)] 67 | if pattern[-1] == '#': 68 | breaks.append(len(pattern)-1) 69 | rank = crank + orank + nrank 70 | rules.append((pattern, breaks, rank)) 71 | return (r[:2] for r in sorted(rules, key=lambda r: r[2])) 72 | 73 | def generateFinals(codas, margins): 74 | rules = [] 75 | for mrank, margin in enumerate([margin for margin in margins if margin[-1] == '#']): 76 | if margin == ['_', '#']: 77 | margin = ['#'] 78 | for crank, coda in enumerate(codas): 79 | if coda[-1] == '#': 80 | if margin == ['#']: 81 | coda = coda[:-1] 82 | else: 83 | continue 84 | pattern = coda + margin 85 | breaks = [0 if coda == ['_'] else len(coda)] 86 | rank = crank + mrank 87 | rules.append((pattern, breaks, rank)) 88 | return (r[:2] for r in sorted(rules, key=lambda r: r[2])) 89 | 90 | def checkValid(rule, constraints): 91 | from .core import Cat # Deferred import - core imports this module at load time 92 | for constraint in constraints: 93 | for rpos in range(len(rule)-len(constraint)+1): # +1 so constraints aligned with the end of the rule are checked too 94 | for cpos, ctoken in enumerate(constraint): 95 | rtoken = rule[rpos+cpos] 96 | if isinstance(rtoken, str) and isinstance(ctoken, str): 97 | if rtoken == ctoken: 98 | continue 99 | elif isinstance(rtoken, str) and isinstance(ctoken, Cat): 100 | if rtoken in ctoken: 101 | continue 102 | elif isinstance(rtoken, Cat) and isinstance(ctoken, Cat): 103 | if rtoken <= ctoken: 104 | continue 105 | break 106 | else: 107 | return False 108 | return True 109 |
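# A rough usage sketch (hypothetical categories and patterns; it assumes the pattern
# parser accepts the '_' and '#' markers, as lang.py's default phonotactics do):
#
#     from .core import Cat, Word
#     from ._pattern import parsePatterns
#     cats = {'C': Cat(['p', 't', 'k'], 'C'), 'V': Cat(['a', 'i', 'u'], 'V')}
#     phonotactics = parsePatterns({'onsets': ['_', '[C]_'], 'nuclei': ['[V]'],
#                                   'codas': ['_', '_[C]'], 'margins': ['#_', '_#']}, cats)
#     syllabify = Syllabifier(cats, **phonotactics)
#     word = Word('patka', graphs=cats['C']+cats['V'], syllabifier=syllabify)
#     word.syllables  # -> a tuple of indices where syllable breaks fall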
-------------------------------------------------------------------------------- /src/lang.py: -------------------------------------------------------------------------------- 1 | '''Create and manipulate languages 2 | 3 | Classes: 4 | Config -- collection of gen.py configuration data 5 | Language -- represents a language 6 | 7 | Functions: 8 | load -- load the data from the named language file 9 | save -- save the given language's data to file 10 | '''''' 11 | ==================================== To-do ==================================== 12 | === Bug-fixes === 13 | 14 | === Implementation === 15 | Implement confirming overwriting save data - needs UI 16 | Maybe add different modes for each positional syllable type 17 | 18 | === Features === 19 | Add generating every possible word/root 20 | Language.apply_ruleset will be replaced by calls to the diachronics module, once that exists 21 | 22 | === Style === 23 | Consider where to raise/handle exceptions 24 | ''' 25 | 26 | from dataclasses import dataclass 27 | import os 28 | import json 29 | from .core import Cat, Syllabifier, parseCats 30 | from ._pattern import parsePatterns, unparsePattern 31 | from . import gen, sce 32 | 33 | os.chdir(os.path.dirname(os.path.abspath(__file__))) # Language files are in conlanger/langs/ 34 | 35 | ## Classes 36 | 37 | @dataclass 38 | class Config: 39 | patterns: dict 40 | constraints: list 41 | sylrange: range 42 | sylmode: list 43 | patternmode: list 44 | graphmode: list 45 | 46 | @dataclass 47 | class Language: 48 | '''Class for representing a single language. 49 | 50 | Instance variables: 51 | name -- language name (str) 52 | cats -- grapheme categories (dict) 53 | configs -- gen configuration data sets (dict) 54 | syllabifier -- syllabification function (Syllabifier) 55 | 56 | Methods: 57 | gen -- generate words 58 | apply_ruleset -- apply a sound change ruleset to a wordset 59 | ''' 60 | name: str 61 | cats: dict 62 | configs: dict 63 | phonotactics: dict 64 | syllabifier: Syllabifier 65 | 66 | def __init__(self, name='', cats=None, configs=None, phonotactics=None, syllabifier=None): 67 | '''Constructor for Language. 68 | 69 | Arguments: 70 | name -- language name (str) 71 | cats -- grapheme categories (dict) 72 | configs -- configuration data sets (dict) 73 | ''' 74 | self.name = name 75 | self.cats = parseCats(cats or {}) 76 | if 'graphs' not in self.cats: # Category 'graphs' must exist 77 | self.cats['graphs'] = Cat(["'"]) 78 | self.configs = {} 79 | if configs is None: 80 | configs = {} 81 | for config in configs: 82 | _config = configs[config].copy() 83 | _config['patterns'] = parsePatterns(_config['patterns'], self.cats) 84 | _config['constraints'] = parsePatterns(_config['constraints'], self.cats) 85 | _config['sylrange'] = range(_config['sylrange'][0], _config['sylrange'][1]+1) 86 | self.configs[config] = Config(**_config) 87 | phonotactics = parsePatterns(phonotactics or {'onsets': [], 'nuclei': [], 'codas': [], 'margins': []}, self.cats) 88 | self.phonotactics = {'nuclei': phonotactics['nuclei'], 'margins': []} 89 | # Need some default phonotactics instead of empty lists 90 | self.phonotactics['onsets'] = phonotactics['onsets'] or parsePatterns('_') 91 | self.phonotactics['codas'] = phonotactics['codas'] or parsePatterns('_') 92 | for margin in phonotactics['margins']: 93 | if (margin[0] == '#') != (margin[-1] == '#'): 94 | self.phonotactics['margins'].append(margin) 95 | if not any((margin[0] == '#') for margin in self.phonotactics['margins']): 96 | self.phonotactics['margins'].extend(parsePatterns('#_')) 97 | if not any((margin[-1] == '#') for margin in self.phonotactics['margins']): 98 | self.phonotactics['margins'].extend(parsePatterns('_#')) 99 | self.syllabifier = Syllabifier(self.cats, **self.phonotactics) 100 | 101 | @property 102 | def data(self): 103 | data = {} 104 | if self.name != '': 105 | data['name'] = self.name 106 | if self.cats != {}: 107 | data['cats'] = {name: list(cat) for name, cat in self.cats.items()} 108 | if self.configs != {}: 109 | data['configs'] = self.configs 110 | data['syllabifier'] = [] 111 | for rule in self.syllabifier.rules: 112 | rule, indices = rule 113 | rule = rule.copy() 114 | for i in reversed(indices): 115 | rule.insert(i, '$') 116 | data['syllabifier'].append(unparsePattern(rule)) 117 | if self.phonotactics is not None: 118 | data['phonotactics'] = {k: [unparsePattern(pattern) for pattern in v] for k, v in self.phonotactics.items()} 119 | return data 120 | 121 | def gen(self, config, num=1): 122 | '''Generates 'num' words using 'config'. 123 | 124 | Arguments: 125 | config -- config data to use 126 | num -- number of words to generate, 0 generates every possible word (int) 127 | 128 | Returns a list 129 | ''' 130 | if config not in self.configs: 131 | return [] 132 | if num == 0: # Generate every possible word, unimplemented 133 | return [] 134 | return [gen.genFromConfig(self.configs[config], self.cats['graphs'], syllabifier=self.syllabifier) for i in range(num)] 135 | 136 | def apply_ruleset(self, wordset, ruleset, output='list'): 137 | '''Runs the sound change 'ruleset' on the 'wordset'. 138 | 139 | Arguments: 140 | wordset -- the words to be changed (str, list) 141 | ruleset -- the sound changes to apply (str, list) 142 | output -- the desired output format (str) 143 | 144 | Returns a str or list 145 | ''' 146 | return sce.run(wordset, ruleset, self.cats, self.syllabifier, output) 147 | 148 | # == Functions == # 149 | def load(name): 150 | '''Loads language data from file. 151 | 152 | Arguments: 153 | name -- the name of the language file to load from 154 | 155 | Returns a Language 156 | ''' 157 | with open('langs/{}.dat'.format(name.lower()), 'r', encoding='utf-8') as f: 158 | data = json.load(f) 159 | return Language(**data) 160 | 161 | def save(lang): 162 | '''Saves a language to file. 163 | 164 | Arguments: 165 | lang -- the Language to save 166 | ''' 167 | data = lang.data 168 | # Check for existing save data 169 | with open('langs/{}.dat'.format(lang.name.lower()), 'a+', encoding='utf-8') as f: 170 | f.seek(0) 171 | if f.read(): 172 | if True: # Check if the user wants to overwrite this data - not implemented yet 173 | f.seek(0) 174 | f.truncate() 175 | else: 176 | return 177 | json.dump(data, f) 178 | 179 | def getcwd(): 180 | print(os.getcwd()) 181 |
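# A sketch of the JSON layout load() expects in langs/<name>.dat (the field names
# follow Language.__init__ and Language.data; all values here are hypothetical):
#
#     {
#         "name": "Mylang",
#         "cats": {"graphs": ["p", "t", "k", "a", "i", "u"], "V": ["a", "i", "u"]},
#         "configs": {"word": {"patterns": ..., "constraints": ..., "sylrange": [1, 3],
#                              "sylmode": ..., "patternmode": ..., "graphmode": ...}},
#         "phonotactics": {"onsets": [...], "nuclei": [...], "codas": [...], "margins": []}
#     }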
-------------------------------------------------------------------------------- /src/syntax.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ==================================== To-do ==================================== 3 | === Bug-fixes === 4 | 5 | === Implementation === 6 | Multi-word labels require that triangle thing 7 | Need to reimplement discontinuities somehow 8 | 9 | === Features === 10 | 11 | === Style === 12 | ''' 13 | 14 | import re 15 | from dataclasses import dataclass, field 16 | from math import floor 17 | from PIL import Image, ImageDraw, ImageFont 18 | from .core import LangException, Token, CompilerError, TokenError 19 | 20 | ## Constants 21 | SCALE = 10 22 | POINT_SIZE = 16*SCALE 23 | FONT = ImageFont.truetype('calibri.ttf', POINT_SIZE) 24 | GAP_WIDTH = POINT_SIZE # minimum horizontal spacing between trees 25 | GAP_HEIGHT = POINT_SIZE # minimum vertical spacing between layers 26 | LAYER_HEIGHT = GAP_HEIGHT + POINT_SIZE 27 | PADDING = POINT_SIZE # Padding around the edge of the image 28 | 29 | ## Tokens 30 | TOKENS = { 31 | 'LBRACKET': r'\[', 32 | 'RBRACKET': r'\]', 33 | 'WHITESPACE': r' +', 34 | 'QUOTED': r'\".*?\"', 35 | 'INDEX': r'[₀-₉]+', 36 | 'STRING': r'[^\[\]₀-₉ ]+', 37 | 'UNKNOWN': r'.'
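# (Alternation order matters when these are joined into TOKEN_REGEX below: UNKNOWN must stay last so it only matches characters no earlier token type claims.)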
38 | } 39 | TOKEN_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in TOKENS.items())) 40 | 41 | ## Exceptions 42 | class TreeException(LangException): 43 | pass 44 | 45 | class TreeFormatError(TreeException): 46 | pass 47 | 48 | class UnexpectedToken(TokenError): 49 | def __init__(self, token, expected=None): 50 | type = token.type.lower() 51 | if expected is None: 52 | super().__init__(f'Unexpected {type} token', token) 53 | else: 54 | super().__init__(f'Unexpected {type} token, expected {expected}', token) 55 | 56 | ## Classes 57 | @dataclass 58 | class Tree: 59 | label: str 60 | children: list = field(default_factory=list) 61 | 62 | def __len__(self): 63 | return len(self.children) 64 | 65 | def __getitem__(self, key): 66 | return self.children[key] 67 | 68 | def __iter__(self): 69 | yield from self.children 70 | 71 | def __str__(self): 72 | children = ' '.join(str(child) for child in self) 73 | return f'[{self.label} {children}]' if children else f'[{self.label}]' 74 | 75 | def __repr__(self): 76 | return f'Tree("{self}")' 77 | 78 | @staticmethod 79 | def make(string): 80 | tokens = list(tokenise(string)) 81 | if tokens[0].type == 'LBRACKET' and tokens[-1].type == 'RBRACKET': 82 | return compileTree(tokens[1:-1]) 83 | else: 84 | raise TreeFormatError('invalid syntax') 85 | 86 | ## Tree geometry 87 | @property 88 | def isleaf(self): 89 | return self.children == [] 90 | 91 | @property 92 | def depth(self): 93 | if self.isleaf: 94 | return 0 95 | else: 96 | return max(child.depth for child in self) + 1 97 | 98 | ## Tree Size 99 | @property 100 | def childrenwidth(self): 101 | return max(0, GAP_WIDTH*(len(self)-1) + sum(child.width for child in self)) 102 | 103 | @property 104 | def width(self): 105 | return max(self.labelwidth, self.childrenwidth) 106 | 107 | @property 108 | def height(self): 109 | return POINT_SIZE + self.depth * LAYER_HEIGHT 110 | 111 | ## Label 112 | @property 113 | def labelwidth(self): 114 | return FONT.getsize(self.label)[0] 115 | 116 | @property 117 | def labelmiddle(self): 118 | if len(self) <= 1: 119 | return floor(self.width/2) 120 | else: 121 | return floor((self[0].labelmiddle + (self.width - self[-1].width + self[-1].labelmiddle))/2) 122 | 123 | @property 124 | def labelleft(self): 125 | return self.labelmiddle - floor(self.labelwidth/2) 126 | 127 | @property 128 | def deplabelmiddle(self): 129 | ix = None 130 | leaf = None 131 | for i, child in enumerate(self): 132 | if child.isleaf: 133 | if leaf is None: 134 | ix = i 135 | leaf = child 136 | else: 137 | raise TreeFormatError('dependency nodes may have at most one leaf child') 138 | if leaf is None: 139 | return self.labelmiddle 140 | else: 141 | return sum(child.width for child in self[:ix]) + GAP_WIDTH*ix + floor((self.width-self.childrenwidth)/2) + floor(leaf.labelwidth/2) 142 | 143 | @property 144 | def deplabelleft(self): 145 | return self.deplabelmiddle - floor(self.labelwidth/2) 146 | 147 | ## Compiling Functions 148 | def tokenise(string): 149 | for match in TOKEN_REGEX.finditer(string): 150 | type = match.lastgroup 151 | value = match.group() 152 | column = match.start() 153 | if type == 'WHITESPACE': 154 | continue 155 | elif type == 'QUOTED': 156 | type = 'STRING' 157 | value = value.strip('"') 158 | elif type == 'INDEX': 159 | value = value.translate(str.maketrans('₀₁₂₃₄₅₆₇₈₉', '0123456789')) 160 | elif type == 'UNKNOWN': 161 | raise CompilerError('unexpected character', value, 0, column) 162 | yield Token(type, value, 0, column) 163 | 164 | def matchBrackets(tokens, start=0): 
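'''Return the index one past the RBRACKET that closes the LBRACKET at 'start'.'''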
165 | if tokens[start].type != 'LBRACKET': 166 | raise UnexpectedToken(tokens[start], 'lbracket') 167 | depth = 0 168 | for i, token in enumerate(tokens[start:], start+1): 169 | if token.type == 'LBRACKET': 170 | depth += 1 171 | elif token.type == 'RBRACKET': 172 | depth -= 1 173 | if depth == 0: 174 | return i 175 | raise TokenError(f'unmatched bracket', tokens[start]) 176 | 177 | def compileTree(tokens): 178 | if tokens[0].type != 'STRING': 179 | raise UnexpectedToken(tokens[0], 'string') 180 | label = tokens[0].value 181 | children = [] 182 | i = 1 183 | while i < len(tokens): 184 | type, value = tokens[i] 185 | if type == 'LBRACKET': 186 | j = matchBrackets(tokens, i) 187 | children.append(compileTree(tokens[i+1:j-1])) 188 | i = j 189 | elif type == 'STRING': 190 | children.append(Tree(value)) 191 | i += 1 192 | else: 193 | raise UnexpectedToken(tokens[i]) 194 | return Tree(label, children) 195 | 196 | ## Drawing Functions 197 | def drawDependency(tree, draw, leaftop, top, left): 198 | # Draw label 199 | labelcolour = 'red' if tree.isleaf else 'blue' 200 | draw.text((left+tree.deplabelleft, top), tree.label, labelcolour, FONT) 201 | # Draw descendents 202 | linetop = (left+tree.deplabelmiddle, top+POINT_SIZE+SCALE) # We want 1px gap between label and line after rescaling 203 | top += LAYER_HEIGHT 204 | left += floor((tree.width - tree.childrenwidth)/2) 205 | for child in tree: 206 | if child.isleaf: 207 | _top = leaftop 208 | else: 209 | _top = top 210 | # Draw line 211 | linebottom = (left+child.deplabelmiddle, _top-SCALE) # Again, 1px gap between label and line after rescaling 212 | linecolour = 'darkgrey' if child.isleaf else 'black' 213 | draw.line([linetop, linebottom], linecolour, SCALE) # Similarly, 1px line width after rescaling 214 | # Draw child 215 | drawDependency(child, draw, leaftop, _top, left) 216 | # Next child 217 | left += child.width + GAP_WIDTH 218 | 219 | def drawConstituency(tree, draw, top, left): 220 | # Draw label 221 | labelcolour = 'red' if tree.isleaf else 'blue' 222 | draw.text((left+tree.labelleft, top), tree.label, labelcolour, FONT) 223 | # Draw descendents 224 | linetop = (left+tree.labelmiddle, top+POINT_SIZE+SCALE) # We want 1px gap between label and line after rescaling 225 | top += LAYER_HEIGHT 226 | left += floor((tree.width - tree.childrenwidth)/2) 227 | for child in tree: 228 | # Draw line 229 | linebottom = (left+child.labelmiddle, top-SCALE) # Again, 1px gap between label and line after rescaling 230 | linecolour = 'darkgrey' if child.isleaf else 'black' 231 | draw.line([linetop, linebottom], linecolour, SCALE) # Similarly, 1px line width after rescaling 232 | # Draw child 233 | drawConstituency(child, draw, top, left) 234 | # Next child 235 | left += child.width + GAP_WIDTH 236 | 237 | def drawTree(string, mode): 238 | tree = Tree.make(string) 239 | size = (tree.width + PADDING*2, tree.height + PADDING*2) 240 | im = Image.new('RGB', size, 'white') 241 | draw = ImageDraw.Draw(im) 242 | if mode == 'dep': 243 | leaftop = PADDING + tree.depth*LAYER_HEIGHT 244 | drawDependency(tree, draw, leaftop, PADDING, PADDING) 245 | else: 246 | drawConstituency(tree, draw, PADDING, PADDING) 247 | return im.resize((size[0]//SCALE, size[1]//SCALE), resample=Image.ANTIALIAS) 248 | -------------------------------------------------------------------------------- /src/gen.py: -------------------------------------------------------------------------------- 1 | '''Generate syllables, words, or roots 2 | 3 | ==================================== To-do 
==================================== 4 | === Bug-fixes === 5 | Doesn't seem to be checking exceptions correctly (not urgent-urgent) 6 | 7 | === Implementation === 8 | Potentially going to be overhauled in the near future 9 | 10 | === Features === 11 | 12 | === Style === 13 | Consider where to raise/handle exceptions 14 | 15 | === Mathematical model === 16 | r is the number of segments 17 | p is the 'dropoff rate' 18 | f(n) = p**n*(1-p)/(1-p**r) is the frequency of the nth most frequent segment (frequencies sum to 1) 19 | 20 | p must be determined by observing that a = (1-p)/(1-p**r) is the frequency of the most frequent segment. From this, we 21 | can estimate p ≈ 1-a, and with a first-order correction, p ≈ (1-a)+(a*(1-a)**r)/(1-a*r*(1-a)**(r-1)). 22 | 23 | P(n) = (1-p**n)/(1-p**r) is the cumulative frequency of the first n segments, and can be found by summing over f(n) 24 | 25 | A probability distribution can then be obtained by finding the inverse of P(n). Let x be a continuous random variable 26 | from 0 to 1 (say, random.random()). Then n = floor(log(1-x*(1-p**r),p)) 27 | 28 | Obtaining a variant with a peak can be done by using two distributions, one reversed, with their modes overlapping. This 29 | can be done by taking the range of x corresponding to the reversed section and rescaling it as follows, where a is the 30 | frequency of the mode, and c the cumulative frequency of the bins before the mode: x -> 1-x/(a+c). Thus, when x < c, we 31 | sample from the reversed distribution, whose mode has frequency a/(a+c); otherwise, we sample from the forward 32 | distribution, whose mode has frequency a/(1-c), rescaling x as x -> (x-c)/(1-c). Note that the mode belongs to this 33 | second distribution. 34 | ''' 35 | 36 | from .core import LangException, Cat, Word 37 | from random import random, choice 38 | from math import log, floor, ceil 39 | 40 | # == Constants == # 41 | MAX_RUNS = 10**5 # maximum number of times something can fail to be generated 42 | 43 | # == Exceptions == # 44 | class ExceededMaxRunsError(LangException): 45 | '''Exception raised when something has failed to be generated too many times.''' 46 | 47 | # == Functions == # 48 | def dist(bins, a=0, x=None): # First bin has frequency a, random variable x 49 | '''Returns an element of 'bins' according to a power law distribution. 50 | 51 | Arguments: 52 | bins -- a non-empty ordered collection of elements (str, list, tuple) 53 | a -- the frequency that the first bin should be selected (0 for equiprobable distribution) (float) 54 | x -- a random variable supplied if the default random.random() is not desired (float) 55 | ''' 56 | # See the docstring titled 'Mathematical Model' for the maths 57 | r = len(bins) 58 | if a <= 0: # Use equiprobable distribution instead 59 | return choice(bins) 60 | if r == 1 or a >= 1: # Only one bin 61 | return bins[0] 62 | if x is None: # No random variable supplied 63 | x = random() 64 | p = (1-a)+(a*(1-a)**r)/(1-a*r*(1-a)**(r-1)) 65 | return bins[floor(log(1-x*(1-p**r), p))] 66 |
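# A worked example of dist() (deterministic because x is supplied): with bins 'abc'
# and a=0.5, the corrected dropoff rate comes out as p = 0.6, giving bin frequencies
# of roughly 0.51, 0.31 and 0.18:
#
#     dist('abc', a=0.5, x=0.1)   # -> 'a'  (x below P(1) ≈ 0.51)
#     dist('abc', a=0.5, x=0.7)   # -> 'b'  (x between P(1) ≈ 0.51 and P(2) ≈ 0.82)
#     dist('abc', a=0.5, x=0.95)  # -> 'c'  (x above P(2))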
67 | def peakedDist(bins, a=0, m=0, c=0): 68 | '''Returns an element of 'bins' according to a peaked power law distribution. 69 | 70 | Arguments: 71 | bins -- an ordered collection of elements (str, list, tuple) 72 | a -- the frequency that the most frequent bin should be selected (0 for equiprobable distribution) (float) 73 | m -- the index of the most frequent bin 74 | c -- the cumulative frequency of bins 0 to m-1 75 | ''' 76 | # See the docstring titled 'Mathematical Model' for the maths 77 | if m <= 0 or c <= 0: # All bins before the mode are ignored 78 | return dist(bins[m:], a) 79 | x = random() 80 | if x < c: # In the left-hand branch 81 | return dist(bins[m::-1], a/(a+c), 1-x/(a+c)) 82 | else: 83 | return dist(bins[m:], a/(1-c), (x-c)/(1-c)) 84 | 85 | def populate(pattern, mode): 86 | '''Generate a word section according to 'pattern' 87 | 88 | Arguments: 89 | pattern -- the pattern to generate (list) 90 | mode -- representation of the mode of the grapheme distribution (list) 91 | (To generate every possible section instead, see populateAll below.) 92 | ''' 93 | result = [] 94 | for token in pattern: 95 | if token.type == 'Category': # Element.type is the class name, capitalised 96 | result.append(peakedDist(token.cat, *mode)) 97 | elif token == '"': 98 | result.append(result[-1]) 99 | else: 100 | result.append(str(token)) 101 | return result 102 | 103 | def populateAll(pattern): 104 | results = [[]] 105 | for token in pattern: 106 | if token.type == 'Category': 107 | temp = [] 108 | for result in results: 109 | for graph in token.cat: 110 | temp.append(result+[graph]) 111 | results = temp 112 | elif token == '"': 113 | for result in results: 114 | result.append(result[-1]) 115 | else: 116 | for result in results: 117 | result.append(str(token)) 118 | return results 119 | 120 | def genFromConfig(config, graphs=None, separator='', syllabifier=None): 121 | '''Generate a single word as specified by the 'config'. 122 | 123 | Arguments: 124 | config -- the config data to be used to generate this word 125 | graphs -- the set of graphemes used for this word 126 | 127 | Returns a Word 128 | 129 | Raises ExceededMaxRunsError when the word repeatedly fails to be valid 130 | ''' 131 | word = Word(['#'], graphs, separator, syllabifier) 132 | patterns, constraints, sylrange, sylmode, patternmode, graphmode = config.patterns, config.constraints, config.sylrange, config.sylmode, config.patternmode, config.graphmode # Config is a dataclass, not an iterable 133 | sylcount = peakedDist(sylrange, *sylmode) 134 | for i in range(sylcount): 135 | if sylcount == 1: # Monosyllable 136 | _patterns = patterns['mono'] or patterns['init'] or patterns['term'] or patterns['medi'] 137 | elif i == 0: # Initial syllable 138 | _patterns = patterns['init'] or patterns['medi'] 139 | elif i == sylcount-1: # Final syllable 140 | _patterns = patterns['term'] or patterns['medi'] 141 | else: # Medial syllable 142 | _patterns = patterns['medi'] 143 | for j in range(MAX_RUNS): 144 | pattern = peakedDist(_patterns, *patternmode) 145 | syl = populate(pattern, graphmode) 146 | _word = word + syl 147 | for constraint in constraints: 148 | if constraint and constraint in _word: 149 | break 150 | else: 151 | word = _word 152 | break 153 | else: 154 | raise ExceededMaxRunsError() 155 | return word + '#' 156 | 157 | def genFromPhonotactics(phonotactics, sylrange=(1,), sylmode=(), graphs=None, syllabifier=None): 158 | '''Generate a single word as specified by the 'phonotactics'.
159 | 160 | Arguments: 161 | phonotactics -- the phonotactic data to be used 162 | graphs -- the set of graphemes used for this word 163 | syllabifier -- the syllabifier used for syllabification 164 | 165 | Returns a Word 166 | ''' 167 | word = Word([], graphs, syllabifier=syllabifier) 168 | sylcount = peakedDist(sylrange, *sylmode) 169 | for i in range(sylcount): 170 | # Generate a syllable 171 | for _ in range(MAX_RUNS): 172 | # Pick an onset 173 | onset = selectPeriphery(phonotactics['onsets'], phonotactics['margins'], 'left', i) 174 | # Pick a coda 175 | coda = selectPeriphery(phonotactics['codas'], phonotactics['margins'], 'right', i-sylcount) 176 | # Pick a nucleus 177 | nuclei = phonotactics['nuclei'] 178 | if onset != ['#']: 179 | nuclei = [nucleus for nucleus in nuclei if nucleus[0] != '#'] 180 | if coda != ['#']: 181 | nuclei = [nucleus for nucleus in nuclei if nucleus[-1] != '#'] 182 | nucleus = choice(nuclei) 183 | syl = populate(onset+nucleus+coda, ()) 184 | _word = word + syl 185 | for env in phonotactics.get('constraints', ()): 186 | if env and env in _word: 187 | break 188 | else: 189 | word = _word 190 | break 191 | else: 192 | raise ExceededMaxRunsError() 193 | return word 194 | 195 | def selectPeriphery(peripheries, margins, edge, i): 196 | edge = 0 if edge == 'left' else -1 197 | if i == edge: 198 | margin = choice([margin for margin in margins if margin[edge] == '#']) 199 | if margin == (['_', '#'] if edge else ['#', '_']): 200 | margin = ['#'] 201 | peripheries = [(p+margin if edge else margin+p) if p[edge] != '#' else p for p in peripheries] 202 | else: 203 | peripheries = [p for p in peripheries if p[edge] != '#'] 204 | periphery = choice(peripheries) 205 | if edge and periphery[0] == '_': 206 | return periphery[1:] 207 | elif not edge and periphery[-1] == '_': 208 | return periphery[:-1] 209 | else: 210 | return periphery 211 | -------------------------------------------------------------------------------- /src/core.py: -------------------------------------------------------------------------------- 1 | '''Base classes and functions 2 | 3 | ==================================== To-do ==================================== 4 | === Bug-fixes === 5 | 6 | === Implementation === 7 | Perhaps adjust Cat to allow sequences of graphemes to be stored 8 | 9 | === Features === 10 | Something something punctuation 11 | Hijack global environments with no pattern to test for position in word 12 | 13 | === Style === 14 | Consider where to raise/handle exceptions 15 | Go over docstrings 16 | ''' 17 | 18 | import re 19 | from dataclasses import dataclass, field, InitVar 20 | from .syllables import Syllabifier 21 | 22 | # == Exceptions == # 23 | class LangException(Exception): 24 | '''Base class for exceptions in this package''' 25 | 26 | class FormatError(LangException): 27 | '''Exception raised for errors in formatting objects.''' 28 | 29 | class RuleError(LangException): 30 | '''Exception raised for errors when running rules.''' 31 | 32 | class CompilerError(LangException): 33 | '''Base class for errors during compilation.''' 34 | def __init__(self, error, value, linenum, column): 35 | super().__init__(f'{error}: `{value}` @ {linenum}:{column}') 36 | 37 | class TokenError(CompilerError): 38 | '''Base class for errors involving tokens.''' 39 | def __init__(self, error, token): 40 | super().__init__(error, token.value, token.linenum, token.column) 41 | 42 | # == Decorators == # 43 | # Implements a decorator we can use as a variation on @property, where the value is calculated once and then stored
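# A usage sketch (hypothetical class): the first access to `c.area` calls the
# getter and stores the result as an instance attribute, shadowing the descriptor,
# so later accesses are plain attribute lookups.
#
#     class Circle:
#         def __init__(self, r): self.r = r
#         @memoisedproperty
#         def area(self): return 3.14159 * self.r**2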
44 | class memoisedproperty(object): 45 | def __init__(self, fget): 46 | self.fget = fget 47 | self.funcname = fget.__name__ 48 | 49 | def __get__(self, obj, cls): 50 | if obj is None: 51 | return None 52 | value = self.fget(obj) 53 | setattr(obj, self.funcname, value) 54 | return value 55 | 56 | # == Classes == # 57 | @dataclass 58 | class Token: 59 | type: str 60 | value: str 61 | linenum: int 62 | column: int 63 | 64 | def __iter__(self): 65 | yield self.type 66 | yield self.value 67 | 68 | @dataclass 69 | class Cat: 70 | '''Represents a category of graphemes.''' 71 | values: list = field(default_factory=list) # Default needed so Cat() is valid (used as Word.graphs default) 72 | name: str = field(default=None, compare=False) 73 | 74 | def __str__(self): 75 | return f'[{", ".join(self)}]' 76 | 77 | def __len__(self): 78 | return len(self.values) 79 | 80 | def __getitem__(self, key): 81 | return self.values[key] 82 | 83 | def __iter__(self): 84 | yield from self.values 85 | 86 | def __contains__(self, item): 87 | return item in self.values 88 | 89 | def __and__(self, cat): 90 | return Cat([value for value in self if value in cat]) 91 | 92 | def __add__(self, cat): 93 | return Cat(self.values + list(cat)) 94 | 95 | def __iadd__(self, cat): 96 | return NotImplemented 97 | 98 | def __sub__(self, cat): 99 | return Cat([value for value in self if value not in cat]) 100 | 101 | def __le__(self, cat): 102 | return all(value in cat for value in self) 103 | 104 | def __lt__(self, cat): 105 | return self <= cat and not (self >= cat) 106 | 107 | def __ge__(self, cat): 108 | return all(value in self for value in cat) 109 | 110 | def __gt__(self, cat): 111 | return self >= cat and not (self <= cat) 112 | 113 | def index(self, item): 114 | return self.values.index(item) 115 | 116 | @staticmethod 117 | def make(string, cats=None, name=None): 118 | if not (string.startswith('[') and string.endswith(']')): 119 | raise FormatError(f'invalid category: {string}') 120 | cat = string[1:-1] 121 | if ',' in cat: # Nonce category 122 | if cat.endswith(','): 123 | if cat.count(',') == 1: 124 | cat = cat[:-1] 125 | else: 126 | raise FormatError(f'invalid category values: {cat}') 127 | values = [] 128 | for value in re.split(r', ?', cat): 129 | if not value: 130 | raise FormatError(f'invalid category values: {cat}') 131 | elif value.startswith('[') and value.endswith(']'): 132 | values.extend(Cat.make(value, cats)) 133 | elif ' ' in value or '[' in value or ']' in value: 134 | raise FormatError(f'invalid category value: {value}') 135 | else: 136 | values.append(value) 137 | return Cat(values, name) 138 | else: # Named category 139 | if cats is not None and cat in cats: 140 | return cats[cat] 141 | else: 142 | raise FormatError(f'invalid category name: {cat}') 143 |
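# Examples of category construction (hypothetical values):
#
#     cats = {'V': Cat(['a', 'i', 'u'], 'V')}
#     Cat.make('[p, t, k]')        # nonce category
#     Cat.make('[V]', cats)        # named category lookup -> cats['V']
#     Cat.make('[[V], ə]', cats)   # nested categories are flattened into the values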
144 | @dataclass 145 | class Word: 146 | '''Represents a word as a list of graphemes. 147 | 148 | Instance variables: 149 | graphs -- a category of graphemes (Cat) 150 | syllabifier -- a function that syllabifies the input word (Syllabifier) 151 | 152 | Methods: 153 | find -- find a match of a list using pattern notation to the word 154 | matchPattern -- match a list using pattern notation to the word 155 | matchEnv -- match a sound change environment to the word 156 | applyMatch -- apply a single match to the word 157 | strip -- remove leading and trailing graphemes 158 | ''' 159 | phones: list = field(init=False) 160 | lexeme: InitVar[str] = '' 161 | graphs: Cat = field(default_factory=Cat) 162 | separator: str = '' 163 | syllabifier: Syllabifier = None 164 | 165 | def __post_init__(self, lexeme): 166 | if isinstance(lexeme, str): 167 | self.phones = parseWord(f' {lexeme} ', self.graphs, self.separator) 168 | else: 169 | phones = [] 170 | for i, phone in enumerate(lexeme): 171 | if not phone: 172 | continue 173 | elif not (phone == '#' and phones and phones[-1] == '#'): 174 | phones.append(phone) 175 | self.phones = phones 176 | 177 | @memoisedproperty 178 | def syllables(self): 179 | return self.syllabifier(self) 180 | 181 | def __repr__(self): 182 | return f'Word({str(self)!r})' 183 | 184 | def __str__(self): 185 | return unparseWord(self, self.graphs, self.separator) 186 | 187 | def __len__(self): 188 | return len(self.phones) 189 | 190 | def __getitem__(self, item): 191 | if isinstance(item, slice): 192 | return Word(self.phones[item], self.graphs, self.separator, self.syllabifier) 193 | else: 194 | return self.phones[item] 195 | 196 | def __iter__(self): 197 | yield from self.phones 198 | 199 | def __contains__(self, item): 200 | if isinstance(item, (list, Word)): 201 | return self.find(item) != -1 202 | else: 203 | return item in self.phones 204 | 205 | def __add__(self, other): 206 | graphs = self.graphs 207 | separator = self.separator 208 | if isinstance(other, Word): 209 | graphs = Cat(list(set().union(graphs, other.graphs))) 210 | separator = separator or other.separator 211 | other = other.phones 212 | elif isinstance(other, str): 213 | other = parseWord(other, graphs) 214 | return Word(self.phones + other, graphs, separator, self.syllabifier) 215 | 216 | def __radd__(self, other): 217 | graphs = self.graphs 218 | separator = self.separator 219 | other = parseWord(other, graphs) 220 | return Word(other + self.phones, graphs, separator, self.syllabifier) 221 | 222 | def __mul__(self, other): 223 | return Word(self.phones * other, self.graphs, self.separator, self.syllabifier) 224 | 225 | def __rmul__(self, other): 226 | return Word(self.phones * other, self.graphs, self.separator, self.syllabifier) 227 | 228 | def __iadd__(*args): 229 | return NotImplemented 230 | 231 | def __imul__(*args): 232 | return NotImplemented 233 | 234 | def strip(self, chars=None): 235 | if chars is None: 236 | chars = '#' 237 | start = end = None 238 | for i, char in enumerate(self): 239 | if char not in chars: 240 | if start is None: 241 | start = i 242 | if i+1 == len(self) or self[i+1] in chars: # Guard against indexing past the end 243 | end = i+1 244 | return self[start:end] 245 | 246 | def find(self, sub, start=None, end=None): 247 | '''Match a sequence using pattern notation to the word.
248 | 249 | Arguments: 250 | sub -- the list to be found (list) 251 | start -- the index of the beginning of the range to check (int) 252 | end -- the index of the end of the range to check (int) 253 | 254 | Returns an int 255 | ''' 256 | from ._pattern import parsePattern 257 | start, end = sliceIndices(self, start, end) 258 | if isinstance(sub, Word): 259 | sub = parsePattern(sub) 260 | if sub and sub[-1].type == 'Comparison': # Counting 261 | matches = 0 262 | op, count = sub[-1].operation, sub[-1].value 263 | for pos in range(start, end): 264 | match = self.matchPattern(sub[:-1], pos, end)[0] 265 | if match: 266 | matches += 1 267 | if eval(f'matches {op} count'): 268 | return 1 269 | else: 270 | for pos in range(start, end): 271 | match = self.matchPattern(sub, pos, end)[0] 272 | if match: 273 | return pos 274 | return -1 275 | 276 | def matchPattern(self, pattern, start=None, end=None, step=1): 277 | '''Match a pattern sequence to the word. 278 | 279 | Return if the sequence matches the end of the given slice of the word, the far end of the match, and category indexes. 280 | 281 | Arguments: 282 | pattern -- the sequence being matched 283 | start, end, step -- determine the slice of the word to match within 284 | stack -- used to pass stack references into an optional segment 285 | 286 | Returns a tuple. 287 | ''' 288 | from ._pattern import matchPattern 289 | start, end = sliceIndices(self, start, end) 290 | return matchPattern(self, pattern, start, end, step) 291 | 292 | def matchEnv(self, environment, pos=0, rpos=0): # Test if the env matches the word 293 | '''Match a sound change environment to the word. 294 | 295 | Arguments: 296 | environment -- the environment to be matched (list) 297 | pos, rpos -- the slice of the word giving the target (int, int) 298 | 299 | Returns a bool 300 | ''' 301 | for env in environment: 302 | if env is None: # Blank environment 303 | continue 304 | env = env.resolveTargetRef(self[pos:rpos]) 305 | if not env.match(self, pos, rpos): 306 | return False 307 | return True 308 | 309 | def applyMatch(self, match, rep): 310 | '''Apply a replacement to a word 311 | 312 | Arguments: 313 | match -- the match to be used 314 | rep -- the replacement to be used 315 | word -- the word to be changed 316 | 317 | Returns a Word. 
318 | ''' 319 | from .sce import Replacement, LocalEnvironment, GlobalEnvironment 320 | pos, rpos, catixes = match[:3] 321 | if not rep: 322 | return self[:pos] + self[rpos:] 323 | target = self[pos:rpos] 324 | if isinstance(rep, Replacement): 325 | _rep = [] 326 | ix = 0 327 | for element in rep.resolveTargetRef(target).pattern: 328 | if element.type == 'Grapheme': 329 | _rep.append(element.grapheme) 330 | elif element.type == 'Category': 331 | if not catixes: 332 | raise RuleError('replacement contains a category but target did not') 333 | cat = element.cat 334 | _rep.append(cat[catixes[ix] % len(cat)]) 335 | ix = (ix + 1) % len(catixes) 336 | elif element.type == 'Ditto': 337 | _rep.append(_rep[-1] if _rep else self[pos-1]) 338 | else: 339 | _rep.append('') 340 | return self[:pos] + _rep + self[rpos:] 341 | elif isinstance(rep, tuple): # Copy/Move 342 | mode, envs = rep 343 | matches = [] 344 | for env in envs: # Each anded environment contributes destinations 345 | if isinstance(env, LocalEnvironment): 346 | env = env.resolveTargetRef(target) 347 | for wpos in range(1, len(self)): # Find all matches 348 | if env.match(self, wpos, wpos): 349 | if mode == 'move' and wpos >= rpos: # We'll need to adjust the matches down 350 | wpos -= rpos-pos 351 | matches.append(wpos) 352 | elif isinstance(env, GlobalEnvironment): # Indices 353 | if env.pattern: 354 | raise RuleError(f'global environment as destination must have no pattern: {rep}') 355 | matches.extend(env.indices) 356 | else: 357 | raise RuleError(f'unknown environment: {rep}') 358 | if mode == 'move': # Move - delete original target 359 | word = self[:pos] + self[rpos:] 360 | else: 361 | word = self 362 | for match in sorted(matches, reverse=True): 363 | word = word[:match] + target + word[match:] 364 | return word 365 | else: 366 | raise RuleError(f'invalid replacement: {rep}') 367 | 368 | # == Functions == # 369 | def resolveTargetRef(pattern, target): 370 | _pattern = [] 371 | for element in pattern: 372 | if element.type == 'TargetRef': 373 | _pattern.extend(element.resolveTarget(target)) 374 | else: 375 | _pattern.append(element) 376 | return _pattern 377 | 378 | def sliceIndices(iter, start=None, end=None): 379 | '''Calculate absolute indices from slice indices on an iterable. 380 | 381 | Arguments: 382 | iter -- the iterable being sliced 383 | start -- the index of the start of the slice 384 | end -- the index of the end of the slice 385 | 386 | Returns a tuple of 2 ints. 387 | ''' 388 | if start is None: 389 | start = 0 390 | elif start < 0: 391 | start += len(iter) 392 | if end is None: 393 | end = len(iter) 394 | elif end < 0: 395 | end += len(iter) 396 | return start, end 397 | 398 | def parseCats(cats, initialcats=None): 399 | '''Parses a set of categories. 400 | 401 | Arguments: 402 | cats -- the set of categories to be parsed (str) 403 | initialcats -- prior categories (dict) 404 | 405 | Returns a dict. 
406 | ''' 407 | if initialcats is None: 408 | _cats = {} 409 | else: 410 | _cats = initialcats.copy() 411 | for key, value in cats.items(): 412 | if key == '' or not value: 413 | pass 414 | elif isinstance(value, Cat): 415 | _cats[key] = value 416 | elif isinstance(value, list): 417 | _cats[key] = Cat(value, key) 418 | elif isinstance(value, str): 419 | _cats[key] = Cat.make(f'[{value}]', _cats, key) 420 | else: 421 | raise FormatError('invalid category values') 422 | for cat in list(_cats): # Discard blank categories 423 | if not _cats[cat]: 424 | del _cats[cat] 425 | return _cats 426 | 427 | WHITESPACE_REGEX = re.compile(r'\s+') 428 | 429 | def parseWord(string, graphs=(), separator=''): 430 | string = WHITESPACE_REGEX.sub('#', string) 431 | polygraphs = sorted(filter(lambda g: len(g) > 1, graphs), key=len, reverse=True) 432 | if not polygraphs: 433 | return list(string.replace(separator, '')) 434 | if not separator: 435 | separator = '.' 436 | word = [] 437 | string = string.lstrip(separator) 438 | while string: 439 | graph = next(filter(lambda p: string.startswith(p), polygraphs), string[0]) 440 | word.append(graph) 441 | string = string[len(graph):].lstrip(separator) 442 | return word 443 | 444 | def unparseWord(word, graphs=(), separator=''): 445 | string = '' 446 | polygraphs = list(filter(lambda g: len(g) > 1, graphs)) 447 | if not polygraphs: 448 | string = ''.join(word) 449 | word = [] 450 | if not separator: 451 | separator = '.' 452 | ambig = [] 453 | for graph in word: 454 | if ambig: 455 | ambig.append(graph) 456 | for i in range(len(ambig)): 457 | test = ''.join(ambig[i:]) 458 | minlength = len(ambig[i]) 459 | if any(test.startswith(poly) and len(poly) > minlength for poly in polygraphs): 460 | string += separator 461 | ambig = [graph] 462 | break 463 | for i in range(len(ambig)): 464 | test = ''.join(ambig[i:]) 465 | if any(poly.startswith(test) and poly != test for poly in polygraphs): 466 | ambig = ambig[i:] 467 | break 468 | else: 469 | ambig = [] 470 | elif any(poly.startswith(graph) and poly != graph for poly in polygraphs): 471 | ambig.append(graph) 472 | string += graph 473 | return string.strip(separator+'#').replace('#', ' ') 474 | 475 | def partition(sequence, *, sep=None, sepfunc=None, yieldsep=False): 476 | if (sep is None) == (sepfunc is None): 477 | raise ValueError('exactly one of sep and sepfunc must be given') 478 | if sep is not None: 479 | sepfunc = lambda item: item == sep 480 | i = 0 481 | for j, item in enumerate(sequence): 482 | if sepfunc(item): 483 | if yieldsep: 484 | yield (sequence[i:j], sequence[j]) 485 | else: 486 | yield sequence[i:j] 487 | i = j+1 488 | if yieldsep: 489 | yield sequence[i:], None 490 | else: 491 | yield sequence[i:] 492 | 493 | def partitionTokens(tokens, sep=None, yieldsep=True): 494 | yield from partition(tokens, sepfunc=(lambda element: element.type == sep), yieldsep=yieldsep) 495 |
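# An example of partition(): the sequence is split at separator items, which are
# dropped unless yieldsep is set:
#
#     list(partition([1, 0, 2, 3, 0, 4], sep=0))  # -> [[1], [2, 3], [4]]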
-------------------------------------------------------------------------------- /src/_pattern.py: -------------------------------------------------------------------------------- 1 | '''Pattern parsing and matching 2 | 3 | Classes: 4 | Token -- Class utilised by the tokeniser 5 | Element -- Base class for pattern elements 6 | Grapheme -- Element matching a specific grapheme 7 | Ditto -- Element matching the second of two identical segments 8 | SylBreak -- Element matching a syllable boundary 9 | Category -- Element matching a category of graphemes 10 | Wildcard -- Element matching one or more arbitrary segments 11 | WildcardRep -- Element matching one or more copies of the previous element 12 | Optional -- Element matching an optional sequence of elements 13 | Comparison -- Element used for indicating the number of another element 14 | TargetRef -- Element used to refer to the target 15 | 16 | Functions: 17 | escape -- processes escaped characters in a string 18 | tokenise -- returns a generator producing tokens 19 | parsePattern -- parses a string utilising pattern notation into a list of elements 20 | parsePatterns -- parses a collection of strings using pattern notation 21 | matchPattern -- matches a list of elements to a specified slice of a word 22 | '''''' 23 | ==================================== To-do ==================================== 24 | === Bug-fixes === 25 | catixes in matchPattern should be redone to cope with non-linearity 26 | 27 | === Implementation === 28 | Replace super-disgusting hacky wildcard repetition workaround in matchPattern with something better 29 | - How though 30 | Handling of optionals needs a lot of work 31 | 32 | === Features === 33 | 34 | === Style === 35 | ''' 36 | import re 37 | from dataclasses import dataclass, InitVar 38 | from typing import Dict, List 39 | from .core import FormatError, CompilerError, TokenError, Token, Cat 40 | 41 | ## Constants 42 | TOKENS = { 43 | 'COMMA': r', ?', 44 | 'NULL': r'\[\]', 45 | 'LOPT': r'\(', 46 | 'ROPT': r'\)\??', 47 | 'LCAT': r'\[', 48 | 'RCAT': r'\]', 49 | 'WILDCARDREP': r'\{\*\??\}', 50 | 'COMPARISON': r'\{(?:!=|[=<>]=?)\d+\}', 51 | 'ESCAPE': r'\\.', 52 | 'REPETITION': r'\{\d+\}', 53 | 'WILDCARD': r'\*\*?\??', 54 | 'TARGETREF': r'%|<', 55 | 'DITTO': r'\"', 56 | 'SYLBREAK': r'\$', 57 | 'TEXT': r'[^ >\/!+\-[\](){}*?\\"%<$^,&_~@]+', 58 | 'UNKNOWN': r'.', 59 | } 60 | TOKEN_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in TOKENS.items())) 61 | TOKENS = {type: re.compile(regex) for type, regex in TOKENS.items()} 62 | 63 | ## Classes 64 | @dataclass(repr=False, eq=False) 65 | class Element: 66 | def __str__(self): 67 | return '' 68 | 69 | def __repr__(self): 70 | return f'{self.type}({str(self)!r})' 71 | 72 | def __eq__(self, other): 73 | if isinstance(other, str): 74 | return str(self) == other 75 | elif type(self) == type(other): 76 | return str(self) == str(other) 77 | else: 78 | return NotImplemented 79 | 80 | @property 81 | def type(self): 82 | return self.__class__.__name__ 83 | 84 | # This method should not be called directly, as it does not check its arguments for correctness 85 | @classmethod 86 | def make(cls, string=None, cats=None): 87 | return cls() 88 | 89 | @classmethod 90 | def fromString(cls, string=None, cats=None): 91 | if TOKENS[cls.__name__.upper()].match(string) is not None: # Sanity check 92 | return cls.make(string, cats) 93 | raise FormatError(f'invalid {cls.__name__}: {string!r}') 94 | 95 | @classmethod 96 | def fromTokens(cls, tokens=None, cats=None): 97 | if len(tokens) != 1: 98 | raise CompilerError(f'too many tokens', tokens, tokens[0].linenum, tokens[0].column) 99 | type, value = tokens[0] 100 | if type == cls.__name__.upper() and TOKENS[type].match(value) is not None: # Sanity check 101 | return cls.make(value, cats) 102 | raise TokenError(f'invalid {cls.__name__}', tokens[0]) 103 | 104 | # This method must guarantee that the last two return values are [] if the first is False 105 | def match(self, word, pos, ix, step, istep): 106 | # matched, length, ilength, stack, catixes 107 | return False, 0, 0, [], [] 108 | 109 | ## Matching elements ## 110 | @dataclass(repr=False, eq=False) 111 | class Grapheme(Element):
112 | grapheme: str 113 | 114 | def __str__(self): 115 | return self.grapheme 116 | 117 | @staticmethod 118 | def make(string, cats=None): 119 | return Grapheme(grapheme=string) 120 | 121 | @staticmethod 122 | def fromString(string=None, cats=None): 123 | if TOKENS['ESCAPE'].match(string) is not None: # Sanity check 124 | return Grapheme(grapheme=string[1]) 125 | raise FormatError(f'invalid Grapheme: {string!r}') 126 | 127 | @staticmethod 128 | def fromTokens(tokens, cats=None): 129 | if len(tokens) != 1: 130 | raise CompilerError(f'too many tokens', tokens, tokens[0].linenum, tokens[0].column) 131 | type, value = tokens[0] 132 | if type == 'ESCAPE' and TOKENS['ESCAPE'].match(value) is not None: # Sanity check 133 | return Grapheme(grapheme=value[1]) 134 | raise TokenError('invalid Grapheme', tokens[0]) 135 | 136 | def match(self, word, pos, ix, step, istep): 137 | return self.grapheme == word[pos], step, istep, [], [] 138 | 139 | @dataclass(repr=False, eq=False) 140 | class Ditto(Element): 141 | def __str__(self): 142 | return '"' 143 | 144 | def match(self, word, pos, ix, step, istep): 145 | return word[pos] == word[pos-1], step, istep, [], [] 146 | 147 | @dataclass(repr=False, eq=False) 148 | class SylBreak(Element): 149 | def __str__(self): 150 | return '$' 151 | 152 | def match(self, word, pos, ix, step, istep): 153 | return (pos in word.syllables), 0, istep, [], [] 154 | 155 | @dataclass(repr=False, eq=False) 156 | class Category(Element): 157 | cat: Cat 158 | 159 | def __str__(self): 160 | if self.cat.name is None: 161 | return str(self.cat) 162 | else: 163 | return f'[{self.cat.name}]' 164 | 165 | def __eq__(self, other): 166 | if isinstance(other, Category): 167 | return self.cat == other.cat 168 | else: 169 | return self.cat == other 170 | 171 | @staticmethod 172 | def make(string, cats=None): 173 | return Category(cat=Cat.make(string, cats)) 174 | 175 | @staticmethod 176 | def fromString(string, cats=None): 177 | return Category.make(string, cats) 178 | 179 | @staticmethod 180 | def fromTokens(tokens, cats=None): 181 | string = ''.join(token.value for token in tokens) 182 | return Category.make(string, cats) 183 | 184 | def match(self, word, pos, ix, step, istep): 185 | if word[pos] in self.cat: # This might change 186 | return True, step, istep, [], [self.cat.index(word[pos])] 187 | return False, 0, 0, [], [] 188 | 189 | @dataclass(repr=False, eq=False) 190 | class Wildcard(Element): 191 | greedy: bool 192 | extended: bool 193 | 194 | def __str__(self): 195 | return ('**' if self.extended else '*') + ('' if self.greedy else '?') 196 | 197 | @staticmethod 198 | def make(string, cats=None): 199 | greedy = not string.endswith('?') 200 | extended = string.startswith('**') 201 | return Wildcard(greedy=greedy, extended=extended) 202 | 203 | def match(self, word, pos, ix, step, istep): 204 | if self.extended or word[pos] != '#': 205 | if self.greedy: 206 | stack = [(pos+step, ix+istep)] 207 | istep = 0 208 | else: 209 | stack = [(pos+step, ix)] 210 | return True, step, istep, stack, [] 211 | return False, 0, 0, [], [] 212 | 213 | @dataclass(repr=False, eq=False) 214 | class WildcardRep(Element): 215 | greedy: bool 216 | 217 | def __str__(self): 218 | return '{*}' if self.greedy else '{*?}' 219 | 220 | @staticmethod 221 | def make(string, cats=None): 222 | if string == '{*}': 223 | return WildcardRep(greedy=True) 224 | else: 225 | return WildcardRep(greedy=False) 226 | 227 | def match(self, word, pos, ix, step, istep): 228 | if self.greedy: 229 | istep *= -1 230 | return True, 0,
istep, [(pos, ix-istep)], [] 231 | 232 | ## Non-matching elements ## 233 | @dataclass(repr=False, eq=False) 234 | class Optional(Element): 235 | greedy: bool 236 | pattern: List[Token] 237 | 238 | def __str__(self): 239 | string = unparsePattern(self.pattern) 240 | return f'({string})' if self.greedy else f'({string})?' 241 | 242 | @staticmethod 243 | def make(string, cats=None): 244 | greedy = not string.endswith('?') 245 | pattern = parsePattern(string.rstrip('?')[1:-1], cats) 246 | if len(pattern) == 1 and isinstance(pattern[0], Wildcard): 247 | pattern[0].greedy = greedy 248 | return Optional(greedy=greedy, pattern=pattern) 249 | 250 | @staticmethod 251 | def fromString(string, cats=None): 252 | return Optional.make(string, cats) 253 | 254 | @staticmethod 255 | def fromTokens(tokens, cats=None): 256 | if tokens[0].type != 'LOPT' or tokens[-1].type != 'ROPT': 257 | raise FormatError(f'the given tokens are not a valid optional: {tokens}') 258 | greedy = not tokens[-1].value.endswith('?') 259 | pattern = compile(tokens[1:-1], cats) 260 | if len(pattern) == 1 and isinstance(pattern[0], Wildcard): 261 | pattern[0].greedy = greedy 262 | return Optional(greedy=greedy, pattern=pattern) 263 | 264 | # Somehow I need to adapt the special matching code for this framework - won't be easy 265 | 266 | @dataclass(repr=False, eq=False) 267 | class Comparison(Element): 268 | operation: str 269 | value: int 270 | 271 | def __str__(self): 272 | return f'{{{self.operation}{self.value}}}'.replace('==', '=') 273 | 274 | @staticmethod 275 | def make(string, cats=None): 276 | string = string[1:-1] 277 | for op in ('==', '=', '!=', '>=', '>', '<=', '<'): 278 | if string.startswith(op): 279 | value = int(string[len(op):]) 280 | if op == '=': 281 | op = '==' 282 | return Comparison(operation=op, value=value) 283 | 284 | @dataclass(repr=False, eq=False) 285 | class TargetRef(Element): 286 | direction: int 287 | 288 | def __str__(self): 289 | return '%' if self.direction == 1 else '<' 290 | 291 | @staticmethod 292 | def make(string, cats=None): 293 | if string == '%': 294 | return TargetRef(direction=1) 295 | else: 296 | return TargetRef(direction=-1) 297 | 298 | def resolveTarget(self, target): 299 | return [Grapheme(graph) for graph in (target if self.direction == 1 else reversed(target))] 300 | 301 | ELEMENT_DICT = { 302 | 'LOPT': Optional, 303 | 'LCAT': Category, 304 | 'WILDCARDREP': WildcardRep, 305 | 'COMPARISON': Comparison, 306 | 'ESCAPE': Grapheme, 307 | 'WILDCARD': Wildcard, 308 | 'TARGETREF': TargetRef, 309 | 'DITTO': Ditto, 310 | 'SYLBREAK': SylBreak, 311 | } 312 | 313 | # Don't slice the string when calling this 314 | def tokenise(string, colstart=None, linenum=0): 315 | '''Tokenise a string using pattern notation. 
316 | 317 | Arguments: 318 | string -- the input string using pattern notation (str) 319 | colstart -- the column to start token indexing at (int) 320 | 321 | Yields Token objects 322 | ''' 323 | if colstart is None: 324 | nested = False 325 | colstart = 0 326 | else: 327 | nested = True 328 | if not string: 329 | if nested: 330 | return colstart 331 | return 332 | brackets = [] 333 | for match in TOKEN_REGEX.finditer(string, colstart): 334 | type = match.lastgroup 335 | value = match.group() 336 | column = match.start() 337 | colstart = match.end() 338 | if type == 'COMMA': 339 | if not (brackets and brackets[-1] == '['): 340 | if not brackets and nested: 341 | return column 342 | raise CompilerError(f'unexpected comma', value, linenum, column) 343 | elif type in ('LOPT', 'LCAT'): # Left brackets 344 | if value == '(' and brackets and brackets[-1] == '[': 345 | raise CompilerError(f'optionals may not appear inside categories', value, linenum, column) 346 | brackets.append(value) 347 | elif type in ('ROPT', 'RCAT'): # Right brackets 348 | if not brackets: 349 | raise CompilerError(f'unexpected bracket', value, linenum, column) 350 | bracket = brackets.pop() 351 | if bracket+value[0] not in ('()', '[]'): 352 | raise CompilerError(f'mismatched brackets', value, linenum, column) 353 | elif type == 'UNKNOWN': 354 | if nested: 355 | return column 356 | else: 357 | raise CompilerError(f'unexpected character', value, linenum, column) 358 | yield Token(type, value, linenum, column) 359 | if nested: 360 | return colstart 361 | 362 | def matchBrackets(tokens, start=0): 363 | if tokens[start].type not in ('LOPT', 'LCAT'): 364 | raise TokenError(f'expected bracket', tokens[start]) 365 | else: 366 | left = tokens[start].type 367 | right = left.replace('L', 'R') 368 | depth = 0 369 | for i, token in enumerate(tokens[start:], start+1): 370 | if token.type == left: 371 | depth += 1 372 | elif token.type == right: 373 | depth -= 1 374 | if depth == 0: 375 | return i 376 | raise TokenError(f'unmatched bracket', tokens[start]) 377 | 378 | def compile(tokens, cats=None): 379 | from .core import parseWord 380 | tokens = list(tokens) 381 | if not tokens: 382 | return [] 383 | if cats is not None and 'graphs' in cats: 384 | graphs = cats['graphs'] 385 | else: 386 | graphs = () 387 | elements = [] 388 | i = 0 389 | while i < len(tokens): 390 | type, value = tokens[i] 391 | if type in ('LOPT', 'LCAT'): 392 | j = matchBrackets(tokens, i) 393 | else: 394 | j = i+1 395 | if type == 'NULL': 396 | pass 397 | elif type == 'REPETITION': 398 | elements[-1:] = elements[-1:]*int(value[1:-1]) 399 | elif type == 'TEXT': 400 | elements.extend([Grapheme(graph) for graph in parseWord(value, graphs)]) 401 | elif type in ELEMENT_DICT: 402 | cls = ELEMENT_DICT[type] 403 | elements.append(cls.fromTokens(tokens[i:j], cats)) 404 | else: 405 | raise TokenError(f'unexpected token', tokens[i]) 406 | i = j 407 | return elements 408 | 409 | def parsePattern(pattern, cats=None): 410 | '''Parse a string using pattern notation. 
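End to end, `tokenise` plus `compile` are what `parsePattern` (continuing below) wraps. A sketch of the result, with two caveats: the exact token granularity depends on `TOKEN_REGEX`, defined earlier in this file, and the simple elements are assumed to inherit a `fromTokens` that falls back to `make` from the `Element` base class.

```python
from conlanger.src._pattern import parsePattern

elements = parsePattern('(s)ka*')
print([str(e) for e in elements])  # -> ['(s)', 'k', 'a', '*']
print(type(elements[0]).__name__)  # -> 'Optional'
```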
411 | 412 | Arguments: 413 | pattern -- the input string using pattern notation (str or Word) 414 | cats -- a dictionary of categories to use for interpreting categories (dict) 415 | 416 | Returns a list 417 | ''' 418 | from .core import Word 419 | if isinstance(pattern, Word): 420 | return [Grapheme(graph) for graph in pattern] 421 | try: 422 | return compile(tokenise(pattern), cats) 423 | except CompilerError as e: 424 | raise FormatError(f'invalid pattern: {pattern!r}; {e.args[0]}') 425 | 426 | def unparsePattern(pattern, graphs=(), separator=''): 427 | from .core import unparseWord 428 | # To-do: add collapsing of repeated tokens 429 | elements = [] 430 | for element in pattern: 431 | if isinstance(element, Optional): 432 | string = unparsePattern(element.pattern, graphs, separator) 433 | elements.append(f'({string})' if element.greedy else f'({string})?') 434 | else: 435 | elements.append(str(element)) 436 | return unparseWord(elements, graphs, separator) 437 | 438 | def parsePatterns(patterns, cats=None): 439 | '''Parses generation patterns. 440 | 441 | Arguments: 442 | patterns -- set of patterns to parse (str, list, or dict) 443 | 444 | Returns a list, dict, or None 445 | ''' 446 | if isinstance(patterns, str): 447 | patterns = patterns.splitlines() 448 | if isinstance(patterns, list): 449 | _patterns = [] 450 | for pattern in patterns: 451 | # Remove comments 452 | if isinstance(pattern, str): 453 | pattern = pattern.split('//')[0] 454 | if not pattern: 455 | continue 456 | if isinstance(pattern, str): 457 | _patterns.append(parsePattern(pattern, cats)) 458 | else: 459 | _patterns.append(pattern) 460 | elif isinstance(patterns, dict): 461 | _patterns = {key: parsePatterns(patterns[key], cats) for key in patterns} 462 | else: 463 | _patterns = None 464 | return _patterns 465 | 466 | def matchPattern(word, pattern, start, end, step, stack=None): 467 | '''Match a pattern sequence to the word. 468 | 469 | Return whether the sequence matches the given slice of the word, the position of the far end of the match, and the category indexes. 470 | 471 | Arguments: 472 | word -- the word to match to 473 | pattern -- the sequence being matched 474 | start, end, step -- determine the slice of the word to match within 475 | stack -- used to pass stack references into an optional segment 476 | 477 | Returns a tuple. 478 | ''' 479 | pos = start if step > 0 else end-1 480 | ix = 0 if step > 0 else (len(pattern)-1) 481 | istep = 1 if step > 0 else -1 482 | if stack is None: 483 | stack = [] # This stores the positions in the word and sequence that we branched at 484 | _returnstack = False 485 | else: 486 | if stack: 487 | pos, ix = stack.pop() 488 | _returnstack = True 489 | catixes = [] # This records the index of each category match.
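A sketch of `matchPattern` (whose body continues below): greedy elements push the branch they did not take onto `stack`, and a failed step pops the most recent branch point instead of failing outright. A plain list again stands in for a `Word`, which works here because only indexing is used.

```python
from conlanger.src._pattern import parsePattern, matchPattern

word = ['#', 's', 't', 'a', '#']
pattern = parsePattern('(s)ta')

# Match left-to-right, skipping the initial boundary (as Rule.apply does):
matched, rpos, catixes = matchPattern(word, pattern, 1, len(word), 1)
print(matched, rpos)  # -> True 4: the match ends just before the final '#'

# Against ['#', 't', 'a', '#'] the optional '(s)' fails at position 1; the
# matcher pops the branch that skips the optional, and the match still succeeds.
```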
This needs to be redone to cope with non-linearity 490 | # Hacky thing for now to make wildcard repetitions actually work in rtl 491 | pattern = pattern.copy() 492 | if step < 0: 493 | for i, element in enumerate(pattern): 494 | if element.type == 'WildcardRep': 495 | pattern[i-1:i+1] = reversed(pattern[i-1:i+1]) 496 | matched = True 497 | while 0 <= ix < len(pattern): 498 | if start <= pos < end: # Still in the slice 499 | element = pattern[ix] 500 | if not isinstance(element, Optional): 501 | matched, length, ilength, _stack, _catixes = element.match(word, pos, ix, step, istep) 502 | stack.extend(_stack) 503 | catixes.extend(_catixes) 504 | else: # Optionals require special handling 505 | if not matched: # Jumped here via the stack, check if we've got a nested stack reference 506 | if stack and isinstance(stack[-1], list): 507 | _stack = stack.pop() 508 | else: 509 | _stack = [] 510 | if element.greedy: # Greedy 511 | if ix < len(pattern)-istep and pattern[ix+istep].type == 'WildcardRep': # We need to make sure to step past a wildcard repetition 512 | stack.append((pos, ix+istep*2)) 513 | else: 514 | stack.append((pos, ix+istep)) 515 | ilength = step 516 | elif matched: # Non-greedy, we stepped in normally 517 | stack.append((pos, ix)) 518 | if ix < len(pattern)-istep and pattern[ix+istep].type == 'WildcardRep': # We need to make sure to step past a wildcard repetition 519 | ilength = istep*2 520 | else: 521 | ilength = istep 522 | matched = True 523 | length = 0 524 | if element.greedy or not matched: 525 | _start, _end = (pos, end) if istep > 0 else (start, pos+1) 526 | matched, rpos, _catixes, _stack = matchPattern(word, element.pattern, _start, _end, step, _stack) 527 | # Merge in the stack - if a reference has an index within element, nest it and push a reference to 528 | # the element, else correct the index and push it directly 529 | for _pos, _ix in _stack: 530 | if _ix >= len(element.pattern): 531 | _ix -= len(element.pattern)-1 532 | stack.append((_pos, _ix)) 533 | else: 534 | if len(stack) >= 2 and isinstance(stack[-2], list): 535 | stack[-2].append((_pos, _ix)) 536 | else: 537 | stack.append([(_pos, _ix)]) 538 | stack.append((_pos, ix)) 539 | length = rpos-pos 540 | if matched: 541 | catixes.extend(_catixes) 542 | else: 543 | matched, length, ilength = False, 0, 0 544 | if matched: 545 | ix += ilength 546 | pos += length 547 | elif stack: # This segment failed to match, so we jump back to the next branch 548 | pos, ix = stack.pop() 549 | else: # Total match failure 550 | if _returnstack: 551 | return False, 0, [], [] # Maybe? 552 | else: 553 | return False, 0, [] 554 | if _returnstack: 555 | return True, pos, catixes, stack 556 | else: 557 | return True, pos, catixes 558 | -------------------------------------------------------------------------------- /src/sce.py: -------------------------------------------------------------------------------- 1 | '''Apply sound changes to a lexicon 2 | 3 | Exceptions: 4 | RuleFailed -- exception to mark that a rule failed 5 | 6 | Classes: 7 | Rule -- represents a sound change rule 8 | 9 | Functions: 10 | compileRuleset -- compiles a sound change ruleset 11 | compileRule -- compiles a sound change rule 12 | run -- applies a set of sound change rules to a set of words 13 | '''''' 14 | ==================================== To-do ==================================== 15 | === Bug-fixes === 16 | 17 | === Implementation === 18 | Maybe change >^ and >^? to >> and >>? 
19 | 20 | === Features === 21 | Is it possible to implement a>b>c as notation for a chain shift? 22 | Think about expanding the options for grapheme handling 23 | - diacritics 24 | Allow ~ in tar and rep 25 | Implement more category operations 26 | - intersection 27 | -- feature-style? [+A +B -C] == [A] && [B] && ~[C] 28 | More format conversion metarules? 29 | - !sca2 30 | 31 | === Style === 32 | Consider where to raise/handle exceptions 33 | Go over docstrings 34 | ''' 35 | 36 | import logging 37 | import logging.config 38 | import os.path 39 | import re 40 | from contextlib import suppress 41 | from dataclasses import dataclass, InitVar 42 | from .core import LangException, FormatError, RuleError, CompilerError, TokenError, Token, Cat, Word, resolveTargetRef, parseCats, partitionTokens 43 | from ._pattern import tokenise as tokenisePattern, compile as compilePattern 44 | 45 | # == Constants == # 46 | MAX_RUNS = 10**3 # Maximum number of times a rule may be repeated 47 | __location__ = os.path.realpath( 48 | os.path.join(os.getcwd(), os.path.dirname(__file__), 'logging.conf')) 49 | RULE_TOKENS = { 50 | 'EPENTHESIS': r'^\+ ?', 51 | 'DELETION': r'^\- ?', 52 | 'MOVE': r'>\^\?| +>\^\? ', 53 | 'COPY': r'>\^| +>\^ ', 54 | 'REPLACEMENT': r'>| +> ', 55 | 'ENVIRONMENT': r'/| +/ ', 56 | 'EXCEPTION': r'!| +! ', 57 | 'OR': r', ?', 58 | 'AND': r'&| & ', 59 | 'PLACEHOLDER': r'_', 60 | # 'ADJACENCY': r'~', 61 | 'INDICES': r'@\-?\d+(?:\|\-?\d+)*', 62 | 'SPACE': r' ', 63 | 'UNKNOWN': r'.' 64 | } 65 | RULE_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in RULE_TOKENS.items())) 66 | METARULES = [ 67 | 'block', 68 | 'def', 69 | 'rule', 70 | ] 71 | METARULE_TOKENS = { 72 | 'METARULE': fr'^!(?:{"|".join(METARULES)})', 73 | 'COLON': r': ?', 74 | 'NUMBER': r'\d+', 75 | 'IDENTIFIER': r'[a-z_]+', 76 | 'SPACE': r' ', 77 | 'UNKNOWN': r'.', 78 | } 79 | METARULE_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in METARULE_TOKENS.items())) 80 | FLAGS = [ 81 | 'ignore', 82 | 'rtl', 83 | 'ditto', 84 | 'stop', 85 | 'repeat', 86 | 'persist', 87 | 'chance', 88 | ] 89 | FLAG_TOKENS = { 90 | 'FLAG': '|'.join(FLAGS), 91 | 'COLON': r': ?', 92 | 'ARGUMENT': r'\d+', 93 | 'NEGATION': r'!', 94 | 'SEPARATOR': r'; ?', 95 | 'UNKNOWN': r'.', 96 | } 97 | FLAG_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in FLAG_TOKENS.items())) 98 | CATEGORY_TOKENS = { 99 | 'CATEGORY': r'^\w+', 100 | 'OP': r'(?:\+|\-)?=| +(?:\+|\-)?= ', 101 | 'VALUES': r'.+$', # Might make this part more precise 102 | 'UNKNOWN': r'.', 103 | } 104 | CATEGORY_REGEX = re.compile('|'.join(f'(?P<{type}>{regex})' for type, regex in CATEGORY_TOKENS.items())) 105 | 106 | # == Globals == # 107 | logger = None 108 | 109 | # == Exceptions == # 110 | class RuleFailed(LangException): 111 | '''Used to indicate that the rule failed to be applied.''' 112 | 113 | # == Classes == # 114 | @dataclass 115 | class IndexedPattern: 116 | pattern: list 117 | indices: list = None 118 | 119 | def __str__(self): 120 | if self.indices is None: 121 | return str(self.pattern) 122 | elif not self.pattern: 123 | return f'@{self.indices}' 124 | else: 125 | return f'{self.pattern}@{self.indices}' 126 | 127 | def __iter__(self): 128 | yield self.pattern 129 | yield self.indices 130 | 131 | def copy(self): 132 | cls = self.__class__ 133 | if self.indices is not None: 134 | return cls(self.pattern.copy(), self.indices.copy()) 135 | else: 136 | return cls(self.pattern.copy()) 137 | 138 | @dataclass 139 | class Target(IndexedPattern): 140 | pass 141 
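For orientation, `RULE_TOKENS` above encodes the surface syntax of a rule. A hedged summary of the notation as recoverable from those tokens (the linked SCE documentation is the authoritative reference):

```python
# a > b / c_d ! e_f   -- replace a by b between c and d, except between e and f
# - a / _#            -- deletion: remove word-final a
# + a / b_c           -- epenthesis: insert a between b and c
# x > y, z            -- ',' separates alternatives; '&' conjoins environments
# a > b @1|-1         -- '@' restricts a target to the given matches (1st and last)
```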
| 142 | @dataclass 143 | class Replacement: 144 | pattern: list 145 | 146 | def __str__(self): 147 | return str(self.pattern) 148 | 149 | def __iter__(self): 150 | yield self.pattern 151 | 152 | def copy(self): 153 | return Replacement(self.pattern.copy()) 154 | 155 | def resolveTargetRef(self, target): 156 | return Replacement(resolveTargetRef(self.pattern, target)) 157 | 158 | @dataclass 159 | class LocalEnvironment: 160 | left: list 161 | right: list 162 | 163 | def __str__(self): 164 | if self.left and self.right: 165 | return f'{self.left}_{self.right}' 166 | elif self.left: 167 | return f'{self.left}_' 168 | elif self.right: 169 | return f'_{self.right}' 170 | else: 171 | return '_' 172 | 173 | def __bool__(self): 174 | return bool(self.left or self.right) 175 | 176 | def __iter__(self): 177 | yield self.left 178 | yield self.right 179 | 180 | def copy(self): 181 | return LocalEnvironment(self.left.copy(), self.right.copy()) 182 | 183 | def resolveTargetRef(self, target): 184 | return LocalEnvironment(resolveTargetRef(self.left, target), resolveTargetRef(self.right, target)) 185 | 186 | def match(self, word, pos=0, rpos=0): 187 | left, right = self 188 | if pos: 189 | matchleft = word.matchPattern(left, 0, pos, -1)[0] 190 | else: # At the left edge, which can only be matched by a null env 191 | matchleft = not left 192 | matchright = word.matchPattern(right, rpos)[0] 193 | return matchleft and matchright 194 | 195 | @dataclass 196 | class GlobalEnvironment(IndexedPattern): 197 | def __bool__(self): 198 | return bool(self.pattern or self.indices) 199 | 200 | def resolveTargetRef(self, target): 201 | if self.indices is not None: 202 | return GlobalEnvironment(resolveTargetRef(self.pattern, target), self.indices.copy()) 203 | else: 204 | return GlobalEnvironment(resolveTargetRef(self.pattern, target)) 205 | 206 | def match(self, word, pos=0, rpos=0): 207 | pattern, indices = self 208 | if indices is None: 209 | return word.find(pattern) != -1 210 | else: 211 | return any(word.matchPattern(pattern, index)[0] for index in indices) 212 | 213 | @dataclass(frozen=True) # Frozen so that a Flags instance can safely serve as a field default (see RuleBlock) 214 | class Flags: 215 | ignore: int = 0 216 | ditto: int = 0 217 | stop: int = 0 218 | rtl: int = 0 219 | repeat: int = 1 220 | persist: int = 1 221 | chance: int = 100 222 | 223 | @dataclass 224 | class Rule: 225 | '''Class for representing a sound change rule. 226 | 227 | Instance variables: 228 | rule -- the rule as a string (str) 229 | tars -- target segments (list) 230 | reps -- replacement segments (list) 231 | envs -- application environments (list) 232 | excs -- exception environments (list) 233 | otherwise -- the rule to try instead if this rule's environments fail or an exception matches (Rule) 234 | flags -- flags for altering execution (Flags) 235 | 236 | Methods: 237 | apply -- apply the rule to a word 238 | checkMatch -- check if the match is valid 239 | ''' 240 | tars: list 241 | reps: list 242 | envs: list 243 | excs: list 244 | otherwise: 'Rule' 245 | flags: Flags 246 | rule: str = '' 247 | 248 | def __repr__(self): 249 | return f"Rule('{self!s}')" 250 | 251 | def __str__(self): 252 | return self.rule 253 | 254 | def __eq__(self, other): 255 | return tuple(self)[:-1] == tuple(other)[:-1] # Compare all fields except the rule string 256 | 257 | def __iter__(self): 258 | yield self.tars 259 | yield self.reps 260 | yield self.envs 261 | yield self.excs 262 | yield self.otherwise 263 | yield self.flags 264 | yield self.rule 265 | 266 | def apply(self, word): 267 | '''Apply the sound change rule to a single word.
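A sketch of what `compileRule` (defined further down in this file) produces for a simple rule, tying the string form to the fields documented above; import path per this repo's layout:

```python
from conlanger.src.sce import compileRule

rule = compileRule('a > b / c_d')
print(str(rule))   # -> 'a > b / c_d': the source text is kept on the Rule
print(rule.flags)  # -> Flags(ignore=0, ditto=0, stop=0, rtl=0, repeat=1, persist=1, chance=100)
# rule.tars, rule.reps, and rule.envs hold the compiled Target, Replacement,
# and LocalEnvironment objects respectively
```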
268 | 269 | Arguments: 270 | word -- the word to which the rule is to be applied (Word) 271 | 272 | Raises RuleFailed if the rule did not apply to the word. 273 | ''' 274 | logger.debug(f'This rule: `{self}`') 275 | # Get all target matches, filtered by given indices 276 | logger.debug('Begin matching targets') 277 | matches = [] 278 | for i, target in enumerate(self.tars): 279 | logger.debug(f'> Matching `{target}`') 280 | if target: 281 | pattern, indices = target 282 | else: 283 | pattern, indices = [], None 284 | if not pattern: # All pos's 285 | logger.debug(f'>> Null target matched all positions in range 1..{len(word)}') 286 | _matches = [(pos, pos, [], i) for pos in range(1, len(word))] 287 | else: 288 | _matches = [] 289 | for pos in range(1, len(word)): # Find all matches 290 | match, rpos, catixes = word.matchPattern(pattern, pos) 291 | if match: # pattern matches at pos 292 | logger.debug(f'>> Target matched `{word[pos:rpos]}` at {pos}') 293 | _matches.append((pos, rpos, catixes, i)) 294 | if not _matches: 295 | logger.debug('>> No matches for this target') 296 | # Filter only those matches selected by the given indices 297 | if indices is None: 298 | matches += _matches 299 | elif _matches: 300 | matches += [_matches[ix] for ix in indices if -len(_matches) <= ix < len(_matches)] 301 | matches.sort() 302 | logger.debug(f'> Final matches at positions {[match[0] for match in matches]}') 303 | if not matches: 304 | logger.debug('No matches') 305 | raise RuleFailed 306 | # Filter only those matches that fit the environment - also record the corresponding replacement 307 | logger.debug('Check matches against environments and exceptions') 308 | reps = [] 309 | for i in reversed(range(len(matches))): 310 | logger.debug(f'> Checking match at {matches[i][0]}') 311 | check = self.checkMatch(matches[i], word) 312 | if not check: 313 | logger.debug(f'>> Match at {matches[i][0]} failed') 314 | del matches[i] 315 | else: 316 | # Find the correct replacement 317 | logger.debug('>> Get replacement for this match') 318 | rule = self 319 | for j in range(check-1): 320 | rule = rule.otherwise 321 | _reps = rule.reps 322 | match = matches[i][3] 323 | if isinstance(_reps, tuple): # Copy/move 324 | reps.append((_reps[0], _reps[1][match%len(_reps[1])])) 325 | else: 326 | reps.append(_reps[match%len(_reps)]) 327 | logger.debug(f'>>> Found {reps[-1]}') 328 | if not reps: 329 | logger.debug('No matches matched environment') 330 | raise RuleFailed 331 | reps.reverse() 332 | matches = sorted(zip(matches, reps), reverse=True) 333 | # Filter overlaps 334 | logger.debug('Filter out overlapping matches') 335 | if self.flags.rtl: 336 | logger.debug('> Proceeding right-to-left') 337 | i = 1 338 | while i < len(matches): 339 | if matches[i][0][1] > matches[i-1][0][0]: # Overlap 340 | logger.debug(f'>> Match at {matches[i][0][0]} overlaps match at {matches[i-1][0][0]}') 341 | del matches[i] 342 | else: 343 | i += 1 344 | else: 345 | logger.debug('> Proceeding left-to-right') 346 | for i in reversed(range(len(matches)-1)): 347 | if matches[i][0][0] < matches[i+1][0][1]: # Overlap 348 | logger.debug(f'>> Match at {matches[i][0][0]} overlaps match at {matches[i+1][0][0]}') 349 | del matches[i] 350 | logger.debug(f'Applying matches to `{word}`') 351 | for match, rep in matches: 352 | logger.debug(f'> Changing `{list(word[match[0]:match[1]])}` to `{rep}` at {match[0]}') 353 | word = word.applyMatch(match, rep) 354 | return word 355 | 356 | def checkMatch(self, match, word): 357 | pos, rpos = match[:2] 358 | if 
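The integer returned by `checkMatch` (whose body continues below) tells `apply` which rule in the otherwise-chain supplied the match. A summary of the convention, as consumed by the replacement lookup above:

```python
# checkMatch return convention:
#   0     -- nothing matched anywhere in the chain; the match is discarded
#   1     -- this rule's own environments matched; use this rule's reps
#   n > 1 -- the (n-1)th 'otherwise' rule matched; apply() walks rule.otherwise
#            n-1 times to fetch the corresponding replacement
```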
any(word.matchEnv(exc, pos, rpos) for exc in self.excs): # If there are exceptions, does any match? 359 | logger.debug('>> Matched an exception, check the "else" rule') 360 | elif any(word.matchEnv(env, pos, rpos) for env in self.envs): # Does any environment match? 361 | logger.debug('>> Matched an environment, check succeeded') 362 | return 1 363 | elif self.excs: # Are there exceptions? 364 | logger.debug('>> Environments and exceptions both don\'t match, check failed') 365 | return 0 366 | else: 367 | logger.debug('>> Environment doesn\'t match, check "else" rule') 368 | if self.otherwise is not None: # Try checking otherwise 369 | check = self.otherwise.checkMatch(match, word) 370 | return check + (1 if check else 0) 371 | else: 372 | logger.debug('>> No "else" rule, check failed') 373 | return 0 374 | 375 | @dataclass 376 | class RuleBlock(list): 377 | '''Groups a block of sound changes together. 378 | 379 | Instance variables: 380 | flags -- flags for altering execution (Flags) 381 | ''' 382 | ruleset: InitVar[list] 383 | flags: Flags = Flags() 384 | 385 | def __post_init__(self, ruleset): 386 | list.__init__(self, ruleset) 387 | 388 | def apply(self, word): 389 | from random import randint 390 | applied = False 391 | rules = [] # We use a list to store rules, since they may be applied multiple times 392 | values = [] # We have a parallel list for storing the remaining 'persist' count per rule 393 | for _rule in self: 394 | # We want _rule to run before the stored rules, but to be placed at the end instead 395 | rules.append(_rule) 396 | values.append(_rule.flags.persist) 397 | for rule in [_rule]+rules[:-1]: 398 | flags = rule.flags 399 | if not flags.ditto or (flags.ditto != 1) ^ applied: 400 | for j in range(flags.repeat): 401 | if randint(1, 100) <= flags.chance: 402 | applied = True 403 | wordin = word 404 | try: 405 | word = rule.apply(word) 406 | except RuleFailed: 407 | applied = False 408 | logger.info(f'`{rule}` does not apply to `{word}`') 409 | break 410 | except RuleError as e: 411 | logger.warning(f'`{rule}` execution suffered an error: {e}') 412 | break 413 | if wordin == word: 414 | logger.info(f'`{rule}` does not change `{word}`') 415 | break 416 | else: 417 | logger.info(f'`{wordin}` -> `{rule}` -> `{word}`') 418 | else: 419 | applied = False 420 | logger.info(f'`{rule}` was randomly not run on `{word}`') 421 | if flags.stop and (flags.stop != 1) ^ applied: 422 | return word 423 | for i in reversed(range(len(rules))): 424 | values[i] -= 1 425 | if values[i] == 0: # If the rule has 'expired', discard it 426 | del rules[i] 427 | del values[i] 428 | return word 429 | 430 | @dataclass 431 | class Line: 432 | word: Word = None 433 | comment: str = None 434 | 435 | def __str__(self): 436 | components = [] 437 | if self.word is not None: 438 | components.append(str(self.word)) 439 | if self.comment is not None: 440 | components.append(f'//{self.comment}') 441 | return ' '.join(components) 442 | 443 | # == Functions == # 444 | def parseWordset(wordset, graphs=(), separator='', syllabifier=None): 445 | '''Parses a wordlist. 446 | 447 | Arguments: 448 | wordset -- the words to be parsed (str or list) 449 | graphs -- list of graphemes used to parse the words (list) 450 | 451 | Returns a list.
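A sketch of the `Line` values `parseWordset` (continuing below) produces for some hypothetical inputs, showing how comments stay attached to their lines:

```python
# 'kata // cliff'  -> Line(word=Word('kata '), comment=' cliff')
# '// section two' -> Line(comment=' section two')
# ''               -> Line()  (a blank line is preserved as an empty Line)
```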
452 | ''' 453 | if isinstance(wordset, str): 454 | wordset = wordset.splitlines() 455 | _wordset = [] 456 | for word in wordset: 457 | if isinstance(word, Word): 458 | line = Line(word=word) 459 | elif isinstance(word, Line): 460 | line = word 461 | elif word.startswith('//'): # Is a comment 462 | line = Line(comment=word[2:]) 463 | elif '//' in word: # Contains a comment 464 | word, comment = word.split('//', 1) 465 | line = Line(word=Word(word, graphs, separator, syllabifier), comment=comment) 466 | elif word: 467 | line = Line(word=Word(word, graphs, separator, syllabifier)) 468 | else: 469 | line = Line() 470 | _wordset.append(line) 471 | return _wordset 472 | 473 | def tokeniseCategory(line, linenum=0): 474 | for match in CATEGORY_REGEX.finditer(line): 475 | type = match.lastgroup 476 | value = match.group() 477 | column = match.start() 478 | if type == 'OP': 479 | value = value.strip() 480 | elif type == 'UNKNOWN': 481 | raise CompilerError(f'unexpected character', value, linenum, column) 482 | yield Token(type, value, linenum, column) 483 | 484 | def compileCategory(line, linenum=0, cats=None): 485 | tokens = list(tokeniseCategory(line, linenum)) 486 | if [token.type for token in tokens] != ['CATEGORY', 'OP', 'VALUES']: 487 | raise FormatError(f'{line!r} is not a category definition') 488 | name, op, values = [token.value for token in tokens] 489 | if ',' not in values: 490 | values += ',' 491 | cat = Cat.make(f'[{values}]', cats, name) 492 | if op == '=': 493 | return {name: cat} 494 | else: 495 | if cats is None or name not in cats: 496 | raise TokenError(f'category {name!r} is not defined', tokens[1]) 497 | if op == '+=': 498 | return {name: cats[name]+cat} 499 | elif op == '-=': 500 | return {name: cats[name]-cat} 501 | else: 502 | raise TokenError('invalid category operation', tokens[1]) 503 | 504 | def tokeniseFlags(line, linenum=0, colstart=None): 505 | for match in FLAG_REGEX.finditer(line, colstart): 506 | type = match.lastgroup 507 | value = match.group() 508 | column = match.start() 509 | if type == 'UNKNOWN': 510 | raise CompilerError(f'unexpected character', value, linenum, column) 511 | yield Token(type, value, linenum, column) 512 | 513 | def compileFlags(tokens): 514 | tokens = list(tokens) 515 | binaryflags = ('ignore', 'rtl') 516 | ternaryflags = ('ditto', 'stop') 517 | numericflags = {'repeat': MAX_RUNS, 'persist': MAX_RUNS, 'chance': 100} # Maximum values 518 | flags = {} 519 | for flag, token in partitionTokens(tokens, 'SEPARATOR'): 520 | if not flag: 521 | raise TokenError('expected flag', token) 522 | elif flag[0].type == 'NEGATION': 523 | name = flag[-1].value 524 | if len(flag) == 1: 525 | raise TokenError('expected flag name', token) 526 | elif flag[1].type != 'FLAG': 527 | raise TokenError('expected flag name', flag[1]) 528 | elif name not in ternaryflags: 529 | raise TokenError('invalid ternary flag name', flag[1]) 530 | elif len(flag) == 2: 531 | flags[name] = -1 532 | else: 533 | raise TokenError('expected semicolon', flag[2]) 534 | elif flag[0].type == 'FLAG': 535 | name = flag[0].value 536 | arg = flag[-1].value 537 | if name not in FLAGS: 538 | raise TokenError('invalid flag name', flag[0]) 539 | elif len(flag) == 1: 540 | if name in numericflags: 541 | flags[name] = numericflags[name] # Set to maximum value 542 | else: 543 | flags[name] = 1 544 | elif flag[1].type != 'COLON': 545 | raise TokenError('expected colon or semicolon', flag[1]) 546 | elif name not in numericflags: 547 | raise TokenError('invalid numeric flag name', flag[1]) 548 | elif 
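`compileFlags` above accepts three flag shapes: bare binary/ternary flags, `!`-negated ternary flags, and `name: N` numeric flags, separated by semicolons. A sketch (`colstart` is passed explicitly, since the flag tokeniser is normally invoked mid-line):

```python
from conlanger.src.sce import tokeniseFlags, compileFlags

flags = compileFlags(tokeniseFlags('rtl; repeat: 2; !stop', 0, 0))
print(flags)
# -> Flags(ignore=0, ditto=0, stop=-1, rtl=1, repeat=2, persist=1, chance=100)
# '!' negates a ternary flag (stored as -1); bare numeric flags take their maximum
```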
len(flag) == 2: 549 | raise TokenError('expected integer argument', token) 550 | elif flag[2].type != 'ARGUMENT': 551 | raise TokenError('expected integer argument', flag[2]) 552 | elif not (1 <= int(arg) <= numericflags[name]): 553 | raise TokenError('argument out of range', flag[2]) 554 | elif len(flag) == 3: 555 | flags[name] = int(arg) 556 | else: 557 | raise TokenError('expected semicolon', flag[3]) 558 | else: 559 | raise TokenError('invalid flag', flag[0]) 560 | return Flags(**flags) 561 | 562 | def tokeniseMetarule(line, linenum=0): 563 | for match in METARULE_REGEX.finditer(line): 564 | type = match.lastgroup 565 | value = match.group() 566 | column = match.start() 567 | if type == 'METARULE': 568 | value = value[1:] 569 | elif type == 'SPACE': 570 | yield Token(type, value, linenum, column) 571 | yield from tokeniseFlags(line, linenum, match.end()) 572 | break 573 | elif type == 'UNKNOWN': 574 | raise CompilerError('unexpected character', value, linenum, column) 575 | yield Token(type, value, linenum, column) 576 | 577 | def compileMetarule(line, linenum=0): 578 | tokens = list(tokeniseMetarule(line, linenum)) 579 | if not tokens: 580 | raise ValueError('tokens cannot be empty') 581 | name = tokens[0].value 582 | for ix, token in enumerate(tokens): 583 | if token.type == 'SPACE': # Found flags 584 | flags = compileFlags(tokens[ix+1:]) 585 | break 586 | else: 587 | ix = len(tokens) 588 | if name == 'block': 589 | flags = Flags() 590 | else: 591 | flags = None 592 | arg = tokens[ix-1].value 593 | if tokens[0].type != 'METARULE': 594 | raise TokenError('expected metarule name', tokens[0]) 595 | elif name not in METARULES: 596 | raise TokenError('invalid metarule name', tokens[0]) 597 | elif name in ('def', 'rule') and flags: 598 | raise TokenError(f'metarule !{name} cannot take flags', tokens[ix]) 599 | elif ix == 1: 600 | if name == 'block': 601 | arg = None 602 | else: 603 | if ix < len(tokens): 604 | token = tokens[ix] 605 | else: 606 | token = Token('', '', linenum, tokens[-1].column+len(tokens[-1].value)) 607 | raise TokenError(f'metarule !{name} requires an argument', token) 608 | elif tokens[1].type != 'COLON': 609 | raise TokenError('expected colon', tokens[1]) 610 | elif ix == 2: 611 | raise TokenError('colon must be followed by an argument', tokens[1]) 612 | elif tokens[2].type != 'NUMBER' and name == 'block': 613 | raise TokenError('metarule !block requires an integer argument', tokens[2]) 614 | elif tokens[2].type != 'IDENTIFIER' and name in ('def', 'rule'): 615 | raise TokenError(f'metarule !{name} requires an alphabetic argument', tokens[2]) 616 | elif ix == 3: 617 | if name == 'block': 618 | arg = int(arg) 619 | elif ix > 3: 620 | raise TokenError('expected space or newline', tokens[3]) 621 | return name, arg, flags 622 | 623 | def tokeniseRule(line, linenum=0): 624 | colstart = 0 625 | while colstart < len(line): 626 | match = RULE_REGEX.match(line, colstart) 627 | type = match.lastgroup 628 | value = match.group() 629 | column = match.start() 630 | colstart = match.end() 631 | if type == 'INDICES': 632 | yield Token(type, value[1:], linenum, column) 633 | continue 634 | elif type == 'SPACE': 635 | yield Token(type, value, linenum, column) 636 | yield from tokeniseFlags(line, linenum, colstart) 637 | break 638 | elif type == 'UNKNOWN': 639 | if column == 0: 640 | type = 'TARGET' 641 | value = '' 642 | colstart = 0 643 | else: 644 | raise CompilerError('unexpected character', value, linenum, column) 645 | yield Token(type, value, linenum, column) 646 | colstart = yield from
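A sketch of `compileMetarule`'s three-part return value `(name, arg, flags)` for the two metarule shapes (the `!def` case relies on the `ix == 3` handling as fixed above):

```python
from conlanger.src.sce import compileMetarule

print(compileMetarule('!block: 3 rtl'))
# -> ('block', 3, Flags(..., rtl=1, ...))
print(compileMetarule('!def: dropfinal'))
# -> ('def', 'dropfinal', None)
```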
tokenisePattern(line, colstart, linenum) 647 | else: 648 | yield Token('END', '', linenum, colstart) 649 | 650 | def compileIndexedPattern(pattern, cats=None, reduceindices=True): 651 | if pattern[-1].type == 'INDICES': 652 | indices = [int(index) for index in pattern[-1].value.split('|')] 653 | if reduceindices: 654 | indices = [index-(1 if index>0 else 0) for index in indices] 655 | pattern = pattern[:-1] 656 | else: 657 | indices = None 658 | return compilePattern(pattern, cats), indices 659 | 660 | def compileTarget(pattern, cats=None): 661 | return Target(*compileIndexedPattern(pattern, cats)) 662 | 663 | def compileEpenthesis(pattern, cats=None): 664 | pattern, indices = compileIndexedPattern(pattern, cats, False) 665 | return Target([], indices), Replacement(pattern) 666 | 667 | def compileReplacement(pattern, cats=None): 668 | pattern, indices = compileIndexedPattern(pattern, cats, False) 669 | if indices is not None: 670 | raise FormatError('replacement field cannot contain indices') 671 | else: 672 | return Replacement(pattern) 673 | 674 | def compileEnvironment(pattern, cats=None, reduceindices=True): 675 | patterns = [] 676 | for pattern, sep in partitionTokens(pattern, 'PLACEHOLDER'): 677 | if sep is not None and patterns: # Only one placeholder is allowed, which then follows the first pattern 678 | raise TokenError('invalid placeholder', sep) 679 | patterns.append(pattern) 680 | if len(patterns) == 2: 681 | left, right = patterns 682 | env = LocalEnvironment(compilePattern(left, cats), compilePattern(right, cats)) 683 | elif len(patterns) == 1: 684 | pattern = patterns[0] 685 | env = GlobalEnvironment(*compileIndexedPattern(pattern, cats, reduceindices)) 686 | return env or None 687 | 688 | COMPILERS = { 689 | 'EPENTHESIS': compileEpenthesis, 690 | 'DELETION': compileTarget, 691 | 'TARGET': compileTarget, 692 | 'MOVE': lambda pattern, cats: compileField(pattern, cats, 'AND', False), 693 | 'COPY': lambda pattern, cats: compileField(pattern, cats, 'AND', False), 694 | 'REPLACEMENT': compileReplacement, 695 | 'ENVIRONMENT': lambda pattern, cats: compileField(pattern, cats, 'AND'), 696 | 'EXCEPTION': lambda pattern, cats: compileField(pattern, cats, 'AND'), 697 | } 698 | 699 | def compileField(tokens, cats=None, delimiter='OR', reduceindices=True): 700 | if not tokens: 701 | return [] 702 | if tokens[-1].type == delimiter: 703 | raise TokenError('invalid delimiter', tokens[-1]) 704 | fieldmarker = tokens[0].type 705 | _compile = COMPILERS.get(fieldmarker, lambda pattern, cats: compileEnvironment(pattern, cats, reduceindices)) 706 | if fieldmarker in COMPILERS: 707 | tokens = tokens[1:] 708 | field = [] 709 | for pattern, sep in partitionTokens(tokens, delimiter): 710 | if not pattern: 711 | raise TokenError('unexpected delimiter', sep) 712 | field.append(_compile(pattern, cats)) 713 | # Final replacements field handling 714 | if fieldmarker in ('MOVE', 'COPY'): 715 | return fieldmarker.lower(), field 716 | elif fieldmarker == 'EPENTHESIS': 717 | return map(list, zip(*field)) 718 | return field 719 | 720 | FIELD_MARKERS = { 721 | 'EPENTHESIS': 'reps', 722 | 'DELETION': 'tars', 723 | 'TARGET': 'tars', 724 | 'MOVE': 'reps', 725 | 'COPY': 'reps', 726 | 'REPLACEMENT': 'reps', 727 | 'ENVIRONMENT': 'envs', 728 | 'EXCEPTION': 'excs', 729 | } 730 | 731 | def compileRule(line, linenum=0, cats=None): 732 | from math import ceil 733 | if isinstance(line, str): 734 | tokens = list(tokeniseRule(line, linenum)) 735 | else: 736 | tokens = line 737 | line = '' 738 | if tokens[0].type == 'END': 
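Index handling in `compileIndexedPattern` above, summarised: user-facing indices are 1-based (negative indices count from the end), and positive indices are shifted down by one for internal 0-based use; `reduceindices=False` (epenthesis, move/copy) keeps them as written.

```python
# 'a@1|3' -> pattern for 'a', indices [0, 2]
# 'a@-1'  -> pattern for 'a', indices [-1]
```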
739 | tokens = [] 740 | elif tokens[0].type not in FIELD_MARKERS: 741 | raise TokenError('unexpected token', tokens[0]) 742 | fields = { 743 | 'otherwise': None, 744 | 'flags': Flags(), 745 | 'rule': line 746 | } 747 | # Extract flags 748 | for ix, token in enumerate(tokens): 749 | if token.type == 'SPACE': 750 | fields['flags'] = compileFlags(tokens[ix+1:]) 751 | tokens[ix].type = 'END' 752 | break 753 | # Extract remainder of fields 754 | i = None 755 | for j, token in enumerate(tokens): 756 | type, value = token 757 | if type in FIELD_MARKERS or type == 'END': 758 | if i is not None: 759 | field = FIELD_MARKERS[tokens[i].type] 760 | if field in fields: 761 | raise TokenError('unexpected field marker', tokens[i]) 762 | fields[field] = tokens[i:j] 763 | i = j 764 | if type in ('MOVE', 'COPY', 'REPLACEMENT'): 765 | if 'reps' in fields: # Detected an otherwise 766 | fields['otherwise'] = compileRule(fields.get('tars', []) + tokens[j:ix+1], cats=cats) 767 | break 768 | elif type == 'END': 769 | break 770 | # Check for restricted field combinations 771 | if 'tars' in fields and 'reps' in fields: 772 | if fields['tars'][0].type == 'DELETION': 773 | raise TokenError('replacement field not allowed with deletion', fields['reps'][0]) 774 | if fields['reps'][0].type == 'EPENTHESIS': 775 | raise TokenError('target field not allowed with epenthesis', fields['tars'][0]) 776 | # Compile fields 777 | fields['tars'] = compileField(fields.get('tars', []), cats) or [[]] 778 | fields['reps'] = compileField(fields.get('reps', []), cats) or [[]] 779 | fields['envs'] = compileField(fields.get('envs', []), cats) or [[]] 780 | fields['excs'] = compileField(fields.get('excs', []), cats) 781 | # Handle indexed epenthesis 782 | if isinstance(fields['reps'], map): # Epenthesis 783 | fields['tars'], fields['reps'] = fields['reps'] 784 | return Rule(**fields) 785 | 786 | def compileLine(line, linenum=0, cats=None): 787 | if not line: 788 | return None 789 | # Attempt to tokenise as category 790 | with suppress(CompilerError, FormatError): 791 | return compileCategory(line, linenum, cats) 792 | # Attempt to tokenise as metarule 793 | with suppress(CompilerError, FormatError): 794 | return compileMetarule(line, linenum) 795 | # Attempt to tokenise as rule 796 | return compileRule(line, linenum, cats) 797 | 798 | def makeBlock(ruleset, start=None, num=None, defs=None): 799 | if defs is None: 800 | defs = {} 801 | else: 802 | defs = defs.copy() 803 | cats = [] 804 | block = [] 805 | if start is None: 806 | i = 0 807 | else: 808 | i = start 809 | while len(block) != num and i < len(ruleset): 810 | rule = ruleset[i] 811 | i += 1 812 | if isinstance(rule, Rule): # Rule 813 | block.append(rule) 814 | elif isinstance(rule, dict): # Category 815 | cats += rule.items() 816 | elif isinstance(rule, tuple): # Metarule 817 | name, arg, flags = rule 818 | if name == 'block': 819 | if arg is not None: 820 | _block, _cats, i, defs = makeBlock(ruleset, i, arg, defs) 821 | else: 822 | _block, _cats = makeBlock(ruleset, i, arg, defs) 823 | i = len(ruleset) 824 | block.append(RuleBlock(_block, flags)) 825 | cats += _cats 826 | elif name == 'def': 827 | _block, _cats, i, defs = makeBlock(ruleset, i, 1, defs) 828 | defs[arg] = _block 829 | cats += _cats 830 | elif name == 'rule': 831 | block.extend(defs[arg]) 832 | if start is None: 833 | return block, cats 834 | else: 835 | return block, cats, i, defs 836 | 837 | def compileRuleset(ruleset, cats=None): 838 | if isinstance(ruleset, str): 839 | ruleset = ruleset.splitlines() 840 | if cats is 
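`makeBlock` and `compileRuleset` (continuing below) evaluate the metarules: `!def` stores the next rule without running it, `!rule` replays a stored rule, and `!block` groups rules into a nested `RuleBlock`. A sketch, assuming the import path per this repo's layout:

```python
from conlanger.src.sce import compileRuleset

rules, cats = compileRuleset('''
V = a,i,u          // a category definition, visible to later rules
!def: dropfinal    // store the next rule under a name...
- [V] / _#
e > i
!rule: dropfinal   // ...and replay it here
''')
print(len(rules))  # -> 2: 'e > i', then the replayed '- [V] / _#'
```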
None: 841 | cats = {} 842 | else: 843 | cats = cats.copy() 844 | _ruleset = [] 845 | for linenum, line in enumerate(ruleset): 846 | # Remove comments 847 | line = line.split('//')[0].strip() 848 | # Compile 849 | try: 850 | rule = compileLine(line, linenum, cats) 851 | except CompilerError as e: 852 | logger.warning(f'{line!r} failed to compile due to bad formatting: {e}') 853 | except Exception as e: 854 | logger.warning(f'{line!r} failed to compile due to an unexpected error: {e}') 855 | else: 856 | if isinstance(rule, dict): # Category 857 | cats.update(rule) 858 | _ruleset.append(rule) 859 | # Evaluate meta-rules 860 | ruleset, _cats = makeBlock(_ruleset) 861 | return RuleBlock(ruleset), _cats 862 | 863 | def setupLogging(filename=__location__, loggername='sce'): 864 | global logger 865 | if filename is not None: 866 | logging.config.fileConfig(filename) 867 | logger = logging.getLogger(loggername) 868 | 869 | def run(wordset, ruleset, cats=None, syllabifier=None, output='list'): 870 | '''Applies a set of sound change rules to a set of words. 871 | 872 | Arguments: 873 | wordset -- the words to which the rules are to be applied (list) 874 | ruleset -- the rules which are to be applied to the words (RuleBlock) 875 | cats -- the initial categories to be used in ruleset compiling (dict) 876 | syllabifier -- the syllabifier function to use for syllabifying words (RulesSyllabifier) 877 | output -- what form to provide the output in - one of 'list', 'as-is', 'str' (str) 878 | 879 | Returns a str or list. 880 | ''' 881 | if not ruleset or not wordset: # One of these is blank so do nothing 882 | return wordset 883 | cats = parseCats(cats or {}) 884 | ruleset, _cats = compileRuleset(ruleset, cats) # Compile ruleset first so we can use the graphs it contains 885 | # Try to get graphs and separator from the initial categories 886 | graphs = cats.get('graphs', ()) 887 | separator = cats.get('separator', [''])[0] 888 | # Ruleset overrides externally-supplied categories 889 | for name, cat in _cats: 890 | if name == 'graphs': 891 | graphs = cat 892 | elif name == 'separator': 893 | separator = cat[0] 894 | else: 895 | break 896 | wordset = parseWordset(wordset, graphs, separator, syllabifier) 897 | for line in wordset: 898 | if line.word is not None: # There's a word 899 | logger.info(f'This word: {line.word}') 900 | logger.debug(f'Segments: {line.word.phones}') 901 | line.word = ruleset.apply(line.word) 902 | if output != 'as-is': 903 | wordset = [str(line) for line in wordset] 904 | if output == 'str': 905 | wordset = '\n'.join(wordset) 906 | return wordset 907 | 908 | apply_ruleset = run 909 | 910 | # Setup logging 911 | setupLogging() 912 | --------------------------------------------------------------------------------
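Finally, an end-to-end sketch of the public entry point `run` (also exposed as `apply_ruleset`); the expected output is hedged on `core.py`'s `Word` matching and string semantics, which live outside this file:

```python
from conlanger.src import sce

words = ['atka', 'tata']
changes = '''
t > d / a_a
tk > kk
'''
print(sce.run(words, changes))  # -> ['akka', 'tada']
```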