.
676 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # PatternOmatic Makefile
3 | #
4 | # This file is part of PatternOmatic.
5 | #
6 | # Copyright © 2020 Miguel Revuelta Espinosa
7 | #
8 | # PatternOmatic is free software: you can redistribute it and/or
9 | # modify it under the terms of the GNU Lesser General Public License
10 | # as published by the Free Software Foundation, either version 3 of
11 | # the License, or (at your option) any later version.
12 | #
13 | # PatternOmatic is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU Lesser General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU Lesser General Public License
19 | # along with PatternOmatic. If not, see .
20 | #
export PYTHONPATH=.

# Every target below is a command alias, not a real file. Declaring them
# .PHONY prevents a file or directory with the same name (notably the build/
# and dist/ folders produced by "python setup.py") from making the target
# look up to date and silently skipping its recipe.
.PHONY: all venv clean libs test coverage sonar sonarcloud build publish run

all: libs coverage clean build sonar

# NOTE(review): each recipe line runs in its own sub-shell, so sourcing the
# virtualenv here cannot affect later targets or the caller's shell — confirm
# this target's intended usage.
venv:
	source venv/bin/activate

# Remove build artifacts and profiling output
clean:
	rm -rf `pwd`/build
	rm -rf `pwd`/dist
	rm -rf `pwd`/PatternOmatic.egg-info
	rm -rf `pwd`/fil-result

# Install runtime dependencies
libs:
	pip install -r requirements.txt

# Run the unit test suite
test:
	python -m unittest

# Run tests under coverage and emit console + XML reports
coverage:
	coverage run --branch --source=PatternOmatic,scripts,tests --omit=*__init__* -m unittest && \
	coverage report --ignore-errors --omit=venv/**,tests/**,*__init__* && \
	coverage xml

# Static analysis against a local SonarQube instance
sonar:
	sonar-scanner -Dsonar.projectKey=pOm -Dsonar.exclusions=tests/**

# Static analysis against SonarCloud
sonarcloud:
	sonar-scanner -Dsonar.projectKey=revuel_PatternOmatic

# Build source and wheel distributions
build:
	python setup.py sdist bdist_wheel

# Upload distributions to PyPI (expects PYPI_TOKEN in the environment)
publish:
	twine upload -u __token__ -p ${PYPI_TOKEN} --repository-url https://upload.pypi.org/legacy/ dist/*

# Demo run of the CLI script
run:
	python ./scripts/patternomatic.py -s Hello Mr. Puffin -s Goodbye Mrs. Muffin
59 |
--------------------------------------------------------------------------------
/PatternOmatic/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/revuel/PatternOmatic/8f95c1c134a14419a11b8cb192144857b40d0b3c/PatternOmatic/__init__.py
--------------------------------------------------------------------------------
/PatternOmatic/api.py:
--------------------------------------------------------------------------------
1 | """ Application Programming Interface module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import time
22 | import pkg_resources
23 | from typing import List, Union, Tuple, Any
24 | from spacy import load as spacy_load
25 | from spacy.cli import download as spacy_download
26 |
27 | from PatternOmatic.ge.population import Population
28 | from PatternOmatic.ge.stats import Stats
29 | from PatternOmatic.settings.config import Config
30 | from PatternOmatic.settings.log import LOG
31 | from PatternOmatic.nlp.bnf import dynamic_generator as dgg
32 |
33 |
def find_patterns(
        samples: List[str],
        configuration: Union[str, None] = None,
        spacy_language_model_name: Union[str, None] = None) -> List[Tuple[Any, ...]]:
    """
    Given some samples, this function finds optimized patterns to be used by the Spacy's Rule Based Matcher.
    Args:
        samples: List of strings from where to find common linguistic patterns
        configuration: (str) Optional configuration file path to be loaded (Fallbacks to default configuration)
        spacy_language_model_name: (str) Optional valid Spacy Language Model (Fallbacks to Spacy's en_core_web_sm)

    Returns: List of patterns found and list of each pattern matching score against the samples

    """
    LOG.info(f'Loading language model {spacy_language_model_name}...')

    # Make sure the fallback model is installed before any load attempt
    if 'en-core-web-sm' not in [d.project_name for d in pkg_resources.working_set]:
        LOG.info('PatternOmatic\'s default spaCy\'s Language Model not installed,'
                 ' proceeding to install en_core_web_sm, please wait...')
        spacy_download('en_core_web_sm')

    if spacy_language_model_name is None:
        # spacy_load(None) raises TypeError, not OSError, so the fallback in
        # the except clause below would never trigger; handle the "no model
        # requested" case explicitly instead.
        nlp = spacy_load('en_core_web_sm')
    else:
        try:
            nlp = spacy_load(spacy_language_model_name)
        except OSError:
            LOG.warning(f'Model {spacy_language_model_name} not found, '
                        f'falling back to patternOmatic\'s default language model: en_core_web_sm')
            nlp = spacy_load('en_core_web_sm')

    LOG.info(f'Building Doc instances...')
    samples = [nlp(sample) for sample in samples]

    if isinstance(configuration, str):
        LOG.info(f'Setting up configuration from the following path: {configuration}...')
        config = Config(config_file_path=configuration)
    else:
        config = Config()
    LOG.info(f'Existing Config instance found: {config}')

    stats = Stats()

    # Build a Backus Naur Form grammar out of the given samples
    bnf_g = dgg(samples)

    LOG.info('Starting Execution...')
    for _ in range(0, config.max_runs):
        start = time.monotonic()
        p = Population(samples, bnf_g, stats)
        p.evolve()
        end = time.monotonic()
        stats.add_time(end - start)
        stats.calculate_metrics()

    LOG.info(f'Execution report {stats}')
    stats.persist()

    LOG.info(f'Best individuals for this execution:')
    stats.most_fitted_accumulator.sort(key=lambda i: i.fitness_value, reverse=True)
    for individual in stats.most_fitted_accumulator:
        LOG.info(f'{individual}')

    # Transpose [(fenotype, fitness), ...] into ([fenotypes...], [fitness_values...])
    return list(zip(*[[i.fenotype, i.fitness_value] for i in stats.most_fitted_accumulator]))
94 |
--------------------------------------------------------------------------------
/PatternOmatic/ge/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/revuel/PatternOmatic/8f95c1c134a14419a11b8cb192144857b40d0b3c/PatternOmatic/ge/__init__.py
--------------------------------------------------------------------------------
/PatternOmatic/ge/individual.py:
--------------------------------------------------------------------------------
1 | """ Evolutionary Individual related classes module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import re
22 | import json
23 |
24 | from random import random
25 | from itertools import cycle
26 | from spacy.tokens import Doc
27 | from spacy.matcher import Matcher
28 |
29 | from PatternOmatic.ge.stats import Stats
30 | from PatternOmatic.settings.config import Config
31 | from PatternOmatic.settings.log import LOG
32 | from PatternOmatic.settings.literals import FitnessType, S, T, XPS, TOKEN_WILDCARD, UNDERSCORE, P, F, EF, IN, NOT_IN, \
33 | SLD, SRD, GTH, LTH, GEQ, LEQ, EQQ, XPS_AS
34 |
35 |
class Fitness(object):
    """ Dispatches the proper fitness type for individual instances """
    __slots__ = ('_fitness', 'config', 'samples', 'fenotype')

    def __init__(self, config, samples, fenotype):
        """
        Stores the evaluation context and binds the concrete fitness callable
        according to the configured fitness function type
        Args:
            config: Config instance in charge of the current execution
            samples: List of Spacy Doc objects to match the fenotype against
            fenotype: Spacy Rule Based Matcher pattern of the individual
        """
        self.config = config
        self.samples = samples
        self.fenotype = fenotype
        self._dispatch_fitness(self.config.fitness_function_type)

    def __call__(self, *args, **kwargs) -> float:
        """ Evaluates the bound fitness function and returns its value """
        return self._fitness()

    def _dispatch_fitness(self, fitness_function_type: FitnessType) -> None:
        """
        Sets the type of the fitness function for an Individual instance
        Args:
            fitness_function_type: The fitness function to be used

        Returns: None

        """
        self._fitness = (
            self._fitness_full_match
            if fitness_function_type == FitnessType.FULL_MATCH
            else self._fitness_basic)

    def _fitness_basic(self) -> float:
        """
        Scores the individual granting credit for each sample where the
        pattern produces at least one match, even a partial one
        Returns: Float (fitness value)

        """
        sample_score = 1 / len(self.samples)
        matcher = Matcher(self.samples[0].vocab)
        matcher.add(repr(FitnessType.BASIC), None, self.fenotype)

        contact = 0.0
        for sample in self.samples:
            if matcher(sample):
                contact += sample_score

        return self._wildcard_penalty(contact)

    def _fitness_full_match(self) -> float:
        """
        Scores the individual granting credit only for matches spanning a
        whole sample (the match starts at token 0 and ends at the last token)
        Returns: Float

        """
        sample_score = 1 / len(self.samples)
        matcher = Matcher(self.samples[0].vocab)
        matcher.add(repr(FitnessType.FULL_MATCH), None, self.fenotype)

        contact = 0.0
        for sample in self.samples:
            for _, start, end in matcher(sample):
                if start == 0 and end == len(sample):
                    contact += sample_score

        return self._wildcard_penalty(contact)

    def _wildcard_penalty(self, contact: float) -> float:
        """
        Applies a penalty for the usage of token wildcard if usage of token wildcard is enabled
        Args:
            contact: Temporary fitness value for the current individual

        Returns: Final fitness value for the current individual

        """
        if self.config.use_token_wildcard:
            # Each wildcard ({}) in the fenotype costs 1/len(fenotype)
            penalty = 1 / len(self.fenotype)
            for token in self.fenotype:
                if token == {}:
                    LOG.debug('Applying token wildcard penalty!')
                    contact -= penalty

        return contact
122 |
123 |
class Individual(object):
    """ Individual implementation of an AI Grammatical Evolution algorithm in OOP fashion """
    __slots__ = ('config', 'samples', 'grammar', 'stats', 'bin_genotype', 'int_genotype', 'fenotype', 'fitness_value')

    def __init__(self, samples: [Doc], grammar: dict, stats: Stats, dna: str = None):
        """
        Individual constructor, if dna is not supplied, sets up randomly its binary genotype
        Args:
            samples: list of Spacy doc objects
            grammar: Backus Naur Form grammar notation encoded in a dictionary
            stats (Stats): statistics object related with this run
            dna: Optional, binary string representation
        """
        self.config = Config()

        self.samples = samples
        self.grammar = grammar
        self.stats = stats
        # Genotype-to-fenotype pipeline: binary string -> integer codons ->
        # Spacy Rule Based Matcher pattern -> fitness score
        self.bin_genotype = self._initialize() if dna is None else self.mutate(dna, self.config.mutation_probability)
        self.int_genotype = self._transcription()
        self.fenotype = self._translation()
        self.fitness_value = Fitness(self.config, self.samples, self.fenotype).__call__()

        # Stats concerns
        self._is_solution()

    @property
    def __dict__(self):
        """ Dictionary representation for a slotted class (that has no dict at all) """
        # Above works just for POPOs
        # Only the reporting-relevant fields are exposed
        return {s: getattr(self, s, None) for s in self.__slots__ if s in ('bin_genotype', 'fenotype', 'fitness_value')}

    def __repr__(self):
        """ String representation of a slotted class using hijacked dict """
        return f'{self.__class__.__name__}({self.__dict__})'

    #
    # Problem specific GE methods
    #
    def _initialize(self) -> str:
        """
        Sets up randomly the binary string representation of an individual
        Returns: String, binary fashion

        """
        # One random bit per position, config.dna_length bits in total
        return ''.join([''.join('1') if random() > 0.5
                        else ''.join('0') for _ in range(0, self.config.dna_length)]).strip()

    def _transcription(self) -> [int]:
        """
        Converts a binary string representation to an integer representation codon by codon
        Returns: List of integers

        """
        # NOTE(review): both the slice width and the step use
        # codon_length - 1 bits per codon, which looks off by one with
        # respect to config.codon_length — confirm this is intentional.
        return [int(self.bin_genotype[i:(i+self.config.codon_length-1)], 2)
                for i in range(0, len(self.bin_genotype), self.config.codon_length-1)]

    def _translation(self):
        """
        Derives the fenotype: repeatedly rewrites the grammar's root symbol,
        consuming the integer genotype circularly, until a full pass over the
        grammar keys produces no further change; the resulting text is then
        parsed as a JSON list (a Spacy Rule Based Matcher pattern)
        Returns: List, the individual's Spacy Rule Based Matcher pattern

        """
        done = False
        symbolic_string = self.grammar[S][0]  # Root
        circular = cycle(self.int_genotype)

        while done is not True:
            # First save previous iteration copy
            old_symbolic_string = symbolic_string
            ci = next(circular)

            for key in self.grammar.keys():
                symbolic_string = self._translate(ci, key, symbolic_string)

            # Check if anything changed from last iteration
            if old_symbolic_string == symbolic_string:
                done = True

        # Wrap in brackets so json.loads yields a list of token dicts
        translated_individual = '[' + symbolic_string + ']'

        return json.loads(translated_individual)

    def _translate(self, ci: int, key, symbolic_string: str):
        """
        Helper method to reduce cognitive overload of the public method with the same name (_translation)
        Args:
            ci: Current codon (integer) taken from the circular genotype iterator
            key: Last key in the grammar dict
            symbolic_string: String representation of the individual's Spacy's Rule Based Matcher pattern

        Returns: String representation of the individual's Spacy's Rule Based Matcher pattern

        """
        # Modulo rule selection: the codon picks one production of this key
        fire = divmod(ci, len(self.grammar[key]))[1]

        if key in [T, XPS]:
            fired_rule = self.grammar[key][fire]
            if fired_rule == TOKEN_WILDCARD:
                symbolic_string = re.sub(key, "{}", symbolic_string, 1)
            else:
                symbolic_string = re.sub(key, "{" + str(self.grammar[key][fire]) + "}", symbolic_string, 1)

        # NOTE(review): identity comparison relies on UNDERSCORE being the
        # same interned string object as the grammar key — confirm; ==
        # would be the safer comparison.
        elif key is UNDERSCORE:
            symbolic_string = re.sub(key, "\"_\"" + ": " + "{" + str(self.grammar[key][fire]) + "}", symbolic_string, 1)

        # NOTE(review): T is already consumed by the first branch above, so
        # it is unreachable here — confirm whether it can be dropped.
        elif key in [P, T, F, EF]:
            symbolic_string = re.sub(key, str(self.grammar[key][fire]), symbolic_string, 1)

        elif key in [IN, NOT_IN]:
            # Strip the symbol delimiters and emit the list as JSON
            key_r = key.replace(SLD, '').replace(SRD, '')
            feature = "\"" + key_r + "\"" + ":" + str(self.grammar[key][fire]).replace("\'", "\"").replace("\'", "")
            symbolic_string = re.sub(key, feature, symbolic_string, 1)

        elif key in [GTH, LTH, GEQ, LEQ, EQQ]:
            # Comparison operators map to their extended-pattern-syntax alias
            feature = "\"" + XPS_AS[key] + "\"" + ":" + str(self.grammar[key][fire])
            symbolic_string = re.sub(key, feature, symbolic_string, 1)

        else:
            key_r = key.replace(SLD, '').replace(SRD, '')
            fired_rule = str(self.grammar[key][fire])
            # XPS stays unquoted so a later pass can expand it further
            if fired_rule != XPS:
                feature = "\"" + key_r + "\"" + ":" + "\"" + fired_rule + "\""
            else:
                feature = "\"" + key_r + "\"" + ":" + fired_rule
            symbolic_string = re.sub(key, feature, symbolic_string, 1)

        return symbolic_string

    #
    # Generic GA methods
    #
    @classmethod
    def mutate(cls, dna, mutation_probability) -> str:
        """
        Mutates a given dna string by a mutation probability
        Args:
            dna: binary string representation of a dna sequence
            mutation_probability: Chances of each gen to be mutated

        Returns: Binary string

        """
        mutated_dna = ''

        # Each bit flips independently with the given probability
        for gen in dna:
            if random() < mutation_probability:
                if gen == '1':
                    mutated_dna += '0'
                else:
                    mutated_dna += '1'
            else:
                mutated_dna += gen
        return mutated_dna

    #
    # Stats concerns
    #
    def _is_solution(self) -> None:
        """
        Method to manage AES for the given RUN

        """
        # Keep counting evaluations until the first solution of the run;
        # afterwards the AES counter is frozen
        if self.stats.solution_found is False:
            self.stats.sum_aes(1)
            if self.fitness_value >= self.config.success_threshold:
                LOG.debug('Solution found for this run!')
                self.stats.solution_found = True
287 |
--------------------------------------------------------------------------------
/PatternOmatic/ge/population.py:
--------------------------------------------------------------------------------
1 | """ Evolutionary Population related classes module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import random
22 | from typing import List, Tuple, Dict
23 | from spacy.tokens import Doc
24 |
25 | from PatternOmatic.ge.individual import Individual
26 | from PatternOmatic.ge.stats import Stats
27 | from PatternOmatic.settings.config import Config
28 | from PatternOmatic.settings.literals import SelectionType, ReplacementType
29 | from PatternOmatic.settings.log import LOG
30 |
31 |
class Selection(object):
    """ Dispatches the proper selection type for population instances """
    __slots__ = '_select'

    def __init__(self, selection_type: SelectionType):
        """
        Selection constructor: binds the concrete selection operator
        Args:
            selection_type: SelectionType Enum
        """
        self.__dispatch_selection(selection_type)

    def __call__(self, generation: List[Individual]) -> List[Individual]:
        """
        Performs a selection operation for the population
        Args:
            generation: A list of Individual instances

        Returns: A list of Individual instances

        """
        LOG.debug('Selecting individuals...')
        return self._select(generation)

    def __dispatch_selection(self, selection_type: SelectionType) -> None:
        """
        Sets the type of the selection operation for the current evolution
        Args:
            selection_type: SelectionType Enum

        Returns: None

        """
        # Anything other than a valid K_TOURNAMENT request falls back to
        # binary tournament (the default operator)
        if isinstance(selection_type, SelectionType) and selection_type == SelectionType.K_TOURNAMENT:
            self._select = self._k_tournament
        else:
            self._select = self._binary_tournament

    @staticmethod
    def _binary_tournament(generation: List[Individual]) -> List[Individual]:
        """
        Selects members of the current generation into the mating pool in order to produce offspring by comparing pairs
        of Individuals and adding the best of each pair to the "mating pool" until its filled

        Args:
            generation: A list of Individual instances

        Returns: A list of Individual instances (len(generation) + 1 members)

        """
        if len(generation) < 2:
            # With fewer than two individuals no distinct pair can be drawn:
            # the inner re-sampling loop below would never terminate (and an
            # empty generation would crash randint). Degenerate but safe
            # fallback that preserves the pool-size invariant (len + 1).
            return list(generation) * 2

        mating_pool = []

        while len(mating_pool) <= len(generation):
            # Draw two distinct random contenders
            first = random.randint(0, len(generation) - 1)
            second = first
            while second == first:
                second = random.randint(0, len(generation) - 1)

            contender_1 = generation[first]
            contender_2 = generation[second]

            # The fitter contender of the pair joins the mating pool
            if contender_1.fitness_value >= contender_2.fitness_value:
                mating_pool.append(contender_1)
            else:
                mating_pool.append(contender_2)

        return mating_pool

    @staticmethod
    def _k_tournament(generation: List[Individual]) -> List[Individual]:
        """
        Not implemented
        Args:
            generation: A list of Individual instances

        Raises: NotImplementedError
        Returns: A list of Individual instances

        """
        # TODO(me): k tournament
        raise NotImplementedError
112 |
113 |
class Recombination(object):
    """ Dispatches the proper recombination type for population instances """
    __slots__ = ('_recombine', 'config', 'grammar', 'samples', 'stats')

    def __init__(self, grammar: Dict, samples: List[Doc], stats: Stats):
        """
        Recombination constructor: stores the context needed to build children
        and binds the concrete recombination operator
        Args:
            grammar: Backus Naur Form grammar notation encoded in a dictionary
            samples: List of Spacy Doc objects
            stats: Stats object related with this run
        """
        self._recombine = None
        self.config = Config()
        self.grammar = grammar
        self.samples = samples
        self.stats = stats
        self.__dispatch_recombination_type()

    def __call__(self, mating_pool: List[Individual], generation: List[Individual]) -> List[Individual]:
        """
        Performs a recombination operation for the population
        Args:
            mating_pool: A list of Individual instances (selected parents)
            generation: A list of Individual instances (current generation)

        Returns: A list of Individual instances (the offspring)

        """
        LOG.debug(f'Combining individuals...')
        return self._recombine(mating_pool, generation)

    def __dispatch_recombination_type(self) -> None:
        """
        Sets the type of the recombination operation for the current evolution
        (only one operator is currently available)

        Returns: None

        """
        self._recombine = self._random_one_point_crossover

    def _random_one_point_crossover(
            self, mating_pool: List[Individual], generation: List[Individual]) -> List[Individual]:
        """
        For each randomly drawn pair of parents, recombines them to produce two
        offspring and adds both to the offspring list
        Args:
            mating_pool: A list of Individual instances
            generation: A list of Individual instances

        Returns: A list of Individual instances

        """
        offspring = []
        offspring_max_size = round(len(generation) * self.config.offspring_max_size_factor)

        # NOTE(review): when the mating-probability check fails the loop just
        # retries with new parents; with mating_probability == 0 this would
        # never terminate. Also two children are appended per iteration, so
        # the offspring can exceed offspring_max_size by one — confirm.
        while len(offspring) <= offspring_max_size:
            parent_1 = random.choice(mating_pool)
            parent_2 = random.choice(mating_pool)

            if random.random() < self.config.mating_probability:
                # NOTE(review): this only produces cut points at multiples of
                # num_codons_per_individual and caps the factor at
                # codon_length - 1 — confirm against the intended dna layout
                # (dna_length = codon_length * num_codons_per_individual?).
                cut = random.randint(1, self.config.codon_length - 1) * self.config.num_codons_per_individual

                # Create children
                child_1 = Individual(self.samples, self.grammar, self.stats,
                                     dna=parent_1.bin_genotype[:cut] + parent_2.bin_genotype[
                                         -(self.config.dna_length - cut):])

                child_2 = Individual(self.samples, self.grammar, self.stats,
                                     dna=parent_2.bin_genotype[:cut] + parent_1.bin_genotype[
                                         -(self.config.dna_length - cut):])

                offspring.append(child_1)
                offspring.append(child_2)

        return offspring
173 |
174 |
class Replacement(object):
    """ Dispatches the proper replacement type for population instances """
    __slots__ = '_replace'

    def __init__(self, replacement_type: ReplacementType):
        """
        Replacement constructor: binds the concrete replacement operator
        Args:
            replacement_type: ReplacementType Enum
        """
        self.__dispatch_replacement_type(replacement_type)

    def __call__(self, generation: List[Individual], offspring: List[Individual]) \
            -> Tuple[List[Individual], List[Individual]]:
        """
        Performs a replacement operation for the population
        Args:
            generation: A list of Individual instances
            offspring: A list of Individual instances

        Returns: A tuple with the next generation and the emptied offspring

        """
        LOG.debug('Replacing individuals...')
        return self._replace(generation, offspring)

    def __dispatch_replacement_type(self, replacement_type: ReplacementType) -> None:
        """
        Sets the type of the replacement operation for the current evolution
        Args:
            replacement_type: ReplacementType Enum

        Returns: None

        """
        if isinstance(replacement_type, ReplacementType):
            if replacement_type == ReplacementType.MU_LAMBDA_WITH_ELITISM:
                self._replace = self._mu_lambda_elite
            elif replacement_type == ReplacementType.MU_LAMBDA_WITHOUT_ELITISM:
                self._replace = self._mu_lambda_no_elite
            else:
                self._replace = self._mu_plus_lambda
        else:
            # Invalid requests fall back to the default (mu plus lambda)
            self._replace = self._mu_plus_lambda

    @staticmethod
    def _mu_plus_lambda(generation: List[Individual], offspring: List[Individual]) \
            -> Tuple[List[Individual], List[Individual]]:
        """
        Produces the next generation combining the current generation with the offspring
        Args:
            generation: A list of Individual instances
            offspring: A list of Individual instances

        Returns: A tuple containing two list of Individual instances

        """
        # Merge both pools and keep the fittest len(generation) individuals
        replacement_pool = generation + offspring
        replacement_pool.sort(key=lambda i: i.fitness_value, reverse=True)
        generation = replacement_pool[:len(generation)]
        offspring = []

        return generation, offspring

    @staticmethod
    def _mu_lambda_elite(generation: List[Individual], offspring: List[Individual]) \
            -> Tuple[List[Individual], List[Individual]]:
        """
        Produces the next generation using the offspring and the best Individual of the current generation
        Args:
            generation: A list of Individual instances
            offspring: A list of Individual instances

        Returns: A tuple containing two list of Individual instances

        """
        generation.sort(key=lambda i: i.fitness_value, reverse=True)
        offspring.sort(key=lambda i: i.fitness_value, reverse=True)
        # Keep the elite at index 0 and fill the remaining len(generation) - 1
        # slots with the fittest offspring. The previous slice took
        # len(generation) offspring for those len(generation) - 1 slots, which
        # grew the population by one individual on every replacement.
        generation[1:len(generation)] = offspring[0:len(generation) - 1]
        offspring = []

        return generation, offspring

    @staticmethod
    def _mu_lambda_no_elite(generation: List[Individual], offspring: List[Individual]) \
            -> Tuple[List[Individual], List[Individual]]:
        """
        Produces the next generation totally replacing the current generation with the offspring
        Args:
            generation: A list of Individual instances
            offspring: A list of Individual instances

        Returns: A tuple containing two list of Individual instances

        """
        offspring.sort(key=lambda i: i.fitness_value, reverse=True)
        generation = offspring[0:len(generation)]
        offspring = []

        return generation, offspring
261 |
262 |
class Population(object):
    """ Population implementation of an AI Grammatical Evolution algorithm in OOP fashion """
    __slots__ = ('config', 'samples', 'grammar', 'stats', 'generation', 'offspring', 'best_individual',
                 'selection', 'recombination', 'replacement')

    def __init__(self, samples: [Doc], grammar: dict, stats: Stats):
        """
        Population constructor, initializes a list of Individual objects
        Args:
            samples: list of Spacy doc objects
            grammar: Backus Naur Form grammar notation encoded in a dictionary
            stats: Stats object related with this run
        """
        self.config = Config()

        self.samples = samples
        self.grammar = grammar
        self.stats = stats
        self.generation = self._genesis()
        self.offspring = list()
        self.best_individual = None

        # Evolutionary operators, dispatched from the current configuration
        self.selection = Selection(self.config.selection_type)
        self.recombination = Recombination(grammar, samples, stats)
        self.replacement = Replacement(self.config.replacement_type)

    #
    # Population specific methods
    #
    def _genesis(self) -> List[Individual]:
        """
        Initializes the first generation
        Returns: A list of individual objects

        """
        # NOTE(review): the population size is taken from config.dna_length
        # (the genotype length); presumably a dedicated population-size
        # setting was intended — confirm.
        return [Individual(self.samples, self.grammar, self.stats) for _ in range(0, self.config.dna_length)]

    def _best_challenge(self) -> None:
        """
        Compares current generation best fitness individual against previous generation best fitness individual.
        Updates the best individual attribute accordingly
        """
        # generation[0] is the current best: replacement operators leave the
        # generation sorted by fitness in descending order
        if self.best_individual is not None:
            if self.generation[0].fitness_value > self.best_individual.fitness_value:
                self.best_individual = self.generation[0]
        else:
            self.best_individual = self.generation[0]

    #
    # Evolution
    #
    def evolve(self):
        """
        Search Engine:
        1) Selects individuals of the current generation to constitute who will mate
        2) Crossover or recombination of the previously selected individuals
        3) Replace/mix the this generation with the offspring
        4) Save the best individual by fitness
        5) Calculate statistics for this Run
        """

        LOG.info('Evolution taking place, please wait...')

        self.stats.reset()

        for _ in range(self.config.max_generations):
            mating_pool = self.selection(self.generation)
            self.offspring = self.recombination(mating_pool, self.generation)
            self.generation, self.offspring = self.replacement(self.generation, self.offspring)
            self._best_challenge()

        LOG.info(f'Best candidate found on this run: {self.best_individual}')

        # Stats concerns
        self.stats.add_most_fitted(self.best_individual)
        self.stats.add_mbf(self.best_individual.fitness_value)

        # NOTE(review): strict > here, while Individual._is_solution uses >=
        # against the same threshold — confirm the intended boundary.
        if self.best_individual.fitness_value > self.config.success_threshold:
            self.stats.add_sr(True)
        else:
            self.stats.add_sr(False)
343 |
--------------------------------------------------------------------------------
/PatternOmatic/ge/stats.py:
--------------------------------------------------------------------------------
1 | """ Grammatical Evolution performance metrics module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import operator
22 | from time import time
23 |
24 | from PatternOmatic.settings.literals import ReportFormat
25 | from PatternOmatic.settings.config import Config
26 |
27 |
28 | class Stats(object):
29 | """ Class responsible of handling performance metrics """
30 | __slots__ = [
31 | 'config',
32 | 'success_rate_accumulator',
33 | 'mbf_accumulator',
34 | 'aes_accumulator',
35 | 'time_accumulator',
36 | 'most_fitted_accumulator',
37 | 'solution_found',
38 | 'success_rate',
39 | 'mbf',
40 | 'aes',
41 | 'mean_time',
42 | 'aes_counter'
43 | ]
44 |
    def __init__(self):
        """ Stats instances constructor """
        self.config = Config()
        # Per-run metric accumulators: one entry appended per RUN
        self.success_rate_accumulator = list()
        self.mbf_accumulator = list()
        self.aes_accumulator = list()
        self.time_accumulator = list()
        self.most_fitted_accumulator = list()
        # Raised by Individual when a run reaches the success threshold
        self.solution_found = False
        # Aggregated metrics, filled in by calculate_metrics()
        self.success_rate = None
        self.mbf = None
        self.aes = None
        self.mean_time = None

        # Evaluations-to-Solution counter for the current run (see sum_aes / reset)
        self.aes_counter = 0
60 |
    @property
    def __dict__(self):
        """ Dictionary representation for a slotted class (that has no dict at all) """
        # Above works just for POPOs
        # Expose only the aggregated metrics plus the overall best individual
        stats_dict = \
            {s: getattr(self, s, None) for s in self.__slots__ if s in ('success_rate', 'mbf', 'aes', 'mean_time')}

        # get_most_fitted() is defined elsewhere in this class (not visible in
        # this chunk); presumably it returns the best Individual found so far,
        # or None when nothing was accumulated — TODO confirm.
        most_fitted = self.get_most_fitted()
        most_fitted_dict = {'most_fitted': most_fitted.__dict__} if most_fitted is not None else {'most_fitted': None}
        stats_dict.update(most_fitted_dict)

        return stats_dict
73 |
    def __repr__(self):
        """ String representation of a slotted class using hijacked dict """
        # Renders only the curated fields exposed by the __dict__ property
        return f'{self.__class__.__name__}({self.__dict__})'
77 |
    def __iter__(self):
        """ Enable dict(self) """
        # Yields (name, value) pairs from the curated __dict__ property
        yield from self.__dict__.items()
81 |
82 | #
83 | # Accumulators & Counters
84 | #
85 | def add_sr(self, sr: bool) -> None:
86 | """
87 | Adds a new Success Rate value to the accumulator
88 | Args:
89 | sr: Boolean value that indicates if the RUN succeeded (True) or not (False)
90 |
91 | """
92 | self.success_rate_accumulator.append(sr)
93 |
94 | def add_mbf(self, bf: float) -> None:
95 | """
96 | Adds a new Best Fitness value to the accumulator
97 | Args:
98 | bf: Best fitness fount over a RUN
99 |
100 | """
101 | self.mbf_accumulator.append(bf)
102 |
103 | def add_aes(self, es: int) -> None:
104 | """
105 | Adds a new Evaluations to Solution value to the accumulator
106 | Args:
107 | es: Number of evaluations to solution over a RUN
108 |
109 | """
110 | self.aes_accumulator.append(es)
111 |
112 | def add_time(self, time_interval: float) -> None:
113 | """
114 | Adds a new Time lapsed value to the accumulator
115 | Args:
116 | time_interval: Time lapsed of a RUN
117 |
118 | """
119 | self.time_accumulator.append(time_interval)
120 |
121 | def add_most_fitted(self, individual: any) -> None:
122 | """
123 | Adds a new individual to the accumulator
124 | Args:
125 | individual: Individual with best fitness found over a RUN
126 |
127 | Returns:
128 |
129 | """
130 | self.most_fitted_accumulator.append(individual)
131 |
132 | def sum_aes(self, es: int) -> None:
133 | """
134 | Sums a new Evaluations to Solution value to the counter
135 | Args:
136 | es: Number of evaluations to Solution of a given Run
137 |
138 | Returns:
139 |
140 | """
141 | self.aes_counter += es
142 |
143 | #
144 | # Metrics
145 | #
146 | def reset(self):
147 | """ Resets variables that depend on the run """
148 | self.aes_counter = 0
149 | self.solution_found = False
150 |
151 | def calculate_metrics(self):
152 | """ Calculates the common GE evaluation metrics """
153 | self.add_aes(self.aes_counter)
154 | self.success_rate = Stats.avg(self.success_rate_accumulator)
155 | self.mbf = Stats.avg(self.mbf_accumulator)
156 | self.aes = Stats.avg(self.aes_accumulator)
157 | self.mean_time = Stats.avg(self.time_accumulator)
158 |
159 | #
160 | # Auxiliary methods
161 | #
162 | def get_most_fitted(self):
163 | """
164 | Best individual found
165 | Returns: Individual with Best Fitness found for this Execution
166 |
167 | """
168 | return max(self.most_fitted_accumulator, key=operator.attrgetter('fitness_value')) \
169 | if len(self.most_fitted_accumulator) > 0 else None
170 |
171 | @staticmethod
172 | def avg(al: list) -> float:
173 | """
174 | Returns the mean of a list if the list is not empty
175 | Args:
176 | al: List instance
177 |
178 | Returns: float, the mean/average of the list
179 |
180 | """
181 | return sum(al) / len(al) if len(al) > 0 else 0.0
182 |
183 | def persist(self) -> None:
184 | """
185 | Makes or append execution result to file. If no valid format is specified CSV will be used as default
186 | Returns: None
187 |
188 | """
189 | if self.config.report_format == ReportFormat.JSON:
190 | with open(self.config.report_path, mode='a+') as f:
191 | f.writelines(f'{dict(self)}' + '\n')
192 | else:
193 | with open(self.config.report_path, mode='a+') as f:
194 | f.writelines(self._to_csv() + '\n')
195 |
196 | def _to_csv(self):
197 | """
198 | Generates Comma Separated Value (csv) representation of a Stats instance object
199 | Returns: String, csv instance representation
200 |
201 | """
202 | csv = f'{time()}' + '\t'
203 |
204 | for k, v in self.__dict__.items():
205 | if not type(v) is dict:
206 | csv = csv + str(v) + '\t'
207 | else:
208 | for _, vi in v.items():
209 | csv = csv + str(vi) + '\t'
210 | return csv
211 |
--------------------------------------------------------------------------------
/PatternOmatic/nlp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/revuel/PatternOmatic/8f95c1c134a14419a11b8cb192144857b40d0b3c/PatternOmatic/nlp/__init__.py
--------------------------------------------------------------------------------
/PatternOmatic/nlp/bnf.py:
--------------------------------------------------------------------------------
1 | """ Backus Naur Form Grammar Generator module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | from inspect import getmembers
22 | from spacy.tokens import Doc, Token
23 | from PatternOmatic.settings.config import Config
24 | from PatternOmatic.settings.literals import S, P, T, F, OP, NEGATION, ZERO_OR_ONE, ZERO_OR_MORE, ONE_OR_MORE, LENGTH, \
25 | XPS, IN, NOT_IN, EQQ, GEQ, LEQ, GTH, LTH, TOKEN_WILDCARD, UNDERSCORE, EF, ORTH, TEXT, LOWER, POS, TAG, DEP, LEMMA, \
26 | SHAPE, ENT_TYPE, IS_ALPHA, IS_ASCII, IS_DIGIT, IS_BRACKET, IS_LOWER, IS_PUNCT, IS_QUOTE, IS_SPACE, IS_TITLE, \
27 | IS_OOV, IS_UPPER, IS_STOP, IS_CURRENCY, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, LIKE_NUM, LIKE_EMAIL, \
28 | LANG, NORM, PREFIX, SENTIMENT, STRING, SUFFIX, TEXT_WITH_WS, WHITESPACE, LIKE_URL, MATCHER_SUPPORTED_ATTRIBUTES, \
29 | ENT_ID, ENT_IOB, ENT_KB_ID, HAS_VECTOR
30 | from PatternOmatic.settings.log import LOG
31 |
32 |
33 | #
34 | # Dynamic Grammar (Backus Naur Form) Generator
35 | #
def dynamic_generator(samples: [Doc]) -> dict:
    """
    Builds a grammar in Backus Naur Form (BNF) notation out of the Spacy NLP
    Linguistic Feature values observed in the given sample list of Doc instances
    Args:
        samples: List of Spacy Doc objects

    Returns: Backus Naur Form grammar notation encoded in a dictionary

    """
    cfg = Config()

    LOG.info(f'Generating BNF based on the following samples: {str(samples)}')

    # Root production
    grammar = {S: [P]}

    # Inspect the samples: token count interval plus observed (extended) features
    max_tokens, min_tokens, token_features, extended_features = _features_seen(samples)

    # Pattern production: from the shortest to the longest sample, in tokens
    grammar[P] = _symbol_stacker(T, max_tokens, min_tokens)

    # Token production: up to the configured number of features per token
    grammar[T] = _symbol_stacker(F, _get_features_per_token(token_features))

    if cfg.use_token_wildcard is True:
        grammar[T].append(TOKEN_WILDCARD)

    feature_keys = list(token_features.keys())

    # Grammar operators and extended pattern syntax are mutually exclusive
    if cfg.use_grammar_operators is True and cfg.use_extended_pattern_syntax is False:
        grammar = _add_grammar_operators(grammar, feature_keys)
    elif cfg.use_extended_pattern_syntax is True and cfg.use_grammar_operators is False:
        grammar = _add_extended_pattern_syntax(grammar, feature_keys, token_features)
    else:
        grammar[F] = feature_keys

    # Terminal productions: one per feature key, holding every observed value
    for feature_key, feature_values in token_features.items():
        if cfg.use_extended_pattern_syntax is True:
            feature_values.append(XPS)
        grammar[feature_key] = feature_values

    if cfg.use_custom_attributes is True:
        grammar = _add_custom_attributes(grammar, extended_features)

    LOG.info(f'Dynamically generated BNF: {str(grammar)}')

    return grammar
86 |
87 |
88 | #
89 | # BNF Utilities
90 | #
def _features_seen(samples: [Doc]) -> (int, int, dict, dict):
    """
    Builds up a dictionary containing Spacy Linguistic Feature Keys and their respective seen values for the sample
    Args:
        samples: List of Spacy Doc objects

    Returns: A 4-tuple with the max number of tokens in a sample, the min number of tokens in a
        sample, the dict of observed features and the dict of observed extended (custom) features

    """
    config = Config()

    # For boolean features both options are always offered
    bool_list = [True, False]

    # Set token extensions (the first token serves as template for registration)
    if config.use_custom_attributes is True:
        _set_token_extension_attributes(samples[0][0])
        extended_features = _extended_features_seen([token for sample in samples for token in sample])
    else:
        extended_features = {UNDERSCORE: {}}

    # Observed values per feature
    orth_list = []
    text_list = []
    lower_list = []
    length_list = []
    shape_list = []
    pos_list = []
    tag_list = []
    dep_list = []
    lemma_list = []
    ent_type_list = []

    for sample in samples:
        for token in sample:
            orth_list.append(token.orth_)
            text_list.append(token.text)
            lower_list.append(token.lower_)
            length_list.append(len(token))
            pos_list.append(token.pos_)
            tag_list.append(token.tag_)
            dep_list.append(token.dep_)
            lemma_list.append(token.lemma_)
            shape_list.append(token.shape_)
            ent_type_list.append(token.ent_type_)

    # Token count interval across samples; an empty sample list keeps the historical
    # sentinel values (0 for max, a huge int for min) the callers already expect
    sample_lengths = [len(sample) for sample in samples]
    max_doc_length = max(sample_lengths, default=0)
    min_doc_length = min(sample_lengths, default=999999999)

    feature_lists = {ORTH: orth_list,
                     TEXT: text_list,
                     LOWER: lower_list,
                     LENGTH: length_list,
                     POS: pos_list,
                     TAG: tag_list,
                     DEP: dep_list,
                     LEMMA: lemma_list,
                     SHAPE: shape_list,
                     ENT_TYPE: ent_type_list}

    if config.use_uniques is True:
        # Deduplicate observations, sorted for deterministic grammars
        features = {k: sorted(list(set(v))) for k, v in feature_lists.items()}
    else:
        features = feature_lists

    # Add boolean features
    if config.use_boolean_features is True:
        features.update({
            IS_ALPHA: bool_list,
            IS_ASCII: bool_list,
            IS_DIGIT: bool_list,
            IS_LOWER: bool_list,
            IS_UPPER: bool_list,
            IS_TITLE: bool_list,
            IS_PUNCT: bool_list,
            IS_SPACE: bool_list,
            IS_STOP: bool_list,
            LIKE_NUM: bool_list,
            LIKE_URL: bool_list,
            LIKE_EMAIL: bool_list
        })

    # Drop all features whose only observation is the empty string
    features = _feature_pruner(features)
    extended_features[UNDERSCORE] = _feature_pruner(extended_features[UNDERSCORE])

    return max_doc_length, min_doc_length, features, extended_features
197 |
198 |
def _set_token_extension_attributes(token: Token) -> None:
    """
    Given a Spacy Token instance, register all the Spacy token attributes not accepted by the Spacy Matcher
    as custom attributes inside the Token Extensions (token._. space)
    Args:
        token: Spacy Token instance used as template for the registration

    Returns: None

    """
    # Retrieve cleaned up Token Attributes
    token_attributes = _clean_token_attributes(
        {k: v for k, v in getmembers(token) if type(v) in (str, bool, float)})

    # Register one custom attribute per remaining token attribute. The getter
    # early-binds token and attribute name through default arguments, so no
    # intermediate list of lambdas is needed
    for k in token_attributes:
        token.set_extension(str('custom_' + k).upper(), getter=lambda token_=token, k_=k: getattr(token_, k_))
217 |
218 |
def _clean_token_attributes(token_attributes: dict) -> dict:
    """
    Drops, in place, the '__doc__' entry plus every key the Spacy Matcher already
    supports natively from the given token attributes dict
    Args:
        token_attributes: dict of token features

    Returns: Token attributes dict without Spacy Matcher's supported attribute keys

    """
    unwanted_keys = ('__doc__', *MATCHER_SUPPORTED_ATTRIBUTES)
    for unwanted in unwanted_keys:
        token_attributes.pop(unwanted)

    return token_attributes
233 |
234 |
def _extended_features_seen(tokens: [Token]) -> dict:
    """
    Builds up a dictionary containing Spacy Linguistic Feature Keys and their respective seen values for the
    input token list extended attributes (those attributes not accepted by the Spacy Matcher by default,
    included as token extensions)
    Args:
        tokens: List of Spacy Token instances

    Returns: dict of features

    """
    bool_list = [True, False]

    def distinct(attribute_name):
        # Unique, sorted observations of one custom token extension attribute
        return sorted(list(set([getattr(getattr(token, '_'), attribute_name) for token in tokens])))

    extended_features = \
        {
            UNDERSCORE: {
                ENT_ID: distinct('CUSTOM_ENT_ID_'),
                ENT_IOB: distinct('CUSTOM_ENT_IOB_'),
                ENT_KB_ID: distinct('CUSTOM_ENT_KB_ID_'),
                HAS_VECTOR: bool_list,
                IS_BRACKET: bool_list,
                IS_CURRENCY: bool_list,
                IS_LEFT_PUNCT: bool_list,
                IS_OOV: bool_list,
                IS_QUOTE: bool_list,
                IS_RIGHT_PUNCT: bool_list,
                LANG: distinct('CUSTOM_LANG_'),
                NORM: distinct('CUSTOM_NORM_'),
                PREFIX: distinct('CUSTOM_PREFIX_'),
                SENTIMENT: distinct('CUSTOM_SENTIMENT'),
                STRING: distinct('CUSTOM_STRING'),
                SUFFIX: distinct('CUSTOM_SUFFIX_'),
                TEXT_WITH_WS: distinct('CUSTOM_TEXT_WITH_WS'),
                WHITESPACE: distinct('CUSTOM_WHITESPACE_')
            }
        }

    return extended_features
278 |
279 |
280 | def _feature_pruner(features: dict) -> dict:
281 | """
282 | Prunes dict keys whose values contain a list of repeated items
283 | Args:
284 | features: dict
285 |
286 | Returns: pruned dict
287 |
288 | """
289 | # Drop all observations equal to empty string
290 | to_del_list = list()
291 | for k in features.keys():
292 | if len(features[k]) == 1 and features[k][0] == '':
293 | to_del_list.append(k)
294 |
295 | for k_item in to_del_list:
296 | features.pop(k_item)
297 |
298 | return features
299 |
300 |
301 | def _symbol_stacker(symbol: str, max_length: int, min_length: int = 1) -> list:
302 | """
303 | Given a symbol creates a list of length max_length where each item is symbol concat previous list item
304 | Args:
305 | symbol: string
306 | max_length: list max length
307 |
308 | Returns: list of symbol
309 |
310 | """
311 | symbol_times_list = list()
312 | last = ''
313 |
314 | for _ in range(max_length):
315 | if last == '':
316 | last = symbol
317 | else:
318 | last = last + "," + symbol
319 |
320 | symbol_times_list.append(last)
321 |
322 | if 1 < min_length <= max_length:
323 | symbol_times_list = symbol_times_list[min_length-1:]
324 |
325 | return symbol_times_list
326 |
327 |
def _get_features_per_token(features_dict: dict) -> int:
    """
    Given the configuration set up, determine the maximum number of features per token at grammar
    Args:
        features_dict: dictionary of features keys with all possible feature value options

    Returns: integer

    """
    configured = Config().features_per_token
    available = len(features_dict.keys())

    # A non-positive setting means "use every available feature"
    if configured <= 0:
        return available

    # Never promise more features than the grammar actually offers
    return min(available, configured)
348 |
349 |
def _add_grammar_operators(pattern_grammar: dict, list_of_features: list) -> dict:
    """
    Adds support to Spacy's grammar operators usage
    Args:
        pattern_grammar: BNF dict
        list_of_features: list of token features

    Returns: Backus Naur Form grammar notation encoded in a dictionary with Spacy's grammar operators

    """
    # Every feature is offered both bare and paired with an operator slot
    features_with_op = []
    for feature in list_of_features:
        features_with_op.extend((feature, feature + ',' + OP))

    pattern_grammar[F] = features_with_op
    pattern_grammar[OP] = [NEGATION, ZERO_OR_ONE, ONE_OR_MORE, ZERO_OR_MORE]
    return pattern_grammar
367 |
368 |
def _add_extended_pattern_syntax(pattern_grammar: dict, list_of_features: list, features_dict: dict) -> dict:
    """
    Adds support to the extended pattern syntax at BNF dicts
    Args:
        pattern_grammar: BNF dict
        list_of_features: list of token features
        features_dict: dict of token features

    Returns:
        dict: Backus Naur Form grammar notation encoded in a dictionary with Spacy's extended pattern syntax
    """
    length_options = features_dict[LENGTH].copy()
    terminal_stack = _all_feature_terminal_list(features_dict)

    pattern_grammar[F] = list_of_features
    pattern_grammar[XPS] = [IN, NOT_IN, EQQ, GEQ, LEQ, GTH, LTH]

    # Set operators draw from the full terminal stack
    for set_operator in (IN, NOT_IN):
        pattern_grammar[set_operator] = terminal_stack

    # Rich comparison operators only make sense against token lengths
    for comparison_operator in (EQQ, GEQ, LEQ, GTH, LTH):
        pattern_grammar[comparison_operator] = length_options

    return pattern_grammar
393 |
394 |
395 | def _all_feature_terminal_list(features_dict: dict) -> list:
396 | """
397 | Stacks all feature terminal options in a list of lists to be used for the extended pattern syntax set operators
398 | Args:
399 | features_dict: dictionary of feature keys with all possible feature value options
400 |
401 | Returns:
402 |
403 | """
404 | all_terminal_list = list()
405 |
406 | for item in list(features_dict.items()):
407 | current_terminal_holder = list()
408 |
409 | for terminal_list_item in item[1]:
410 | if len(current_terminal_holder) > 0:
411 | temp_list = list(current_terminal_holder[-1])
412 | temp_list.append(terminal_list_item)
413 | current_terminal_holder.append(temp_list)
414 | else:
415 | current_terminal_holder.append([terminal_list_item])
416 |
417 | all_terminal_list += current_terminal_holder
418 |
419 | all_terminal_list = [ele for ind, ele in enumerate(all_terminal_list) if ele not in all_terminal_list[:ind]]
420 | return all_terminal_list
421 |
422 |
def _add_custom_attributes(pattern_grammar: dict, extended_features: dict) -> dict:
    """
    Adds support to a specific set of custom attributes at BNF dict
    Args:
        pattern_grammar: BNF dict
        extended_features: dict of token features not supported by default by the Spacy's Matcher

    Returns: Backus Naur Form grammar notation encoded in a dictionary with Spacy's custom attributes

    """
    underscore_features = extended_features[UNDERSCORE]

    # The underscore production stacks extended features just like <T> stacks <F>
    pattern_grammar[UNDERSCORE] = _symbol_stacker(EF, _get_features_per_token(underscore_features))
    pattern_grammar[EF] = list(underscore_features.keys())
    pattern_grammar.update(underscore_features.items())

    # A token may now hold custom attributes, alone or next to regular features
    pattern_grammar[T].append(UNDERSCORE)
    pattern_grammar[T].append(F + ',' + UNDERSCORE)

    return pattern_grammar
439 |
--------------------------------------------------------------------------------
/PatternOmatic/settings/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/revuel/PatternOmatic/8f95c1c134a14419a11b8cb192144857b40d0b3c/PatternOmatic/settings/__init__.py
--------------------------------------------------------------------------------
/PatternOmatic/settings/config.py:
--------------------------------------------------------------------------------
1 | """ Configuration Management module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | from __future__ import annotations
22 | import configparser
23 | from typing import Optional
24 | from PatternOmatic.settings.log import LOG
25 | from PatternOmatic.settings.literals import GE, MAX_RUNS, SUCCESS_THRESHOLD, POPULATION_SIZE, MAX_GENERATIONS, \
26 | CODON_LENGTH, CODONS_X_INDIVIDUAL, MUTATION_PROBABILITY, OFFSPRING_FACTOR, MATING_PROBABILITY, K_VALUE, \
27 | SELECTION_TYPE, REPLACEMENT_TYPE, RECOMBINATION_TYPE, RecombinationType, ReplacementType, SelectionType, \
28 | FitnessType, FITNESS_FUNCTION_TYPE, \
29 | DGG, FEATURES_X_TOKEN, USE_BOOLEAN_FEATURES, USE_CUSTOM_ATTRIBUTES, USE_UNIQUES, \
30 | USE_GRAMMAR_OPERATORS, USE_TOKEN_WILDCARD, USE_EXTENDED_PATTERN_SYNTAX, REPORT_PATH, IO, ReportFormat, REPORT_FORMAT
31 |
32 |
class SingletonMetaNaive(type):
    """ The Naive Singleton Design Pattern of type Metaclass builder """

    # Optional[X] already means "X or None"; the former Optional[Config, None]
    # spelling is invalid typing syntax (only tolerated because annotations are lazy here)
    _instance: Optional[Config] = None

    def __call__(cls, config_file_path: str = None) -> Config:
        """
        Returns the single Config instance, creating it on first use
        Args:
            config_file_path: Path for a configuration file (only honoured on the first call)

        Returns: the singleton Config instance

        """
        if cls._instance is None:
            LOG.debug('Creating config object!')
            cls._instance = super().__call__(config_file_path)
        return cls._instance

    def clear_instance(cls):
        """ For testing purposes, destroy Singleton instance """
        LOG.debug('Removing config object!')
        cls._instance = None
        del cls._instance
49 |
50 |
class Config(metaclass=SingletonMetaNaive):
    """ Singleton Configuration package's Class"""
    # Slotted: no instance __dict__, so only the attributes listed here can exist
    __slots__ = (
        'max_runs',
        'success_threshold',
        'population_size',
        'max_generations',
        'codon_length',
        'num_codons_per_individual',
        'dna_length',
        'mutation_probability',
        'offspring_max_size_factor',
        'mating_probability',
        'k_value',
        'selection_type',
        'recombination_type',
        'replacement_type',
        'fitness_function_type',
        'features_per_token',
        'use_boolean_features',
        'use_custom_attributes',
        'use_uniques',
        'use_grammar_operators',
        'use_token_wildcard',
        'use_extended_pattern_syntax',
        'report_path',
        'report_format',
        'file_path'
    )

    def __init__(self, config_file_path: str = None):
        """
        Config object constructor

        Parses the given INI-style file (via configparser) and falls back to a
        built-in default for every missing or invalid option.
        Args:
            config_file_path: Path for a configuration file
        """
        config_parser = configparser.ConfigParser()

        if config_file_path is None:
            LOG.warning(f'Configuration file not provided. Falling back to default values')
            self.file_path = None
        else:
            # configparser.read returns the list of files it successfully parsed
            file_list = config_parser.read(config_file_path)
            if len(file_list) == 0:
                LOG.warning(f'File {config_file_path} not found. Falling back to default values')
                self.file_path = None
            else:
                self.file_path = config_file_path

        #
        # GE configuration parameters
        #
        self.max_runs = self._validate_config_argument(GE, MAX_RUNS, 4, config_parser)
        self.success_threshold = self._validate_config_argument(GE, SUCCESS_THRESHOLD, 0.8, config_parser)
        self.population_size = self._validate_config_argument(GE, POPULATION_SIZE, 10, config_parser)
        self.max_generations = self._validate_config_argument(GE, MAX_GENERATIONS, 3, config_parser)
        self.codon_length = self._validate_config_argument(GE, CODON_LENGTH, 8, config_parser)
        self.num_codons_per_individual = self._validate_config_argument(GE, CODONS_X_INDIVIDUAL, 4, config_parser)
        # Derived value, not read from the file
        self.dna_length = self.codon_length * self.num_codons_per_individual
        self.mutation_probability = self._validate_config_argument(GE, MUTATION_PROBABILITY, 0.5, config_parser)
        self.offspring_max_size_factor = self._validate_config_argument(GE, OFFSPRING_FACTOR, 3.5, config_parser)
        self.mating_probability = self._validate_config_argument(GE, MATING_PROBABILITY, 0.9, config_parser)
        self.k_value = self._validate_config_argument(GE, K_VALUE, 3, config_parser)

        #
        # GE configuration methods (integer codes mapped onto their enum types)
        #
        self.selection_type = SelectionType(
            self._validate_config_argument(GE, SELECTION_TYPE, 0, config_parser))

        self.recombination_type = RecombinationType(
            self._validate_config_argument(GE, RECOMBINATION_TYPE, 0, config_parser))

        self.replacement_type = ReplacementType(
            self._validate_config_argument(GE, REPLACEMENT_TYPE, 0, config_parser))

        self.fitness_function_type = FitnessType(
            self._validate_config_argument(GE, FITNESS_FUNCTION_TYPE, 1, config_parser))

        #
        # BNF Grammar Generation configuration options
        #
        self.features_per_token = self._validate_config_argument(DGG, FEATURES_X_TOKEN, 1, config_parser)
        self.use_boolean_features = self._validate_config_argument(DGG, USE_BOOLEAN_FEATURES, False, config_parser)
        self.use_custom_attributes = self._validate_config_argument(DGG, USE_CUSTOM_ATTRIBUTES, False, config_parser)
        self.use_uniques = self._validate_config_argument(DGG, USE_UNIQUES, True, config_parser)
        self.use_grammar_operators = self._validate_config_argument(DGG, USE_GRAMMAR_OPERATORS, False, config_parser)
        self.use_token_wildcard = self._validate_config_argument(DGG, USE_TOKEN_WILDCARD, False, config_parser)
        self.use_extended_pattern_syntax = \
            self._validate_config_argument(DGG, USE_EXTENDED_PATTERN_SYNTAX, False, config_parser)

        #
        # Configuration validation
        # NOTE: runs before the IO options below; it only needs the two flags set above
        #
        self._check_xps_op_restriction()

        #
        # IO
        #
        self.report_path = \
            self._validate_config_argument(IO, REPORT_PATH, '/tmp/patternomatic_report.txt', config_parser)

        self.report_format = ReportFormat(self._validate_config_argument(IO, REPORT_FORMAT, 0, config_parser))

        LOG.info(f'Configuration instance: {self}')

    def __setattr__(self, key, value) -> None:
        """
        Overrides method to be used with slots

        First-time assignments (made during __init__) are stored directly; later
        updates are only accepted when the new value preserves the property's type.
        Args:
            key: An object slotted property
            value: An intended value for the object key

        Returns: None

        """
        if hasattr(self, key):
            if self._preserve_property_type(getattr(self, key), value):
                super(Config, self).__setattr__(key, value)
                LOG.info(f'Updating configuration parameter {key.upper()} with value {value}')
                # Re-check the XPS/operators incompatibility whenever either flag changes
                if key == USE_EXTENDED_PATTERN_SYNTAX.lower() or key == USE_GRAMMAR_OPERATORS.lower():
                    self._check_xps_op_restriction()
            else:
                LOG.warning(f'Invalid data type {type(value)} for property {key}. Skipping update')
        else:
            # Attribute not set yet: initial assignment, no validation applies
            super(Config, self).__setattr__(key, value)

    @property
    def __dict__(self):
        """ Hijacks dictionary for this config slotted class """
        return {s: getattr(self, s, None) for s in self.__slots__}

    def __repr__(self):
        """ Representation of config instance """
        return f'{self.__class__.__name__}({self.__dict__})'

    #
    # Utilities
    #
    @staticmethod
    def _validate_config_argument(section, option, default, config_parser):
        """
        Reads one option from the parsed configuration, coercing it to the type of
        the supplied default and falling back to that default on any parsing error
        Args:
            section: Configuration section name the option lives in
            option: Option (key) name to read
            default: Fallback value; its type selects the configparser getter used
            config_parser: configparser.ConfigParser with the file already read

        Returns: The configured value coerced to type(default), or default

        """
        try:
            if isinstance(default, bool):
                value = config_parser.getboolean(section, option, fallback=default)
            elif isinstance(default, int):
                value = config_parser.getint(section, option, fallback=default)
            elif isinstance(default, float):
                value = config_parser.getfloat(section, option, fallback=default)
            elif isinstance(default, str):
                value = config_parser.get(section, option, fallback=default)
            else:
                value = default
        except ValueError:
            LOG.warning(f'[{section}][{option}] configuration parameter wrongly set. '
                        f'Falling back to its default value: {default}')
            value = default

        LOG.debug(f'[{section}][{option}] {value}')
        return value

    @staticmethod
    def _preserve_property_type(_property, value):
        # True when value's type matches the current property's type
        # NOTE(review): bools would pass an int property's check (isinstance(True, int)) — confirm acceptable
        return isinstance(value, type(_property))

    #
    # Problem specific restrictions
    #
    def _check_xps_op_restriction(self) -> None:
        """
        Spacy's Grammar Operators and Quantifiers and the Spacy's Extended Pattern Syntax can not be used together at
        the same time in a pattern for the Spacy's Rule Based Matcher.

        This method checks the provided configuration and disables the Spacy's Extended Pattern Syntax if both
        mechanisms are found enabled at the provided configuration.

        Returns: None

        """
        # hasattr guards calls made while __init__ is still assigning attributes
        if hasattr(self, USE_EXTENDED_PATTERN_SYNTAX.lower()) and hasattr(self, USE_GRAMMAR_OPERATORS.lower()) and \
                self.use_extended_pattern_syntax is True and self.use_grammar_operators is True:
            LOG.warning(f'Extended Pattern Syntax is not compatible with the usage of Grammar Operators. '
                        f'Extended Pattern Syntax has been disabled!')
            self.use_extended_pattern_syntax = False
245 |
--------------------------------------------------------------------------------
/PatternOmatic/settings/literals.py:
--------------------------------------------------------------------------------
1 | """ Literals/constants module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | from enum import Enum, unique
22 |
23 |
24 | #
25 | # GE Related literals
26 | #
@unique
class SelectionType(Enum):
    """ Evolutionary selection types """
    # Integer values so they can be built straight from config file ints (see Config)
    BINARY_TOURNAMENT = 0
    K_TOURNAMENT = 1

    def __repr__(self):
        """ Human readable: just the member name """
        return self.name
36 |
37 |
@unique
class RecombinationType(Enum):
    """ Evolutionary recombination types enum """
    # Integer value so it can be built straight from config file ints (see Config)
    RANDOM_ONE_POINT_CROSSOVER = 0

    def __repr__(self):
        """ Human readable: just the member name """
        return self.name
46 |
47 |
@unique
class ReplacementType(Enum):
    """ Evolutionary replacement types enum """
    # Integer values so they can be built straight from config file ints (see Config)
    MU_PLUS_LAMBDA = 0
    MU_LAMBDA_WITH_ELITISM = 1
    MU_LAMBDA_WITHOUT_ELITISM = 2

    def __repr__(self):
        """ Human readable: just the member name """
        return self.name
58 |
59 |
60 | # Fitness types
@unique
class FitnessType(Enum):
    """ Enumerates the supported fitness function flavours """

    BASIC = 0
    FULL_MATCH = 1

    def __repr__(self) -> str:
        """ Render the member as its bare name for human-friendly output """
        return self.name
70 |
71 |
72 | #
73 | # Dynamic grammar generation related literals
74 | #
75 | # Symbol delimiters
SLD = '<'
SRD = '>'
# Grammar non-terminal symbols
S = f'{SLD}S{SRD}'
P = f'{SLD}P{SRD}'
T = f'{SLD}T{SRD}'
F = f'{SLD}F{SRD}'
# Feature symbols (base)
ORTH = f'{SLD}ORTH{SRD}'
TEXT = f'{SLD}TEXT{SRD}'
LOWER = f'{SLD}LOWER{SRD}'
LENGTH = f'{SLD}LENGTH{SRD}'
POS = f'{SLD}POS{SRD}'
TAG = f'{SLD}TAG{SRD}'
DEP = f'{SLD}DEP{SRD}'
LEMMA = f'{SLD}LEMMA{SRD}'
SHAPE = f'{SLD}SHAPE{SRD}'
ENT_TYPE = f'{SLD}ENT_TYPE{SRD}'
# Feature symbols (base boolean)
IS_ALPHA = f'{SLD}IS_ALPHA{SRD}'
IS_ASCII = f'{SLD}IS_ASCII{SRD}'
IS_DIGIT = f'{SLD}IS_DIGIT{SRD}'
IS_LOWER = f'{SLD}IS_LOWER{SRD}'
IS_UPPER = f'{SLD}IS_UPPER{SRD}'
IS_TITLE = f'{SLD}IS_TITLE{SRD}'
IS_PUNCT = f'{SLD}IS_PUNCT{SRD}'
IS_SPACE = f'{SLD}IS_SPACE{SRD}'
IS_STOP = f'{SLD}IS_STOP{SRD}'
LIKE_NUM = f'{SLD}LIKE_NUM{SRD}'
LIKE_URL = f'{SLD}LIKE_URL{SRD}'
LIKE_EMAIL = f'{SLD}LIKE_EMAIL{SRD}'
# Grammar operator and quantifier symbols
OP = f'{SLD}OP{SRD}'
NEGATION = '!'
ZERO_OR_ONE = '?'
ONE_OR_MORE = '+'
ZERO_OR_MORE = '*'
# Token wildcard
TOKEN_WILDCARD = '{}'
# Grammar extended pattern syntax
XPS = f'{SLD}XPS{SRD}'
IN = f'{SLD}IN{SRD}'
NOT_IN = f'{SLD}NOT_IN{SRD}'
EQQ = f'{SLD}EQQ{SRD}'
GEQ = f'{SLD}GEQ{SRD}'
LEQ = f'{SLD}LEQ{SRD}'
GTH = f'{SLD}GTH{SRD}'
LTH = f'{SLD}LTH{SRD}'
# Maps each extended-syntax symbol to its spaCy comparison operator
XPS_AS = {EQQ: '==', GEQ: '>=', LEQ: '<=', GTH: '>', LTH: '<'}
# Grammar custom attributes extension symbol
UNDERSCORE = f'{SLD}UNDERSCORE{SRD}'
EF = f'{SLD}EF{SRD}'
ENT_ID = f'{SLD}CUSTOM_ENT_ID_{SRD}'
ENT_IOB = f'{SLD}CUSTOM_ENT_IOB_{SRD}'
ENT_KB_ID = f'{SLD}CUSTOM_ENT_KB_ID_{SRD}'
HAS_VECTOR = f'{SLD}CUSTOM_HAS_VECTOR{SRD}'
IS_BRACKET = f'{SLD}CUSTOM_IS_BRACKET{SRD}'
IS_CURRENCY = f'{SLD}CUSTOM_IS_CURRENCY{SRD}'
IS_LEFT_PUNCT = f'{SLD}CUSTOM_IS_LEFT_PUNCT{SRD}'
IS_OOV = f'{SLD}CUSTOM_IS_OOV{SRD}'
IS_QUOTE = f'{SLD}CUSTOM_IS_QUOTE{SRD}'
IS_RIGHT_PUNCT = f'{SLD}CUSTOM_IS_RIGHT_PUNCT{SRD}'
IS_SENT_START = f'{SLD}CUSTOM_IS_SENT_START{SRD}'
LANG = f'{SLD}CUSTOM_LANG_{SRD}'
NORM = f'{SLD}CUSTOM_NORM_{SRD}'
PREFIX = f'{SLD}CUSTOM_PREFIX_{SRD}'
PROB = f'{SLD}CUSTOM_PROB{SRD}'
SENT_START = f'{SLD}CUSTOM_SENT_START{SRD}'
SENTIMENT = f'{SLD}CUSTOM_SENTIMENT{SRD}'
STRING = f'{SLD}CUSTOM_STRING{SRD}'
SUFFIX = f'{SLD}CUSTOM_SUFFIX_{SRD}'
TEXT_WITH_WS = f'{SLD}CUSTOM_TEXT_WITH_WS{SRD}'
WHITESPACE = f'{SLD}CUSTOM_WHITESPACE_{SRD}'
# Matcher's util: token attributes the spaCy Matcher natively understands
MATCHER_SUPPORTED_ATTRIBUTES = (
    'orth_',
    'text',
    'lower_',
    'pos_',
    'tag_',
    'dep_',
    'lemma_',
    'shape_',
    'ent_type_',
    'is_alpha',
    'is_ascii',
    'is_digit',
    'is_lower',
    'is_upper',
    'is_title',
    'is_punct',
    'is_space',
    'is_stop',
    'like_num',
    'like_url',
    'like_email')

#
# Config ini literals (section and option names)
#
GE = 'GE'
MAX_RUNS = 'MAX_RUNS'
SUCCESS_THRESHOLD = 'SUCCESS_THRESHOLD'
POPULATION_SIZE = 'POPULATION_SIZE'
MAX_GENERATIONS = 'MAX_GENERATIONS'
CODON_LENGTH = 'CODON_LENGTH'
CODONS_X_INDIVIDUAL = 'CODONS_X_INDIVIDUAL'
MUTATION_PROBABILITY = 'MUTATION_PROBABILITY'
OFFSPRING_FACTOR = 'OFFSPRING_FACTOR'
MATING_PROBABILITY = 'MATING_PROBABILITY'
K_VALUE = 'K_VALUE'
SELECTION_TYPE = 'SELECTION_TYPE'
RECOMBINATION_TYPE = 'RECOMBINATION_TYPE'
REPLACEMENT_TYPE = 'REPLACEMENT_TYPE'
FITNESS_FUNCTION_TYPE = 'FITNESS_FUNCTION_TYPE'
DGG = 'DGG'
FEATURES_X_TOKEN = 'FEATURES_X_TOKEN'
USE_BOOLEAN_FEATURES = 'USE_BOOLEAN_FEATURES'
USE_UNIQUES = 'USE_UNIQUES'
USE_GRAMMAR_OPERATORS = 'USE_GRAMMAR_OPERATORS'
USE_TOKEN_WILDCARD = 'USE_TOKEN_WILDCARD'
USE_EXTENDED_PATTERN_SYNTAX = 'USE_EXTENDED_PATTERN_SYNTAX'
USE_CUSTOM_ATTRIBUTES = 'USE_CUSTOM_ATTRIBUTES'
IO = 'IO'
REPORT_PATH = 'REPORT_PATH'
REPORT_FORMAT = 'REPORT_FORMAT'
202 |
203 |
@unique
class ReportFormat(Enum):
    """ Enumerates the supported execution report output formats """

    JSON = 0
    CSV = 1

    def __repr__(self) -> str:
        """ Render the member as its bare name for human-friendly output """
        return self.name
213 |
--------------------------------------------------------------------------------
/PatternOmatic/settings/log.py:
--------------------------------------------------------------------------------
1 | """ Logging module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import logging
22 | import sys
23 | import tempfile
24 | from logging.handlers import TimedRotatingFileHandler
25 |
# Single shared record layout: level, timestamp, call site, message
FORMATTER = logging.Formatter(
    '[%(levelname)s] %(asctime)s %(filename)s:%(funcName)s:%(lineno)d : %(message)s')

# Log file is placed in the platform's temporary directory
LOG_FILE = tempfile.gettempdir() + '/patternomatic.log'
30 |
31 |
def _get_console_handler():
    """
    Build a console logging handler bound to standard output

    Returns: logging.StreamHandler using the module-wide FORMATTER

    """
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(FORMATTER)
    return handler
41 |
42 |
def _get_file_handler():
    """
    Build a file logging handler that rotates the log file at midnight

    Returns: TimedRotatingFileHandler writing to LOG_FILE with the module-wide FORMATTER

    """
    handler = TimedRotatingFileHandler(LOG_FILE, when='midnight')
    handler.setFormatter(FORMATTER)
    return handler
52 |
53 |
def get_logger(logger_name):
    """
    Create a logger wired with the console and rotating-file handlers
    Args:
        logger_name: Name of the logger

    Returns: logger

    """
    configured_logger = logging.getLogger(logger_name)
    configured_logger.setLevel(logging.INFO)
    for handler in (_get_console_handler(), _get_file_handler()):
        configured_logger.addHandler(handler)
    # Stop records from bubbling to ancestor loggers (avoids duplicate output)
    configured_logger.propagate = False
    return configured_logger
69 |
70 |
71 | LOG = get_logger('PatternOmatic')
72 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # PatternOmatic 0.2.*
4 |
5 | **\#AI · \#EvolutionaryComputation · \#NLP**
6 |
7 | [](https://spacy.io)
8 | [](https://www.gnu.org/licenses/lgpl-3.0)
9 | [](https://travis-ci.org/revuel/PatternOmatic)
10 | [](https://sonarcloud.io/dashboard?id=revuel_PatternOmatic)
11 | [](https://sonarcloud.io/dashboard?id=revuel_PatternOmatic)
12 | [](https://sonarcloud.io/dashboard?id=revuel_PatternOmatic)
13 | [](#)
14 | [](https://libraries.io/pypi/PatternOmatic/sourcerank)
15 | [](https://pypistats.org/packages/PatternOmatic)
16 | [](https://badge.fury.io/py/PatternOmatic)
17 |
18 | _Discover spaCy's linguistic patterns matching a given set of string samples_
19 |
20 | ## Requirements
21 | - [Python 3.7.3](https://www.python.org/downloads/release/python-373/)
22 | - [Spacy 2.3.*](https://spacy.io/usage/v2-3)
23 |
24 | ## Basic usage
25 |
26 | ### From sources
27 | *[Clone SCM official repository](https://github.com/revuel/PatternOmatic)*
28 |
29 | `git clone git@github.com:revuel/PatternOmatic.git`
30 |
31 | *Play with Makefile*
32 |
33 | - `make venv` to activate project's [Virtual Environment*](https://docs.python.org/3.7/library/venv.html)
34 | - `make libs` to install dependencies
35 | - `make test` to run Unit Tests
36 | - `make coverage` to run Code Coverage
37 | - `make run` to run PatternOmatic's script with example parameters
38 |
\* you must have one first
40 |
41 | ### From package
42 | *Install package*
43 |
44 | `pip install PatternOmatic`
45 |
46 | *Play with the CLI*
47 |
48 | ```
49 | # Show help
50 | patternomatic.py -h
51 |
52 | # Usage example 1: Basic
53 | patternomatic.py -s Hello world -s Goodbye world
54 |
55 | # Usage example 2: Using a different language
56 | python -m spacy download es_core_news_sm
57 | patternomatic.py -s Me llamo Miguel -s Se llama PatternOmatic -l es_core_news_sm
58 | ```
59 |
60 | *Play with the library*
61 | ```
62 | """
63 | PatternOmatic library client example.
64 | Find linguistic patterns to be used by the spaCy Rule Based Matcher
65 |
66 | """
67 | from PatternOmatic.api import find_patterns, Config
68 |
69 | if __name__ == '__main__':
70 |
71 | my_samples = ['I am a cat!', 'You are a dog!', 'She is an owl!']
72 |
73 | # Optionally, let it evolve a little bit more!
74 | config = Config()
75 | config.max_generations = 150
76 | config.max_runs = 3
77 |
78 | patterns_found, _ = find_patterns(my_samples)
79 |
80 | print(f'Patterns found: {patterns_found}')
81 |
82 | ```
83 | ---
84 |
85 | ## Features
86 |
87 | ### Generic
88 |
89 | ✅ No OS dependencies, no storage or database required!
90 |
✅ Lightweight package with just a few direct pip dependencies
92 | - [spaCy](https://pypi.org/project/spacy/2.3.2/)
93 | - [spaCy's en_core_web_sm Language Model](https://github.com/explosion/spacy-models/releases/tag/en_core_web_sm-2.3.0)
94 |
95 | ✅ Easy and highly configurable to boost clever searches
96 |
97 | ✅ Includes basic logging mechanism
98 |
99 | ✅ Includes basic reporting, JSON and CSV format supported. Report file path is configurable
100 |
101 | ✅ Configuration file example provided (config.ini)
102 |
103 | ✅ Default configuration is run if no configuration file provided
104 |
105 | ✅ Provides rollback actions against several possible misconfiguration scenarios
106 |
107 | ### Evolutionary
108 |
109 | ✅ Basic Evolutionary (Grammatical Evolution) parameters available and configurable
110 |
111 | ✅ Supports two different Evolutionary Fitness functions
112 |
113 | ✅ Supports Binary Tournament Evolutionary Selection Type
114 |
115 | ✅ Supports Random One Point Crossover Evolutionary Recombination Type
116 |
117 | ✅ Supports "µ + λ" Evolutionary Replacement Type
118 |
119 | ✅ Supports "µ ∪ λ" with elitism Evolutionary Replacement Type
120 |
121 | ✅ Supports "µ ∪ λ" without elitism Evolutionary Replacement Type
122 |
123 | ✅ Typical evolutionary performance metrics included:
124 | - Success Rate (SR)
125 | - Mean Best Fitness (MBF)
126 | - Average Evaluations to Solution (AES)
127 |
128 | ### Linguistic
129 |
130 | ✅ [Compatible with any spaCy Language Model](https://spacy.io/usage/models#languages)
131 |
132 | ✅ [Supports all spaCy's Rule Based Matcher standard Token attributes](https://spacy.io/usage/rule-based-matching#adding-patterns-attributes)
133 |
134 | ✅ [Supports the following spaCy's Rule Based Matcher non standard Token attributes](https://spacy.io/api/token#attributes) [(via underscore)](https://spacy.io/usage/processing-pipelines#custom-components-attributes)
135 | - ent_id
136 | - ent_iob
137 | - ent_kb_id
138 | - has_vector
139 | - is_bracket
140 | - is_currency
141 | - is_left_punct
142 | - is_oov
143 | - is_quote
144 | - is_right_punct
145 | - lang
146 | - norm
147 | - prefix
148 | - sentiment
149 | - string
150 | - suffix
151 | - text_with_ws
152 | - whitespace
153 |
154 | ✅ Supports skipping boolean Token attributes
155 |
156 | ✅ [Supports spaCy's Rule Based Matcher Extended Pattern Syntax](https://spacy.io/usage/rule-based-matching#adding-patterns-attributes-extended)
157 |
158 | ✅ [Supports spaCy's Rule Based Matcher Grammar Operators and Quantifiers](https://spacy.io/usage/rule-based-matching#quantifiers)
159 |
160 | ✅ [Supports Token Wildcard](https://spacy.io/usage/rule-based-matching#adding-patterns-wildcard)
161 |
162 | ✅ Supports defining the number of attributes per token within searched patterns
163 |
164 | ✅ Supports usage of non repeated token attribute values
165 |
166 | ---
167 |
Author: [Miguel Revuelta Espinosa _(revuel)_](mailto:revuel22@hotmail.com "Contact author"), a humble AI enthusiast
169 |
--------------------------------------------------------------------------------
/config.ini:
--------------------------------------------------------------------------------
1 | # - Configuration File example for "PatternOmatic"
2 | #
3 | # Grammatical Evolution (GE) parameters
4 | #
5 | [GE]
6 | # Number of runs per execution. This is the amount of new populations to spawn per execution. Integer greater than 0
7 | # Integer within interval [3, *)
8 | MAX_RUNS = 4
9 |
10 | # Minimum fitness value found in an execution to consider this a successful execution.
11 | # Float within interval [0.0, 1.0]
12 | SUCCESS_THRESHOLD = 0.8
13 |
14 | # Number of individuals per population
15 | # Integer within interval [4, *)
16 | POPULATION_SIZE = 100
17 |
18 | # Maximum number of generations per population in a run.
19 | # Integer within interval [1, *)
20 | MAX_GENERATIONS = 20
21 |
22 | # Number of gen per codon
23 | # Integer within interval [1, 16]
24 | CODON_LENGTH = 8
25 |
26 | # Number of codons per individual dna
27 | # Integer within the set (4, 8, 16)
28 | CODONS_X_INDIVIDUAL = 4
29 |
# Mutation probability. Chance of mutating a gen at an individual's birth, applied to every gen
31 | # Float within interval [0.0, 1.0]
32 | MUTATION_PROBABILITY = 0.5
33 |
34 | # Growth factor while generating offspring.
35 | # This factor appears in the literature. Do not edit this value.
36 | OFFSPRING_FACTOR = 3.5
37 |
38 | # Chances to produce offspring per individuals selection
39 | # Float within interval [0.0, 1.0]
40 | MATING_PROBABILITY = 0.9
41 |
# Number of individuals to compete when K_TOURNAMENT is the selection mode
43 | # Integer within interval [3, *)
44 | K_VALUE = 3
45 |
46 | # Selection type:
47 | # 0 = BINARY_TOURNAMENT
48 | # 1 = K_TOURNAMENT
49 | SELECTION_TYPE = 0
50 |
51 | # Recombination type:
52 | # 0 = RANDOM_ONE_POINT_CROSSOVER
53 | RECOMBINATION_TYPE = 0
54 |
55 | # Replacement type:
56 | # 0 = MU_PLUS_LAMBDA
57 | # 1 = MU_LAMBDA_WITH_ELITISM
58 | # 2 = MU_LAMBDA_WITHOUT_ELITISM
59 | REPLACEMENT_TYPE = 0
60 |
61 | # Fitness function type:
62 | # 0 = BASIC
63 | # 1 = FULL_MATCH
64 | FITNESS_FUNCTION_TYPE = 1
65 |
66 | #
67 | # Dynamic Grammar Generation (DGG) parameters
68 | #
69 | [DGG]
70 | # Features per token:
71 | # 0 or < 0 = unlimited
72 | # 1 or more until the maximum number of features = that number of features per token
73 | # > maximum number of features per token = the maximum number of features per token
74 | # 1 is the recommended value here
75 | FEATURES_X_TOKEN = 1
76 |
77 | # Use uniques:
78 | # True = Do not repeat features per production rule
79 | # False = Features can be repeated per production rule
80 | USE_UNIQUES = True
81 |
82 | # Use boolean features:
83 | # True = Enable the usage of Spacy's boolean token features (not recommended)
84 | # False = Disable the usage of Spacy's boolean token features (recommended)
# These features show a highly positive correlation, which means they are not useful for finding patterns
86 | USE_BOOLEAN_FEATURES = False
87 |
88 | # Use Grammar Operators:
89 | # True = Enable patterns with Spacy's Grammar Operators
90 | # False = Disable patterns with Spacy's Grammar Operators
91 | # Grammar Operators and Extended Pattern Syntax can not be enabled together
92 | USE_GRAMMAR_OPERATORS = False
93 |
94 | # Use Token Wildcard:
95 | # True = Enable patterns with Token Wildcard
96 | # False = Disable patterns with Token Wildcard
97 | USE_TOKEN_WILDCARD = False
98 |
99 | # Use Extended Pattern Syntax:
100 | # True = Enable patterns with Spacy's Extended Pattern Syntax
101 | # False = Disable patterns with Spacy's Extended Pattern Syntax
102 | # Grammar Operators and Extended Pattern Syntax can not be enabled together
103 | USE_EXTENDED_PATTERN_SYNTAX = False
104 |
105 | # Use Custom Features:
106 | # True = Enable patterns with underscore, where all the token's attributes not accepted by the Matcher are included
107 | # False = Disable patterns with underscore, where all the token's attributes not accepted by the Matcher are included
108 | USE_CUSTOM_ATTRIBUTES = False
109 |
110 | #
111 | # Operating System (OS) configuration options
112 | #
113 | [IO]
114 | # Valid OS path and filename to persist execution report
115 | REPORT_PATH = /tmp/patternOmatic_report.txt
116 |
117 | # Report format
118 | # 0 = json format
119 | # 1 = csv format
120 | REPORT_FORMAT = 0
121 |
--------------------------------------------------------------------------------
/patternomatic_logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
4 |
225 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools==40.8.0
2 | pip==20.2.3
3 | coverage==5.0.3
4 | wheel==0.33.6
5 | importlib-metadata==2.0.0
6 | twine==3.2.0
7 | spacy==2.3.*
8 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.0/en_core_web_sm-2.3.0.tar.gz#egg=en_core_web_sm
9 |
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/revuel/PatternOmatic/8f95c1c134a14419a11b8cb192144857b40d0b3c/scripts/__init__.py
--------------------------------------------------------------------------------
/scripts/patternomatic.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | """ Command Line Interface module
3 |
4 | This file is part of PatternOmatic.
5 |
6 | Copyright © 2020 Miguel Revuelta Espinosa
7 |
8 | PatternOmatic is free software: you can redistribute it and/or
9 | modify it under the terms of the GNU Lesser General Public License
10 | as published by the Free Software Foundation, either version 3 of
11 | the License, or (at your option) any later version.
12 |
13 | PatternOmatic is distributed in the hope that it will be useful,
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | GNU Lesser General Public License for more details.
17 |
18 | You should have received a copy of the GNU Lesser General Public License
19 | along with PatternOmatic. If not, see .
20 |
21 | """
22 | import sys
23 | from typing import List
24 | from argparse import ArgumentParser
25 | from PatternOmatic.api import find_patterns
26 | from PatternOmatic.settings.log import LOG
27 |
28 |
def main(args: List) -> None:
    """
    PatternOmatic's script main function wrapper
    Args:
        args: Command Line Input Arguments

    Returns: None

    """
    LOG.info('Parsing command line arguments...')
    try:
        parser = ArgumentParser(
            description="Finds the Spacy's Matcher pattern for the given samples",
            epilog='...using actual Artificial Intelligence',
        )

        # Samples: the flag may be repeated, each sample given as words
        parser.add_argument(
            '-s',
            '--sample',
            action='append',
            required=True,
            nargs='+',
            type=str,
            help='A sample phrase',
        )

        # spaCy language model to load
        parser.add_argument(
            '-l',
            '--language',
            nargs='?',
            type=str,
            default='en_core_web_sm',
            help='Spacy language model to be used',
        )

        # Optional configuration file
        parser.add_argument(
            '-c',
            '--config',
            nargs='?',
            type=str,
            default=None,
            help='Configuration file path to be used',
        )

        parsed_args = parser.parse_args(args)

        # Each sample arrives as a list of words; fold it back into one phrase
        parsed_args.sample = [' '.join(words) for words in parsed_args.sample]

        #
        # Find patterns
        #
        patterns_found, _ = find_patterns(
            parsed_args.sample,
            configuration=parsed_args.config,
            spacy_language_model_name=parsed_args.language)

        LOG.info(f'Patterns found: {patterns_found}')

    except Exception as ex:
        LOG.critical(f'Fatal error: {repr(ex)}')
        raise ex
96 |
97 |
98 | #
99 | # OS INPUT
100 | #
if __name__ == '__main__':
    main(sys.argv[1:])
103 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """ Setup tools (build distribution) module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import setuptools
22 |
# README contains non-ASCII characters (emoji, Greek letters), so the
# encoding must be explicit: relying on the platform default breaks the
# build on systems where it is not UTF-8 (e.g. Windows cp1252).
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="PatternOmatic",
    version="0.2.3",
    author="Miguel Revuelta Espinosa",
    author_email="revuel22@hotmail.com",
    description="AI/NLP (Spacy) Rule Based Matcher pattern finder",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/revuel/PatternOmatic",
    packages=setuptools.find_packages(),
    scripts=['scripts/patternomatic.py'],
    install_requires=[
        'spacy==2.3.0'
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.7',
)
47 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/revuel/PatternOmatic/8f95c1c134a14419a11b8cb192144857b40d0b3c/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_api.py:
--------------------------------------------------------------------------------
1 | """ Unit testing file for API module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import os
22 | import spacy
23 | from unittest import TestCase, mock
24 | from PatternOmatic.api import find_patterns
25 | from PatternOmatic.settings.config import Config
26 | from PatternOmatic.settings.log import LOG
27 |
28 |
class Test(TestCase):
    """ Unit tests for the public API entry point (find_patterns) """

    my_samples = ['Hello world!', 'Goodbye world!']

    def test_find_patterns_when_only_samples_provided(self):
        """ Tests that providing just samples makes the find_pattern keeps working """
        patterns, _ = find_patterns(self.my_samples)
        # Default configuration performs 4 runs, so 4 patterns are expected
        self.assertEqual(4, len(patterns))

    def test_find_patterns_when_valid_configuration_file_provided(self):
        """ Checks that providing a valid configuration file path loads configuration from that file """

        config_file_path = \
            os.path.join(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), 'config.ini')
        _ = find_patterns(self.my_samples, configuration=config_file_path)
        self.assertEqual(config_file_path, Config().file_path)

    def test_find_patterns_when_config_instance_provided(self):
        """ Checks when setting up a Config instance before find_patterns invocation works """
        config = Config()
        config.max_runs = 10
        patterns, _ = find_patterns(self.my_samples)
        self.assertEqual(10, len(patterns))

    def test_find_patterns_when_bad_language_provided(self):
        """ Checks that providing an imaginary language model makes find_patterns use en_core_web_sm """
        with self.assertLogs(LOG) as cm:
            bad_model = 'Something'
            _ = find_patterns(self.my_samples, spacy_language_model_name=bad_model)
            self.assertEqual(f'WARNING:PatternOmatic:Model {bad_model} not found, falling back to '
                             f'patternOmatic\'s default language model: en_core_web_sm', cm.output[1])

    def test_installs_en_core_web_sm_if_not_found(self):
        """ Due to questionable PyPI security policies, check en_core_web_sm installation is fired if not present """
        nlp = spacy.load('en_core_web_sm')

        with mock.patch('PatternOmatic.api.pkg_resources.working_set') as patch_working_set:
            with mock.patch('PatternOmatic.api.spacy_download') as patch_spacy_download:
                with mock.patch('PatternOmatic.api.spacy_load') as patch_spacy_load:
                    patch_working_set.return_value = []
                    patch_spacy_download.return_value = 'I\'ve been fired'
                    patch_spacy_load.return_value = nlp
                    find_patterns(['Hi'])
                    self.assertTrue(patch_spacy_download.called)

    def tearDown(self) -> None:
        """ Destroy Config instance """
        Config.clear_instance()
77 |
--------------------------------------------------------------------------------
/tests/test_bnf.py:
--------------------------------------------------------------------------------
1 | """ Unit testing file for BNF module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import unittest
22 | import spacy
23 | from spacy.tokens.doc import Underscore
24 |
25 | import PatternOmatic.nlp.bnf as bnf
26 | from PatternOmatic.settings.literals import S, P, T, F, OP, NEGATION, ZERO_OR_ONE, ZERO_OR_MORE, ONE_OR_MORE, XPS, IN,\
27 | NOT_IN, EQQ, GEQ, LEQ, GTH, LTH, TOKEN_WILDCARD, UNDERSCORE, ORTH, TEXT, LOWER, POS, TAG, DEP, LEMMA, SHAPE, \
28 | IS_ASCII, IS_UPPER, HAS_VECTOR
29 | from PatternOmatic.settings.config import Config
30 |
31 |
32 | class TestDG(unittest.TestCase):
33 | """ Test class for Dynamic Grammar """
34 |
35 | nlp = spacy.load('en_core_web_sm')
36 | samples = [nlp(u'This is a test.'), nlp(u'Checks for Backus Naur Form grammars')]
37 | config = None
38 |
39 | def test_basic_grammar_dg(self):
40 | """ Tests that basic grammar is correctly generated """
41 | grammar = bnf.dynamic_generator(self.samples)
42 |
43 | super().assertIn(P, grammar.keys())
44 | super().assertIn(S, grammar.keys())
45 | super().assertIn(T, grammar.keys())
46 | super().assertIn(F, grammar.keys())
47 | super().assertEqual(len(grammar[SHAPE]), 7)
48 | super().assertEqual(len(grammar[F]), 9)
49 |
50 | def test_basic_grammar_without_uniques_dg(self):
51 | """ Tests that basic grammar is correctly generated when use uniques is false """
52 | self.config.use_uniques = False
53 | grammar = bnf.dynamic_generator(self.samples)
54 |
55 | super().assertEqual(len(grammar[SHAPE]), 11)
56 |
57 | def test_basic_grammar_with_booleans_dg(self):
58 | """ Tests that basic grammar with booleans is correctly generated """
59 | self.config.use_boolean_features = True
60 | grammar = bnf.dynamic_generator(self.samples)
61 |
62 | super().assertIn(IS_ASCII, grammar.keys())
63 | super().assertIn(IS_UPPER, grammar.keys())
64 |
65 | def test_basic_grammar_with_booleans_and_operators_dg(self):
66 | """ Tests that basic grammar with boolean features and operators is correctly generated """
67 | self.config.use_boolean_features = True
68 | self.config.use_grammar_operators = True
69 |
70 | grammar = bnf.dynamic_generator(self.samples)
71 |
72 | super().assertIn(IS_ASCII, grammar.keys())
73 | super().assertIn(IS_UPPER, grammar.keys())
74 | super().assertIn(OP, grammar.keys())
75 | super().assertListEqual(grammar[OP], [NEGATION, ZERO_OR_ONE, ONE_OR_MORE, ZERO_OR_MORE])
76 |
77 | def test_basic_grammar_with_booleans_and_extended_pattern_syntax_dg(self):
78 | """ Tests that basic grammar with boolean features and extended pattern syntax is correctly generated """
79 | self.config.use_boolean_features = True
80 | self.config.use_extended_pattern_syntax = True
81 |
82 | grammar = bnf.dynamic_generator(self.samples)
83 |
84 | super().assertIn(IS_ASCII, grammar.keys())
85 | super().assertIn(IS_UPPER, grammar.keys())
86 | super().assertIn(XPS, grammar.keys())
87 | super().assertListEqual(grammar[XPS], [IN, NOT_IN, EQQ, GEQ, LEQ, GTH, LTH])
88 |
89 | def test_basic_grammar_with_booleans_and_custom_attributes_dg(self):
90 | """ Tests that basic grammar with boolean features and custom attributes is correctly generated """
91 | self.config.use_boolean_features = True
92 | self.config.use_custom_attributes = True
93 |
94 | grammar = bnf.dynamic_generator(self.samples)
95 |
96 | super().assertIn(IS_ASCII, grammar.keys())
97 | super().assertIn(IS_UPPER, grammar.keys())
98 | super().assertIn(UNDERSCORE, grammar.keys())
99 | # super().assertIn(IS_SENT_START, grammar.keys())
100 | super().assertIn(HAS_VECTOR, grammar.keys())
101 |
102 | def test_basic_grammar_with_token_wildcard_dg(self):
103 | """ Tests grammar is generated with token wildcard """
104 | self.config.use_token_wildcard = True
105 |
106 | grammar = bnf.dynamic_generator(self.samples)
107 |
108 | super().assertIn(TOKEN_WILDCARD, grammar[T])
109 |
110 | def test_get_features_per_token(self):
111 | """ Tests that the number of features per token is properly set given different configurations """
112 | features_dict = {ORTH: None, TEXT: None, LOWER: None, POS: None, TAG: None, LEMMA: None}
113 | len_features_dict = len(features_dict.keys())
114 |
115 | # When features_per_token is equal or lower to 0, the maximum number of features per token is set
116 | self.config.features_per_token = 0
117 | super().assertEqual(len_features_dict, bnf._get_features_per_token(features_dict))
118 | self.config.features_per_token = -100
119 | super().assertEqual(len_features_dict, bnf._get_features_per_token(features_dict))
120 |
121 | # When features_per_token is greater than the actual features, the maximum number of features per token is set
122 | self.config.features_per_token = 100
123 | super().assertEqual(len_features_dict, bnf._get_features_per_token(features_dict))
124 |
125 | # When features_per_token is inside the range (0, actual features), the config parameter is respected
126 | self.config.features_per_token = 3
127 | super().assertEqual(3, bnf._get_features_per_token(features_dict))
128 |
129 | def test_symbol_stacker(self):
130 | """ Tests that symbols are stacked properly """
131 | expected_1 = [DEP, DEP + ',' + DEP, DEP + ',' + DEP + ',' + DEP]
132 | super().assertListEqual(expected_1, bnf._symbol_stacker(DEP, 3))
133 |
134 | expected_2 = [DEP + ',' + DEP,
135 | DEP + ',' + DEP + ',' + DEP,
136 | DEP + ',' + DEP + ',' + DEP + ',' + DEP]
137 |
138 | super().assertListEqual(expected_2, bnf._symbol_stacker(DEP, 4, 2))
139 |
140 | expected_2.insert(0, DEP)
141 |
142 | super().assertListEqual(expected_2, bnf._symbol_stacker(DEP, 4, 5))
143 |
144 | super().assertListEqual([expected_1[2]], bnf._symbol_stacker(DEP, 3, 3))
145 |
146 | #
147 | # Helpers
148 | #
    def setUp(self) -> None:
        """ Fresh Config instance """
        # Config is a singleton; tearDown clears it, so each test starts
        # from a freshly built configuration
        self.config = Config()
152 |
    def tearDown(self) -> None:
        """ Destroy Config instance, reset Underscore's token extensions """
        # Drop the singleton so the next test's Config() builds a fresh instance
        Config.clear_instance()
        # Custom token extensions registered during a test must not leak into the next one
        Underscore.token_extensions = {}
157 |
158 |
# Allow running this test module directly (e.g. `python tests/test_bnf.py`)
if __name__ == "__main__":
    unittest.main()
161 |
--------------------------------------------------------------------------------
/tests/test_individual.py:
--------------------------------------------------------------------------------
1 | """ Unit testing module for GE Individual module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import unittest
22 | import spacy
23 |
24 | from PatternOmatic.ge.stats import Stats
25 | from PatternOmatic.nlp.bnf import dynamic_generator as dgg
26 | from PatternOmatic.ge.individual import Individual, Fitness
27 | from PatternOmatic.settings.config import Config
28 | from PatternOmatic.settings.literals import FitnessType, S, P, T, F, ORTH, TOKEN_WILDCARD, UNDERSCORE, IS_CURRENCY, \
29 | NOT_IN, ZERO_OR_MORE, OP, GTH, XPS, IN
30 |
31 |
class TestIndividual(unittest.TestCase):
    """ Unit Test class for GE Individual object """
    # Shared, expensive fixtures built once at class-definition time
    config = Config()

    nlp = spacy.load("en_core_web_sm")

    # Sample Docs the grammar is induced from and individuals are evaluated against
    samples = [nlp(u'I am a raccoon!'),
               nlp(u'You are a cat!'),
               nlp(u'Is she a rabbit?'),
               nlp(u'This is a test')]

    # BNF grammar dynamically generated from the samples above
    grammar = dgg(samples)

    stats = Stats()

    def test_init(self):
        """ Test that Individual instantiation works """
        i = Individual(self.samples, self.grammar, self.stats)
        super().assertIs(type(i), Individual)

    def test_init_with_dna(self):
        """ Test that Individual instantiation works when providing dna"""
        i = Individual(self.samples, self.grammar, self.stats, '10101010101010101010101010101010')
        super().assertNotEqual(i, None)

    def test_transcription(self):
        """ Check for transcription idempotency """
        # Mutation disabled so the genotype stays exactly the dna provided
        self.config.mutation_probability = 0.0
        i = Individual(self.samples, self.grammar, self.stats, '11111111')
        # Repeated transcriptions must not change the integer genotype
        i._transcription()
        i._transcription()
        i._transcription()

        super().assertListEqual(i.int_genotype, [127, 1])

    def test_translation(self):
        """ Check for translation idempotency """
        self.config.mutation_probability = 0.0
        i = Individual(self.samples, self.grammar, self.stats, '11111111')
        # Repeated translations must not change the fenotype
        i._translation()
        i._translation()
        i._translation()
        super().assertListEqual(
            i.fenotype, [{'TEXT': 'am'}, {'TEXT': '?'}, {'TEXT': 'am'}, {'TEXT': '?'}, {'TEXT': 'am'}])

    def test_mutation(self):
        """ Checks that mutation works """
        # With probability 1.0 every gene mutates, so the genotype cannot survive intact
        self.config.mutation_probability = 1.0
        i = Individual(self.samples, self.grammar, self.stats, '11111111')
        super().assertNotEqual(i.bin_genotype, '11111111')

    def test_fitness_basic(self):
        """ Fitness "basic" sets fitness """
        self.config.mutation_probability = 0.0
        self.config.fitness_function_type = FitnessType.BASIC
        # This fixed dna is known to match 1 of the 4 samples -> fitness 0.25
        i = Individual(self.samples, self.grammar, self.stats, '01110101100101100110010110010101')

        super().assertEqual(i.fitness_value, 0.25)

    def test_fitness_full_match(self):
        """ Fitness "full match" sets fitness """
        self.config.mutation_probability = 0.0
        self.config.fitness_function_type = FitnessType.FULL_MATCH
        # This fixed dna is known to fully match 1 of the 4 samples -> fitness 0.25
        i = Individual(self.samples, self.grammar, self.stats, '01101010100001101000110111000100')

        super().assertEqual(i.fitness_value, 0.25)

    def test_token_wildcard_penalty(self):
        """ Checks that token wildcard penalty is properly set """
        # When using token wildcard, penalty is applied
        # __new__ bypasses Fitness.__init__ so attributes can be set directly
        f = object.__new__(Fitness)
        f.fenotype = [{}, {}, {}, 'Whatever']
        self.config.use_token_wildcard = True
        f.config = self.config
        super().assertEqual(0.25, f._wildcard_penalty(1.0))

        # When not using token wildcard, penalty is not applied
        self.config.use_token_wildcard = False
        f.fenotype = 1.0
        super().assertEqual(1.0, f._wildcard_penalty(1.0))

    def test_translate(self):
        """ Verifies conversions over the BNF are done correctly """
        # Bare Individual shell: only `grammar` is needed by _translate
        i = object.__new__(Individual)

        # Root
        i.grammar = {S: [P]}
        super().assertEqual('"S":""', i._translate(0, S, S))

        # Pattern root symbol to Token symbol
        i.grammar = {P: [T]}
        super().assertEqual(T, i._translate(0, P, P))

        # Token symbol to Feature symbol inside Token
        i.grammar = {T: [F]}
        super().assertEqual('{}', i._translate(0, T, T))

        # Token symbol to wildcard
        i.grammar = {T: [TOKEN_WILDCARD]}
        super().assertEqual('{}', i._translate(0, T, T))

        # Feature symbol to specific symbol
        i.grammar = {F: [ORTH]}
        super().assertEqual('{}', i._translate(0, F, '{}'))

        # Basic Terminal conversion
        i.grammar = {ORTH: ['Test']}
        super().assertEqual('{"ORTH":"Test"}', i._translate(0, ORTH, '{}'))

        # Underscore conversion
        i.grammar = {UNDERSCORE: [IS_CURRENCY]}
        super().assertEqual('{"_": {}}', i._translate(0, UNDERSCORE, '{}'))

        # Underscore terminal conversion
        i.grammar = {IS_CURRENCY: [True]}
        super().assertEqual('{"_": {"CUSTOM_IS_CURRENCY":"True"}}',
                           i._translate(0, IS_CURRENCY, '{"_": {}}'))

        # Grammar Operators conversion
        # NOTE(review): the value here is not wrapped in a list, unlike the
        # other productions ({OP: ZERO_OR_MORE} vs {XPS: [IN]}) — confirm intended
        i.grammar = {OP: ZERO_OR_MORE}
        super().assertEqual('"OP":"*"', i._translate(0, OP, ''))

        # Extended Pattern Syntax conversion (base)
        i.grammar = {XPS: [IN]}
        super().assertEqual('{}', i._translate(0, XPS, ''))

        i.grammar = {ORTH: [XPS]}
        super().assertEqual('"ORTH":', i._translate(0, ORTH, ''))

        # Extended Pattern Syntax conversion (terminal logical)
        i.grammar = {NOT_IN: [['Test']]}
        super().assertEqual('{"ORTH": {"NOT_IN":["Test"]}}', i._translate(0, NOT_IN, '{"ORTH": {}}'))

        # Extended Pattern Syntax (terminal arithmetical)
        i.grammar = {GTH: [5]}
        super().assertEqual('{"LENGTH": {">":5}}', i._translate(0, GTH, '{"LENGTH": {}}'))

    #
    # Helpers
    #
    def setUp(self) -> None:
        """ Fresh Config instance """
        # Config is a singleton; tearDown clears it between tests
        self.config = Config()

    def tearDown(self) -> None:
        """ Destroy Config instance """
        Config.clear_instance()
179 |
180 |
# Allow running this test module directly (e.g. `python tests/test_individual.py`)
if __name__ == "__main__":
    unittest.main()
183 |
--------------------------------------------------------------------------------
/tests/test_population.py:
--------------------------------------------------------------------------------
1 | """ Unit testing module for GE Population module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import unittest
22 | import spacy
23 |
24 | from PatternOmatic.ge.stats import Stats
25 | from PatternOmatic.nlp.bnf import dynamic_generator as dgg
26 | from PatternOmatic.ge.population import Population, Selection, Recombination, Replacement
27 | from PatternOmatic.ge.individual import Individual
28 | from PatternOmatic.settings.config import Config
29 | from PatternOmatic.settings.literals import FitnessType, SelectionType, RecombinationType, ReplacementType
30 |
31 |
class BasePopulationTest(unittest.TestCase):
    """ Base class to supply shared attributes and helpers """
    #
    # Shared attributes
    #
    # Expensive fixtures built once and shared by every subclass
    config = Config()

    nlp = spacy.load("en_core_web_sm")

    # Sample Docs used both for grammar induction and individual evaluation
    samples = [nlp(u'I am a raccoon!'),
               nlp(u'You are a cat!'),
               nlp(u'Is she a rabbit?'),
               nlp(u'This is a test')]

    grammar = dgg(samples)

    stats = Stats()

    #
    # Helpers
    #
    def setUp(self) -> None:
        """ Fresh Config instance """
        # Config is a singleton; tearDown clears it between tests
        self.config = Config()

    def tearDown(self) -> None:
        """ Destroy Config instance """
        Config.clear_instance()
60 |
61 |
class TestPopulation(BasePopulationTest):
    """ Unit Test class for GE Population object """

    def test_initialize(self):
        """ Tests that a population is correctly filled with Individuals """
        p = Population(self.samples, self.grammar, self.stats)

        super().assertIsInstance(p.generation[0], Individual)

    def test_best_challenge(self):
        """ Tests that the most fitted individual occupies the population's best_individual slot """
        self.config.max_generations = 3
        self.config.fitness_function_type = FitnessType.BASIC
        p = Population(self.samples, self.grammar, self.stats)
        # Plant a known-fit individual without mutation, then let evolution run with mutation
        self.config.mutation_probability = 0.0
        p.generation[0] = Individual(self.samples, self.grammar, self.stats, '01110101100101100110010110010101')
        self.config.mutation_probability = 0.5
        p.evolve()

        super().assertGreaterEqual(p.best_individual.fitness_value, 0.2)

    def test_binary_tournament(self):
        """ Test that binary tournament works as expected """
        self.config.max_generations = 3
        self.config.fitness_function_type = FitnessType.FULL_MATCH
        self.config.selection_type = SelectionType.BINARY_TOURNAMENT
        p = Population(self.samples, self.grammar, self.stats)
        mating_pool = p.selection(p.generation)

        # Selection must build a new pool rather than return the generation itself
        super().assertNotEqual(p.generation, mating_pool)

    def test_k_tournament(self):
        """ Test that k tournament raises error """
        # K tournament is declared but not implemented yet
        self.config.selection_type = SelectionType.K_TOURNAMENT
        p = Population(self.samples, self.grammar, self.stats)
        with super().assertRaises(NotImplementedError):
            _ = p.selection(p.generation)

    def test_random_one_point_crossover(self):
        """ Test that crossover 'random one point' works as expected """
        self.config.max_generations = 3
        self.config.fitness_function_type = FitnessType.BASIC
        self.config.selection_type = SelectionType.BINARY_TOURNAMENT
        self.config.recombination_type = RecombinationType.RANDOM_ONE_POINT_CROSSOVER
        p = Population(self.samples, self.grammar, self.stats)
        mating_pool = p.selection(p.generation)
        p.offspring = p.recombination(mating_pool, p.generation)
        # Offspring must differ from the parent generation
        super().assertNotEqual(p.generation, p.offspring)

    def test_mu_plus_lambda(self):
        """ Tests that replacement 'mu plus lambda' works as expected """
        self.config.replacement_type = ReplacementType.MU_PLUS_LAMBDA
        p = Population(self.samples, self.grammar, self.stats)
        mating_pool = p.selection(p.generation)
        p.offspring = p.recombination(mating_pool, p.generation)
        p.generation, p.offspring = p.replacement(p.generation, p.offspring)
        # Replacement consumes the offspring list entirely
        super().assertListEqual(p.offspring, [])

    def test_mu_lambda_elite(self):
        """ Tests that replacement 'mu lambda with elitism' works as expected """
        self.config.replacement_type = ReplacementType.MU_LAMBDA_WITH_ELITISM
        p = Population(self.samples, self.grammar, self.stats)
        mating_pool = p.selection(p.generation)
        p.offspring = p.recombination(mating_pool, p.generation)
        p.generation, p.offspring = p.replacement(p.generation, p.offspring)
        # Replacement consumes the offspring list entirely
        super().assertListEqual(p.offspring, [])

    def test_mu_lambda_no_elite(self):
        """ Tests that replacement 'mu lambda without elitism' works as expected """
        self.config.replacement_type = ReplacementType.MU_LAMBDA_WITHOUT_ELITISM
        p = Population(self.samples, self.grammar, self.stats)
        mating_pool = p.selection(p.generation)
        p.offspring = p.recombination(mating_pool, p.generation)
        p.generation, p.offspring = p.replacement(p.generation, p.offspring)
        # Replacement consumes the offspring list entirely
        super().assertListEqual(p.offspring, [])

    def test_evolve(self):
        """ Tests that an evolution works, preserving a fitted individual """
        self.config.max_generations = 3
        self.config.fitness_function_type = FitnessType.BASIC
        p = Population(self.samples, self.grammar, self.stats)
        # Plant a known-fit individual (mutation off so its dna survives construction)
        self.config.mutation_probability = 0.0
        p.generation[0] = Individual(self.samples, self.grammar, self.stats, '01110101100101100110010110010101')
        self.config.mutation_probability = 0.5
        p.evolve()
        super().assertLessEqual(0.25, p.generation[0].fitness_value)

    def test_best_challenge_changes_best_individual(self):
        """ Covers best challenge cases """
        self.config.mutation_probability = 0.0
        self.config.fitness_function_type = FitnessType.BASIC

        p = Population(self.samples, self.grammar, self.stats)
        # i1 is known to be less fit than i2 (all-zero dna vs a fit dna)
        i1 = Individual(self.samples, self.grammar, self.stats, dna='00000000000000000000000000000000')
        i2 = Individual(self.samples, self.grammar, self.stats, dna='01110101100101100110010110010101')

        # When there's no best individual yet, population's best individual is updated
        p.best_individual = None
        p.generation = [i2]
        p._best_challenge()

        super().assertEqual(p.best_individual, p.generation[0])

        # When a better individual is better fitted in a new generation, population's best individual is updated
        p.best_individual = i1
        p.generation = [i2]
        p._best_challenge()

        super().assertEqual(p.best_individual, p.generation[0])

        # When a worse individual is the most fitted in a new generation, population's best individual remains the same
        p.best_individual = i2
        p.generation = [i1]
        p._best_challenge()

        super().assertEqual(i2, p.best_individual)

    def test_sr_update(self):
        """ Check SR is updated if a solution is found for the run """
        # Local Stats so the accumulator starts empty for this test
        stats = Stats()

        self.config.max_generations = 1
        self.config.population_size = 3
        self.config.fitness_function_type = FitnessType.BASIC
        self.config.mutation_probability = 0.0

        # Threshold 0.0: any individual counts as a solution -> success recorded
        self.config.success_threshold = 0.0
        p = Population(self.samples, self.grammar, stats)
        p.generation[0] = Individual(self.samples, self.grammar, stats, '01110101100101100110010110010101')
        p.evolve()
        super().assertListEqual([True], stats.success_rate_accumulator)

        # Threshold 1.0 with an unfit individual -> failure recorded
        self.config.success_threshold = 1.0
        self.config.population_size = 1
        p = Population(self.samples, self.grammar, stats)
        p.generation[0] = Individual(self.samples, self.grammar, stats, '00000000000000000000000000000000')
        p.evolve()
        super().assertListEqual([True, False], stats.success_rate_accumulator)
200 |
201 |
class TestSelection(BasePopulationTest):
    """ Unit Test class for GE Selection object """

    def test_dispatch(self):
        """ Dispatcher method provides the proper selection method """
        # Each known selection type dispatches to its own implementation;
        # an unknown type (None) falls back to binary tournament
        dispatch_cases = (
            (SelectionType.BINARY_TOURNAMENT, Selection._binary_tournament),
            (SelectionType.K_TOURNAMENT, Selection._k_tournament),
            (None, Selection._binary_tournament),
        )

        for selection_type, expected_method in dispatch_cases:
            super().assertIs(Selection(selection_type)._select, expected_method)
216 |
217 |
class TestRecombination(BasePopulationTest):
    """ Unit Test class for GE Recombination object """

    def test_dispatch(self):
        """ Dispatcher method provides the proper recombine method """
        # Only one recombination strategy exists, so the dispatcher must
        # always resolve to random one point crossover
        recombination_instance = Recombination(self.grammar, self.samples, self.stats)
        super().assertEqual(
            recombination_instance._random_one_point_crossover, recombination_instance._recombine)
225 |
226 |
class TestReplacement(BasePopulationTest):
    """ Unit Test class for GE Replacement object """

    def test_dispatch(self):
        """ Dispatcher method provides the proper replacement method """
        # Each known replacement type dispatches to its own implementation;
        # an unknown type (None) falls back to mu plus lambda
        dispatch_cases = (
            (ReplacementType.MU_PLUS_LAMBDA, Replacement._mu_plus_lambda),
            (ReplacementType.MU_LAMBDA_WITH_ELITISM, Replacement._mu_lambda_elite),
            (ReplacementType.MU_LAMBDA_WITHOUT_ELITISM, Replacement._mu_lambda_no_elite),
            (None, Replacement._mu_plus_lambda),
        )

        for replacement_type, expected_method in dispatch_cases:
            super().assertIs(Replacement(replacement_type)._replace, expected_method)
244 |
245 |
# Allow running this test module directly (e.g. `python tests/test_population.py`)
if __name__ == "__main__":
    unittest.main()
248 |
--------------------------------------------------------------------------------
/tests/test_script.py:
--------------------------------------------------------------------------------
1 | """ Unit testing file for CLI module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
import os
import subprocess
import sys

import scripts.patternomatic as pom

from unittest import TestCase, mock
from spacy import load as spacy_load
from PatternOmatic.settings.log import LOG
27 |
28 |
class TestPatternomaticScript(TestCase):
    """ Test class to verify patternomatic.py correct behaviour """

    nlp = spacy_load('en_core_web_sm')

    samples = [nlp(u'My shirt is white'),
               nlp(u'My cat is black'),
               nlp(u'Your home is comfortable'),
               nlp(u'Their attitude is great')]

    # Repository-root config.ini, resolved relative to this test file
    config_file_path = os.path.join(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), 'config.ini')

    # A complete, valid CLI invocation: samples, config file and language model
    full_args = ['-s', 'Hello', '-s', 'Goodbye', '-c', config_file_path, '-l', 'en_core_web_sm']

    def test_main(self):
        """ Checks that main method works """
        with super().assertLogs(LOG) as cm:
            pom.main(self.full_args)
        super().assertIn('INFO:PatternOmatic:Best individuals for this execution:', cm.output)

    def test_main_errors_raised(self):
        """ Checks that main raises errors when bad arguments are supplied """
        # No args
        with super().assertRaises(SystemExit):
            pom.main([])

        # Wrong args
        with super().assertRaises(SystemExit):
            pom.main(['-k'])

        # Wrong lang: main must warn and fall back to the default model
        with super().assertLogs(LOG) as cm:
            bad_model = 'bad_model'
            args = self.full_args.copy()[:-1]
            args.append(bad_model)
            pom.main(args)
        super().assertEqual(f'WARNING:PatternOmatic:Model {bad_model} not found, falling back to '
                            f'patternOmatic\'s default language model: en_core_web_sm', cm.output[2])

        # Fatal error
        # NOTE(review): return_value is set to an Exception *instance*, so main
        # fails when it tries to use the "parser" — presumably the intended way
        # to force the fatal-error path; confirm against scripts/patternomatic.py
        with mock.patch('scripts.patternomatic.ArgumentParser') as mock_arg_parser:
            mock_arg_parser.return_value = Exception('Mocked exception')

            with super().assertRaises(Exception):
                pom.main(self.full_args)

    def test_patternomatic_script(self):
        """ Checks that patternomatic can be run as a script properly """
        script_path = os.path.join(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), 'scripts', 'patternomatic.py')

        # Run with the interpreter executing this test suite: a bare "python"
        # on PATH may be absent or point at an unrelated installation.
        # Argument-list form (no shell) also keeps the path safe if it
        # contains spaces.
        output_signal = subprocess.run(
            [sys.executable, script_path, '-s', 'Hello', '-s', 'Goodbye']).returncode
        super().assertEqual(0, output_signal)
82 |
--------------------------------------------------------------------------------
/tests/test_settings.py:
--------------------------------------------------------------------------------
1 | """ Unit testing module for settings module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import configparser
22 | import os
23 | import unittest
24 |
25 | from PatternOmatic.settings.config import Config, RecombinationType
26 |
27 |
class TestConfig(unittest.TestCase):
    """ Test class for settings """

    # Populated by setUp with the current singleton instance
    config = None

    def test_config_is_singleton(self):
        """ Tests config instance is a singleton one """
        another_config = Config()
        super().assertEqual(self.config, another_config)

    def test_config_is_clearable(self):
        """ Tests its possible to renew the singleton instance """
        Config.clear_instance()
        another_config = Config()

        super().assertNotEqual(self.config, another_config)

    def test_config_read_from_path(self):
        """ Tests providing or not providing a configuration file works as expected"""
        # No config file provided
        super().assertEqual(None, self.config.file_path)

        # Correct config file provided

        file_path = os.path.join(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir), 'config.ini')
        Config.clear_instance()
        self.config = Config(file_path)
        super().assertEqual(file_path, self.config.file_path)

        # Bad path provided: Config silently falls back to defaults (file_path None)
        Config.clear_instance()
        self.config = Config('')
        super().assertEqual(None, self.config.file_path)

    def test_xps_gop_can_not_be_enabled_together(self):
        """ Tests Spacy's Grammar Operators and Extended Patter Syntax can not be enabled both """
        # Enabling one while the other is on must switch the other off
        config = Config()
        config.use_grammar_operators = True
        config.use_extended_pattern_syntax = True
        super().assertNotEqual(config.use_grammar_operators, config.use_extended_pattern_syntax)

        config.use_grammar_operators = False
        config.use_extended_pattern_syntax = True
        super().assertEqual(True, config.use_extended_pattern_syntax)

        config.use_grammar_operators = True
        super().assertEqual(False, config.use_extended_pattern_syntax)

    def test_setting_config_attribute_with_wrong_type_has_no_effect(self):
        """ Tests that assigning a wrongly-typed value to a Config attribute is ignored """
        config = Config()

        # Each assignment below uses a type the attribute does not accept
        config.max_runs = 0.5
        config.use_extended_pattern_syntax = None
        config.fitness_function_type = RecombinationType.RANDOM_ONE_POINT_CROSSOVER
        config.report_path = 0

        super().assertNotEqual(config.max_runs, 0.5)
        super().assertNotEqual(config.use_extended_pattern_syntax, None)
        super().assertNotEqual(config.fitness_function_type, RecombinationType.RANDOM_ONE_POINT_CROSSOVER)
        super().assertNotEqual(config.report_path, 0)

    def test_validate_config_argument(self):
        """ Checks that config arguments are properly fetched according to its type """
        config_parser = configparser.ConfigParser()

        test_section = 'test_section'
        test_option_int = 'test_option_int'
        test_option_float = 'test_option_float'
        test_option_boolean = 'test_option_boolean'
        test_option_string = 'test_option_string'

        config_parser.add_section(test_section)

        config_parser[test_section][test_option_int] = '0'
        config_parser[test_section][test_option_float] = '0.0'
        config_parser[test_section][test_option_boolean] = 'False'
        config_parser[test_section][test_option_string] = ''

        # With valid types: the parsed value is returned, not the default
        super().assertEqual(
            0, self.config._validate_config_argument(test_section, test_option_int, 1, config_parser))
        super().assertEqual(
            .0, self.config._validate_config_argument(test_section, test_option_float, .1, config_parser))
        super().assertEqual(
            False, self.config._validate_config_argument(test_section, test_option_boolean, True, config_parser))
        super().assertEqual(
            '', self.config._validate_config_argument(test_section, test_option_string, 'Whatever', config_parser))

        # With wrong type: the default is returned instead
        config_parser[test_section][test_option_int] = 'False'
        super().assertEqual(
            1, self.config._validate_config_argument(test_section, test_option_int, 1, config_parser))

        # With not even a possible type used by the config parser
        super().assertEqual(
            {}, self.config._validate_config_argument(test_section, test_option_int, {}, config_parser))

    #
    # Helpers
    #
    def setUp(self) -> None:
        """ Fresh Config instance """
        # Config is a singleton; tearDown clears it between tests
        self.config = Config()

    def tearDown(self) -> None:
        """ Destroy Config instance """
        Config.clear_instance()
135 |
--------------------------------------------------------------------------------
/tests/test_stats.py:
--------------------------------------------------------------------------------
1 | """ Unit testing module for stats module
2 |
3 | This file is part of PatternOmatic.
4 |
5 | Copyright © 2020 Miguel Revuelta Espinosa
6 |
7 | PatternOmatic is free software: you can redistribute it and/or
8 | modify it under the terms of the GNU Lesser General Public License
9 | as published by the Free Software Foundation, either version 3 of
10 | the License, or (at your option) any later version.
11 |
12 | PatternOmatic is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Lesser General Public License for more details.
16 |
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with PatternOmatic. If not, see .
19 |
20 | """
21 | import os
22 | from unittest import TestCase, mock
23 |
24 | from PatternOmatic.ge.individual import Individual
25 | from PatternOmatic.ge.stats import Stats
26 | from PatternOmatic.settings.config import Config
27 | from PatternOmatic.settings.literals import ReportFormat
28 |
29 |
30 | class TestStats(TestCase):
31 | """ Tests for Stats class """
32 |
33 | stats = None
34 | test_report_path_file = 'test_report_path_file.txt'
35 | fitness_value_literal = 'fitness_value'
36 |
37 | def test_add_sr(self):
38 | """ SR accumulator works """
39 | self.stats.add_sr(True)
40 | super().assertListEqual([True], self.stats.success_rate_accumulator)
41 |
42 | def test_add_mbf(self):
43 | """ MBF accumulator works """
44 | self.stats.add_mbf(0.5)
45 | super().assertListEqual([0.5], self.stats.mbf_accumulator)
46 |
47 | def test_add_aes(self):
48 | """ AES accumulator works """
49 | self.stats.add_aes(10)
50 | super().assertListEqual([10], self.stats.aes_accumulator)
51 |
52 | def test_add_time(self):
53 | """ Time accumulator works """
54 | self.stats.add_time(0.2222)
55 | super().assertListEqual([0.2222], self.stats.time_accumulator)
56 |
57 | def test_add_most_fitted(self):
58 | """ Most fitted accumulator works """
59 | expected = object.__new__(Individual)
60 | expected.__setattr__(self.fitness_value_literal, 0.5)
61 |
62 | self.stats.add_most_fitted(expected)
63 | super().assertListEqual([expected], self.stats.most_fitted_accumulator)
64 |
65 | def test_sum_aes(self):
66 | """ Time counter works """
67 | self.stats.sum_aes(2)
68 | self.stats.sum_aes(2)
69 | super().assertEqual(4, self.stats.aes_counter,)
70 |
71 | def test_reset(self):
72 | """ Reset stats method works """
73 | self.stats.aes_counter = 100
74 | self.stats.solution_found = True
75 | self.stats.reset()
76 | super().assertEqual(0, self.stats.aes_counter)
77 | super().assertEqual(False, self.stats.solution_found)
78 |
79 | def test_calculate_metrics(self):
80 | """ Calculate metrics works """
81 | self.stats.success_rate_accumulator = [1, 1, 1]
82 | self.stats.mbf_accumulator = [2, 2, 2]
83 | self.stats.aes_counter = 100
84 | self.stats.time_accumulator = [3, 3, 3]
85 |
86 | self.stats.calculate_metrics()
87 |
88 | super().assertEqual(1, self.stats.success_rate)
89 | super().assertEqual(2, self.stats.mbf)
90 | super().assertEqual(100, self.stats.aes)
91 | super().assertEqual(3, self.stats.mean_time)
92 |
93 | def test_get_most_fitted(self):
94 | """ Most fitted individual is found on most fitted accumulator """
95 | i1 = object.__new__(Individual)
96 | i1.__setattr__(self.fitness_value_literal, 0.01)
97 | i2 = object.__new__(Individual)
98 | i2.__setattr__(self.fitness_value_literal, 0.1)
99 | i3 = object.__new__(Individual)
100 | i3.__setattr__(self.fitness_value_literal, 0.001)
101 |
102 | mock_individual_list = list()
103 |
104 | mock_individual_list.append(i1)
105 | mock_individual_list.append(i2)
106 | mock_individual_list.append(i3)
107 |
108 | self.stats.most_fitted_accumulator = mock_individual_list
109 |
110 | super().assertEqual(self.stats.get_most_fitted(), i2)
111 |
112 | def test_avg(self):
113 | """ Average implementation works """
114 | test_list_1 = [1, 2, 3]
115 | super().assertEqual(2, self.stats.avg(test_list_1))
116 |
117 | def test_dict_and_repr(self):
118 | """ Checks that Stats instances are properly represented """
119 | stats_dict = {
120 | 'success_rate': 1.0,
121 | 'mbf': 0.5,
122 | 'aes': 100,
123 | 'mean_time': 4.5,
124 | 'most_fitted': None
125 | }
126 |
127 | # Check that with no best individual representation is well formed
128 | stats = Stats()
129 | stats.success_rate = stats_dict['success_rate']
130 | stats.mbf = stats_dict['mbf']
131 | stats.aes = stats_dict['aes']
132 | stats.mean_time = stats_dict['mean_time']
133 |
134 | super().assertEqual(stats.__dict__, stats_dict)
135 | super().assertEqual(dict(stats), stats_dict)
136 | super().assertEqual(f'Stats({repr(stats_dict)})', repr(stats))
137 |
138 | # Check that with most fitted accumulator representation is well formed
139 | i = object.__new__(Individual)
140 | i.__setattr__(self.fitness_value_literal, 1.0)
141 |
142 | stats.most_fitted_accumulator = [i]
143 | stats_dict['most_fitted'] = i.__dict__
144 |
145 | super().assertDictEqual(stats_dict, stats.__dict__)
146 | super().assertEqual(stats_dict, dict(stats))
147 | super().assertEqual(f'Stats({repr(stats_dict)})', repr(stats))
148 |
149 | def test_persist(self):
150 | config = Config()
151 | config.report_format = ReportFormat.JSON
152 | config.report_path = self.test_report_path_file
153 |
154 | # When a best individual has been found
155 | i = object.__new__(Individual)
156 | i.__setattr__(self.fitness_value_literal, 1.0)
157 | self.stats.aes = 100
158 | self.stats.mbf = 0.9
159 | self.stats.mean_time = 0.42
160 | self.stats.success_rate = 1.0
161 | self.stats.most_fitted_accumulator = [i]
162 | self.stats.persist()
163 |
164 | with open(self.test_report_path_file, 'r') as persisted_report:
165 | red_report = persisted_report.readlines()
166 |
167 | super().assertEqual(str(dict(self.stats)) + '\n', red_report[0])
168 |
169 | # When a best individual has not been found
170 | self.stats.most_fitted_accumulator = []
171 | self.stats.persist()
172 |
173 | with open(self.test_report_path_file, 'r') as persisted_report:
174 | red_report = persisted_report.readlines()
175 |
176 | super().assertEqual(str(dict(self.stats)) + '\n', red_report[1])
177 |
178 | def test_to_csv(self):
179 | """ Test stats instance dict to csv conversion """
180 | with mock.patch('PatternOmatic.ge.stats.time') as mock_time:
181 | mock_time.return_value = .123
182 | self.stats.aes = 10
183 | self.stats.mbf = 0.5
184 | self.stats.mean_time = 0.22
185 | self.stats.success_rate = 0.5
186 |
187 | # When a best individual has not been found
188 | csv_stats = \
189 | f'{.123}\t{self.stats.mbf}\t{self.stats.success_rate}\t{self.stats.aes}\t{self.stats.mean_time}\t' \
190 | f'{None}\t'
191 |
192 | super().assertEqual(csv_stats, self.stats._to_csv())
193 |
194 | # When a best individual has been found
195 | i = object.__new__(Individual)
196 | i.__setattr__(self.fitness_value_literal, 1.0)
197 | self.stats.most_fitted_accumulator = [i]
198 |
199 | csv_stats += f'{None}\t{i.fitness_value}\t'
200 | super().assertEqual(csv_stats, self.stats._to_csv())
201 |
202 | # Also check csv is correctly persisted
203 | config = Config()
204 | config.report_path = self.test_report_path_file
205 | config.report_format = ReportFormat.CSV
206 | self.stats.persist()
207 |
208 | with open(self.test_report_path_file, 'r') as persisted_report:
209 | red_report = persisted_report.readlines()
210 |
211 | super().assertEqual(csv_stats + '\n', red_report[0])
212 |
213 | #
214 | # Helpers
215 | #
216 | def setUp(self) -> None:
217 | """ Fresh Stats instance """
218 | self.stats = Stats()
219 | if os.path.exists(self.test_report_path_file):
220 | os.remove(self.test_report_path_file)
221 |
222 | @classmethod
223 | def tearDownClass(cls) -> None:
224 | """ Remove temporary report file """
225 | if os.path.exists(cls.test_report_path_file):
226 | os.remove(cls.test_report_path_file)
227 |
--------------------------------------------------------------------------------