├── README.md
├── __init__.py
├── ngram_model.py
├── partition_tree.py
├── run_sampling_from_corpus.py
├── sampler.py
├── tests
│   ├── __init__.py
│   ├── edgar_allan_poe.py
│   ├── ngram_model_test.py
│   └── test_corpus
│       ├── edgar_allan_poe.txt
│       └── edgar_allan_poe_long.txt
├── tokenizer.py
└── utilities.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ngram-language-model
An implementation of an n-gram (Markov) language model in Python.

Currently implements basic n-gram frequency analysis, and provides an interface for creating samplers from your favorite corpus.

Use run_sampling_from_corpus.py to generate samples from a model trained on a corpus stored in a text file.

For more info about the input arguments, run
```
python run_sampling_from_corpus.py -h
```

For more control, you can import the SentenceSamplerUtility class from the utilities module, which provides a convenient wrapper around the mechanics of sampler construction.
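For example, a minimal sketch of that route (the path `my_corpus.txt` is a placeholder; the length filter mirrors the one in run_sampling_from_corpus.py):
```
import utilities

n = 3  # trigram model
with open('my_corpus.txt') as f:
    document = f.read()
sentences = utilities.DocumentPreProcessor().preprocess(document)
# Drop sentences with too few tokens to form a full n-gram.
sentences = [s for s in sentences if len(s.split()) > n]
model = utilities.SentenceSamplerUtility(sentences, n)
print(model.get_sample())
```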

-------------------------------------------------------

Some highlights from a trigram model trained on the collected works of Edgar Allan Poe from Project Gutenberg (included in the tests directory):
```
"And yet all was blackness and vacancy."
"Notwithstanding the obscurity which thus oppressed me."
"And it was the groan of mortal terror."
"Among this nation of necromancers there was wine."
```

Interestingly, only the shorter samples seem even vaguely comprehensible under this model. Poe's love of stringing short relative clauses together with commas causes longer samples to descend into trippy, comma-segmented context switches:
```
And so with combativeness, with a sudden elevation in turpitude, whose success at guessing in the contemplation of natural glory mingled at length, upon application of the Marchesa di Mentoni, (who for some time, much of what the more distressing, the more bitterly did I shudder to name them at all, the enthusiasm, and she trembled and very bitterly wept; but to define the day of my soul!
```
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'root'
--------------------------------------------------------------------------------
/ngram_model.py:
--------------------------------------------------------------------------------
from collections import defaultdict
from math import log
import copy

import sampler

STOP = 'STOP'

class NGramMaker(object):
    def __init__(self, N):
        self.N = N
        self.STOP = STOP
        self.FIRST_CHARACTER_TEMPLATE = '*_{0}'
        self.starting_tokens = [self.FIRST_CHARACTER_TEMPLATE.format(-(N-i)) for i in range(1, N)]

    def get_starting_tokens(self):
        return self.starting_tokens

    def get_stop_token(self):
        return self.STOP

    def make_ngrams(self, sequence):
        ngrams = []
        augmented_sequence = self.starting_tokens + sequence + [self.STOP]
        starting_point_of_last_ngram = len(augmented_sequence) - self.N
        for i in range(starting_point_of_last_ngram + 1):
            ngrams.append(tuple(augmented_sequence[i:i+self.N]))
        return ngrams
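# Illustrative sketch of the padding scheme above (comments only, nothing
# executed on import): the sequence gets N-1 start markers and one STOP, so
#   NGramMaker(3).make_ngrams(['the', 'dog', 'runs'])
# returns
#   [('*_-2', '*_-1', 'the'),
#    ('*_-1', 'the', 'dog'),
#    ('the', 'dog', 'runs'),
#    ('dog', 'runs', 'STOP')]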
class AbstractNGramFrequencyModel(object):
    """Base model that stores the frequencies of n-grams observed in a corpus.

    Subclasses turn those raw frequencies into probability estimates."""
    def __init__(self, N):
        self.N = N
        self.ngram_maker = NGramMaker(N)
        self.frequency_tree = NGramFrequencyTree()

    def fit(self, sequences):
        for sequence in sequences:
            for ngram in self._make_ngrams(sequence):
                self.frequency_tree.add_ngram_observation(ngram)

    def predict(self, sequences):
        return [self._get_sequence_log_probability(sequence) for sequence in sequences]

    def _get_sequence_log_probability(self, sequence):
        ngrams = self._make_ngrams(sequence)
        return sum(self._get_ngram_log_probability(ngram) for ngram in ngrams)

    def _make_ngrams(self, sequence):
        return self.ngram_maker.make_ngrams(sequence)

    def _get_ngram_log_probability(self, ngram):
        return log(self._get_ngram_probability(ngram))

    def _get_ngram_probability(self, ngram):
        raise NotImplementedError

class MLEModel(AbstractNGramFrequencyModel):
    def _get_ngram_probability(self, ngram):
        context_count, tail_count = self.frequency_tree.get_ngram_frequency(ngram)
        if not context_count or not tail_count:
            return 1e-20  # floor for unseen n-grams so log() stays finite
        return tail_count / context_count

class AdditiveSmoothingModel(AbstractNGramFrequencyModel):
    def _get_ngram_probability(self, ngram):
        context_count, tail_count = self.frequency_tree.get_ngram_frequency(ngram)
        # Add-one smoothing, using the number of distinct observed n-grams
        # as a stand-in for the vocabulary size.
        ngram_count = self.frequency_tree.get_unique_count()
        return (tail_count + 1) / (context_count + ngram_count)

class NGramFrequencyTree(object):
    #TODO: Consolidate as one data structure with custom node type
    def __init__(self):
        self.base_ngram_tree = defaultdict(int)
        self.frequency_tree = defaultdict(lambda: defaultdict(int))
        self.unique_ngram_count = 0

    def add_ngram_observation(self, ngram):
        preceding_elements, last_element = self._partition_ngram(ngram)
        self.base_ngram_tree[preceding_elements] += 1
        if self.frequency_tree[preceding_elements][last_element] == 0:
            self.unique_ngram_count += 1
        self.frequency_tree[preceding_elements][last_element] += 1

    def get_ngram_frequency(self, ngram):
        # Use .get() so that looking up an unseen n-gram does not insert
        # empty entries into the defaultdicts as a side effect.
        preceding_elements, last_element = self._partition_ngram(ngram)
        context_count = self.base_ngram_tree.get(preceding_elements, 0)
        tail_count = self.frequency_tree.get(preceding_elements, {}).get(last_element, 0)
        return context_count, tail_count

    def get_continuation_probability(self, ngram_stem, continuation):
        base_count, continuation_count = self.get_ngram_frequency(tuple(ngram_stem) + (continuation,))
        return continuation_count / base_count

    def get_all_ngram_stems(self):
        return self.base_ngram_tree.keys()

    def get_all_continuations(self, ngram_stem):
        return self.frequency_tree[ngram_stem].keys()

    def _partition_ngram(self, ngram):
        *head, tail = ngram
        return tuple(head), tail

    def get_unique_count(self):
        return self.unique_ngram_count

class NGramSampler(object):
    def __init__(self, sequence_tree, default_initial_stem=None):
        self.sequence_tree = sequence_tree
        self.samplers = self._init_samplers(sequence_tree)
        self.default_initial_stem = default_initial_stem

    def _init_samplers(self, sequence_tree):
        return {stem: self._build_sampler(sequence_tree, stem) for stem in sequence_tree.get_all_ngram_stems()}

    def _build_sampler(self, ngram_tree, ngram_stem):
        ngram_continuations = list(ngram_tree.get_all_continuations(ngram_stem))
        probabilities = [ngram_tree.get_continuation_probability(ngram_stem, cont) for cont in ngram_continuations]
        return sampler.Multinomial_Sampler(probabilities, ngram_continuations)

    def sample_sequence(self):
        sampled_sentence = copy.deepcopy(self.default_initial_stem)
        N = len(self.default_initial_stem) + 1
        while sampled_sentence[-1] != STOP:
            stem = tuple(sampled_sentence[-(N-1):])
            next_token = self.samplers[stem].sample()
            sampled_sentence.append(next_token)
        # Drop the padding stem and the trailing STOP token.
        return sampled_sentence[len(self.default_initial_stem):-1]
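# A minimal end-to-end sketch of how the pieces above fit together
# (illustrative only; the toy corpus and bigram order are arbitrary choices):
if __name__ == '__main__':
    maker = NGramMaker(2)
    tree = NGramFrequencyTree()
    for sentence in [['the', 'dog', 'runs'], ['the', 'cat', 'runs']]:
        for ngram in maker.make_ngrams(sentence):
            tree.add_ngram_observation(ngram)
    demo_sampler = NGramSampler(tree, default_initial_stem=maker.get_starting_tokens())
    # Prints either ['the', 'dog', 'runs'] or ['the', 'cat', 'runs'].
    print(demo_sampler.sample_sequence())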
--------------------------------------------------------------------------------
/partition_tree.py:
--------------------------------------------------------------------------------
class PartitionTreeNode(object):
    def __init__(self, left=None, right=None, interval=None):
        self.left = left
        self.right = right
        self.interval = interval

class PartitionTree(object):
    #TODO: This should be a self-balancing tree
    def __init__(self, intervals, labels):
        self.mapping = {}
        self.root = PartitionTreeNode()
        for interval, label in zip(intervals, labels):
            self._add_interval(interval, self.root)
            self.mapping[interval] = label

    def _add_interval(self, interval, starting_node):
        node = starting_node
        left, right = interval
        while node.interval:
            node_left, node_right = node.interval
            if right <= node_left:
                node = node.left
            elif left >= node_right:
                node = node.right
            else:
                raise Exception('Duplicate interval added to tree')
        node.interval = interval
        node.left = PartitionTreeNode()
        node.right = PartitionTreeNode()

    def get_label(self, number):
        interval = self._get_interval(number, self.root)
        return self.mapping[interval]

    def _get_interval(self, number, node):
        left_bound, right_bound = node.interval
        while number < left_bound or number > right_bound:
            if number < left_bound:
                node = node.left
            else:
                node = node.right
            left_bound, right_bound = node.interval
        return node.interval
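# Usage sketch (illustrative): map a point in [0, 1) to the label of the
# interval containing it.
#   tree = PartitionTree([(0.0, 0.4), (0.4, 1.0)], ['heads', 'tails'])
#   tree.get_label(0.25)  # -> 'heads'
#   tree.get_label(0.75)  # -> 'tails'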
--------------------------------------------------------------------------------
/run_sampling_from_corpus.py:
--------------------------------------------------------------------------------
import argparse

import utilities

# Parse args
p = argparse.ArgumentParser(description='A script for constructing an NGram sampler and sampling from it.')
p.add_argument('-input_file', help='input filepath')
p.add_argument('-collapse_whitespace', help='convert all whitespace to a single space', action="store_true")
p.add_argument('-punct_as_newline', help='use punctuation (!?.) as sentence terminators in addition to newlines', action="store_true")
p.add_argument('-number_samples', type=int, help='number of samples to draw from the language model')
p.add_argument('-ngram_order', type=int, help='length of n-grams (value of n)')
parsed_args = p.parse_args()

n = parsed_args.ngram_order
number_samples = parsed_args.number_samples

# Read file
with open(parsed_args.input_file) as corpus:
    document = corpus.read()
preprocessor = utilities.DocumentPreProcessor(
    reduce_whitespace=parsed_args.collapse_whitespace,
    add_line_end_to_punctuation=parsed_args.punct_as_newline)
sentences = preprocessor.preprocess(document)
sentences = [sentence for sentence in sentences if len(sentence.split()) > n]

# Build samples
model = utilities.SentenceSamplerUtility(sentences, n)
samples = [model.get_sample() for _ in range(number_samples)]
for s in samples:
    print(s)
--------------------------------------------------------------------------------
/sampler.py:
--------------------------------------------------------------------------------
import random

import partition_tree

class Multinomial_Sampler(object):
    def __init__(self, probabilities, event_names):
        intervals = self._build_intervals_from_probabilities(probabilities)
        self.tree = partition_tree.PartitionTree(intervals, event_names)

    def _build_intervals_from_probabilities(self, probabilities):
        # Stack the probabilities into adjacent subintervals of [0, 1).
        intervals = []
        left_side = 0.0
        for p in probabilities:
            intervals.append((left_side, left_side + p))
            left_side += p
        return intervals

    def sample(self):
        # Inverse-CDF sampling: draw uniformly from [0, 1) and look up which
        # event's interval the draw falls in.
        return self.tree.get_label(random.random())
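# Usage sketch (illustrative):
#   s = Multinomial_Sampler([0.2, 0.8], ['rare', 'common'])
#   s.sample()  # -> 'common' about 80% of the time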
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'root'
--------------------------------------------------------------------------------
/tests/edgar_allan_poe.py:
--------------------------------------------------------------------------------
import sys
sys.path.append('..')
import unittest

import utilities

class EdgarAllanPoeTest(unittest.TestCase):
    def test_corpus_sampling_end_to_end(self):
        filename = 'test_corpus/edgar_allan_poe.txt'
        self.run_corpus_trigram_sampling_end_to_end(filename)

    # @unittest.skip('Performance issues still need resolving')
    def test_corpus_sampling_end_to_end_long(self):
        filename = 'test_corpus/edgar_allan_poe_long.txt'
        self.run_corpus_trigram_sampling_end_to_end(filename)

    def run_corpus_trigram_sampling_end_to_end(self, filename):
        n = 3
        with open(filename) as poe_corpus:
            poe_document = poe_corpus.read()
        poe_document = poe_document.replace('--', ' -- ')
        preprocessor = utilities.DocumentPreProcessor()
        poe_sentences = preprocessor.preprocess(poe_document)
        poe_sentences = [sentence for sentence in poe_sentences if len(sentence.split()) > n]
        print('Processing {0} sentences'.format(len(poe_sentences)))
        poe_model = utilities.SentenceSamplerUtility(poe_sentences, n)
        samples = [poe_model.get_sample() for _ in range(100)]
        for s in samples:
            print(s)
--------------------------------------------------------------------------------
/tests/ngram_model_test.py:
--------------------------------------------------------------------------------
import sys
sys.path.append('..')
import unittest
from collections import Counter

import partition_tree
import sampler
import ngram_model
import tokenizer
import utilities

class TokenizerTest(unittest.TestCase):
    def test_tokenizer(self):
        test_strings = ['The man who is tall is happy.', 'Is the man who is tall happy?']
        tokenized_strings = [['The', 'man', 'who', 'is', 'tall', 'is', 'happy', '.'],
                             ['Is', 'the', 'man', 'who', 'is', 'tall', 'happy', '?']]
        T = tokenizer.Tokenizer()
        assert T.process(test_strings) == tokenized_strings

class NgramTest(unittest.TestCase):
    train_sequences = [['the', 'dog', 'runs'], ['the', 'dog', 'jumps']]

    def test_frequency_tree(self):
        unigram_tree = ngram_model.NGramFrequencyTree()
        for sequence in self.train_sequences:
            for unigram in sequence:
                unigram_tree.add_ngram_observation([unigram])
        assert set(unigram_tree.get_all_continuations(())) == {'the', 'dog', 'runs', 'jumps'}
        self._assert_ngram_frequency(unigram_tree, ['the'], 6, 2)

    def _assert_ngram_frequency(self, tree, sequence, expected_total, expected_sequence_count):
        sequence_total, sequence_count = tree.get_ngram_frequency(sequence)
        assert sequence_total == expected_total and sequence_count == expected_sequence_count

    def test_train_mle_trigram_model(self):
        model = ngram_model.MLEModel(3)
        model.fit(self.train_sequences)
        for seq in self.train_sequences + [['the', 'dog', 'dog']]:
            print(seq, model.predict([seq]))

class PartitionTreeTest(unittest.TestCase):
    def test_partition_tree(self):
        tree = partition_tree.PartitionTree([(0.0, 0.5), (0.5, 1.0)], ['A', 'B'])
        values = [0.0, 0.3, 0.5, 0.7, 1.0]
        labels = [tree.get_label(v) for v in values]
        correct_labels = ['A', 'A', 'A', 'B', 'B']
        assert labels == correct_labels

class MultinomialSampleTest(unittest.TestCase):
    def test_biased_coin_flip(self):
        true_heads, true_tails = 0.3, 0.7
        P = [true_heads, true_tails]
        event_names = ['Heads', 'Tails']
        s = sampler.Multinomial_Sampler(P, event_names)
        total_samples = 1000000
        sample_counter = Counter([s.sample() for _ in range(total_samples)])
        # Roughly two standard errors at this sample size, so rare flakes are possible.
        allowed_error = 0.001
        head_frequency = sample_counter['Heads'] / total_samples
        tail_frequency = sample_counter['Tails'] / total_samples
        print(head_frequency, tail_frequency)
        assert true_heads - allowed_error <= head_frequency <= true_heads + allowed_error
        assert true_tails - allowed_error <= tail_frequency <= true_tails + allowed_error

class NGramSamplerTest(unittest.TestCase):
    def bigram_sample_from_corpus_model(self, test_sequences):
        N = 2
        sequence_tree = ngram_model.NGramFrequencyTree()
        T = tokenizer.Tokenizer()
        ngram_maker = ngram_model.NGramMaker(N)
        tokenized_sequences = T.process(test_sequences)
        prepared_sequences = [ngram_maker.make_ngrams(s) for s in tokenized_sequences]
        for sequence in prepared_sequences:
            for ngram in sequence:
                sequence_tree.add_ngram_observation(ngram)
        ngram_sampler = ngram_model.NGramSampler(sequence_tree, ngram_maker.starting_tokens)
        return ngram_sampler.sample_sequence()

    def test_single_length_sample(self):
        test_sequences = ['a.', 'a.']
        assert self.bigram_sample_from_corpus_model(test_sequences) == ['a', '.']

    def test_very_unlikely_sample(self):
        # Fails with probability ~1/10001 per run, by design.
        test_sequences = ['a.']*10000 + ['b.']
        assert self.bigram_sample_from_corpus_model(test_sequences) == ['a', '.']

    def test_long_sequence(self):
        test_sequences = ['a b c d e f g.']
        assert self.bigram_sample_from_corpus_model(test_sequences) == ['a', 'b', 'c', 'd', 'e', 'f', 'g', '.']

class SampleUtilityTest(unittest.TestCase):
    def test_utilities_sampler_construction(self):
        test_sequences = ['a b c d e f g.']
        N = 2
        sentence_sampler = utilities.SentenceSamplerUtility(test_sequences, N)
        assert sentence_sampler.get_sample() == 'a b c d e f g.'

    def test_two_sentence_language(self):
        test_sequences = ['a b c d.', 'e f g.']
        N = 2
        sentence_sampler = utilities.SentenceSamplerUtility(test_sequences, N)
        sample = sentence_sampler.get_sample()
        print(sample)
        assert sample in test_sequences


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/tokenizer.py:
--------------------------------------------------------------------------------
class Tokenizer(object):
    PUNCTUATION = ',.\'\\";:/?!'

    def __init__(self, delimiter=' '):
        self.delimiter = delimiter

    def process(self, sequences):
        tokenized_sequences = []
        for seq in sequences:
            preprocessed = self._preprocess_sequence(seq)
            tokenized_sequences.append(self._tokenize(preprocessed))
        return tokenized_sequences

    def _preprocess_sequence(self, seq):
        # Hook for subclasses; the base tokenizer leaves sequences unchanged.
        return seq

    def _tokenize(self, sequence):
        preprocessed_sequence = self._preprocess_punctuation(sequence)
        preprocessed_sequence = preprocessed_sequence.lstrip(self.delimiter)
        return preprocessed_sequence.split(self.delimiter)

    def _preprocess_punctuation(self, sequence):
        # str.replace returns a new string, so no defensive copy is needed.
        for punctuation_mark in self.PUNCTUATION:
            sequence = sequence.replace(punctuation_mark, self.delimiter + punctuation_mark)
        return sequence
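# Example behavior (illustrative): punctuation is split off as its own token.
#   Tokenizer().process(['Hello, world!'])  ->  [['Hello', ',', 'world', '!']]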
--------------------------------------------------------------------------------
/utilities.py:
--------------------------------------------------------------------------------
import re

import tokenizer
import ngram_model

# Reuse the tokenizer's punctuation set so the two stay in sync.
PUNCTUATION = tokenizer.Tokenizer.PUNCTUATION

class SentenceSamplerUtility(object):
    def __init__(self, sentences, n):
        ngram_maker = ngram_model.NGramMaker(n)
        ngram_tree = self._construct_ngram_tree_from_sentences(sentences, ngram_maker)
        self.sampler = ngram_model.NGramSampler(ngram_tree, default_initial_stem=ngram_maker.starting_tokens)

    def _construct_ngram_tree_from_sentences(self, sentences, ngram_maker):
        T = tokenizer.Tokenizer()
        ngram_tree = ngram_model.NGramFrequencyTree()

        tokenized_sentences = T.process(sentences)
        ngram_sequences = [ngram_maker.make_ngrams(tokenized_sentence) for tokenized_sentence in tokenized_sentences]
        for sequence in ngram_sequences:
            for ngram in sequence:
                ngram_tree.add_ngram_observation(ngram)
        return ngram_tree

    def get_sample(self):
        sampled_sequence = self.sampler.sample_sequence()
        sampled_sentence = ' '.join(sampled_sequence)
        # Reattach punctuation tokens to the preceding word.
        for p in PUNCTUATION:
            sampled_sentence = sampled_sentence.replace(' ' + p, p)
        return sampled_sentence

class DocumentPreProcessor(object):
    END_OF_SENTENCE_CHARS = '?.!'

    def __init__(self, reduce_whitespace=True, add_line_end_to_punctuation=True):
        self.preprocess_methods = []
        if reduce_whitespace:
            self.preprocess_methods.append(self._reduce_whitespace)
        if add_line_end_to_punctuation:
            self.preprocess_methods.append(self._add_line_end_to_punctuation)
        self.preprocess_methods.append(self.split_on_newline)

    def preprocess(self, document):
        for preprocess in self.preprocess_methods:
            document = preprocess(document)
        return document

    def _add_line_end_to_punctuation(self, document):
        processed_document = document.replace('\n', ' ')
        for eos in self.END_OF_SENTENCE_CHARS:
            processed_document = processed_document.replace(eos + ' ', eos + '\n')
        return processed_document

    def _reduce_whitespace(self, document):
        return re.sub(r"\s+", " ", document)

    def split_on_newline(self, document):
        return document.split('\n')
--------------------------------------------------------------------------------