├── README.md
├── __init__.py
├── ngram_model.py
├── partition_tree.py
├── run_sampling_from_corpus.py
├── sampler.py
├── tests
│   ├── __init__.py
│   ├── edgar_allan_poe.py
│   ├── ngram_model_test.py
│   └── test_corpus
│       ├── edgar_allan_poe.txt
│       └── edgar_allan_poe_long.txt
├── tokenizer.py
└── utilities.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ngram-language-model
An implementation of an n-gram (Markov) language model in Python.

Currently implements basic n-gram frequency analysis, and provides an interface for creating samplers from your favorite corpus.

Use run_sampling_from_corpus.py to generate samples from a model trained on a corpus stored in a text file.

For more info about the input arguments, run
```
python run_sampling_from_corpus.py -h
```

For more control, you can import the SentenceSamplerUtility class from the utilities module, which provides a convenient wrapper around the mechanics of sampler construction.
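For example, a minimal sketch of that route (the path `my_corpus.txt` is a placeholder; the length filter mirrors the one in run_sampling_from_corpus.py):
```
import utilities

n = 3  # trigram model
with open('my_corpus.txt') as f:
    document = f.read()
sentences = utilities.DocumentPreProcessor().preprocess(document)
# Drop sentences with too few tokens to form a full n-gram.
sentences = [s for s in sentences if len(s.split()) > n]
model = utilities.SentenceSamplerUtility(sentences, n)
print(model.get_sample())
```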

-------------------------------------------------------

Some highlights from a trigram model trained on the collected works of Edgar Allan Poe from Project Gutenberg (included in the tests directory):
```
"And yet all was blackness and vacancy."
"Notwithstanding the obscurity which thus oppressed me."
"And it was the groan of mortal terror."
"Among this nation of necromancers there was wine."
```

Interestingly, only the shorter samples seem even vaguely comprehensible under this model. Poe's love of stringing short relative clauses together with commas causes longer samples to descend into trippy, comma-segmented context switches:
```
And so with combativeness, with a sudden elevation in turpitude, whose success at guessing in the contemplation of natural glory mingled at length, upon application of the Marchesa di Mentoni, (who for some time, much of what the more distressing, the more bitterly did I shudder to name them at all, the enthusiasm, and she trembled and very bitterly wept; but to define the day of my soul!
```
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'root'
--------------------------------------------------------------------------------
/ngram_model.py:
--------------------------------------------------------------------------------
from collections import defaultdict
from math import log
import copy

import sampler

STOP = 'STOP'

class NGramMaker(object):
    def __init__(self, N):
        self.N = N
        self.STOP = STOP
        self.FIRST_CHARACTER_TEMPLATE = '*_{0}'
        self.starting_tokens = [self.FIRST_CHARACTER_TEMPLATE.format(-(N-i)) for i in range(1, N)]

    def get_starting_tokens(self):
        return self.starting_tokens

    def get_stop_token(self):
        return self.STOP

    def make_ngrams(self, sequence):
        ngrams = []
        augmented_sequence = self.starting_tokens + sequence + [self.STOP]
        starting_point_of_last_ngram = len(augmented_sequence) - self.N
        for i in range(starting_point_of_last_ngram + 1):
            ngrams.append(tuple(augmented_sequence[i:i+self.N]))
        return ngrams
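# Illustrative sketch of the padding scheme above (comments only, nothing
# executed on import): the sequence gets N-1 start markers and one STOP, so
#   NGramMaker(3).make_ngrams(['the', 'dog', 'runs'])
# returns
#   [('*_-2', '*_-1', 'the'),
#    ('*_-1', 'the', 'dog'),
#    ('the', 'dog', 'runs'),
#    ('dog', 'runs', 'STOP')]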
class AbstractNGramFrequencyModel(object):
    """Base model that stores the frequencies of n-grams observed in a corpus.

    Subclasses turn those raw frequencies into probability estimates."""
    def __init__(self, N):
        self.N = N
        self.ngram_maker = NGramMaker(N)
        self.frequency_tree = NGramFrequencyTree()

    def fit(self, sequences):
        for sequence in sequences:
            for ngram in self._make_ngrams(sequence):
                self.frequency_tree.add_ngram_observation(ngram)

    def predict(self, sequences):
        return [self._get_sequence_log_probability(sequence) for sequence in sequences]

    def _get_sequence_log_probability(self, sequence):
        ngrams = self._make_ngrams(sequence)
        return sum(self._get_ngram_log_probability(ngram) for ngram in ngrams)

    def _make_ngrams(self, sequence):
        return self.ngram_maker.make_ngrams(sequence)

    def _get_ngram_log_probability(self, ngram):
        return log(self._get_ngram_probability(ngram))

    def _get_ngram_probability(self, ngram):
        raise NotImplementedError

class MLEModel(AbstractNGramFrequencyModel):
    def _get_ngram_probability(self, ngram):
        context_count, tail_count = self.frequency_tree.get_ngram_frequency(ngram)
        if not context_count or not tail_count:
            return 1e-20  # floor for unseen n-grams so log() stays finite
        return tail_count / context_count

class AdditiveSmoothingModel(AbstractNGramFrequencyModel):
    def _get_ngram_probability(self, ngram):
        context_count, tail_count = self.frequency_tree.get_ngram_frequency(ngram)
        # Add-one smoothing, using the number of distinct observed n-grams
        # as a stand-in for the vocabulary size.
        ngram_count = self.frequency_tree.get_unique_count()
        return (tail_count + 1) / (context_count + ngram_count)

class NGramFrequencyTree(object):
    #TODO: Consolidate as one data structure with custom node type
    def __init__(self):
        self.base_ngram_tree = defaultdict(int)
        self.frequency_tree = defaultdict(lambda: defaultdict(int))
        self.unique_ngram_count = 0

    def add_ngram_observation(self, ngram):
        preceding_elements, last_element = self._partition_ngram(ngram)
        self.base_ngram_tree[preceding_elements] += 1
        if self.frequency_tree[preceding_elements][last_element] == 0:
            self.unique_ngram_count += 1
        self.frequency_tree[preceding_elements][last_element] += 1

    def get_ngram_frequency(self, ngram):
        # Use .get() so that looking up an unseen n-gram does not insert
        # empty entries into the defaultdicts as a side effect.
        preceding_elements, last_element = self._partition_ngram(ngram)
        context_count = self.base_ngram_tree.get(preceding_elements, 0)
        tail_count = self.frequency_tree.get(preceding_elements, {}).get(last_element, 0)
        return context_count, tail_count

    def get_continuation_probability(self, ngram_stem, continuation):
        base_count, continuation_count = self.get_ngram_frequency(tuple(ngram_stem) + (continuation,))
        return continuation_count / base_count

    def get_all_ngram_stems(self):
        return self.base_ngram_tree.keys()

    def get_all_continuations(self, ngram_stem):
        return self.frequency_tree[ngram_stem].keys()

    def _partition_ngram(self, ngram):
        *head, tail = ngram
        return tuple(head), tail

    def get_unique_count(self):
        return self.unique_ngram_count

class NGramSampler(object):
    def __init__(self, sequence_tree, default_initial_stem=None):
        self.sequence_tree = sequence_tree
        self.samplers = self._init_samplers(sequence_tree)
        self.default_initial_stem = default_initial_stem

    def _init_samplers(self, sequence_tree):
        return {stem: self._build_sampler(sequence_tree, stem) for stem in sequence_tree.get_all_ngram_stems()}

    def _build_sampler(self, ngram_tree, ngram_stem):
        ngram_continuations = list(ngram_tree.get_all_continuations(ngram_stem))
        probabilities = [ngram_tree.get_continuation_probability(ngram_stem, cont) for cont in ngram_continuations]
        return sampler.Multinomial_Sampler(probabilities, ngram_continuations)

    def sample_sequence(self):
        sampled_sentence = copy.deepcopy(self.default_initial_stem)
        N = len(self.default_initial_stem) + 1
        while sampled_sentence[-1] != STOP:
            stem = tuple(sampled_sentence[-(N-1):])
            next_token = self.samplers[stem].sample()
            sampled_sentence.append(next_token)
        # Drop the padding stem and the trailing STOP token.
        return sampled_sentence[len(self.default_initial_stem):-1]
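# A minimal end-to-end sketch of how the pieces above fit together
# (illustrative only; the toy corpus and bigram order are arbitrary choices):
if __name__ == '__main__':
    maker = NGramMaker(2)
    tree = NGramFrequencyTree()
    for sentence in [['the', 'dog', 'runs'], ['the', 'cat', 'runs']]:
        for ngram in maker.make_ngrams(sentence):
            tree.add_ngram_observation(ngram)
    demo_sampler = NGramSampler(tree, default_initial_stem=maker.get_starting_tokens())
    # Prints either ['the', 'dog', 'runs'] or ['the', 'cat', 'runs'].
    print(demo_sampler.sample_sequence())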
--------------------------------------------------------------------------------
/partition_tree.py:
--------------------------------------------------------------------------------
class PartitionTreeNode(object):
    def __init__(self, left=None, right=None, interval=None):
        self.left = left
        self.right = right
        self.interval = interval

class PartitionTree(object):
    #TODO: This should be a self-balancing tree
    def __init__(self, intervals, labels):
        self.mapping = {}
        self.root = PartitionTreeNode()
        for interval, label in zip(intervals, labels):
            self._add_interval(interval, self.root)
            self.mapping[interval] = label

    def _add_interval(self, interval, starting_node):
        node = starting_node
        left, right = interval
        while node.interval:
            node_left, node_right = node.interval
            if right <= node_left:
                node = node.left
            elif left >= node_right:
                node = node.right
            else:
                raise Exception('Duplicate interval added to tree')
        node.interval = interval
        node.left = PartitionTreeNode()
        node.right = PartitionTreeNode()

    def get_label(self, number):
        interval = self._get_interval(number, self.root)
        return self.mapping[interval]

    def _get_interval(self, number, node):
        left_bound, right_bound = node.interval
        while number < left_bound or number > right_bound:
            if number < left_bound:
                node = node.left
            else:
                node = node.right
            left_bound, right_bound = node.interval
        return node.interval
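# Usage sketch (illustrative): map a point in [0, 1) to the label of the
# interval containing it.
#   tree = PartitionTree([(0.0, 0.4), (0.4, 1.0)], ['heads', 'tails'])
#   tree.get_label(0.25)  # -> 'heads'
#   tree.get_label(0.75)  # -> 'tails'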
--------------------------------------------------------------------------------
/run_sampling_from_corpus.py:
--------------------------------------------------------------------------------
import argparse

import utilities

# Parse args
p = argparse.ArgumentParser(description='A script for constructing an NGram sampler and sampling from it.')
p.add_argument('-input_file', help='input filepath')
p.add_argument('-collapse_whitespace', help='convert all whitespace to a single space', action="store_true")
p.add_argument('-punct_as_newline', help='use punctuation (!?.) as sentence terminators in addition to newlines', action="store_true")
p.add_argument('-number_samples', type=int, help='number of samples to draw from the language model')
p.add_argument('-ngram_order', type=int, help='length of n-grams (value of n)')
parsed_args = p.parse_args()

n = parsed_args.ngram_order
number_samples = parsed_args.number_samples

# Read file
with open(parsed_args.input_file) as corpus:
    document = corpus.read()
preprocessor = utilities.DocumentPreProcessor(
    reduce_whitespace=parsed_args.collapse_whitespace,
    add_line_end_to_punctuation=parsed_args.punct_as_newline)
sentences = preprocessor.preprocess(document)
sentences = [sentence for sentence in sentences if len(sentence.split()) > n]

# Build samples
model = utilities.SentenceSamplerUtility(sentences, n)
samples = [model.get_sample() for _ in range(number_samples)]
for s in samples:
    print(s)
--------------------------------------------------------------------------------
/sampler.py:
--------------------------------------------------------------------------------
import random

import partition_tree

class Multinomial_Sampler(object):
    def __init__(self, probabilities, event_names):
        intervals = self._build_intervals_from_probabilities(probabilities)
        self.tree = partition_tree.PartitionTree(intervals, event_names)

    def _build_intervals_from_probabilities(self, probabilities):
        # Stack the probabilities into adjacent subintervals of [0, 1).
        intervals = []
        left_side = 0.0
        for p in probabilities:
            intervals.append((left_side, left_side + p))
            left_side += p
        return intervals

    def sample(self):
        # Inverse-CDF sampling: draw uniformly from [0, 1) and look up which
        # event's interval the draw falls in.
        return self.tree.get_label(random.random())
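# Usage sketch (illustrative):
#   s = Multinomial_Sampler([0.2, 0.8], ['rare', 'common'])
#   s.sample()  # -> 'common' about 80% of the time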
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'root'
--------------------------------------------------------------------------------
/tests/edgar_allan_poe.py:
--------------------------------------------------------------------------------
import sys
sys.path.append('..')
import unittest

import utilities

class EdgarAllanPoeTest(unittest.TestCase):
    def test_corpus_sampling_end_to_end(self):
        filename = 'test_corpus/edgar_allan_poe.txt'
        self.run_corpus_trigram_sampling_end_to_end(filename)

    # @unittest.skip('Performance issues still need resolving')
    def test_corpus_sampling_end_to_end_long(self):
        filename = 'test_corpus/edgar_allan_poe_long.txt'
        self.run_corpus_trigram_sampling_end_to_end(filename)

    def run_corpus_trigram_sampling_end_to_end(self, filename):
        n = 3
        with open(filename) as poe_corpus:
            poe_document = poe_corpus.read()
        poe_document = poe_document.replace('--', ' -- ')
        preprocessor = utilities.DocumentPreProcessor()
        poe_sentences = preprocessor.preprocess(poe_document)
        poe_sentences = [sentence for sentence in poe_sentences if len(sentence.split()) > n]
        print('Processing {0} sentences'.format(len(poe_sentences)))
        poe_model = utilities.SentenceSamplerUtility(poe_sentences, n)
        samples = [poe_model.get_sample() for _ in range(100)]
        for s in samples:
            print(s)
--------------------------------------------------------------------------------
/tests/ngram_model_test.py:
--------------------------------------------------------------------------------
import sys
sys.path.append('..')
import unittest
from collections import Counter

import partition_tree
import sampler
import ngram_model
import tokenizer
import utilities

class TokenizerTest(unittest.TestCase):
    def test_tokenizer(self):
        test_strings = ['The man who is tall is happy.', 'Is the man who is tall happy?']
        tokenized_strings = [['The', 'man', 'who', 'is', 'tall', 'is', 'happy', '.'],
                             ['Is', 'the', 'man', 'who', 'is', 'tall', 'happy', '?']]
        T = tokenizer.Tokenizer()
        assert T.process(test_strings) == tokenized_strings

class NgramTest(unittest.TestCase):
    train_sequences = [['the', 'dog', 'runs'], ['the', 'dog', 'jumps']]

    def test_frequency_tree(self):
        unigram_tree = ngram_model.NGramFrequencyTree()
        for sequence in self.train_sequences:
            for unigram in sequence:
                unigram_tree.add_ngram_observation([unigram])
        assert set(unigram_tree.get_all_continuations(())) == {'the', 'dog', 'runs', 'jumps'}
        self._assert_ngram_frequency(unigram_tree, ['the'], 6, 2)

    def _assert_ngram_frequency(self, tree, sequence, expected_total, expected_sequence_count):
        sequence_total, sequence_count = tree.get_ngram_frequency(sequence)
        assert sequence_total == expected_total and sequence_count == expected_sequence_count

    def test_train_mle_trigram_model(self):
        model = ngram_model.MLEModel(3)
        model.fit(self.train_sequences)
        for seq in self.train_sequences + [['the', 'dog', 'dog']]:
            print(seq, model.predict([seq]))

class PartitionTreeTest(unittest.TestCase):
    def test_partition_tree(self):
        tree = partition_tree.PartitionTree([(0.0, 0.5), (0.5, 1.0)], ['A', 'B'])
        values = [0.0, 0.3, 0.5, 0.7, 1.0]
        labels = [tree.get_label(v) for v in values]
        correct_labels = ['A', 'A', 'A', 'B', 'B']
        assert labels == correct_labels

class MultinomialSampleTest(unittest.TestCase):
    def test_biased_coin_flip(self):
        true_heads, true_tails = 0.3, 0.7
        P = [true_heads, true_tails]
        event_names = ['Heads', 'Tails']
        s = sampler.Multinomial_Sampler(P, event_names)
        total_samples = 1000000
        sample_counter = Counter([s.sample() for _ in range(total_samples)])
        # Roughly two standard errors at this sample size, so rare flakes are possible.
        allowed_error = 0.001
        head_frequency = sample_counter['Heads'] / total_samples
        tail_frequency = sample_counter['Tails'] / total_samples
        print(head_frequency, tail_frequency)
        assert true_heads - allowed_error <= head_frequency <= true_heads + allowed_error
        assert true_tails - allowed_error <= tail_frequency <= true_tails + allowed_error

class NGramSamplerTest(unittest.TestCase):
    def bigram_sample_from_corpus_model(self, test_sequences):
        N = 2
        sequence_tree = ngram_model.NGramFrequencyTree()
        T = tokenizer.Tokenizer()
        ngram_maker = ngram_model.NGramMaker(N)
        tokenized_sequences = T.process(test_sequences)
        prepared_sequences = [ngram_maker.make_ngrams(s) for s in tokenized_sequences]
        for sequence in prepared_sequences:
            for ngram in sequence:
                sequence_tree.add_ngram_observation(ngram)
        ngram_sampler = ngram_model.NGramSampler(sequence_tree, ngram_maker.starting_tokens)
        return ngram_sampler.sample_sequence()

    def test_single_length_sample(self):
        test_sequences = ['a.', 'a.']
        assert self.bigram_sample_from_corpus_model(test_sequences) == ['a', '.']

    def test_very_unlikely_sample(self):
        # Fails with probability ~1/10001 per run, by design.
        test_sequences = ['a.']*10000 + ['b.']
        assert self.bigram_sample_from_corpus_model(test_sequences) == ['a', '.']

    def test_long_sequence(self):
        test_sequences = ['a b c d e f g.']
        assert self.bigram_sample_from_corpus_model(test_sequences) == ['a', 'b', 'c', 'd', 'e', 'f', 'g', '.']

class SampleUtilityTest(unittest.TestCase):
    def test_utilities_sampler_construction(self):
        test_sequences = ['a b c d e f g.']
        N = 2
        sentence_sampler = utilities.SentenceSamplerUtility(test_sequences, N)
        assert sentence_sampler.get_sample() == 'a b c d e f g.'

    def test_two_sentence_language(self):
        test_sequences = ['a b c d.', 'e f g.']
        N = 2
        sentence_sampler = utilities.SentenceSamplerUtility(test_sequences, N)
        sample = sentence_sampler.get_sample()
        print(sample)
        assert sample in test_sequences


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/tokenizer.py:
--------------------------------------------------------------------------------
class Tokenizer(object):
    PUNCTUATION = ',.\'\\";:/?!'

    def __init__(self, delimiter=' '):
        self.delimiter = delimiter

    def process(self, sequences):
        tokenized_sequences = []
        for seq in sequences:
            preprocessed = self._preprocess_sequence(seq)
            tokenized_sequences.append(self._tokenize(preprocessed))
        return tokenized_sequences

    def _preprocess_sequence(self, seq):
        # Hook for subclasses; the base tokenizer leaves sequences unchanged.
        return seq

    def _tokenize(self, sequence):
        preprocessed_sequence = self._preprocess_punctuation(sequence)
        preprocessed_sequence = preprocessed_sequence.lstrip(self.delimiter)
        return preprocessed_sequence.split(self.delimiter)

    def _preprocess_punctuation(self, sequence):
        # str.replace returns a new string, so no defensive copy is needed.
        for punctuation_mark in self.PUNCTUATION:
            sequence = sequence.replace(punctuation_mark, self.delimiter + punctuation_mark)
        return sequence
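# Example behavior (illustrative): punctuation is split off as its own token.
#   Tokenizer().process(['Hello, world!'])  ->  [['Hello', ',', 'world', '!']]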
--------------------------------------------------------------------------------
/utilities.py:
--------------------------------------------------------------------------------
import re

import tokenizer
import ngram_model

# Reuse the tokenizer's punctuation set so the two stay in sync.
PUNCTUATION = tokenizer.Tokenizer.PUNCTUATION

class SentenceSamplerUtility(object):
    def __init__(self, sentences, n):
        ngram_maker = ngram_model.NGramMaker(n)
        ngram_tree = self._construct_ngram_tree_from_sentences(sentences, ngram_maker)
        self.sampler = ngram_model.NGramSampler(ngram_tree, default_initial_stem=ngram_maker.starting_tokens)

    def _construct_ngram_tree_from_sentences(self, sentences, ngram_maker):
        T = tokenizer.Tokenizer()
        ngram_tree = ngram_model.NGramFrequencyTree()

        tokenized_sentences = T.process(sentences)
        ngram_sequences = [ngram_maker.make_ngrams(tokenized_sentence) for tokenized_sentence in tokenized_sentences]
        for sequence in ngram_sequences:
            for ngram in sequence:
                ngram_tree.add_ngram_observation(ngram)
        return ngram_tree

    def get_sample(self):
        sampled_sequence = self.sampler.sample_sequence()
        sampled_sentence = ' '.join(sampled_sequence)
        # Reattach punctuation tokens to the preceding word.
        for p in PUNCTUATION:
            sampled_sentence = sampled_sentence.replace(' ' + p, p)
        return sampled_sentence

class DocumentPreProcessor(object):
    END_OF_SENTENCE_CHARS = '?.!'

    def __init__(self, reduce_whitespace=True, add_line_end_to_punctuation=True):
        self.preprocess_methods = []
        if reduce_whitespace:
            self.preprocess_methods.append(self._reduce_whitespace)
        if add_line_end_to_punctuation:
            self.preprocess_methods.append(self._add_line_end_to_punctuation)
        self.preprocess_methods.append(self.split_on_newline)

    def preprocess(self, document):
        for preprocess in self.preprocess_methods:
            document = preprocess(document)
        return document

    def _add_line_end_to_punctuation(self, document):
        processed_document = document.replace('\n', ' ')
        for eos in self.END_OF_SENTENCE_CHARS:
            processed_document = processed_document.replace(eos + ' ', eos + '\n')
        return processed_document

    def _reduce_whitespace(self, document):
        return re.sub(r"\s+", " ", document)

    def split_on_newline(self, document):
        return document.split('\n')
--------------------------------------------------------------------------------