# Text Generator/task/text_generator/text_generator.py
"""Markov-chain text generator over a whitespace-tokenized corpus.

Reads a corpus file, builds a first-order Markov chain from its bigrams
and, depending on ``mode``, prints corpus statistics, bigram counts,
chain lookups, or ten pseudo-random sentences.
"""

import random
from collections import Counter

from nltk.probability import FreqDist
from nltk.tokenize import WhitespaceTokenizer
from nltk.util import bigrams

# Tokens ending in one of these characters terminate a sentence.
SENTENCE_ENDINGS = (".", "!", "?")


def _build_markov_chain(bigrams_list):
    """Map each head word to a ``{tail: count}`` dict of observed successors."""
    chain = {}
    for (head, tail), count in Counter(bigrams_list).items():
        chain.setdefault(head, {})[tail] = count
    return chain


def _pick_start_word(tokens_list):
    """Return a random capitalized token that does not end a sentence."""
    while True:
        word = str(random.choice(tokens_list))
        if word[0].isupper() and not word.endswith(SENTENCE_ENDINGS):
            return word


def _generate_sentence(tokens_list, markov_chain):
    """Generate one pseudo-sentence: >= 5 tokens, ending in ``. ! ?``."""
    head = _pick_start_word(tokens_list)
    sentence = [head]
    while True:
        tails = markov_chain[head]
        # Weighted draw of the next word by observed bigram frequency.
        tail = random.choices(list(tails), list(tails.values()), k=1)[0]
        if tail.endswith(SENTENCE_ENDINGS):
            # Accept a terminator only once the sentence is long enough;
            # otherwise re-draw another successor for the same head.
            if len(sentence) > 4:
                sentence.append(tail)
                return sentence
        else:
            sentence.append(tail)
            head = tail


def text_analyzer(file_data, mode):
    """Analyze ``file_data`` according to ``mode``.

    Modes:
        "tokens"       -- print corpus statistics, then answer index queries.
        "bi-gram"      -- print bigram count, then answer index queries.
        "markov_chain" -- answer head-word queries with tail/count tables.
        "random_text"  -- print ten generated sentences and return.

    Interactive modes loop on ``input()`` until the user types ``exit``.
    """
    tokens_list = WhitespaceTokenizer().tokenize(file_data)
    bigrams_list = list(bigrams(tokens_list))
    markov_chain = _build_markov_chain(bigrams_list)

    # What index queries in the interactive loop resolve against.
    temp_list = tokens_list

    if mode == "tokens":
        freq = FreqDist(tokens_list)  # computed once, used twice
        print("Corpus statistics")
        print(f"All tokens: {freq.N()}")
        print(f"Unique tokens: {freq.B()}")
    elif mode == "bi-gram":
        temp_list = bigrams_list
        print(f"Number of bigrams: {len(bigrams_list)}")
    elif mode == "random_text":
        for _ in range(10):
            print(*_generate_sentence(tokens_list, markov_chain))
        return  # BUG FIX: was quit(), which killed the whole interpreter

    while True:
        answer = input()
        if answer == "exit":
            break
        try:
            if mode == "tokens":  # BUG FIX: was "token", which never matched
                print(temp_list[int(answer)])
            elif mode == "bi-gram":
                print(f"Head: {temp_list[int(answer)][0]}\t Tail: {temp_list[int(answer)][1]}")
            elif mode == "markov_chain":
                # Missing head raises KeyError, handled below.
                if markov_chain[answer]:
                    print(f"Head: {answer}")
                    for tail, count in markov_chain[answer].items():
                        print(f"Tail: {tail}\tCount: {count}")
        except IndexError:
            print("Index Error. Please input an integer that is in the range of the corpus.")
        except ValueError:
            print("Value Error. Please input an integer.")
        except TypeError:
            print("Type Error. Please input an integer.")
        except KeyError:
            print("Key Error. The requested word is not in the model. Please input another word.")


def main():
    """Read a corpus filename from stdin and generate random text from it."""
    filename = input()
    with open(filename, "r", encoding="utf-8") as file:
        file_data = file.read()

    text_analyzer(file_data, "random_text")


if __name__ == "__main__":
    main()