├── 1. Edit Distance ├── Problem_Statement.jpg ├── Solution.py └── readme.md ├── 2. Regex ├── Development Set.txt ├── Problem_Statement.jpg ├── Report.pdf ├── Solution.py └── readme.md ├── 3. Generative and Discriminative Models ├── Problem_Statement.jpg ├── Report.pdf ├── Solution.py └── readme.md ├── 4. HMM - Veterbi Algorithm ├── Problem_Statement.jpg ├── Report.pdf ├── Solution.py ├── Training set_HMM.txt ├── models │ ├── emission_dic.pkl │ ├── start_dic.pkl │ └── transition_dic.pkl ├── readme.md └── train_test_files │ ├── create_test.py │ ├── test.txt │ ├── test_output.txt │ └── train.txt ├── 5. NLP Tools ├── 1. Word Similarity - word2vec │ └── Solution.py ├── 2. Document Similarity - Doc2Vec │ └── Solution.py ├── 3. Spacy - Lemmatization, POS Tagging, NER, Word Similarity │ └── Solution.py ├── Problem_Statement.jpg ├── Report.pdf └── readme.md └── README.md /1. Edit Distance/Problem_Statement.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shrebox/Natural-Language-Processing/187e80e128e06094d1b9d798b3f727da54377ee3/1. Edit Distance/Problem_Statement.jpg -------------------------------------------------------------------------------- /1. Edit Distance/Solution.py: -------------------------------------------------------------------------------- 1 | # author @ Shreyash Arya 2015097 2 | # References mentioned with line; 3 | # majorly referenced "Speech and Language Processing by D. Jurafsky and J.H. Martin - Chapter 3 Minimum Edit Distance pg. 76 Algo; edition 2" 4 | import sys 5 | 6 | # -------------------------------Printing Matrices Function----------------------------------------------------------- 7 | 8 | def print_mat(distance): 9 | for i in range(len(distance)): 10 | print "" 11 | for j in range(len(distance[i])): 12 | sys.stdout.write(str(distance[i][j])+" ") 13 | sys.stdout.flush() # https://stackoverflow.com/questions/493386/how-to-print-without-newline-or-space 14 | print "" 15 | 16 | # ---------------------------------Minimum Edit Distance Function------------------------------------------------------ 17 | 18 | def min_edit_distance(target,source): 19 | 20 | n = len(target) 21 | m = len(source) 22 | 23 | distance = [ [0 for x in range(n+1)] for y in range(m+1) ] #list comprehension: https://stackoverflow.com/questions/6667201/how-to-define-a-two-dimensional-array-in-python 24 | distance[0][0] = 0 25 | 26 | ptr = [ [-1 for x in range(n+1)] for y in range(m+1) ] 27 | ptr[0][0] = -2 28 | 29 | for i in range(1,m+1): 30 | # print_mat(distance) 31 | distance[i][0] = distance[i-1][0]+1 # for each row, 1st column 32 | 33 | 34 | for j in range(1,n+1): # for each column 35 | # print_mat(distance) 36 | distance[0][j] = distance[0][j-1]+1 # 1st row, for all columns 37 | 38 | for j in range(1,n+1): 39 | print_mat(distance) 40 | for i in range(1,m+1): 41 | if target[j-1]==source[i-1]: 42 | distance[i][j] = min(distance[i-1][j-1],distance[i-1][j]+1,distance[i][j-1]+1) 43 | ptr[i][j] = 0 44 | else: 45 | distance[i][j] = min(distance[i-1][j-1]+2,distance[i-1][j]+1,distance[i][j-1]+1) 46 | subs = distance[i-1][j-1]+2 47 | delt = distance[i-1][j]+1 48 | insrt = distance[i][j-1]+1 49 | if subs<=delt and subs<=insrt: 50 | ptr[i][j] = 2 51 | elif delt\'\"]*\n[\n]+') # if we consider paragraph is 2 or more \n are present 7 | pat = re.compile(r'[.!?]+[\]})>\'\"]*([\n]+|\Z)') 8 | total = "" 9 | for i in inf: 10 | total+=i 11 | return len(pat.findall(total)) 12 | 13 | def count_words(infile): 14 | pat = 
re.compile(r'.*[A-Za-z0-9].*') 15 | data = "" 16 | with open(infile,'r') as f: 17 | for line in f: 18 | data+=line 19 | tok = re.split(r'\s+',data) 20 | filtered = [] 21 | for i in range(len(tok)): 22 | if pat.match(tok[i]): 23 | filtered.append(tok[i]) 24 | return len(filtered) 25 | 26 | def count_sentences(infile): 27 | inf = open(infile,'r') 28 | count = 0 29 | pat = re.compile(r'[.!?]+[\])}>\'\"]*(\n|\Z)|[.!?]+[\])}>\'\"]*[\s]+[\"\'{\[(<]*[A-Z0-9]') 30 | linn = 0 31 | for i in inf: 32 | linn+=1 33 | i = re.sub(r'Dr. |Ms. |Mr. |Mrs. |Er. ','',i) 34 | count+=len(pat.findall(i)) 35 | return count 36 | 37 | def task2(word,infile): 38 | pat = re.compile(r'[.!?]+[>)\'}\]\"]*(\s|\n|\Z)+[\'\"{\[\(<]*'+word+r'[^A-Za-z0-9]') 39 | data = "" 40 | with open(infile,'r') as f: 41 | for line in f: 42 | data+=line 43 | return len(pat.findall(data)) 44 | 45 | def task3(word,infile): 46 | inf = open(infile,'r') 47 | count=0 48 | pat = re.compile(r'[^A-Za-z0-9]'+word+r'([.!?]+[\])}>\'\"]*(\n|\Z)|[.!?]+[\])}>\'\"]*[\s]+[\"\'{\[(<]*[A-Z0-9])') 49 | linn=0 50 | for i in inf: 51 | linn+=1 52 | i = re.sub(r'Dr. |Ms. |Mr. |Mrs. |Er. ','',i) 53 | count+=len(pat.findall(i)) 54 | return count 55 | 56 | def task4(word,infile): 57 | inf = open(infile,'r') 58 | count=0 59 | pat = re.compile(r'(^|[^a-zA-Z0-9])['+word[0].lower()+word.upper()+r']'+word[1:len(word)]+r'([^a-zA-Z0-9]|$)') 60 | linn=0 61 | for i in inf: 62 | linn+=1 63 | count+=len(pat.findall(i)) 64 | return count 65 | 66 | file_name = raw_input("Input the file path: ") 67 | 68 | while(True): 69 | try: 70 | print "-------------------------------------------" 71 | print " 1. # of paragraphs, sentences and words\n 2. # of sentences starting with input word\n 3. # of sentences ending with input word\n 4. # of occurrences of input word\n 5. Change test set file\n 6. Exit!\n-------------------------------------------" 72 | option = raw_input("Enter: ") 73 | option = int(option) 74 | if option==1: 75 | print "" 76 | print "Paragraphs: "+ str(count_paragraphs(file_name)) 77 | print "Sentences: "+ str(count_sentences(file_name)) 78 | print "Words: "+ str(count_words(file_name)) 79 | print "" 80 | elif option==2: 81 | word = raw_input("Enter word: ") 82 | print task2(word,file_name) 83 | elif option==3: 84 | word = raw_input("Enter word: ") 85 | print task3(word,file_name) 86 | elif option==4: 87 | word = raw_input("Enter word: ") 88 | print task4(word,file_name) 89 | elif option==5: 90 | word = raw_input("Enter file path: ") 91 | file_name = word 92 | else: 93 | print "Bye!" 94 | break 95 | except: 96 | print "Enter valid option!!" 97 | 98 | 99 | -------------------------------------------------------------------------------- /2. Regex/readme.md: -------------------------------------------------------------------------------- 1 | ![alt text](https://github.com/shrebox/Natural-Language-Processing/blob/master/2.%20Regex/Problem_Statement.jpg) 2 | -------------------------------------------------------------------------------- /3. Generative and Discriminative Models/Problem_Statement.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shrebox/Natural-Language-Processing/187e80e128e06094d1b9d798b3f727da54377ee3/3. Generative and Discriminative Models/Problem_Statement.jpg -------------------------------------------------------------------------------- /3. 
Generative and Discriminative Models/Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shrebox/Natural-Language-Processing/187e80e128e06094d1b9d798b3f727da54377ee3/3. Generative and Discriminative Models/Report.pdf -------------------------------------------------------------------------------- /3. Generative and Discriminative Models/Solution.py: -------------------------------------------------------------------------------- 1 | import re, string, unicodedata 2 | import nltk 3 | import contractions 4 | import inflect 5 | from bs4 import BeautifulSoup 6 | from nltk import word_tokenize 7 | from nltk.corpus import stopwords 8 | # from nltk.stem import LancasterStemmer, WordNetLemmatizer 9 | from nltk.tokenize import TweetTokenizer 10 | import operator, random 11 | import sys 12 | import math 13 | import pickle 14 | 15 | def avg_sentence_length(infile): 16 | inf = open(infile,'r') 17 | count = 0 18 | pat = re.compile(r'[.!?]+[\])}>\'\"]*[\n]|[.!?]+[\])}>\'\"]*[\s]+[\"\'{\[(<]*[A-Z0-9]') 19 | linn = 0 20 | totallen = 0 21 | countlen = 0 22 | for i in inf: 23 | linn+=1 24 | i = re.sub(r'Dr. |Ms. |Mr. |Mrs. |Er. ','',i) 25 | val = pat.findall(i) 26 | count+=len(val) 27 | if len(val)>0: 28 | totallen+=len(i.split(" ")) 29 | countlen+=1 30 | # print val 31 | # break 32 | return totallen/float(countlen) 33 | # return count 34 | 35 | def sentence_start_words(infile): 36 | pat = re.compile(r'[.!?]+[>)\'}\]\"]*(\s|\n)+[\'\"{\[\(<]*[A-Za-z0-9]+') 37 | count = un =0 38 | word_freq = {} 39 | with open(infile,'r') as f: 40 | for line in f: 41 | val = pat.findall(line) 42 | if len(val)>0: 43 | for jk in range(len(val)): 44 | w = line.split(val[jk])[1] 45 | if len(w)>1: 46 | try: 47 | ww = w.split(' ')[0] 48 | if ww not in word_freq: 49 | word_freq[ww]=0 50 | word_freq[ww]+=1 51 | except Exception, e: 52 | un+=1 53 | else: 54 | if w not in word_freq: 55 | word_freq[w]=0 56 | word_freq[w]+=1 57 | count+=1 58 | 59 | del word_freq[''] 60 | sorted_xu = sorted(word_freq.items(), key=operator.itemgetter(1),reverse=True) 61 | return sorted_xu 62 | for i in range(100): 63 | print sorted_xu[i] 64 | 65 | def remove_html(data): 66 | return BeautifulSoup(data, "html.parser").get_text() 67 | 68 | def remove_btw_sqr(data): 69 | fin = re.sub('\[[^]]*\]', '', data) 70 | return fin 71 | 72 | def fix_contractions(data): 73 | fin = contractions.fix(data) 74 | return fin 75 | 76 | def words_tokenizer(data): 77 | words = nltk.word_tokenize(data) 78 | # tknzr = TweetTokenizer() 79 | # tknzr.tokenize(data) 80 | return words 81 | 82 | def remove_non_ascii(words): 83 | new_words = [] 84 | flag = 0 85 | for i in range(len(words)): 86 | flag = 1 87 | new_word = unicodedata.normalize('NFKD',words[i]) 88 | new_word = new_word.encode('ascii','ignore') 89 | new_word = new_word.decode('utf-8','ignore') 90 | flag+=1 91 | new_words.append(new_word) 92 | return new_words 93 | # def remove_non_ascii(words): 94 | # new_words = [] 95 | # flag = 0 96 | # for i in range(len(words)): 97 | # flag=1 98 | # new_word = unicodedata.normalize('NFKD', words[i]).encode('ascii', 'ignore').decode('utf-8', 'ignore') 99 | # flag+=1 100 | # new_words.append(new_word) 101 | # return new_words 102 | 103 | def to_lowercase(words): 104 | new_words = [] 105 | flag = 0 106 | for i in range(len(words)): 107 | new_word = words[i].lower() 108 | flag+=1 109 | new_words.append(new_word) 110 | return new_words 111 | 112 | def remove_punctuation(words): 113 | new_words = [] 114 | flag = 0 115 | 
for i in range(len(words)): 116 | flag+=1 117 | new_word = re.sub(r'([^\w\s])|_+', '', words[i]) 118 | if new_word != '': 119 | flag=0 120 | new_words.append(new_word) 121 | return new_words 122 | 123 | # def replace_numbers(words): 124 | # p = inflect.engine() 125 | # new_words = [] 126 | # flag = 0 127 | # for i in range(len(words)): 128 | # flag = 1 129 | # if words[i].isdigit(): 130 | # new_word = p.number_to_words(words[i]) 131 | # flag+=1 132 | # new_words.append(new_word) 133 | # else: 134 | # flag = 0 135 | # new_words.append(word) 136 | # return new_words 137 | 138 | def remove_stopwords(words): 139 | new_words = [] 140 | flag = 0 141 | for i in range(len(words)): 142 | flag = 1 143 | if words[i] not in stopwords.words('english'): 144 | flag+=1 145 | new_words.append(words[i]) 146 | return new_words 147 | 148 | def preprocess_data_unigram(data): 149 | data = remove_html(data) 150 | flag=0 151 | data = remove_btw_sqr(data) 152 | data = fix_contractions(data) 153 | flag+=1 154 | words = words_tokenizer(data) 155 | x = flag 156 | words = remove_non_ascii(words) 157 | words = to_lowercase(words) 158 | x+=1 159 | words = remove_punctuation(words) 160 | # words = replace_numbers(words) 161 | words = remove_stopwords(words) 162 | return words 163 | 164 | def preprocess_data_bitrigram(data): 165 | data = remove_html(data) 166 | flag=0 167 | data = remove_btw_sqr(data) 168 | x = flag 169 | data = fix_contractions(data) 170 | words = words_tokenizer(data) 171 | x+=1 172 | words = remove_non_ascii(words) 173 | words = to_lowercase(words) 174 | x+=2 175 | words = remove_punctuation(words) 176 | # words = replace_numbers(words) 177 | # words = remove_stopwords(words) 178 | return words 179 | 180 | def preprocess_input_sentence(data): 181 | # data = remove_html(data) 182 | # data = remove_btw_sqr(data) 183 | data = fix_contractions(data) 184 | words = words_tokenizer(data) 185 | # words = remove_non_ascii(words) 186 | words = to_lowercase(words) 187 | words = remove_punctuation(words) 188 | # words = replace_numbers(words) 189 | # words = remove_stopwords(words) 190 | return words 191 | 192 | def frequency_unigram(words): 193 | findic = {} 194 | for i in range(len(words)): 195 | if words[i] not in findic: 196 | findic[words[i]]=0 197 | findic[words[i]]+=1 198 | return findic 199 | 200 | def frequency_bigram(words): 201 | findic = {} 202 | findic[('',words[0])]=1 203 | for i in range(len(words)): 204 | if i==len(words)-1: 205 | if (words[i],'') not in findic: 206 | findic[(words[i],'')]=0 207 | findic[(words[i],'')]+=1 208 | else: 209 | if (words[i],words[i+1]) not in findic: 210 | findic[(words[i],words[i+1])]=0 211 | findic[(words[i],words[i+1])]+=1 212 | return findic 213 | 214 | def frequency_bigram_part2(words): 215 | findic = {} 216 | for i in range(len(words)-1): 217 | if (words[i],words[i+1]) not in findic: 218 | findic[(words[i],words[i+1])]=0 219 | findic[(words[i],words[i+1])]+=1 220 | return findic 221 | 222 | def frequency_trigram(words): 223 | findic = {} 224 | findic[('',words[0],words[1])]=1 225 | for i in range(len(words)-1): 226 | if i==len(words)-2: 227 | if (words[i],words[i+1],'') not in findic: 228 | findic[(words[i],words[i+1],'')]=0 229 | findic[(words[i],words[i+1],'')]+=1 230 | else: 231 | if (words[i],words[i+1],words[i+2]) not in findic: 232 | findic[(words[i],words[i+1],words[i+2])]=0 233 | findic[(words[i],words[i+1],words[i+2])]+=1 234 | return findic 235 | 236 | def generate_prob_dic_bigram(count_dictionaryb,wordsu_stop): 237 | prob_dic_bigram = {} 238 | for k,v 
in count_dictionaryb.iteritems(): 239 | prob_dic_bigram[k] = math.log(v/float(wordsu_stop[k[0]])) 240 | return prob_dic_bigram 241 | 242 | def generate_bigram_sentences(prob_dic_bigram,zerow,firstw,sentence_length): 243 | finalstr = zerow+" "+firstw+" " 244 | for i in range(sentence_length): 245 | tempdic = {} 246 | inc = [] 247 | for k,v in prob_dic_bigram.iteritems(): 248 | if k[0] == firstw: 249 | tempdic[k] = v 250 | sorttemp = sorted(tempdic.items(), key=operator.itemgetter(1),reverse=True) 251 | if (sorttemp[0][0] not in inc) or (sorttemp[0][0] in inc and len(sorttemp)==1): 252 | # print i, sorttemp[0][0] 253 | finalstr+=sorttemp[0][0][1]+" " 254 | firstw = sorttemp[0][0][1] 255 | prob_dic_bigram[sorttemp[0][0]]*=2 256 | inc.append(sorttemp[0]) 257 | else: 258 | # print i, sorttemp[1][0] 259 | finalstr+=sorttemp[1][0][1]+" " 260 | firstw = sorttemp[1][0][1] 261 | prob_dic_bigram[sorttemp[1][0]]*=2 262 | inc.append(sorttemp[1][0]) 263 | 264 | return finalstr 265 | 266 | def generate_prob_dic_trigram(count_dictionaryt,count_dictionaryb): 267 | prob_dic_trigram = {} 268 | for k,v in count_dictionaryt.iteritems(): 269 | k1 = str(k[0]) 270 | k2 = str(k[1]) 271 | kpass = (k1,k2) 272 | prob_dic_trigram[k] = math.log(v/float(count_dictionaryb[kpass])) 273 | return prob_dic_trigram 274 | 275 | def generate_trigram_sentence(prob_dic_trigram,zerow,firstw,secondw,sentence_length): 276 | finalstr = zerow+" "+secondw+" "+firstw+" " 277 | for i in range(sentence_length): 278 | tempdic = {} 279 | inc = [] 280 | for k,v in prob_dic_trigram.iteritems(): 281 | if k[0] == secondw and k[1]==firstw: 282 | tempdic[k] = v 283 | sorttemp = sorted(tempdic.items(), key=operator.itemgetter(1),reverse=True) 284 | if (sorttemp[0][0] not in inc) or (sorttemp[0][0] in inc and len(sorttemp)==1): 285 | # print i, sorttemp[0][0] 286 | finalstr+=sorttemp[0][0][2]+" " 287 | firstw = sorttemp[0][0][2] 288 | secondw = sorttemp[0][0][1] 289 | prob_dic_trigram[sorttemp[0][0]]*=2 290 | inc.append(sorttemp[0]) 291 | else: 292 | # print i, sorttemp[1][0] 293 | finalstr+=sorttemp[1][0][2]+" " 294 | firstw = sorttemp[1][0][2] 295 | secondw = sorttemp[1][0][1] 296 | prob_dic_trigram[sorttemp[1][0]]*=2 297 | inc.append(sorttemp[1][0]) 298 | return finalstr 299 | 300 | def smooth_prob_cal(bigrams,corpus_unigrams,corpus_bigrams): 301 | smooth_prob = {} 302 | for k,v in bigrams.iteritems(): 303 | c1 = c2 = 0 304 | try: 305 | c2 = corpus_bigrams[k] 306 | except Exception, e: 307 | print e 308 | try: 309 | c1 = corpus_unigrams[k[0]] 310 | except Exception, e: 311 | print e 312 | nprob = math.log(((c2+1)/float((c1+len(corpus_unigrams))))*v) 313 | smooth_prob[k] = nprob 314 | return smooth_prob 315 | 316 | # words = sentence_start_words('combine_data2.txt') 317 | 318 | # with open('motorcycle_sentence_startwords.pkl','wb') as f: 319 | # pickle.dump(words,f) 320 | print "Average sentence length for corpus: Computers: " + str(avg_sentence_length('combine_data2.txt')) 321 | print "Average sentence length for corpus: Motorcycle: " + str(avg_sentence_length('combine3.txt')) 322 | 323 | # def smooth_prob_cal_unk(bigrams,corpus_unigrams,corpus_bigrams): 324 | # smooth_prob = {} 325 | # for k,v in bigrams.iteritems(): 326 | # c1 = c2 = 0 327 | # try: 328 | # c2 = corpus_bigrams[k] 329 | # except Exception, e: 330 | # print e 331 | # try: 332 | # c1 = corpus_unigrams[k[0]] 333 | # except Exception, e: 334 | # print e 335 | # nprob = math.log((c2+1)/float((c1+len(corpus_unigrams)))) 336 | # smooth_prob[k] = nprob 337 | # return smooth_prob 338 | 
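# ---------------------------------------------------------------------------
# Minimal illustrative sketch of how the helper functions above combine
# (the toy sentence is hypothetical, not part of the assignment data):
# frequency_unigram / frequency_bigram build raw counts, and
# generate_prob_dic_bigram turns them into conditional log-probabilities
# log(count(w1, w2) / count(w1)).
_toy = ['the', 'cat', 'sat', 'on', 'the', 'mat']
_toy_uni = frequency_unigram(_toy)     # {'the': 2, 'cat': 1, 'sat': 1, ...}
_toy_uni[''] = 1                       # boundary token, mirroring the driver code below
_toy_big = frequency_bigram(_toy)      # {('', 'the'): 1, ('the', 'cat'): 1, ..., ('mat', ''): 1}
_toy_prob = generate_prob_dic_bigram(_toy_big, _toy_uni)
# _toy_prob[('the', 'cat')] == math.log(1 / 2.0), i.e. P(cat | the) = 1/2
# ---------------------------------------------------------------------------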
339 | #=======================================================part 1====================================================================== 340 | 341 | # #---------------data combining------------------------ 342 | # data = "" 343 | # with open('combine3.txt') as f: 344 | # for line in f: 345 | # data+=str(line) 346 | 347 | #-------------generating unigrams------------------------ 348 | print"\nUnigram: \n" 349 | 350 | # wordsu_stop = frequency_unigram(preprocess_data_bitrigram(data)) 351 | # wordsu_stop[''] = 1 352 | # wordsu_stop[''] = 1 353 | 354 | # wordsu = preprocess_data_unigram(data) 355 | # count_dictionaryu = frequency_unigram(wordsu) 356 | 357 | # with open('computers_unigrams_part1.pkl','wb') as f: 358 | # pickle.dump(count_dictionaryu,f) 359 | 360 | count_dictionaryu_part1 = {} 361 | with open('computers_unigrams_part1.pkl','rb') as f: 362 | count_dictionaryu_part1 = pickle.load(f) 363 | 364 | sorted_xu = sorted(count_dictionaryu_part1.items(), key=operator.itemgetter(1),reverse=True) 365 | 366 | for i in range(14): # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>need to change this to some threshold 367 | sys.stdout.write(str(sorted_xu[i][0])+" ") 368 | sys.stdout.flush() # https://stackoverflow.com/questions/493386/how-to-print-without-newline-or-space 369 | 370 | #------------generating bigrams--------------------- 371 | print "\n" 372 | print"\nBigrams: \n" 373 | 374 | # wordsb = preprocess_data_bitrigram(data) 375 | # count_dictionaryb = frequency_bigram(wordsb) 376 | # prob_dic_bigram = generate_prob_dic_bigram(count_dictionaryb,wordsu_stop) 377 | 378 | # with open('computers_bigram_prob.pkl','wb') as f: 379 | # pickle.dump(prob_dic_bigram,f) 380 | 381 | computers_bigram_prob = {} 382 | with open('computers_bigram_prob.pkl','rb') as f: 383 | computers_bigram_prob = pickle.load(f) 384 | 385 | #>>>>>>>>>>>>>>>>>>>>>>>>>> choose the start of the sentence using unigram frequency of the start word 386 | # finalstr = "i have " 387 | 388 | # computer_sentence_startwords = {} 389 | # with open('computer_sentence_startwords.pkl','rb') as f: 390 | # computer_sentence_startwords = pickle.load(f) 391 | 392 | #the original, the program, to the, to be 393 | 394 | zerow = "the" 395 | firstw = "original" 396 | 397 | finalstr = generate_bigram_sentences(computers_bigram_prob,zerow,firstw,14) 398 | print finalstr 399 | print "" 400 | 401 | zerow = "to" 402 | firstw = "the" 403 | 404 | finalstr = generate_bigram_sentences(computers_bigram_prob,zerow,firstw,14) 405 | print finalstr 406 | 407 | # # #---------------------generating trigrams---------------------------- 408 | 409 | print"\nTrigrams: \n" 410 | 411 | # count_dictionaryb = {} 412 | # with open('computers_bigrams.pkl','rb') as f: 413 | # count_dictionaryb = pickle.load(f) 414 | 415 | # wordst = preprocess_data_bitrigram(data) 416 | # count_dictionaryt = frequency_trigram(wordst) 417 | # prob_dic_trigram = generate_prob_dic_trigram(count_dictionaryt,count_dictionaryb) 418 | 419 | # with open('computers_trigram_prob.pkl','wb') as f: 420 | # pickle.dump(prob_dic_trigram,f) 421 | 422 | computers_trigram_prob = {} 423 | with open('computers_trigram_prob.pkl','rb') as f: 424 | computers_trigram_prob = pickle.load(f) 425 | 426 | # sorted_xt = sorted(count_dictionaryt.items(), key=operator.itemgetter(1),reverse=True) 427 | 428 | # In [390]: print sorted_xt[0] 429 | # ((u'the', u'egavgaadapter', u'by'), 0.0) 430 | 431 | # In [391]: print sorted_xt[1] 432 | # ((u'the', u'copierprinter', u'has'), 0.0) 433 | 434 | # In [392]: print sorted_xt[2] 
435 | # ((u'the', u'setting', u'are'), 0.0) 436 | 437 | # In [396]: print sorted_xt[0] 438 | # ((u'the', u'original', u'is'), -2.847812143477369) 439 | 440 | # In [397]: print sorted_xt[1] 441 | # ((u'the', u'original', u'still'), -3.1354942149291497) 442 | 443 | # In [398]: print sorted_xt[2] 444 | # ((u'the', u'original', u'fullcolor'), -3.1354942149291497) 445 | 446 | # In [402]: print sorted_xt[0] 447 | # ((u'to', u'mprenderrequest', u'icaseedu'), 0.0) 448 | 449 | # In [403]: print sorted_xt[1] 450 | # ((u'to', u'grassftpadmin', u'mooncecerarmymil'), 0.0) 451 | 452 | # In [404]: print sorted_xt[2] 453 | # ((u'to', u'specific', u'subdirectories'), 0.0) 454 | 455 | zerow = "the" 456 | secondw = "egavgaadapter" 457 | firstw = "by" 458 | 459 | finalstr = generate_trigram_sentence(computers_trigram_prob,zerow,firstw,secondw,14) 460 | print finalstr 461 | print "\n" 462 | 463 | zerow = "the" 464 | secondw = "copierprinter" 465 | firstw = "has" 466 | 467 | finalstr = generate_trigram_sentence(computers_trigram_prob,zerow,firstw,secondw,14) 468 | print finalstr 469 | 470 | # #=======================================================part 2====================================================================== 471 | 472 | # #---------------data combining------------------------ 473 | # data = "" 474 | # with open('combine_data2.txt') as f: 475 | # for line in f: 476 | # data+=str(line) 477 | 478 | #-------------generating unigrams------------------------ 479 | print"\nUnigram: \n" 480 | 481 | # wordsu_stop = frequency_unigram(preprocess_data_bitrigram(data)) 482 | # wordsu_stop[''] = 1 483 | # wordsu_stop[''] = 1 484 | 485 | # wordsu = preprocess_data_unigram(data) 486 | # count_dictionaryu = frequency_unigram(wordsu) 487 | 488 | # with open('motorcycle_unigrams_part1.pkl','wb') as f: 489 | # pickle.dump(count_dictionaryu,f) 490 | 491 | count_dictionaryu_part1 = {} 492 | with open('motorcycle_unigrams_part1.pkl','rb') as f: 493 | count_dictionaryu_part1 = pickle.load(f) 494 | 495 | sorted_xu = sorted(count_dictionaryu_part1.items(), key=operator.itemgetter(1),reverse=True) 496 | 497 | for i in range(14): # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>need to change this to some threshold 498 | sys.stdout.write(str(sorted_xu[i][0])+" ") 499 | sys.stdout.flush() # https://stackoverflow.com/questions/493386/how-to-print-without-newline-or-space 500 | 501 | # # #------------generating bigrams--------------------- 502 | print "\n" 503 | print"\nBigrams: \n" 504 | 505 | # wordsb = preprocess_data_bitrigram(data) 506 | # count_dictionaryb = frequency_bigram(wordsb) 507 | 508 | # motorcycles_unigrams = {} 509 | # with open('motorcycles_unigrams.pkl','rb') as f: 510 | # motorcycles_unigrams = pickle.load(f) 511 | 512 | # motorcycles_bigrams = {} 513 | # with open('motorcycles_bigrams.pkl','rb') as f: 514 | # motorcycles_bigrams = pickle.load(f) 515 | 516 | # prob_dic_bigram = generate_prob_dic_bigram(motorcycles_bigrams,motorcycles_unigrams) 517 | 518 | # with open('motorcycles_bigram_prob.pkl','wb') as f: 519 | # pickle.dump(prob_dic_bigram,f) 520 | 521 | motorcycles_bigram_prob = {} 522 | with open('motorcycles_bigram_prob.pkl','rb') as f: 523 | motorcycles_bigram_prob = pickle.load(f) 524 | #>>>>>>>>>>>>>>>>>>>>>>>>>> choose the start of the sentence using unigram frequency of the start word 525 | 526 | # the first, the dog, the ground, the only 527 | # to be, to do, to get, to a 528 | 529 | # finalstr = "i have " 530 | zerow = "the" 531 | firstw = "first" 532 | 533 | finalstr = 
generate_bigram_sentences(motorcycles_bigram_prob,zerow,firstw,14) 534 | print finalstr 535 | print "" 536 | 537 | zerow = "to" 538 | firstw = "be" 539 | 540 | finalstr = generate_bigram_sentences(motorcycles_bigram_prob,zerow,firstw,14) 541 | print finalstr 542 | 543 | # # #---------------------generating trigrams---------------------------- 544 | 545 | print"\nTrigrams: \n" 546 | 547 | # wordst = preprocess_data_bitrigram(data) 548 | # count_dictionaryt = frequency_trigram(wordst) 549 | 550 | # motorcycles_bigrams = {} 551 | # with open('motorcycles_bigrams.pkl','rb') as f: 552 | # motorcycles_bigrams = pickle.load(f) 553 | 554 | # prob_dic_trigram = generate_prob_dic_trigram(count_dictionaryt,motorcycles_bigrams) 555 | # # sorted_xt = sorted(count_dictionaryt.items(), key=operator.itemgetter(1),reverse=True 556 | 557 | # with open('motorcycles_trigram_prob.pkl','wb') as f: 558 | # pickle.dump(prob_dic_trigram,f) 559 | 560 | motorcycles_trigram_prob = {} 561 | with open('motorcycles_trigram_prob.pkl','rb') as f: 562 | motorcycles_trigram_prob = pickle.load(f) 563 | 564 | # In [441]: print sorted_xt[0] 565 | # ((u'the', u'leading', u'lady'), 0.0) 566 | 567 | # In [442]: print sorted_xt[1] 568 | # ((u'the', u'inspector', u'general'), 0.0) 569 | 570 | # In [443]: print sorted_xt[2] 571 | # ((u'the', u'areas', u'you'), 0.0) 572 | 573 | # In [447]: print sorted_xt[0] 574 | # ((u'to', u'treat', u'others'), 0.0) 575 | 576 | # In [448]: print sorted_xt[1] 577 | # ((u'to', u'boil', u'off'), 0.0) 578 | 579 | # In [449]: print sorted_xt[2] 580 | # ((u'to', u'coast', u'to'), 0.0) 581 | 582 | zerow = "the" 583 | secondw = "inspector" 584 | firstw = "general" 585 | 586 | finalstr = generate_trigram_sentence(motorcycles_trigram_prob,zerow,firstw,secondw,14) 587 | print finalstr 588 | print "" 589 | 590 | zerow = "to" 591 | secondw = "boil" 592 | firstw = "off" 593 | 594 | finalstr = generate_trigram_sentence(motorcycles_trigram_prob,zerow,firstw,secondw,14) 595 | print finalstr 596 | 597 | print "\n" 598 | #=======================================================part 3====================================================================== 599 | 600 | # print unigrams 601 | # print bigrams 602 | 603 | # data = "" 604 | # with open('combine3.txt') as f: 605 | # for line in f: 606 | # data+=str(line) 607 | 608 | # wordsu_stop = frequency_unigram(preprocess_data_bitrigram(data)) 609 | # wordsu_stop[''] = 1 610 | # wordsu_stop[''] = 1 611 | 612 | # with open('computers_unigrams.pkl','wb') as f: 613 | # pickle.dump(wordsu_stop,f) 614 | 615 | # wordsb = preprocess_data_bitrigram(data) 616 | # count_dictionaryb = frequency_bigram(wordsb) 617 | 618 | # with open('computers_bigrams.pkl','wb') as f: 619 | # pickle.dump(count_dictionaryb,f) 620 | 621 | computers_unigrams = {} 622 | with open('computers_unigrams.pkl','rb') as f: 623 | computers_unigrams = pickle.load(f) 624 | 625 | computers_bigrams = {} 626 | with open('computers_bigrams.pkl','rb') as f: 627 | computers_bigrams = pickle.load(f) 628 | 629 | motorcycles_unigrams = {} 630 | with open('motorcycles_unigrams.pkl','rb') as f: 631 | motorcycles_unigrams = pickle.load(f) 632 | 633 | motorcycles_bigrams = {} 634 | with open('motorcycles_bigrams.pkl','rb') as f: 635 | motorcycles_bigrams = pickle.load(f) 636 | 637 | input_sentence = str(raw_input("Enter the sentence: ")) 638 | 639 | input_sentence = preprocess_input_sentence(input_sentence) 640 | # input_sentence = input_sentence.split(" ") 641 | 642 | bigrams = frequency_bigram_part2(input_sentence) 643 | 644 
| smoothval1 = smooth_prob_cal(bigrams,computers_unigrams,computers_bigrams) 645 | 646 | finalprob1 = 0 647 | for k,v in smoothval1.iteritems(): 648 | # print k,v 649 | finalprob1+=v 650 | 651 | print finalprob1 652 | 653 | smoothval2 = smooth_prob_cal(bigrams,motorcycles_unigrams,motorcycles_bigrams) 654 | 655 | finalprob2 = 0 656 | for k,v in smoothval2.iteritems(): 657 | # print k,v 658 | finalprob2+=v 659 | 660 | print finalprob2 661 | 662 | print "\n" 663 | 664 | if finalprob2>finalprob1: 665 | print "motorcycles" 666 | else: 667 | print "computers" 668 | 669 | print "\n" 670 | 671 | #=======================================================part 4====================================================================== 672 | 673 | # data = "" 674 | # with open('combine_data2.txt') as f: 675 | # for line in f: 676 | # data+=str(line) 677 | 678 | # pp = preprocess_data_unigram(data) 679 | # wordsu_stop = frequency_unigram(pp) 680 | # wordsu_stop[''] = 1 681 | # wordsu_stop[''] = 1 682 | 683 | # with open('motorcycle_vocab.pkl','wb') as f: 684 | # pickle.dump(wordsu_stop,f) 685 | 686 | # threshold_words = [] 687 | # for k,v in wordsu_stop.iteritems(): 688 | # if v<2: 689 | # threshold_words.append(k) 690 | 691 | # for i in range(len(pp)): 692 | # if pp[i] in threshold_words: 693 | # pp[i] = '' 694 | 695 | # wordsu_stop = frequency_unigram(pp) 696 | # wordsu_stop[''] = 1 697 | # wordsu_stop[''] = 1 698 | 699 | # with open('motorcycle_unigrams_unk.pkl','wb') as f: 700 | # pickle.dump(wordsu_stop,f) 701 | 702 | # count_dictionaryb = frequency_bigram(pp) 703 | 704 | # with open('motorcycle_bigrams_unk.pkl','wb') as f: 705 | # pickle.dump(count_dictionaryb,f) 706 | 707 | computers_vocab = {} 708 | with open('computers_vocab.pkl','rb') as f: 709 | computers_vocab = pickle.load(f) 710 | 711 | computers_unigrams_unk = {} 712 | with open('computers_unigrams_unk.pkl','rb') as f: 713 | computers_unigrams_unk = pickle.load(f) 714 | 715 | computers_bigrams_unk = {} 716 | with open('computers_bigrams_unk.pkl','rb') as f: 717 | computers_bigrams_unk = pickle.load(f) 718 | 719 | motorcycles_vocab = {} 720 | with open('motorcycles_vocab.pkl','rb') as f: 721 | motorcycles_vocab = pickle.load(f) 722 | 723 | motorcycles_unigrams_unk = {} 724 | with open('motorcycles_unigrams_unk.pkl','rb') as f: 725 | motorcycles_unigrams_unk = pickle.load(f) 726 | 727 | motorcycles_bigrams_unk = {} 728 | with open('motorcycles_bigrams_unk.pkl','rb') as f: 729 | motorcycles_bigrams_unk = pickle.load(f) 730 | 731 | input_sentence = str(raw_input("Enter the sentence: ")) 732 | 733 | input_sentence = preprocess_input_sentence(input_sentence) 734 | # input_sentence = input_sentence.split(" ") 735 | 736 | inp1 = [] 737 | for i in range(len(input_sentence)): 738 | if input_sentence[i] not in computers_vocab: 739 | inp1.append('') 740 | else: 741 | inp1.append(input_sentence[i]) 742 | 743 | bigrams = frequency_bigram_part2(inp1) 744 | 745 | smoothval1 = smooth_prob_cal(bigrams,computers_unigrams_unk,computers_bigrams_unk) 746 | 747 | finalprob1 = 0 748 | for k,v in smoothval1.iteritems(): 749 | print k,v 750 | if k[0] == '' or k[1] == '': 751 | finalprob1+=v*2 752 | else: 753 | finalprob1+=v 754 | 755 | print finalprob1 756 | 757 | inp2 = [] 758 | for i in range(len(input_sentence)): 759 | if input_sentence[i] not in motorcycles_vocab: 760 | inp2.append('') 761 | else: 762 | inp2.append(input_sentence[i]) 763 | 764 | bigrams = frequency_bigram_part2(inp2) 765 | 766 | smoothval2 = 
smooth_prob_cal(bigrams,motorcycles_unigrams_unk,motorcycles_bigrams_unk) 767 | 768 | finalprob2 = 0 769 | for k,v in smoothval2.iteritems(): 770 | print k,v 771 | if k[0] == '' or k[1] == '': 772 | finalprob2+=v*2 773 | else: 774 | finalprob2+=v 775 | 776 | print finalprob2 777 | 778 | print "\n" 779 | 780 | if finalprob2>finalprob1: 781 | print "motorcycles" 782 | else: 783 | print "computers" 784 | 785 | # Several years ago, while driving a cage, a dog darted out at a quiet --> Motorcycles (Line 1753) 786 | # I currently have some grayscale image files that are not in any --> Computer (Line 11) 787 | 788 | -------------------------------------------------------------------------------- /3. Generative and Discriminative Models/readme.md: -------------------------------------------------------------------------------- 1 | ![alt text](https://github.com/shrebox/Natural-Language-Processing/blob/master/3.%20Generative%20and%20Discriminative%20Models/Problem_Statement.jpg) 2 | -------------------------------------------------------------------------------- /4. HMM - Veterbi Algorithm/Problem_Statement.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shrebox/Natural-Language-Processing/187e80e128e06094d1b9d798b3f727da54377ee3/4. HMM - Veterbi Algorithm/Problem_Statement.jpg -------------------------------------------------------------------------------- /4. HMM - Veterbi Algorithm/Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shrebox/Natural-Language-Processing/187e80e128e06094d1b9d798b3f727da54377ee3/4. HMM - Veterbi Algorithm/Report.pdf -------------------------------------------------------------------------------- /4. 
HMM - Veterbi Algorithm/Solution.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pickle 3 | 4 | def frequency_unigram(words): 5 | findic = {} 6 | for i in range(len(words)): 7 | if words[i] not in findic: 8 | findic[words[i]]=0 9 | findic[words[i]]+=1 10 | return findic 11 | 12 | def frequency_bigram_part2(words): 13 | findic = {} 14 | for i in range(len(words)-1): 15 | if (words[i],words[i+1]) not in findic: 16 | findic[(words[i],words[i+1])]=0 17 | findic[(words[i],words[i+1])]+=1 18 | return findic 19 | 20 | def get_prob_dic(dicty,tags_unique): 21 | findic = {} 22 | total = 0 23 | for k,v in dicty.iteritems(): 24 | total+=v 25 | for k,v in dicty.iteritems(): 26 | findic[k] = math.log((v+1)/((total*1.0)+len(tags_unique))) 27 | for i in range(len(tags_unique)): 28 | if tags_unique[i] not in findic: 29 | findic[tags_unique[i]] = math.log((1)/((total*1.0)+len(tags_unique))) 30 | return findic 31 | 32 | def Viterbit(obs, states, s_pro, t_pro, e_pro): 33 | # path = { s:[] for s in states} # init path: path[s] represents the path ends with s 34 | 35 | # Initializing step 36 | curr_pro = {} 37 | path = {} 38 | last_flag = 0 39 | for s in states: 40 | first_obs = obs[0] 41 | state_val = s 42 | emmi_prob = e_pro[first_obs][state_val] 43 | start_prob = s_pro[s] 44 | path[s] = [] 45 | curr_pro[s] = start_prob+emmi_prob 46 | 47 | # Recurssion Step 48 | total_state_counts=0 49 | for i in xrange(1, len(obs)): 50 | last_pro = curr_pro 51 | curr_pro = {} 52 | for curr_state in states: 53 | max_pro = -999999999 54 | last_sta = -1 55 | for last_state in states: 56 | last_state_prob = last_pro[last_state] # last stage probability 57 | transition_prob = t_pro[last_state][curr_state] # transition probability 58 | emmision_prob = e_pro[obs[i]][curr_state] # emission probability 59 | tempmax = last_state_prob+transition_prob+emmision_prob # log probabilities are added 60 | if tempmax>max_pro: 61 | max_pro = tempmax 62 | last_sta = last_state 63 | # max_pro, last_sta = max(((last_pro[last_state]*t_pro[last_state][curr_state]*e_pro[obs[i]][curr_state], last_state) for last_state in states)) 64 | curr_pro[curr_state] = max_pro 65 | total_state_counts+=1 66 | path[curr_state].append(last_sta) # storing the path for backtrack 67 | 68 | # Termination Step 69 | max_pro = -999999999 70 | last_flag = 1 71 | max_path = None 72 | for s in states: 73 | state_to_append = s 74 | path[state_to_append].append(state_to_append) 75 | cval = curr_pro[s] 76 | max_val = max_pro 77 | if cval <= max_val: 78 | pass 79 | else: 80 | max_path = path[s] 81 | max_pro = cval 82 | # print '%s: %s'%(curr_pro[s], path[s]) # different path and their probability 83 | # exit() 84 | return max_path 85 | 86 | words = [] 87 | tags = [] 88 | words_unigram = {} 89 | tags_unigram = {} 90 | tags_bigram = {} 91 | word_tag = {} 92 | start_tags_x = [] 93 | 94 | #--------------------------------data preprocessing--------------------------------------------- 95 | flag=1 # this flag is used for getting the start words 96 | with open('train.txt') as f: # data in train corpus 97 | for line in f: 98 | split = line.split('\t') 99 | if len(split)>1: 100 | if split[0] != ".": 101 | word_val = split[0] 102 | tag_val = split[1].split('\n')[0] 103 | if flag==1: 104 | start_tags_x.append(tag_val) 105 | flag=0 106 | words.append(word_val) # word list 107 | tags.append(tag_val) # tags list 108 | if word_val not in word_tag: # preparing words to tag count dictionary 109 | word_tag[word_val] = {} 110 | 
inner_tag_dic = word_tag[word_val] 111 | if tag_val not in inner_tag_dic: 112 | inner_tag_dic[tag_val]=0 113 | inner_tag_dic[tag_val]+=1 114 | else: 115 | flag=1 116 | 117 | words_unigram_x = frequency_unigram(words) 118 | tags_unigram = frequency_unigram(tags) 119 | tags_bigram = frequency_bigram_part2(tags) 120 | start_tags = frequency_unigram(start_tags_x) 121 | 122 | #---------------------------------------Unknown words handling-------------------------------- 123 | # making UNK for words with 1 frequency for unknown words: 124 | 125 | unk_words = [] 126 | count=0 127 | for k,v in words_unigram_x.iteritems(): 128 | if v!=1: 129 | words_unigram[k] = v 130 | else: 131 | unk_words.append(k) 132 | count+=1 133 | 134 | words_unigram[''] = count 135 | 136 | # handling UNK for word-tag dictionary 137 | unk_words_tags_extra = [] 138 | word_tag_x = word_tag 139 | word_tag = {} 140 | for k,v in word_tag_x.iteritems(): 141 | if k in unk_words: 142 | unk_words_tags_extra.append(v) 143 | else: 144 | word_tag[k] = v 145 | 146 | unk_tags_findic = {} 147 | for i in range(len(unk_words_tags_extra)): 148 | for k,v in unk_words_tags_extra[i].iteritems(): 149 | if k not in unk_tags_findic: 150 | unk_tags_findic[k] = 0 151 | unk_tags_findic[k]+=v 152 | 153 | word_tag[''] = unk_tags_findic 154 | 155 | # removing redundant values 156 | del words_unigram[';'] 157 | del tags_unigram[':'] 158 | del tags_bigram[(':','NN')] 159 | del word_tag[';'] 160 | 161 | #----------------------------------smoothing and probability dictionaries creation----------------------------------------------- 162 | 163 | # ||V|| values to smooth the probabilities 164 | tags_unique = [] 165 | for k,v in tags_unigram.iteritems(): 166 | if k not in tags_unique: 167 | tags_unique.append(k) 168 | 169 | # emission probabilities smoothing 170 | words_tag_prob = {} 171 | for k,v in word_tag.iteritems(): # word-tags probabilities 172 | temp = {} 173 | for key,val in v.iteritems(): # smoothing the values for which are present 174 | temp[key] = math.log((val+1.0)/((tags_unigram[key]*1.0)+(len(tags_unique)))) 175 | for i in range(len(tags_unique)): 176 | if tags_unique[i] not in temp: # smoothing absent values 177 | temp[tags_unique[i]] = math.log((1.0)/((tags_unigram[tags_unique[i]]*1.0)+(len(tags_unique)))) 178 | words_tag_prob[k] = temp 179 | 180 | tags_bigram_y = tags_bigram 181 | tags_bigram = {} 182 | for k,v in tags_bigram_y.iteritems(): 183 | tag1 = k[0] 184 | tag2 = k[1] 185 | if tag1 not in tags_bigram: 186 | tags_bigram[tag1] = {} 187 | tags_bigram[tag1][tag2] = v 188 | 189 | # transition probabilities smoothing 190 | tags_bigram_prob = {} 191 | for k,v in tags_bigram.iteritems(): # bigram tags probabilities 192 | temp = {} 193 | for key,val in v.iteritems(): # smoothing the values for which are present 194 | temp[key] = math.log((val+1.0)/((tags_unigram[k]*1.0)+(len(tags_unique)))) 195 | for i in range(len(tags_unique)): 196 | if tags_unique[i] not in temp: # smoothing absent values 197 | temp[tags_unique[i]] = math.log((1.0)/((tags_unigram[k]*1.0)+(len(tags_unique)))) 198 | tags_bigram_prob[k] = temp 199 | # tags_bigram_prob[k] = math.log(v/(tags_unigram[k[0]]*1.0)) 200 | 201 | 202 | start_tags_prob = get_prob_dic(start_tags,tags_unique) 203 | transtion_prob_tags = tags_bigram_prob 204 | emission_prob_word_tags = words_tag_prob 205 | 206 | # with open('emission_dic.pkl','wb') as f: 207 | # pickle.dump(emission_prob_word_tags,f) 208 | 209 | #--------------------------------------implementing 
veterbi-------------------------------------- 210 | 211 | # obs = ['i','want','to','have','dinner'] 212 | # obs = ['next','thursday'] 213 | # obs = ['like','to','go','to','a','fancy','japanese','restaurant'] 214 | # obs = ['dinner'] 215 | # obs = ['as','far','away', 'as', 'we', 'can', 'get'] 216 | test_data = [] 217 | with open('test.txt') as f: 218 | for line in f: 219 | test_data.append(line.split('\n')[0]) 220 | 221 | ccv=0 222 | obs = [] 223 | predicted_tags = [] 224 | towrite_data = [] 225 | for i in range(len(test_data)): 226 | ccv+=1 227 | if test_data[i] != '': 228 | # print test_data[i],obs 229 | if (test_data[i] not in words_unigram) and (test_data[i] != '.'): 230 | obs.append('') 231 | elif test_data[i] == '.': 232 | pass 233 | else: 234 | obs.append(test_data[i]) 235 | else: 236 | # print ccv 237 | # print obs 238 | if len(obs)>0: 239 | tempo = Viterbit(obs,tags_unique,start_tags_prob,transtion_prob_tags,emission_prob_word_tags) 240 | # print tempo 241 | for kama in range(len(tempo)): 242 | towrite_data.append(obs[kama]+"\t"+tempo[kama]+"\n") 243 | predicted_tags.append(tempo[kama]) 244 | towrite_data.append(str(".\t.\n")) 245 | towrite_data.append("\n") 246 | obs = [] 247 | 248 | accuracy_count = 0 249 | for i in range(len(tags)): 250 | if tags[i]==predicted_tags[i]: 251 | accuracy_count+=1 252 | 253 | print accuracy_count/(1.0*len(tags)) 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | -------------------------------------------------------------------------------- /4. HMM - Veterbi Algorithm/models/start_dic.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'PRP$' 3 | p1 4 | F0.0009527439024390244 5 | sS'VBG' 6 | p2 7 | F0.0013973577235772358 8 | sS'FW' 9 | p3 10 | F0.002223069105691057 11 | sS'VBN' 12 | p4 13 | F6.351626016260163e-05 14 | sS'VBP' 15 | p5 16 | F0.044334349593495935 17 | sS'WDT' 18 | p6 19 | F0.009463922764227643 20 | sS'HYPH' 21 | p7 22 | F6.351626016260163e-05 23 | sS'JJ' 24 | p8 25 | F0.01918191056910569 26 | sS'WP' 27 | p9 28 | F0.03398119918699187 29 | sS'VBZ' 30 | p10 31 | F0.027121443089430895 32 | sS'DT' 33 | p11 34 | F0.048208841463414635 35 | sS'RP' 36 | p12 37 | F0.0009527439024390244 38 | sS'NN' 39 | p13 40 | F0.04643038617886179 41 | sS'VBD' 42 | p14 43 | F0.0031122967479674797 44 | sS'POS' 45 | p15 46 | F6.351626016260163e-05 47 | sS'TO' 48 | p16 49 | F0.0036839430894308944 50 | sS'LS' 51 | p17 52 | F0.0006351626016260162 53 | sS'RB' 54 | p18 55 | F0.03976117886178862 56 | sS'NNS' 57 | p19 58 | F0.0032393292682926828 59 | sS'PRP' 60 | p20 61 | F0.3508638211382114 62 | sS'VB' 63 | p21 64 | F0.11801321138211382 65 | sS'WRB' 66 | p22 67 | F0.053671239837398375 68 | sS'CC' 69 | p23 70 | F0.0015879065040650406 71 | sS'PDT' 72 | p24 73 | F0.0003175813008130081 74 | sS'RBS' 75 | p25 76 | F6.351626016260163e-05 77 | sS'RBR' 78 | p26 79 | F0.00025406504065040653 80 | sS'CD' 81 | p27 82 | F0.013401930894308942 83 | sS'EX' 84 | p28 85 | F0.0005081300813008131 86 | sS'IN' 87 | p29 88 | F0.015752032520325202 89 | sS'MD' 90 | p30 91 | F0.036839430894308946 92 | sS'JJS' 93 | p31 94 | F0.0006351626016260162 95 | sS'JJR' 96 | p32 97 | F0.006034044715447154 98 | sS'UH' 99 | p33 100 | F0.11540904471544715 101 | sS'NNP' 102 | p34 103 | F0.0017784552845528454 104 | s. -------------------------------------------------------------------------------- /4. 
HMM - Veterbi Algorithm/models/transition_dic.pkl: -------------------------------------------------------------------------------- (text pickle, protocol 0: a nested dictionary mapping each POS tag to the smoothed probability of every possible following tag, e.g. 'PRP$' -> {'NN': 0.6405, 'JJ': 0.1144, 'NNS': 0.1078, 'DT': 0.0294, ...}; companion to start_dic.pkl above and emission_dic.pkl)
1181 | F0.001658374792703151 1182 | sS'IN' 1183 | p294 1184 | F0.0779436152570481 1185 | sS'MD' 1186 | p295 1187 | F0.014925373134328358 1188 | sg34 1189 | F0.001658374792703151 1190 | sS'JJR' 1191 | p296 1192 | F0.03648424543946932 1193 | sS'UH' 1194 | p297 1195 | F0.013266998341625208 1196 | sg22 1197 | F0.001658374792703151 1198 | ssS'POS' 1199 | p298 1200 | (dp299 1201 | g3 1202 | F0.0009727626459143969 1203 | sS'VBG' 1204 | p300 1205 | F0.016536964980544747 1206 | sS'FW' 1207 | p301 1208 | F0.03988326848249027 1209 | sS'VBN' 1210 | p302 1211 | F0.005836575875486381 1212 | sS'VBP' 1213 | p303 1214 | F0.0029182879377431907 1215 | sg8 1216 | F0.0009727626459143969 1217 | sS'JJ' 1218 | p304 1219 | F0.06712062256809338 1220 | sS'WP' 1221 | p305 1222 | F0.013618677042801557 1223 | sS'VBZ' 1224 | p306 1225 | F0.04377431906614786 1226 | sS'DT' 1227 | p307 1228 | F0.009727626459143969 1229 | sg14 1230 | F0.0009727626459143969 1231 | sS'NN' 1232 | p308 1233 | F0.5739299610894941 1234 | sg16 1235 | F0.0009727626459143969 1236 | sg17 1237 | F0.0009727626459143969 1238 | sg18 1239 | F0.0009727626459143969 1240 | sg9 1241 | F0.0009727626459143969 1242 | sg20 1243 | F0.0009727626459143969 1244 | sS'NNS' 1245 | p309 1246 | F0.005836575875486381 1247 | sS'PRP' 1248 | p310 1249 | F0.03599221789883268 1250 | sS'VB' 1251 | p311 1252 | F0.0622568093385214 1253 | sS'WRB' 1254 | p312 1255 | F0.017509727626459144 1256 | sS'CC' 1257 | p313 1258 | F0.005836575875486381 1259 | sg26 1260 | F0.0009727626459143969 1261 | sg27 1262 | F0.0009727626459143969 1263 | sg28 1264 | F0.0009727626459143969 1265 | sg29 1266 | F0.0009727626459143969 1267 | sg30 1268 | F0.0009727626459143969 1269 | sg31 1270 | F0.0009727626459143969 1271 | sS'IN' 1272 | p314 1273 | F0.03404669260700389 1274 | sS'MD' 1275 | p315 1276 | F0.013618677042801557 1277 | sg34 1278 | F0.0009727626459143969 1279 | sg35 1280 | F0.0009727626459143969 1281 | sS'UH' 1282 | p316 1283 | F0.0311284046692607 1284 | sS'NNP' 1285 | p317 1286 | F0.0048638132295719845 1287 | ssS'TO' 1288 | p318 1289 | (dp319 1290 | g3 1291 | F0.00019179133103183735 1292 | sg4 1293 | F0.00019179133103183735 1294 | sg16 1295 | F0.00019179133103183735 1296 | sg6 1297 | F0.00019179133103183735 1298 | sg41 1299 | F0.00019179133103183735 1300 | sg8 1301 | F0.00019179133103183735 1302 | sg120 1303 | F0.00019179133103183735 1304 | sg11 1305 | F0.00019179133103183735 1306 | sg12 1307 | F0.00019179133103183735 1308 | sg65 1309 | F0.00019179133103183735 1310 | sg14 1311 | F0.00019179133103183735 1312 | sS'NN' 1313 | p320 1314 | F0.0009589566551591868 1315 | sg5 1316 | F0.00019179133103183735 1317 | sg17 1318 | F0.00019179133103183735 1319 | sS'TO' 1320 | p321 1321 | F0.004794783275795934 1322 | sg9 1323 | F0.00019179133103183735 1324 | sS'RB' 1325 | p322 1326 | F0.007096279248177982 1327 | sg50 1328 | F0.00019179133103183735 1329 | sS'PRP' 1330 | p323 1331 | F0.0011507479861910242 1332 | sS'VB' 1333 | p324 1334 | F0.9334484081319524 1335 | sg24 1336 | F0.00019179133103183735 1337 | sg25 1338 | F0.00019179133103183735 1339 | sg26 1340 | F0.00019179133103183735 1341 | sg27 1342 | F0.00019179133103183735 1343 | sg28 1344 | F0.00019179133103183735 1345 | sg29 1346 | F0.00019179133103183735 1347 | sS'CD' 1348 | p325 1349 | F0.04181051016494054 1350 | sg31 1351 | F0.00019179133103183735 1352 | sS'IN' 1353 | p326 1354 | F0.0003835826620636747 1355 | sg33 1356 | F0.00019179133103183735 1357 | sg34 1358 | F0.00019179133103183735 1359 | sg35 1360 | F0.00019179133103183735 1361 | sS'UH' 1362 | p327 1363 
| F0.005370157268891446 1364 | sg22 1365 | F0.00019179133103183735 1366 | ssS'HYPH' 1367 | p328 1368 | (dp329 1369 | g3 1370 | F0.0012091898428053204 1371 | sS'VBG' 1372 | p330 1373 | F0.08464328899637243 1374 | sS'FW' 1375 | p331 1376 | F0.1003627569528416 1377 | sg6 1378 | F0.0012091898428053204 1379 | sg41 1380 | F0.0012091898428053204 1381 | sg8 1382 | F0.0012091898428053204 1383 | sS'JJ' 1384 | p332 1385 | F0.0036275695284159614 1386 | sg11 1387 | F0.0012091898428053204 1388 | sg12 1389 | F0.0012091898428053204 1390 | sg65 1391 | F0.0012091898428053204 1392 | sS'RP' 1393 | p333 1394 | F0.012091898428053204 1395 | sS'NN' 1396 | p334 1397 | F0.6880290205562273 1398 | sg16 1399 | F0.0012091898428053204 1400 | sg17 1401 | F0.0012091898428053204 1402 | sg18 1403 | F0.0012091898428053204 1404 | sg19 1405 | F0.0012091898428053204 1406 | sg20 1407 | F0.0012091898428053204 1408 | sS'NNS' 1409 | p335 1410 | F0.02539298669891173 1411 | sg9 1412 | F0.0012091898428053204 1413 | sg23 1414 | F0.0012091898428053204 1415 | sg24 1416 | F0.0012091898428053204 1417 | sg25 1418 | F0.0012091898428053204 1419 | sg26 1420 | F0.0012091898428053204 1421 | sg27 1422 | F0.0012091898428053204 1423 | sg28 1424 | F0.0012091898428053204 1425 | sg29 1426 | F0.0012091898428053204 1427 | sg30 1428 | F0.0012091898428053204 1429 | sg31 1430 | F0.0012091898428053204 1431 | sg32 1432 | F0.0012091898428053204 1433 | sg33 1434 | F0.0012091898428053204 1435 | sg34 1436 | F0.0012091898428053204 1437 | sg35 1438 | F0.0012091898428053204 1439 | sg172 1440 | F0.0012091898428053204 1441 | sS'NNP' 1442 | p336 1443 | F0.053204353083434096 1444 | ssS'RB' 1445 | p337 1446 | (dp338 1447 | g3 1448 | F0.00016561775422325274 1449 | sS'VBG' 1450 | p339 1451 | F0.004306061609804571 1452 | sS'FW' 1453 | p340 1454 | F0.001324942033786022 1455 | sS'VBN' 1456 | p341 1457 | F0.006955945677376615 1458 | sS'VBP' 1459 | p342 1460 | F0.028817489234845974 1461 | sS'WDT' 1462 | p343 1463 | F0.005299768135144088 1464 | sS'JJ' 1465 | p344 1466 | F0.14988406757204373 1467 | sS'WP' 1468 | p345 1469 | F0.004306061609804571 1470 | sS'VBZ' 1471 | p346 1472 | F0.016230539913878766 1473 | sS'DT' 1474 | p347 1475 | F0.043226233852268967 1476 | sg14 1477 | F0.00016561775422325274 1478 | sS'NN' 1479 | p348 1480 | F0.023352103345478634 1481 | sS'VBD' 1482 | p349 1483 | F0.005299768135144088 1484 | sg17 1485 | F0.00016561775422325274 1486 | sS'TO' 1487 | p350 1488 | F0.003146737330241802 1489 | sS'PRP' 1490 | p351 1491 | F0.10897648227890029 1492 | sS'RB' 1493 | p352 1494 | F0.12901623053991387 1495 | sS'NNS' 1496 | p353 1497 | F0.001324942033786022 1498 | sS'HYPH' 1499 | p354 1500 | F0.012255713812520701 1501 | sS'VB' 1502 | p355 1503 | F0.18482941371315004 1504 | sS'WRB' 1505 | p356 1506 | F0.007949652202716132 1507 | sS'CC' 1508 | p357 1509 | F0.005796621397813845 1510 | sg26 1511 | F0.00016561775422325274 1512 | sg27 1513 | F0.00016561775422325274 1514 | sS'RBS' 1515 | p358 1516 | F0.0003312355084465055 1517 | sS'RBR' 1518 | p359 1519 | F0.001324942033786022 1520 | sg30 1521 | F0.05382577012255714 1522 | sS'EX' 1523 | p360 1524 | F0.0011593242795627692 1525 | sS'IN' 1526 | p361 1527 | F0.11576681020205366 1528 | sS'MD' 1529 | p362 1530 | F0.012586949320967208 1531 | sg34 1532 | F0.00016561775422325274 1533 | sS'JJR' 1534 | p363 1535 | F0.028817489234845974 1536 | sS'UH' 1537 | p364 1538 | F0.041073203047366676 1539 | sS'NNP' 1540 | p365 1541 | F0.00182179529645578 1542 | ssS'NNS' 1543 | p366 1544 | (dp367 1545 | g3 1546 | F0.00016874789065136686 1547 | sS'VBG' 
1548 | p368 1549 | F0.005399932500843739 1550 | sS'FW' 1551 | p369 1552 | F0.0003374957813027337 1553 | sS'VBN' 1554 | p370 1555 | F0.002362470469119136 1556 | sS'VBP' 1557 | p371 1558 | F0.061592980087748904 1559 | sS'WDT' 1560 | p372 1561 | F0.017043536955788054 1562 | sS'JJ' 1563 | p373 1564 | F0.022949713128585892 1565 | sS'WP' 1566 | p374 1567 | F0.012993587580155248 1568 | sS'VBZ' 1569 | p375 1570 | F0.02733715828552143 1571 | sS'DT' 1572 | p376 1573 | F0.0539993250084374 1574 | sS'RP' 1575 | p377 1576 | F0.0016874789065136687 1577 | sS'NN' 1578 | p378 1579 | F0.03678704016199798 1580 | sS'VBD' 1581 | p379 1582 | F0.0030374620317246033 1583 | sg17 1584 | F0.01231859601754978 1585 | sS'TO' 1586 | p380 1587 | F0.0026999662504218697 1588 | sS'PRP' 1589 | p381 1590 | F0.1829227134660817 1591 | sS'RB' 1592 | p382 1593 | F0.0890988862639217 1594 | sS'NNS' 1595 | p383 1596 | F0.007256159298008775 1597 | sS'HYPH' 1598 | p384 1599 | F0.0020249746878164025 1600 | sS'VB' 1601 | p385 1602 | F0.043874451569355384 1603 | sS'WRB' 1604 | p386 1605 | F0.01940600742490719 1606 | sS'CC' 1607 | p387 1608 | F0.044380695241309484 1609 | sg26 1610 | F0.00016874789065136686 1611 | sg27 1612 | F0.00016874789065136686 1613 | sg28 1614 | F0.00016874789065136686 1615 | sg29 1616 | F0.00016874789065136686 1617 | sS'CD' 1618 | p388 1619 | F0.01619979750253122 1620 | sS'EX' 1621 | p389 1622 | F0.0018562267971650355 1623 | sS'IN' 1624 | p390 1625 | F0.2163347958150523 1626 | sS'MD' 1627 | p391 1628 | F0.015018562267971651 1629 | sg34 1630 | F0.00016874789065136686 1631 | sS'JJR' 1632 | p392 1633 | F0.010124873439082012 1634 | sS'UH' 1635 | p393 1636 | F0.08977387782652717 1637 | sg22 1638 | F0.00016874789065136686 1639 | ssS'NNP' 1640 | p394 1641 | (dp395 1642 | g3 1643 | F0.0012903225806451613 1644 | sg4 1645 | F0.0012903225806451613 1646 | sS'VBD' 1647 | p396 1648 | F0.00903225806451613 1649 | sS'VBN' 1650 | p397 1651 | F0.005161290322580645 1652 | sS'VBP' 1653 | p398 1654 | F0.0064516129032258064 1655 | sg8 1656 | F0.0012903225806451613 1657 | sS'JJ' 1658 | p399 1659 | F0.0025806451612903226 1660 | sS'WP' 1661 | p400 1662 | F0.01032258064516129 1663 | sS'VBZ' 1664 | p401 1665 | F0.05290322580645161 1666 | sS'DT' 1667 | p402 1668 | F0.027096774193548386 1669 | sg14 1670 | F0.0012903225806451613 1671 | sS'NN' 1672 | p403 1673 | F0.07096774193548387 1674 | sg29 1675 | F0.0012903225806451613 1676 | sS'FW' 1677 | p404 1678 | F0.01806451612903226 1679 | sS'POS' 1680 | p405 1681 | F0.2761290322580645 1682 | sS'TO' 1683 | p406 1684 | F0.014193548387096775 1685 | sS'HYPH' 1686 | p407 1687 | F0.027096774193548386 1688 | sS'RB' 1689 | p408 1690 | F0.04903225806451613 1691 | sg50 1692 | F0.0012903225806451613 1693 | sS'NNP' 1694 | p409 1695 | F0.05806451612903226 1696 | sS'VB' 1697 | p410 1698 | F0.016774193548387096 1699 | sS'WRB' 1700 | p411 1701 | F0.011612903225806452 1702 | sS'CC' 1703 | p412 1704 | F0.12 1705 | sg26 1706 | F0.0012903225806451613 1707 | sg27 1708 | F0.0012903225806451613 1709 | sg28 1710 | F0.0012903225806451613 1711 | sS'PRP' 1712 | p413 1713 | F0.11096774193548387 1714 | sS'CD' 1715 | p414 1716 | F0.0025806451612903226 1717 | sg31 1718 | F0.0012903225806451613 1719 | sS'IN' 1720 | p415 1721 | F0.03741935483870968 1722 | sS'MD' 1723 | p416 1724 | F0.005161290322580645 1725 | sg34 1726 | F0.0012903225806451613 1727 | sg35 1728 | F0.0012903225806451613 1729 | sS'UH' 1730 | p417 1731 | F0.05290322580645161 1732 | ssS'VB' 1733 | p418 1734 | (dp419 1735 | S'PRP$' 1736 | p420 1737 | F0.000962000962000962 
1738 | sS'VBG' 1739 | p421 1740 | F0.006184291898577613 1741 | sS'VBD' 1742 | p422 1743 | F0.0002748574177145606 1744 | sS'VBN' 1745 | p423 1746 | F0.005634577063148492 1747 | sS'VBP' 1748 | p424 1749 | F0.0020614306328592042 1750 | sS'WDT' 1751 | p425 1752 | F0.0008245722531436817 1753 | sS'JJ' 1754 | p426 1755 | F0.055383769669483954 1756 | sg11 1757 | F0.005497148354291211 1758 | sS'VBZ' 1759 | p427 1760 | F0.0020614306328592042 1761 | sS'DT' 1762 | p428 1763 | F0.12025012025012025 1764 | sg14 1765 | F0.060812203669346525 1766 | sS'NN' 1767 | p429 1768 | F0.08252593966879682 1769 | sg5 1770 | F6.871435442864014e-05 1771 | sS'POS' 1772 | p430 1773 | F0.000962000962000962 1774 | sg18 1775 | F0.1732288875146018 1776 | sS'HYPH' 1777 | p431 1778 | F0.0006871435442864014 1779 | sg20 1780 | F0.04727547584690442 1781 | sS'NNS' 1782 | p432 1783 | F0.0094138665567237 1784 | sS'NNP' 1785 | p433 1786 | F0.001992716278430564 1787 | sS'VB' 1788 | p434 1789 | F0.0037105751391465678 1790 | sg24 1791 | F0.006596578025149453 1792 | sS'CC' 1793 | p435 1794 | F0.006940149797292654 1795 | sg26 1796 | F6.871435442864014e-05 1797 | sS'PDT' 1798 | p436 1799 | F0.001443001443001443 1800 | sg28 1801 | F6.871435442864014e-05 1802 | sS'PRP' 1803 | p437 1804 | F0.18154332440046725 1805 | sS'RBR' 1806 | p438 1807 | F0.0010307153164296021 1808 | sg31 1809 | F6.871435442864014e-05 1810 | sg32 1811 | F0.14416271559128702 1812 | sS'CD' 1813 | p439 1814 | F0.02343159486016629 1815 | sS'MD' 1816 | p440 1817 | F0.0013055727341441626 1818 | sS'JJS' 1819 | p441 1820 | F0.0008932866075723219 1821 | sg35 1822 | F0.03621246478389335 1823 | sS'UH' 1824 | p442 1825 | F0.016422730708444994 1826 | ssg24 1827 | (dp443 1828 | g3 1829 | F0.0008503401360544217 1830 | sg4 1831 | F0.0008503401360544217 1832 | sS'VBD' 1833 | p444 1834 | F0.0017006802721088435 1835 | sg6 1836 | F0.0008503401360544217 1837 | sS'VBP' 1838 | p445 1839 | F0.025510204081632654 1840 | sg8 1841 | F0.0008503401360544217 1842 | sS'JJ' 1843 | p446 1844 | F0.10714285714285714 1845 | sg11 1846 | F0.0008503401360544217 1847 | sS'VBZ' 1848 | p447 1849 | F0.19387755102040816 1850 | sS'DT' 1851 | p448 1852 | F0.007653061224489796 1853 | sg14 1854 | F0.0008503401360544217 1855 | sS'NN' 1856 | p449 1857 | F0.00510204081632653 1858 | sg5 1859 | F0.0008503401360544217 1860 | sg17 1861 | F0.0008503401360544217 1862 | sS'TO' 1863 | p450 1864 | F0.011054421768707483 1865 | sg9 1866 | F0.0008503401360544217 1867 | sS'RB' 1868 | p451 1869 | F0.06292517006802721 1870 | sS'NNS' 1871 | p452 1872 | F0.003401360544217687 1873 | sS'PRP' 1874 | p453 1875 | F0.08673469387755102 1876 | sS'VB' 1877 | p454 1878 | F0.016156462585034014 1879 | sS'WRB' 1880 | p455 1881 | F0.004251700680272109 1882 | sg25 1883 | F0.0008503401360544217 1884 | sg26 1885 | F0.0008503401360544217 1886 | sg27 1887 | F0.0008503401360544217 1888 | sg28 1889 | F0.0008503401360544217 1890 | sg29 1891 | F0.0008503401360544217 1892 | sg30 1893 | F0.0008503401360544217 1894 | sS'EX' 1895 | p456 1896 | F0.01445578231292517 1897 | sS'IN' 1898 | p457 1899 | F0.3010204081632653 1900 | sS'MD' 1901 | p458 1902 | F0.14285714285714285 1903 | sg34 1904 | F0.0008503401360544217 1905 | sg35 1906 | F0.0008503401360544217 1907 | sg172 1908 | F0.0008503401360544217 1909 | sg22 1910 | F0.0008503401360544217 1911 | ssS'CC' 1912 | p459 1913 | (dp460 1914 | S'PRP$' 1915 | p461 1916 | F0.01908566355969818 1917 | sS'VBG' 1918 | p462 1919 | F0.0039946737683089215 1920 | sS'VBD' 1921 | p463 1922 | F0.006213936972924989 1923 | sg6 1924 | 
F0.0004438526409232135 1925 | sg41 1926 | F0.0004438526409232135 1927 | sg8 1928 | F0.0004438526409232135 1929 | sg9 1930 | F0.0004438526409232135 1931 | sS'JJ' 1932 | p464 1933 | F0.08610741233910342 1934 | sg11 1935 | F0.0004438526409232135 1936 | sS'VBZ' 1937 | p465 1938 | F0.0039946737683089215 1939 | sS'DT' 1940 | p466 1941 | F0.08788282290279627 1942 | sg14 1943 | F0.0004438526409232135 1944 | sS'NN' 1945 | p467 1946 | F0.09764758100310697 1947 | sg5 1948 | F0.0004438526409232135 1949 | sg17 1950 | F0.0004438526409232135 1951 | sg18 1952 | F0.0004438526409232135 1953 | sS'PRP' 1954 | p468 1955 | F0.22592099422991566 1956 | sS'RB' 1957 | p469 1958 | F0.10874389702618731 1959 | sS'NNS' 1960 | p470 1961 | F0.05015534842432313 1962 | sS'NNP' 1963 | p471 1964 | F0.023968042609853527 1965 | sS'VB' 1966 | p472 1967 | F0.02174877940523746 1968 | sS'WRB' 1969 | p473 1970 | F0.004882379050155349 1971 | sS'CC' 1972 | p474 1973 | F0.004882379050155349 1974 | sg26 1975 | F0.0004438526409232135 1976 | sg27 1977 | F0.0004438526409232135 1978 | sg28 1979 | F0.0004438526409232135 1980 | sg29 1981 | F0.0004438526409232135 1982 | sS'CD' 1983 | p475 1984 | F0.10031069684864626 1985 | sg31 1986 | F0.0004438526409232135 1987 | sS'IN' 1988 | p476 1989 | F0.044829116733244564 1990 | sS'MD' 1991 | p477 1992 | F0.010652463382157125 1993 | sg34 1994 | F0.0004438526409232135 1995 | sS'JJR' 1996 | p478 1997 | F0.033288948069241014 1998 | sS'UH' 1999 | p479 2000 | F0.05903240124278739 2001 | ssg26 2002 | (dp480 2003 | g3 2004 | F0.023255813953488372 2005 | sg4 2006 | F0.023255813953488372 2007 | sg5 2008 | F0.023255813953488372 2009 | sg6 2010 | F0.023255813953488372 2011 | sg41 2012 | F0.023255813953488372 2013 | sg8 2014 | F0.023255813953488372 2015 | sg120 2016 | F0.023255813953488372 2017 | sg11 2018 | F0.023255813953488372 2019 | sg12 2020 | F0.023255813953488372 2021 | sg65 2022 | F0.023255813953488372 2023 | sg14 2024 | F0.023255813953488372 2025 | sg141 2026 | F0.023255813953488372 2027 | sg16 2028 | F0.023255813953488372 2029 | sg17 2030 | F0.023255813953488372 2031 | sg18 2032 | F0.023255813953488372 2033 | sS'HYPH' 2034 | p481 2035 | F0.23255813953488372 2036 | sg20 2037 | F0.023255813953488372 2038 | sg50 2039 | F0.023255813953488372 2040 | sg22 2041 | F0.023255813953488372 2042 | sg23 2043 | F0.023255813953488372 2044 | sg24 2045 | F0.023255813953488372 2046 | sg25 2047 | F0.023255813953488372 2048 | sg26 2049 | F0.023255813953488372 2050 | sg27 2051 | F0.023255813953488372 2052 | sg28 2053 | F0.023255813953488372 2054 | sg19 2055 | F0.023255813953488372 2056 | sg29 2057 | F0.023255813953488372 2058 | sg31 2059 | F0.023255813953488372 2060 | sg32 2061 | F0.023255813953488372 2062 | sg30 2063 | F0.023255813953488372 2064 | sg33 2065 | F0.023255813953488372 2066 | sg34 2067 | F0.023255813953488372 2068 | sg35 2069 | F0.023255813953488372 2070 | sg172 2071 | F0.023255813953488372 2072 | ssg27 2073 | (dp482 2074 | g3 2075 | F0.006329113924050633 2076 | sg4 2077 | F0.006329113924050633 2078 | sg5 2079 | F0.006329113924050633 2080 | sg6 2081 | F0.006329113924050633 2082 | sg41 2083 | F0.006329113924050633 2084 | sg8 2085 | F0.006329113924050633 2086 | sg120 2087 | F0.006329113924050633 2088 | sg11 2089 | F0.006329113924050633 2090 | sg12 2091 | F0.006329113924050633 2092 | sS'DT' 2093 | p483 2094 | F0.7911392405063291 2095 | sg14 2096 | F0.006329113924050633 2097 | sg141 2098 | F0.006329113924050633 2099 | sg16 2100 | F0.006329113924050633 2101 | sg17 2102 | F0.006329113924050633 2103 | sg18 2104 | 
F0.006329113924050633 2105 | sg9 2106 | F0.006329113924050633 2107 | sg20 2108 | F0.006329113924050633 2109 | sg50 2110 | F0.006329113924050633 2111 | sg22 2112 | F0.006329113924050633 2113 | sg23 2114 | F0.006329113924050633 2115 | sg24 2116 | F0.006329113924050633 2117 | sg25 2118 | F0.006329113924050633 2119 | sg26 2120 | F0.006329113924050633 2121 | sg27 2122 | F0.006329113924050633 2123 | sg28 2124 | F0.006329113924050633 2125 | sg19 2126 | F0.006329113924050633 2127 | sg29 2128 | F0.006329113924050633 2129 | sg31 2130 | F0.006329113924050633 2131 | sg32 2132 | F0.006329113924050633 2133 | sg30 2134 | F0.006329113924050633 2135 | sg33 2136 | F0.006329113924050633 2137 | sg34 2138 | F0.006329113924050633 2139 | sg35 2140 | F0.006329113924050633 2141 | sg172 2142 | F0.006329113924050633 2143 | ssS'RBS' 2144 | p484 2145 | (dp485 2146 | g3 2147 | F0.013157894736842105 2148 | sg4 2149 | F0.013157894736842105 2150 | sg16 2151 | F0.013157894736842105 2152 | sg6 2153 | F0.013157894736842105 2154 | sg41 2155 | F0.013157894736842105 2156 | sg8 2157 | F0.013157894736842105 2158 | sS'JJ' 2159 | p486 2160 | F0.25 2161 | sg11 2162 | F0.013157894736842105 2163 | sg12 2164 | F0.013157894736842105 2165 | sS'DT' 2166 | p487 2167 | F0.10526315789473684 2168 | sg14 2169 | F0.013157894736842105 2170 | sg141 2171 | F0.013157894736842105 2172 | sg29 2173 | F0.013157894736842105 2174 | sg5 2175 | F0.013157894736842105 2176 | sg17 2177 | F0.013157894736842105 2178 | sg18 2179 | F0.013157894736842105 2180 | sg9 2181 | F0.013157894736842105 2182 | sS'RB' 2183 | p488 2184 | F0.2236842105263158 2185 | sg50 2186 | F0.013157894736842105 2187 | sg22 2188 | F0.013157894736842105 2189 | sg23 2190 | F0.013157894736842105 2191 | sg24 2192 | F0.013157894736842105 2193 | sg25 2194 | F0.013157894736842105 2195 | sg26 2196 | F0.013157894736842105 2197 | sg27 2198 | F0.013157894736842105 2199 | sg28 2200 | F0.013157894736842105 2201 | sg19 2202 | F0.013157894736842105 2203 | sS'CD' 2204 | p489 2205 | F0.02631578947368421 2206 | sg31 2207 | F0.013157894736842105 2208 | sg32 2209 | F0.013157894736842105 2210 | sg33 2211 | F0.013157894736842105 2212 | sg34 2213 | F0.013157894736842105 2214 | sg35 2215 | F0.013157894736842105 2216 | sg172 2217 | F0.013157894736842105 2218 | ssS'PRP' 2219 | p490 2220 | (dp491 2221 | g3 2222 | F7.702973347712218e-05 2223 | sS'VBG' 2224 | p492 2225 | F0.001386535202588199 2226 | sg16 2227 | F0.030734863657371745 2228 | sS'VBN' 2229 | p493 2230 | F0.00046217840086273303 2231 | sg41 2232 | F0.32152210753350796 2233 | sS'WDT' 2234 | p494 2235 | F0.0010784162686797104 2236 | sS'JJ' 2237 | p495 2238 | F0.011246341087659836 2239 | sS'WP' 2240 | p496 2241 | F0.001309505469111077 2242 | sg12 2243 | F0.024803574179633338 2244 | sS'DT' 2245 | p497 2246 | F0.036358034201201664 2247 | sS'RP' 2248 | p498 2249 | F0.0006162378678169774 2250 | sS'NN' 2251 | p499 2252 | F0.013249114158065014 2253 | sg5 2254 | F7.702973347712218e-05 2255 | sg17 2256 | F7.702973347712218e-05 2257 | sS'TO' 2258 | p500 2259 | F0.005700200277307041 2260 | sS'LS' 2261 | p501 2262 | F0.0006932676012940995 2263 | sS'RB' 2264 | p502 2265 | F0.02095208750577723 2266 | sS'NNS' 2267 | p503 2268 | F0.001386535202588199 2269 | sS'HYPH' 2270 | p504 2271 | F0.00038514866738561084 2272 | sS'VB' 2273 | p505 2274 | F0.1493606532121399 2275 | sS'WRB' 2276 | p506 2277 | F0.0015405946695424434 2278 | sS'CC' 2279 | p507 2280 | F0.0007702973347712217 2281 | sg27 2282 | F0.001309505469111077 2283 | sg28 2284 | F7.702973347712218e-05 2285 | sS'RBR' 2286 | 
p508 2287 | F0.025959020181790173 2288 | sS'CD' 2289 | p509 2290 | F0.00023108920043136651 2291 | sS'PRP' 2292 | p510 2293 | F0.009551686951163148 2294 | sg31 2295 | F7.702973347712218e-05 2296 | sS'IN' 2297 | p511 2298 | F0.0476043752888615 2299 | sg33 2300 | F0.2721460483746726 2301 | sg34 2302 | F7.702973347712218e-05 2303 | sS'JJR' 2304 | p512 2305 | F0.010321984285934371 2306 | sS'UH' 2307 | p513 2308 | F0.008781389616391928 2309 | sg22 2310 | F7.702973347712218e-05 2311 | ssS'RBR' 2312 | p514 2313 | (dp515 2314 | g3 2315 | F0.002150537634408602 2316 | sg4 2317 | F0.002150537634408602 2318 | sg16 2319 | F0.002150537634408602 2320 | sg6 2321 | F0.002150537634408602 2322 | sg41 2323 | F0.002150537634408602 2324 | sg8 2325 | F0.002150537634408602 2326 | sS'JJ' 2327 | p516 2328 | F0.13333333333333333 2329 | sg11 2330 | F0.002150537634408602 2331 | sg12 2332 | F0.002150537634408602 2333 | sg65 2334 | F0.002150537634408602 2335 | sg14 2336 | F0.002150537634408602 2337 | sg141 2338 | F0.002150537634408602 2339 | sg5 2340 | F0.002150537634408602 2341 | sg17 2342 | F0.002150537634408602 2343 | sg18 2344 | F0.002150537634408602 2345 | sS'PRP' 2346 | p517 2347 | F0.034408602150537634 2348 | sS'RB' 2349 | p518 2350 | F0.025806451612903226 2351 | sg50 2352 | F0.002150537634408602 2353 | sg9 2354 | F0.002150537634408602 2355 | sS'VB' 2356 | p519 2357 | F0.008602150537634409 2358 | sg24 2359 | F0.002150537634408602 2360 | sS'CC' 2361 | p520 2362 | F0.015053763440860216 2363 | sg26 2364 | F0.002150537634408602 2365 | sg27 2366 | F0.002150537634408602 2367 | sg28 2368 | F0.002150537634408602 2369 | sg29 2370 | F0.002150537634408602 2371 | sg30 2372 | F0.002150537634408602 2373 | sg31 2374 | F0.002150537634408602 2375 | sS'IN' 2376 | p521 2377 | F0.7225806451612903 2378 | sg33 2379 | F0.002150537634408602 2380 | sg34 2381 | F0.002150537634408602 2382 | sg35 2383 | F0.002150537634408602 2384 | sg172 2385 | F0.002150537634408602 2386 | sg22 2387 | F0.002150537634408602 2388 | ssS'EX' 2389 | p522 2390 | (dp523 2391 | g3 2392 | F0.002105263157894737 2393 | sg4 2394 | F0.002105263157894737 2395 | sS'VBD' 2396 | p524 2397 | F0.04631578947368421 2398 | sg6 2399 | F0.002105263157894737 2400 | sS'VBP' 2401 | p525 2402 | F0.014736842105263158 2403 | sg8 2404 | F0.002105263157894737 2405 | sg9 2406 | F0.002105263157894737 2407 | sS'JJ' 2408 | p526 2409 | F0.05263157894736842 2410 | sg11 2411 | F0.002105263157894737 2412 | sS'VBZ' 2413 | p527 2414 | F0.1031578947368421 2415 | sS'DT' 2416 | p528 2417 | F0.6652631578947369 2418 | sg14 2419 | F0.002105263157894737 2420 | sS'NN' 2421 | p529 2422 | F0.02526315789473684 2423 | sg5 2424 | F0.002105263157894737 2425 | sg17 2426 | F0.002105263157894737 2427 | sg18 2428 | F0.002105263157894737 2429 | sg19 2430 | F0.002105263157894737 2431 | sS'RB' 2432 | p530 2433 | F0.021052631578947368 2434 | sg50 2435 | F0.002105263157894737 2436 | sg22 2437 | F0.002105263157894737 2438 | sg23 2439 | F0.002105263157894737 2440 | sg24 2441 | F0.002105263157894737 2442 | sg25 2443 | F0.002105263157894737 2444 | sg26 2445 | F0.002105263157894737 2446 | sg27 2447 | F0.002105263157894737 2448 | sg28 2449 | F0.002105263157894737 2450 | sg29 2451 | F0.002105263157894737 2452 | sg30 2453 | F0.002105263157894737 2454 | sg31 2455 | F0.002105263157894737 2456 | sg32 2457 | F0.002105263157894737 2458 | sg33 2459 | F0.002105263157894737 2460 | sg34 2461 | F0.002105263157894737 2462 | sg35 2463 | F0.002105263157894737 2464 | sS'UH' 2465 | p531 2466 | F0.016842105263157894 2467 | ssS'IN' 2468 | p532 
2469 | (dp533 2470 | S'PRP$' 2471 | p534 2472 | F0.010744608911263056 2473 | sS'VBG' 2474 | p535 2475 | F0.016229619054774963 2476 | sS'VBD' 2477 | p536 2478 | F0.000751371252535878 2479 | sS'VBN' 2480 | p537 2481 | F0.0004508227515215268 2482 | sS'VBP' 2483 | p538 2484 | F0.0006010970020287024 2485 | sS'WDT' 2486 | p539 2487 | F0.000751371252535878 2488 | sg9 2489 | F7.51371252535878e-05 2490 | sS'JJ' 2491 | p540 2492 | F0.07250732586971223 2493 | sS'WP' 2494 | p541 2495 | F0.002554662258621985 2496 | sS'VBZ' 2497 | p542 2498 | F0.0001502742505071756 2499 | sg65 2500 | F0.20820497407769178 2501 | sg14 2502 | F7.51371252535878e-05 2503 | sS'NN' 2504 | p543 2505 | F0.3326320534976332 2506 | sg5 2507 | F0.0065369298970621385 2508 | sg17 2509 | F7.51371252535878e-05 2510 | sS'TO' 2511 | p544 2512 | F0.005184461642497558 2513 | sS'PRP' 2514 | p545 2515 | F0.038319933879329776 2516 | sS'RB' 2517 | p546 2518 | F0.01194680291532046 2519 | sS'NNS' 2520 | p547 2521 | F0.057029078067473135 2522 | sg22 2523 | F0.03073108422871741 2524 | sS'VB' 2525 | p548 2526 | F0.0019535652565932826 2527 | sS'WRB' 2528 | p549 2529 | F0.0002254113757607634 2530 | sg25 2531 | F7.51371252535878e-05 2532 | sg26 2533 | F7.51371252535878e-05 2534 | sS'PDT' 2535 | p550 2536 | F0.0062363813960477875 2537 | sg484 2538 | F0.0006010970020287024 2539 | sS'RBR' 2540 | p551 2541 | F0.001051919753550229 2542 | sS'CD' 2543 | p552 2544 | F0.14892178225261102 2545 | sS'EX' 2546 | p553 2547 | F0.000751371252535878 2548 | sS'IN' 2549 | p554 2550 | F0.010519197535502291 2551 | sS'MD' 2552 | p555 2553 | F0.0001502742505071756 2554 | sS'JJS' 2555 | p556 2556 | F0.008039672402133895 2557 | sS'JJR' 2558 | p557 2559 | F0.005409873018258321 2560 | sS'UH' 2561 | p558 2562 | F0.02043729806897588 2563 | ssS'CD' 2564 | p559 2565 | (dp560 2566 | g3 2567 | F0.00023860653781913624 2568 | sg4 2569 | F0.00023860653781913624 2570 | sg5 2571 | F0.00023860653781913624 2572 | sg6 2573 | F0.00023860653781913624 2574 | sS'VBP' 2575 | p561 2576 | F0.003101884991648771 2577 | sS'WDT' 2578 | p562 2579 | F0.0021474588403722263 2580 | sg120 2581 | F0.00023860653781913624 2582 | sg11 2583 | F0.00023860653781913624 2584 | sS'VBZ' 2585 | p563 2586 | F0.003101884991648771 2587 | sS'DT' 2588 | p564 2589 | F0.0023860653781913625 2590 | sg14 2591 | F0.00023860653781913624 2592 | sS'NN' 2593 | p565 2594 | F0.0816034359341446 2595 | sg16 2596 | F0.00023860653781913624 2597 | sg17 2598 | F0.00023860653781913624 2599 | sS'TO' 2600 | p566 2601 | F0.02290622763063708 2602 | sg9 2603 | F0.00023860653781913624 2604 | sg20 2605 | F0.00023860653781913624 2606 | sS'NNS' 2607 | p567 2608 | F0.7368169887854927 2609 | sS'PRP' 2610 | p568 2611 | F0.008589835361488905 2612 | sS'VB' 2613 | p569 2614 | F0.0004772130756382725 2615 | sS'WRB' 2616 | p570 2617 | F0.0004772130756382725 2618 | sS'CC' 2619 | p571 2620 | F0.046766881412550705 2621 | sg26 2622 | F0.00023860653781913624 2623 | sg27 2624 | F0.00023860653781913624 2625 | sg28 2626 | F0.00023860653781913624 2627 | sg29 2628 | F0.00023860653781913624 2629 | sS'CD' 2630 | p572 2631 | F0.07015032211882606 2632 | sg31 2633 | F0.00023860653781913624 2634 | sS'IN' 2635 | p573 2636 | F0.010021474588403722 2637 | sg33 2638 | F0.00023860653781913624 2639 | sg34 2640 | F0.00023860653781913624 2641 | sg35 2642 | F0.00023860653781913624 2643 | sS'UH' 2644 | p574 2645 | F0.006680983058935815 2646 | sg22 2647 | F0.00023860653781913624 2648 | ssS'MD' 2649 | p575 2650 | (dp576 2651 | g3 2652 | F0.00020044097013429546 2653 | sg4 2654 | 
F0.00020044097013429546 2655 | sS'VBD' 2656 | p577 2657 | F0.0006013229104028864 2658 | sg6 2659 | F0.00020044097013429546 2660 | sg41 2661 | F0.00020044097013429546 2662 | sg8 2663 | F0.00020044097013429546 2664 | sg120 2665 | F0.00020044097013429546 2666 | sg11 2667 | F0.00020044097013429546 2668 | sg12 2669 | F0.00020044097013429546 2670 | sS'DT' 2671 | p578 2672 | F0.0010022048506714773 2673 | sg14 2674 | F0.00020044097013429546 2675 | sg141 2676 | F0.00020044097013429546 2677 | sg5 2678 | F0.00020044097013429546 2679 | sg17 2680 | F0.00020044097013429546 2681 | sS'TO' 2682 | p579 2683 | F0.0020044097013429546 2684 | sg9 2685 | F0.00020044097013429546 2686 | sS'RB' 2687 | p580 2688 | F0.025255562236921228 2689 | sg50 2690 | F0.00020044097013429546 2691 | sS'PRP' 2692 | p581 2693 | F0.17298055722589697 2694 | sg23 2695 | F0.7907396271797955 2696 | sS'WRB' 2697 | p582 2698 | F0.0010022048506714773 2699 | sg25 2700 | F0.00020044097013429546 2701 | sg26 2702 | F0.00020044097013429546 2703 | sg27 2704 | F0.00020044097013429546 2705 | sg28 2706 | F0.00020044097013429546 2707 | sg29 2708 | F0.00020044097013429546 2709 | sg30 2710 | F0.00020044097013429546 2711 | sg31 2712 | F0.00020044097013429546 2713 | sg32 2714 | F0.00020044097013429546 2715 | sS'MD' 2716 | p583 2717 | F0.0010022048506714773 2718 | sg34 2719 | F0.00020044097013429546 2720 | sg35 2721 | F0.00020044097013429546 2722 | sS'UH' 2723 | p584 2724 | F0.0004008819402685909 2725 | sg22 2726 | F0.00020044097013429546 2727 | ssS'JJS' 2728 | p585 2729 | (dp586 2730 | g3 2731 | F0.002531645569620253 2732 | sg4 2733 | F0.002531645569620253 2734 | sg5 2735 | F0.002531645569620253 2736 | sg6 2737 | F0.002531645569620253 2738 | sS'VBP' 2739 | p587 2740 | F0.007594936708860759 2741 | sg8 2742 | F0.002531645569620253 2743 | sS'JJ' 2744 | p588 2745 | F0.11139240506329114 2746 | sS'WP' 2747 | p589 2748 | F0.007594936708860759 2749 | sg12 2750 | F0.002531645569620253 2751 | sg65 2752 | F0.002531645569620253 2753 | sg14 2754 | F0.002531645569620253 2755 | sS'NN' 2756 | p590 2757 | F0.2936708860759494 2758 | sg16 2759 | F0.002531645569620253 2760 | sg17 2761 | F0.002531645569620253 2762 | sg18 2763 | F0.002531645569620253 2764 | sg9 2765 | F0.002531645569620253 2766 | sS'RB' 2767 | p591 2768 | F0.060759493670886074 2769 | sS'NNS' 2770 | p592 2771 | F0.043037974683544304 2772 | sS'PRP' 2773 | p593 2774 | F0.010126582278481013 2775 | sS'VB' 2776 | p594 2777 | F0.007594936708860759 2778 | sg24 2779 | F0.002531645569620253 2780 | sg25 2781 | F0.002531645569620253 2782 | sg26 2783 | F0.002531645569620253 2784 | sg27 2785 | F0.002531645569620253 2786 | sg28 2787 | F0.002531645569620253 2788 | sg29 2789 | F0.002531645569620253 2790 | sS'CD' 2791 | p595 2792 | F0.28607594936708863 2793 | sg31 2794 | F0.002531645569620253 2795 | sS'IN' 2796 | p596 2797 | F0.035443037974683546 2798 | sS'MD' 2799 | p597 2800 | F0.02278481012658228 2801 | sS'JJS' 2802 | p598 2803 | F0.017721518987341773 2804 | sg35 2805 | F0.002531645569620253 2806 | sS'UH' 2807 | p599 2808 | F0.043037974683544304 2809 | sg22 2810 | F0.002531645569620253 2811 | ssS'JJR' 2812 | p600 2813 | (dp601 2814 | g3 2815 | F0.0005824111822947001 2816 | sg4 2817 | F0.0005824111822947001 2818 | sS'VBD' 2819 | p602 2820 | F0.0011648223645894002 2821 | sg6 2822 | F0.0005824111822947001 2823 | sS'VBP' 2824 | p603 2825 | F0.0052417006406523005 2826 | sg8 2827 | F0.0005824111822947001 2828 | sS'JJ' 2829 | p604 2830 | F0.0023296447291788003 2831 | sS'WP' 2832 | p605 2833 | F0.0011648223645894002 2834 | sg12 
2835 | F0.0005824111822947001 2836 | sS'DT' 2837 | p606 2838 | F0.0011648223645894002 2839 | sS'RP' 2840 | p607 2841 | F0.0023296447291788003 2842 | sS'NN' 2843 | p608 2844 | F0.15841584158415842 2845 | sg5 2846 | F0.0005824111822947001 2847 | sS'POS' 2848 | p609 2849 | F0.004076878276062901 2850 | sS'TO' 2851 | p610 2852 | F0.0029120559114735 2853 | sS'PRP' 2854 | p611 2855 | F0.019801980198019802 2856 | sS'RB' 2857 | p612 2858 | F0.0029120559114735 2859 | sS'NNS' 2860 | p613 2861 | F0.008736167734420501 2862 | sg9 2863 | F0.0005824111822947001 2864 | sS'VB' 2865 | p614 2866 | F0.004659289458357601 2867 | sS'WRB' 2868 | p615 2869 | F0.0052417006406523005 2870 | sg25 2871 | F0.0005824111822947001 2872 | sg26 2873 | F0.0005824111822947001 2874 | sg27 2875 | F0.0005824111822947001 2876 | sg28 2877 | F0.0005824111822947001 2878 | sg29 2879 | F0.0005824111822947001 2880 | sg30 2881 | F0.0005824111822947001 2882 | sg31 2883 | F0.0005824111822947001 2884 | sS'IN' 2885 | p616 2886 | F0.7303436225975539 2887 | sS'MD' 2888 | p617 2889 | F0.007571345369831101 2890 | sS'JJS' 2891 | p618 2892 | F0.009900990099009901 2893 | sS'JJR' 2894 | p619 2895 | F0.009900990099009901 2896 | sS'UH' 2897 | p620 2898 | F0.013395457192778102 2899 | sg22 2900 | F0.0005824111822947001 2901 | ssS'UH' 2902 | p621 2903 | (dp622 2904 | g3 2905 | F0.008496577767288176 2906 | sS'VBG' 2907 | p623 2908 | F0.006844465423648808 2909 | sS'FW' 2910 | p624 2911 | F0.0018881283927307056 2912 | sS'VBN' 2913 | p625 2914 | F0.001180080245456691 2915 | sS'VBP' 2916 | p626 2917 | F0.02076941232003776 2918 | sS'WDT' 2919 | p627 2920 | F0.004720320981826764 2921 | sS'JJ' 2922 | p628 2923 | F0.07198489497285815 2924 | sS'WP' 2925 | p629 2926 | F0.024781685154590512 2927 | sS'VBZ' 2928 | p630 2929 | F0.003540240736370073 2930 | sS'DT' 2931 | p631 2932 | F0.05192353080009441 2933 | sS'RP' 2934 | p632 2935 | F0.0014160962945480293 2936 | sS'NN' 2937 | p633 2938 | F0.07670521595468492 2939 | sS'VBD' 2940 | p634 2941 | F0.0014160962945480293 2942 | sS'POS' 2943 | p635 2944 | F0.0018881283927307056 2945 | sS'TO' 2946 | p636 2947 | F0.021005428369129102 2948 | sS'PRP' 2949 | p637 2950 | F0.2150106207222091 2951 | sS'RB' 2952 | p638 2953 | F0.04932735426008968 2954 | sS'NNS' 2955 | p639 2956 | F0.00660844937455747 2957 | sg9 2958 | F0.0002360160490913382 2959 | sS'VB' 2960 | p640 2961 | F0.17488789237668162 2962 | sS'WRB' 2963 | p641 2964 | F0.02336558886004248 2965 | sS'CC' 2966 | p642 2967 | F0.010148690110927543 2968 | sg26 2969 | F0.0002360160490913382 2970 | sg27 2971 | F0.0002360160490913382 2972 | sg28 2973 | F0.0002360160490913382 2974 | sS'RBR' 2975 | p643 2976 | F0.001180080245456691 2977 | sS'CD' 2978 | p644 2979 | F0.03493037526551806 2980 | sS'EX' 2981 | p645 2982 | F0.0028321925890960587 2983 | sS'IN' 2984 | p646 2985 | F0.04130280859098419 2986 | sS'MD' 2987 | p647 2988 | F0.024781685154590512 2989 | sS'JJS' 2990 | p648 2991 | F0.0014160962945480293 2992 | sS'JJR' 2993 | p649 2994 | F0.01038470616001888 2995 | sS'UH' 2996 | p650 2997 | F0.10408307764928015 2998 | sg22 2999 | F0.0002360160490913382 3000 | ss. -------------------------------------------------------------------------------- /4. 
HMM - Veterbi Algorithm/readme.md: -------------------------------------------------------------------------------- 1 | ![HMM - Viterbi Algorithm](https://github.com/shrebox/Natural-Language-Processing/blob/master/4.%20HMM%20-%20Veterbi%20Algorithm/Problem_Statement.jpg) 2 | 3 | ## Walkthrough: 4 | 5 | **Code:** ```Solution.py, train_test_files/create_test.py``` 6 | 7 | To run the Python files: ```$ python 'filename'``` 8 | 9 | * 'Solution.py' trains the models on the train dataset and predicts the tags for the test dataset. 10 | * 'create_test.py' creates the 'train_test_files/test.txt' data file used for testing the training accuracy. 11 | 12 | **Data files:** ```train_test_files/train.txt, train_test_files/test.txt, train_test_files/test_output.txt``` 13 | 14 | * 'train.txt' is the training data on which the models are trained. 15 | * 'test.txt' is used for testing the training accuracy. 16 | * 'test_output.txt' contains the predicted output tags for the 'test.txt' file. 17 | 18 | **Models:** ```models/start_dic.pkl, models/transition_dic.pkl, models/emission_dic.pkl``` 19 | 20 | * 'start_dic.pkl' --> start probability dictionary 21 | * 'transition_dic.pkl' --> transition probability dictionary 22 | * 'emission_dic.pkl' --> emission probability dictionary 23 | 24 | **Report:** ```Report.pdf``` 25 | 26 | 28 | 29 | * Contains the detailed analysis and results for the problem. 30 | -------------------------------------------------------------------------------- /4. HMM - Veterbi Algorithm/train_test_files/create_test.py: -------------------------------------------------------------------------------- 1 | 2 | data_to_write = [] 3 | #--------------------------------data preprocessing--------------------------------------------- 4 | flag=1 # this flag is used for getting the start words 5 | with open('train.txt') as f: # data in train corpus 6 | for line in f: 7 | split = line.split('\t') 8 | # if len(split)>1: 9 | # if split[0] != ".": 10 | word_val = split[0] 11 | data_to_write.append(word_val) 12 | 13 | with open('test.txt','wb') as f: 14 | for i in range(len(data_to_write)): 15 | if data_to_write[i]!="\n": 16 | f.write(data_to_write[i]+"\n") 17 | else: 18 | f.write(data_to_write[i]) 19 | 20 | 21 | # tag_val = split[1].split('\n')[0] 22 | # if flag==1: 23 | # start_tags_x.append(tag_val) 24 | # flag=0 25 | # words.append(word_val) # word list 26 | # tags.append(tag_val) # tags list 27 | # if word_val not in word_tag: # preparing words to tag count dictionary 28 | # word_tag[word_val] = {} 29 | # inner_tag_dic = word_tag[word_val] 30 | # if tag_val not in inner_tag_dic: 31 | # inner_tag_dic[tag_val]=0 32 | # inner_tag_dic[tag_val]+=1 33 | # else: 34 | # flag=1 -------------------------------------------------------------------------------- /5. NLP Tools/1. Word Similarity - word2vec/Solution.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | 3 | from gensim.models import KeyedVectors 4 | 5 | filename = "GoogleNews-vectors-negative300.bin" 6 | 7 | model = KeyedVectors.load_word2vec_format(filename, binary=True) 8 | 9 | result1 = model.most_similar(positive=['China','Delhi'], negative=['India'],topn=1) 10 | 11 | result2 = model.most_similar(positive=['USA','ISRO'], negative=['India'],topn=1) 12 | 13 | print result1,result2 -------------------------------------------------------------------------------- /5. NLP Tools/2.
Document Similarity - Doc2Vec/Solution.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gensim.models as models 3 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument 4 | from nltk.tokenize import word_tokenize 5 | import random,numpy as np 6 | 7 | def read_total_data(): # reading the total data 8 | data = {} 9 | for i in os.listdir('20_newsgroups/'): 10 | corpus = [] 11 | for j in sorted(os.listdir('20_newsgroups/'+i)): 12 | temp_data = open('20_newsgroups/'+i+'/'+j,'rb').read().decode('utf-8', 'ignore').lower() 13 | temp_data = word_tokenize(temp_data) 14 | corpus.append(temp_data) 15 | data[i] = corpus 16 | return data 17 | 18 | def read_train_data(data): # preparing the train data 19 | t_train_data = [] 20 | for i,name in enumerate(data): 21 | temp = [] 22 | if name != "comp.graphics": 23 | for j in range(1,len(data[name])): 24 | temp2 = TaggedDocument(data[name][j], [j+i*1000]) 25 | temp.append(temp2) 26 | else: 27 | for j in range(20,len(data[name])): 28 | temp2 = TaggedDocument(data[name][j], [j+i*1000]) 29 | temp.append(temp2) 30 | t_train_data.extend(temp) 31 | return t_train_data 32 | 33 | def test_data(data): # preparing test data 34 | t_from_graphics_19_others = data['comp.graphics'][1:20] 35 | t_one_each_groups={} 36 | for i in data: 37 | if i!="comp.graphics": 38 | t_one_each_groups[i] = data[i][0] 39 | return t_from_graphics_19_others,t_one_each_groups 40 | 41 | def train_model(train_data): # training and saving model 42 | model = models.doc2vec.Doc2Vec(vector_size=100,min_count=6,window=10,workers=4,epochs=15) 43 | temp_train = train_data 44 | model.build_vocab(temp_train) 45 | model.train(temp_train, total_examples=model.corpus_count, epochs=model.epochs) 46 | model.save("doc2vec_trained_newsgroup.model") 47 | # print("Model Saved") 48 | return model 49 | 50 | # -------------------------------Runner functions for reading the data and training the model----------------------------- 51 | data = read_total_data() 52 | train_data = read_train_data(data) 53 | from_graphics_19_others,one_each_groups = test_data(data) 54 | model = train_model(train_data) 55 | 56 | # --------------------------------------------Similarity for different groups---------------------------------------------- 57 | different_groups_results = {} 58 | for i,doc in enumerate(one_each_groups): 59 | # calculating the similarity and appending to the results; cosine similarity is calculated 60 | t_num = np.dot(model.infer_vector('20_newsgroups/'), model.infer_vector(one_each_groups[doc])) 61 | t_deno = np.linalg.norm(model.infer_vector('20_newsgroups/'))*np.linalg.norm(model.infer_vector(one_each_groups[doc])) 62 | different_groups_results[doc] = t_num/(t_deno) 63 | print(different_groups_results) 64 | 65 | # calculating the normalized accuracy 66 | diff_acc = 0 67 | for k,v in different_groups_results.iteritems(): 68 | diff_acc+=v 69 | print (diff_acc)/len(different_groups_results) 70 | 71 | # -----------------------------------------------Similarity for same group---------------------------------------------- 72 | same_group_results = [] 73 | for i,doc in enumerate(from_graphics_19_others): 74 | # calculating the similarity and appending to the results; cosine similarity is calculated 75 | t_num = np.dot(model.infer_vector('20_newsgroups/'), model.infer_vector(doc)) 76 | t_deno = np.linalg.norm(model.infer_vector('20_newsgroups/'))*np.linalg.norm(model.infer_vector(doc)) 77 | same_group_results.append(t_num/(t_deno)) 78 | 
print(same_group_results) 79 | 80 | # calculating the normalized accuracy 81 | same_diff = 0 82 | for i in range(len(same_group_results)): 83 | same_diff+=same_group_results[i] 84 | print (same_diff)/len(same_group_results) 85 | 86 | -------------------------------------------------------------------------------- /5. NLP Tools/3. Spacy - Lemmatization, POS Tagging, NER, Word Similarity/Solution.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | def read_document(filepath): 4 | temp = "" 5 | with open(filepath) as f: 6 | for line in f: 7 | temp += str(line) + " " 8 | f.close() 9 | return unicode(temp) 10 | 11 | inputval = str(raw_input("1. Sentence 2. Document 3. Test sentence\n")) 12 | 13 | data = "" 14 | if inputval == "1": 15 | data = unicode(str(raw_input("Enter Sentence: "))) 16 | 17 | elif inputval == "2": 18 | filepath = str(raw_input("Enter document path: ")) 19 | data = read_document(filepath) 20 | 21 | else: 22 | data = unicode("Apple is looking at buying U.K. startup for $1 billion") 23 | 24 | # https://spacy.io/usage/linguistic-features 25 | 26 | print "\n----part 1----\n" 27 | 28 | model = spacy.load('en') 29 | 30 | parsed_data = model(data) 31 | 32 | print "Token Lemma POS TAG DEP" # POS = Lemma: The base form of the word; POS: The simple part-of-speech tag; Tag: The detailed part-of-speech tag; Dep: Syntactic dependency, i.e. the relation between tokens. 33 | for token in parsed_data: 34 | print token, token.lemma_, token.pos_, token.tag_, token.dep_ 35 | 36 | print "\n----part 2----\n" 37 | 38 | inputval = str(raw_input("1. Sentence 2. Document 3. Test sentence\n")) 39 | 40 | data = "" 41 | if inputval == "1": 42 | data = unicode(str(raw_input("Enter Sentence: "))) 43 | 44 | elif inputval == "2": 45 | filepath = str(raw_input("Enter document path: ")) 46 | data = read_document(filepath) 47 | 48 | else: 49 | data = unicode("Apple is looking at buying U.K. startup for $1 billion") 50 | 51 | model = spacy.load('en_core_web_sm') 52 | 53 | parsed_data = model(data) 54 | 55 | for ent in parsed_data.ents: 56 | print ent.text, ent.start_char, ent.end_char, ent.label_ 57 | 58 | # https://spacy.io/usage/vectors-similarity 59 | 60 | print "\n----part 3----\n" 61 | 62 | data = "" 63 | 64 | w1 = str(raw_input("Enter first word: ")) 65 | w2 = str(raw_input("Enter second word: ")) 66 | 67 | data+=w1+" "+w2 68 | data = unicode(data) 69 | 70 | model = spacy.load('en_core_web_md') 71 | 72 | parsed_data = model(data) 73 | 74 | print "" 75 | 76 | for t1 in parsed_data: 77 | for t2 in parsed_data: 78 | if t1.text == w1 and t2.text==w2: 79 | print t1.text, t2.text, t1.similarity(t2) -------------------------------------------------------------------------------- /5. NLP Tools/Problem_Statement.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shrebox/Natural-Language-Processing/187e80e128e06094d1b9d798b3f727da54377ee3/5. NLP Tools/Problem_Statement.jpg -------------------------------------------------------------------------------- /5. NLP Tools/Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shrebox/Natural-Language-Processing/187e80e128e06094d1b9d798b3f727da54377ee3/5. NLP Tools/Report.pdf -------------------------------------------------------------------------------- /5. 
NLP Tools/readme.md: -------------------------------------------------------------------------------- 1 | ![nlp-tools](https://github.com/shrebox/Natural-Language-Processing/blob/master/5.%20NLP%20Tools/Problem_Statement.jpg) 2 | 3 | Link to the Doc2Vec model: https://drive.google.com/open?id=12wYKu6GzcMarE58iKZs3kGHWZ2HwjsCi 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural-Language-Processing 2 | 3 | This repository includes the code for the following topics: 4 | 5 | 1. [Edit Distance](https://github.com/shrebox/Natural-Language-Processing/tree/master/1.%20Edit%20Distance) 6 | 2. [Regex](https://github.com/shrebox/Natural-Language-Processing/tree/master/2.%20Regex) 7 | 3. [Generative and Discriminative Models](https://github.com/shrebox/Natural-Language-Processing/tree/master/3.%20Generative%20and%20Discriminative%20Models) 8 | 4. [HMM - Viterbi Algorithm](https://github.com/shrebox/Natural-Language-Processing/tree/master/4.%20HMM%20-%20Veterbi%20Algorithm) 9 | 5. [NLP Tools](https://github.com/shrebox/Natural-Language-Processing/tree/master/5.%20NLP%20Tools): 10 | - Word Similarity (Word2vec) 11 | - Document Similarity (Doc2Vec) 12 | - Spacy (Lemmatization, POS Tagging, NER, Word Similarity) 13 | 14 | Each directory contains these files: 15 | 16 | * ```$ Problem_Statement.jpg``` # the problem statement. 17 | * ```$ Solution.py``` # contains the solution code. 18 | * ```$ Report.pdf``` # contains the detailed analysis of the problem and an explanation of its solution. 19 | 20 | :alien: Do check out the **Information Retrieval (IR)** code compilation too [here](https://github.com/shrebox/Information-Retrieval)! 21 | 22 | I'll also try to add resources relevant to NLP research below: 23 | 24 | * https://www.cse.iitd.ac.in/~mausam/courses/col873/spring2020/ 25 | * [ACL Year-Round Mentorship Program](https://mentorship.aclweb.org/Home.html) for NLP. 26 | --------------------------------------------------------------------------------