├── README.md
├── LICENSE
├── regression.py
└── naive_bayes.py

/README.md:
--------------------------------------------------------------------------------
# probabilistic_models

A repository to store helper probabilistic models.

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Classical Language Toolkit

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# --------------------------------------------------------------------------------
# /regression.py:
# --------------------------------------------------------------------------------
from math import exp
from math import log


class LogisticRegression:
    """Binary logistic regression trained by stochastic gradient ascent.

    NOTE(review): part of this class was garbled in the source dump (the
    span between ``0 <`` and ``> 0.5`` was eaten as a pseudo-HTML tag).
    The affected methods (rest of ``p``, ``C``,
    ``train_stochastic_gradient``, ``classify``) were reconstructed to
    mirror the fully intact ``LinearRegresssion`` sibling below — confirm
    against the original file.
    """

    def __init__(self, vars, outcome, learning_rate, gen):
        """
        :param vars: list of feature rows (one list of values per feature)
        :param outcome: list of 0/1 labels, one per training example
        :param learning_rate: SGD step size
        :param gen: number of passes (generations) over the training set
        """
        # learning rate coefficient
        self.a = learning_rate
        # number of training passes
        self.gen = gen

        # Prepend a pseudo-feature of 1s for the intercept.  Build new
        # lists instead of insert() so the caller's input is not mutated
        # (the original modified `vars` in place).
        rows = [[1 for _ in range(len(vars[0]))]] + [list(row) for row in vars]

        # x: one tuple of feature values per training example; y: labels.
        self.x = [*zip(*rows)]
        self.y = outcome
        # Initialize all weights to 0.5 (one per feature, incl. intercept).
        self.theta = [0.5 for _ in range(len(rows))]

        self.train_stochastic_gradient()

    def h(self, x):
        """Hypothesis: sigmoid of theta . x, computed overflow-safely."""
        z = sum(t * xi for t, xi in zip(self.theta, x))
        if z >= 0:
            return 1 / (1 + exp(-z))
        # For z < 0 use the algebraically equivalent form: exp of a
        # negative number cannot overflow (the original 1/(1+exp(-z))
        # raises OverflowError for large negative z).
        ez = exp(z)
        return ez / (1 + ez)

    def p(self, x, y):
        """
        Conditional probability p(y|x) for y in {0, 1}; result lies in (0, 1).
        """
        return self.h(x) if y == 1 else 1 - self.h(x)

    def C(self):
        """Negative log-likelihood of the training set under theta."""
        return -sum(log(self.p(self.x[i], self.y[i])) for i in range(len(self.y)))

    def train_stochastic_gradient(self):
        """Per-example gradient updates, self.gen passes over the data."""
        for _ in range(self.gen):
            for i in range(len(self.y)):
                step = self.a * (self.y[i] - self.h(self.x[i]))
                self.theta = [t + step * xi for t, xi in zip(self.theta, self.x[i])]

    def classify(self, x):
        """Hard 0/1 decision at the 0.5 probability threshold."""
        return 1 if self.h(x) > 0.5 else 0


class LinearRegresssion:
    """Linear regression trained by stochastic gradient descent.

    The class name keeps the original (misspelled) form so existing
    callers still work; prefer the ``LinearRegression`` alias below.
    """

    def __init__(self, vars, outcome, learning_rate, gen):
        """
        :param vars: list of feature rows (one list of values per feature)
        :param outcome: list of target values, one per training example
        :param learning_rate: SGD step size
        :param gen: number of passes (generations) over the training set
        """
        # learning rate coefficient
        self.a = learning_rate
        # number of training passes
        self.gen = gen

        # Prepend a pseudo-feature of 1s for the intercept, without
        # mutating the caller's list (the original used insert()).
        rows = [[1 for _ in range(len(vars[0]))]] + [list(row) for row in vars]

        # x: one tuple of feature values per training example; y: targets.
        self.x = [*zip(*rows)]
        self.y = outcome
        # Initialize all weights to 0.5 (one per feature, incl. intercept).
        self.theta = [0.5 for _ in range(len(rows))]

        self.train_stochastic_gradient()

    def train_stochastic_gradient(self):
        """Per-example gradient updates, self.gen passes over the data."""
        for _ in range(self.gen):
            for i in range(len(self.y)):
                step = self.a * (self.y[i] - self.h(self.x[i]))
                self.theta = [t + step * xi for t, xi in zip(self.theta, self.x[i])]

    def C(self):
        """Half the sum of squared residuals over the training set."""
        return sum((self.h(self.x[i]) - self.y[i]) ** 2 for i in range(len(self.y))) / 2

    def h(self, x):
        """Hypothesis: the dot product theta . x."""
        return sum(t * xi for t, xi in zip(self.theta, x))


# Correctly spelled, backward-compatible alias for the class above.
LinearRegression = LinearRegresssion
# --------------------------------------------------------------------------------
# --------------------------------------------------------------------------------
# /naive_bayes.py:
# --------------------------------------------------------------------------------
from collections import Counter, defaultdict
from itertools import chain
from math import log


class NaiveBayes:
    """Multinomial naive Bayes classifier with add-one (Laplace) smoothing.

    All probabilities are stored and combined in log space to avoid
    floating-point underflow when many per-word likelihoods are summed.
    """

    def __init__(self, featureset, featureset_class, binarization=False):
        """
        Train the classifier from parallel lists of documents and labels.

        e.g.

        >>> A = NaiveBayes([["cogito", "ergo", "sum"], ["νόησις", "νοήσεως"]], ["Latin", "Greek"])

        :param featureset: list of documents, each a list of word tokens
        :param featureset_class: class label for each document
        :param binarization: if True, count each word at most once per
            document (Bernoulli-style features)
        """
        # Number of training documents (denominator of the prior).
        self.N_features = len(featureset)

        if binarization:
            # Deduplicate per document.  Build new lists so the caller's
            # input is not mutated (the original rewrote it in place and
            # left debug print() calls here).
            featureset = [list(set(doc)) for doc in featureset]

        featureset_class = tuple(featureset_class)

        # Group documents by class.  defaultdict(list) never raises on a
        # missing key, so the original try/except ValueError was dead code.
        labeled = defaultdict(list)
        for feature, c in zip(featureset, featureset_class):
            labeled[c].append(feature)

        # Log-prior per class.
        self.prior_probability = {c: self.prior(c, labeled) for c in featureset_class}

        # Vocabulary across all classes; its size is the smoothing term.
        vocabulary = set(chain.from_iterable(featureset))
        self.N = len(vocabulary)

        # Per-class total token count, kept so unseen words can be
        # smoothed consistently later (see get_likelihood).
        self._class_totals = {}

        # Smoothed log-likelihood of every vocabulary word per class.
        # Counter makes this O(tokens + vocab) instead of the original
        # O(tokens * class_tokens) repeated list.count() scan; the
        # resulting table is identical.
        self.likelihood_probability = defaultdict(dict)
        for c in set(featureset_class):
            counts = Counter(chain.from_iterable(labeled[c]))
            total = sum(counts.values())
            self._class_totals[c] = total
            for w in vocabulary:
                self.likelihood_probability[c][w] = log((counts[w] + 1) / (total + self.N))

    def get_likelihood(self, c, w):
        """
        Smoothed log-likelihood of word w under class c.

        Words unseen at training time get the add-one probability for a
        zero count, computed from fixed training statistics.  (The
        original derived it from the current table size, which setdefault
        itself grew, so scores depended on query order.)
        """
        try:
            return self.likelihood_probability[c][w]
        except KeyError:
            unseen = log(1 / (self._class_totals.get(c, 0) + self.N))
            # Cache so repeated queries stay consistent and cheap.
            self.likelihood_probability[c][w] = unseen
            return unseen

    def prior(self, c, labeled):
        """
        Calculate the log prior probability P(c),
        P(c) = occurrences of class c / total documents

        :param c: str
        :param labeled: dictionary of the form {class: list of documents}
        """
        return log(len(labeled[c]) / self.N_features)

    def likelihood(self, w, class_words):
        """
        Add-one smoothed log-probability of word w given a class.

        :param w: str
        :param class_words: list of all word tokens belonging to the class
        """
        return log((class_words.count(w) + 1) / (len(class_words) + self.N))

    def probability_s(self, s, c):
        """
        Log-probability that sentence s belongs to class c.

        :param s: iterable of word tokens
        :param c: str
        """
        return sum(self.get_likelihood(c, w) for w in s) + self.prior_probability[c]

    def predict(self, s):
        """
        Predict the most probable class of sentence s.

        :param s: iterable of word tokens
        """
        return max([(self.probability_s(s, c), c) for c in self.likelihood_probability.keys()])[1]
# --------------------------------------------------------------------------------