├── README.md
├── LICENSE
├── regression.py
└── naive_bayes.py

/README.md:
--------------------------------------------------------------------------------
# probabilistic_models

A repository to store helper probabilistic models.

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Classical Language Toolkit

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# --------------------------------------------------------------------------------
# /regression.py:
# --------------------------------------------------------------------------------
from math import exp
from math import log


class LogisticRegression:
    """Binary logistic regression trained by stochastic gradient ascent.

    NOTE(review): part of this class was garbled in the source dump (the
    span between ``0 <`` and ``> 0.5`` was eaten as a pseudo-HTML tag).
    The affected methods (rest of ``p``, ``C``,
    ``train_stochastic_gradient``, ``classify``) were reconstructed to
    mirror the fully intact ``LinearRegresssion`` sibling below — confirm
    against the original file.
    """

    def __init__(self, vars, outcome, learning_rate, gen):
        """
        :param vars: list of feature rows (one list of values per feature)
        :param outcome: list of 0/1 labels, one per training example
        :param learning_rate: SGD step size
        :param gen: number of passes (generations) over the training set
        """
        # learning rate coefficient
        self.a = learning_rate
        # number of training passes
        self.gen = gen

        # Prepend a pseudo-feature of 1s for the intercept.  Build new
        # lists instead of insert() so the caller's input is not mutated
        # (the original modified `vars` in place).
        rows = [[1 for _ in range(len(vars[0]))]] + [list(row) for row in vars]

        # x: one tuple of feature values per training example; y: labels.
        self.x = [*zip(*rows)]
        self.y = outcome
        # Initialize all weights to 0.5 (one per feature, incl. intercept).
        self.theta = [0.5 for _ in range(len(rows))]

        self.train_stochastic_gradient()

    def h(self, x):
        """Hypothesis: sigmoid of theta . x, computed overflow-safely."""
        z = sum(t * xi for t, xi in zip(self.theta, x))
        if z >= 0:
            return 1 / (1 + exp(-z))
        # For z < 0 use the algebraically equivalent form: exp of a
        # negative number cannot overflow (the original 1/(1+exp(-z))
        # raises OverflowError for large negative z).
        ez = exp(z)
        return ez / (1 + ez)

    def p(self, x, y):
        """
        Conditional probability p(y|x) for y in {0, 1}; result lies in (0, 1).
        """
        return self.h(x) if y == 1 else 1 - self.h(x)

    def C(self):
        """Negative log-likelihood of the training set under theta."""
        return -sum(log(self.p(self.x[i], self.y[i])) for i in range(len(self.y)))

    def train_stochastic_gradient(self):
        """Per-example gradient updates, self.gen passes over the data."""
        for _ in range(self.gen):
            for i in range(len(self.y)):
                step = self.a * (self.y[i] - self.h(self.x[i]))
                self.theta = [t + step * xi for t, xi in zip(self.theta, self.x[i])]

    def classify(self, x):
        """Hard 0/1 decision at the 0.5 probability threshold."""
        return 1 if self.h(x) > 0.5 else 0


class LinearRegresssion:
    """Linear regression trained by stochastic gradient descent.

    The class name keeps the original (misspelled) form so existing
    callers still work; prefer the ``LinearRegression`` alias below.
    """

    def __init__(self, vars, outcome, learning_rate, gen):
        """
        :param vars: list of feature rows (one list of values per feature)
        :param outcome: list of target values, one per training example
        :param learning_rate: SGD step size
        :param gen: number of passes (generations) over the training set
        """
        # learning rate coefficient
        self.a = learning_rate
        # number of training passes
        self.gen = gen

        # Prepend a pseudo-feature of 1s for the intercept, without
        # mutating the caller's list (the original used insert()).
        rows = [[1 for _ in range(len(vars[0]))]] + [list(row) for row in vars]

        # x: one tuple of feature values per training example; y: targets.
        self.x = [*zip(*rows)]
        self.y = outcome
        # Initialize all weights to 0.5 (one per feature, incl. intercept).
        self.theta = [0.5 for _ in range(len(rows))]

        self.train_stochastic_gradient()

    def train_stochastic_gradient(self):
        """Per-example gradient updates, self.gen passes over the data."""
        for _ in range(self.gen):
            for i in range(len(self.y)):
                step = self.a * (self.y[i] - self.h(self.x[i]))
                self.theta = [t + step * xi for t, xi in zip(self.theta, self.x[i])]

    def C(self):
        """Half the sum of squared residuals over the training set."""
        return sum((self.h(self.x[i]) - self.y[i]) ** 2 for i in range(len(self.y))) / 2

    def h(self, x):
        """Hypothesis: the dot product theta . x."""
        return sum(t * xi for t, xi in zip(self.theta, x))


# Correctly spelled, backward-compatible alias for the class above.
LinearRegression = LinearRegresssion
# --------------------------------------------------------------------------------
# --------------------------------------------------------------------------------
# /naive_bayes.py:
# --------------------------------------------------------------------------------
from collections import Counter, defaultdict
from itertools import chain
from math import log


class NaiveBayes:
    """Multinomial naive Bayes classifier with add-one (Laplace) smoothing.

    All probabilities are stored and combined in log space to avoid
    floating-point underflow when many per-word likelihoods are summed.
    """

    def __init__(self, featureset, featureset_class, binarization=False):
        """
        Train the classifier from parallel lists of documents and labels.

        e.g.

        >>> A = NaiveBayes([["cogito", "ergo", "sum"], ["νόησις", "νοήσεως"]], ["Latin", "Greek"])

        :param featureset: list of documents, each a list of word tokens
        :param featureset_class: class label for each document
        :param binarization: if True, count each word at most once per
            document (Bernoulli-style features)
        """
        # Number of training documents (denominator of the prior).
        self.N_features = len(featureset)

        if binarization:
            # Deduplicate per document.  Build new lists so the caller's
            # input is not mutated (the original rewrote it in place and
            # left debug print() calls here).
            featureset = [list(set(doc)) for doc in featureset]

        featureset_class = tuple(featureset_class)

        # Group documents by class.  defaultdict(list) never raises on a
        # missing key, so the original try/except ValueError was dead code.
        labeled = defaultdict(list)
        for feature, c in zip(featureset, featureset_class):
            labeled[c].append(feature)

        # Log-prior per class.
        self.prior_probability = {c: self.prior(c, labeled) for c in featureset_class}

        # Vocabulary across all classes; its size is the smoothing term.
        vocabulary = set(chain.from_iterable(featureset))
        self.N = len(vocabulary)

        # Per-class total token count, kept so unseen words can be
        # smoothed consistently later (see get_likelihood).
        self._class_totals = {}

        # Smoothed log-likelihood of every vocabulary word per class.
        # Counter makes this O(tokens + vocab) instead of the original
        # O(tokens * class_tokens) repeated list.count() scan; the
        # resulting table is identical.
        self.likelihood_probability = defaultdict(dict)
        for c in set(featureset_class):
            counts = Counter(chain.from_iterable(labeled[c]))
            total = sum(counts.values())
            self._class_totals[c] = total
            for w in vocabulary:
                self.likelihood_probability[c][w] = log((counts[w] + 1) / (total + self.N))

    def get_likelihood(self, c, w):
        """
        Smoothed log-likelihood of word w under class c.

        Words unseen at training time get the add-one probability for a
        zero count, computed from fixed training statistics.  (The
        original derived it from the current table size, which setdefault
        itself grew, so scores depended on query order.)
        """
        try:
            return self.likelihood_probability[c][w]
        except KeyError:
            unseen = log(1 / (self._class_totals.get(c, 0) + self.N))
            # Cache so repeated queries stay consistent and cheap.
            self.likelihood_probability[c][w] = unseen
            return unseen

    def prior(self, c, labeled):
        """
        Calculate the log prior probability P(c),
        P(c) = occurrences of class c / total documents

        :param c: str
        :param labeled: dictionary of the form {class: list of documents}
        """
        return log(len(labeled[c]) / self.N_features)

    def likelihood(self, w, class_words):
        """
        Add-one smoothed log-probability of word w given a class.

        :param w: str
        :param class_words: list of all word tokens belonging to the class
        """
        return log((class_words.count(w) + 1) / (len(class_words) + self.N))

    def probability_s(self, s, c):
        """
        Log-probability that sentence s belongs to class c.

        :param s: iterable of word tokens
        :param c: str
        """
        return sum(self.get_likelihood(c, w) for w in s) + self.prior_probability[c]

    def predict(self, s):
        """
        Predict the most probable class of sentence s.

        :param s: iterable of word tokens
        """
        return max([(self.probability_s(s, c), c) for c in self.likelihood_probability.keys()])[1]
# --------------------------------------------------------------------------------