├── Category.php ├── LICENSE.md ├── NaiveBayesClassifier.php ├── README.md ├── db_connect.php ├── main.php └── screenshots └── database_schema.png /Category.php: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Varun kumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
/**
 * Multinomial Naive Bayes text classifier (ham/spam) backed by MySQL.
 *
 * Storage (see the queries below):
 *  - trainingSet(document, category)      one row per training document
 *  - wordFrequency(word, category, count) occurrences of a keyword per category
 *
 * Every public operation loads 'db_connect.php', which is expected to define a
 * mysqli connection in $conn, and closes that connection before returning.
 */
class NaiveBayesClassifier {

    /**
     * Lower-case words that are never treated as keywords.
     */
    private static $stopWords = array(
        'about', 'and', 'are', 'com', 'for', 'from', 'how', 'that', 'the', 'this',
        'was', 'what', 'when', 'where', 'who', 'will', 'with', 'und', 'www',
    );

    public function __construct() {
    }

    /**
     * Classifies a piece of text.
     *
     * @param string $sentence text (document) to classify
     * @return mixed Category::$HAM or Category::$SPAM
     * @throws RuntimeException when the database cannot be queried
     */
    public function classify($sentence) {
        return $this->decide($this->tokenize($sentence));
    }

    /**
     * Stores a labelled document and updates the per-category keyword counts.
     *
     * @param string $sentence training text (document)
     * @param mixed  $category Category::$SPAM or Category::$HAM
     * @throws InvalidArgumentException (an Exception) for any other category
     * @throws RuntimeException when the database cannot be queried
     */
    public function train($sentence, $category) {
        $spam = Category::$SPAM;
        $ham = Category::$HAM;

        // Loose comparison is kept from the original implementation because the
        // value types declared in Category.php are not visible from this file.
        if ($category != $spam && $category != $ham) {
            // Double quotes so the valid category names are actually interpolated.
            throw new InvalidArgumentException("Unknown category. Valid categories are: $ham, $spam");
        }

        $conn = $this->openConnection();
        try {
            // Bound parameters: the raw sentence may contain quotes or other SQL syntax.
            $this->runStatement(
                $conn,
                'INSERT INTO trainingSet (document, category) VALUES (?, ?)',
                'ss',
                array($sentence, $category)
            )->close();

            foreach ($this->tokenize($sentence) as $word) {
                $existingRows = $this->fetchNumber(
                    $conn,
                    'SELECT COUNT(*) FROM wordFrequency WHERE word = ? AND category = ?',
                    'ss',
                    array($word, $category)
                );

                if ($existingRows === 0) {
                    $this->runStatement(
                        $conn,
                        'INSERT INTO wordFrequency (word, category, `count`) VALUES (?, ?, 1)',
                        'ss',
                        array($word, $category)
                    )->close();
                } else {
                    // The category filter is essential: without it a spam document
                    // would also bump the ham count of the same word (and vice versa).
                    $this->runStatement(
                        $conn,
                        'UPDATE wordFrequency SET `count` = `count` + 1 WHERE word = ? AND category = ?',
                        'ss',
                        array($word, $category)
                    )->close();
                }
            }
        } finally {
            $conn->close();
        }
    }

    /**
     * Splits text into lower-case keywords.
     *
     * Characters other than ASCII letters, digits and spaces are removed (not
     * replaced by a space), then tokens shorter than three characters and stop
     * words are dropped.
     *
     * @param string $sentence
     * @return array list of keywords in order of appearance (duplicates kept)
     */
    private function tokenize($sentence) {
        $normalized = strtolower(preg_replace('/[^a-zA-Z 0-9]+/', '', (string) $sentence));

        $keywords = array();
        foreach (explode(' ', $normalized) as $token) {
            // The length check also discards the empty strings produced by repeated spaces.
            if (strlen($token) > 2 && !in_array($token, self::$stopWords, true)) {
                $keywords[] = $token;
            }
        }

        return $keywords;
    }

    /**
     * Picks the more probable category for a list of keywords.
     *
     *   score(c) = log p(c) + sum over words of log p(word | c)
     *   p(c)        = documents in c / all documents
     *   p(word | c) = (occurrences of word in c + 1) / (all word occurrences in c + V)
     *
     * where V is the number of distinct words seen in training (Laplace smoothing).
     * p(bodyText) is the same for both categories and is therefore omitted.
     * Ties, and an empty training set, resolve to ham.
     *
     * @param array $keywordsArray output of tokenize()
     * @return mixed Category::$HAM or Category::$SPAM
     * @throws RuntimeException when the database cannot be queried
     */
    private function decide($keywordsArray) {
        $spam = Category::$SPAM;
        $ham = Category::$HAM;

        $conn = $this->openConnection();
        try {
            $documentCountSql = 'SELECT COUNT(*) FROM trainingSet WHERE category = ?';
            $spamDocuments = $this->fetchNumber($conn, $documentCountSql, 's', array($spam));
            $hamDocuments = $this->fetchNumber($conn, $documentCountSql, 's', array($ham));
            $totalDocuments = $this->fetchNumber($conn, 'SELECT COUNT(*) FROM trainingSet');

            // Nothing learned yet: avoid dividing by zero and fall back to the default.
            if ($totalDocuments === 0) {
                return $ham;
            }

            // DISTINCT, because a word trained in both categories has two rows.
            $vocabularySize = $this->fetchNumber($conn, 'SELECT COUNT(DISTINCT word) FROM wordFrequency');

            $spamScore = $this->logScore(
                $conn, $keywordsArray, $spam, $spamDocuments / $totalDocuments, $vocabularySize
            );
            $hamScore = $this->logScore(
                $conn, $keywordsArray, $ham, $hamDocuments / $totalDocuments, $vocabularySize
            );
        } finally {
            $conn->close();
        }

        return $hamScore >= $spamScore ? $ham : $spam;
    }

    /**
     * Computes log p(category) + sum of log p(word | category) for the keywords.
     *
     * @param mysqli $conn           open connection
     * @param array  $keywords       keywords of the text being classified
     * @param mixed  $category       category to score
     * @param float  $prior          p(category)
     * @param int    $vocabularySize number of distinct trained words (V)
     * @return float log score; -INF when the category has no training documents
     */
    private function logScore($conn, array $keywords, $category, $prior, $vocabularySize) {
        if ($prior <= 0) {
            return -INF;
        }

        $score = log($prior);

        // N in (c + 1) / (N + V): every keyword occurrence recorded for this category.
        $wordsInCategory = $this->fetchNumber(
            $conn,
            'SELECT SUM(`count`) FROM wordFrequency WHERE category = ?',
            's',
            array($category)
        );
        $denominator = $wordsInCategory + $vocabularySize;
        if ($denominator <= 0) {
            // No keyword has ever been stored, so only the prior can be used.
            return $score;
        }

        foreach ($keywords as $word) {
            // SUM yields NULL (read as 0) for a word never seen in this category.
            $occurrences = $this->fetchNumber(
                $conn,
                'SELECT SUM(`count`) FROM wordFrequency WHERE word = ? AND category = ?',
                'ss',
                array($word, $category)
            );
            $score += log(($occurrences + 1) / $denominator);
        }

        return $score;
    }

    /**
     * Loads db_connect.php and returns the connection it defines.
     *
     * @return mysqli
     * @throws RuntimeException when the script did not leave a mysqli object in $conn
     */
    private function openConnection() {
        // Plain require (not require_once): every call needs a fresh $conn in this scope.
        require 'db_connect.php';

        if (!isset($conn) || !($conn instanceof mysqli)) {
            throw new RuntimeException('db_connect.php did not provide a mysqli connection in $conn');
        }

        return $conn;
    }

    /**
     * Prepares and executes a statement with bound parameters.
     *
     * @param mysqli $conn
     * @param string $sql    statement with ? placeholders
     * @param string $types  mysqli bind_param type string, '' when there are no parameters
     * @param array  $params values for the placeholders
     * @return mysqli_stmt executed statement; the caller must close it
     * @throws RuntimeException when preparing or executing fails
     */
    private function runStatement($conn, $sql, $types = '', array $params = array()) {
        $stmt = $conn->prepare($sql);
        if ($stmt === false) {
            throw new RuntimeException('Failed to prepare statement: ' . $conn->error);
        }

        if ($types !== '') {
            $stmt->bind_param($types, ...$params);
        }

        if (!$stmt->execute()) {
            $error = $stmt->error;
            $stmt->close();
            throw new RuntimeException('Failed to execute statement: ' . $error);
        }

        return $stmt;
    }

    /**
     * Runs a query that selects one numeric column and returns its first value.
     *
     * @param mysqli $conn
     * @param string $sql
     * @param string $types
     * @param array  $params
     * @return int the value, or 0 when there is no row or the value is NULL
     * @throws RuntimeException when the query fails
     */
    private function fetchNumber($conn, $sql, $types = '', array $params = array()) {
        $stmt = $this->runStatement($conn, $sql, $types, $params);

        $value = null;
        $stmt->bind_result($value);
        $stmt->fetch();
        $stmt->close();

        return $value === null ? 0 : (int) $value;
    }
}
18 | ' Sunglasses & more', $spam); 19 | 20 | $classifier -> train('Opportunity with Product firm for Fullstack | Backend | Frontend- Bangalore', $ham); 21 | $classifier -> train('Javascript Developer, Fullstack Developer in Bangalore- Urgent Requirement', $ham); 22 | 23 | $category = $classifier -> classify('Scan Paytm QR Code to Pay & Win 100% Cashback'); 24 | echo $category; // spam 25 | 26 | $category = $classifier -> classify('Re: Applying for Fullstack Developer'); 27 | echo $category; // ham 28 | 29 | ?> 30 | 31 | ### How to install the project 32 | 33 | 1. Download the project and extract the zip. 34 | 2. Create the database and tables in MySQL: 35 | 36 | 1. mysql> create database naiveBayes; 37 | 2. mysql> use naiveBayes; 38 | 3. mysql> create table trainingSet (S_NO integer primary key auto_increment, document text, category varchar(255)); 39 | 4. mysql> create table wordFrequency (S_NO integer primary key auto_increment, word varchar(255), count integer, category varchar(255)); 40 | 41 | 3. Open a terminal and move to the project folder. 42 | 4. Edit the database connection info in the db_connect.php file. 43 | 5. Run main.php: `php main.php` 44 | 45 | ### Database Schema 46 | 47 | ![Database Schema](./screenshots/database_schema.png) 48 | 49 | #### Blog 50 | https://medium.com/@varunon9/classify-emails-into-ham-and-spam-using-naive-bayes-classifier-ffddd7faa1ef 51 | 52 | ##### For any bug or mistake, please open a GitHub issue. Contact varunon9@gmail.com with suggestions or queries. 
<?php
/**
 * Demo: trains the classifier with two spam and two ham subjects, then
 * classifies two new subjects and prints one category per line.
 *
 * Run these statements in MySQL once before executing this script:
 *   mysql> create database naiveBayes;
 *   mysql> use naiveBayes;
 *   mysql> create table trainingSet (S_NO integer primary key auto_increment, document text, category varchar(255));
 *   mysql> create table wordFrequency (S_NO integer primary key auto_increment, word varchar(255), count integer, category varchar(255));
 *
 * Note: every run calls train() again, so the same four documents are added
 * to the training tables each time the script is executed.
 */

require_once('NaiveBayesClassifier.php');

$classifier = new NaiveBayesClassifier();
$spam = Category::$SPAM;
$ham = Category::$HAM;

$classifier -> train('Have a pleasurable stay! Get up to 30% off + Flat 20% Cashback on Oyo Room' .
    ' bookings done via Paytm', $spam);
$classifier -> train('Lets Talk Fashion! Get flat 40% Cashback on Backpacks, Watches, Perfumes,' .
    ' Sunglasses & more', $spam);

$classifier -> train('Opportunity with Product firm for Fullstack | Backend | Frontend- Bangalore', $ham);
$classifier -> train('Javascript Developer, Fullstack Developer in Bangalore- Urgent Requirement', $ham);

// A line break after each result keeps the two categories from running together.
$category = $classifier -> classify('Scan Paytm QR Code to Pay & Win 100% Cashback');
echo $category . PHP_EOL; // the README lists "spam" as the expected result

$category = $classifier -> classify('Re: Applying for Fullstack Developer');
echo $category . PHP_EOL; // the README lists "ham" as the expected result