├── .gitignore ├── composer.json ├── composer.lock ├── readme.md └── src ├── NaiveBayes.php └── Stemmer.php /.gitignore: -------------------------------------------------------------------------------- 1 | /vendor -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "biobii/naive-bayes-text-classifier", 3 | "description": "Text classification using Naive Bayes approach", 4 | "type": "library", 5 | "keywords": [ 6 | "naive bayes", 7 | "php naive bayes", 8 | "klasifikasi teks", 9 | "bahasa indonesia", 10 | "klasifikasi naive bayes" 11 | ], 12 | "license": "MIT", 13 | "authors": [ 14 | { 15 | "name": "Mohammad Robih T. Z", 16 | "email": "biobii.game@gmail.com" 17 | } 18 | ], 19 | "require": { 20 | "php": ">=5.6.0", 21 | "sastrawi/sastrawi": "^1" 22 | }, 23 | "autoload": { 24 | "psr-4": { 25 | "Biobii\\": "src/" 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /composer.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_readme": [ 3 | "This file locks the dependencies of your project to a known state", 4 | "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", 5 | "This file is @generated automatically" 6 | ], 7 | "content-hash": "a0da88b7c144895005e67fbb4c16226d", 8 | "packages": [ 9 | { 10 | "name": "sastrawi/sastrawi", 11 | "version": "v1.2.0", 12 | "source": { 13 | "type": "git", 14 | "url": "https://github.com/sastrawi/sastrawi.git", 15 | "reference": "31fd4261dd4980cc57447f44d1aec4bd69e2abec" 16 | }, 17 | "dist": { 18 | "type": "zip", 19 | "url": "https://api.github.com/repos/sastrawi/sastrawi/zipball/31fd4261dd4980cc57447f44d1aec4bd69e2abec", 20 | "reference": "31fd4261dd4980cc57447f44d1aec4bd69e2abec", 21 | "shasum": "" 22 | }, 23 | "require": { 24 | "php": ">=5.3" 25 | }, 26 | 
"require-dev": { 27 | "phpunit/phpunit": "4.8.7", 28 | "squizlabs/php_codesniffer": "~1.0" 29 | }, 30 | "type": "library", 31 | "extra": { 32 | "branch-alias": { 33 | "dev-master": "1.1.x-dev" 34 | } 35 | }, 36 | "autoload": { 37 | "psr-0": { 38 | "Sastrawi\\": "src" 39 | } 40 | }, 41 | "notification-url": "https://packagist.org/downloads/", 42 | "license": [ 43 | "MIT" 44 | ], 45 | "authors": [ 46 | { 47 | "name": "Andy Librian", 48 | "role": "Lead Developer", 49 | "email": "andylibrian@gmail.com", 50 | "homepage": "http://andylibrian.com" 51 | } 52 | ], 53 | "description": "PHP library for stemming Indonesian language (Bahasa Indonesia)", 54 | "homepage": "https://github.com/sastrawi/sastrawi", 55 | "keywords": [ 56 | "bahasa", 57 | "indonesia", 58 | "indonesian", 59 | "kata dasar", 60 | "root word", 61 | "stem" 62 | ], 63 | "time": "2016-10-22T03:19:27+00:00" 64 | } 65 | ], 66 | "packages-dev": [], 67 | "aliases": [], 68 | "minimum-stability": "stable", 69 | "stability-flags": [], 70 | "prefer-stable": false, 71 | "prefer-lowest": false, 72 | "platform": [], 73 | "platform-dev": [] 74 | } 75 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Naive Bayes Text Classifier 2 | 3 | Library untuk klasifikasi teks Bahasa Indonesia menggunakan algoritma Naive Bayes Classifier (NBC). Proses stemming pada package ini menggunakan library [Sastrawi](https://github.com/sastrawi/sastrawi). 4 | 5 | ## Cara Penggunaan 6 | 7 | Install menggunakan perintah `composer require biobii/naive-bayes-text-classifier`. 8 | 9 | Menyiapkan data training. Bentuk data harus mengikuti seperti contoh berikut. Nilai pada key `class` dapat disesuaikan sesuai kebutuhan. 
10 | ```php 11 | $data = [ 12 | [ 13 | 'text' => 'Filmnya bagus, saya suka', 14 | 'class' => 'positif' 15 | ], 16 | [ 17 | 'text' => 'Film jelek, aktingnya payah.', 18 | 'class' => 'negatif' 19 | ], 20 | ]; 21 | ``` 22 | 23 | Berikut contoh lengkap penggunaan. 24 | ```php 25 | require __DIR__ . '/vendor/autoload.php'; 26 | 27 | use Biobii\NaiveBayes; 28 | 29 | $data = [ 30 | [ 31 | 'text' => 'Filmnya bagus, saya suka', 32 | 'class' => 'positif' 33 | ], 34 | [ 35 | 'text' => 'Filmnya menarik, aktingnya bagus', 36 | 'class' => 'positif' 37 | ], 38 | [ 39 | 'text' => 'Saya suka film ini sangat keren', 40 | 'class' => 'positif' 41 | ], 42 | [ 43 | 'text' => 'Film jelek, aktingnya payah.', 44 | 'class' => 'negatif' 45 | ], 46 | [ 47 | 'text' => 'Kecewa, ini adalah film terburuk yang pernah saya tonton', 48 | 'class' => 'negatif' 49 | ], 50 | ]; 51 | 52 | $nb = new NaiveBayes(); 53 | 54 | // mendefinisikan class target sesuai dengan yang ada pada data training. 55 | $nb->setClass(['positif', 'negatif']); 56 | 57 | // proses training 58 | $nb->training($data); 59 | 60 | // pengujian 61 | echo $nb->predict('alur ceritanya jelek dan aktingnya payah'); // output "negatif" 62 | ``` -------------------------------------------------------------------------------- /src/NaiveBayes.php: -------------------------------------------------------------------------------- 1 | class = $class; 53 | $this->setWordsClass($class); 54 | } 55 | 56 | /** 57 | * Reset the per-class storage: one entry with empty 'words', zero 'pData' and empty 'computed' per class label. 58 | * 59 | * @param array $class List of class labels (iterated with foreach). 60 | * @return void 61 | */ 62 | protected function setWordsClass($class) 63 | { 64 | $this->wordsClass = []; 65 | foreach ($class as $item) { 66 | $this->wordsClass[] = [ 67 | 'class' => $item, 68 | 'words' => [], 69 | 'pData' => 0, 70 | 'computed' => [] 71 | ]; 72 | } 73 | } 74 | 75 | /** 76 | * Filter data by class or target.
77 | * 78 | * @param string $class 79 | * @return array 80 | */ 81 | public function getDataByClass(string $class) 82 | { 83 | return array_filter($this->data, function ($item) use ($class) { 84 | return ($item['class'] === $class); 85 | }); 86 | } 87 | 88 | /** 89 | * Set stemmed data. 90 | * 91 | * @param array $data 92 | * @return void 93 | */ 94 | public function setStemmedData(array $data) 95 | { 96 | $this->stemmedData = $data; 97 | } 98 | 99 | /** 100 | * Set stemmed words. 101 | * 102 | * @param array $words 103 | * @return void 104 | */ 105 | public function setWords(array $words) 106 | { 107 | $this->words = $words; 108 | } 109 | 110 | /** 111 | * Find wordsClass index by class. 112 | * 113 | * @param string $class 114 | * @return int 115 | */ 116 | public function findWordsClassIndex(string $class) 117 | { 118 | foreach ($this->wordsClass as $index => $item) { 119 | foreach ($item as $key => $value) { 120 | if ($item['class'] === $class) { 121 | return $index; 122 | } 123 | } 124 | } 125 | 126 | return -1; 127 | } 128 | 129 | /** 130 | * Training data. 
131 | * 132 | * @param array $data 133 | * @return void 134 | */ 135 | public function training(array $data) 136 | { 137 | $this->data = $data; 138 | $stemmer = new Stemmer(); 139 | foreach ($this->data as $index => $item) { 140 | $stemmed = $stemmer->stem($item['text']); 141 | $this->data[$index]['text'] = $stemmed; 142 | } 143 | 144 | $this->setWords($stemmer->getWords()); 145 | 146 | foreach ($this->class as $item) { 147 | $classData = $this->getDataByClass($item); 148 | $index = $this->findWordsClassIndex($item); 149 | 150 | foreach ($this->words as $word) { 151 | $this->wordsClass[$index]['words'][] = ['word' => $word, 'count' => 0]; 152 | } 153 | 154 | foreach ($classData as $item) { 155 | $splits = explode(' ', $item['text']); 156 | foreach ($this->wordsClass[$index]['words'] as $key => $word) { 157 | foreach ($splits as $split) { 158 | if ($word['word'] === $split) { 159 | $this->wordsClass[$index]['words'][$key]['count']++; 160 | } 161 | } 162 | } 163 | } 164 | 165 | $this->wordsClass[$index]['pData'] = count($classData) / count($data); 166 | $wordsCount = count(array_filter($this->wordsClass[$index]['words'], function ($item) { 167 | return ($item['count'] !== 0); 168 | })); 169 | foreach ($this->wordsClass[$index]['words'] as $word) { 170 | $this->wordsClass[$index]['computed'][] = [ 171 | 'word' => $word['word'], 172 | 'value' => ($word['count'] + 1) / ($wordsCount + count($this->words)) 173 | ]; 174 | } 175 | } 176 | } 177 | 178 | /** 179 | * Predict data. 
180 | * 181 | * @param string|array $data 182 | * @return string 183 | */ 184 | public function predict($data) 185 | { 186 | $stemmer = new Stemmer(); 187 | $stemmed = $stemmer->stem($data); 188 | $wordsArray = explode(' ', $stemmed); 189 | 190 | // calculate each class 191 | $testClass = []; 192 | foreach ($this->class as $class) { 193 | $index = $this->findWordsClassIndex($class); 194 | foreach ($wordsArray as $word) { 195 | $match = array_filter($this->wordsClass[$index]['computed'], function ($item) use ($word) { 196 | return ($item['word'] === $word); 197 | }); 198 | 199 | if ($match) { 200 | $testClass[$class]['computed'][] = reset($match)['value']; 201 | } 202 | } 203 | 204 | $testClass[$class]['result'] = 1; // init the result for the class 205 | } 206 | 207 | foreach ($testClass as $key => $value) { 208 | foreach ($value['computed'] as $val) { 209 | $testClass[$key]['result'] *= $val; 210 | } 211 | } 212 | 213 | $result = []; 214 | foreach ($this->class as $class) { 215 | $result[] = $testClass[$class]['result']; 216 | } 217 | 218 | $max = max($result); 219 | foreach ($testClass as $key => $item) { 220 | if ($item['result'] === $max) return $key; 221 | } 222 | 223 | return false; 224 | } 225 | } -------------------------------------------------------------------------------- /src/Stemmer.php: -------------------------------------------------------------------------------- 1 | stemmerFactory = new StemmerFactory(); 39 | $this->stemmer = $this->stemmerFactory->createStemmer(); 40 | } 41 | 42 | /** 43 | * Stemming process. 44 | * 45 | * @param string $text 46 | * @return string 47 | */ 48 | public function stem(string $text) 49 | { 50 | $stemmed = $this->stemmer->stem($text); 51 | $words = explode(' ', $stemmed); 52 | foreach ($words as $word) { 53 | $this->words[] = $word; 54 | } 55 | 56 | return $stemmed; 57 | } 58 | 59 | /** 60 | * Get all words. 
61 | * 62 | * @param void 63 | * @return array 64 | */ 65 | public function getWords() 66 | { 67 | $unique = array_unique($this->words); 68 | $this->words = array_values($unique); 69 | return $this->words; 70 | } 71 | } --------------------------------------------------------------------------------