├── .gitignore ├── .travis.yml ├── README.md ├── composer.json ├── composer.lock ├── phpunit.xml.dist ├── src └── Fieg │ └── Bayes │ ├── Classifier.php │ ├── Tokenizer │ └── WhitespaceAndPunctuationTokenizer.php │ └── TokenizerInterface.php └── tests ├── Fieg └── Bayes │ ├── ClassifierTest.php │ └── Tokenizer │ └── WhitespaceAndPunctuationTokenizerTest.php └── bootstrap.php /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | bin/ 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | 3 | php: 4 | - "5.4" 5 | - "5.5" 6 | - "5.6" 7 | - "7.0" 8 | - "hhvm" 9 | 10 | before_script: composer install 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Naive Bayes Classifier 2 | ====================== 3 | 4 | Implementation of Naive Bayes Classifier algorithm in PHP. 5 | 6 | Based on [Machine Learning: Naive Bayes Document Classification Algorithm in Javascript](http://burakkanber.com/blog/machine-learning-naive-bayes-1/) by Burak Kanber. 7 | 8 | [![Build Status](https://travis-ci.org/fieg/bayes.png?branch=master)](https://travis-ci.org/fieg/bayes) 9 | 10 | Getting started 11 | --------------- 12 | 13 | ```php 14 | use Fieg\Bayes\Classifier; 15 | use Fieg\Bayes\Tokenizer\WhitespaceAndPunctuationTokenizer; 16 | 17 | $tokenizer = new WhitespaceAndPunctuationTokenizer(); 18 | $classifier = new Classifier($tokenizer); 19 | 20 | $classifier->train('en', 'This is english'); 21 | $classifier->train('fr', 'Je suis Hollandais'); 22 | 23 | $result = $classifier->classify('This is a naive bayes classifier'); 24 | ``` 25 | 26 | Which would result in: 27 | 28 | ``` 29 | array(2) { 30 | 'en' => 31 | double(0.9) 32 | 'fr' => 33 | double(0.1) 34 | } 35 | ``` 36 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "fieg/bayes", 3 | "description": "Implementation of Naive Bayes Classifier algorithm in PHP.", 4 | "type": "library", 5 | "keywords": ["machine", "learning", "naive", "bayes", "classifier"], 6 | "homepage": "https://github.com/fieg/bayes", 7 | "authors": [ 8 | { 9 | "name": "fieg", 10 | "email": "jeroen@webcreate.nl" 11 | } 12 | ], 13 | "require": { 14 | "php": ">=5.4" 15 | }, 16 | "require-dev": { 17 | "phpunit/phpunit": "*" 18 | }, 19 | "autoload": { 20 | "psr-0": {"Fieg": "src/"} 21 | }, 22 | "config": { 23 | "bin-dir": "bin" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /composer.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_readme": [ 3 | "This file locks the dependencies of your project to a known state", 4 | "Read more about it at http://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file" 5 | ], 6 | "hash": "e1d38eb330fe2fa46b38395f91f7bd03", 7 | "packages": [ 8 | 9 | ], 10 | "packages-dev": [ 11 | { 12 | "name": "phpunit/php-code-coverage", 13 | "version": "1.2.13", 14 | "source": { 15 | "type": "git", 16 | "url": "https://github.com/sebastianbergmann/php-code-coverage.git", 17 | "reference": "466e7cd2554b4e264c9e3f31216d25ac0e5f3d94" 18 | }, 19 | "dist": { 20 | "type": "zip", 21 | "url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/466e7cd2554b4e264c9e3f31216d25ac0e5f3d94", 22 | "reference": "466e7cd2554b4e264c9e3f31216d25ac0e5f3d94", 23 | "shasum": "" 24 | }, 25 | "require": { 26 | "php": ">=5.3.3", 27 | "phpunit/php-file-iterator": ">=1.3.0@stable", 28 | "phpunit/php-text-template": ">=1.1.1@stable", 29 | "phpunit/php-token-stream": ">=1.1.3@stable" 30 | }, 31 | "require-dev": { 32 | "phpunit/phpunit": "3.7.*@dev" 33 | }, 34 | "suggest": { 35 | "ext-dom": "*", 36 | "ext-xdebug": ">=2.0.5" 37 | }, 38 | "type": "library", 39 | "extra": { 40 | "branch-alias": { 41 | "dev-master": "1.2.x-dev" 42 | } 43 | }, 44 | "autoload": { 45 | "classmap": [ 46 | "PHP/" 47 | ] 48 | }, 49 | "notification-url": "https://packagist.org/downloads/", 50 | "include-path": [ 51 | "" 52 | ], 53 | "license": [ 54 | "BSD-3-Clause" 55 | ], 56 | "authors": [ 57 | { 58 | "name": "Sebastian Bergmann", 59 | "email": "sb@sebastian-bergmann.de", 60 | "role": "lead" 61 | } 62 | ], 63 | "description": "Library that provides collection, processing, and rendering functionality for PHP code coverage information.", 64 | "homepage": "https://github.com/sebastianbergmann/php-code-coverage", 65 | "keywords": [ 66 | "coverage", 67 | "testing", 68 | "xunit" 69 | ], 70 | "time": "2013-09-10 08:14:32" 71 | }, 72 | { 73 | "name": "phpunit/php-file-iterator", 74 | "version": "1.3.4", 75 | "source": { 76 | "type": "git", 77 | "url": "https://github.com/sebastianbergmann/php-file-iterator.git", 78 | "reference": "acd690379117b042d1c8af1fafd61bde001bf6bb" 79 | }, 80 | "dist": { 81 | "type": "zip", 82 | "url": "https://api.github.com/repos/sebastianbergmann/php-file-iterator/zipball/acd690379117b042d1c8af1fafd61bde001bf6bb", 83 | "reference": "acd690379117b042d1c8af1fafd61bde001bf6bb", 84 | "shasum": "" 85 | }, 86 | "require": { 87 | "php": ">=5.3.3" 88 | }, 89 | "type": "library", 90 | "autoload": { 91 | "classmap": [ 92 | "File/" 93 | ] 94 | }, 95 | "notification-url": "https://packagist.org/downloads/", 96 | "include-path": [ 97 | "" 98 | ], 99 | "license": [ 100 | "BSD-3-Clause" 101 | ], 102 | "authors": [ 103 | { 104 | "name": "Sebastian Bergmann", 105 | "email": "sb@sebastian-bergmann.de", 106 | "role": "lead" 107 | } 108 | ], 109 | "description": "FilterIterator implementation that filters files based on a list of suffixes.", 110 | "homepage": "https://github.com/sebastianbergmann/php-file-iterator/", 111 | "keywords": [ 112 | "filesystem", 113 | "iterator" 114 | ], 115 | "time": "2013-10-10 15:34:57" 116 | }, 117 | { 118 | "name": "phpunit/php-text-template", 119 | "version": "1.1.4", 120 | "source": { 121 | "type": "git", 122 | "url": "https://github.com/sebastianbergmann/php-text-template.git", 123 | "reference": "5180896f51c5b3648ac946b05f9ec02be78a0b23" 124 | }, 125 | "dist": { 126 | "type": "zip", 127 | "url": "https://api.github.com/repos/sebastianbergmann/php-text-template/zipball/5180896f51c5b3648ac946b05f9ec02be78a0b23", 128 | "reference": "5180896f51c5b3648ac946b05f9ec02be78a0b23", 129 | "shasum": "" 130 | }, 131 | "require": { 132 | "php": ">=5.3.3" 133 | }, 134 | "type": "library", 135 | "autoload": { 136 | "classmap": [ 137 | "Text/" 138 | ] 139 | }, 140 | "notification-url": "https://packagist.org/downloads/", 141 | "include-path": [ 142 | "" 143 | ], 144 | "license": [ 145 | "BSD-3-Clause" 146 | ], 147 | "authors": [ 148 | { 149 | "name": "Sebastian Bergmann", 150 | "email": "sb@sebastian-bergmann.de", 151 | "role": "lead" 152 | } 153 | ], 154 | "description": "Simple template engine.", 155 | "homepage": "https://github.com/sebastianbergmann/php-text-template/", 156 | "keywords": [ 157 | "template" 158 | ], 159 | "time": "2012-10-31 18:15:28" 160 | }, 161 | { 162 | "name": "phpunit/php-timer", 163 | "version": "1.0.5", 164 | "source": { 165 | "type": "git", 166 | "url": "https://github.com/sebastianbergmann/php-timer.git", 167 | "reference": "19689d4354b295ee3d8c54b4f42c3efb69cbc17c" 168 | }, 169 | "dist": { 170 | "type": "zip", 171 | "url": "https://api.github.com/repos/sebastianbergmann/php-timer/zipball/19689d4354b295ee3d8c54b4f42c3efb69cbc17c", 172 | "reference": "19689d4354b295ee3d8c54b4f42c3efb69cbc17c", 173 | "shasum": "" 174 | }, 175 | "require": { 176 | "php": ">=5.3.3" 177 | }, 178 | "type": "library", 179 | "autoload": { 180 | "classmap": [ 181 | "PHP/" 182 | ] 183 | }, 184 | "notification-url": "https://packagist.org/downloads/", 185 | "include-path": [ 186 | "" 187 | ], 188 | "license": [ 189 | "BSD-3-Clause" 190 | ], 191 | "authors": [ 192 | { 193 | "name": "Sebastian Bergmann", 194 | "email": "sb@sebastian-bergmann.de", 195 | "role": "lead" 196 | } 197 | ], 198 | "description": "Utility class for timing", 199 | "homepage": "https://github.com/sebastianbergmann/php-timer/", 200 | "keywords": [ 201 | "timer" 202 | ], 203 | "time": "2013-08-02 07:42:54" 204 | }, 205 | { 206 | "name": "phpunit/php-token-stream", 207 | "version": "1.2.1", 208 | "source": { 209 | "type": "git", 210 | "url": "https://github.com/sebastianbergmann/php-token-stream.git", 211 | "reference": "5220af2a7929aa35cf663d97c89ad3d50cf5fa3e" 212 | }, 213 | "dist": { 214 | "type": "zip", 215 | "url": "https://api.github.com/repos/sebastianbergmann/php-token-stream/zipball/5220af2a7929aa35cf663d97c89ad3d50cf5fa3e", 216 | "reference": "5220af2a7929aa35cf663d97c89ad3d50cf5fa3e", 217 | "shasum": "" 218 | }, 219 | "require": { 220 | "ext-tokenizer": "*", 221 | "php": ">=5.3.3" 222 | }, 223 | "type": "library", 224 | "extra": { 225 | "branch-alias": { 226 | "dev-master": "1.2-dev" 227 | } 228 | }, 229 | "autoload": { 230 | "classmap": [ 231 | "PHP/" 232 | ] 233 | }, 234 | "notification-url": "https://packagist.org/downloads/", 235 | "include-path": [ 236 | "" 237 | ], 238 | "license": [ 239 | "BSD-3-Clause" 240 | ], 241 | "authors": [ 242 | { 243 | "name": "Sebastian Bergmann", 244 | "email": "sb@sebastian-bergmann.de", 245 | "role": "lead" 246 | } 247 | ], 248 | "description": "Wrapper around PHP's tokenizer extension.", 249 | "homepage": "https://github.com/sebastianbergmann/php-token-stream/", 250 | "keywords": [ 251 | "tokenizer" 252 | ], 253 | "time": "2013-09-13 04:58:23" 254 | }, 255 | { 256 | "name": "phpunit/phpunit", 257 | "version": "3.7.29", 258 | "source": { 259 | "type": "git", 260 | "url": "https://github.com/sebastianbergmann/phpunit.git", 261 | "reference": "faeb2d9f15dc83830d2db5e4c67acf1d68c9b5ac" 262 | }, 263 | "dist": { 264 | "type": "zip", 265 | "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/faeb2d9f15dc83830d2db5e4c67acf1d68c9b5ac", 266 | "reference": "faeb2d9f15dc83830d2db5e4c67acf1d68c9b5ac", 267 | "shasum": "" 268 | }, 269 | "require": { 270 | "ext-dom": "*", 271 | "ext-pcre": "*", 272 | "ext-reflection": "*", 273 | "ext-spl": "*", 274 | "php": ">=5.3.3", 275 | "phpunit/php-code-coverage": "~1.2.1", 276 | "phpunit/php-file-iterator": ">=1.3.1", 277 | "phpunit/php-text-template": ">=1.1.1", 278 | "phpunit/php-timer": ">=1.0.4", 279 | "phpunit/phpunit-mock-objects": "~1.2.0", 280 | "symfony/yaml": "~2.0" 281 | }, 282 | "require-dev": { 283 | "pear-pear/pear": "1.9.4" 284 | }, 285 | "suggest": { 286 | "ext-json": "*", 287 | "ext-simplexml": "*", 288 | "ext-tokenizer": "*", 289 | "phpunit/php-invoker": ">=1.1.0,<1.2.0" 290 | }, 291 | "bin": [ 292 | "composer/bin/phpunit" 293 | ], 294 | "type": "library", 295 | "extra": { 296 | "branch-alias": { 297 | "dev-master": "3.7.x-dev" 298 | } 299 | }, 300 | "autoload": { 301 | "classmap": [ 302 | "PHPUnit/" 303 | ] 304 | }, 305 | "notification-url": "https://packagist.org/downloads/", 306 | "include-path": [ 307 | "", 308 | "../../symfony/yaml/" 309 | ], 310 | "license": [ 311 | "BSD-3-Clause" 312 | ], 313 | "authors": [ 314 | { 315 | "name": "Sebastian Bergmann", 316 | "email": "sebastian@phpunit.de", 317 | "role": "lead" 318 | } 319 | ], 320 | "description": "The PHP Unit Testing framework.", 321 | "homepage": "http://www.phpunit.de/", 322 | "keywords": [ 323 | "phpunit", 324 | "testing", 325 | "xunit" 326 | ], 327 | "time": "2014-01-15 06:46:38" 328 | }, 329 | { 330 | "name": "phpunit/phpunit-mock-objects", 331 | "version": "1.2.3", 332 | "source": { 333 | "type": "git", 334 | "url": "https://github.com/sebastianbergmann/phpunit-mock-objects.git", 335 | "reference": "5794e3c5c5ba0fb037b11d8151add2a07fa82875" 336 | }, 337 | "dist": { 338 | "type": "zip", 339 | "url": "https://api.github.com/repos/sebastianbergmann/phpunit-mock-objects/zipball/5794e3c5c5ba0fb037b11d8151add2a07fa82875", 340 | "reference": "5794e3c5c5ba0fb037b11d8151add2a07fa82875", 341 | "shasum": "" 342 | }, 343 | "require": { 344 | "php": ">=5.3.3", 345 | "phpunit/php-text-template": ">=1.1.1@stable" 346 | }, 347 | "suggest": { 348 | "ext-soap": "*" 349 | }, 350 | "type": "library", 351 | "autoload": { 352 | "classmap": [ 353 | "PHPUnit/" 354 | ] 355 | }, 356 | "notification-url": "https://packagist.org/downloads/", 357 | "include-path": [ 358 | "" 359 | ], 360 | "license": [ 361 | "BSD-3-Clause" 362 | ], 363 | "authors": [ 364 | { 365 | "name": "Sebastian Bergmann", 366 | "email": "sb@sebastian-bergmann.de", 367 | "role": "lead" 368 | } 369 | ], 370 | "description": "Mock Object library for PHPUnit", 371 | "homepage": "https://github.com/sebastianbergmann/phpunit-mock-objects/", 372 | "keywords": [ 373 | "mock", 374 | "xunit" 375 | ], 376 | "time": "2013-01-13 10:24:48" 377 | }, 378 | { 379 | "name": "symfony/yaml", 380 | "version": "v2.4.1", 381 | "target-dir": "Symfony/Component/Yaml", 382 | "source": { 383 | "type": "git", 384 | "url": "https://github.com/symfony/Yaml.git", 385 | "reference": "4e1a237fc48145fae114b96458d799746ad89aa0" 386 | }, 387 | "dist": { 388 | "type": "zip", 389 | "url": "https://api.github.com/repos/symfony/Yaml/zipball/4e1a237fc48145fae114b96458d799746ad89aa0", 390 | "reference": "4e1a237fc48145fae114b96458d799746ad89aa0", 391 | "shasum": "" 392 | }, 393 | "require": { 394 | "php": ">=5.3.3" 395 | }, 396 | "type": "library", 397 | "extra": { 398 | "branch-alias": { 399 | "dev-master": "2.4-dev" 400 | } 401 | }, 402 | "autoload": { 403 | "psr-0": { 404 | "Symfony\\Component\\Yaml\\": "" 405 | } 406 | }, 407 | "notification-url": "https://packagist.org/downloads/", 408 | "license": [ 409 | "MIT" 410 | ], 411 | "authors": [ 412 | { 413 | "name": "Fabien Potencier", 414 | "email": "fabien@symfony.com" 415 | }, 416 | { 417 | "name": "Symfony Community", 418 | "homepage": "http://symfony.com/contributors" 419 | } 420 | ], 421 | "description": "Symfony Yaml Component", 422 | "homepage": "http://symfony.com", 423 | "time": "2013-12-28 08:12:03" 424 | } 425 | ], 426 | "aliases": [ 427 | 428 | ], 429 | "minimum-stability": "stable", 430 | "stability-flags": [ 431 | 432 | ], 433 | "platform": [ 434 | 435 | ], 436 | "platform-dev": [ 437 | 438 | ] 439 | } 440 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 15 | 16 | 17 | 18 | tests 19 | 20 | 21 | 22 | 23 | 24 | src/ 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/Fieg/Bayes/Classifier.php: -------------------------------------------------------------------------------- 1 | 8 | * @copyright Webcreate (http://webcreate.nl) 9 | */ 10 | 11 | namespace Fieg\Bayes; 12 | 13 | class Classifier 14 | { 15 | /** 16 | * @var TokenizerInterface 17 | */ 18 | protected $tokenizer; 19 | 20 | /** 21 | * @var array 22 | */ 23 | protected $labels = array(); 24 | 25 | /** 26 | * @var array 27 | */ 28 | protected $docs = array(); 29 | 30 | /** 31 | * @var array 32 | */ 33 | protected $tokens = array(); 34 | 35 | /** 36 | * @var array 37 | */ 38 | protected $data = array(); 39 | 40 | /** 41 | * Constructor. 42 | * 43 | * @param TokenizerInterface $tokenizer 44 | */ 45 | public function __construct(TokenizerInterface $tokenizer) 46 | { 47 | $this->tokenizer = $tokenizer; 48 | } 49 | 50 | /** 51 | * Trains the classifier one text+label combination a time 52 | * 53 | * @param string $label 54 | * @param string $text 55 | */ 56 | public function train($label, $text) 57 | { 58 | $tokens = $this->tokenizer->tokenize($text); 59 | 60 | if (!isset($this->labels[$label])) { 61 | $this->labels[$label] = 0; 62 | $this->data[$label] = []; 63 | $this->docs[$label] = 0; 64 | } 65 | 66 | foreach ($tokens as $token) { 67 | if (!isset($this->tokens[$token])) { 68 | $this->tokens[$token] = 0; 69 | } 70 | if (!isset($this->data[$label][$token])) { 71 | $this->data[$label][$token] = 0; 72 | } 73 | 74 | $this->labels[$label]++; 75 | $this->tokens[$token]++; 76 | $this->data[$label][$token]++; 77 | } 78 | 79 | $this->docs[$label]++; 80 | } 81 | 82 | /** 83 | * Classifies a text and returns the probability (score) per label 84 | * 85 | * @param string $text 86 | * @return array 87 | */ 88 | public function classify($text) 89 | { 90 | $totalDocCount = array_sum($this->docs); 91 | 92 | $tokens = $this->tokenizer->tokenize($text); 93 | 94 | $scores = array(); 95 | 96 | foreach ($this->labels as $label => $labelCount) { 97 | $logSum = 0; 98 | 99 | $docCount = $this->docs[$label]; 100 | $inversedDocCount = $totalDocCount - $docCount; 101 | 102 | if (0 === $inversedDocCount) { 103 | continue; 104 | } 105 | 106 | foreach ($tokens as $token) { 107 | $totalTokenCount = isset($this->tokens[$token]) ? $this->tokens[$token] : 0; 108 | 109 | if (0 === $totalTokenCount) { 110 | continue; 111 | } 112 | 113 | $tokenCount = isset($this->data[$label][$token]) ? $this->data[$label][$token] : 0; 114 | $inversedTokenCount = $this->inversedTokenCount($token, $label); 115 | 116 | $tokenProbabilityPositive = $tokenCount / $docCount; 117 | $tokenProbabilityNegative = $inversedTokenCount / $inversedDocCount; 118 | 119 | $probability = $tokenProbabilityPositive / ($tokenProbabilityPositive + $tokenProbabilityNegative); 120 | 121 | $probability = ((1 * 0.5) + ($totalTokenCount * $probability)) / (1 + $totalTokenCount); 122 | 123 | if (0 === $probability) { 124 | $probability = 0.01; 125 | } elseif (1 === $probability) { 126 | $probability = 0.99; 127 | } 128 | 129 | $logSum += log(1 - $probability) - log($probability); 130 | } 131 | 132 | $scores[$label] = 1 / (1 + exp($logSum)); 133 | } 134 | 135 | arsort($scores, SORT_NUMERIC); 136 | 137 | return $scores; 138 | } 139 | 140 | /** 141 | * Resets the classifier 142 | */ 143 | public function reset() 144 | { 145 | $this->labels = array(); 146 | $this->docs = array(); 147 | $this->tokens = array(); 148 | $this->data = array(); 149 | } 150 | 151 | /** 152 | * @param string $token 153 | * @param string $label 154 | * @return int 155 | */ 156 | protected function inversedTokenCount($token, $label) 157 | { 158 | $data = $this->data; 159 | 160 | $totalTokenCount = $this->tokens[$token]; 161 | 162 | $totalLabelTokenCount = isset($data[$label][$token]) ? $data[$label][$token] : 0; 163 | 164 | $retval = $totalTokenCount - $totalLabelTokenCount; 165 | 166 | return $retval; 167 | } 168 | 169 | /** 170 | * @param string $label 171 | * @return number 172 | */ 173 | protected function inversedDocCount($label) 174 | { 175 | $data = $this->docs; 176 | 177 | unset($data[$label]); 178 | 179 | return array_sum($data); 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/Fieg/Bayes/Tokenizer/WhitespaceAndPunctuationTokenizer.php: -------------------------------------------------------------------------------- 1 | 5 | * @copyright Webcreate (http://webcreate.nl) 6 | */ 7 | 8 | namespace Fieg\Bayes\Tokenizer; 9 | 10 | use Fieg\Bayes\TokenizerInterface; 11 | 12 | class WhitespaceAndPunctuationTokenizer implements TokenizerInterface 13 | { 14 | protected $pattern = "/[ ,.?!-:;\\n\\r\\t…_]/u"; 15 | 16 | public function tokenize($string) 17 | { 18 | $retval = preg_split($this->pattern, mb_strtolower($string, 'utf8')); 19 | $retval = array_filter($retval, 'trim'); 20 | $retval = array_values($retval); 21 | 22 | return $retval; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/Fieg/Bayes/TokenizerInterface.php: -------------------------------------------------------------------------------- 1 | 5 | * @copyright Webcreate (http://webcreate.nl) 6 | */ 7 | 8 | namespace Fieg\Bayes; 9 | 10 | interface TokenizerInterface 11 | { 12 | public function tokenize($string); 13 | } 14 | -------------------------------------------------------------------------------- /tests/Fieg/Bayes/ClassifierTest.php: -------------------------------------------------------------------------------- 1 | trainingData(); 16 | foreach ($data as $row) { 17 | list($label, $text) = $row; 18 | 19 | $classifier->train($label, $text); 20 | } 21 | 22 | $result = $classifier->classify($string); 23 | 24 | reset($result); 25 | 26 | $topMatch = key($result); 27 | 28 | $this->assertEquals($expectedLabel, $topMatch); 29 | } 30 | 31 | public function classifyDataProvider() 32 | { 33 | return array( 34 | array('en', 'scientific problems and the need'), 35 | array('fr', 'D\'icône de la cause des femmes à celui de renégate'), 36 | array('es', 'Un importante punto de inflexión en la historia de la ciencia filosófica primitiva'), 37 | ); 38 | } 39 | 40 | public function trainingData() 41 | { 42 | return array( 43 | // fr 44 | array( 45 | 'fr', 46 | "L'Italie a été gouvernée pendant un an par un homme qui n'avait pas été élu par le peuple. Dès la nomination de Mario Monti au poste de président du conseil, fin 2011, j'avais dit :Attention, c'est prendre un risque politique majeur. Par leur vote, les Italiens n'ont pas seulement adressé un message à leurs élites nationales, ils ont voulu dire : Nous, le peuple, nous voulons garder la maîtrise de notre destin. Et ce message pourrait être envoyé par n'importe quel peuple européen, y compris le peuple français.", 47 | ), 48 | array( 49 | 'fr', 50 | "Il en faut peu, parfois, pour passer du statut d'icône de la cause des femmes à celui de renégate. Lorsqu'elle a été nommée à la tête de Yahoo!, le 26 juillet 2012, Marissa Mayer était vue comme un modèle. Elle montrait qu'il était possible de perforer le fameux plafond de verre, même dans les bastions les mieux gardés du machisme (M du 28 juillet 2012). A 37 ans, cette brillante diplômée de Stanford, formée chez Google, faisait figure d'exemple dans la Silicon Valley californienne, où moins de 5 % des postes de direction sont occupés par des femmes. En quelques mois, le symbole a beaucoup perdu de sa puissance.", 51 | ), 52 | array( 53 | 'fr', 54 | "Premier intervenant de taille à SXSW 2013, Bre Pettis, PDG de la société Makerbot, spécialisée dans la vente d'imprimantes 3D, a posé une question toute simple, avant de dévoiler un nouveau produit qui l'est un peu moins. Voulez-vous rejoindre notre environnement 3D ?, a-t-il demandé à la foule qui débordait de l'Exhibit Hall 5 du Convention Center.", 55 | ), 56 | array( 57 | 'fr', 58 | "Des milliers de manifestants ont défilé samedi 9 mars à Tokyo pour exiger l'abandon rapide de l'énergie nucléaire au Japon, près de deux ans jour pour jour après le début de la catastrophe de Fukushima.", 59 | ), 60 | 61 | // es 62 | array( 63 | 'es', 64 | "El ex presidente sudafricano, Nelson Mandela, ha sido hospitalizado la tarde del sábado, según confirmó un hospital de Pretoria a CNN. Al parecer se trata de un chequeo médico que ya estaba previsto, relacionado con su avanzada edad, según explicó el portavoz de la presidencia Sudafricana Mac Maharaj.", 65 | ), 66 | array( 67 | 'es', 68 | 'Guerras continuas y otros problemas llevaron finalmente a un estado de disminución. Las invasiones napoleónicas de España llevaron al caos, lo que provocó los movimientos de independencia que destrozaron la mayor parte del imperio y abandonaron el país políticamente inestable', 69 | ), 70 | array( 71 | 'es', 72 | 'En el uso moderno, la "ciencia" a menudo se refiere a una forma de perseguir el conocimiento, no sólo el conocimiento mismo. También se suele restringirse a las ramas de estudio que tratan de explicar los fenómenos del universo material. [6] En los siglos 17 y 18 científicos cada vez más solicitados para formular el conocimiento en términos de las leyes de la naturaleza, tales como las leyes del movimiento de Newton. Y en el transcurso del siglo 19, la palabra "ciencia" se hizo cada vez más asociada con el método científico en sí, como una manera disciplinada para estudiar el mundo natural, incluyendo la física, la química, la geología y la biología', 73 | ), 74 | array( 75 | 'es', 76 | 'Un importante punto de inflexión en la historia de la ciencia filosófica primitiva fue el intento controversial pero exitoso por Sócrates para aplicar la filosofía al estudio de los seres humanos, incluyendo la naturaleza humana, la naturaleza de las comunidades políticas, y el conocimiento humano en sí. Criticó el tipo más antiguo de estudio de la física como demasiado puramente especulativo y carente de autocrítica. Se mostró especialmente preocupado de que algunos de los primeros físicos trataron la naturaleza como si pudiera ser asumido que no tenía orden inteligente, explicando las cosas sólo en términos de movimiento y la materia.', 77 | ), 78 | 79 | // en 80 | array( 81 | 'en', 82 | "Other possible reasons have been proposed for the lengthy research in the progress of strong AI. The intricacy of scientific problems and the need to fully understand the human brain through psychology and neurophysiology have limited many researchers from emulating the function of the human brain into a computer hardware.", 83 | ), 84 | array( 85 | 'en', 86 | "There have been many AI researchers that debate over the idea whether machines should be created with emotions. There are no emotions in typical models of AI and some researchers say programming emotions into machines allows them to have a mind of their own.", 87 | ), 88 | ); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /tests/Fieg/Bayes/Tokenizer/WhitespaceAndPunctuationTokenizerTest.php: -------------------------------------------------------------------------------- 1 | tokenize($string); 15 | 16 | $this->assertEquals($expected, $result); 17 | } 18 | 19 | public function tokenizeDataProvider() 20 | { 21 | return array( 22 | array('Hello, how are you?', array('hello', 'how', 'are', 'you')), 23 | array("Hello\n\nHow are you?!", array('hello', 'how', 'are', 'you')), 24 | array("Un importante punto de inflexión en la historia de la ciencia filosófica primitiva", array('un','importante','punto','de','inflexión','en','la','historia','de','la','ciencia','filosófica','primitiva')), 25 | ); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 | 5 | * @copyright Webcreate (http://webcreate.nl) 6 | */ 7 | 8 | $loader = require 'vendor/autoload.php'; 9 | --------------------------------------------------------------------------------