├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── composer.json ├── composer.lock ├── examples ├── basic.php ├── basic_svm.php ├── bayes.php ├── converter.php ├── language.php └── svm.php └── src └── Camspiers └── StatisticalClassifier ├── Classifier ├── Classifier.php ├── ClassifierInterface.php ├── ComplementNaiveBayes.php └── SVM.php ├── Config └── DataSourceConfiguration.php ├── DataSource ├── CSV.php ├── Closure.php ├── Converter.php ├── DataArray.php ├── DataSourceInterface.php ├── Directory.php ├── Grouped.php ├── Json.php ├── PDOQuery.php └── Serialized.php ├── Model ├── CachedModel.php ├── Model.php ├── ModelInterface.php ├── SVMCachedModel.php └── SVMModel.php ├── Normalizer ├── Document │ ├── Lowercase.php │ └── NormalizerInterface.php └── Token │ ├── Grouped.php │ ├── NormalizerInterface.php │ ├── PhpStemmer.php │ ├── Porter.php │ └── Stopword.php ├── Tokenizer ├── TokenizerInterface.php └── Word.php └── Transform ├── Complement.php ├── DocumentCount.php ├── DocumentLength.php ├── DocumentTokenCounts.php ├── TFIDF.php ├── TokenAppearanceCount.php ├── TokenCountByDocument.php ├── TokenPreparation.php ├── TokensByCategory.php └── Weight.php /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/ export-ignore 2 | resources/ export-ignore 3 | phpunit.xml.dist export-ignore 4 | .travis.yml export-ignore 5 | .scrutinizer.yml export-ignore 6 | .php_cs export-ignore 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /cache/ 2 | /vendor 3 | /.settings 4 | /.idea -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2013 Cam Spiers 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this 
software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PHP Classifier 2 | 3 | [![Build Status](https://travis-ci.org/camspiers/statistical-classifier.png?branch=master)](https://travis-ci.org/camspiers/statistical-classifier) 4 | [![Latest Stable Version](https://poser.pugx.org/camspiers/statistical-classifier/v/stable.png)](https://packagist.org/packages/camspiers/statistical-classifier) 5 | 6 | PHP Classifier uses [semantic versioning](http://semver.org/), it is currently at major version 0, so the public API should not be considered stable. 7 | 8 | # What is it? 9 | 10 | PHP Classifier is a text classification library with a focus on reuse, customizability and performance. 11 | Classifiers can be used for many purposes, but are particularly useful in detecting spam. 
12 | 13 | ## Features 14 | 15 | * Complement Naive Bayes Classifier 16 | * SVM (libsvm) Classifier 17 | * Highly customizable (easily modify or build your own classifier) 18 | * Command-line interface via separate library (phar archive) 19 | * Multiple **data import types** to get your data into the classifier (Directory of files, Database queries, Json, Serialized arrays) 20 | * Multiple types of **model caching** 21 | * Compatible with HipHop VM 22 | 23 | # Installation 24 | 25 | ```bash 26 | $ composer require camspiers/statistical-classifier 27 | ``` 28 | 29 | ## SVM Support 30 | 31 | For SVM Support both libsvm and php-svm are required. For installation instructions refer to [php-svm](https://github.com/ianbarber/php-svm). 32 | 33 | # Usage 34 | 35 | ## Non-cached Naive Bayes 36 | 37 | ```php 38 | use Camspiers\StatisticalClassifier\Classifier\ComplementNaiveBayes; 39 | use Camspiers\StatisticalClassifier\DataSource\DataArray; 40 | 41 | $source = new DataArray(); 42 | $source->addDocument('spam', 'Some spam document'); 43 | $source->addDocument('spam', 'Another spam document'); 44 | $source->addDocument('ham', 'Some ham document'); 45 | $source->addDocument('ham', 'Another ham document'); 46 | 47 | $classifier = new ComplementNaiveBayes($source); 48 | $classifier->is('ham', 'Some ham document'); // bool(true) 49 | $classifier->classify('Some ham document'); // string "ham" 50 | ``` 51 | 52 | ## Non-cached SVM 53 | 54 | ```php 55 | use Camspiers\StatisticalClassifier\Classifier\SVM; 56 | use Camspiers\StatisticalClassifier\DataSource\DataArray; 57 | 58 | $source = new DataArray(); 59 | $source->addDocument('spam', 'Some spam document'); 60 | $source->addDocument('spam', 'Another spam document'); 61 | $source->addDocument('ham', 'Some ham document'); 62 | $source->addDocument('ham', 'Another ham document'); 63 | 64 | $classifier = new SVM($source); 65 | $classifier->is('ham', 'Some ham document'); // bool(true) 66 | $classifier->classify('Some ham document'); // 
string "ham" 67 | ``` 68 | 69 | # Caching models 70 | 71 | Caching models requires [maximebf/CacheCache](https://github.com/maximebf/CacheCache) which can be installed via packagist. Additional caching systems can be easily integrated. 72 | 73 | ## Cached Naive Bayes 74 | 75 | ```php 76 | use Camspiers\StatisticalClassifier\Classifier\ComplementNaiveBayes; 77 | use Camspiers\StatisticalClassifier\Model\CachedModel; 78 | use Camspiers\StatisticalClassifier\DataSource\DataArray; 79 | 80 | $source = new DataArray(); 81 | $source->addDocument('spam', 'Some spam document'); 82 | $source->addDocument('spam', 'Another spam document'); 83 | $source->addDocument('ham', 'Some ham document'); 84 | $source->addDocument('ham', 'Another ham document'); 85 | 86 | $model = new CachedModel( 87 | 'mycachename', 88 | new CacheCache\Cache( 89 | new CacheCache\Backends\File( 90 | array( 91 | 'dir' => __DIR__ 92 | ) 93 | ) 94 | ) 95 | ); 96 | 97 | $classifier = new ComplementNaiveBayes($source, $model); 98 | $classifier->is('ham', 'Some ham document'); // bool(true) 99 | $classifier->classify('Some ham document'); // string "ham" 100 | ``` 101 | 102 | ## Cached SVM 103 | 104 | ```php 105 | use Camspiers\StatisticalClassifier\Classifier\SVM; 106 | use Camspiers\StatisticalClassifier\Model\SVMCachedModel; 107 | use Camspiers\StatisticalClassifier\DataSource\DataArray; 108 | 109 | $source = new DataArray(); 110 | $source->addDocument('spam', 'Some spam document'); 111 | $source->addDocument('spam', 'Another spam document'); 112 | $source->addDocument('ham', 'Some ham document'); 113 | $source->addDocument('ham', 'Another ham document'); 114 | 115 | $model = new SVMCachedModel( 116 | __DIR__ . 
'/model.svm', 117 | new CacheCache\Cache( 118 | new CacheCache\Backends\File( 119 | array( 120 | 'dir' => __DIR__ 121 | ) 122 | ) 123 | ) 124 | ); 125 | 126 | $classifier = new SVM($source, $model); 127 | $classifier->is('ham', 'Some ham document'); // bool(true) 128 | $classifier->classify('Some ham document'); // string "ham" 129 | ``` 130 | 131 | # Unit testing 132 | 133 | statistical-classifier/ $ composer install --dev 134 | statistical-classifier/ $ phpunit 135 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "camspiers/statistical-classifier", 3 | "description": "A PHP implementation of Complement Naive Bayes and SVM statistical classifiers, including a structure for building other classifier, multiple data sources and multiple caching backends", 4 | "keywords": ["classifier", "svm", "naive", "bayes"], 5 | "homepage": "http://php-classifier.com/", 6 | "type": "library", 7 | "license": "MIT", 8 | "authors": [ 9 | { 10 | "name": "Cam Spiers", 11 | "email": "camspiers@gmail.com" 12 | } 13 | ], 14 | "require": { 15 | "php": ">=5.3.3", 16 | "symfony/config": "~2.2", 17 | "symfony/options-resolver": "~2.2" 18 | }, 19 | "require-dev": { 20 | "phpunit/phpunit": "~3.7", 21 | "mikey179/vfsStream": "~1.2", 22 | "maximebf/cachecache": "~1.0" 23 | }, 24 | "suggest": { 25 | "camspiers/porter-stemmer": "Using a stemmer can help with language based classification", 26 | "maximebf/cachecache": "Using caching will help improve performance on large datasets" 27 | }, 28 | "autoload": { 29 | "psr-0": { 30 | "Camspiers\\StatisticalClassifier": "src/" 31 | } 32 | }, 33 | "extra": { 34 | "branch-alias": { 35 | "dev-master": "0.8.x-dev" 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /composer.lock: -------------------------------------------------------------------------------- 
1 | { 2 | "_readme": [ 3 | "This file locks the dependencies of your project to a known state", 4 | "Read more about it at http://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file" 5 | ], 6 | "hash": "285839f1f0ee6773158176e48410a929", 7 | "packages": [ 8 | { 9 | "name": "symfony/config", 10 | "version": "v2.4.0", 11 | "target-dir": "Symfony/Component/Config", 12 | "source": { 13 | "type": "git", 14 | "url": "https://github.com/symfony/Config.git", 15 | "reference": "16068f76c0af74968f3ad8fcec3eb90df1fde394" 16 | }, 17 | "dist": { 18 | "type": "zip", 19 | "url": "https://api.github.com/repos/symfony/Config/zipball/16068f76c0af74968f3ad8fcec3eb90df1fde394", 20 | "reference": "16068f76c0af74968f3ad8fcec3eb90df1fde394", 21 | "shasum": "" 22 | }, 23 | "require": { 24 | "php": ">=5.3.3", 25 | "symfony/filesystem": "~2.3" 26 | }, 27 | "type": "library", 28 | "extra": { 29 | "branch-alias": { 30 | "dev-master": "2.4-dev" 31 | } 32 | }, 33 | "autoload": { 34 | "psr-0": { 35 | "Symfony\\Component\\Config\\": "" 36 | } 37 | }, 38 | "notification-url": "https://packagist.org/downloads/", 39 | "license": [ 40 | "MIT" 41 | ], 42 | "authors": [ 43 | { 44 | "name": "Fabien Potencier", 45 | "email": "fabien@symfony.com" 46 | }, 47 | { 48 | "name": "Symfony Community", 49 | "homepage": "http://symfony.com/contributors" 50 | } 51 | ], 52 | "description": "Symfony Config Component", 53 | "homepage": "http://symfony.com", 54 | "time": "2013-11-26 16:40:27" 55 | }, 56 | { 57 | "name": "symfony/filesystem", 58 | "version": "v2.4.0", 59 | "target-dir": "Symfony/Component/Filesystem", 60 | "source": { 61 | "type": "git", 62 | "url": "https://github.com/symfony/Filesystem.git", 63 | "reference": "79acd777762e81d0f3414ca25a602deeeb48240d" 64 | }, 65 | "dist": { 66 | "type": "zip", 67 | "url": "https://api.github.com/repos/symfony/Filesystem/zipball/79acd777762e81d0f3414ca25a602deeeb48240d", 68 | "reference": "79acd777762e81d0f3414ca25a602deeeb48240d", 69 | "shasum": "" 70 | 
}, 71 | "require": { 72 | "php": ">=5.3.3" 73 | }, 74 | "type": "library", 75 | "extra": { 76 | "branch-alias": { 77 | "dev-master": "2.4-dev" 78 | } 79 | }, 80 | "autoload": { 81 | "psr-0": { 82 | "Symfony\\Component\\Filesystem\\": "" 83 | } 84 | }, 85 | "notification-url": "https://packagist.org/downloads/", 86 | "license": [ 87 | "MIT" 88 | ], 89 | "authors": [ 90 | { 91 | "name": "Fabien Potencier", 92 | "email": "fabien@symfony.com" 93 | }, 94 | { 95 | "name": "Symfony Community", 96 | "homepage": "http://symfony.com/contributors" 97 | } 98 | ], 99 | "description": "Symfony Filesystem Component", 100 | "homepage": "http://symfony.com", 101 | "time": "2013-11-16 15:13:54" 102 | }, 103 | { 104 | "name": "symfony/options-resolver", 105 | "version": "v2.4.0", 106 | "target-dir": "Symfony/Component/OptionsResolver", 107 | "source": { 108 | "type": "git", 109 | "url": "https://github.com/symfony/OptionsResolver.git", 110 | "reference": "2b7145803c970e62ac521b2b0984b4bb67209561" 111 | }, 112 | "dist": { 113 | "type": "zip", 114 | "url": "https://api.github.com/repos/symfony/OptionsResolver/zipball/2b7145803c970e62ac521b2b0984b4bb67209561", 115 | "reference": "2b7145803c970e62ac521b2b0984b4bb67209561", 116 | "shasum": "" 117 | }, 118 | "require": { 119 | "php": ">=5.3.3" 120 | }, 121 | "type": "library", 122 | "extra": { 123 | "branch-alias": { 124 | "dev-master": "2.4-dev" 125 | } 126 | }, 127 | "autoload": { 128 | "psr-0": { 129 | "Symfony\\Component\\OptionsResolver\\": "" 130 | } 131 | }, 132 | "notification-url": "https://packagist.org/downloads/", 133 | "license": [ 134 | "MIT" 135 | ], 136 | "authors": [ 137 | { 138 | "name": "Fabien Potencier", 139 | "email": "fabien@symfony.com" 140 | }, 141 | { 142 | "name": "Symfony Community", 143 | "homepage": "http://symfony.com/contributors" 144 | } 145 | ], 146 | "description": "Symfony OptionsResolver Component", 147 | "homepage": "http://symfony.com", 148 | "keywords": [ 149 | "config", 150 | "configuration", 151 | 
"options" 152 | ], 153 | "time": "2013-09-19 09:47:34" 154 | } 155 | ], 156 | "packages-dev": [ 157 | { 158 | "name": "maximebf/cachecache", 159 | "version": "1.0.2", 160 | "source": { 161 | "type": "git", 162 | "url": "https://github.com/maximebf/CacheCache.git", 163 | "reference": "f89ff2f60e9c5ef059414ea4775637f18c034bdc" 164 | }, 165 | "dist": { 166 | "type": "zip", 167 | "url": "https://api.github.com/repos/maximebf/CacheCache/zipball/f89ff2f60e9c5ef059414ea4775637f18c034bdc", 168 | "reference": "f89ff2f60e9c5ef059414ea4775637f18c034bdc", 169 | "shasum": "" 170 | }, 171 | "require": { 172 | "php": ">=5.3.0" 173 | }, 174 | "type": "library", 175 | "autoload": { 176 | "psr-0": { 177 | "CacheCache": "src/" 178 | } 179 | }, 180 | "notification-url": "https://packagist.org/downloads/", 181 | "license": [ 182 | "MIT" 183 | ], 184 | "authors": [ 185 | { 186 | "name": "Maxime Bouroumeau-Fuseau", 187 | "email": "maxime.bouroumeau@gmail.com", 188 | "homepage": "http://maximebf.com" 189 | } 190 | ], 191 | "description": "Caching library for PHP5.3", 192 | "homepage": "https://github.com/maximebf/CacheCache", 193 | "keywords": [ 194 | "cache" 195 | ], 196 | "time": "2013-08-12 09:30:49" 197 | }, 198 | { 199 | "name": "mikey179/vfsStream", 200 | "version": "v1.2.0", 201 | "source": { 202 | "type": "git", 203 | "url": "https://github.com/mikey179/vfsStream.git", 204 | "reference": "v1.2.0" 205 | }, 206 | "dist": { 207 | "type": "zip", 208 | "url": "https://api.github.com/repos/mikey179/vfsStream/zipball/v1.2.0", 209 | "reference": "v1.2.0", 210 | "shasum": "" 211 | }, 212 | "require": { 213 | "php": ">=5.3.0" 214 | }, 215 | "type": "library", 216 | "autoload": { 217 | "psr-0": { 218 | "org\\bovigo\\vfs\\": "src/main/php" 219 | } 220 | }, 221 | "notification-url": "https://packagist.org/downloads/", 222 | "license": [ 223 | "BSD" 224 | ], 225 | "homepage": "http://vfs.bovigo.org/", 226 | "time": "2013-04-01 10:41:02" 227 | }, 228 | { 229 | "name": 
"phpunit/php-code-coverage", 230 | "version": "1.2.13", 231 | "source": { 232 | "type": "git", 233 | "url": "https://github.com/sebastianbergmann/php-code-coverage.git", 234 | "reference": "466e7cd2554b4e264c9e3f31216d25ac0e5f3d94" 235 | }, 236 | "dist": { 237 | "type": "zip", 238 | "url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/466e7cd2554b4e264c9e3f31216d25ac0e5f3d94", 239 | "reference": "466e7cd2554b4e264c9e3f31216d25ac0e5f3d94", 240 | "shasum": "" 241 | }, 242 | "require": { 243 | "php": ">=5.3.3", 244 | "phpunit/php-file-iterator": ">=1.3.0@stable", 245 | "phpunit/php-text-template": ">=1.1.1@stable", 246 | "phpunit/php-token-stream": ">=1.1.3@stable" 247 | }, 248 | "require-dev": { 249 | "phpunit/phpunit": "3.7.*@dev" 250 | }, 251 | "suggest": { 252 | "ext-dom": "*", 253 | "ext-xdebug": ">=2.0.5" 254 | }, 255 | "type": "library", 256 | "extra": { 257 | "branch-alias": { 258 | "dev-master": "1.2.x-dev" 259 | } 260 | }, 261 | "autoload": { 262 | "classmap": [ 263 | "PHP/" 264 | ] 265 | }, 266 | "notification-url": "https://packagist.org/downloads/", 267 | "include-path": [ 268 | "" 269 | ], 270 | "license": [ 271 | "BSD-3-Clause" 272 | ], 273 | "authors": [ 274 | { 275 | "name": "Sebastian Bergmann", 276 | "email": "sb@sebastian-bergmann.de", 277 | "role": "lead" 278 | } 279 | ], 280 | "description": "Library that provides collection, processing, and rendering functionality for PHP code coverage information.", 281 | "homepage": "https://github.com/sebastianbergmann/php-code-coverage", 282 | "keywords": [ 283 | "coverage", 284 | "testing", 285 | "xunit" 286 | ], 287 | "time": "2013-09-10 08:14:32" 288 | }, 289 | { 290 | "name": "phpunit/php-file-iterator", 291 | "version": "1.3.4", 292 | "source": { 293 | "type": "git", 294 | "url": "https://github.com/sebastianbergmann/php-file-iterator.git", 295 | "reference": "acd690379117b042d1c8af1fafd61bde001bf6bb" 296 | }, 297 | "dist": { 298 | "type": "zip", 299 | "url": 
"https://api.github.com/repos/sebastianbergmann/php-file-iterator/zipball/acd690379117b042d1c8af1fafd61bde001bf6bb", 300 | "reference": "acd690379117b042d1c8af1fafd61bde001bf6bb", 301 | "shasum": "" 302 | }, 303 | "require": { 304 | "php": ">=5.3.3" 305 | }, 306 | "type": "library", 307 | "autoload": { 308 | "classmap": [ 309 | "File/" 310 | ] 311 | }, 312 | "notification-url": "https://packagist.org/downloads/", 313 | "include-path": [ 314 | "" 315 | ], 316 | "license": [ 317 | "BSD-3-Clause" 318 | ], 319 | "authors": [ 320 | { 321 | "name": "Sebastian Bergmann", 322 | "email": "sb@sebastian-bergmann.de", 323 | "role": "lead" 324 | } 325 | ], 326 | "description": "FilterIterator implementation that filters files based on a list of suffixes.", 327 | "homepage": "https://github.com/sebastianbergmann/php-file-iterator/", 328 | "keywords": [ 329 | "filesystem", 330 | "iterator" 331 | ], 332 | "time": "2013-10-10 15:34:57" 333 | }, 334 | { 335 | "name": "phpunit/php-text-template", 336 | "version": "1.1.4", 337 | "source": { 338 | "type": "git", 339 | "url": "git://github.com/sebastianbergmann/php-text-template.git", 340 | "reference": "1.1.4" 341 | }, 342 | "dist": { 343 | "type": "zip", 344 | "url": "https://github.com/sebastianbergmann/php-text-template/zipball/1.1.4", 345 | "reference": "1.1.4", 346 | "shasum": "" 347 | }, 348 | "require": { 349 | "php": ">=5.3.3" 350 | }, 351 | "type": "library", 352 | "autoload": { 353 | "classmap": [ 354 | "Text/" 355 | ] 356 | }, 357 | "notification-url": "https://packagist.org/downloads/", 358 | "include-path": [ 359 | "" 360 | ], 361 | "license": [ 362 | "BSD-3-Clause" 363 | ], 364 | "authors": [ 365 | { 366 | "name": "Sebastian Bergmann", 367 | "email": "sb@sebastian-bergmann.de", 368 | "role": "lead" 369 | } 370 | ], 371 | "description": "Simple template engine.", 372 | "homepage": "https://github.com/sebastianbergmann/php-text-template/", 373 | "keywords": [ 374 | "template" 375 | ], 376 | "time": "2012-10-31 11:15:28" 377 
| }, 378 | { 379 | "name": "phpunit/php-timer", 380 | "version": "1.0.5", 381 | "source": { 382 | "type": "git", 383 | "url": "https://github.com/sebastianbergmann/php-timer.git", 384 | "reference": "19689d4354b295ee3d8c54b4f42c3efb69cbc17c" 385 | }, 386 | "dist": { 387 | "type": "zip", 388 | "url": "https://api.github.com/repos/sebastianbergmann/php-timer/zipball/19689d4354b295ee3d8c54b4f42c3efb69cbc17c", 389 | "reference": "19689d4354b295ee3d8c54b4f42c3efb69cbc17c", 390 | "shasum": "" 391 | }, 392 | "require": { 393 | "php": ">=5.3.3" 394 | }, 395 | "type": "library", 396 | "autoload": { 397 | "classmap": [ 398 | "PHP/" 399 | ] 400 | }, 401 | "notification-url": "https://packagist.org/downloads/", 402 | "include-path": [ 403 | "" 404 | ], 405 | "license": [ 406 | "BSD-3-Clause" 407 | ], 408 | "authors": [ 409 | { 410 | "name": "Sebastian Bergmann", 411 | "email": "sb@sebastian-bergmann.de", 412 | "role": "lead" 413 | } 414 | ], 415 | "description": "Utility class for timing", 416 | "homepage": "https://github.com/sebastianbergmann/php-timer/", 417 | "keywords": [ 418 | "timer" 419 | ], 420 | "time": "2013-08-02 07:42:54" 421 | }, 422 | { 423 | "name": "phpunit/php-token-stream", 424 | "version": "1.2.1", 425 | "source": { 426 | "type": "git", 427 | "url": "https://github.com/sebastianbergmann/php-token-stream.git", 428 | "reference": "5220af2a7929aa35cf663d97c89ad3d50cf5fa3e" 429 | }, 430 | "dist": { 431 | "type": "zip", 432 | "url": "https://api.github.com/repos/sebastianbergmann/php-token-stream/zipball/5220af2a7929aa35cf663d97c89ad3d50cf5fa3e", 433 | "reference": "5220af2a7929aa35cf663d97c89ad3d50cf5fa3e", 434 | "shasum": "" 435 | }, 436 | "require": { 437 | "ext-tokenizer": "*", 438 | "php": ">=5.3.3" 439 | }, 440 | "type": "library", 441 | "extra": { 442 | "branch-alias": { 443 | "dev-master": "1.2-dev" 444 | } 445 | }, 446 | "autoload": { 447 | "classmap": [ 448 | "PHP/" 449 | ] 450 | }, 451 | "notification-url": "https://packagist.org/downloads/", 452 | 
"include-path": [ 453 | "" 454 | ], 455 | "license": [ 456 | "BSD-3-Clause" 457 | ], 458 | "authors": [ 459 | { 460 | "name": "Sebastian Bergmann", 461 | "email": "sb@sebastian-bergmann.de", 462 | "role": "lead" 463 | } 464 | ], 465 | "description": "Wrapper around PHP's tokenizer extension.", 466 | "homepage": "https://github.com/sebastianbergmann/php-token-stream/", 467 | "keywords": [ 468 | "tokenizer" 469 | ], 470 | "time": "2013-09-13 04:58:23" 471 | }, 472 | { 473 | "name": "phpunit/phpunit", 474 | "version": "3.7.28", 475 | "source": { 476 | "type": "git", 477 | "url": "https://github.com/sebastianbergmann/phpunit.git", 478 | "reference": "3b97c8492bcafbabe6b6fbd2ab35f2f04d932a8d" 479 | }, 480 | "dist": { 481 | "type": "zip", 482 | "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/3b97c8492bcafbabe6b6fbd2ab35f2f04d932a8d", 483 | "reference": "3b97c8492bcafbabe6b6fbd2ab35f2f04d932a8d", 484 | "shasum": "" 485 | }, 486 | "require": { 487 | "ext-dom": "*", 488 | "ext-pcre": "*", 489 | "ext-reflection": "*", 490 | "ext-spl": "*", 491 | "php": ">=5.3.3", 492 | "phpunit/php-code-coverage": "~1.2.1", 493 | "phpunit/php-file-iterator": ">=1.3.1", 494 | "phpunit/php-text-template": ">=1.1.1", 495 | "phpunit/php-timer": ">=1.0.4", 496 | "phpunit/phpunit-mock-objects": "~1.2.0", 497 | "symfony/yaml": "~2.0" 498 | }, 499 | "require-dev": { 500 | "pear-pear/pear": "1.9.4" 501 | }, 502 | "suggest": { 503 | "ext-json": "*", 504 | "ext-simplexml": "*", 505 | "ext-tokenizer": "*", 506 | "phpunit/php-invoker": ">=1.1.0,<1.2.0" 507 | }, 508 | "bin": [ 509 | "composer/bin/phpunit" 510 | ], 511 | "type": "library", 512 | "extra": { 513 | "branch-alias": { 514 | "dev-master": "3.7.x-dev" 515 | } 516 | }, 517 | "autoload": { 518 | "classmap": [ 519 | "PHPUnit/" 520 | ] 521 | }, 522 | "notification-url": "https://packagist.org/downloads/", 523 | "include-path": [ 524 | "", 525 | "../../symfony/yaml/" 526 | ], 527 | "license": [ 528 | "BSD-3-Clause" 529 | ], 530 
| "authors": [ 531 | { 532 | "name": "Sebastian Bergmann", 533 | "email": "sebastian@phpunit.de", 534 | "role": "lead" 535 | } 536 | ], 537 | "description": "The PHP Unit Testing framework.", 538 | "homepage": "http://www.phpunit.de/", 539 | "keywords": [ 540 | "phpunit", 541 | "testing", 542 | "xunit" 543 | ], 544 | "time": "2013-10-17 07:27:40" 545 | }, 546 | { 547 | "name": "phpunit/phpunit-mock-objects", 548 | "version": "1.2.3", 549 | "source": { 550 | "type": "git", 551 | "url": "git://github.com/sebastianbergmann/phpunit-mock-objects.git", 552 | "reference": "1.2.3" 553 | }, 554 | "dist": { 555 | "type": "zip", 556 | "url": "https://github.com/sebastianbergmann/phpunit-mock-objects/archive/1.2.3.zip", 557 | "reference": "1.2.3", 558 | "shasum": "" 559 | }, 560 | "require": { 561 | "php": ">=5.3.3", 562 | "phpunit/php-text-template": ">=1.1.1@stable" 563 | }, 564 | "suggest": { 565 | "ext-soap": "*" 566 | }, 567 | "type": "library", 568 | "autoload": { 569 | "classmap": [ 570 | "PHPUnit/" 571 | ] 572 | }, 573 | "notification-url": "https://packagist.org/downloads/", 574 | "include-path": [ 575 | "" 576 | ], 577 | "license": [ 578 | "BSD-3-Clause" 579 | ], 580 | "authors": [ 581 | { 582 | "name": "Sebastian Bergmann", 583 | "email": "sb@sebastian-bergmann.de", 584 | "role": "lead" 585 | } 586 | ], 587 | "description": "Mock Object library for PHPUnit", 588 | "homepage": "https://github.com/sebastianbergmann/phpunit-mock-objects/", 589 | "keywords": [ 590 | "mock", 591 | "xunit" 592 | ], 593 | "time": "2013-01-13 10:24:48" 594 | }, 595 | { 596 | "name": "symfony/yaml", 597 | "version": "v2.4.0", 598 | "target-dir": "Symfony/Component/Yaml", 599 | "source": { 600 | "type": "git", 601 | "url": "https://github.com/symfony/Yaml.git", 602 | "reference": "1ae235a1b9d3ad3d9f3860ff20acc072df95b7f5" 603 | }, 604 | "dist": { 605 | "type": "zip", 606 | "url": "https://api.github.com/repos/symfony/Yaml/zipball/1ae235a1b9d3ad3d9f3860ff20acc072df95b7f5", 607 | "reference": 
"1ae235a1b9d3ad3d9f3860ff20acc072df95b7f5", 608 | "shasum": "" 609 | }, 610 | "require": { 611 | "php": ">=5.3.3" 612 | }, 613 | "type": "library", 614 | "extra": { 615 | "branch-alias": { 616 | "dev-master": "2.4-dev" 617 | } 618 | }, 619 | "autoload": { 620 | "psr-0": { 621 | "Symfony\\Component\\Yaml\\": "" 622 | } 623 | }, 624 | "notification-url": "https://packagist.org/downloads/", 625 | "license": [ 626 | "MIT" 627 | ], 628 | "authors": [ 629 | { 630 | "name": "Fabien Potencier", 631 | "email": "fabien@symfony.com" 632 | }, 633 | { 634 | "name": "Symfony Community", 635 | "homepage": "http://symfony.com/contributors" 636 | } 637 | ], 638 | "description": "Symfony Yaml Component", 639 | "homepage": "http://symfony.com", 640 | "time": "2013-11-26 16:40:27" 641 | } 642 | ], 643 | "aliases": [ 644 | 645 | ], 646 | "minimum-stability": "stable", 647 | "stability-flags": [ 648 | 649 | ], 650 | "platform": { 651 | "php": ">=5.3.3" 652 | }, 653 | "platform-dev": [ 654 | 655 | ] 656 | } 657 | -------------------------------------------------------------------------------- /examples/basic.php: -------------------------------------------------------------------------------- 1 | 'spam', 13 | 'document' => 'Some spam document' 14 | ), 15 | array( 16 | 'category' => 'spam', 17 | 'document' => 'Another spam document' 18 | ), 19 | array( 20 | 'category' => 'ham', 21 | 'document' => 'Some ham document' 22 | ), 23 | array( 24 | 'category' => 'ham', 25 | 'document' => 'Another ham document' 26 | ) 27 | ) 28 | ); 29 | 30 | $source->addDocument('spam', 'Another spam document'); 31 | $source->addDocument('ham', 'Another ham document'); 32 | 33 | $c = new ComplementNaiveBayes($source); 34 | 35 | echo $c->classify("Some ham document"), PHP_EOL; 36 | -------------------------------------------------------------------------------- /examples/basic_svm.php: -------------------------------------------------------------------------------- 1 | addDocument('pig', 'Pigs are great. 
Pink and cute!'); 12 | $source->addDocument('wolf', 'Wolves have teeth. They are gray.'); 13 | 14 | $c = new SVM($source); 15 | $c->setThreshold(0.6); 16 | 17 | var_dump($c->classify('0943jf904jf09j34fpj')); 18 | -------------------------------------------------------------------------------- /examples/bayes.php: -------------------------------------------------------------------------------- 1 | __DIR__ . '/../resources/20news-bydate/20news-bydate-train', 31 | 'include' => $cats 32 | ) 33 | ) 34 | ); 35 | 36 | $testSource = new Directory( 37 | array( 38 | 'directory' => __DIR__ . '/../resources/20news-bydate/20news-bydate-test', 39 | 'include' => $cats 40 | ) 41 | ); 42 | 43 | $data = $testSource->getData(); 44 | $stats = array(); 45 | 46 | foreach ($data as $category => $documents) { 47 | $stats[$category] = array(0, count($documents)); 48 | foreach ($documents as $document) { 49 | if ($classifier->is($category, $document)) { 50 | $stats[$category][0]++; 51 | } 52 | } 53 | } 54 | 55 | foreach ($stats as $category => $data) { 56 | echo $category, ': ', ($data[0] / $data[1]), PHP_EOL; 57 | } 58 | -------------------------------------------------------------------------------- /examples/converter.php: -------------------------------------------------------------------------------- 1 | run(); 11 | -------------------------------------------------------------------------------- /examples/language.php: -------------------------------------------------------------------------------- 1 | addDocument(basename($file), file_get_contents($file)); 18 | } 19 | 20 | $nb = new ComplementNaiveBayes($source); 21 | 22 | $examples = array( 23 | "Agricultura (-ae, f.), sensu latissimo, est summa omnium artium et scientiarum et technologiarum quae de terris colendis et animalibus creandis curant, ut poma, frumenta, charas, carnes, textilia, et aliae res e terra bene producantur. 
Specialius, agronomia est ars et scientia quae terris colendis student, agricultio autem animalibus creandis.", 24 | "El llatí és una llengua indoeuropea de la branca itàlica, parlada antigament pels romans. A partir de l'evolució de la seva versió vulgar en sorgiren les llengües romàniques que sobreviuen avui dia.", 25 | "hola", 26 | "Hi there, this is a tiny text", 27 | "* This file implements in memory hash tables with insert/del/replace/find/ 28 | * get-random-element operations. Hash tables will auto resize if needed 29 | * tables of power of two in size are used, collisions are handled by 30 | * chaining. See the source code for more information... :)", 31 | "House of Cards is an American political drama series developed and produced by Beau Willimon. It is an adaptation of a previous BBC miniseries of the same name which is based on the novel by Michael Dobbs. The entire first season premiered on February 1, 2013, on the streaming service Netflix.[1] A second season of 13 episodes is currently in production.[1][2]" 32 | ); 33 | 34 | foreach ($examples as $example) { 35 | echo $nb->classify($example), PHP_EOL; 36 | } 37 | -------------------------------------------------------------------------------- /examples/svm.php: -------------------------------------------------------------------------------- 1 | __DIR__ . '/../resources/20news-bydate/20news-bydate-train', 22 | 'include' => $cats 23 | ) 24 | ); 25 | 26 | $classifier = new SVM($source); 27 | 28 | $testSource = new Directory( 29 | array( 30 | 'directory' => __DIR__ . 
'/../resources/20news-bydate/20news-bydate-test', 31 | 'include' => $cats 32 | ) 33 | ); 34 | 35 | $data = $testSource->getData(); 36 | $stats = array(); 37 | 38 | foreach ($data as $category => $documents) { 39 | $stats[$category] = array(0, count($documents)); 40 | foreach ($documents as $document) { 41 | if ($classifier->is($category, $document)) { 42 | $stats[$category][0]++; 43 | } 44 | } 45 | } 46 | 47 | foreach ($stats as $category => $data) { 48 | echo $category, ': ', ($data[0] / $data[1]), PHP_EOL; 49 | } 50 | -------------------------------------------------------------------------------- /src/Camspiers/StatisticalClassifier/Classifier/Classifier.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Camspiers\StatisticalClassifier\Classifier; 13 | 14 | use Camspiers\StatisticalClassifier\DataSource\DataSourceInterface; 15 | use Camspiers\StatisticalClassifier\Model\ModelInterface; 16 | use RuntimeException; 17 | 18 | /** 19 | * A generic classifier which can be used to build a classifier given a number of injected components 20 | * @author Cam Spiers 21 | * @package Statistical Classifier 22 | */ 23 | abstract class Classifier implements ClassifierInterface 24 | { 25 | /** 26 | * @var \Camspiers\StatisticalClassifier\DataSource\DataSourceInterface 27 | */ 28 | protected $dataSource; 29 | /** 30 | * The model to apply the transforms to 31 | * @var \Camspiers\StatisticalClassifier\Model\ModelInterface 32 | */ 33 | protected $model; 34 | /** 35 | * @inheritdoc 36 | */ 37 | public function is($category, $document) 38 | { 39 | if ($this->dataSource->hasCategory($category)) { 40 | return $this->classify($document) === $category; 41 | } else { 42 | throw new RuntimeException( 43 | sprintf( 44 | "The category '%s' doesn't exist", 45 | $category 46 | ) 47 | ); 48 
| } 49 | } 50 | /** 51 | * Builds the model from the data source by applying transforms to the data source 52 | * @return null 53 | */ 54 | abstract public function prepareModel(); 55 | /** 56 | * Return a model which has been prepared for classification 57 | * @return \Camspiers\StatisticalClassifier\Model\ModelInterface 58 | */ 59 | protected function preparedModel() 60 | { 61 | if (!$this->model->isPrepared()) { 62 | $this->prepareModel(); 63 | } 64 | 65 | return $this->model; 66 | } 67 | /** 68 | * Take a callable and run it passing in any additionally specified arguments 69 | * @param callable $transform 70 | * @throws \RuntimeException 71 | * @return mixed 72 | */ 73 | protected function applyTransform($transform) 74 | { 75 | if (is_callable($transform)) { 76 | return call_user_func_array($transform, array_slice(func_get_args(), 1)); 77 | } else { 78 | throw new RuntimeException("Argument to applyTransform must be callable"); 79 | } 80 | } 81 | /** 82 | * @param \Camspiers\StatisticalClassifier\Model\ModelInterface $model 83 | */ 84 | public function setModel(ModelInterface $model) 85 | { 86 | $this->model = $model; 87 | } 88 | /** 89 | * @param \Camspiers\StatisticalClassifier\DataSource\DataSourceInterface $dataSource 90 | */ 91 | public function setDataSource(DataSourceInterface $dataSource) 92 | { 93 | $this->dataSource = $dataSource; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/Camspiers/StatisticalClassifier/Classifier/ClassifierInterface.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * For the full copyright and license information, please view the LICENSE 9 | * file that was distributed with this source code. 10 | */ 11 | 12 | namespace Camspiers\StatisticalClassifier\Classifier; 13 | 14 | /** 15 | * Provides an interface for classifier. 
/**
 * Provides an interface for classifiers.
 * Implementing classes are classifiers capable of classifying documents into categories
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
interface ClassifierInterface
{
    /**
     * Returns whether or not the document is of the category
     * @param  string  $category The category in question
     * @param  string  $document The document to check
     * @return boolean Whether or not the document is in the category
     */
    public function is($category, $document);
    /**
     * Classify the document and return its category
     * @param  string      $document The document to classify
     * @return string|bool The category of the document (the declared type also allows a boolean result)
     */
    public function classify($document);
}
/**
 * An implementation of a Naive Bayes classifier.
 *
 * This classifier is based off *Tackling the Poor Assumptions of Naive Bayes Text Classifiers* by Jason Rennie
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class ComplementNaiveBayes extends Classifier
{
    /**
     * Tokenizer (the way of breaking up documents)
     * @var TokenizerInterface
     */
    protected $tokenizer;
    /**
     * Takes document and makes it consistent
     * @var Document\NormalizerInterface
     */
    protected $documentNormalizer;
    /**
     * Takes tokenized data and makes it consistent or stems it
     * @var Token\NormalizerInterface
     */
    protected $tokenNormalizer;
    /**
     * Create the Naive Bayes Classifier
     * @param DataSourceInterface          $dataSource         The source of the training documents
     * @param ModelInterface               $model              A model to store data in (defaults to a new Model)
     * @param Document\NormalizerInterface $documentNormalizer The normalizer to make documents consistent
     *                                                         (defaults to Document\Lowercase)
     * @param TokenizerInterface           $tokenizer          The tokenizer to break up the documents
     *                                                         (defaults to Word)
     * @param Token\NormalizerInterface    $tokenNormalizer    The normalizer to make tokens consistent (optional)
     */
    public function __construct(
        DataSourceInterface $dataSource,
        ModelInterface $model = null,
        Document\NormalizerInterface $documentNormalizer = null,
        TokenizerInterface $tokenizer = null,
        Token\NormalizerInterface $tokenNormalizer = null
    ) {
        $this->dataSource = $dataSource;
        $this->model = $model ?: new Model();
        $this->documentNormalizer = $documentNormalizer ?: new Document\Lowercase();
        $this->tokenizer = $tokenizer ?: new Word();
        $this->tokenNormalizer = $tokenNormalizer;
    }
    /**
     * @inheritdoc
     */
    public function prepareModel()
    {
        $data = $this->applyTransform(
            new Transform\TokenPreparation(
                $this->tokenizer,
                $this->documentNormalizer,
                $this->tokenNormalizer
            ),
            $this->dataSource->getData()
        );

        $tokenCountByDocument = $this->applyTransform(
            new Transform\TokenCountByDocument(),
            $data
        );

        $documentCount = $this->applyTransform(
            new Transform\DocumentCount(),
            $data
        );

        // Intermediate results are released as soon as nothing below needs them
        unset($data);

        $tokenAppearanceCount = $this->applyTransform(
            new Transform\TokenAppearanceCount(),
            $tokenCountByDocument
        );

        $tokensByCategory = $this->applyTransform(
            new Transform\TokensByCategory(),
            $tokenCountByDocument
        );

        $tfidf = $this->applyTransform(
            new Transform\TFIDF(),
            $tokenCountByDocument,
            $documentCount,
            $tokenAppearanceCount
        );

        unset($tokenCountByDocument);
        unset($tokenAppearanceCount);

        $documentLength = $this->applyTransform(
            new Transform\DocumentLength(),
            $tfidf
        );

        unset($tfidf);

        $documentTokenCounts = $this->applyTransform(
            new Transform\DocumentTokenCounts(),
            $documentLength
        );

        $complement = $this->applyTransform(
            new Transform\Complement(),
            $documentLength,
            $tokensByCategory,
            $documentCount,
            $documentTokenCounts
        );

        unset(
            $documentLength,
            $tokensByCategory,
            $documentCount,
            $documentTokenCounts
        );

        $this->model->setModel(
            $this->applyTransform(
                new Transform\Weight(),
                $complement
            )
        );

        $this->model->setPrepared(true);
    }
    /**
     * Classify the document by summing, per category, the model weight of each
     * known token multiplied by how often the token occurs in the document.
     * The category with the lowest total is returned.
     * @param  string      $document The document to classify
     * @return string|bool The category, or false when the two lowest totals are identical
     */
    public function classify($document)
    {
        $results = array();

        if ($this->documentNormalizer) {
            $document = $this->documentNormalizer->normalize($document);
        }

        $tokens = $this->tokenizer->tokenize($document);

        if ($this->tokenNormalizer) {
            $tokens = $this->tokenNormalizer->normalize($tokens);
        }

        // token => number of occurrences in the document
        $tokens = array_count_values($tokens);

        $weights = $this->preparedModel()->getModel();

        foreach (array_keys($weights) as $category) {
            $results[$category] = 0;
            foreach ($tokens as $token => $count) {
                if (array_key_exists($token, $weights[$category])) {
                    $results[$category] += $count * $weights[$category][$token];
                }
            }
        }

        // Ascending sort: the first element is the winning (lowest) total
        asort($results, SORT_NUMERIC);

        $category = key($results);

        $value = array_shift($results);

        // A tie between the best two candidates means no decision can be made
        if ($value === array_shift($results)) {
            return false;
        } else {
            return $category;
        }
    }
}

/**
 * Provides a text based SVM classifier which uses libsvm
 *
 * @author  Cam Spiers
 * @package Statistical Classifier
 */
class SVM extends Classifier
{
    /**
     * Tokenizer (the way of breaking up documents)
     * @var TokenizerInterface
     */
    protected $tokenizer;
    /**
     * Takes document and makes it consistent
     * @var Document\NormalizerInterface
     */
    protected $documentNormalizer;
    /**
     * Takes tokenized data and makes it consistent or stems it
     * @var Token\NormalizerInterface
     */
    protected $tokenNormalizer;
    /**
     * The libsvm wrapper used for training
     * @var \SVM
     */
    protected $svm;
    /**
     * Minimum probability a prediction must exceed, or null when probabilities are not used
     * @var float|null
     */
    protected $threshold;
    /**
     * @param DataSourceInterface          $dataSource         The source of the training documents
     * @param SVMModel                     $model              The model to train (defaults to a new SVMModel)
     * @param Document\NormalizerInterface $documentNormalizer Defaults to Document\Lowercase
     * @param TokenizerInterface           $tokenizer          Defaults to Word
     * @param Token\NormalizerInterface    $tokenNormalizer    Optional token normalizer
     * @param \SVM                         $svm                Defaults to a new \SVM with a linear kernel
     * @param float|null                   $threshold          Optional probability threshold, see setThreshold()
     */
    public function __construct(
        DataSourceInterface $dataSource,
        SVMModel $model = null,
        Document\NormalizerInterface $documentNormalizer = null,
        TokenizerInterface $tokenizer = null,
        Token\NormalizerInterface $tokenNormalizer = null,
        \SVM $svm = null,
        $threshold = null
    ) {
        $this->dataSource = $dataSource;
        $this->model = $model ?: new SVMModel();
        $this->documentNormalizer = $documentNormalizer ?: new Document\Lowercase();
        $this->tokenizer = $tokenizer ?: new Word();
        $this->tokenNormalizer = $tokenNormalizer;
        if (!$svm) {
            $svm = new \SVM();
            $svm->setOptions(
                array(
                    \SVM::OPT_KERNEL_TYPE => \SVM::KERNEL_LINEAR
                )
            );
        }
        $this->svm = $svm;
        // Compare against null/false explicitly so that a threshold of 0 is honoured;
        // false is still accepted as "no threshold" for backwards compatibility.
        if ($threshold !== null && $threshold !== false) {
            $this->setThreshold($threshold);
        }
    }
    /**
     * @inheritdoc
     */
    public function prepareModel()
    {
        $data = $this->applyTransform(
            new Transform\TokenPreparation(
                $this->tokenizer,
                $this->documentNormalizer,
                $this->tokenNormalizer
            ),
            $this->dataSource->getData()
        );

        $tokenCountByDocument = $this->applyTransform(
            new Transform\TokenCountByDocument(),
            $data
        );

        $documentLength = $this->applyTransform(
            new Transform\DocumentLength(),
            $this->applyTransform(
                new Transform\TFIDF(),
                $tokenCountByDocument,
                $this->applyTransform(
                    new Transform\DocumentCount(),
                    $data
                ),
                $this->applyTransform(
                    new Transform\TokenAppearanceCount(),
                    $tokenCountByDocument
                )
            )
        );

        $categoryMap = array();
        $categoryCount = 0;
        $tokenMap = array();
        // Token indexes start at 1 because index 0 of every training entry holds the category
        $tokenCount = 1;

        // Produce the token and category maps for the whole document set
        foreach ($documentLength as $category => $documents) {
            if (!array_key_exists($category, $categoryMap)) {
                $categoryMap[$category] = $categoryCount;
                $categoryCount++;
            }
            foreach ($documents as $document) {
                foreach (array_keys($document) as $token) {
                    if (!array_key_exists($token, $tokenMap)) {
                        $tokenMap[$token] = $tokenCount;
                        $tokenCount++;
                    }
                }
            }
        }

        // When using probabilities and our dataset is small we need to increase its
        // size by duplicating the data
        // see: http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf section "8 Probability Estimates"
        if ($this->hasThreshold()) {
            foreach ($documentLength as $category => $documents) {
                // Nothing to duplicate; without this guard the loop below would never end
                if (count($documents) === 0) {
                    continue;
                }
                while (count($documents) <= 5) {
                    // foreach iterates over a snapshot, so each pass doubles the set
                    foreach ($documents as $document) {
                        $documents[] = $document;
                    }
                }
                $documentLength[$category] = $documents;
            }
        }

        $transform = array();

        // Prep the svm data set for use
        foreach ($documentLength as $category => $documents) {
            foreach ($documents as $document) {
                $entry = array(
                    $categoryMap[$category]
                );
                foreach ($document as $token => $value) {
                    $entry[$tokenMap[$token]] = $value;
                }
                ksort($entry, SORT_NUMERIC);
                $transform[] = $entry;
            }
        }

        // Weight the data set by the number of docs that appear in each class.
        $weights = array();

        foreach ($documentLength as $category => $documents) {
            $weights[$categoryMap[$category]] = count($documents);
        }

        $lowest = min($weights);

        // The smallest class gets weight 1, larger classes proportionally less
        foreach ($weights as $index => $weight) {
            $weights[$index] = $lowest / $weight;
        }

        $this->model->setMaps(array_flip($categoryMap), $tokenMap);

        $this->model->setModel(
            $this->svm->train(
                $transform,
                $weights
            )
        );

        $this->model->setPrepared(true);
    }
    /**
     * Classify the document using the trained model.
     * @param  string      $document The document to classify
     * @return string|bool The category, or false when a threshold is set and the
     *                     predicted category's probability does not exceed it
     */
    public function classify($document)
    {
        /** @var SVMModel $model */
        $model = $this->preparedModel();

        $categoryMap = $model->getCategoryMap();

        $data = $this->prepareDocument($document, $model);

        if ($this->hasThreshold()) {
            $probabilities = array();
            $category = $model->getModel()->predict_probability($data, $probabilities);

            return $probabilities[$category] > $this->threshold ? $categoryMap[$category] : false;
        } else {
            $category = $model->getModel()->predict($data);

            return $categoryMap[$category];
        }
    }
    /**
     * Formats the document for use in \SVMModel: maps each known token to its
     * index in the model's token map with the token's occurrence count as value.
     * Tokens the model has never seen are dropped.
     * @param  string                                          $document
     * @param  \Camspiers\StatisticalClassifier\Model\SVMModel $model
     * @return array                                           token index => occurrence count, sorted by index
     */
    protected function prepareDocument($document, SVMModel $model)
    {
        $tokenMap = $model->getTokenMap();

        $data = array();

        if ($this->documentNormalizer) {
            $document = $this->documentNormalizer->normalize($document);
        }

        $tokens = $this->tokenizer->tokenize($document);

        if ($this->tokenNormalizer) {
            $tokens = $this->tokenNormalizer->normalize($tokens);
        }

        $tokenCounts = array_count_values($tokens);

        foreach ($tokenCounts as $token => $value) {
            if (isset($tokenMap[$token])) {
                $data[$tokenMap[$token]] = $value;
            }
        }

        ksort($data, SORT_NUMERIC);

        return $data;
    }
    /**
     * Set the threshold probability a classified document must meet.
     * Enables probability estimates on the SVM and marks the model as
     * unprepared so that it is rebuilt on next use.
     * @param  float                     $threshold float value between 0-1 (only checked for being numeric)
     * @throws \InvalidArgumentException When the threshold is not numeric
     */
    public function setThreshold($threshold)
    {
        if (is_numeric($threshold)) {
            $this->threshold = $threshold;
            $this->svm->setOptions(
                array(
                    \SVM::OPT_PROBABILITY => true
                )
            );
            if ($this->model->isPrepared()) {
                $this->model->setPrepared(false);
            }
        } else {
            throw new \InvalidArgumentException("Threshold must be a float value between 0-1");
        }
    }
    /**
     * Returns the probabilities of the document being in each category
     * @param  string     $document
     * @return array|null Probabilities keyed by category name, or null when no threshold has been set
     */
    public function getProbabilities($document)
    {
        if (!$this->hasThreshold()) {
            return null;
        }

        $model = $this->preparedModel();
        $data = $this->prepareDocument($document, $model);
        $probabilities = array();
        $model->getModel()->predict_probability($data, $probabilities);

        // Pair categories and probabilities through the numeric label both are
        // keyed by (classify() reads $probabilities the same way) rather than by position
        $result = array();
        foreach ($model->getCategoryMap() as $label => $category) {
            if (array_key_exists($label, $probabilities)) {
                $result[$category] = $probabilities[$label];
            }
        }

        return $result;
    }
    /**
     * Whether a probability threshold has been set
     * @return bool
     */
    protected function hasThreshold()
    {
        return $this->threshold !== null;
    }
}

/**
 * Describes the structure data passed to a data source has to conform to
 * @author  Cam Spiers
 * @package Statistical Classifier
 */
class DataSourceConfiguration implements ConfigurationInterface
{
    /**
     * Returns a specification for data sources: a list of entries which each
     * require a scalar 'category' and a scalar 'document'
     * @return TreeBuilder
     */
    public function getConfigTreeBuilder()
    {
        $treeBuilder = new TreeBuilder();
        $rootNode = $treeBuilder->root('datasource');

        $rootNode
            ->prototype('array')
                ->children()
                    ->scalarNode('category')->isRequired()->end()
                    ->scalarNode('document')->isRequired()->end()
                ->end();

        return $treeBuilder;
    }
}
/**
 * Reads categorised documents from a csv file whose first line names the columns
 * @author  Cam Spiers
 * @package Statistical Classifier
 */
class CSV extends DataArray
{
    /**
     * Stores the configuration options
     * @var array
     */
    protected $options;
    /**
     * Creates the object from an array of options
     * @param array $options
     */
    public function __construct(array $options = array())
    {
        $resolver = new OptionsResolver();
        $this->setDefaultOptions($resolver);
        $this->options = $resolver->resolve($options);
    }
    /**
     * Sets the configuration for the options resolver
     * @param OptionsResolverInterface $resolver
     */
    protected function setDefaultOptions(OptionsResolverInterface $resolver)
    {
        $resolver->setRequired(
            array(
                'file',
                'document_columns',
                'category_column'
            )
        );

        $resolver->setOptional(
            array(
                'limit',
                'delimiter',
                'enclosure',
                'escape',
                'category_modifier'
            )
        );

        $resolver->setDefaults(
            array(
                'limit' => false,
                'length' => 0,
                'delimiter' => ',',
                'enclosure' => '"',
                'escape' => '\\',
                'category_modifier' => false
            )
        );

        $resolver->setAllowedTypes(
            array(
                'file' => 'string',
                'document_columns' => 'array',
                'category_column' => 'string',
                'length' => 'int',
                'delimiter' => 'string',
                'enclosure' => 'string',
                'escape' => 'string'
            )
        );
    }
    /**
     * Reads the csv file into a list of category/document entries.
     * Returns an empty list when the file does not exist.
     * @return array             List of array('document' => ..., 'category' => ...)
     * @throws \RuntimeException When the file cannot be opened or a needed column is missing
     */
    public function read()
    {
        if (!file_exists($this->options['file'])) {
            return array();
        }

        $handle = fopen($this->options['file'], 'r');

        // try/catch + rethrow instead of finally to stay compatible with the
        // PHP versions this library targets
        try {
            $columns = $this->readColumns($handle);
            $this->checkColumns($columns);
            $entries = $this->readEntries($handle, $columns);
        } catch (\Exception $e) {
            // Don't leak the handle when the header is unusable
            if (is_resource($handle)) {
                fclose($handle);
            }
            throw $e;
        }

        fclose($handle);

        return $entries;
    }
    /**
     * Reads the data rows following the header line
     * @param  resource $handle  The open csv file, positioned after the header
     * @param  array    $columns Column name => column index
     * @return array    List of array('document' => ..., 'category' => ...)
     */
    private function readEntries($handle, array $columns)
    {
        $entries = array();
        $columnTotal = count($columns);
        $entryCount = 0;
        $limit = $this->options['limit'];
        $modifier = $this->options['category_modifier'];
        $hasModifier = is_callable($modifier);

        while (true) {
            // Stop once exactly $limit entries have been collected
            if ($limit && $entryCount >= $limit) {
                break;
            }

            if (($csvData = $this->readLine($handle)) === false) {
                break;
            }

            // Skip rows that don't line up with the header (e.g. blank lines)
            if ($columnTotal !== count($csvData)) {
                continue;
            }

            $document = array();

            foreach ($this->options['document_columns'] as $column) {
                $document[] = $csvData[$columns[$column]];
            }

            $category = $csvData[$columns[$this->options['category_column']]];

            if ($hasModifier) {
                $category = call_user_func($modifier, $category);
            }

            $entries[] = array(
                'document' => implode(' ', $document),
                'category' => $category
            );

            $entryCount++;
        }

        return $entries;
    }
    /**
     * Reads the header line and returns a map of column name to column index
     * @param  resource|bool     $handle The result of fopen
     * @return array
     * @throws \RuntimeException When the file could not be opened or has no header line
     */
    protected function readColumns($handle)
    {
        if ($handle === false) {
            throw new \RuntimeException("Could not read file '{$this->options['file']}'");
        }

        $columns = $this->readLine($handle);

        if ($columns === false) {
            throw new \RuntimeException("Failed to determine csv columns");
        }

        /**
         * Result:
         * array(
         *     'ColumnName1' => 0,
         *     'ColumnName2' => 1
         * )
         */
        $columns = array_flip($columns);

        return $columns;
    }
    /**
     * Makes sure every document column and the category column exist in the csv
     * @param  array             $columns Column name => column index
     * @return null
     * @throws \RuntimeException When a needed column is missing
     */
    protected function checkColumns($columns)
    {
        $neededColumns = $this->options['document_columns'];
        $neededColumns[] = $this->options['category_column'];
        foreach ($neededColumns as $column) {
            if (!array_key_exists($column, $columns)) {
                throw new \RuntimeException("Column '$column' doesn't exist in the csv");
            }
        }
    }
    /**
     * Reads one csv line using the configured length, delimiter, enclosure and escape
     * @param  resource   $handle
     * @return array|bool The fields of the line, or false when no line could be read
     */
    protected function readLine($handle)
    {
        return fgetcsv(
            $handle,
            $this->options['length'],
            $this->options['delimiter'],
            $this->options['enclosure'],
            $this->options['escape']
        );
    }
    /**
     * Writing csv files is not implemented; calling this does nothing
     * @return null
     */
    public function write()
    {
    }
}

/**
 * A data source which obtains its entries by invoking a closure
 * @author  Cam Spiers
 * @package Statistical Classifier
 */
class Closure extends DataArray
{
    /**
     * The closure producing the entries
     * @var callable
     */
    protected $closure;
    /**
     * @param callable $closure Invoked without arguments by read()
     */
    public function __construct(\Closure $closure)
    {
        $this->closure = $closure;
    }
    /**
     * Invokes the closure and returns its result as the entries to load
     * @return mixed Whatever the closure returns
     */
    public function read()
    {
        return $this->closure->__invoke();
    }
}
/**
 * Copies the documents of one data source into another and writes the result
 * @author  Cam Spiers
 * @package Statistical Classifier
 */
class Converter
{
    /**
     * The source to convert from
     * @var DataSourceInterface
     */
    private $convertFrom;
    /**
     * The source to convert to
     * @var DataSourceInterface
     */
    private $convertTo;
    /**
     * Creates the converter using two data sources
     * @param DataSourceInterface $convertFrom
     * @param DataSourceInterface $convertTo
     */
    public function __construct(DataSourceInterface $convertFrom, DataSourceInterface $convertTo)
    {
        $this->convertFrom = $convertFrom;
        $this->convertTo = $convertTo;
    }
    /**
     * Run the conversion
     * @return null
     */
    public function run()
    {
        // getData() yields category => documents, whereas setData() validates a
        // list of array('category' => ..., 'document' => ...) entries, so the
        // data has to be flattened before it is handed over.
        $entries = array();

        foreach ($this->convertFrom->getData() as $category => $documents) {
            foreach ($documents as $document) {
                $entries[] = array(
                    'category' => $category,
                    'document' => $document
                );
            }
        }

        $this->convertTo->setData($entries);
        $this->convertTo->write();
    }
}
/**
 * An in-memory data source and the base class of the other data sources
 * @author  Cam Spiers
 * @package Statistical Classifier
 */
class DataArray implements DataSourceInterface, Serializable
{
    /**
     * An array to hold the source's data, grouped by category:
     * array(
     *     'somecategory' => array(
     *         'Some document',
     *         'Another document'
     *     )
     * )
     *
     * Note that setData() takes a different shape, see there.
     * @var array
     */
    protected $data = array();
    /**
     * Holds the config class that data passed to setData needs to conform to
     * @var DataSourceConfiguration
     */
    protected $config;
    /**
     * Processes the data with the config
     * @var Processor
     */
    protected $processor;

    /**
     * Creates the data array
     * @param array $data The initial data, in the shape setData() expects
     */
    public function __construct(array $data = null)
    {
        if (is_array($data)) {
            $this->setData($data);
        }
    }
    /**
     * @{inheritdoc}
     */
    public function getCategories()
    {
        // Load lazily read sources first, as hasCategory() and getData() do
        $this->prepare();

        return array_keys($this->data);
    }
    /**
     * @{inheritdoc}
     */
    public function hasCategory($category)
    {
        $this->prepare();

        return isset($this->data[$category]);
    }
    /**
     * @{inheritdoc}
     */
    public function addDocument($category, $document)
    {
        if (!isset($this->data[$category])) {
            $this->data[$category] = array();
        }
        $this->data[$category][] = $document;
    }
    /**
     * Returns the entries to load when the source is first used.
     * Subclasses override this to read from their backing store.
     * @return array
     */
    protected function read()
    {
        return $this->data;
    }
    /**
     * @{inheritdoc}
     */
    public function getData()
    {
        $this->prepare();

        return $this->data;
    }
    /**
     * Validates the entries against the config and adds each one as a document.
     * Existing documents are kept.
     * @param array $data List of array('category' => ..., 'document' => ...)
     */
    public function setData(array $data)
    {
        $data = $this->getProcessor()->processConfiguration(
            $this->getConfig(),
            array(
                $data
            )
        );
        foreach ($data as $document) {
            $this->addDocument($document['category'], $document['document']);
        }
    }

    /**
     * @{inheritdoc}
     * @throws RuntimeException Always, a plain data array has nowhere to write to
     */
    public function write()
    {
        throw new RuntimeException('This data source cannot be written');
    }
    /**
     * Serialize the class
     * @return string The serialized data
     */
    public function serialize()
    {
        return serialize($this->getData());
    }
    /**
     * Restore the serialized class
     * @param  string $data The serialized data
     * @return null
     */
    public function unserialize($data)
    {
        $this->data = unserialize($data);
    }
    /**
     * Array based counterpart of serialize(), used by newer PHP versions.
     * Implementing Serializable without it is deprecated as of PHP 8.1.
     * @return array The data grouped by category
     */
    public function __serialize()
    {
        return $this->getData();
    }
    /**
     * Array based counterpart of unserialize()
     * @param  array $data The data grouped by category
     * @return null
     */
    public function __unserialize(array $data)
    {
        $this->data = $data;
    }
    /**
     * @param DataSourceConfiguration $config
     */
    public function setConfig(DataSourceConfiguration $config)
    {
        $this->config = $config;
    }
    /**
     * Return the config for the data, creating the default one on first use
     * @return DataSourceConfiguration
     */
    public function getConfig()
    {
        if (null === $this->config) {
            $this->setConfig(new DataSourceConfiguration());
        }

        return $this->config;
    }
    /**
     * Sets the processor for the config
     * @param Processor $processor
     */
    protected function setProcessor(Processor $processor)
    {
        $this->processor = $processor;
    }
    /**
     * Gets the processor, creating the default one on first use
     * @return Processor
     */
    protected function getProcessor()
    {
        if (null === $this->processor) {
            $this->setProcessor(new Processor());
        }

        return $this->processor;
    }
    /**
     * Read the data and set it, but only while no data has been loaded yet
     */
    protected function prepare()
    {
        if (!is_array($this->data) || count($this->data) == 0) {
            $this->setData($this->read());
        }
    }
}
/**
 * The contract every source of categorised documents has to fulfil
 * @author Cam Spiers
 * @package Statistical Classifier
 */
interface DataSourceInterface
{
    /**
     * Write the data source if possible
     * @return null
     */
    public function write();
    /**
     * Get the data
     * @return array The data
     */
    public function getData();
    /**
     * Set data to the data source
     * @param  array $data
     * @return null
     */
    public function setData(array $data);
    /**
     * Returns the categories of the data
     * @return array The categories
     */
    public function getCategories();
    /**
     * Returns whether or not the data has a category
     * @param  string  $category The category to check
     * @return boolean Whether or not the category exists in the data
     */
    public function hasCategory($category);
    /**
     * Adds a document by category to the data
     * @param string $category The category of the document
     * @param string $document The document
     */
    public function addDocument($category, $document);
}
/**
 * Reads documents from, and writes documents to, files in a directory
 * @author  Cam Spiers
 * @package Statistical Classifier
 */
class Directory extends DataArray
{
    /**
     * Used for docs in category folder
     */
    const MODE_DIRECTORY_AS_CATEGORY = 0;
    /**
     * Used for docs named by category
     */
    const MODE_DOCUMENT_AS_CATEGORY = 1;
    /**
     * Stores the configuration options
     * @var array
     */
    protected $options;
    /**
     * Creates the object from an array of options
     * @param array $options
     */
    public function __construct(array $options = array())
    {
        $resolver = new OptionsResolver();
        $this->setDefaultOptions($resolver);
        $this->options = $resolver->resolve($options);
    }
    /**
     * Sets the configuration for the options resolver
     * @param OptionsResolverInterface $resolver
     */
    protected function setDefaultOptions(OptionsResolverInterface $resolver)
    {
        $resolver->setRequired(
            array(
                'directory'
            )
        );

        $resolver->setDefaults(
            array(
                'mode' => self::MODE_DIRECTORY_AS_CATEGORY,
                'include' => array(),
                'limit' => null
            )
        );

        $resolver->setAllowedValues(
            array(
                'mode' => array(
                    self::MODE_DOCUMENT_AS_CATEGORY,
                    self::MODE_DIRECTORY_AS_CATEGORY
                )
            )
        );

        $resolver->setAllowedTypes(
            array(
                'directory' => 'string',
                'mode' => 'int',
                'include' => 'array'
            )
        );
    }
    /**
     * Reads the files of the directory into a list of category/document entries.
     * In directory mode the name of the containing folder is the category, in
     * document mode the file name is. 'limit' applies per include when includes
     * are given, otherwise to the whole directory.
     * @return array List of array('category' => ..., 'document' => ...)
     */
    public function read()
    {
        $data = array();
        $directory = $this->options['directory'];

        if (!file_exists($directory)) {
            return $data;
        }

        $directoryAsCategory = $this->options['mode'] == self::MODE_DIRECTORY_AS_CATEGORY;
        $pattern = $directoryAsCategory ? '/*' : '';
        $includes = $this->options['include'];

        if (is_array($includes) && count($includes) !== 0) {
            $files = array();
            foreach ($includes as $include) {
                $files = array_merge(
                    $files,
                    $this->findFiles("{$directory}/{$include}{$pattern}")
                );
            }
        } else {
            $files = $this->findFiles("{$directory}{$pattern}/*");
        }

        foreach ($files as $filename) {
            if (is_file($filename)) {
                $categoryPath = $directoryAsCategory ? dirname($filename) : $filename;
                $data[] = array(
                    'category' => basename($categoryPath),
                    'document' => file_get_contents($filename)
                );
            }
        }

        return $data;
    }
    /**
     * Returns the paths matching the glob pattern, cut down to the configured limit
     * @param  string $pattern
     * @return array
     */
    private function findFiles($pattern)
    {
        $matches = glob($pattern, GLOB_NOSORT);

        // glob() reports errors (and on some systems an empty match) as false
        if ($matches === false) {
            return array();
        }

        return array_slice($matches, 0, $this->options['limit']);
    }
    /**
     * Writes every document to <directory>/<category>/<md5 of the document>,
     * creating missing folders on the way
     * @return null
     * @throws \RuntimeException When a category folder cannot be created
     */
    public function write()
    {
        foreach ($this->data as $category => $documents) {
            $categoryDirectory = $this->options['directory'] . '/' . $category;
            if (!file_exists($categoryDirectory)) {
                // Recursive so that a missing base directory is created as well
                if (!mkdir($categoryDirectory, 0777, true) && !is_dir($categoryDirectory)) {
                    throw new \RuntimeException("Could not create directory '{$categoryDirectory}'");
                }
            }
            foreach ($documents as $document) {
                // The hash only serves as a stable, unique file name
                file_put_contents(
                    $categoryDirectory . '/' . md5($document),
                    $document
                );
            }
        }
    }
}
/**
 * Data source that combines the data of several other data sources.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Grouped extends DataArray
{
    /**
     * The data sources to use
     * @var array
     */
    protected $dataSources = array();
    /**
     * Builds the group either from a single array of data sources or from
     * the data sources passed as individual arguments.
     *
     * @param  mixed                     $dataSources An array of data sources, or the first of several
     * @throws \InvalidArgumentException When fewer than 2 data sources are supplied
     */
    public function __construct($dataSources = array())
    {
        // A non-array first argument means the sources were passed variadically.
        $sources = is_array($dataSources) ? $dataSources : func_get_args();

        if (count($sources) < 2) {
            throw new InvalidArgumentException("A group of data sources must contain at least 2 data sources");
        }

        foreach ($sources as $source) {
            $this->addDataSource($source);
        }
    }
    /**
     * Appends a data source to the group
     * @param DataSourceInterface $dataSource The data source
     */
    public function addDataSource(DataSourceInterface $dataSource)
    {
        $this->dataSources[] = $dataSource;
    }
    /**
     * Returns the data sources that make up the group
     * @return array
     */
    public function getDataSources()
    {
        return $this->dataSources;
    }
    /**
     * Collects getData() from every data source and merges the results
     * with array_merge.
     *
     * @return array The merged data
     */
    public function read()
    {
        $collected = array_values(
            array_map(
                function ($source) {
                    return $source->getData();
                },
                $this->dataSources
            )
        );

        return call_user_func_array('array_merge', $collected);
    }
}
/**
 * Data source stored as a JSON file of category/document rows.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Json extends DataArray
{
    /**
     * The filename of the json file
     * @var string
     */
    private $filename;
    /**
     * Creates the object from the filename
     * @param string $filename The filename of the json file
     */
    public function __construct($filename)
    {
        $this->filename = $filename;
    }
    /**
     * Decodes the JSON file into an associative array.
     *
     * @return array The decoded data, or an empty array when the file is
     *               missing or does not decode to an array
     */
    public function read()
    {
        if (!file_exists($this->filename)) {
            return array();
        }

        $decoded = json_decode(file_get_contents($this->filename), true);

        return is_array($decoded) ? $decoded : array();
    }
    /**
     * Flattens $this->data (iterated as category => documents) into a list
     * of category/document rows and writes it to the file as JSON.
     *
     * @return null
     */
    public function write()
    {
        $rows = array();

        foreach ($this->data as $category => $documents) {
            foreach ($documents as $document) {
                array_push(
                    $rows,
                    array(
                        'category' => $category,
                        'document' => $document
                    )
                );
            }
        }

        $encoded = json_encode($rows);
        file_put_contents($this->filename, $encoded);
    }
}
/**
 * Data source that runs a query through PDO and turns one column of every
 * returned row into a document of a fixed category.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class PDOQuery extends DataArray
{
    /**
     * The pdo connection object
     * @var PDO
     */
    private $pdo;
    /**
     * The category of the query
     * @var string
     */
    private $category;
    /**
     * The query to run
     * @var string
     */
    private $query;
    /**
     * The column to use for the document
     * @var string
     */
    private $documentColumn;
    /**
     * Creates the data source with the query details
     * @param string $category       Category of the query
     * @param PDO    $pdo            The PDO connection object
     * @param string $query          The query to run
     * @param string $documentColumn The column of the document
     */
    public function __construct(
        $category,
        PDO $pdo,
        $query,
        $documentColumn
    ) {
        $this->category = $category;
        $this->pdo = $pdo;
        $this->query = $query;
        $this->documentColumn = $documentColumn;
    }
    /**
     * Runs the query and returns one category/document row per result row.
     *
     * @return array             A list of array('category' => ..., 'document' => ...) rows
     * @throws \RuntimeException When PDO reports the query as failed by returning false
     */
    public function read()
    {
        $statement = $this->pdo->query($this->query);

        // In the silent and warning error modes PDO::query() returns false
        // instead of throwing; fail with a clear message rather than with a
        // method call on a boolean.
        if ($statement === false) {
            throw new \RuntimeException(
                'PDO query failed: ' . implode(' ', $this->pdo->errorInfo())
            );
        }

        $statement->setFetchMode(PDO::FETCH_ASSOC);

        $documents = array();
        while ($row = $statement->fetch()) {
            $documents[] = array(
                'category' => $this->category,
                'document' => $row[$this->documentColumn]
            );
        }

        return $documents;
    }
}
/**
 * Data source stored as a PHP-serialized file.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Serialized extends DataArray
{
    /**
     * The filename to read and write to
     * @var string
     */
    private $filename;
    /**
     * Creates the data source using the filename
     * @param string $filename The filename to use
     */
    public function __construct($filename)
    {
        $this->filename = $filename;
    }
    /**
     * Unserializes the file contents.
     *
     * NOTE(review): unserialize() is applied to whatever the file contains;
     * confirm the file is never supplied by an untrusted party.
     *
     * @return array The unserialized data, or an empty array when the file
     *               is missing or does not hold an array
     */
    public function read()
    {
        if (!file_exists($this->filename)) {
            return array();
        }

        $contents = file_get_contents($this->filename);
        $restored = unserialize($contents);

        return is_array($restored) ? $restored : array();
    }
    /**
     * Serializes $this->data and writes it to the file
     * @return null
     */
    public function write()
    {
        $payload = serialize($this->data);
        file_put_contents($this->filename, $payload);
    }
}
/**
 * Model that is restored from, and stored to, a cache entry keyed by the
 * model name.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class CachedModel extends Model
{
    /**
     * The name of the model
     * @var string
     */
    private $modelName;
    /**
     * An instance of Cache
     * @var Cache
     */
    private $cache;
    /**
     * Creates the model and, when the cache returns a non-null entry for the
     * model name, adopts it and marks the model as prepared.
     *
     * @param string $modelName The name of the model
     * @param Cache  $cache     The cache to use
     */
    public function __construct(
        $modelName,
        Cache $cache
    ) {
        $this->modelName = $modelName;
        $this->cache = $cache;

        $cached = $this->cache->get($this->modelName);

        if ($cached === null) {
            // Nothing cached: keep the defaults inherited from Model.
            return;
        }

        $this->prepared = true;
        $this->model = $cached;
    }
    /**
     * Stores the model on the object and in the cache under the model name
     * @param  array      $model
     * @return mixed|void
     */
    public function setModel($model)
    {
        $this->model = $model;

        $cacheKey = $this->modelName;
        $this->cache->set($cacheKey, $model);
    }
}
/**
 * Plain in-memory model holder.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Model implements ModelInterface
{
    /**
     * The built model
     * @var array
     */
    protected $model = array();
    /**
     * Status flag indicating that the necessary processing has occurred on
     * the model
     * @var boolean
     */
    protected $prepared = false;
    /**
     * Returns the stored model
     * @return array
     */
    public function getModel()
    {
        return $this->model;
    }
    /**
     * Replaces the stored model
     * @param $model
     * @return mixed|void
     */
    public function setModel($model)
    {
        $this->model = $model;
    }
    /**
     * Returns the prepared flag
     * @return boolean
     */
    public function isPrepared()
    {
        return $this->prepared;
    }
    /**
     * Sets the prepared flag
     * @param $prepared
     * @return mixed|void
     */
    public function setPrepared($prepared)
    {
        $this->prepared = $prepared;
    }
}
/**
 * Contract for objects that hold a built model together with a flag saying
 * whether the model has been prepared.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
interface ModelInterface
{
    /**
     * Returns whether or not the model is prepared
     * @return boolean The prepared status
     */
    public function isPrepared();
    /**
     * Sets the prepared status of the model
     * @param $prepared The new prepared status
     * @return mixed
     */
    public function setPrepared($prepared);
    /**
     * Get the data
     * @return array
     */
    public function getModel();
    /**
     * Sets the model
     * @param $model The model to hold
     * @return mixed
     */
    public function setModel($model);
}
/**
 * Document normalizer that lower-cases the whole document.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Lowercase implements NormalizerInterface
{
    /**
     * Lower-cases the document with mb_strtolower using the 'utf-8' encoding.
     *
     * @param  string $document The document to normalize
     * @return string The lower-cased document
     */
    public function normalize($document)
    {
        $lowered = mb_strtolower($document, 'utf-8');

        return $lowered;
    }
}
/**
 * Token normalizer that applies a sequence of other token normalizers.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Grouped implements NormalizerInterface
{
    /**
     * An array of normalizers to run
     * @var array
     */
    protected $normalizers = array();
    /**
     * Builds the group either from a single array of normalizers or from
     * the normalizers passed as individual arguments.
     *
     * @param  mixed                     $normalizers An array of normalizers, or the first of several
     * @throws \InvalidArgumentException When no normalizer is supplied
     */
    public function __construct($normalizers = array())
    {
        // A non-array first argument means the normalizers were passed variadically.
        $group = is_array($normalizers) ? $normalizers : func_get_args();

        if (count($group) === 0) {
            throw new InvalidArgumentException('A group of normalizers must contain at least one normalizer');
        }

        foreach ($group as $member) {
            $this->addNormalizer($member);
        }
    }
    /**
     * Appends a normalizer to the group
     * @param NormalizerInterface $normalizer The normalizer
     */
    public function addNormalizer(NormalizerInterface $normalizer)
    {
        $this->normalizers[] = $normalizer;
    }
    /**
     * Feeds the tokens through every normalizer in the order they were
     * added, each one receiving the output of the previous one.
     *
     * @param  array $tokens The tokens to normalize
     * @return array The tokens after the last normalizer has run
     */
    public function normalize(array $tokens)
    {
        return array_reduce(
            $this->normalizers,
            function ($carried, $normalizer) {
                return $normalizer->normalize($carried);
            },
            $tokens
        );
    }
}
/**
 * Contract for token normalizers.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
interface NormalizerInterface
{
    /**
     * Makes tokens more consistent by a particular method.
     *
     * This is to increase matches across tokens that are deemed equivalent but different
     * @param  array $tokens The tokens to normalize
     * @return array The array of normalized tokens
     */
    public function normalize(array $tokens);
}

/**
 * Token normalizer that stems every token with the stemword() function.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 * @see https://github.com/hthetiot/php-stemmer.git
 */
class PhpStemmer implements NormalizerInterface
{
    /**
     * Available languages.
     *
     * @var array
     */
    protected $availableLanguages = array(
        'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian',
        'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian',
        'spanish', 'swedish', 'turkish'
    );

    /**
     * Charset.
     *
     * @var string
     */
    protected $charset;

    /**
     * Lang.
     *
     * @var string
     */
    protected $lang;

    /**
     * Stores the lower-cased language and the charset, the latter upper-cased
     * with '-' replaced by '_'.
     *
     * @param  string                    $lang
     * @param  string                    $charset
     * @throws \InvalidArgumentException When the language is not in the list of available languages
     */
    public function __construct($lang, $charset = 'utf-8')
    {
        $language = strtolower($lang);
        $supported = in_array($language, $this->availableLanguages);

        if (!$supported) {
            throw new \InvalidArgumentException("Invalid language $language");
        }

        $this->lang = $language;
        $this->charset = strtoupper(str_replace('-', '_', $charset));
    }

    /**
     * Replaces every token by its stem, keeping the array keys.
     *
     * @param  array $tokens The tokens to stem
     * @return array The stemmed tokens
     */
    public function normalize(array $tokens)
    {
        $language = $this->lang;
        $charset = $this->charset;

        return array_map(
            function ($token) use ($language, $charset) {
                return stemword($token, $language, $charset);
            },
            $tokens
        );
    }
}
/**
 * Token normalizer that lower-cases each token and stems it with the
 * Porter stemmer.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Porter implements NormalizerInterface
{
    /**
     * Stems every token, keeping the array keys.
     *
     * @param  array $tokens The tokens to stem
     * @return array The stemmed tokens
     */
    public function normalize(array $tokens)
    {
        $stemmed = array();

        foreach ($tokens as $key => $token) {
            $stemmed[$key] = PorterStemmer::Stem(strtolower($token));
        }

        return $stemmed;
    }
}

/**
 * Token normalizer that removes a fixed set of stopwords.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Stopword implements NormalizerInterface
{
    /**
     * An array of words to filter
     * @var array
     */
    protected $stopwords;
    /**
     * Create the normalizer from an array of stopwords
     * @param array $stopwords The array of stopwords
     */
    public function __construct(array $stopwords)
    {
        $this->stopwords = $stopwords;
    }
    /**
     * Returns the tokens that are not stopwords. The result comes straight
     * from array_diff, so the remaining tokens keep their original keys.
     *
     * @param  array $tokens The tokens to filter
     * @return array The tokens without the stopwords
     */
    public function normalize(array $tokens)
    {
        $remaining = array_diff($tokens, $this->stopwords);

        return $remaining;
    }
}
/**
 * Contract for objects that split a document into tokens.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
interface TokenizerInterface
{
    /**
     * Split document into tokens
     * @param  string $document The document to split
     * @return array  An array of tokens
     */
    public function tokenize($document);
}

/**
 * Tokenizer that splits a document into runs of Unicode letters.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Word implements TokenizerInterface
{
    /**
     * Splits the document on every run of characters that are not Unicode
     * letters and drops empty tokens.
     *
     * @param  string            $document The document to split
     * @return array             The list of tokens
     * @throws \RuntimeException When the document cannot be split, which
     *                           happens for input that is not valid UTF-8
     */
    public function tokenize($document)
    {
        // -1 is the documented "no limit" value; passing null for the limit
        // is deprecated in newer PHP versions.
        $tokens = preg_split('/\PL+/u', $document, -1, PREG_SPLIT_NO_EMPTY);

        // With the /u modifier preg_split() returns false for malformed UTF-8,
        // which would break the array return contract of the interface.
        if ($tokens === false) {
            throw new \RuntimeException(
                'Unable to tokenize document (preg error code ' . preg_last_error() . ')'
            );
        }

        return $tokens;
    }
}
/**
 * Transform that computes, per category and token, a value built from the
 * counts found in all the other categories.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Complement
{
    /**
     * For every token of every category: start from the number of documents
     * outside the category, add the token's count in each other category,
     * then divide successively by (token total + document token count) of
     * each other category.
     *
     * @param  array $documentLength      Documents keyed by category (only counted here)
     * @param  array $tokensByCategory    category => (token => count)
     * @param  int   $documentCount       Total number of documents
     * @param  array $documentTokenCounts category => number
     * @return array category => (token => value)
     */
    public function __invoke(
        $documentLength,
        $tokensByCategory,
        $documentCount,
        $documentTokenCounts
    ) {
        $categories = array_keys($tokensByCategory);

        // Sum of all token counts per category.
        $tokenTotals = array_map('array_sum', $tokensByCategory);

        // Number of documents per category.
        $docsPerCategory = array();
        foreach ($documentLength as $category => $documents) {
            $docsPerCategory[$category] = count($documents);
        }

        $complement = array();

        foreach ($tokensByCategory as $category => $tokens) {
            $others = array_diff($categories, array($category));
            $docsElsewhere = $documentCount - $docsPerCategory[$category];

            $complement[$category] = array();

            foreach ($tokens as $token => $ignored) {
                $value = $docsElsewhere;

                foreach ($others as $other) {
                    if (array_key_exists($token, $tokensByCategory[$other])) {
                        $value += $tokensByCategory[$other][$token];
                    }
                }

                // Divisions are applied one after another, in category order.
                foreach ($others as $other) {
                    $value = $value / ($tokenTotals[$other] + $documentTokenCounts[$other]);
                }

                $complement[$category][$token] = $value;
            }
        }

        return $complement;
    }
}
/**
 * Transform that counts the documents across all categories.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class DocumentCount
{
    /**
     * @param  array $data category => documents
     * @return int   The total number of documents
     */
    public function __invoke($data)
    {
        $sizes = array();

        foreach ($data as $documents) {
            $sizes[] = count($documents);
        }

        return array_sum($sizes);
    }
}

/**
 * Transform that scales every document so that the square root of the sum
 * of its squared token values becomes 1.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class DocumentLength
{
    /**
     * @param  array             $tfidf category => documents => (token => value)
     * @return array             The same structure with every value divided by its document's length
     * @throws \RuntimeException When a document's length is 0
     */
    public function __invoke($tfidf)
    {
        $normalized = $tfidf;

        foreach ($tfidf as $category => $documents) {
            foreach ($documents as $documentIndex => $document) {
                $sumOfSquares = 0;
                foreach ($document as $value) {
                    $sumOfSquares += $value * $value;
                }

                $length = sqrt($sumOfSquares);

                if ($length == 0) {
                    throw new \RuntimeException("Cannot divide by 0 in DocumentLength transform");
                }

                foreach ($document as $token => $value) {
                    $normalized[$category][$documentIndex][$token] = $value / $length;
                }
            }
        }

        return $normalized;
    }
}
/**
 * Transform that sums, per category, the element counts of its documents.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class DocumentTokenCounts
{
    /**
     * @param  array $data category => documents, each document being countable
     * @return array category => sum of count() over the category's documents
     */
    public function __invoke($data)
    {
        $totals = array();

        foreach ($data as $category => $documents) {
            $total = 0;
            foreach ($documents as $document) {
                $total += count($document);
            }
            $totals[$category] = $total;
        }

        return $totals;
    }
}

/**
 * Transform that replaces every token count by
 * log10(count + 1) * log10(documentCount / appearances of the token).
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class TFIDF
{
    /**
     * @param  array $tokenCountByDocument category => documents => (token => count)
     * @param  int   $documentCount        Total number of documents
     * @param  array $tokenAppreanceCount  token => number of documents containing it
     * @return array The input structure with every count replaced by its weight
     */
    public function __invoke(
        $tokenCountByDocument,
        $documentCount,
        $tokenAppreanceCount
    ) {
        $weighted = $tokenCountByDocument;

        foreach ($tokenCountByDocument as $category => $documents) {
            foreach ($documents as $documentIndex => $document) {
                foreach ($document as $token => $count) {
                    $termFrequency = log($count + 1, 10);
                    $inverseFrequency = log($documentCount / $tokenAppreanceCount[$token], 10);

                    $weighted[$category][$documentIndex][$token] = $termFrequency * $inverseFrequency;
                }
            }
        }

        return $weighted;
    }
}
/**
 * Transform that counts, for every token, the number of documents in which
 * it has a count greater than 0.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class TokenAppearanceCount
{
    /**
     * @param  array $tokenCountByDocument category => documents => (token => count)
     * @return array token => number of documents the token appears in
     */
    public function __invoke($tokenCountByDocument)
    {
        $appearances = array();

        foreach ($tokenCountByDocument as $documents) {
            foreach ($documents as $document) {
                foreach ($document as $token => $count) {
                    if (!($count > 0)) {
                        continue;
                    }

                    $appearances[$token] = isset($appearances[$token])
                        ? $appearances[$token] + 1
                        : 1;
                }
            }
        }

        return $appearances;
    }
}
/**
 * Transform that turns every document's token list into a token => count
 * map.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class TokenCountByDocument
{
    /**
     * @param  array $data category => list of token arrays
     * @return array category => list of (token => count) maps, re-indexed from 0
     */
    public function __invoke($data)
    {
        $countsByCategory = array();

        foreach ($data as $category => $documents) {
            $documentCounts = array();

            foreach ($documents as $tokens) {
                array_push($documentCounts, array_count_values($tokens));
            }

            $countsByCategory[$category] = $documentCounts;
        }

        return $countsByCategory;
    }
}
/**
 * Transform that adds up the token counts of all documents of a category.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class TokensByCategory
{
    /**
     * @param  array $tokenCountbyDocument category => documents => (token => count)
     * @return array category => (token => summed count)
     */
    public function __invoke($tokenCountbyDocument)
    {
        $byCategory = array();

        foreach ($tokenCountbyDocument as $category => $documents) {
            $totals = array();

            foreach ($documents as $document) {
                foreach (array_keys($document) as $token) {
                    if (!array_key_exists($token, $totals)) {
                        $totals[$token] = $document[$token];
                        continue;
                    }
                    $totals[$token] += $document[$token];
                }
            }

            $byCategory[$category] = $totals;
        }

        return $byCategory;
    }
}

/**
 * Transform that replaces every value by its base 10 logarithm.
 *
 * @author Cam Spiers
 * @package Statistical Classifier
 */
class Weight
{
    /**
     * @param  array $data category => (token => value)
     * @return array The same structure with log base 10 applied to every value
     */
    public function __invoke($data)
    {
        $weighted = $data;

        foreach ($data as $category => $tokens) {
            foreach ($tokens as $token => $value) {
                $weighted[$category][$token] = log($value, 10);
            }
        }

        return $weighted;
    }
}