├── .github └── workflows │ └── build.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE.txt ├── README.md ├── composer.json ├── src ├── BinaryRelationDetector.php ├── BinaryRelationTrainer.php ├── Document.php ├── Exception.php ├── FFI.php ├── NER.php ├── NERTrainer.php ├── NERTrainingInstance.php ├── TextCategorizer.php ├── TextCategorizerTrainer.php ├── Utils.php └── Vendor.php └── tests ├── BinaryRelationDetectorTest.php ├── BinaryRelationTrainerTest.php ├── DocumentTest.php ├── NERTest.php ├── NERTrainerTest.php ├── NERTrainingInstanceTest.php ├── TestCase.php └── TextCategorizerTest.php /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | strategy: 6 | fail-fast: false 7 | matrix: 8 | # TODO debug windows-* 9 | os: [ubuntu-latest, macos-latest] 10 | runs-on: ${{ matrix.os }} 11 | steps: 12 | - uses: actions/checkout@v4 13 | - if: ${{ startsWith(matrix.os, 'windows') }} 14 | run: | 15 | (Get-Content C:\tools\php\php.ini) -replace ';extension=ffi', 'extension=ffi' | Out-File -encoding ASCII C:\tools\php\php.ini 16 | (Get-Content C:\tools\php\php.ini) -replace ';ffi.enable=preload', 'ffi.enable=preload' | Out-File -encoding ASCII C:\tools\php\php.ini 17 | - run: composer install 18 | 19 | - uses: actions/cache@v4 20 | with: 21 | path: MITIE-models 22 | key: models 23 | id: cache-models 24 | - name: Download models 25 | if: steps.cache-models.outputs.cache-hit != 'true' 26 | run: | 27 | curl -Ls -o models.tar.bz2 https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2 28 | tar xfj models.tar.bz2 29 | 30 | - run: composer test 31 | env: 32 | MITIE_MODELS_PATH: MITIE-models/english 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /vendor/ 2 | /composer.lock 3 | /lib/ 4 | 
-------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.2.0 (2024-06-30) 2 | 3 | - Dropped support for PHP < 8.1 4 | 5 | ## 0.1.4 (2023-02-11) 6 | 7 | - Fixed warnings with PHP 8.2 8 | 9 | ## 0.1.3 (2022-09-05) 10 | 11 | - Added support for training NER models 12 | - Added support for training binary relation detectors 13 | 14 | ## 0.1.2 (2022-08-30) 15 | 16 | - Added support for text categorization 17 | - Added `saveToDisk` method to `NER` 18 | 19 | ## 0.1.1 (2022-08-28) 20 | 21 | - Added binary relation detection 22 | 23 | ## 0.1.0 (2022-08-27) 24 | 25 | - First release 26 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Boost Software License - Version 1.0 - August 17th, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MITIE PHP 2 | 3 | [MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition, binary relation detection, and text categorization - for PHP 4 | 5 | - Finds people, organizations, and locations in text 6 | - Detects relationships between entities, like `PERSON` was born in `LOCATION` 7 | 8 | [![Build Status](https://github.com/ankane/mitie-php/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/mitie-php/actions) 9 | 10 | ## Installation 11 | 12 | Run: 13 | 14 | ```sh 15 | composer require ankane/mitie 16 | ``` 17 | 18 | Add scripts to `composer.json` to download the shared library: 19 | 20 | ```json 21 | "scripts": { 22 | "post-install-cmd": "Mitie\\Vendor::check", 23 | "post-update-cmd": "Mitie\\Vendor::check" 24 | } 25 | ``` 26 | 27 | Run: 28 | 29 | ```sh 30 | composer install 31 | ``` 32 | 33 | And download the pre-trained models for your language: 34 | 35 | - [English](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2) 36 | - [Spanish](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-Spanish.zip) 37 | - [German](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-German.tar.bz2) 38 | 39 | ## Getting Started 40 | 41 | - [Named Entity Recognition](#named-entity-recognition) 42 
| - [Binary Relation Detection](#binary-relation-detection) 43 | - [Text Categorization](#text-categorization) 44 | 45 | ## Named Entity Recognition 46 | 47 | Load an NER model 48 | 49 | ```php 50 | $model = new Mitie\NER('ner_model.dat'); 51 | ``` 52 | 53 | Create a document 54 | 55 | ```php 56 | $doc = $model->doc('Nat works at GitHub in San Francisco'); 57 | ``` 58 | 59 | Get entities 60 | 61 | ```php 62 | $doc->entities(); 63 | ``` 64 | 65 | This returns 66 | 67 | ```php 68 | [ 69 | ['text' => 'Nat', 'tag' => 'PERSON', 'score' => 0.3112371212688382, 'offset' => 0], 70 | ['text' => 'GitHub', 'tag' => 'ORGANIZATION', 'score' => 0.5660115198329334, 'offset' => 13], 71 | ['text' => 'San Francisco', 'tag' => 'LOCATION', 'score' => 1.3890524313885309, 'offset' => 23] 72 | ] 73 | ``` 74 | 75 | Get tokens 76 | 77 | ```php 78 | $doc->tokens(); 79 | ``` 80 | 81 | Get tokens and their offset 82 | 83 | ```php 84 | $doc->tokensWithOffset(); 85 | ``` 86 | 87 | Get all tags for a model 88 | 89 | ```php 90 | $model->tags(); 91 | ``` 92 | 93 | ### Training 94 | 95 | Load an NER model into a trainer 96 | 97 | ```php 98 | $trainer = new Mitie\NERTrainer('total_word_feature_extractor.dat'); 99 | ``` 100 | 101 | Create training instances 102 | 103 | ```php 104 | $tokens = ['You', 'can', 'do', 'machine', 'learning', 'in', 'PHP', '!']; 105 | $instance = new Mitie\NERTrainingInstance($tokens); 106 | $instance->addEntity(3, 4, 'topic'); // machine learning 107 | $instance->addEntity(6, 6, 'language'); // PHP 108 | ``` 109 | 110 | Add the training instances to the trainer 111 | 112 | ```php 113 | $trainer->add($instance); 114 | ``` 115 | 116 | Train the model 117 | 118 | ```php 119 | $model = $trainer->train(); 120 | ``` 121 | 122 | Save the model 123 | 124 | ```php 125 | $model->saveToDisk('ner_model.dat'); 126 | ``` 127 | 128 | ## Binary Relation Detection 129 | 130 | Detect relationships between two entities, like: 131 | 132 | - `PERSON` was born in `LOCATION` 133 | - `ORGANIZATION` 
was founded in `LOCATION` 134 | - `FILM` was directed by `PERSON` 135 | 136 | There are 21 detectors for English. You can find them in the `binary_relations` directory in the model download. 137 | 138 | Load a detector 139 | 140 | ```php 141 | $detector = new Mitie\BinaryRelationDetector('rel_classifier_organization.organization.place_founded.svm'); 142 | ``` 143 | 144 | And create a document 145 | 146 | ```php 147 | $doc = $model->doc('Shopify was founded in Ottawa'); 148 | ``` 149 | 150 | Get relations 151 | 152 | ```php 153 | $detector->relations($doc); 154 | ``` 155 | 156 | This returns 157 | 158 | ```php 159 | [['first' => 'Shopify', 'second' => 'Ottawa', 'score' => 0.17649169745814464]] 160 | ``` 161 | 162 | ### Training 163 | 164 | Load an NER model into a trainer 165 | 166 | ```php 167 | $trainer = new Mitie\BinaryRelationTrainer($model); 168 | ``` 169 | 170 | Add positive and negative examples to the trainer 171 | 172 | ```php 173 | $tokens = ['Shopify', 'was', 'founded', 'in', 'Ottawa']; 174 | $trainer->addPositiveBinaryRelation($tokens, [0, 0], [4, 4]); 175 | $trainer->addNegativeBinaryRelation($tokens, [4, 4], [0, 0]); 176 | ``` 177 | 178 | Train the detector 179 | 180 | ```php 181 | $detector = $trainer->train(); 182 | ``` 183 | 184 | Save the detector 185 | 186 | ```php 187 | $detector->saveToDisk('binary_relation_detector.svm'); 188 | ``` 189 | 190 | ## Text Categorization 191 | 192 | Load a model into a trainer 193 | 194 | ```php 195 | $trainer = new Mitie\TextCategorizerTrainer('total_word_feature_extractor.dat'); 196 | ``` 197 | 198 | Add labeled text to the trainer 199 | 200 | ```php 201 | $trainer->add('This is super cool', 'positive'); 202 | ``` 203 | 204 | Train the model 205 | 206 | ```php 207 | $model = $trainer->train(); 208 | ``` 209 | 210 | Save the model 211 | 212 | ```php 213 | $model->saveToDisk('text_categorization_model.dat'); 214 | ``` 215 | 216 | Load a saved model 217 | 218 | ```php 219 | $model = new 
Mitie\TextCategorizer('text_categorization_model.dat'); 220 | ``` 221 | 222 | Categorize text 223 | 224 | ```php 225 | $model->categorize('What a super nice day'); 226 | ``` 227 | 228 | ## History 229 | 230 | View the [changelog](CHANGELOG.md) 231 | 232 | ## Contributing 233 | 234 | Everyone is encouraged to help improve this project. Here are a few ways you can help: 235 | 236 | - [Report bugs](https://github.com/ankane/mitie-php/issues) 237 | - Fix bugs and [submit pull requests](https://github.com/ankane/mitie-php/pulls) 238 | - Write, clarify, or fix documentation 239 | - Suggest or add new features 240 | 241 | To get started with development: 242 | 243 | ```sh 244 | git clone https://github.com/ankane/mitie-php.git 245 | cd mitie-php 246 | composer install 247 | composer test 248 | ``` 249 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ankane/mitie", 3 | "description": "Named-entity recognition for PHP", 4 | "license": "BSL-1.0", 5 | "support": { 6 | "issues": "https://github.com/ankane/mitie-php/issues", 7 | "source": "https://github.com/ankane/mitie-php" 8 | }, 9 | "authors": [ 10 | { 11 | "name": "Andrew Kane", 12 | "email": "andrew@ankane.org" 13 | } 14 | ], 15 | "autoload": { 16 | "psr-4": { 17 | "Mitie\\": "src/" 18 | } 19 | }, 20 | "autoload-dev": { 21 | "psr-4": { 22 | "Tests\\": "tests/" 23 | } 24 | }, 25 | "require": { 26 | "php": ">= 8.1", 27 | "ext-ffi": ">= 8.1" 28 | }, 29 | "require-dev": { 30 | "phpunit/phpunit": "^10" 31 | }, 32 | "scripts": { 33 | "test": "phpunit tests", 34 | "post-install-cmd": "Mitie\\Vendor::check", 35 | "post-update-cmd": "Mitie\\Vendor::check" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/BinaryRelationDetector.php: -------------------------------------------------------------------------------- 1 | ffi = 
FFI::instance(); 13 | 14 | if (!is_null($path)) { 15 | if (!file_exists($path)) { 16 | throw new \InvalidArgumentException('File does not exist'); 17 | } 18 | $this->pointer = $this->ffi->mitie_load_binary_relation_detector($path); 19 | } elseif (!is_null($pointer)) { 20 | $this->pointer = $pointer; 21 | } else { 22 | throw new \InvalidArgumentException('Must pass either a path or a pointer'); 23 | } 24 | } 25 | 26 | public function __destruct() 27 | { 28 | FFI::mitie_free($this->pointer); 29 | } 30 | 31 | public function name() 32 | { 33 | return $this->ffi->mitie_binary_relation_detector_name_string($this->pointer); 34 | } 35 | 36 | public function relations($doc) 37 | { 38 | if (!($doc instanceof Document)) { 39 | throw new \InvalidArgumentException('Expected Mitie\Document'); 40 | } 41 | 42 | $entities = $doc->entities(); 43 | $combinations = []; 44 | for ($i = 0; $i < count($entities) - 1; $i++) { 45 | $combinations[] = [$entities[$i], $entities[$i + 1]]; 46 | $combinations[] = [$entities[$i + 1], $entities[$i]]; 47 | } 48 | 49 | $relations = []; 50 | foreach ($combinations as [$entity1, $entity2]) { 51 | $relation = $this->extractRelation($doc, $entity1, $entity2); 52 | if (!is_null($relation)) { 53 | $relations[] = $relation; 54 | } 55 | } 56 | return $relations; 57 | } 58 | 59 | public function saveToDisk($filename) 60 | { 61 | if ($this->ffi->mitie_save_binary_relation_detector($filename, $this->pointer) != 0) { 62 | throw new Exception('Unable to save detector'); 63 | } 64 | } 65 | 66 | private function extractRelation($doc, $entity1, $entity2) 67 | { 68 | try { 69 | $relation = $this->ffi->mitie_extract_binary_relation( 70 | $doc->model->pointer, 71 | $doc->tokensPtr, 72 | $entity1['token_index'], 73 | $entity1['token_length'], 74 | $entity2['token_index'], 75 | $entity2['token_length'] 76 | ); 77 | 78 | $scorePtr = $this->ffi->new('double'); 79 | $status = $this->ffi->mitie_classify_binary_relation($this->pointer, $relation, \FFI::addr($scorePtr)); 80 | 
if ($status != 0) { 81 | throw new Exception("Bad status: $status"); 82 | } 83 | 84 | $score = $scorePtr->cdata; 85 | if ($score > 0) { 86 | return [ 87 | 'first' => $entity1['text'], 88 | 'second' => $entity2['text'], 89 | 'score' => $score 90 | ]; 91 | } 92 | } finally { 93 | FFI::mitie_free($relation); 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/BinaryRelationTrainer.php: -------------------------------------------------------------------------------- 1 | ffi = FFI::instance(); 13 | 14 | $this->pointer = $this->ffi->mitie_create_binary_relation_trainer($name, $ner->pointer); 15 | } 16 | 17 | public function __destruct() 18 | { 19 | FFI::mitie_free($this->pointer); 20 | } 21 | 22 | public function addPositiveBinaryRelation($tokens, $range1, $range2) 23 | { 24 | $this->checkAdd($tokens, $range1, $range2); 25 | 26 | $tokensPointer = Utils::arrayToPointer($tokens); 27 | $status = $this->ffi->mitie_add_positive_binary_relation($this->pointer, $tokensPointer, $range1[0], $range1[1] - $range1[0] + 1, $range2[0], $range2[1] - $range2[0] + 1); 28 | if ($status != 0) { 29 | throw new Exception('Unable to add binary relation'); 30 | } 31 | } 32 | 33 | public function addNegativeBinaryRelation($tokens, $range1, $range2) 34 | { 35 | $this->checkAdd($tokens, $range1, $range2); 36 | 37 | $tokensPointer = Utils::arrayToPointer($tokens); 38 | $status = $this->ffi->mitie_add_negative_binary_relation($this->pointer, $tokensPointer, $range1[0], $range1[1] - $range1[0] + 1, $range2[0], $range2[1] - $range2[0] + 1); 39 | if ($status != 0) { 40 | throw new Exception('Unable to add binary relation'); 41 | } 42 | } 43 | 44 | public function beta() 45 | { 46 | return $this->ffi->mitie_binary_relation_trainer_get_beta($this->pointer); 47 | } 48 | 49 | public function setBeta($value) 50 | { 51 | if ($value < 0) { 52 | throw new \InvalidArgumentException('beta must be greater than or equal to zero'); 53 | } 54 | 55 | 
$this->ffi->mitie_binary_relation_trainer_set_beta($this->pointer, $value); 56 | } 57 | 58 | public function numThreads() 59 | { 60 | return $this->ffi->mitie_binary_relation_trainer_get_num_threads($this->pointer); 61 | } 62 | 63 | public function setNumThreads($value) 64 | { 65 | return $this->ffi->mitie_binary_relation_trainer_set_num_threads($this->pointer, $value); 66 | } 67 | 68 | public function numPositiveExamples() 69 | { 70 | return $this->ffi->mitie_binary_relation_trainer_num_positive_examples($this->pointer); 71 | } 72 | 73 | public function numNegativeExamples() 74 | { 75 | return $this->ffi->mitie_binary_relation_trainer_num_negative_examples($this->pointer); 76 | } 77 | 78 | public function train() 79 | { 80 | if ($this->numPositiveExamples() + $this->numNegativeExamples() == 0) { 81 | throw new Exception("You can't call train() on an empty trainer"); 82 | } 83 | 84 | $detector = $this->ffi->mitie_train_binary_relation_detector($this->pointer); 85 | 86 | if (is_null($detector)) { 87 | throw new Exception('Unable to create binary relation detector. 
Probably ran out of RAM.'); 88 | } 89 | 90 | return new BinaryRelationDetector(pointer: $detector); 91 | } 92 | 93 | private function checkAdd($tokens, $range1, $range2) 94 | { 95 | Utils::checkRange($range1[0], $range1[1], count($tokens)); 96 | Utils::checkRange($range2[0], $range2[1], count($tokens)); 97 | 98 | if ($this->entitiesOverlap($range1, $range2)) { 99 | throw new \InvalidArgumentException('Entities overlap'); 100 | } 101 | } 102 | 103 | private function entitiesOverlap($range1, $range2) 104 | { 105 | return $this->ffi->mitie_entities_overlap($range1[0], $range1[1] - $range1[0] + 1, $range2[0], $range2[1] - $range2[0] + 1) == 1; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/Document.php: -------------------------------------------------------------------------------- 1 | ffi = FFI::instance(); 17 | 18 | $this->model = $model; 19 | $this->text = $text; 20 | 21 | $this->offsetsPtr = $this->ffi->new('unsigned long*'); 22 | $this->tokensPtr = $this->ffi->mitie_tokenize_with_offsets($this->text, \FFI::addr($this->offsetsPtr)); 23 | } 24 | 25 | public function __destruct() 26 | { 27 | FFI::mitie_free($this->offsetsPtr); 28 | FFI::mitie_free($this->tokensPtr); 29 | } 30 | 31 | public function tokens() 32 | { 33 | return array_map(fn ($v) => $v[0], $this->tokensWithOffset()); 34 | } 35 | 36 | public function tokensWithOffset() 37 | { 38 | $i = 0; 39 | $tokens = []; 40 | while (true) { 41 | $token = $this->tokensPtr[$i]; 42 | if (is_null($token)) { 43 | break; 44 | } 45 | $offset = $this->offsetsPtr[$i]; 46 | $tokens[] = [\FFI::string($token), $offset]; 47 | $i++; 48 | } 49 | return $tokens; 50 | } 51 | 52 | // TODO memoize 53 | public function entities() 54 | { 55 | try { 56 | $entities = []; 57 | $tokens = $this->tokensWithOffset(); 58 | $detections = $this->ffi->mitie_extract_entities($this->model->pointer, $this->tokensPtr); 59 | $numDetections = 
$this->ffi->mitie_ner_get_num_detections($detections); 60 | for ($i = 0; $i < $numDetections; $i++) { 61 | $pos = $this->ffi->mitie_ner_get_detection_position($detections, $i); 62 | $len = $this->ffi->mitie_ner_get_detection_length($detections, $i); 63 | $tag = $this->ffi->mitie_ner_get_detection_tagstr($detections, $i); 64 | $score = $this->ffi->mitie_ner_get_detection_score($detections, $i); 65 | $tok = array_slice($tokens, $pos, $len); 66 | $offset = $tok[0][1]; 67 | 68 | $entity = []; 69 | if (!is_null($offset)) { 70 | $finish = end($tok)[1] + strlen(end($tok)[0]); 71 | $entity['text'] = substr($this->text, $offset, $finish - $offset); 72 | } else { 73 | $entity['text'] = array_map(fn ($v) => $v[0], $tok); 74 | } 75 | $entity['tag'] = $tag; 76 | $entity['score'] = $score; 77 | if (!is_null($offset)) { 78 | $entity['offset'] = $offset; 79 | } 80 | $entity['token_index'] = $pos; 81 | $entity['token_length'] = $len; 82 | $entities[] = $entity; 83 | } 84 | 85 | return $entities; 86 | } finally { 87 | FFI::mitie_free($detections); 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/Exception.php: -------------------------------------------------------------------------------- 1 | mitie_free($ptr); 390 | } 391 | } 392 | } 393 | -------------------------------------------------------------------------------- /src/NER.php: -------------------------------------------------------------------------------- 1 | ffi = FFI::instance(); 14 | 15 | if (!is_null($path)) { 16 | if (!file_exists($path)) { 17 | throw new \InvalidArgumentException('File does not exist'); 18 | } 19 | $this->pointer = $this->ffi->mitie_load_named_entity_extractor($path); 20 | } elseif (!is_null($pointer)) { 21 | $this->pointer = $pointer; 22 | } else { 23 | throw new \InvalidArgumentException('Must pass either a path or a pointer'); 24 | } 25 | } 26 | 27 | public function __destruct() 28 | { 29 | FFI::mitie_free($this->pointer); 30 | } 31 | 32 
| public function tags() 33 | { 34 | $tagsCount = $this->ffi->mitie_get_num_possible_ner_tags($this->pointer); 35 | $tags = []; 36 | for ($i = 0; $i < $tagsCount; $i++) { 37 | $tags[] = $this->ffi->mitie_get_named_entity_tagstr($this->pointer, $i); 38 | } 39 | return $tags; 40 | } 41 | 42 | public function doc($text) 43 | { 44 | return new Document($this, $text); 45 | } 46 | 47 | public function entities($text) 48 | { 49 | return $this->doc($text)->entities(); 50 | } 51 | 52 | public function saveToDisk($filename) 53 | { 54 | if ($this->ffi->mitie_save_named_entity_extractor($filename, $this->pointer) != 0) { 55 | throw new Exception('Unable to save model'); 56 | } 57 | } 58 | 59 | public function tokens($text) 60 | { 61 | return $this->doc($text)->tokens(); 62 | } 63 | 64 | public function tokensWithOffset($text) 65 | { 66 | return $this->doc($text)->tokensWithOffset(); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/NERTrainer.php: -------------------------------------------------------------------------------- 1 | ffi = FFI::instance(); 13 | 14 | if (!file_exists($filename)) { 15 | throw new \InvalidArgumentException('File does not exist'); 16 | } 17 | 18 | $this->pointer = $this->ffi->mitie_create_ner_trainer($filename); 19 | } 20 | 21 | public function __destruct() 22 | { 23 | FFI::mitie_free($this->pointer); 24 | } 25 | 26 | public function add($instance) 27 | { 28 | $this->ffi->mitie_add_ner_training_instance($this->pointer, $instance->pointer); 29 | } 30 | 31 | public function beta() 32 | { 33 | return $this->ffi->mitie_ner_trainer_get_beta($this->pointer); 34 | } 35 | 36 | public function setBeta($value) 37 | { 38 | if ($value < 0) { 39 | throw new \InvalidArgumentException('beta must be greater than or equal to zero'); 40 | } 41 | 42 | $this->ffi->mitie_ner_trainer_set_beta($this->pointer, $value); 43 | } 44 | 45 | public function numThreads() 46 | { 47 | return 
$this->ffi->mitie_ner_trainer_get_num_threads($this->pointer); 48 | } 49 | 50 | public function setNumThreads($value) 51 | { 52 | return $this->ffi->mitie_ner_trainer_set_num_threads($this->pointer, $value); 53 | } 54 | 55 | public function size() 56 | { 57 | return $this->ffi->mitie_ner_trainer_size($this->pointer); 58 | } 59 | 60 | public function train() 61 | { 62 | if ($this->size() == 0) { 63 | throw new Exception("You can't call train() on an empty trainer"); 64 | } 65 | 66 | $extractor = $this->ffi->mitie_train_named_entity_extractor($this->pointer); 67 | 68 | if (is_null($extractor)) { 69 | throw new Exception('Unable to create named entity extractor. Probably ran out of RAM.'); 70 | } 71 | 72 | return new NER(pointer: $extractor); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/NERTrainingInstance.php: -------------------------------------------------------------------------------- 1 | ffi = FFI::instance(); 14 | 15 | $tokensPointer = Utils::arrayToPointer($tokens); 16 | 17 | $this->pointer = $this->ffi->mitie_create_ner_training_instance($tokensPointer); 18 | if (is_null($this->pointer)) { 19 | throw new Exception('Unable to create training instance. Probably ran out of RAM.'); 20 | } 21 | } 22 | 23 | public function __destruct() 24 | { 25 | FFI::mitie_free($this->pointer); 26 | } 27 | 28 | public function addEntity($start, $end, $label) 29 | { 30 | Utils::checkRange($start, $end, $this->numTokens()); 31 | 32 | if ($this->overlapsAnyEntity($start, $end)) { 33 | throw new \InvalidArgumentException('Range overlaps existing entity'); 34 | } 35 | 36 | if ($this->ffi->mitie_add_ner_training_entity($this->pointer, $start, $end - $start + 1, $label) != 0) { 37 | throw new Exception('Unable to add entity to training instance. 
Probably ran out of RAM.'); 38 | } 39 | } 40 | 41 | public function numEntities() 42 | { 43 | return $this->ffi->mitie_ner_training_instance_num_entities($this->pointer); 44 | } 45 | 46 | public function numTokens() 47 | { 48 | return $this->ffi->mitie_ner_training_instance_num_tokens($this->pointer); 49 | } 50 | 51 | public function overlapsAnyEntity($start, $end) 52 | { 53 | Utils::checkRange($start, $end, $this->numTokens()); 54 | 55 | return $this->ffi->mitie_overlaps_any_entity($this->pointer, $start, $end - $start + 1) == 1; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/TextCategorizer.php: -------------------------------------------------------------------------------- 1 | ffi = FFI::instance(); 13 | 14 | if (!is_null($path)) { 15 | if (!file_exists($path)) { 16 | throw new \InvalidArgumentException('File does not exist'); 17 | } 18 | $this->pointer = $this->ffi->mitie_load_text_categorizer($path); 19 | } elseif (!is_null($pointer)) { 20 | $this->pointer = $pointer; 21 | } else { 22 | throw new \InvalidArgumentException('Must pass either a path or a pointer'); 23 | } 24 | } 25 | 26 | public function __destruct() 27 | { 28 | $this->ffi->mitie_free($this->pointer); 29 | } 30 | 31 | public function categorize($text) 32 | { 33 | try { 34 | // TODO support tokens 35 | $tokensPtr = $this->ffi->mitie_tokenize($text); 36 | 37 | $textTag = $this->ffi->new('char*'); 38 | $textScore = $this->ffi->new('double'); 39 | 40 | if ($this->ffi->mitie_categorize_text($this->pointer, $tokensPtr, \FFI::addr($textTag), \FFI::addr($textScore)) != 0) { 41 | throw new Exception('Unable to categorize'); 42 | } 43 | 44 | return [ 45 | 'tag' => \FFI::string($textTag), 46 | 'score' => $textScore->cdata 47 | ]; 48 | } finally { 49 | FFI::mitie_free($tokensPtr); 50 | FFI::mitie_free($textTag); 51 | } 52 | } 53 | 54 | public function saveToDisk($filename) 55 | { 56 | if ($this->ffi->mitie_save_text_categorizer($filename, 
$this->pointer) != 0) { 57 | throw new Exception('Unable to save model'); 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/TextCategorizerTrainer.php: -------------------------------------------------------------------------------- 1 | ffi = FFI::instance(); 13 | 14 | if (!file_exists($path)) { 15 | throw new \InvalidArgumentException('File does not exist'); 16 | } 17 | 18 | $this->pointer = $this->ffi->mitie_create_text_categorizer_trainer($path); 19 | } 20 | 21 | public function __destruct() 22 | { 23 | $this->ffi->mitie_free($this->pointer); 24 | } 25 | 26 | public function add($text, $label) 27 | { 28 | try { 29 | // TODO support tokens 30 | $tokensPtr = $this->ffi->mitie_tokenize($text); 31 | $this->ffi->mitie_add_text_categorizer_labeled_text($this->pointer, $tokensPtr, $label); 32 | } finally { 33 | FFI::mitie_free($tokensPtr); 34 | } 35 | } 36 | 37 | public function beta() 38 | { 39 | return $this->ffi->mitie_text_categorizer_trainer_get_beta($this->pointer); 40 | } 41 | 42 | public function setBeta($value) 43 | { 44 | if ($value < 0) { 45 | throw new \InvalidArgumentException('beta must be greater than or equal to zero'); 46 | } 47 | 48 | $this->ffi->mitie_text_categorizer_trainer_set_beta($this->pointer, $value); 49 | } 50 | 51 | public function numThreads() 52 | { 53 | return $this->ffi->mitie_text_categorizer_trainer_get_num_threads($this->pointer); 54 | } 55 | 56 | public function setNumThreads($value) 57 | { 58 | $this->ffi->mitie_text_categorizer_trainer_set_num_threads($this->pointer, $value); 59 | } 60 | 61 | public function size() 62 | { 63 | return $this->ffi->mitie_text_categorizer_trainer_size($this->pointer); 64 | } 65 | 66 | public function train() 67 | { 68 | if ($this->size() == 0) { 69 | throw new Exception("You can't call train() on an empty trainer"); 70 | } 71 | 72 | $categorizer = $this->ffi->mitie_train_text_categorizer($this->pointer); 73 | 74 | if (is_null($categorizer)) 
{ 75 | throw new Exception('Unable to create text categorizer. Probably ran out of RAM.'); 76 | } 77 | 78 | return new TextCategorizer(pointer: $categorizer); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/Utils.php: -------------------------------------------------------------------------------- 1 | new('char*[' . ($tokensSize + 1) . ']'); 11 | for ($i = 0; $i < $tokensSize; $i++) { 12 | $tokensPtr[$i] = self::cstring($tokens[$i]); 13 | } 14 | return $tokensPtr; 15 | } 16 | 17 | public static function checkRange($start, $end, $numTokens) 18 | { 19 | if ($start > $end || $start < 0 || $end >= $numTokens) { 20 | throw new \InvalidArgumentException('Invalid range'); 21 | } 22 | } 23 | 24 | private static function cstring($str) 25 | { 26 | $bytes = strlen($str) + 1; 27 | // TODO fix? 28 | $ptr = FFI::instance()->new("char[$bytes]", owned: false); 29 | \FFI::memcpy($ptr, $str, $bytes - 1); 30 | $ptr[$bytes - 1] = "\0"; 31 | return $ptr; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/Vendor.php: -------------------------------------------------------------------------------- 1 | [ 11 | 'file' => 'libmitie.so', 12 | 'checksum' => '07b241d857a4bcd7fd97b68a87ccb06fbab70bfc621ee25aa0ea6bd7f905c45c' 13 | ], 14 | 'x86_64-darwin' => [ 15 | 'file' => 'libmitie.dylib', 16 | 'checksum' => '8c4fdbe11ef137c401141242af8030628672d64589b5e63ba9c13b7162d29d6c' 17 | ], 18 | 'arm64-darwin' => [ 19 | 'file' => 'libmitie.arm64.dylib', 20 | 'checksum' => '616117825ac8a37ec1f016016868e1d72a21e5f3a90cc6b0347d4ff9dbf98088' 21 | ], 22 | 'x64-windows' => [ 23 | 'file' => 'mitie.dll', 24 | 'checksum' => 'dfeaaf72b12c7323d9447275af16afe5a1c64096ec2f00d04cb50f518ca19776' 25 | ] 26 | ]; 27 | 28 | public static function check($event = null) 29 | { 30 | $dest = self::defaultLib(); 31 | if (file_exists($dest)) { 32 | echo "✔ MITIE found\n"; 33 | return; 34 | } 35 | 36 | $dir = 
self::libDir(); 37 | if (!file_exists($dir)) { 38 | mkdir($dir); 39 | } 40 | 41 | echo "Downloading MITIE...\n"; 42 | 43 | $file = self::libFile(); 44 | $url = self::withVersion("https://github.com/ankane/ml-builds/releases/download/mitie-{{version}}/$file"); 45 | $contents = file_get_contents($url); 46 | 47 | $checksum = hash('sha256', $contents); 48 | if ($checksum != self::platform('checksum')) { 49 | throw new Exception("Bad checksum: $checksum"); 50 | } 51 | 52 | file_put_contents($dest, $contents); 53 | 54 | echo "✔ Success\n"; 55 | } 56 | 57 | public static function defaultLib() 58 | { 59 | return self::libDir() . '/' . self::libFile(); 60 | } 61 | 62 | private static function libDir() 63 | { 64 | return __DIR__ . '/../lib'; 65 | } 66 | 67 | private static function libFile() 68 | { 69 | return self::platform('file'); 70 | } 71 | 72 | private static function platform($key) 73 | { 74 | return self::PLATFORMS[self::platformKey()][$key]; 75 | } 76 | 77 | private static function platformKey() 78 | { 79 | if (PHP_OS_FAMILY == 'Windows') { 80 | return 'x64-windows'; 81 | } elseif (PHP_OS_FAMILY == 'Darwin') { 82 | if (php_uname('m') == 'x86_64') { 83 | return 'x86_64-darwin'; 84 | } else { 85 | return 'arm64-darwin'; 86 | } 87 | } else { 88 | if (php_uname('m') == 'x86_64') { 89 | return 'x86_64-linux'; 90 | } else { 91 | return 'aarch64-linux'; 92 | } 93 | } 94 | } 95 | 96 | private static function withVersion($str) 97 | { 98 | return str_replace('{{version}}', self::VERSION, $str); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /tests/BinaryRelationDetectorTest.php: -------------------------------------------------------------------------------- 1 | modelsPath(); 10 | $detector = new Mitie\BinaryRelationDetector("$modelsPath/binary_relations/rel_classifier_film.film.directed_by.svm"); 11 | $this->assertEquals('film.film.directed_by', $detector->name()); 12 | $doc = $this->model()->doc('The Shawshank 
/**
 * The place_founded classifier reports its name and yields exactly one
 * relation, Shopify -> Ottawa.
 */
public function testPlaceFounded()
{
    $dir = $this->modelsPath();
    $detector = new Mitie\BinaryRelationDetector("$dir/binary_relations/rel_classifier_organization.organization.place_founded.svm");
    self::assertEquals('organization.organization.place_founded', $detector->name());

    $document = $this->model()->doc('Shopify was founded in Ottawa');
    $found = $detector->relations($document);
    self::assertCount(1, $found);

    $only = $found[0];
    self::assertEquals('Shopify', $only['first']);
    self::assertEquals('Ottawa', $only['second']);
}

/**
 * Passing a plain string instead of a document must be rejected.
 */
public function testNonDocument()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Expected Mitie\Document');

    $dir = $this->modelsPath();
    (new Mitie\BinaryRelationDetector("$dir/binary_relations/rel_classifier_film.film.directed_by.svm"))
        ->relations('Hi');
}

/**
 * Constructing a detector from a path that does not exist must be rejected.
 */
public function testMissingFile()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('File does not exist');

    new Mitie\BinaryRelationDetector('missing.dat');
}
/**
 * Positive example: first range ends before it starts.
 */
public function testAddPositiveBinaryRelationInvalidRange1()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Invalid range');

    (new Mitie\BinaryRelationTrainer($this->model()))
        ->addPositiveBinaryRelation($this->tokens(), [0, -1], [4, 4]);
}

/**
 * Positive example: second range is inverted.
 */
public function testAddPositiveBinaryRelationInvalidRange2()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Invalid range');

    (new Mitie\BinaryRelationTrainer($this->model()))
        ->addPositiveBinaryRelation($this->tokens(), [0, 0], [4, 3]);
}

/**
 * Positive example: second range runs past the last token.
 */
public function testAddPositiveBinaryRelationInvalidRange3()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Invalid range');

    (new Mitie\BinaryRelationTrainer($this->model()))
        ->addPositiveBinaryRelation($this->tokens(), [0, 0], [4, 5]);
}

/**
 * Negative example: first range ends before it starts.
 */
public function testAddNegativeBinaryRelationInvalidRange1()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Invalid range');

    (new Mitie\BinaryRelationTrainer($this->model()))
        ->addNegativeBinaryRelation($this->tokens(), [0, -1], [4, 4]);
}

/**
 * Negative example: second range is inverted.
 */
public function testAddNegativeBinaryRelationInvalidRange2()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Invalid range');

    (new Mitie\BinaryRelationTrainer($this->model()))
        ->addNegativeBinaryRelation($this->tokens(), [0, 0], [4, 3]);
}

/**
 * Negative example: second range runs past the last token.
 */
public function testAddNegativeBinaryRelationInvalidRange3()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Invalid range');

    (new Mitie\BinaryRelationTrainer($this->model()))
        ->addNegativeBinaryRelation($this->tokens(), [0, 0], [4, 5]);
}

/**
 * Positive example: the two entity ranges share token 1.
 */
public function testAddPositiveBinaryRelationEntitiesOverlap()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Entities overlap');

    (new Mitie\BinaryRelationTrainer($this->model()))
        ->addPositiveBinaryRelation($this->tokens(), [0, 1], [1, 2]);
}

/**
 * Negative example: the two entity ranges share token 1.
 */
public function testAddNegativeBinaryRelationEntitiesOverlap()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Entities overlap');

    (new Mitie\BinaryRelationTrainer($this->model()))
        ->addNegativeBinaryRelation($this->tokens(), [0, 1], [1, 2]);
}

/**
 * Training without any examples must raise a Mitie\Exception.
 */
public function testEmptyTrainer()
{
    $this->expectException(Mitie\Exception::class);
    $this->expectExceptionMessage("You can't call train() on an empty trainer");

    (new Mitie\BinaryRelationTrainer($this->model()))->train();
}

/**
 * Five-token sentence shared by the tests of this class.
 */
private function tokens()
{
    return [
        'Shopify',
        'was',
        'founded',
        'in',
        'Ottawa',
    ];
}
/**
 * Entities of a token-array document (currently skipped).
 */
public function testEntitiesTokens()
{
    $this->markTestSkipped('Not supported yet');

    $wanted = [
        [
            'text' => ['Nat'],
            'tag' => 'PERSON',
            'score' => 0.31123712126883823,
            'token_index' => 0,
            'token_length' => 1,
        ],
        [
            'text' => ['GitHub'],
            'tag' => 'LOCATION',
            'score' => 0.5660115198329334,
            'token_index' => 3,
            'token_length' => 1,
        ],
        [
            'text' => ['San', 'Francisco'],
            'tag' => 'LOCATION',
            'score' => 1.3890524313885309,
            'token_index' => 5,
            'token_length' => 2,
        ],
    ];
    self::assertEquals($wanted, $this->tokenDoc()->entities());
}

/**
 * Checks the entity texts found in 'San Francisco, California'.
 */
public function testEntitiesLocation()
{
    // would ideally return a single location
    $entities = $this->model()->doc('San Francisco, California')->entities();
    $texts = array_map(static fn ($entity) => $entity['text'], $entities);

    self::assertEquals(['San Francisco', 'California'], $texts);
}

// offset is in bytes
/**
 * A leading UTF-8 byte order mark is counted in the reported offset.
 */
public function testEntitiesByteOrderMark()
{
    $wanted = [
        [
            'text' => 'California',
            'tag' => 'LOCATION',
            'score' => 1.4244816233933328,
            'offset' => 12,
            'token_index' => 2,
            'token_length' => 1,
        ],
    ];
    $actual = $this->model()->doc("\xEF\xBB\xBFWorks in California")->entities();

    self::assertEquals($wanted, $actual);
}

/**
 * Tokens of the shared sample sentence.
 */
public function testTokens()
{
    self::assertEquals(
        ['Nat', 'works', 'at', 'GitHub', 'in', 'San', 'Francisco'],
        $this->doc()->tokens()
    );
}

/**
 * Tokens of a token-array document (currently skipped).
 */
public function testTokensTokens()
{
    $this->markTestSkipped('Not supported yet');

    self::assertEquals(
        ['Nat', 'works', 'at', 'GitHub', 'in', 'San', 'Francisco'],
        $this->tokenDoc()->tokens()
    );
}

/**
 * Tokens of the sample sentence paired with their offsets.
 */
public function testTokensWithOffset()
{
    $wanted = [
        ['Nat', 0],
        ['works', 4],
        ['at', 10],
        ['GitHub', 13],
        ['in', 20],
        ['San', 23],
        ['Francisco', 27],
    ];

    self::assertEquals($wanted, $this->doc()->tokensWithOffset());
}

/**
 * Token/offset pairs of a token-array document (currently skipped).
 */
public function testTokensWithOffsetTokens()
{
    $this->markTestSkipped('Not supported yet');

    $wanted = [
        ['Nat', null],
        ['works', null],
        ['at', null],
        ['GitHub', null],
        ['in', null],
        ['San', null],
        ['Francisco', null],
    ];

    self::assertEquals($wanted, $this->tokenDoc()->tokensWithOffset());
}

/**
 * Curly quotes are split off as separate tokens.
 */
public function testTokensUtf8()
{
    $tokens = $this->model()->doc('“hello”')->tokens();

    self::assertEquals(['“', 'hello', '”'], $tokens);
}

/**
 * Offsets around multi-byte quotes (currently skipped).
 */
public function testTokensWithOffsetUtf8()
{
    // https://github.com/mit-nlp/MITIE/issues/211
    $this->markTestSkipped('Possible bug with MITIE');

    $pairs = $this->model()->doc('“hello”')->tokensWithOffset();

    self::assertEquals([['“', 0], ['hello', 1], ['”', 6]], $pairs);
}

/**
 * Document built from the shared sample text.
 */
private function doc()
{
    $text = $this->text();

    return $this->model()->doc($text);
}

/**
 * Document built from the pre-tokenized sample sentence.
 */
private function tokenDoc()
{
    $tokens = $this->tokens();

    return $this->model()->doc($tokens);
}

/**
 * The sample sentence as a list of tokens.
 */
private function tokens()
{
    return [
        'Nat',
        'works',
        'at',
        'GitHub',
        'in',
        'San',
        'Francisco',
    ];
}
/**
 * Tokens of the shared sample sentence, asked directly of the model.
 */
public function testTokens()
{
    $tokens = $this->model()->tokens($this->text());

    self::assertEquals(['Nat', 'works', 'at', 'GitHub', 'in', 'San', 'Francisco'], $tokens);
}

/**
 * Curly quotes are split off as separate tokens.
 */
public function testTokensUtf8()
{
    $tokens = $this->model()->tokens('“hello”');

    self::assertEquals(['“', 'hello', '”'], $tokens);
}

/**
 * Tokens of the sample sentence paired with their offsets.
 */
public function testTokensWithOffset()
{
    $wanted = [
        ['Nat', 0],
        ['works', 4],
        ['at', 10],
        ['GitHub', 13],
        ['in', 20],
        ['San', 23],
        ['Francisco', 27],
    ];

    self::assertEquals($wanted, $this->model()->tokensWithOffset($this->text()));
}

/**
 * The model exposes its four tags in this order.
 */
public function testTags()
{
    $tags = $this->model()->tags();

    self::assertEquals(['PERSON', 'LOCATION', 'ORGANIZATION', 'MISC'], $tags);
}

/**
 * Constructing a model from a path that does not exist must be rejected.
 */
public function testMissingFile()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('File does not exist');

    new Mitie\NER('missing.dat');
}

/**
 * Saving writes a file at the requested path; the file is removed afterwards.
 */
public function testSaveToDisk()
{
    $target = tempnam(sys_get_temp_dir(), 'model');

    $this->model()->saveToDisk($target);

    self::assertFileExists($target);
    unlink($target);
}

/**
 * Saving into a directory that does not exist raises a Mitie\Exception.
 */
public function testSaveToDiskError()
{
    $this->expectException(Mitie\Exception::class);
    $this->expectExceptionMessage('Unable to save model');

    $model = $this->model();
    $model->saveToDisk('missing/ner_model.dat');
}
/**
 * A negative beta must be rejected.
 */
public function testBetaWriterRaisesOnInvalidInput()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('beta must be greater than or equal to zero');

    (new Mitie\NERTrainer($this->featureExtractorPath()))->setBeta(-0.5);
}

/**
 * numThreads() returns what setNumThreads() stored.
 */
public function testNumThreadsAccessors()
{
    $subject = new Mitie\NERTrainer($this->featureExtractorPath());

    $subject->setNumThreads(2);

    self::assertEquals(2, $subject->numThreads());
}

/**
 * Trains on one annotated sentence and checks the first entity found in a
 * new sentence.
 */
public function testTrain()
{
    $sample = new Mitie\NERTrainingInstance(['You', 'can', 'do', 'machine', 'learning', 'in', 'PHP', '!']);
    $sample->addEntity(3, 4, 'topic');
    $sample->addEntity(6, 6, 'language');

    $subject = new Mitie\NERTrainer($this->featureExtractorPath());
    $subject->add($sample);
    $subject->setNumThreads(2);
    $trained = $subject->train();

    $entities = $trained->doc('Code in PHP')->entities();
    $first = $entities[0];
    self::assertEquals('PHP', $first['text']);
    self::assertEquals('language', $first['tag']);
}

/**
 * Training without any instances must raise a Mitie\Exception.
 */
public function testEmptyTrainer()
{
    $this->expectException(Mitie\Exception::class);
    $this->expectExceptionMessage("You can't call train() on an empty trainer");

    (new Mitie\NERTrainer($this->featureExtractorPath()))->train();
}

/**
 * Constructing a trainer from a path that does not exist must be rejected.
 */
public function testMissingFile()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('File does not exist');

    new Mitie\NERTrainer('missing.dat');
}
/**
 * An entity range ending past the last token must be rejected.
 */
public function testAddEntityRaisesOnInvalidInput2()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Invalid range');

    $subject = new Mitie\NERTrainingInstance(['I', 'raise', 'errors', '.']);
    $subject->addEntity(2, 2, 'noun');
    $subject->addEntity(1, 8, 'nope');
}

/**
 * An entity range starting below zero must be rejected.
 */
public function testAddEntityRaisesOnInvalidInput3()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Invalid range');

    $subject = new Mitie\NERTrainingInstance(['I', 'raise', 'errors', '.']);
    $subject->addEntity(2, 2, 'noun');
    $subject->addEntity(-1, 1, 'nope');
}

/**
 * Adding the same range twice must be rejected as an overlap.
 */
public function testAddEntityRaisesOnInvalidInput4()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Range overlaps existing entity');

    $subject = new Mitie\NERTrainingInstance(['I', 'raise', 'errors', '.']);
    $subject->addEntity(2, 2, 'noun');
    $subject->addEntity(2, 2, 'nope');
}

/**
 * numEntities() goes from 0 to 2 after two entities are added.
 */
public function testNumEntities()
{
    $subject = new Mitie\NERTrainingInstance(['You', 'can', 'do', 'machine', 'learning', 'in', 'PHP', '!']);

    self::assertEquals(0, $subject->numEntities());

    $subject->addEntity(3, 4, 'topic');
    $subject->addEntity(6, 6, 'language');

    self::assertEquals(2, $subject->numEntities());
}

/**
 * numTokens() matches the number of tokens passed to the constructor.
 */
public function testNumTokens()
{
    $subject = new Mitie\NERTrainingInstance(['I', 'have', 'five', 'tokens', '.']);

    self::assertEquals(5, $subject->numTokens());
}

/**
 * overlapsAnyEntity() is true only for ranges touching an added entity.
 */
public function testOverlapsAnyEntity()
{
    $subject = new Mitie\NERTrainingInstance(['You', 'can', 'do', 'machine', 'learning', 'in', 'PHP', '!']);
    $subject->addEntity(3, 4, 'topic');
    $subject->addEntity(6, 6, 'language');

    self::assertFalse($subject->overlapsAnyEntity(1, 2));
    self::assertTrue($subject->overlapsAnyEntity(2, 3));
    self::assertFalse($subject->overlapsAnyEntity(5, 5));
}

/**
 * An inverted range passed to overlapsAnyEntity() must be rejected.
 */
public function testOverlapsAnyEntityInvalidRange1()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Invalid range');

    $subject = new Mitie\NERTrainingInstance(['I', 'raise', 'errors', '.']);
    $subject->addEntity(2, 2, 'noun');
    $subject->overlapsAnyEntity(1, 0);
}

/**
 * A range beyond the last token passed to overlapsAnyEntity() must be
 * rejected.
 */
public function testOverlapsAnyEntityInvalidRange2()
{
    $this->expectException(InvalidArgumentException::class);
    $this->expectExceptionMessage('Invalid range');

    $subject = new Mitie\NERTrainingInstance(['I', 'raise', 'errors', '.']);
    $subject->addEntity(2, 2, 'noun');
    $subject->overlapsAnyEntity(9, 12);
}
/**
 * Trains a two-label categorizer from plain strings, round-trips it through
 * a file on disk and categorizes a new sentence.
 *
 * The temporary model file is removed in a finally block so that a failing
 * assertion does not leave it behind in the system temp directory.
 */
public function testStrings()
{
    $trainer = new Mitie\TextCategorizerTrainer($this->featureExtractorPath());
    $trainer->add('This is super cool', 'positive');
    $trainer->add('I am not a fan', 'negative');
    $model = $trainer->train();

    // tempnam() already creates the file, so it needs cleaning up even if
    // saveToDisk() fails.
    $path = tempnam(sys_get_temp_dir(), 'model');
    try {
        $model->saveToDisk($path);
        $this->assertFileExists($path);

        $model = new Mitie\TextCategorizer($path);
        $result = $model->categorize('What a super nice day');
        $this->assertEquals('positive', $result['tag']);
        $this->assertEqualsWithDelta(0.0684, $result['score'], 0.001);
    } finally {
        if (is_string($path) && is_file($path)) {
            unlink($path);
        }
    }
}

/**
 * Training without any samples must raise a Mitie\Exception.
 */
public function testEmptyTrainer()
{
    $this->expectException(Mitie\Exception::class);
    $this->expectExceptionMessage("You can't call train() on an empty trainer");

    $trainer = new Mitie\TextCategorizerTrainer($this->featureExtractorPath());
    $trainer->train();
}