├── CHANGELOG.md ├── README.md ├── autoload.php ├── composer.json ├── examples ├── ner.php ├── pos.php └── stanford.php └── src └── StanfordNLP ├── Base.php ├── Exception.php ├── NERTagger.php ├── POSTagger.php ├── Parser.php └── StanfordTagger.php /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ## 0.1.1 4 | 5 | - (CHG) Update StanfordTagger::tag to return results for multiple sentences. This affects the return values for POSTagger::tag and NERTagger::tag 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PHP-Stanford-NLP # 2 | 3 | PHP interface to Stanford NLP Tools (POS Tagger, NER, Parser) 4 | 5 | This library was tested against individual jar files for each package version 3.8.0 (english). 6 | 7 | It was NOT built for use with the [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). 8 | 9 | ### Installation 10 | 11 | This library requires PHP 5.3 or later. 12 | 13 | It is available via Composer as [agentile/php-stanford-nlp](https://packagist.org/packages/agentile/php-stanford-nlp). 14 | 15 | You may also clone this repository, then require or include its _autoload.php_ file. 
16 | 17 | ## POS Tagger ## 18 | 19 | [https://nlp.stanford.edu/software/tagger.html#Download](https://nlp.stanford.edu/software/tagger.html#Download) 20 | 21 | Mimics http://nltk.org/_modules/nltk/tag/stanford.html#StanfordTagger 22 | 23 | ### Example Usage ### 24 | 25 | See examples [here](https://github.com/agentile/PHP-Stanford-NLP/tree/master/examples) 26 | 27 | ```php 28 | $pos = new \StanfordNLP\POSTagger( 29 | '/path/to/stanford-postagger-2017-06-09/models/english-left3words-distsim.tagger', 30 | '/path/to/stanford-postagger-2017-06-09/stanford-postagger-3.8.0.jar' 31 | ); 32 | $result = $pos->tag(explode(' ', "What does the fox say?")); 33 | var_dump($result); 34 | ``` 35 | 36 | ## NER Tagger ## 37 | 38 | [https://nlp.stanford.edu/software/CRF-NER.shtml#Download](https://nlp.stanford.edu/software/CRF-NER.shtml#Download) 39 | 40 | Mimics http://nltk.org/_modules/nltk/tag/stanford.html#StanfordTagger 41 | 42 | ### Example Usage ### 43 | 44 | ```php 45 | $pos = new \StanfordNLP\NERTagger( 46 | '/path/to/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz', 47 | '/path/to/stanford-ner-2017-06-09/stanford-ner-3.8.0.jar' 48 | ); 49 | $result = $pos->tag(explode(' ', "The Federal Reserve Bank of New York led by Timothy R.
Geithner.")); 50 | var_dump($result); 51 | ``` 52 | 53 | ## Parser ## 54 | 55 | [https://nlp.stanford.edu/software/lex-parser.shtml#Download](https://nlp.stanford.edu/software/lex-parser.shtml#Download) 56 | 57 | ### Example Usage ### 58 | 59 | ```php 60 | $parser = new \StanfordNLP\Parser( 61 | '/path/to/stanford-parser-full-2017-06-09/stanford-parser.jar', 62 | '/path/to/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar' 63 | ); 64 | $result = $parser->parseSentence("What does the fox say?"); 65 | var_dump($result); 66 | ``` 67 | -------------------------------------------------------------------------------- /autoload.php: -------------------------------------------------------------------------------- 1 | array( 11 | __DIR__ . '/src/' . $ns, 12 | ), 13 | ); 14 | 15 | // go through the prefixes 16 | foreach ($prefixes as $prefix => $dirs) { 17 | 18 | // does the requested class match the namespace prefix? 19 | $prefix_len = strlen($prefix); 20 | if (substr($class, 0, $prefix_len) !== $prefix) { 21 | continue; 22 | } 23 | 24 | // strip the prefix off the class 25 | $class = substr($class, $prefix_len); 26 | 27 | // a partial filename 28 | $part = str_replace('\\', DIRECTORY_SEPARATOR, $class) . '.php'; 29 | 30 | // go through the directories to find classes 31 | foreach ($dirs as $dir) { 32 | $dir = str_replace('/', DIRECTORY_SEPARATOR, $dir); 33 | $file = $dir . DIRECTORY_SEPARATOR . 
$part; 34 | if (is_readable($file)) { 35 | require $file; 36 | return; 37 | } 38 | } 39 | } 40 | }); 41 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "agentile/php-stanford-nlp", 3 | "type": "library", 4 | "description": "PHP interface to Stanford NLP Tools", 5 | "keywords": ["stanford","nlp","ner","pos","parser"], 6 | "homepage": "https://github.com/agentile/PHP-Stanford-NLP", 7 | "license": "MIT", 8 | "authors": [ 9 | { 10 | "name": "Anthony Gentile", 11 | "email": "asgentile@gmail.com", 12 | "homepage": "http://agentile.com", 13 | "role": "Developer" 14 | } 15 | ], 16 | "require": { 17 | "php": ">=5.3.0" 18 | }, 19 | "autoload": { 20 | "psr-0": { 21 | "StanfordNLP": "src" 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /examples/ner.php: -------------------------------------------------------------------------------- 1 | setDebug(true); 13 | 14 | $result = $pos->tag(explode(' ', "The Federal Reserve Bank of New York led by Timothy R. Geithner. He also said that we should call the Internal Revenue Services office")); 15 | //$results = $pos->batchTag([explode(' ', "The Federal Reserve Bank of New York led by Timothy R. 
Geithner."), explode(' ', "He also said that we should call the Internal Revenue Services office")]); 16 | var_dump($result); 17 | 18 | /* 19 | array(2) { 20 | [0]=> 21 | array(13) { 22 | [0]=> 23 | array(2) { 24 | [0]=> 25 | string(3) "The" 26 | [1]=> 27 | string(1) "O" 28 | } 29 | [1]=> 30 | array(2) { 31 | [0]=> 32 | string(7) "Federal" 33 | [1]=> 34 | string(12) "ORGANIZATION" 35 | } 36 | [2]=> 37 | array(2) { 38 | [0]=> 39 | string(7) "Reserve" 40 | [1]=> 41 | string(12) "ORGANIZATION" 42 | } 43 | [3]=> 44 | array(2) { 45 | [0]=> 46 | string(4) "Bank" 47 | [1]=> 48 | string(12) "ORGANIZATION" 49 | } 50 | [4]=> 51 | array(2) { 52 | [0]=> 53 | string(2) "of" 54 | [1]=> 55 | string(12) "ORGANIZATION" 56 | } 57 | [5]=> 58 | array(2) { 59 | [0]=> 60 | string(3) "New" 61 | [1]=> 62 | string(12) "ORGANIZATION" 63 | } 64 | [6]=> 65 | array(2) { 66 | [0]=> 67 | string(4) "York" 68 | [1]=> 69 | string(12) "ORGANIZATION" 70 | } 71 | [7]=> 72 | array(2) { 73 | [0]=> 74 | string(3) "led" 75 | [1]=> 76 | string(1) "O" 77 | } 78 | [8]=> 79 | array(2) { 80 | [0]=> 81 | string(2) "by" 82 | [1]=> 83 | string(1) "O" 84 | } 85 | [9]=> 86 | array(2) { 87 | [0]=> 88 | string(7) "Timothy" 89 | [1]=> 90 | string(6) "PERSON" 91 | } 92 | [10]=> 93 | array(2) { 94 | [0]=> 95 | string(2) "R." 96 | [1]=> 97 | string(6) "PERSON" 98 | } 99 | [11]=> 100 | array(2) { 101 | [0]=> 102 | string(8) "Geithner" 103 | [1]=> 104 | string(6) "PERSON" 105 | } 106 | [12]=> 107 | array(2) { 108 | [0]=> 109 | string(1) "." 
110 | [1]=> 111 | string(1) "O" 112 | } 113 | } 114 | [1]=> 115 | array(12) { 116 | [0]=> 117 | array(2) { 118 | [0]=> 119 | string(2) "He" 120 | [1]=> 121 | string(1) "O" 122 | } 123 | [1]=> 124 | array(2) { 125 | [0]=> 126 | string(4) "also" 127 | [1]=> 128 | string(1) "O" 129 | } 130 | [2]=> 131 | array(2) { 132 | [0]=> 133 | string(4) "said" 134 | [1]=> 135 | string(1) "O" 136 | } 137 | [3]=> 138 | array(2) { 139 | [0]=> 140 | string(4) "that" 141 | [1]=> 142 | string(1) "O" 143 | } 144 | [4]=> 145 | array(2) { 146 | [0]=> 147 | string(2) "we" 148 | [1]=> 149 | string(1) "O" 150 | } 151 | [5]=> 152 | array(2) { 153 | [0]=> 154 | string(6) "should" 155 | [1]=> 156 | string(1) "O" 157 | } 158 | [6]=> 159 | array(2) { 160 | [0]=> 161 | string(4) "call" 162 | [1]=> 163 | string(1) "O" 164 | } 165 | [7]=> 166 | array(2) { 167 | [0]=> 168 | string(3) "the" 169 | [1]=> 170 | string(1) "O" 171 | } 172 | [8]=> 173 | array(2) { 174 | [0]=> 175 | string(8) "Internal" 176 | [1]=> 177 | string(12) "ORGANIZATION" 178 | } 179 | [9]=> 180 | array(2) { 181 | [0]=> 182 | string(7) "Revenue" 183 | [1]=> 184 | string(12) "ORGANIZATION" 185 | } 186 | [10]=> 187 | array(2) { 188 | [0]=> 189 | string(8) "Services" 190 | [1]=> 191 | string(12) "ORGANIZATION" 192 | } 193 | [11]=> 194 | array(2) { 195 | [0]=> 196 | string(6) "office" 197 | [1]=> 198 | string(1) "O" 199 | } 200 | } 201 | } 202 | */ 203 | -------------------------------------------------------------------------------- /examples/pos.php: -------------------------------------------------------------------------------- 1 | setDebug(true); 13 | 14 | $result = $pos->tag(explode(' ', "What does the fox say? 
What does the parrot say?")); 15 | //$results = $pos->batchTag([explode(' ', "What does the fox say?"), explode(' ', "What does the parrot say?")]); 16 | var_dump($result); 17 | 18 | /* 19 | array(2) { 20 | [0]=> 21 | array(6) { 22 | [0]=> 23 | array(2) { 24 | [0]=> 25 | string(4) "What" 26 | [1]=> 27 | string(2) "WP" 28 | } 29 | [1]=> 30 | array(2) { 31 | [0]=> 32 | string(4) "does" 33 | [1]=> 34 | string(3) "VBZ" 35 | } 36 | [2]=> 37 | array(2) { 38 | [0]=> 39 | string(3) "the" 40 | [1]=> 41 | string(2) "DT" 42 | } 43 | [3]=> 44 | array(2) { 45 | [0]=> 46 | string(3) "fox" 47 | [1]=> 48 | string(2) "NN" 49 | } 50 | [4]=> 51 | array(2) { 52 | [0]=> 53 | string(3) "say" 54 | [1]=> 55 | string(2) "VB" 56 | } 57 | [5]=> 58 | array(2) { 59 | [0]=> 60 | string(1) "?" 61 | [1]=> 62 | string(1) "." 63 | } 64 | } 65 | [1]=> 66 | array(6) { 67 | [0]=> 68 | array(2) { 69 | [0]=> 70 | string(4) "What" 71 | [1]=> 72 | string(2) "WP" 73 | } 74 | [1]=> 75 | array(2) { 76 | [0]=> 77 | string(4) "does" 78 | [1]=> 79 | string(3) "VBZ" 80 | } 81 | [2]=> 82 | array(2) { 83 | [0]=> 84 | string(3) "the" 85 | [1]=> 86 | string(2) "DT" 87 | } 88 | [3]=> 89 | array(2) { 90 | [0]=> 91 | string(6) "parrot" 92 | [1]=> 93 | string(2) "NN" 94 | } 95 | [4]=> 96 | array(2) { 97 | [0]=> 98 | string(3) "say" 99 | [1]=> 100 | string(2) "VB" 101 | } 102 | [5]=> 103 | array(2) { 104 | [0]=> 105 | string(1) "?" 106 | [1]=> 107 | string(1) "." 
108 | } 109 | } 110 | } 111 | */ 112 | -------------------------------------------------------------------------------- /examples/stanford.php: -------------------------------------------------------------------------------- 1 | setDebug(true); 13 | //$parser->setOutputFormat('penn'); 14 | 15 | //$result = $parser->parseSentence("What does the fox say?"); 16 | $result = $parser->parseSentences(["What does the fox say?", "Hi bob, how are you?"]); 17 | var_dump($result); 18 | 19 | /* 20 | array(2) { 21 | [0]=> 22 | array(3) { 23 | ["wordsAndTags"]=> 24 | array(6) { 25 | [0]=> 26 | array(2) { 27 | [0]=> 28 | string(4) "What" 29 | [1]=> 30 | string(2) "WP" 31 | } 32 | [1]=> 33 | array(2) { 34 | [0]=> 35 | string(4) "does" 36 | [1]=> 37 | string(3) "VBZ" 38 | } 39 | [2]=> 40 | array(2) { 41 | [0]=> 42 | string(3) "the" 43 | [1]=> 44 | string(2) "DT" 45 | } 46 | [3]=> 47 | array(2) { 48 | [0]=> 49 | string(3) "fox" 50 | [1]=> 51 | string(2) "NN" 52 | } 53 | [4]=> 54 | array(2) { 55 | [0]=> 56 | string(3) "say" 57 | [1]=> 58 | string(2) "VB" 59 | } 60 | [5]=> 61 | array(2) { 62 | [0]=> 63 | string(1) "?" 64 | [1]=> 65 | string(1) "." 
66 | } 67 | } 68 | ["penn"]=> 69 | array(2) { 70 | ["parent"]=> 71 | string(4) "ROOT" 72 | ["children"]=> 73 | array(1) { 74 | [0]=> 75 | array(2) { 76 | ["parent"]=> 77 | string(5) "SBARQ" 78 | ["children"]=> 79 | array(3) { 80 | [0]=> 81 | array(2) { 82 | ["parent"]=> 83 | string(4) "WHNP" 84 | ["children"]=> 85 | array(1) { 86 | [0]=> 87 | array(2) { 88 | ["parent"]=> 89 | string(7) "WP What" 90 | ["children"]=> 91 | array(0) { 92 | } 93 | } 94 | } 95 | } 96 | [1]=> 97 | array(2) { 98 | ["parent"]=> 99 | string(2) "SQ" 100 | ["children"]=> 101 | array(3) { 102 | [0]=> 103 | array(2) { 104 | ["parent"]=> 105 | string(8) "VBZ does" 106 | ["children"]=> 107 | array(0) { 108 | } 109 | } 110 | [1]=> 111 | array(2) { 112 | ["parent"]=> 113 | string(2) "NP" 114 | ["children"]=> 115 | array(2) { 116 | [0]=> 117 | array(2) { 118 | ["parent"]=> 119 | string(6) "DT the" 120 | ["children"]=> 121 | array(0) { 122 | } 123 | } 124 | [1]=> 125 | array(2) { 126 | ["parent"]=> 127 | string(6) "NN fox" 128 | ["children"]=> 129 | array(0) { 130 | } 131 | } 132 | } 133 | } 134 | [2]=> 135 | array(2) { 136 | ["parent"]=> 137 | string(2) "VP" 138 | ["children"]=> 139 | array(1) { 140 | [0]=> 141 | array(2) { 142 | ["parent"]=> 143 | string(6) "VB say" 144 | ["children"]=> 145 | array(0) { 146 | } 147 | } 148 | } 149 | } 150 | } 151 | } 152 | [2]=> 153 | array(2) { 154 | ["parent"]=> 155 | string(3) ". ?" 
156 | ["children"]=> 157 | array(0) { 158 | } 159 | } 160 | } 161 | } 162 | } 163 | } 164 | ["typedDependencies"]=> 165 | array(5) { 166 | [0]=> 167 | array(3) { 168 | ["type"]=> 169 | string(4) "dobj" 170 | [0]=> 171 | array(2) { 172 | ["feature"]=> 173 | string(3) "say" 174 | ["index"]=> 175 | int(5) 176 | } 177 | [1]=> 178 | array(2) { 179 | ["feature"]=> 180 | string(4) "What" 181 | ["index"]=> 182 | int(1) 183 | } 184 | } 185 | [1]=> 186 | array(3) { 187 | ["type"]=> 188 | string(3) "aux" 189 | [0]=> 190 | array(2) { 191 | ["feature"]=> 192 | string(3) "say" 193 | ["index"]=> 194 | int(5) 195 | } 196 | [1]=> 197 | array(2) { 198 | ["feature"]=> 199 | string(4) "does" 200 | ["index"]=> 201 | int(2) 202 | } 203 | } 204 | [2]=> 205 | array(3) { 206 | ["type"]=> 207 | string(3) "det" 208 | [0]=> 209 | array(2) { 210 | ["feature"]=> 211 | string(3) "fox" 212 | ["index"]=> 213 | int(4) 214 | } 215 | [1]=> 216 | array(2) { 217 | ["feature"]=> 218 | string(3) "the" 219 | ["index"]=> 220 | int(3) 221 | } 222 | } 223 | [3]=> 224 | array(3) { 225 | ["type"]=> 226 | string(5) "nsubj" 227 | [0]=> 228 | array(2) { 229 | ["feature"]=> 230 | string(3) "say" 231 | ["index"]=> 232 | int(5) 233 | } 234 | [1]=> 235 | array(2) { 236 | ["feature"]=> 237 | string(3) "fox" 238 | ["index"]=> 239 | int(4) 240 | } 241 | } 242 | [4]=> 243 | array(3) { 244 | ["type"]=> 245 | string(4) "root" 246 | [0]=> 247 | array(2) { 248 | ["feature"]=> 249 | string(4) "ROOT" 250 | ["index"]=> 251 | int(0) 252 | } 253 | [1]=> 254 | array(2) { 255 | ["feature"]=> 256 | string(3) "say" 257 | ["index"]=> 258 | int(5) 259 | } 260 | } 261 | } 262 | } 263 | [1]=> 264 | array(3) { 265 | ["wordsAndTags"]=> 266 | array(7) { 267 | [0]=> 268 | array(2) { 269 | [0]=> 270 | string(2) "Hi" 271 | [1]=> 272 | string(3) "NNP" 273 | } 274 | [1]=> 275 | array(2) { 276 | [0]=> 277 | string(3) "bob" 278 | [1]=> 279 | string(3) "VBP" 280 | } 281 | [2]=> 282 | array(2) { 283 | [0]=> 284 | string(1) "," 285 | [1]=> 286 | 
string(1) "," 287 | } 288 | [3]=> 289 | array(2) { 290 | [0]=> 291 | string(3) "how" 292 | [1]=> 293 | string(3) "WRB" 294 | } 295 | [4]=> 296 | array(2) { 297 | [0]=> 298 | string(3) "are" 299 | [1]=> 300 | string(3) "VBP" 301 | } 302 | [5]=> 303 | array(2) { 304 | [0]=> 305 | string(3) "you" 306 | [1]=> 307 | string(3) "PRP" 308 | } 309 | [6]=> 310 | array(2) { 311 | [0]=> 312 | string(1) "?" 313 | [1]=> 314 | string(1) "." 315 | } 316 | } 317 | ["penn"]=> 318 | array(2) { 319 | ["parent"]=> 320 | string(4) "ROOT" 321 | ["children"]=> 322 | array(1) { 323 | [0]=> 324 | array(2) { 325 | ["parent"]=> 326 | string(1) "S" 327 | ["children"]=> 328 | array(2) { 329 | [0]=> 330 | array(2) { 331 | ["parent"]=> 332 | string(2) "NP" 333 | ["children"]=> 334 | array(1) { 335 | [0]=> 336 | array(2) { 337 | ["parent"]=> 338 | string(6) "NNP Hi" 339 | ["children"]=> 340 | array(0) { 341 | } 342 | } 343 | } 344 | } 345 | [1]=> 346 | array(2) { 347 | ["parent"]=> 348 | string(2) "VP" 349 | ["children"]=> 350 | array(3) { 351 | [0]=> 352 | array(2) { 353 | ["parent"]=> 354 | string(7) "VBP bob" 355 | ["children"]=> 356 | array(0) { 357 | } 358 | } 359 | [1]=> 360 | array(2) { 361 | ["parent"]=> 362 | string(3) ", ," 363 | ["children"]=> 364 | array(0) { 365 | } 366 | } 367 | [2]=> 368 | array(2) { 369 | ["parent"]=> 370 | string(5) "SBARQ" 371 | ["children"]=> 372 | array(3) { 373 | [0]=> 374 | array(2) { 375 | ["parent"]=> 376 | string(6) "WHADVP" 377 | ["children"]=> 378 | array(1) { 379 | [0]=> 380 | array(2) { 381 | ["parent"]=> 382 | string(7) "WRB how" 383 | ["children"]=> 384 | array(0) { 385 | } 386 | } 387 | } 388 | } 389 | [1]=> 390 | array(2) { 391 | ["parent"]=> 392 | string(2) "SQ" 393 | ["children"]=> 394 | array(2) { 395 | [0]=> 396 | array(2) { 397 | ["parent"]=> 398 | string(7) "VBP are" 399 | ["children"]=> 400 | array(0) { 401 | } 402 | } 403 | [1]=> 404 | array(2) { 405 | ["parent"]=> 406 | string(2) "NP" 407 | ["children"]=> 408 | array(1) { 409 | [0]=> 410 | 
array(2) { 411 | ["parent"]=> 412 | string(7) "PRP you" 413 | ["children"]=> 414 | array(0) { 415 | } 416 | } 417 | } 418 | } 419 | } 420 | } 421 | [2]=> 422 | array(2) { 423 | ["parent"]=> 424 | string(3) ". ?" 425 | ["children"]=> 426 | array(0) { 427 | } 428 | } 429 | } 430 | } 431 | } 432 | } 433 | } 434 | } 435 | } 436 | } 437 | ["typedDependencies"]=> 438 | array(5) { 439 | [0]=> 440 | array(3) { 441 | ["type"]=> 442 | string(5) "nsubj" 443 | [0]=> 444 | array(2) { 445 | ["feature"]=> 446 | string(3) "bob" 447 | ["index"]=> 448 | int(2) 449 | } 450 | [1]=> 451 | array(2) { 452 | ["feature"]=> 453 | string(2) "Hi" 454 | ["index"]=> 455 | int(1) 456 | } 457 | } 458 | [1]=> 459 | array(3) { 460 | ["type"]=> 461 | string(4) "root" 462 | [0]=> 463 | array(2) { 464 | ["feature"]=> 465 | string(4) "ROOT" 466 | ["index"]=> 467 | int(0) 468 | } 469 | [1]=> 470 | array(2) { 471 | ["feature"]=> 472 | string(3) "bob" 473 | ["index"]=> 474 | int(2) 475 | } 476 | } 477 | [2]=> 478 | array(3) { 479 | ["type"]=> 480 | string(6) "advmod" 481 | [0]=> 482 | array(2) { 483 | ["feature"]=> 484 | string(3) "are" 485 | ["index"]=> 486 | int(5) 487 | } 488 | [1]=> 489 | array(2) { 490 | ["feature"]=> 491 | string(3) "how" 492 | ["index"]=> 493 | int(4) 494 | } 495 | } 496 | [3]=> 497 | array(3) { 498 | ["type"]=> 499 | string(5) "ccomp" 500 | [0]=> 501 | array(2) { 502 | ["feature"]=> 503 | string(3) "bob" 504 | ["index"]=> 505 | int(2) 506 | } 507 | [1]=> 508 | array(2) { 509 | ["feature"]=> 510 | string(3) "are" 511 | ["index"]=> 512 | int(5) 513 | } 514 | } 515 | [4]=> 516 | array(3) { 517 | ["type"]=> 518 | string(5) "nsubj" 519 | [0]=> 520 | array(2) { 521 | ["feature"]=> 522 | string(3) "are" 523 | ["index"]=> 524 | int(5) 525 | } 526 | [1]=> 527 | array(2) { 528 | ["feature"]=> 529 | string(3) "you" 530 | ["index"]=> 531 | int(6) 532 | } 533 | } 534 | } 535 | } 536 | } 537 | */ 538 | -------------------------------------------------------------------------------- 
/src/StanfordNLP/Base.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | namespace StanfordNLP; 13 | 14 | /** 15 | * 16 | * Base class shared by the Stanford NLP tool wrappers 17 | * 18 | * @package StanfordNLP 19 | * 20 | */ 21 | class Base { 22 | 23 | /** 24 | * Java path 25 | * 26 | * relative/absolute path to java 27 | * e.g. /usr/bin/java 28 | */ 29 | protected $java_path = 'java'; // assume relative path to start 30 | 31 | /** 32 | * Stanford Jar file 33 | */ 34 | protected $jar; 35 | 36 | /** 37 | * Stanford Models Jar file 38 | */ 39 | protected $models_jar; 40 | 41 | /** 42 | * Java options to use with our jar instance 43 | */ 44 | protected $java_options; 45 | 46 | /** 47 | * Output from NLP Tool 48 | */ 49 | protected $output = null; 50 | 51 | /** 52 | * Errors from NLP Tool 53 | */ 54 | protected $errors = null; 55 | 56 | /** 57 | * PHP Operating System ('linux' unless the constructor detects Windows) 58 | */ 59 | protected $php_os = 'linux'; 60 | 61 | /** 62 | * Debug flag (false by default; toggled through setDebug()) 63 | */ 64 | protected $debug = false; 65 | 66 | /** 67 | * Constructor! 68 | * - Set PHP Operating System.
69 | * 70 | * @return null 71 | */ 72 | public function __construct() 73 | { 74 | if (defined('PHP_OS')) { 75 | if (strtolower(substr(PHP_OS, 0, 3)) == 'win') { 76 | $this->php_os = 'windows'; 77 | } 78 | } 79 | } 80 | 81 | /** 82 | * Java path setter 83 | * 84 | * @param $java_path string path to java executable 85 | * 86 | * @return null 87 | */ 88 | public function setJavaPath($java_path) 89 | { 90 | $this->java_path = $java_path; 91 | } 92 | 93 | /** 94 | * Java path getter 95 | * 96 | * @return string 97 | */ 98 | public function getJavaPath() 99 | { 100 | return $this->java_path; 101 | } 102 | 103 | /** 104 | * Debug setter 105 | * 106 | * @param boolean 107 | * 108 | * @return null 109 | */ 110 | public function setDebug($val) 111 | { 112 | $this->debug = (bool) $val; 113 | } 114 | 115 | /** 116 | * Debug getter 117 | * 118 | * @return boolean 119 | */ 120 | public function getDebug() 121 | { 122 | return $this->debug; 123 | } 124 | 125 | /** 126 | * Jar setter 127 | * 128 | * @param $jar string path to jar file 129 | * 130 | * @return null 131 | */ 132 | public function setJar($jar) 133 | { 134 | if (file_exists($jar)) { 135 | $this->jar = $jar; 136 | } else { 137 | throw new Exception("Jar file path does not exist."); 138 | } 139 | } 140 | 141 | /** 142 | * Jar getter 143 | * 144 | * @return mixed 145 | */ 146 | public function getJar() 147 | { 148 | return $this->jar; 149 | } 150 | 151 | /** 152 | * Models Jar setter 153 | * 154 | * @param $jar string path to jar file 155 | * 156 | * @return null 157 | */ 158 | public function setModelsJar($jar) 159 | { 160 | if (file_exists($jar)) { 161 | $this->models_jar = $jar; 162 | } else { 163 | throw new Exception("Models Jar file path does not exist."); 164 | } 165 | } 166 | 167 | /** 168 | * Models Jar getter 169 | * 170 | * @return mixed 171 | */ 172 | public function getModelsJar() 173 | { 174 | return $this->models_jar; 175 | } 176 | 177 | /** 178 | * Java options setter 179 | * 180 | * @param $options mixed 
java options 181 | * 182 | * @return null 183 | */ 184 | public function setJavaOptions($options) 185 | { 186 | $this->java_options = (array) $options; 187 | } 188 | 189 | /** 190 | * Java options getter 191 | * 192 | * @return array 193 | */ 194 | public function getJavaOptions() 195 | { 196 | return $this->java_options; 197 | } 198 | 199 | /** 200 | * Output setter 201 | * 202 | * @param $output 203 | * 204 | * @return null 205 | */ 206 | public function setOutput($output) 207 | { 208 | $this->output = $output; 209 | } 210 | 211 | /** 212 | * Output getter 213 | * 214 | * @return mixed 215 | */ 216 | public function getOutput() 217 | { 218 | return $this->output; 219 | } 220 | 221 | /** 222 | * Errors setter 223 | * 224 | * @param $errors 225 | * 226 | * @return null 227 | */ 228 | public function setErrors($errors) 229 | { 230 | $this->errors = $errors; 231 | } 232 | 233 | /** 234 | * Errors getter 235 | * 236 | * @return mixed 237 | */ 238 | public function getErrors() 239 | { 240 | return $this->errors; 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /src/StanfordNLP/Exception.php: -------------------------------------------------------------------------------- 1 | 11 | */ 12 | namespace StanfordNLP; 13 | 14 | /** 15 | * 16 | * Base Exception class for Stanford NLP 17 | * 18 | * @package StanfordNLP 19 | * 20 | */ 21 | class Exception extends \Exception 22 | { 23 | } 24 | -------------------------------------------------------------------------------- /src/StanfordNLP/NERTagger.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | namespace StanfordNLP; 12 | 13 | class NERTagger extends StanfordTagger { 14 | 15 | /** 16 | * NER classifier file 17 | */ 18 | protected $classifier; 19 | 20 | /** 21 | * Constructor! 
22 | * 23 | * @param $classifier string path to classifier file 24 | * @param $jar string path stanford ner jar file 25 | * @param $java_options mixed command line arguments to pass 26 | * 27 | * @return null 28 | */ 29 | public function __construct($classifier, $jar, $java_options = array('-mx300m')) 30 | { 31 | parent::__construct(); 32 | $this->setClassifier($classifier); 33 | $this->setJar($jar); 34 | $this->setJavaOptions($java_options); 35 | } 36 | 37 | /** 38 | * Tag multiple arrays of tokens for sentences 39 | * 40 | * @param $sentences array array of arrays of tokens 41 | * 42 | * @return mixed 43 | */ 44 | public function batchTag($sentences) 45 | { 46 | $this->setSeparator('/'); 47 | $this->setTagType('ner'); 48 | return parent::batchTag($sentences); 49 | } 50 | 51 | /** 52 | * Classifier setter 53 | * 54 | * @param $classifier string path to classifier file 55 | * 56 | * @return mixed 57 | */ 58 | public function setClassifier($classifier) 59 | { 60 | if (file_exists($classifier)) { 61 | $this->classifier = $classifier; 62 | } else { 63 | throw new Exception("Classifier file path does not exist."); 64 | } 65 | } 66 | 67 | /** 68 | * Classifier getter 69 | * 70 | * @return mixed 71 | */ 72 | public function getClassifier() 73 | { 74 | return $this->classifier; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/StanfordNLP/POSTagger.php: -------------------------------------------------------------------------------- 1 | 12 | */ 13 | namespace StanfordNLP; 14 | 15 | class POSTagger extends StanfordTagger { 16 | 17 | /** 18 | * Tagger model file 19 | */ 20 | protected $model; 21 | 22 | /** 23 | * Constructor! 
24 | * 25 | * @param $model string path to tagging model file 26 | * @param $jar string path stanford pos tagger jar file 27 | * @param $java_options mixed command line arguments to pass 28 | * 29 | * @return null 30 | */ 31 | public function __construct($model, $jar, $java_options = array('-mx300m')) 32 | { 33 | parent::__construct(); 34 | $this->setModel($model); 35 | $this->setJar($jar); 36 | $this->setJavaOptions($java_options); 37 | } 38 | 39 | /** 40 | * Tag multiple arrays of tokens for sentences 41 | * 42 | * @param $sentences array array of arrays of tokens 43 | * 44 | * @return mixed 45 | */ 46 | public function batchTag($sentences) 47 | { 48 | $this->setTagType('pos'); 49 | return parent::batchTag($sentences); 50 | } 51 | 52 | /** 53 | * Model setter 54 | * 55 | * @param $model string path to model file 56 | * 57 | * @return null 58 | */ 59 | public function setModel($model) 60 | { 61 | if (file_exists($model)) { 62 | $this->model = $model; 63 | } else { 64 | throw new Exception("Model file path does not exist."); 65 | } 66 | } 67 | 68 | /** 69 | * Model getter 70 | * 71 | * @return mixed 72 | */ 73 | public function getModel() 74 | { 75 | return $this->model; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/StanfordNLP/Parser.php: -------------------------------------------------------------------------------- 1 | 10 | */ 11 | namespace StanfordNLP; 12 | 13 | class Parser extends Base { 14 | 15 | /** 16 | * Output format? 17 | * 18 | * CSV style list of output types 19 | * e.g. penn,typedDependencies,wordsAndTags 20 | */ 21 | public $output_format = "wordsAndTags,penn,typedDependencies"; 22 | 23 | /** 24 | * Use lexicalized parser? 25 | */ 26 | public $lexicalized_parser = false; 27 | 28 | /** 29 | * Constructor! 
30 | * 31 | * @param $jar string path to stanford parser jar file 32 | * @param $models_jar string|null path to stanford parser models jar file (optional; skipped when null) 33 | * @param $java_options mixed command line arguments to pass 34 | * 35 | * @return null 36 | */ 37 | public function __construct($jar, $models_jar = null, $java_options = array('-mx300m')) 38 | { 39 | parent::__construct(); 40 | $this->setJar($jar); 41 | if ($models_jar !== null) { $this->setModelsJar($models_jar); } 42 | $this->setJavaOptions($java_options); 43 | } 44 | 45 | /** 46 | * Parse sentence 47 | * 48 | * @param $sentence string sentence to parse 49 | * 50 | * @return mixed 51 | */ 52 | public function parseSentence($sentence) 53 | { 54 | $results = $this->parseSentences(array($sentence)); 55 | return isset($results[0]) ? $results[0] : array(); 56 | } 57 | 58 | /** 59 | * Parse array of sentences 60 | * Writes the sentences to a temp file (one per line) and runs LexicalizedParser on it 61 | * @param $sentences array of sentences 62 | * 63 | * @return mixed 64 | */ 65 | public function parseSentences($sentences) 66 | { 67 | // Reset errors and output 68 | $this->setErrors(null); 69 | $this->setOutput(null); 70 | 71 | // Make temp file to store sentences. 72 | $tmpfname = tempnam(sys_get_temp_dir(), 'phpnlpparser'); 73 | chmod($tmpfname, 0644); 74 | $handle = fopen($tmpfname, "w"); 75 | 76 | $str = implode("\n", $sentences); 77 | 78 | fwrite($handle, $str); 79 | fclose($handle); 80 | 81 | // Create process to run stanford parser. 82 | $descriptorspec = array( 83 | 0 => array("pipe", "r"), // stdin 84 | 1 => array("pipe", "w"), // stdout 85 | 2 => array("pipe", "w") // stderr 86 | ); 87 | 88 | $options = implode(' ', $this->getJavaOptions()); 89 | 90 | $parser = $this->lexicalized_parser ? 'edu/stanford/nlp/models/lexparser/englishFactored.ser.gz' : 'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'; 91 | $osSeparator = $this->php_os == 'windows' ? ';' : ':'; 92 | $cmd = $this->getJavaPath() 93 | . " $options -cp \"" 94 | . $this->getJar() 95 | . $osSeparator 96 | . $this->getModelsJar() 97 | .
'" edu.stanford.nlp.parser.lexparser.LexicalizedParser -encoding UTF-8 -outputFormat "' 98 | . $this->getOutputFormat() 99 | . "\" " 100 | . $parser 101 | . " " 102 | . escapeshellarg($tmpfname); 103 | 104 | if ($this->debug) { 105 | echo 'DEBUG: Command used: ' . $cmd . PHP_EOL; 106 | } 107 | 108 | $process = proc_open($cmd, $descriptorspec, $pipes, dirname($this->getJar())); 109 | 110 | $output = null; 111 | $errors = null; 112 | if (is_resource($process)) { 113 | // We aren't working with stdin 114 | fclose($pipes[0]); 115 | 116 | // Get output 117 | $output = stream_get_contents($pipes[1]); 118 | fclose($pipes[1]); 119 | 120 | // Get any errors (NOTE(review): stdout is drained to EOF first; confirm the child cannot block on a full stderr pipe for large batches) 121 | $errors = stream_get_contents($pipes[2]); 122 | fclose($pipes[2]); 123 | 124 | // close pipe before calling proc_close in order to avoid a deadlock 125 | $return_value = proc_close($process); 126 | if ($return_value == -1) { 127 | unlink($tmpfname); // don't leak the temp file on failure 128 | throw new Exception("Java process returned with an error (proc_close)."); 129 | } 130 | } 131 | unlink($tmpfname); 132 | 133 | if ($errors) { 134 | $this->setErrors($errors); 135 | } 136 | 137 | if ($output) { 138 | $this->setOutput($output); 139 | } 140 | 141 | return $this->parseOutput(); 142 | } 143 | 144 | /** 145 | * Build text output from jar into array structure 146 | * 147 | * @return array 148 | */ 149 | public function parseOutput() 150 | { 151 | // Output is separated by two line breaks 152 | // Word and tags is first 153 | // penn is second 154 | // typed dependencies is last.
155 | $raw = trim((string) $this->getOutput()); 156 | $output = ($raw === '') ? array() : explode("\n\n", $raw); // no tool output => no results 157 | $formats = explode(',', $this->getOutputFormat()); 158 | foreach ($formats as $k => $v) { 159 | $formats[$k] = trim(strtolower($v)); 160 | } 161 | 162 | $count = count($formats); 163 | $length = count($output); 164 | $i = 0; 165 | $set = array(); 166 | 167 | while ($i < $length) { 168 | $arr = array( 169 | 'wordsAndTags' => null, 170 | 'penn' => null, 171 | 'typedDependencies' => null, 172 | ); 173 | $index_offset = 0; 174 | if (in_array('wordsandtags', $formats)) { 175 | $arr['wordsAndTags'] = $this->parseWordsAndTags($output[$i+$index_offset]); 176 | $index_offset++; 177 | } 178 | 179 | if (in_array('penn', $formats)) { 180 | $arr['penn'] = $this->parsePenn(isset($output[$i+$index_offset]) ? $output[$i+$index_offset] : ''); 181 | $index_offset++; 182 | } 183 | 184 | if (in_array('typeddependencies', $formats)) { 185 | $arr['typedDependencies'] = $this->parseTypedDependencies(isset($output[$i+$index_offset]) ? $output[$i+$index_offset] : ''); 186 | } 187 | $set[] = $arr; 188 | $i += $count; 189 | } 190 | 191 | return $set; 192 | } 193 | 194 | /** 195 | * POS tags into array structure 196 | * @param $str string space separated "word/TAG" tokens 197 | * @return array 198 | */ 199 | public function parseWordsAndTags($str) 200 | { 201 | $arr = array(); 202 | 203 | if (trim($str) == '') { 204 | return $arr; 205 | } 206 | 207 | $tagged = explode(' ', trim($str)); 208 | foreach ($tagged as $tag) { 209 | $parts = explode('/', $tag); 210 | $pos = array_pop($parts); 211 | $arr[] = array(implode('/', $parts), $pos); 212 | } 213 | 214 | return $arr; 215 | } 216 | 217 | /** 218 | * Penn into array structure 219 | * @param $string string bracketed parse tree text 220 | * @return array 221 | */ 222 | public function parsePenn($string) 223 | { 224 | $arr = array('parent' => null, 'children' => array()); 225 | 226 | $length = strlen($string); 227 | $node = ''; 228 | $bracket = 1; 229 | for ($i = 1; $i < $length; $i++) { 230 | if ($string[$i] == '(') { 231 | $bracket += 1; 232 | $match_i = $this->getMatchingParen($string, $i); 233 | $arr['children'][] =
/**
 * Penn into array structure.
 *
 * Parses a bracketed tree string such as "(NP (DT the) (NN fox))". The
 * character at offset 0 is taken to be the opening bracket of the node and
 * is skipped. Text found directly inside the node becomes its 'parent'
 * label (a leaf therefore keeps "TAG word" together), and every nested
 * bracket group is parsed recursively into 'children'.
 *
 * @param string $string Bracketed tree text
 *
 * @return array array('parent' => string|null, 'children' => array); 'parent'
 *               stays null when no closing bracket is ever reached.
 */
public function parsePenn($string)
{
    $arr = array('parent' => null, 'children' => array());
    $string = (string) $string;
    $length = strlen($string);
    $node = '';
    $bracket = 1;
    for ($i = 1; $i < $length; $i++) {
        if ($string[$i] == '(') {
            $match_i = $this->getMatchingParen($string, $i);
            if ($match_i === null) {
                // Unbalanced (e.g. truncated) input: there is no child to
                // cut out. Stop here instead of rewinding $i, which would
                // re-scan the same text forever.
                $arr['parent'] = trim($node);
                return $arr;
            }
            $bracket += 1;
            $arr['children'][] = $this->parsePenn(substr($string, $i, ($match_i - $i) + 1));
            // Resume on the child's closing bracket; the branch below then
            // undoes the increment made above.
            $i = $match_i - 1;
        } else if ($string[$i] == ')') {
            $bracket -= 1;
            $arr['parent'] = trim($node);
        } else {
            $node .= $string[$i];
        }
        if ($bracket == 0) {
            return $arr;
        }
    }

    return $arr;
}

/**
 * Find the position of a matching closing bracket for a string opening bracket.
 *
 * @param string $string    Text to scan
 * @param int    $start_pos Offset of the opening bracket
 *
 * @return int|null Offset of the matching ')' or null when the bracket is
 *                  never closed
 */
public function getMatchingParen($string, $start_pos)
{
    $length = strlen($string);
    $bracket = 1;
    // Stop at the last valid offset ($length - 1); reading $string[$length]
    // is out of bounds.
    for ($i = $start_pos + 1; $i < $length; $i++) {
        if ($string[$i] == '(') {
            $bracket += 1;
        } else if ($string[$i] == ')') {
            $bracket -= 1;
        }
        if ($bracket == 0) {
            return $i;
        }
    }

    return null;
}

/**
 * Typed dependencies into array structure.
 *
 * Each line is expected to look like "type(first-N, second-M)". Lines
 * without an opening bracket or without two ", "-separated arguments are
 * skipped.
 *
 * @param string $str Newline separated dependency lines
 *
 * @return array List of array('type' => string,
 *               0 => array('feature' => string, 'index' => int),
 *               1 => array('feature' => string, 'index' => int))
 */
public function parseTypedDependencies($str)
{
    $arr = array();
    $lines = explode("\n", (string) $str);
    foreach ($lines as $line) {
        // Drop surrounding whitespace (including a trailing "\r") so the
        // substr(..., -1) below really removes the closing bracket.
        $line = trim($line);
        $paren_pos = strpos($line, '(');

        if ($paren_pos === false) {
            continue;
        }

        $type = substr($line, 0, $paren_pos);
        $parts = explode(', ', (string) substr($line, $paren_pos + 1, -1));

        if (count($parts) < 2) {
            // Malformed line: second argument missing.
            continue;
        }

        // The index follows the LAST dash so words containing dashes survive.
        $first_dash = strrpos($parts[0], '-');
        $first = substr($parts[0], 0, $first_dash);
        $first_index = (int) substr($parts[0], $first_dash + 1);

        $second_dash = strrpos($parts[1], '-');
        $second = substr($parts[1], 0, $second_dash);
        $second_index = (int) substr($parts[1], $second_dash + 1);

        $arr[] = array(
            'type' => $type,
            array(
                'feature' => $first,
                'index' => $first_index
            ),
            array(
                'feature' => $second,
                'index' => $second_index
            )
        );
    }
    return $arr;
}
/**
 * Store the lexicalized parser flag.
 *
 * @param mixed $bool Any value; it is reduced to true/false by PHP's usual
 *                    truthiness rules before being stored
 *
 * @return null
 */
public function setLexicalizedParser($bool)
{
    $flag = $bool ? true : false;
    $this->lexicalized_parser = $flag;
}

/**
 * Read back the lexicalized parser flag.
 *
 * @return boolean
 */
public function getLexicalizedParser()
{
    $flag = $this->lexicalized_parser;

    return $flag;
}

/**
 * Store the output format, exactly as given.
 *
 * @param $format string
 *
 * @return null
 */
public function setOutputFormat($format)
{
    $value = $format;
    $this->output_format = $value;
}

/**
 * Read back the stored output format.
 *
 * @return mixed
 */
public function getOutputFormat()
{
    $value = $this->output_format;

    return $value;
}
/**
 * Runs a Stanford tagger jar over tokenised sentences and turns its text
 * output into arrays of (token, tag) pairs.
 */
class StanfordTagger extends Base
{
    /**
     * Tag separator: the string between a token and its tag in the tagger
     * output. It is passed to the POS tagger as -tagSeparator and used to
     * split tokens when the output is parsed.
     */
    protected $separator = '_';

    /**
     * Tag type: 'pos' or 'ner', selects which Java class batchTag() runs.
     */
    protected $tag_type = 'pos';

    /**
     * Constructor!
     *
     * @return null
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Separator setter
     *
     * @param $separator
     *
     * @return null
     */
    public function setSeparator($separator)
    {
        $this->separator = $separator;
    }

    /**
     * Separator getter
     *
     * @return mixed
     */
    public function getSeparator()
    {
        return $this->separator;
    }

    /**
     * Tag type setter
     *
     * @param $type
     *
     * @return null
     */
    public function setTagType($type)
    {
        $this->tag_type = $type;
    }

    /**
     * Tag type getter
     *
     * @return mixed
     */
    public function getTagType()
    {
        return $this->tag_type;
    }

    /**
     * Tag from an array of tokens for given sentence(s)
     *
     * The tokens are handed to batchTag() as a single input line; the result
     * still has one entry per line the tagger printed.
     *
     * @param $tokens array tokens
     *
     * @return mixed
     */
    public function tag($tokens)
    {
        $results = $this->batchTag(array($tokens));
        return !empty($results) ? $results : array();
    }

    /**
     * Tag multiple arrays of tokens for sentences
     *
     * Writes one space-joined sentence per line to a temporary file, runs
     * the Java tagger selected by the tag type on it, stores stdout/stderr
     * via setOutput()/setErrors() and returns the parsed output.
     *
     * @param $sentences array array of arrays of tokens
     *
     * @return mixed
     *
     * @throws Exception when the temporary file cannot be created or
     *                   written, when the tag type is neither 'pos' nor
     *                   'ner', or when proc_close() reports -1
     */
    public function batchTag($sentences)
    {
        // Reset errors and output
        $this->setErrors(null);
        $this->setOutput(null);

        // Make temp file to store sentences. Use the system temp directory
        // rather than a hard-coded "/tmp", which does not exist everywhere.
        $tmpfname = tempnam(sys_get_temp_dir(), 'phpnlptag');
        if ($tmpfname === false) {
            throw new Exception("Unable to create a temporary file for the tagger input.");
        }
        chmod($tmpfname, 0644);

        $lines = array();
        foreach ($sentences as $tokens) {
            $lines[] = implode(' ', $tokens);
        }

        if (file_put_contents($tmpfname, implode("\n", $lines)) === false) {
            unlink($tmpfname);
            throw new Exception("Unable to write the tagger input to a temporary file.");
        }

        // Create process to run the stanford tagger.
        $descriptorspec = array(
            0 => array("pipe", "r"),  // stdin
            1 => array("pipe", "w"),  // stdout
            2 => array("pipe", "w")   // stderr
        );

        $options = implode(' ', $this->getJavaOptions());
        $osSeparator = $this->php_os == 'windows' ? ';' : ':';
        switch ($this->getTagType()) {
            case 'pos':
                $separator = $this->getSeparator();
                $cmd = escapeshellcmd(
                    $this->getJavaPath()
                    . " $options -cp \""
                    . $this->getJar()
                    . "{$osSeparator}\" edu.stanford.nlp.tagger.maxent.MaxentTagger -model "
                    . $this->getModel()
                    . " -textFile "
                    . $tmpfname
                    . " -outputFormat slashTags -tagSeparator "
                    . $separator
                    . " -encoding utf8"
                );
                break;
            case 'ner':
                $cmd = escapeshellcmd(
                    $this->getJavaPath()
                    . " $options -cp \""
                    . $this->getJar()
                    . "{$osSeparator}\" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier "
                    . $this->getClassifier()
                    . " -textFile "
                    . $tmpfname
                    . " -encoding utf8"
                );
                break;
            default:
                // Without this branch $cmd would be undefined below.
                unlink($tmpfname);
                throw new Exception(
                    "Unsupported tag type '" . $this->getTagType() . "' (expected 'pos' or 'ner')."
                );
        }

        if ($this->debug) {
            echo 'DEBUG: Command used: ' . $cmd . PHP_EOL;
        }

        $process = proc_open($cmd, $descriptorspec, $pipes, dirname($this->getJar()));

        $output = null;
        $errors = null;
        $return_value = null;
        if (is_resource($process)) {
            // We aren't working with stdin
            fclose($pipes[0]);

            // Get output
            // NOTE(review): stdout is drained completely before stderr is
            // read; a child that writes a lot to stderr first could block.
            $output = stream_get_contents($pipes[1]);
            fclose($pipes[1]);

            // Get any errors
            $errors = stream_get_contents($pipes[2]);
            fclose($pipes[2]);

            // close pipe before calling proc_close in order to avoid a deadlock
            $return_value = proc_close($process);
        } else {
            // Record the failure instead of silently returning nothing.
            $errors = 'Unable to start the Java process.';
        }

        // Remove the temp file before any exception is raised so it cannot leak.
        unlink($tmpfname);

        if ($return_value === -1) {
            throw new Exception("Java process returned with an error (proc_close).");
        }

        if ($errors) {
            $this->setErrors($errors);
        }

        if ($output) {
            $this->setOutput($output);
        }

        return $this->parseOutput();
    }

    /**
     * Build text output from jar into array structure
     *
     * Every non-blank output line is treated as one sentence of space
     * separated tokens. Each token is split on the LAST occurrence of the
     * tag separator, so tokens that themselves contain the separator stay
     * intact.
     *
     * @return array List of sentences, each a list of array(token, tag)
     *               pairs; empty when there is no output
     */
    public function parseOutput()
    {
        $output = $this->getOutput();
        if (!$output) {
            return array();
        }

        $separator = $this->getSeparator();
        $arr = array();
        foreach (explode("\n", $output) as $sentence) {
            $sentence = trim($sentence);
            if ($sentence === '') {
                continue;
            }
            $s = array();
            // Split on runs of spaces so repeated spaces do not create
            // empty (token, tag) pairs.
            foreach (preg_split('/ +/', $sentence) as $tag) {
                $parts = explode($separator, $tag);
                $pos = array_pop($parts);
                $s[] = array(implode($separator, $parts), $pos);
            }
            $arr[] = $s;
        }
        return $arr;
    }
}