├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.markdown ├── autoloader.php ├── composer.json ├── src └── NlpTools │ ├── Analysis │ ├── FreqDist.php │ └── Idf.php │ ├── Classifiers │ ├── ClassifierInterface.php │ ├── FeatureBasedLinearClassifier.php │ └── MultinomialNBClassifier.php │ ├── Clustering │ ├── CentroidFactories │ │ ├── CentroidFactoryInterface.php │ │ ├── Euclidean.php │ │ ├── Hamming.php │ │ └── MeanAngle.php │ ├── Clusterer.php │ ├── Hierarchical.php │ ├── KMeans.php │ └── MergeStrategies │ │ ├── CompleteLink.php │ │ ├── GroupAverage.php │ │ ├── HeapLinkage.php │ │ ├── MergeStrategyInterface.php │ │ └── SingleLink.php │ ├── Documents │ ├── DocumentInterface.php │ ├── RawDocument.php │ ├── TokensDocument.php │ ├── TrainingDocument.php │ ├── TrainingSet.php │ └── WordDocument.php │ ├── Exceptions │ └── InvalidExpression.php │ ├── FeatureFactories │ ├── DataAsFeatures.php │ ├── FeatureFactoryInterface.php │ └── FunctionFeatures.php │ ├── Models │ ├── FeatureBasedNB.php │ ├── Lda.php │ ├── LinearModel.php │ ├── Maxent.php │ └── MultinomialNBModelInterface.php │ ├── Optimizers │ ├── ExternalMaxentOptimizer.php │ ├── FeatureBasedLinearOptimizerInterface.php │ ├── GradientDescentOptimizer.php │ ├── MaxentGradientDescent.php │ └── MaxentOptimizerInterface.php │ ├── Random │ ├── Distributions │ │ ├── AbstractDistribution.php │ │ ├── Dirichlet.php │ │ ├── Gamma.php │ │ └── Normal.php │ └── Generators │ │ ├── FromFile.php │ │ ├── GeneratorInterface.php │ │ └── MersenneTwister.php │ ├── Similarity │ ├── CosineSimilarity.php │ ├── DiceSimilarity.php │ ├── DistanceInterface.php │ ├── Euclidean.php │ ├── HammingDistance.php │ ├── JaccardIndex.php │ ├── OverlapCoefficient.php │ ├── Simhash.php │ ├── SimilarityInterface.php │ └── TverskyIndex.php │ ├── Stemmers │ ├── GreekStemmer.php │ ├── LancasterStemmer.php │ ├── PorterStemmer.php │ ├── RegexStemmer.php │ └── Stemmer.php │ ├── Tokenizers │ ├── ClassifierBasedTokenizer.php │ ├── PennTreeBankTokenizer.php │ 
├── RegexTokenizer.php │ ├── TokenizerInterface.php │ ├── WhitespaceAndPunctuationTokenizer.php │ └── WhitespaceTokenizer.php │ └── Utils │ ├── ClassifierBasedTransformation.php │ ├── EnglishVowels.php │ ├── Normalizers │ ├── English.php │ ├── Greek.php │ └── Normalizer.php │ ├── StopWords.php │ ├── TransformationInterface.php │ └── VowelsAbstractFactory.php └── tests ├── NlpTools ├── Analysis │ ├── FreqDistTest.php │ └── IdfTest.php ├── Classifiers │ └── EndOfSentenceRules.php ├── Clustering │ ├── ClusteringTestBase.php │ ├── HierarchicalTest.php │ └── KmeansTest.php ├── Documents │ ├── EuclideanPoint.php │ ├── TransformationsTest.php │ └── WordDocumentTest.php ├── Models │ └── LdaTest.php ├── Similarity │ ├── CosineSimilarityTest.php │ ├── DiceSimilarityTest.php │ ├── HammingDistanceTest.php │ ├── JaccardIndexTest.php │ ├── OverlapCoefficientTest.php │ ├── SimhashTest.php │ └── TverskyIndexTest.php ├── Stemmers │ ├── GreekStemmerTest.php │ ├── LancasterStemmerTest.php │ ├── PorterStemmerTest.php │ ├── StemmerTestBase.php │ └── TransformationTest.php ├── Tokenizers │ ├── ClassifierBasedTokenizerTest.php │ ├── PennTreeBankTokenizerTest.php │ ├── RegexTokenizerTest.php │ ├── WhitespaceAndPuntuationTokenizerTest.php │ └── WhitespaceTokenizerTest.php └── Utils │ ├── ClassifierBasedTransformationTest.php │ ├── EnglishVowelsTest.php │ ├── IdentityTransformer.php │ ├── Normalizers │ └── NormalizerTest.php │ └── StopWordsTest.php ├── README.markdown ├── bootstrap.php ├── data ├── .gitignore ├── Stemmers │ ├── GreekStemmerTest │ │ ├── appendix-a-stems │ │ └── appendix-a-words │ └── PorterStemmerTest │ │ ├── stems.txt │ │ └── words.txt └── Tokenizers │ └── PennTreeBankTokenizerTest │ ├── test.txt │ └── tokenized ├── phpunit.xml └── sentiment_maxent.php /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | /nbproject/private/ 3 | nbproject 4 | 
-------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contribution guidelines 2 | =================== 3 | 4 | This document contains guidelines for contributing to NlpTools. 5 | 6 | Coding style 7 | ------------------ 8 | 9 | NlpTools adheres to the [psr-2][1] standard. It also follows the convention of 10 | appending the word *Interface* to any interface. 11 | 12 | To enforce the psr-2 style it is suggested to use the [php-cs-fixer][2] tool. 13 | While you 're at it why not enforce some more styles as well. The fixers used 14 | are the **default** (which are more than the psr-2 level uses) but they will be 15 | explicitly listed here in case they change in the future. 16 | 17 | * indentation 18 | * linefeed 19 | * trailing_spaces 20 | * unused_use 21 | * phpdoc_params 22 | * visibility 23 | * return 24 | * short_tag 25 | * braces 26 | * include 27 | * php_closing_tag 28 | * extra_empty_lines 29 | * psr0 30 | * control_spaces 31 | * elseif 32 | * eof_ending 33 | 34 | The above fixers are the default. 35 | 36 | Commenting Style 37 | -------------------------- 38 | 39 | Every public method must have comments that follow the php doc convention. 40 | @param and @return annotations are mandatory. The comments should be 41 | explanatory not simply rewriting the method's name in a sentence. If the method 42 | is too simple or the name explains the actions sufficiently then just add the 43 | @param and @return annotations. 44 | 45 | Examples of bad commenting currently in the develop branch: 46 | 47 | ``` php 48 | /** 49 | * Calls internal functions to handle data processing 50 | * @param type $string 51 | */ 52 | public function tokenize($str) 53 | { 54 | ...... 55 | } 56 | ``` 57 | 58 | It should be something along the lines of: 59 | 60 | ``` php 61 | /** 62 | * Splits $str to smaller strings according to Penn Treebank tokenization rules. 
63 | * 64 | * You can see the regexes in function initPatternAndReplacement() 65 | * @param string $str The string to be tokenized 66 | * @return array An array of smaller strings (the tokens) 67 | */ 68 | .... 69 | ``` 70 | 71 | Equally necessary are class comments. The class comment should be explaining 72 | what the class does from a high point of view. Redirections to online resources 73 | like wikipedia are welcome. A good example that also contains a reference to an 74 | external resource is the following: 75 | 76 | ``` php 77 | /** 78 | * Implement a gradient descent algorithm that maximizes the conditional 79 | * log likelihood of the training data. 80 | * 81 | * See page 24 - 28 of http://nlp.stanford.edu/pubs/maxent-tutorial-slides.pdf 82 | * @see NlpTools\Models\Maxent 83 | */ 84 | class MaxentGradientDescent extends GradientDescentOptimizer implements MaxentOptimizerInterface 85 | ``` 86 | 87 | Pull Requests 88 | -------------------- 89 | 90 | ### Find something to work on ### 91 | 92 | If it is your first contribution try to find something straightforward and 93 | concise to implement without many design decisions as much as development 94 | decisions. You could first submit an issue, if you like, and state your will to 95 | correct this issue yourself. 96 | 97 | ### Branch off ### 98 | 99 | When you 've found something to develop, create a new branch off of the develop 100 | branch. Make your changes, add your tests (see below for testing) and then make 101 | a pull request. Always keep your develop branch in sync with the remote and 102 | before you create a pull request **rebase** your local branch to develop. 103 | 104 | If you rebased but there has been a change pushed since, you don't have to 105 | remove the pull request, rebase and recreate it. I will pull your changes 106 | rebase them, merge them and then close the pull request. 
This will have the 107 | effect of showing some merged pull requests as simply closed but it is worth 108 | keeping the commit history clean. 109 | 110 | So in two small sentences: Always create a new branch to develop on. Always 111 | rebase before making a pull request. 112 | 113 | ### Tests ### 114 | 115 | If you are implementing a new feature always include tests in your pull request. 116 | 117 | Also contributing just tests is extremely welcome. 118 | 119 | Testing 120 | ----------- 121 | 122 | A bit of information can be found in the tests folder in the README file. 123 | 124 | Tests should test the implementation thoroughly. You can test your 125 | implementation like a black box, based only on the outputs given some inputs, 126 | or you can test every small part for how it works. Either is acceptable. I will 127 | make my point clear with an example. 128 | 129 | The PorterStemmer implementation has 5 steps and some even have sub steps. One 130 | way to write the test would be to expose those steps (maybe by extending the 131 | PorterStemmer class) and write tests for each one. One other way would be to 132 | simply take a big list of English words and their stems according to the 133 | canonical implementation and check if your code produces the same results. 134 | 135 | While the second is a lot easier to implement, in case of failure, it gives 136 | very little information regarding the cause of the error. Both are acceptable 137 | (in the case of the example the second is implemented). 
138 | 139 | [1]: https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-2-coding-style-guide.md 140 | [2]: http://cs.sensiolabs.org/ 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 2 | Version 2, December 2004 3 | 4 | Copyright (C) 2004 Sam Hocevar 5 | 6 | Everyone is permitted to copy and distribute verbatim or modified 7 | copies of this license document, and changing it is allowed as long 8 | as the name is changed. 9 | 10 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 11 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 12 | 13 | 0. You just DO WHAT THE FUCK YOU WANT TO. 14 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | [PHP NlpTools](http://php-nlp-tools.com/) 2 | ============= 3 | 4 | NlpTools is a set of php 5.3+ classes for beginner to 5 | semi advanced natural language processing work. 6 | 7 | Documentation 8 | ------------- 9 | 10 | You can find documentation and code examples at the project's [homepage](http://php-nlp-tools.com/documentation/). 11 | 12 | Contents 13 | --------- 14 | 15 | ### Classification Models ### 16 | 17 | 1. [Multinomial Naive Bayes](http://php-nlp-tools.com/documentation/bayesian-model.html) 18 | 2. [Maximum Entropy (Conditional Exponential model)](http://php-nlp-tools.com/documentation/maximum-entropy-model.html) 19 | 20 | ### Topic Modeling ### 21 | 22 | Lda is still experimental and quite slow but it works. [See an example](http://php-nlp-tools.com/posts/introducing-latent-dirichlet-allocation.html). 23 | 24 | 1. [Latent Dirichlet Allocation](http://php-nlp-tools.com/documentation/api/#NlpTools/Models/Lda) 25 | 26 | ### Clustering ### 27 | 28 | 1. 
[K-Means](http://php-nlp-tools.com/documentation/clustering.html) 29 | 2. [Hierarchical Agglomerative Clustering](http://php-nlp-tools.com/documentation/clustering.html) 30 | * SingleLink 31 | * CompleteLink 32 | * GroupAverage 33 | 34 | ### Tokenizers ### 35 | 36 | 1. [WhitespaceTokenizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Tokenizers/WhitespaceTokenizer) 37 | 2. [WhitespaceAndPunctuationTokenizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Tokenizers/WhitespaceAndPunctuationTokenizer) 38 | 3. [PennTreebankTokenizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Tokenizers/PennTreebankTokenizer) 39 | 4. [RegexTokenizer](http://php-nlp-tools.com/documentation/api/#NlpTools\Tokenizers\RegexTokenizer) 40 | 5. [ClassifierBasedTokenizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Tokenizers/ClassifierBasedTokenizer) 41 | This tokenizer allows us to build a lot more complex tokenizers 42 | than the previous ones 43 | 44 | ### Documents ### 45 | 46 | 1. [TokensDocument](http://php-nlp-tools.com/documentation/api/#NlpTools/Documents/TokensDocument) 47 | represents a bag of words model for a document. 48 | 2. [WordDocument](http://php-nlp-tools.com/documentation/api/#NlpTools/Documents/WordDocument) 49 | represents a single word with the context of a larger document. 50 | 3. [TrainingDocument](http://php-nlp-tools.com/documentation/api/#NlpTools/Documents/TrainingDocument) 51 | represents a document whose class is known. 52 | 4. [TrainingSet](http://php-nlp-tools.com/documentation/api/#NlpTools/Documents/TrainingSet) 53 | a collection of TrainingDocuments 54 | 55 | ### Feature factories ### 56 | 57 | 1. [FunctionFeatures](http://php-nlp-tools.com/documentation/api/#NlpTools/FeatureFactories/FunctionFeatures) 58 | Allows the creation of a feature factory from a number of callables 59 | 2. [DataAsFeatures](http://php-nlp-tools.com/documentation/api/#NlpTools/FeatureFactories/DataAsFeatures) 60 | Simply return the data as features. 
61 | 62 | ### Similarity ### 63 | 64 | 1. [Jaccard Index](http://php-nlp-tools.com/documentation/api/#NlpTools/Similarity/JaccardIndex) 65 | 2. [Cosine similarity](http://php-nlp-tools.com/documentation/api/#NlpTools/Similarity/CosineSimilarity) 66 | 3. [Simhash](http://php-nlp-tools.com/documentation/api/#NlpTools/Similarity/Simhash) 67 | 4. [Euclidean](http://php-nlp-tools.com/documentation/api/#NlpTools/Similarity/Euclidean) 68 | 5. [HammingDistance](http://php-nlp-tools.com/documentation/api/#NlpTools/Similarity/HammingDistance) 69 | 70 | ### Stemmers ### 71 | 72 | 1. [PorterStemmer](http://php-nlp-tools.com/documentation/api/#NlpTools/Stemmers/PorterStemmer) 73 | 2. [RegexStemmer](http://php-nlp-tools.com/documentation/api/#NlpTools/Stemmers/RegexStemmer) 74 | 3. [LancasterStemmer](http://php-nlp-tools.com/documentation/api/#NlpTools/Stemmers/LancasterStemmer) 75 | 4. [GreekStemmer](http://php-nlp-tools.com/documentation/api/#NlpTools/Stemmers/GreekStemmer) 76 | 77 | ### Optimizers (MaxEnt only) ### 78 | 79 | 1. [A gradient descent optimizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Optimizers/MaxentGradientDescent) 80 | (written in php) for educational use. 81 | It is a simple implementation for anyone wanting to know a bit 82 | more about either GD or MaxEnt models 83 | 2. A fast (faster than nltk-scipy), parallel gradient descent 84 | optimizer written in [Go](http://golang.org/). This optimizer 85 | resides in another [repo](https://github.com/angeloskath/nlp-maxent-optimizer), 86 | it is used via the [external optimizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Optimizers/ExternalMaxentOptimizer). 87 | TODO: At least write a readme for the optimizer written in Go. 88 | 89 | ### Other ### 90 | 91 | 1. Idf Inverse document frequency 92 | 2. Stop words 93 | 3. Language based normalizers 94 | 4. 
Classifier based transformation for creating flexible preprocessing pipelines 95 | -------------------------------------------------------------------------------- /autoloader.php: -------------------------------------------------------------------------------- 1 | =5.3" 14 | }, 15 | "autoload": { 16 | "psr-0": { 17 | "NlpTools\\": "src/" 18 | } 19 | }, 20 | "extra": { 21 | "branch-alias": { 22 | "dev-master": "1.0.x-dev" 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/NlpTools/Analysis/FreqDist.php: -------------------------------------------------------------------------------- 1 | preCompute($tokens); 33 | $this->totalTokens = count($tokens); 34 | } 35 | 36 | /** 37 | * Get the total number of tokens in this tokensDocument 38 | * @return int 39 | */ 40 | public function getTotalTokens() 41 | { 42 | return $this->totalTokens; 43 | } 44 | 45 | /** 46 | * Internal function for summarizing all the data into a key value store 47 | * @param array $tokens The set of tokens passed into the constructor 48 | */ 49 | protected function preCompute(array &$tokens) 50 | { 51 | //count all the tokens up and put them in a key value store 52 | $this->keyValues = array_count_values($tokens); 53 | arsort($this->keyValues); 54 | } 55 | 56 | /** 57 | * Return the weight of a single token 58 | * @return float 59 | */ 60 | public function getWeightPerToken() 61 | { 62 | return 1 / $this->getTotalTokens(); 63 | } 64 | 65 | /** 66 | * Return get the total number of unique tokens 67 | * @return int 68 | */ 69 | public function getTotalUniqueTokens() 70 | { 71 | return count($this->keyValues); 72 | } 73 | 74 | /** 75 | * Return the sorted keys by frequency desc 76 | * @return array 77 | */ 78 | public function getKeys() 79 | { 80 | return array_keys($this->keyValues); 81 | } 82 | 83 | /** 84 | * Return the sorted values by frequency desc 85 | * @return array 86 | */ 87 | public function getValues() 88 | { 89 | return 
array_values($this->keyValues); 90 | } 91 | 92 | /** 93 | * Return the full key value store 94 | * @return array 95 | */ 96 | public function getKeyValues() 97 | { 98 | return $this->keyValues; 99 | } 100 | 101 | /** 102 | * Return a token's count 103 | * @param string $string 104 | * @return mixed 105 | */ 106 | public function getTotalByToken($string) 107 | { 108 | $array = $this->keyValues; 109 | if(array_key_exists($string, $array)) { 110 | return $array[$string]; 111 | } else { 112 | return false; 113 | } 114 | } 115 | 116 | /** 117 | * Return a token's weight (for user's own tf-idf/pdf/iduf implem) 118 | * @param string $string 119 | * @return mixed 120 | */ 121 | public function getTokenWeight($string) 122 | { 123 | if($this->getTotalByToken($string)){ 124 | return $this->getTotalByToken($string)/$this->getTotalTokens(); 125 | } else { 126 | return false; 127 | } 128 | } 129 | 130 | /** 131 | * 132 | * Returns an array of tokens that occurred once 133 | * @todo This is an inefficient approach 134 | * @return array 135 | */ 136 | public function getHapaxes() 137 | { 138 | $samples = array(); 139 | foreach ($this->getKeyValues() as $sample => $count) { 140 | if ($count == 1) { 141 | $samples[] = $sample; 142 | } 143 | } 144 | return $samples; 145 | } 146 | 147 | } 148 | -------------------------------------------------------------------------------- /src/NlpTools/Analysis/Idf.php: -------------------------------------------------------------------------------- 1 | setAsKey(TrainingSet::CLASS_AS_KEY); 33 | foreach ($tset as $class=>$doc) { 34 | $tokens = $ff->getFeatureArray($class,$doc); // extract tokens from the document 35 | $tokens = array_fill_keys($tokens,1); // make them occur once 36 | foreach ($tokens as $token=>$v) { 37 | if (isset($this->idf[$token])) 38 | $this->idf[$token]++; 39 | else 40 | $this->idf[$token] = 1; 41 | } 42 | } 43 | 44 | // this idf so far contains the doc frequency 45 | // we will now inverse it and take the log 46 | $D = 
count($tset); 47 | foreach ($this->idf as &$v) { 48 | $v = log($D/$v); 49 | } 50 | $this->logD = log($D); 51 | } 52 | 53 | /** 54 | * Implements the array access interface. Return the computed idf or 55 | * the logarithm of the count of the documents for a token we have not 56 | * seen before. 57 | * 58 | * @param string $token The token to return the idf for 59 | * @return float The idf 60 | */ 61 | public function offsetGet($token) 62 | { 63 | if (isset($this->idf[$token])) { 64 | return $this->idf[$token]; 65 | } else { 66 | return $this->logD; 67 | } 68 | } 69 | 70 | /** 71 | * Implements the array access interface. Return true if the token exists 72 | * in the corpus. 73 | * 74 | * @param string $token The token to check if it exists in the corpus 75 | * @return bool 76 | */ 77 | public function offsetExists($token) 78 | { 79 | return isset($this->idf[$token]); 80 | } 81 | 82 | /** 83 | * Will not be implemented. Throws \BadMethodCallException because 84 | * one should not be able to alter the idf values directly. 85 | */ 86 | public function offsetSet($token, $value) 87 | { 88 | throw new \BadMethodCallException("The idf of a specific token cannot be set explicitly"); 89 | } 90 | 91 | /** 92 | * Will not be implemented. Throws \BadMethodCallException because 93 | * one should not be able to alter the idf values directly. 94 | */ 95 | public function offsetUnset($token) 96 | { 97 | throw new \BadMethodCallException("The idf of a specific token cannot be unset"); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/NlpTools/Classifiers/ClassifierInterface.php: -------------------------------------------------------------------------------- 1 | feature_factory = $ff; 23 | $this->model = $m; 24 | } 25 | 26 | /** 27 | * Compute the vote for every class. Return the class that 28 | * receive the maximum vote. 
29 | * 30 | * @param array $classes A set of classes 31 | * @param DocumentInterface $d A Document 32 | * @return string A class 33 | */ 34 | public function classify(array $classes, DocumentInterface $d) 35 | { 36 | $maxclass = current($classes); 37 | $maxvote = $this->getVote($maxclass,$d); 38 | while ($class = next($classes)) { 39 | $v = $this->getVote($class,$d); 40 | if ($v>$maxvote) { 41 | $maxclass = $class; 42 | $maxvote = $v; 43 | } 44 | } 45 | 46 | return $maxclass; 47 | } 48 | 49 | /** 50 | * Compute the features that fire for the Document $d. The sum of 51 | * the weights of the features is the vote. 52 | * 53 | * @param string $class The vote for class $class 54 | * @param DocumentInterface $d The vote for Document $d 55 | * @return float The vote of the model for class $class and Document $d 56 | */ 57 | public function getVote($class, DocumentInterface $d) 58 | { 59 | $v = 0; 60 | $features = $this->feature_factory->getFeatureArray($class,$d); 61 | foreach ($features as $f) { 62 | $v += $this->model->getWeight($f); 63 | } 64 | 65 | return $v; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/NlpTools/Classifiers/MultinomialNBClassifier.php: -------------------------------------------------------------------------------- 1 | feature_factory = $ff; 22 | $this->model = $m; 23 | } 24 | 25 | /** 26 | * Compute the probability of $d belonging to each class 27 | * successively and return that class that has the maximum 28 | * probability. 
29 | * 30 | * @param array $classes The classes from which to choose 31 | * @param DocumentInterface $d The document to classify 32 | * @return string $class The class that has the maximum probability 33 | */ 34 | public function classify(array $classes, DocumentInterface $d) 35 | { 36 | $maxclass = current($classes); 37 | $maxscore = $this->getScore($maxclass,$d); 38 | while ($class=next($classes)) { 39 | $score = $this->getScore($class,$d); 40 | if ($score>$maxscore) { 41 | $maxclass = $class; 42 | $maxscore = $score; 43 | } 44 | } 45 | 46 | return $maxclass; 47 | } 48 | 49 | /** 50 | * Compute the log of the probability of the Document $d belonging 51 | * to class $class. We compute the log so that we can sum over the 52 | * logarithms instead of multiplying each probability. 53 | * 54 | * @todo perhaps MultinomialNBModel should have precomputed the logs 55 | * ex.: getLogPrior() and getLogCondProb() 56 | * 57 | * @param string $class The class for which we are getting a score 58 | * @param DocumentInterface The document whose score we are getting 59 | * @return float The log of the probability of $d belonging to $class 60 | */ 61 | public function getScore($class, DocumentInterface $d) 62 | { 63 | $score = log($this->model->getPrior($class)); 64 | $features = $this->feature_factory->getFeatureArray($class,$d); 65 | if (is_int(key($features))) 66 | $features = array_count_values($features); 67 | foreach ($features as $f=>$fcnt) { 68 | $score += $fcnt*log($this->model->getCondProb($f,$class)); 69 | } 70 | 71 | return $score; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/CentroidFactories/CentroidFactoryInterface.php: -------------------------------------------------------------------------------- 1 | getVector($docs[$idx]); 43 | foreach ($doc as $k=>$w) { 44 | if (!isset($v[$k])) 45 | $v[$k] = $w; 46 | else 47 | $v[$k] += $w; 48 | } 49 | } 50 | foreach ($v as &$w) { 51 | $w /= 
$cnt; 52 | } 53 | 54 | return $v; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/CentroidFactories/Hamming.php: -------------------------------------------------------------------------------- 1 | &$v) { 32 | if ($s[$i]=='1') 33 | $v += 1; 34 | else 35 | $v -= 1; 36 | } 37 | } 38 | 39 | return implode( 40 | '', 41 | array_map( 42 | function ($v) { 43 | return ($v>0) ? '1' : '0'; 44 | }, 45 | $buckets 46 | ) 47 | ); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/CentroidFactories/MeanAngle.php: -------------------------------------------------------------------------------- 1 | normalize($this->getVector($docs[$idx])); 38 | foreach ($d as $i=>$vi) { 39 | if (!isset($v[$i])) 40 | $v[$i] = $vi; 41 | else 42 | $v[$i] += $vi; 43 | } 44 | } 45 | 46 | return array_map( 47 | function ($vi) use ($cnt) { 48 | return $vi/$cnt; 49 | }, 50 | $v 51 | ); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/Clusterer.php: -------------------------------------------------------------------------------- 1 | getFeatureArray('',$d); 27 | } 28 | 29 | return $docs; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/Hierarchical.php: -------------------------------------------------------------------------------- 1 | strategy = $ms; 22 | $this->dist = $d; 23 | } 24 | 25 | /** 26 | * Iteratively merge documents together to create an hierarchy of clusters. 27 | * While hierarchical clustering only returns one element, it still wraps it 28 | * in an array to be consistent with the rest of the clustering methods. 
29 | * 30 | * @return array An array containing one element which is the resulting dendrogram 31 | */ 32 | public function cluster(TrainingSet $documents, FeatureFactoryInterface $ff) 33 | { 34 | // what a complete waste of memory here ... 35 | // the same data exists in $documents, $docs and 36 | // the only useful parts are in $this->strategy 37 | $docs = $this->getDocumentArray($documents, $ff); 38 | $this->strategy->initializeStrategy($this->dist,$docs); 39 | unset($docs); // perhaps save some memory 40 | 41 | // start with all the documents being in their 42 | // own cluster we 'll merge later 43 | $clusters = range(0,count($documents)-1); 44 | $c = count($clusters); 45 | while ($c>1) { 46 | // ask the strategy which to merge. The strategy 47 | // will assume that we will indeed merge the returned clusters 48 | list($i,$j) = $this->strategy->getNextMerge(); 49 | $clusters[$i] = array($clusters[$i],$clusters[$j]); 50 | unset($clusters[$j]); 51 | $c--; 52 | } 53 | $clusters = array($clusters[$i]); 54 | 55 | // return the dendrogram 56 | return array($clusters); 57 | } 58 | 59 | /** 60 | * Flatten a dendrogram to an almost specific 61 | * number of clusters (the closest power of 2 larger than 62 | * $NC) 63 | * 64 | * @param array $tree The dendrogram to be flattened 65 | * @param integer $NC The number of clusters to cut to 66 | * @return array The flat clusters 67 | */ 68 | public static function dendrogramToClusters($tree,$NC) 69 | { 70 | $clusters = $tree; 71 | while (count($clusters)<$NC) { 72 | $tmpc = array(); 73 | foreach ($clusters as $subclust) { 74 | if (!is_array($subclust)) 75 | $tmpc[] = $subclust; 76 | else { 77 | foreach ($subclust as $c) 78 | $tmpc[] = $c; 79 | } 80 | } 81 | $clusters = $tmpc; 82 | } 83 | foreach ($clusters as &$c) { 84 | $c = iterator_to_array( 85 | new \RecursiveIteratorIterator( 86 | new \RecursiveArrayIterator( 87 | array($c) 88 | ) 89 | ), 90 | false // do not use keys 91 | ); 92 | } 93 | 94 | return $clusters; 95 | } 96 | } 
97 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/KMeans.php: -------------------------------------------------------------------------------- 1 | dist = $d; 37 | $this->n = $n; 38 | $this->cutoff = $cutoff; 39 | $this->centroidF = $cf; 40 | } 41 | 42 | /** 43 | * Apply the feature factory to the documents and then cluster the resulting array 44 | * using the provided distance metric and centroid factory. 45 | */ 46 | public function cluster(TrainingSet $documents, FeatureFactoryInterface $ff) 47 | { 48 | // transform the documents according to the FeatureFactory 49 | $docs = $this->getDocumentArray($documents,$ff); 50 | 51 | // choose N centroids at random 52 | $centroids = array(); 53 | foreach (array_rand($docs,$this->n) as $key) { 54 | $centroids[] = $docs[$key]; 55 | } 56 | 57 | // cache the distance and centroid factory functions for use 58 | // with closures 59 | $dist = array($this->dist,'dist'); 60 | $cf = array($this->centroidF,'getCentroid'); 61 | 62 | // looooooooop 63 | while (true) { 64 | // compute the distance each document has from our centroids 65 | // the array is MxN where M = count($docs) and N = count($centroids) 66 | $distances = array_map( 67 | function ($doc) use (&$centroids,$dist) { 68 | return array_map( 69 | function ($c) use ($dist,$doc) { 70 | // it is passed with an array because dist expects references 71 | // and it failed when run with phpunit. 
72 | // see http://php.net/manual/en/function.call-user-func.php 73 | // for the solution used below 74 | return call_user_func_array( 75 | $dist, 76 | array( 77 | &$c, 78 | &$doc 79 | ) 80 | ); 81 | }, 82 | $centroids 83 | ); 84 | }, 85 | $docs 86 | ); 87 | 88 | // initialize the empty clusters 89 | $clusters = array_fill_keys( 90 | array_keys($centroids), 91 | array() 92 | ); 93 | foreach ($distances as $idx=>$d) { 94 | // assign document idx to the closest centroid 95 | $clusters[array_search(min($d),$d)][] = $idx; 96 | } 97 | 98 | // compute the new centroids from the assigned documents 99 | // using the centroid factory function 100 | $new_centroids = array_map( 101 | function ($cluster) use (&$docs,$cf) { 102 | return call_user_func_array( 103 | $cf, 104 | array( 105 | &$docs, 106 | $cluster 107 | ) 108 | ); 109 | }, 110 | $clusters 111 | ); 112 | 113 | // compute the change each centroid had from the previous one 114 | $changes = array_map( 115 | $dist, 116 | $new_centroids, 117 | $centroids 118 | ); 119 | 120 | // if the largest change is small enough we are done 121 | if (max($changes)<$this->cutoff) { 122 | // return the clusters, the centroids and the distances 123 | return array($clusters,$centroids,$distances); 124 | } 125 | 126 | // update the centroids and loooooop again 127 | $centroids = $new_centroids; 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/MergeStrategies/CompleteLink.php: -------------------------------------------------------------------------------- 1 | dm[$xi],$this->dm[$yi]); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/MergeStrategies/GroupAverage.php: -------------------------------------------------------------------------------- 1 | cluster_size = array_fill_keys( 24 | range(0,$this->L-1), 25 | 1 26 | ); 27 | } 28 | 29 | protected function 
newDistance($xi,$yi,$x,$y) 30 | { 31 | $size_x = $this->cluster_size[$x]; 32 | $size_y = $this->cluster_size[$y]; 33 | 34 | return ($this->dm[$xi]*$size_x + $this->dm[$yi]*$size_y)/($size_x + $size_y); 35 | } 36 | 37 | public function getNextMerge() 38 | { 39 | $r = parent::getNextMerge(); 40 | 41 | $this->cluster_size[$r[0]] += $this->cluster_size[$r[1]]; 42 | unset($this->cluster_size[$r[1]]); 43 | 44 | return $r; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/MergeStrategies/HeapLinkage.php: -------------------------------------------------------------------------------- 1 | y swap x,y 20 | * 2. index = y*(y-1)/2 + x 21 | */ 22 | abstract class HeapLinkage implements MergeStrategyInterface 23 | { 24 | protected $L; 25 | protected $queue; 26 | protected $dm; 27 | protected $removed; 28 | 29 | /** 30 | * Calculate the distance of the merged cluster x,y with cluster i 31 | * based on a merge strategy (SingleLink, CompleteLink, GroupAverage, ...) 32 | * Ex.: for single link this function would be 33 | * return min($this->dm[$xi],$this->dm[$yi]); 34 | */ 35 | abstract protected function newDistance($xi,$yi,$x,$y); 36 | 37 | /** 38 | * Initialize the distance matrix and any other data structure needed 39 | * to calculate the merges later. 
40 | * 41 | * @param DistanceInterface $d The distance metric used to calculate the distance matrix 42 | * @param array $docs The docs to be clustered 43 | */ 44 | public function initializeStrategy(DistanceInterface $d, array &$docs) 45 | { 46 | // the number of documents and the dimensions of the matrix 47 | $this->L = count($docs); 48 | // just to hold which document has been removed 49 | $this->removed = array_fill_keys(range(0, $this->L-1), false); 50 | // how many distances we must compute 51 | $elements = (int) ($this->L*($this->L-1))/2; 52 | // the containers that will hold the distances 53 | $this->dm = new \SplFixedArray($elements); 54 | $this->queue = new \SplPriorityQueue(); 55 | $this->queue->setExtractFlags(\SplPriorityQueue::EXTR_BOTH); 56 | 57 | // for each unique pair of documents calculate the distance and 58 | // save it in the heap and distance matrix 59 | for ($x=0;$x<$this->L;$x++) { 60 | for ($y=$x+1;$y<$this->L;$y++) { 61 | $index = $this->packIndex($y,$x); 62 | $tmp_d = $d->dist($docs[$x],$docs[$y]); 63 | $this->dm[$index] = $tmp_d; 64 | $this->queue->insert($index, -$tmp_d); 65 | } 66 | } 67 | } 68 | 69 | /** 70 | * Return the pair of clusters x,y to be merged. 71 | * 1. Extract the pair with the smallest distance 72 | * 2. Recalculate the distance of the merged cluster with every other cluster 73 | * 3. Merge the clusters (by labeling one as removed) 74 | * 4. 
Reheap 75 | * 76 | * @return array The pair (x,y) to be merged 77 | */ 78 | public function getNextMerge() 79 | { 80 | // extract the pair with the smallest distance 81 | $tmp = $this->queue->extract(); 82 | $index = $tmp["data"]; 83 | $d = -$tmp["priority"]; 84 | list($y,$x) = $this->unravelIndex($index); 85 | // check if it is invalid 86 | while ($this->removed[$y] || $this->removed[$x] || $this->dm[$index]!=$d) { 87 | $tmp = $this->queue->extract(); 88 | $index = $tmp["data"]; 89 | $d = -$tmp["priority"]; 90 | list($y,$x) = $this->unravelIndex($index); 91 | } 92 | 93 | // Now that we have a valid pair to be merged 94 | // calculate the distances of the merged cluster with any 95 | // other cluster 96 | $yi = $this->packIndex($y,0); 97 | $xi = $this->packIndex($x,0); 98 | 99 | // for every cluster with index inewDistance($xi,$yi,$x,$y); 102 | if ($d!=$this->dm[$xi]) { 103 | $this->dm[$xi] = $d; 104 | $this->queue->insert($xi, -$d); 105 | } 106 | } 107 | // for every cluster with index xpackIndex($i,$x); 110 | $d = $this->newDistance($xi,$yi,$x,$y); 111 | if ($d!=$this->dm[$xi]) { 112 | $this->dm[$xi] = $d; 113 | $this->queue->insert($xi, -$d); 114 | } 115 | } 116 | // for every cluster xL;$i++) { 118 | $xi = $this->packIndex($i,$x); 119 | $yi = $this->packIndex($i,$y); 120 | $d = $this->newDistance($xi,$yi,$x,$y); 121 | if ($d!=$this->dm[$xi]) { 122 | $this->dm[$xi] = $d; 123 | $this->queue->insert($xi, -$d); 124 | } 125 | } 126 | 127 | // mark y as removed 128 | $this->removed[$y] = true; 129 | 130 | return array($x,$y); 131 | } 132 | 133 | /** 134 | * Use binary search to unravel the index to its coordinates x,y 135 | * return them in the order y,x . This operation is to be done only 136 | * once per merge so it doesn't add much overhead. 
137 | * 138 | * Note: y will always be larger than x 139 | * 140 | * @param integer $index The index to be unraveled 141 | * @return array An array containing (y,x) 142 | */ 143 | protected function unravelIndex($index) 144 | { 145 | $a = 0; 146 | $b = $this->L-1; 147 | $y = 0; 148 | while ($b-$a > 1) { 149 | // the middle row in the interval [a,b] 150 | $y = (int) (($a+$b)/2); 151 | // the candidate index aka how many points until this row 152 | $i = $y*($y-1)/2; 153 | 154 | // if we need an offset les then the wanted y will be in the offset [a,y] 155 | if ($i > $index) { 156 | $b = $y; 157 | } else { 158 | // else it will be in the offset [y,b] 159 | $a = $y; 160 | } 161 | } 162 | // we have finished searching it is either a or b 163 | $x = $index - $i; 164 | 165 | // this means that it is b and we have a 166 | if ($y <= $x) { 167 | $y++; 168 | $x = $index - $y*($y-1)/2; 169 | } elseif ($x < 0) { 170 | // this means that it is a and we have b 171 | $y--; 172 | $x = $index - $y*($y-1)/2; 173 | } 174 | 175 | return array( 176 | (int) $y, 177 | (int) $x 178 | ); 179 | } 180 | 181 | /** 182 | * Pack the coordinates x and y to an integer offset from 0. 183 | * The first line (y=0) contains 0 elements, the 2nd 1 the 3rd 2 ... 
<?php

namespace NlpTools\Documents;

use NlpTools\Utils\TransformationInterface;

/**
 * A RawDocument wraps a single value (usually a string of raw text)
 * without imposing any further structure, so that plain data can be
 * passed wherever a DocumentInterface is expected.
 */
class RawDocument implements DocumentInterface
{
    protected $data;

    public function __construct($data)
    {
        $this->data = $data;
    }

    /**
     * Return the wrapped value untouched.
     */
    public function getDocumentData()
    {
        return $this->data;
    }

    /**
     * Replace the wrapped value with its transformed version.
     *
     * @param TransformationInterface $transform The transformation to be applied
     */
    public function applyTransformation(TransformationInterface $transform)
    {
        $this->data = $transform->transform($this->data);
    }
}
<?php

namespace NlpTools\Documents;

use NlpTools\Utils\TransformationInterface;

/**
 * A TrainingDocument decorates any DocumentInterface instance with a
 * known class (label) so it can be used for supervised training.
 */
class TrainingDocument implements DocumentInterface
{
    protected $d;
    protected $class;

    /**
     * @param string $class The actual class of the document
     * @param DocumentInterface $d The document to be decorated
     */
    public function __construct($class, DocumentInterface $d)
    {
        $this->d = $d;
        $this->class = $class;
    }

    /**
     * Delegate to the decorated document.
     */
    public function getDocumentData()
    {
        return $this->d->getDocumentData();
    }

    /**
     * The label given at construction time.
     */
    public function getClass()
    {
        return $this->class;
    }

    /**
     * Pass the transformation through to the decorated document.
     *
     * @param TransformationInterface $transform The transformation to be applied
     */
    public function applyTransformation(TransformationInterface $transform)
    {
        $this->d->applyTransformation($transform);
    }
}
32 | * 33 | * @param $class The documents actual class 34 | * @param $d The Document 35 | * @return void 36 | */ 37 | public function addDocument($class, DocumentInterface $d) 38 | { 39 | $this->documents[] = new TrainingDocument($class,$d); 40 | $this->classSet[$class] = 1; 41 | } 42 | // return the classset 43 | public function getClassSet() 44 | { 45 | return array_keys($this->classSet); 46 | } 47 | 48 | /** 49 | * Decide what should be returned as key when iterated upon 50 | */ 51 | public function setAsKey($what) 52 | { 53 | switch ($what) { 54 | case self::CLASS_AS_KEY: 55 | case self::OFFSET_AS_KEY: 56 | $this->keytype = $what; 57 | break; 58 | default: 59 | $this->keytype = self::CLASS_AS_KEY; 60 | break; 61 | } 62 | } 63 | 64 | /** 65 | * Apply an array of transformations to all documents in this container. 66 | * 67 | * @param array An array of TransformationInterface instances 68 | */ 69 | public function applyTransformations(array $transforms) 70 | { 71 | foreach ($this->documents as $doc) { 72 | foreach ($transforms as $transform) { 73 | $doc->applyTransformation($transform); 74 | } 75 | } 76 | } 77 | 78 | // ====== Implementation of \Iterator interface ========= 79 | public function rewind() 80 | { 81 | reset($this->documents); 82 | $this->currentDocument = current($this->documents); 83 | } 84 | public function next() 85 | { 86 | $this->currentDocument = next($this->documents); 87 | } 88 | public function valid() 89 | { 90 | return $this->currentDocument!=false; 91 | } 92 | public function current() 93 | { 94 | return $this->currentDocument; 95 | } 96 | public function key() 97 | { 98 | switch ($this->keytype) { 99 | case self::CLASS_AS_KEY: 100 | return $this->currentDocument->getClass(); 101 | case self::OFFSET_AS_KEY: 102 | return key($this->documents); 103 | default: 104 | // we should never be here 105 | throw new \Exception("Undefined type as key"); 106 | } 107 | } 108 | // === Implementation of \Iterator interface finished === 109 | 110 | // 
====== Implementation of \ArrayAccess interface ========= 111 | public function offsetSet($key,$value) 112 | { 113 | throw new \Exception("Shouldn't add documents this way, add them through addDocument()"); 114 | } 115 | public function offsetUnset($key) 116 | { 117 | throw new \Exception("Cannot unset any document"); 118 | } 119 | public function offsetGet($key) 120 | { 121 | return $this->documents[$key]; 122 | } 123 | public function offsetExists($key) 124 | { 125 | return isset($this->documents[$key]); 126 | } 127 | // === Implementation of \ArrayAccess interface finished === 128 | 129 | // implementation of \Countable interface 130 | public function count() 131 | { 132 | return count($this->documents); 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/NlpTools/Documents/WordDocument.php: -------------------------------------------------------------------------------- 1 | word = $tokens[$index]; 19 | 20 | $this->before = array(); 21 | for ($start = max($index-$context,0);$start<$index;$start++) { 22 | $this->before[] = $tokens[$start]; 23 | } 24 | 25 | $this->after = array(); 26 | $end = min($index+$context+1,count($tokens)); 27 | for ($start = $index+1;$start<$end;$start++) { 28 | $this->after[] = $tokens[$start]; 29 | } 30 | } 31 | 32 | /** 33 | * It returns an array with the first element being the actual word, 34 | * the second element being an array of previous words, and the 35 | * third an array of following words 36 | * 37 | * @return array 38 | */ 39 | public function getDocumentData() 40 | { 41 | return array($this->word,$this->before,$this->after); 42 | } 43 | 44 | /** 45 | * Apply the transformation to the token and the surrounding context. 46 | * Filter out the null tokens from the context. If the word is transformed 47 | * to null it is for the feature factory to decide what to do. 
48 | * 49 | * @param TransformationInterface $transform The transformation to be applied 50 | */ 51 | public function applyTransformation(TransformationInterface $transform) 52 | { 53 | $null_filter = function ($token) { 54 | return $token!==null; 55 | }; 56 | 57 | $this->word = $transform->transform($this->word); 58 | // array_values for re-indexing 59 | $this->before = array_values( 60 | array_filter( 61 | array_map( 62 | array($transform,"transform"), 63 | $this->before 64 | ), 65 | $null_filter 66 | ) 67 | ); 68 | $this->after = array_values( 69 | array_filter( 70 | array_map( 71 | array($transform,"transform"), 72 | $this->after 73 | ), 74 | $null_filter 75 | ) 76 | ); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/NlpTools/Exceptions/InvalidExpression.php: -------------------------------------------------------------------------------- 1 | getDocumentData(); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/NlpTools/FeatureFactories/FeatureFactoryInterface.php: -------------------------------------------------------------------------------- 1 | functions=$f; 27 | $this->frequency=false; 28 | } 29 | /** 30 | * Set the feature factory to model frequency instead of presence 31 | */ 32 | public function modelFrequency() 33 | { 34 | $this->frequency = true; 35 | } 36 | /** 37 | * Set the feature factory to model presence instead of frequency 38 | */ 39 | public function modelPresence() 40 | { 41 | $this->frequency = false; 42 | } 43 | /** 44 | * Add a function as a feature 45 | * 46 | * @param callable $feature 47 | */ 48 | public function add( $feature ) 49 | { 50 | $this->functions[] = $feature; 51 | } 52 | 53 | /** 54 | * Compute the features that "fire" for a given class,document pair. 55 | * 56 | * Call each function one by one. Eliminate each return value that 57 | * evaluates to false. 
If the return value is a string add it to 58 | * the feature set. If the return value is an array iterate over it 59 | * and add each value to the feature set. 60 | * 61 | * @param string $class The class for which we are calculating features 62 | * @param DocumentInterface $d The document for which we are calculating features 63 | * @return array 64 | */ 65 | public function getFeatureArray($class, DocumentInterface $d) 66 | { 67 | $features = array_filter( 68 | array_map( function ($feature) use ($class,$d) { 69 | return call_user_func($feature, $class, $d); 70 | }, 71 | $this->functions 72 | )); 73 | $set = array(); 74 | foreach ($features as $f) { 75 | if (is_array($f)) { 76 | foreach ($f as $ff) { 77 | if (!isset($set[$ff])) 78 | $set[$ff] = 0; 79 | $set[$ff]++; 80 | } 81 | } else { 82 | if (!isset($set[$f])) 83 | $set[$f] = 0; 84 | $set[$f]++; 85 | } 86 | } 87 | if ($this->frequency) 88 | return $set; 89 | else 90 | return array_keys($set); 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/NlpTools/Models/FeatureBasedNB.php: -------------------------------------------------------------------------------- 1 | priors = array(); 24 | $this->condprob = array(); 25 | $this->unknown = array(); 26 | } 27 | 28 | /** 29 | * Return the prior probability of class $class 30 | * P(c) as computed by the training data 31 | * 32 | * @param string $class 33 | * @return float prior probability 34 | */ 35 | public function getPrior($class) 36 | { 37 | return isset($this->priors[$class]) 38 | ? $this->priors[$class] 39 | : 0; 40 | } 41 | 42 | /** 43 | * Return the conditional probability of a term for a given class. 44 | * 45 | * @param string $term The term (word, feature id, ...) 46 | * @param string $class The class 47 | * @return float 48 | */ 49 | public function getCondProb($term,$class) 50 | { 51 | if (!isset($this->condprob[$term][$class])) { 52 | 53 | return isset($this->unknown[$class]) 54 | ? 
$this->unknown[$class] 55 | : 0; 56 | 57 | } else { 58 | return $this->condprob[$term][$class]; 59 | } 60 | } 61 | 62 | /** 63 | * Train on the given set and fill the model's variables. Use the 64 | * training context provided to update the counts as if the training 65 | * set was appended to the previous one that provided the context. 66 | * 67 | * It can be used for incremental training. It is not meant to be used 68 | * with the same training set twice. 69 | * 70 | * @param array $train_ctx The previous training context 71 | * @param FeatureFactoryInterface $ff A feature factory to compute features from a training document 72 | * @param TrainingSet The training set 73 | * @param integer $a_smoothing The parameter for additive smoothing. Defaults to add-one smoothing. 74 | * @return array Return a training context to be used for further incremental training, 75 | * although this is not necessary since the changes also happen in place 76 | */ 77 | public function train_with_context(array &$train_ctx, FeatureFactoryInterface $ff, TrainingSet $tset, $a_smoothing=1) 78 | { 79 | $this->countTrainingSet( 80 | $ff, 81 | $tset, 82 | $train_ctx['termcount_per_class'], 83 | $train_ctx['termcount'], 84 | $train_ctx['ndocs_per_class'], 85 | $train_ctx['voc'], 86 | $train_ctx['ndocs'] 87 | ); 88 | 89 | $voccount = count($train_ctx['voc']); 90 | 91 | $this->computeProbabilitiesFromCounts( 92 | $tset->getClassSet(), 93 | $train_ctx['termcount_per_class'], 94 | $train_ctx['termcount'], 95 | $train_ctx['ndocs_per_class'], 96 | $train_ctx['ndocs'], 97 | $voccount, 98 | $a_smoothing 99 | ); 100 | 101 | return $train_ctx; 102 | } 103 | 104 | /** 105 | * Train on the given set and fill the models variables 106 | * 107 | * priors[c] = NDocs[c]/NDocs 108 | * condprob[t][c] = count( t in c) + 1 / sum( count( t' in c ) + 1 , for every t' ) 109 | * unknown[c] = condbrob['word that doesnt exist in c'][c] ( so that count(t in c)==0 ) 110 | * 111 | * More information on the algorithm can be 
found at 112 | * http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html 113 | * 114 | * @param FeatureFactoryInterface A feature factory to compute features from a training document 115 | * @param TrainingSet The training set 116 | * @param integer $a_smoothing The parameter for additive smoothing. Defaults to add-one smoothing. 117 | * @return array Return a training context to be used for incremental training 118 | */ 119 | public function train(FeatureFactoryInterface $ff, TrainingSet $tset, $a_smoothing=1) 120 | { 121 | $class_set = $tset->getClassSet(); 122 | 123 | $ctx = array( 124 | 'termcount_per_class'=>array_fill_keys($class_set,0), 125 | 'termcount'=>array_fill_keys($class_set,array()), 126 | 'ndocs_per_class'=>array_fill_keys($class_set,0), 127 | 'voc'=>array(), 128 | 'ndocs'=>0 129 | ); 130 | 131 | return $this->train_with_context($ctx,$ff,$tset,$a_smoothing); 132 | } 133 | 134 | /** 135 | * Count all the features for each document. All parameters are passed 136 | * by reference and they are filled in this function. Useful for not 137 | * making copies of big arrays. 
138 | * 139 | * @param FeatureFactoryInterface $ff A feature factory to create the features for each document in the set 140 | * @param TrainingSet $tset The training set (collection of labeled documents) 141 | * @param array $termcount_per_class The count of occurences of each feature in each class 142 | * @param array $termcount The total count of occurences of each term 143 | * @param array $ndocs_per_class The total number of documents per class 144 | * @param array $voc A set of the found features 145 | * @param integer $ndocs The number of documents 146 | * @return void 147 | */ 148 | protected function countTrainingSet(FeatureFactoryInterface $ff, TrainingSet $tset, array &$termcount_per_class, array &$termcount, array &$ndocs_per_class, array &$voc, &$ndocs) 149 | { 150 | foreach ($tset as $tdoc) { 151 | $ndocs++; 152 | $c = $tdoc->getClass(); 153 | $ndocs_per_class[$c]++; 154 | $features = $ff->getFeatureArray($c,$tdoc); 155 | if (is_int(key($features))) 156 | $features = array_count_values($features); 157 | foreach ($features as $f=>$fcnt) { 158 | if (!isset($voc[$f])) 159 | $voc[$f] = 0; 160 | 161 | $termcount_per_class[$c]+=$fcnt; 162 | if (isset($termcount[$c][$f])) 163 | $termcount[$c][$f]+=$fcnt; 164 | else 165 | $termcount[$c][$f] = $fcnt; 166 | } 167 | } 168 | } 169 | 170 | /** 171 | * Compute the probabilities given the counts of the features in the 172 | * training set. 
173 | * 174 | * @param array $class_set Just the array that contains the classes 175 | * @param array $termcount_per_class The count of occurences of each feature in each class 176 | * @param array $termcount The total count of occurences of each term 177 | * @param array $ndocs_per_class The total number of documents per class 178 | * @param integer $ndocs The total number of documents 179 | * @param integer $voccount The total number of features found 180 | * @return void 181 | */ 182 | protected function computeProbabilitiesFromCounts(array $class_set, array &$termcount_per_class, array &$termcount, array &$ndocs_per_class, $ndocs, $voccount, $a_smoothing=1) 183 | { 184 | $denom_smoothing = $a_smoothing*$voccount; 185 | foreach ($class_set as $class) { 186 | $this->priors[$class] = $ndocs_per_class[$class] / $ndocs; 187 | foreach ($termcount[$class] as $term=>$count) { 188 | $this->condprob[$term][$class] = ($count + $a_smoothing) / ($termcount_per_class[$class] + $denom_smoothing); 189 | } 190 | } 191 | foreach ($class_set as $class) { 192 | $this->unknown[$class] = $a_smoothing / ($termcount_per_class[$class] + $denom_smoothing); 193 | } 194 | } 195 | 196 | /** 197 | * Just save the probabilities for reuse 198 | */ 199 | public function __sleep() 200 | { 201 | return array('priors','condprob','unknown'); 202 | } 203 | } 204 | -------------------------------------------------------------------------------- /src/NlpTools/Models/LinearModel.php: -------------------------------------------------------------------------------- 1 | l = $l; 21 | } 22 | /** 23 | * Get the weight for a given feature 24 | * 25 | * @param string $feature The feature for which the weight will be returned 26 | * @return float The weight 27 | */ 28 | public function getWeight($feature) 29 | { 30 | if (!isset($this->l[$feature])) return 0; 31 | else return $this->l[$feature]; 32 | } 33 | 34 | /** 35 | * Get all the weights as an array. 
36 | * 37 | * @return array The weights as an associative array 38 | */ 39 | public function getWeights() 40 | { 41 | return $this->l; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/NlpTools/Models/Maxent.php: -------------------------------------------------------------------------------- 1 | getClassSet(); 32 | 33 | $features = $this->calculateFeatureArray($classSet,$tset,$ff); 34 | $this->l = $opt->optimize($features); 35 | } 36 | 37 | /** 38 | * Calculate all the features for each possible class of each 39 | * document. This is done so that we can optimize without the need 40 | * of the FeatureFactory. 41 | * 42 | * We do not want to use the FeatureFactoryInterface both because it would 43 | * be slow to calculate the features over and over again, but also 44 | * because we want to be able to optimize externally to 45 | * gain speed (PHP is slow!). 46 | * 47 | * @param $classes A set of the classes in the training set 48 | * @param $tset A collection of training documents 49 | * @param $ff The feature factory 50 | * @return array An array that contains every feature for every possible class of every document 51 | */ 52 | protected function calculateFeatureArray(array $classes, TrainingSet $tset, FeatureFactoryInterface $ff) 53 | { 54 | $features = array(); 55 | $tset->setAsKey(TrainingSet::OFFSET_AS_KEY); 56 | foreach ($tset as $offset=>$doc) { 57 | $features[$offset] = array(); 58 | foreach ($classes as $class) { 59 | $features[$offset][$class] = $ff->getFeatureArray($class,$doc); 60 | } 61 | $features[$offset]['__label__'] = $doc->getClass(); 62 | } 63 | 64 | return $features; 65 | } 66 | 67 | /** 68 | * Calculate the probability that document $d belongs to the class 69 | * $class given a set of possible classes, a feature factory and 70 | * the model's weights l[i] 71 | * 72 | * @param $classes The set of possible classes 73 | * @param $ff The feature factory 74 | * @param $d The document 75 | * 
@param string $class A class for which we calculate the probability 76 | * @return float The probability that document $d belongs to class $class 77 | */ 78 | public function P(array $classes,FeatureFactoryInterface $ff,DocumentInterface $d,$class) 79 | { 80 | $exps = array(); 81 | foreach ($classes as $cl) { 82 | $tmp = 0.0; 83 | foreach ($ff->getFeatureArray($cl,$d) as $i) { 84 | $tmp += $this->l[$i]; 85 | } 86 | $exps[$cl] = exp($tmp); 87 | } 88 | 89 | return $exps[$class]/array_sum($exps); 90 | } 91 | 92 | /** 93 | * Not implemented yet. 94 | * Simply put: 95 | * result += log( $this->P(..., ..., ...) ) for every doc in TrainingSet 96 | * 97 | * @throws \Exception 98 | */ 99 | public function CLogLik(TrainingSet $tset,FeatureFactoryInterface $ff) 100 | { 101 | throw new \Exception("Unimplemented"); 102 | } 103 | 104 | /** 105 | * Simply print_r weights. Usefull for some kind of debugging when 106 | * working with small training sets and few features 107 | */ 108 | public function dumpWeights() 109 | { 110 | print_r($this->l); 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/NlpTools/Models/MultinomialNBModelInterface.php: -------------------------------------------------------------------------------- 1 | optimizer = $optimizer; 54 | } 55 | 56 | /** 57 | * Open a pipe to the optimizer, send him the data encoded in json 58 | * and then read the stdout to get the results encoded in json 59 | * 60 | * @param array $feature_array The features that fired for any document for any class @see NlpTools\Models\Maxent 61 | * @return array The optimized weights 62 | */ 63 | public function optimize(array &$feature_array) 64 | { 65 | // whete we will read from where we will write to 66 | $desrciptorspec = array( 67 | 0=>array('pipe','r'), 68 | 1=>array('pipe','w'), 69 | 2=>STDERR // Should that be redirected to /dev/null or like? 
70 | ); 71 | 72 | // Run the optimizer 73 | $process = proc_open($this->optimizer,$desrciptorspec,$pipes); 74 | if (!is_resource($process)) { 75 | return array(); 76 | } 77 | 78 | // send the data 79 | fwrite($pipes[0],json_encode($feature_array)); 80 | fclose($pipes[0]); 81 | 82 | // get the weights 83 | $json = stream_get_contents($pipes[1]); 84 | 85 | // decode as an associative array 86 | $l = json_decode( $json , true ); 87 | 88 | // close up the optimizer 89 | fclose($pipes[1]); 90 | proc_close($process); 91 | 92 | return $l; 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /src/NlpTools/Optimizers/FeatureBasedLinearOptimizerInterface.php: -------------------------------------------------------------------------------- 1 | precision = $precision; 25 | $this->step = $step; 26 | $this->maxiter = $maxiter; 27 | } 28 | 29 | /** 30 | * Should initialize the weights and compute any constant 31 | * expressions needed for the fprime calculation. 32 | * 33 | * @param $feature_array All the data known about the training set 34 | * @param $l The current set of weights to be initialized 35 | * @return void 36 | */ 37 | abstract protected function initParameters(array &$feature_array, array &$l); 38 | /** 39 | * Should calculate any parameter needed by Fprime that cannot be 40 | * calculated by initParameters because it is not constant. 41 | * 42 | * @param $feature_array All the data known about the training set 43 | * @param $l The current set of weights to be initialized 44 | * @return void 45 | */ 46 | abstract protected function prepareFprime(array &$feature_array, array &$l); 47 | /** 48 | * Actually compute the fprime_vector. 
Set for each $l[$i] the 49 | * value of the partial derivative of f for delta $l[$i] 50 | * 51 | * @param $feature_array All the data known about the training set 52 | * @param $l The current set of weights to be initialized 53 | * @return void 54 | */ 55 | abstract protected function Fprime(array &$feature_array, array &$l); 56 | 57 | /** 58 | * Actually do the gradient descent algorithm. 59 | * l[i] = l[i] - learning_rate*( theta f/delta l[i] ) for each i 60 | * Could possibly benefit from a vetor add/scale function. 61 | * 62 | * @param $feature_array All the data known about the training set 63 | * @return array The parameters $l[$i] that minimize F 64 | */ 65 | public function optimize(array &$feature_array) 66 | { 67 | $itercount = 0; 68 | $optimized = false; 69 | $maxiter = $this->maxiter; 70 | $prec = $this->precision; 71 | $step = $this->step; 72 | $l = array(); 73 | $this->initParameters($feature_array,$l); 74 | while (!$optimized && $itercount++!=$maxiter) { 75 | //$start = microtime(true); 76 | $optimized = true; 77 | $this->prepareFprime($feature_array,$l); 78 | $this->Fprime($feature_array,$l); 79 | foreach ($this->fprime_vector as $i=>$fprime_i_val) { 80 | $l[$i] -= $step*$fprime_i_val; 81 | if (abs($fprime_i_val) > $prec) { 82 | $optimized = false; 83 | } 84 | } 85 | //fprintf(STDERR,"%f\n",microtime(true)-$start); 86 | if ($this->verbose>0) 87 | $this->reportProgress($itercount); 88 | } 89 | 90 | return $l; 91 | } 92 | 93 | public function reportProgress($itercount) 94 | { 95 | if ($itercount == 1) { 96 | echo "#\t|Fprime|\n------------------\n"; 97 | } 98 | $norm = 0; 99 | foreach ($this->fprime_vector as $fprime_i_val) { 100 | $norm += $fprime_i_val*$fprime_i_val; 101 | } 102 | $norm = sqrt($norm); 103 | printf("%d\t%.3f\n",$itercount,$norm); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/NlpTools/Optimizers/MaxentGradientDescent.php: 
-------------------------------------------------------------------------------- 1 | numerators = array(); 33 | $this->fprime_vector = array(); 34 | foreach ($feature_array as $doc) { 35 | foreach ($doc as $class=>$features) { 36 | if (!is_array($features)) continue; 37 | foreach ($features as $fi) { 38 | $l[$fi] = 0; 39 | $this->fprime_vector[$fi] = 0; 40 | if (!isset($this->numerators[$fi])) { 41 | $this->numerators[$fi] = 0; 42 | } 43 | } 44 | } 45 | foreach ($doc[$doc['__label__']] as $fi) { 46 | $this->numerators[$fi]++; 47 | } 48 | } 49 | } 50 | 51 | /** 52 | * Compute the denominators which is the predicted expectation of 53 | * each feature given a set of weights L and a set of features for 54 | * each document for each class. 55 | * 56 | * @param $feature_array All the data known about the training set 57 | * @param $l The current set of weights to be initialized 58 | * @return void 59 | */ 60 | protected function prepareFprime(array &$feature_array, array &$l) 61 | { 62 | $this->denominators = array(); 63 | foreach ($feature_array as $offset=>$doc) { 64 | $numerator = array_fill_keys(array_keys($doc),0.0); 65 | $denominator = 0.0; 66 | foreach ($doc as $cl=>$f) { 67 | if (!is_array($f)) continue; 68 | $tmp = 0.0; 69 | foreach ($f as $i) { 70 | $tmp += $l[$i]; 71 | } 72 | $tmp = exp($tmp); 73 | $numerator[$cl] += $tmp; 74 | $denominator += $tmp; 75 | } 76 | foreach ($doc as $class=>$features) { 77 | if (!is_array($features)) continue; 78 | foreach ($features as $fi) { 79 | if (!isset($this->denominators[$fi])) { 80 | $this->denominators[$fi] = 0; 81 | } 82 | $this->denominators[$fi] += $numerator[$class]/$denominator; 83 | } 84 | } 85 | } 86 | } 87 | 88 | /** 89 | * The partial Fprime for each i is 90 | * empirical expectation - predicted expectation . We need to 91 | * maximize the CLogLik (CLogLik is the f whose Fprime we calculate) 92 | * so we instead minimize the -CLogLik. 
93 | * 94 | * See page 28 of http://nlp.stanford.edu/pubs/maxent-tutorial-slides.pdf 95 | * 96 | * @param $feature_array All the data known about the training set 97 | * @param $l The current set of weights to be initialized 98 | * @return void 99 | */ 100 | protected function Fprime(array &$feature_array, array &$l) 101 | { 102 | foreach ($this->fprime_vector as $i=>&$fprime_i_val) { 103 | $fprime_i_val = $this->denominators[$i] - $this->numerators[$i]; 104 | } 105 | } 106 | 107 | } 108 | -------------------------------------------------------------------------------- /src/NlpTools/Optimizers/MaxentOptimizerInterface.php: -------------------------------------------------------------------------------- 1 | rnd = MersenneTwister::get(); 16 | else 17 | $this->rnd = $rnd; 18 | } 19 | 20 | abstract public function sample(); 21 | } 22 | -------------------------------------------------------------------------------- /src/NlpTools/Random/Distributions/Dirichlet.php: -------------------------------------------------------------------------------- 1 | rnd; 25 | $this->gamma = array_map( 26 | function ($a) use ($rnd) { 27 | return new Gamma($a,1,$rnd); 28 | }, 29 | $a 30 | ); 31 | } 32 | 33 | public function sample() 34 | { 35 | $y = array(); 36 | foreach ($this->gamma as $g) { 37 | $y[] = $g->sample(); 38 | } 39 | $sum = array_sum($y); 40 | 41 | return array_map( 42 | function ($y) use ($sum) { 43 | return $y/$sum; 44 | }, 45 | $y 46 | ); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/NlpTools/Random/Distributions/Gamma.php: -------------------------------------------------------------------------------- 1 | scale = $scale; 24 | $this->shape = abs($shape); 25 | if ($this->shape >= 1) 26 | $this->normal = new Normal(0,1,$this->rnd); 27 | else 28 | $this->gamma = new Gamma($this->shape + 1, 1, $this->rnd); 29 | 30 | } 31 | 32 | public function sample() 33 | { 34 | if ($this->shape >= 1) { 35 | $d = $this->shape - 1/3; 
/**
 * Draw a sample from the normal distribution using the Box-Muller
 * transform.
 *
 * The underlying generator is documented to produce numbers in the
 * interval [0,1); $u1 is passed to log() so it must be strictly
 * positive — we redraw until it is, instead of risking log(0) = -INF.
 *
 * @return float A normally distributed value with mean $this->m and
 *               standard deviation $this->sigma
 */
public function sample()
{
    do {
        $u1 = $this->rnd->generate();
    } while ($u1 <= 0);
    $u2 = $this->rnd->generate();

    $r = sqrt(-2*log($u1));
    $theta = 2.0*M_PI*$u2;

    return $this->m + $this->sigma*$r*sin($theta);
}
/**
 * Read a float from the file and return it. Nothing is done to
 * ensure the returned value actually lies in (0,1); non-numeric
 * lines simply cast to 0.0.
 *
 * When the end of the file is reached, the file pointer is rewound
 * so the sequence of numbers repeats.
 *
 * @return float The next number read from the file
 */
public function generate()
{
    // feof() only becomes true *after* a read attempt fails, so the
    // old "check feof, then read" order returned a spurious 0.0 (the
    // cast of a false fgets) once per pass over the file. Check the
    // read result itself instead.
    $line = fgets($this->h);
    if ($line === false) {
        rewind($this->h);
        $line = fgets($this->h);
    }

    return (float) $line;
}
/**
 * Cosine distance is simply 1 - cosine similarity.
 *
 * Since similarity() returns a value in [0,1] (see the class comment:
 * token counts are never negative), the distance is also in [0,1].
 *
 * @param array $A Either a feature vector or a simple token vector
 * @param array $B Either a feature vector or a simple token vector
 * @return float The cosine distance between the two vectors
 * @throws \InvalidArgumentException If either argument is not an array
 *                                   or is the zero vector (propagated
 *                                   from similarity())
 */
public function dist(&$A, &$B)
{
    return 1-$this->similarity($A,$B);
}
/**
 * Compute the locality sensitive hash for this set.
 *
 * Maintain a vector ($boxes) of length $this->length initialized to
 * 0. Each member of the set is hashed to a {$this->length}-bit
 * vector. For each of these bits we either increment or decrement
 * the corresponding $boxes dimension depending on the bit being
 * either 1 or 0. Finally the signs of each dimension of the boxes
 * vector form the locality sensitive hash.
 *
 * We have departed from the original implementation at the
 * following points:
 *  1. Each feature has a weight of 1, but feature duplication is
 *     allowed (duplicates become integer weights via
 *     array_count_values()).
 *
 * @param array $set Either a plain token list (integer keys) or a
 *                   member => weight map (string keys)
 * @return string The bits of the hash as a string of '0'/'1' chars
 */
public function simhash(array &$set)
{
    // One vote accumulator ("box") per output bit.
    $boxes = array_fill(0,$this->length,0);
    // Integer keys mean a plain token list: collapse duplicates into
    // counts so multiplicity acts as the member's weight. String keys
    // mean the caller already supplied member => weight pairs.
    if (is_int(key($set)))
        $dict = array_count_values($set);
    else
        $dict = &$set;
    foreach ($dict as $m=>$w) {
        // $this->h hashes the member to a bit string of length
        // $this->length (see constructor default).
        $h = call_user_func($this->h,$m);
        for ($bit_idx=0;$bit_idx<$this->length;$bit_idx++) {
            // Vote: a 1 bit pushes the box up by the weight, a 0 bit
            // pushes it down.
            $boxes[$bit_idx] += ($h[$bit_idx]=='1') ? $w : -$w;
        }
    }
    // The hash is the sign of each box: positive => '1', else '0'.
    $s = '';
    foreach ($boxes as $box) {
        if ($box>0)
            $s .= '1';
        else
            $s .= '0';
    }

    return $s;
}
/**
 * Compute the hamming distance between the simhashes of two sets,
 * i.e. the number of bit positions at which the two hashes differ.
 *
 * @param array $A
 * @param array $B
 * @return int In the range [0, $this->length]
 */
public function dist(&$A, &$B)
{
    $hashA = $this->simhash($A);
    $hashB = $this->simhash($B);

    $differing = 0;
    foreach (str_split($hashA) as $pos => $bit) {
        if ($bit != $hashB[$pos]) {
            $differing++;
        }
    }

    return $differing;
}
/**
 * Compute the symmetric Tversky index of the two sets using the
 * alpha and beta parameters given in the constructor:
 *
 *   |A ∩ B| / ( |A ∩ B| + beta*( alpha*min + (1-alpha)*max ) )
 *
 * where min/max are taken over the sizes of the two set differences.
 *
 * @param array $A
 * @param array $B
 * @return float
 */
public function similarity(&$A, &$B)
{
    $setA = array_fill_keys($A, 1);
    $setB = array_fill_keys($B, 1);

    $onlyInA = count(array_diff_key($setA, $setB));
    $onlyInB = count(array_diff_key($setB, $setA));
    $smaller = min($onlyInA, $onlyInB);
    $larger = max($onlyInA, $onlyInB);

    $common = count(array_intersect_key($setA, $setB));

    $weightedDiff = $this->beta * ($this->alpha * $smaller + (1 - $this->alpha) * $larger);

    return $common / ($common + $weightedDiff);
}
/**
 * Tokenize the string.
 *
 * 1. Break up the string in subtokens using the initial tokenizer
 * 2. Classify each subtoken as ending a word (EOW) or not (O)
 * 3. Join every run of O subtokens with the following EOW subtoken
 *    using the configured separator
 *
 * @param string $str The character sequence to be broken in tokens
 * @return array The token array
 */
public function tokenize($str)
{
    // split the string in subtokens and create a document for each
    // one (with 5 subtokens of surrounding context) to be classified
    $tokens = $this->tok->tokenize($str);
    $docs = array();
    foreach ($tokens as $offset=>$tok) {
        $docs[] = new WordDocument($tokens,$offset,5);
    }

    // classify each subtoken as an EOW or O
    $tags = array();
    foreach ($docs as $doc) {
        $tags[] = $this->classifier->classify(self::$classSet, $doc);
    }

    // merge O and EOW subtokens into real tokens
    $realtokens = array();
    $currentToken = array();
    foreach ($tokens as $offset=>$tok) {
        $currentToken[] = $tok;
        if ($tags[$offset] == self::EOW) {
            $realtokens[] = implode($this->sep,$currentToken);
            $currentToken = array();
        }
    }

    // if the classifier never emitted a final EOW, the trailing
    // partial token used to be silently dropped — flush it instead
    if (!empty($currentToken)) {
        $realtokens[] = implode($this->sep,$currentToken);
    }

    // return real tokens
    return $realtokens;
}
/**
 * Initializes the patterns and their replacements.
 *
 * The rules are a PCRE port of the Penn Treebank tokenizer sed
 * script: punctuation is split off, double quotes become `` / ''
 * pairs, and common English contractions/clitics are separated.
 */
protected function initPatternReplacement()
{
    $this->addPatternAndReplacement('^"', '``');
    // NOTE(review): the sed-style escaped parens below match *literal*
    // "(" and ")" characters under PCRE rather than grouping; verify
    // this rule against the original sed script before changing it.
    $this->addPatternAndReplacement("\([ ([{<]\)","$1 `` ");
    $this->addPatternAndReplacement("\.\.\."," ... ");
    $this->addPatternAndReplacement("([,;:@#$%&])", " $1 ");
    $this->addPatternAndReplacement("([^.])([.])([])}>\"\']*)[ ]*$","\${1} \${2}\${3}");
    $this->addPatternAndReplacement("[?!]"," $0 ");
    $this->addPatternAndReplacement("[][(){}<>]"," $0 ");
    $this->addPatternAndReplacement("--"," -- ");
    $this->addPatternAndReplacement("\""," '' ");

    $this->addPatternAndReplacement("([^'])' ","\${1} ' ");
    $this->addPatternAndReplacement("'([sSmMdD]) "," '\${1} ");
    $this->addPatternAndReplacement("'ll "," 'll ");
    $this->addPatternAndReplacement("'re "," 're ");
    $this->addPatternAndReplacement("'ve "," 've ");
    $this->addPatternAndReplacement("n't "," n't ");
    $this->addPatternAndReplacement("'LL "," 'LL ");
    $this->addPatternAndReplacement("'RE "," 'RE ");
    $this->addPatternAndReplacement("'VE "," 'VE ");
    $this->addPatternAndReplacement("N'T "," N'T ");

    // "\1" inside a double-quoted PHP string is the octal escape
    // chr(1), NOT a backreference, so the old replacement inserted a
    // control character; use the escaped \${1} form like every
    // sibling rule does.
    $this->addPatternAndReplacement(" ([Cc])annot "," \${1}an not ");
    $this->addPatternAndReplacement(" ([Dd])'ye "," \${1}' ye ");
    $this->addPatternAndReplacement(" ([Gg])imme "," \${1}im me ");
    $this->addPatternAndReplacement(" ([Gg])onna "," \${1}on na ");
    $this->addPatternAndReplacement(" ([Gg])otta "," \${1}ot ta ");
    $this->addPatternAndReplacement(" ([Ll])emme "," \${1}em me ");
    $this->addPatternAndReplacement(" ([Mm])ore'n "," \${1}ore 'n ");
    $this->addPatternAndReplacement(" '([Tt])is "," '\${1} is ");
    $this->addPatternAndReplacement(" '([Tt])was "," '\${1} was ");
    $this->addPatternAndReplacement(" ([Ww])anna "," \${1}an na ");

    // collapse runs of spaces and strip leading whitespace
    $this->addPatternAndReplacement(" *"," ");
    $this->addPatternAndReplacement("^ *","");

}
/**
 * Execute the SPLIT mode: each current token is split on $pattern
 * and the pieces (minus empty strings) become the new token list.
 *
 * @param array  &$str    The tokens to be further tokenized
 * @param string $pattern A preg_split() compatible pattern
 */
protected function split(array &$str, $pattern)
{
    $tokens = array();
    foreach ($str as $s) {
        $tokens = array_merge(
            $tokens,
            // -1 means "no limit"; passing null for the int $limit
            // parameter is deprecated as of PHP 8.1
            preg_split($pattern, $s, -1, PREG_SPLIT_NO_EMPTY)
        );
    }

    $str = $tokens;
}
/**
 * Register a set of transformations for a given class.
 *
 * All candidates are validated before any of them is stored, so a
 * bad entry can never leave a partially registered set behind.
 *
 * @param string $class
 * @param array|TransformationInterface $transforms Either an array of transformations or a single transformation
 * @throws \InvalidArgumentException If any candidate is not a TransformationInterface
 */
public function register($class, $transforms)
{
    $candidates = is_array($transforms) ? $transforms : array($transforms);

    // validate first ...
    foreach ($candidates as $candidate) {
        if (!($candidate instanceof TransformationInterface)) {
            throw new \InvalidArgumentException("Only instances of TransformationInterface can be registered");
        }
    }

    if (!isset($this->transforms[$class])) {
        $this->classes[] = $class;
        $this->transforms[$class] = array();
    }

    // ... commit afterwards
    $this->transforms[$class] = array_merge($this->transforms[$class], $candidates);
}
abstract class Normalizer implements TransformationInterface
{
    /**
     * Transform the word to its canonical (normalized) form, e.g.
     * "The" -> "the", "WhAtEvEr" -> "whatever".
     *
     * @param string $w The word to normalize
     * @return string
     */
    abstract public function normalize($w);

    /**
     * {@inheritdoc}
     */
    public function transform($w)
    {
        return $this->normalize($w);
    }

    /**
     * Apply the normalize function to every item in the array.
     *
     * @param array $items
     * @return array
     */
    public function normalizeAll(array $items)
    {
        $normalized = array();
        foreach ($items as $key => $item) {
            $normalized[$key] = $this->normalize($item);
        }

        return $normalized;
    }

    /**
     * Convenience factory that instantiates a normalizer by language
     * name. Using it is NOT required — the constructors remain
     * visible.
     *
     * @param string $language
     * @return Normalizer
     */
    public static function factory($language = "English")
    {
        $classname = __NAMESPACE__."\\$language";

        return new $classname();
    }
}
/**
 * A token's weight should be its count divided by the total number
 * of tokens: 'an' occurs once among 9 tokens, so 1/9.
 *
 * NOTE(review): 0.111 is compared without an explicit delta —
 * presumably FreqDist::getTokenWeight() rounds to three decimals;
 * confirm against the FreqDist implementation.
 */
public function testSimpleFreqWeight()
{
    $freqDist = new FreqDist(array("time", "flies", "like", "an", "arrow", "time", "flies", "like", "what"));
    $this->assertEquals(1, $freqDist->getTotalByToken('an'));
    $this->assertEquals(0.111, $freqDist->getTokenWeight('an'));
}
0.001 39 | ); 40 | $this->assertEquals( 41 | 1.098, 42 | $idf["non-existing"], 43 | null, 44 | 0.001 45 | ); 46 | $this->assertEquals( 47 | 0, 48 | $idf["a"] 49 | ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tests/NlpTools/Classifiers/EndOfSentenceRules.php: -------------------------------------------------------------------------------- 1 | getDocumentData(); 12 | 13 | $dotcnt = count(explode('.',$token))-1; 14 | $lastdot = substr($token,-1)=='.'; 15 | 16 | if (!$lastdot) // assume that all sentences end in full stops 17 | return 'O'; 18 | 19 | if ($dotcnt>1) // to catch some naive abbreviations (e.g.: U.S.A.) 20 | return 'O'; 21 | 22 | return 'EOW'; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/NlpTools/Clustering/ClusteringTestBase.php: -------------------------------------------------------------------------------- 1 | 0) ? 1 : 0; }; 14 | $pulse = function ($x,$a,$b) use ($u) { return $u($x-$a)-$u($x-$b); }; 15 | 16 | return array( 17 | (int) ( 255*( $pulse($t,0,1/3) + $pulse($t,1/3,2/3)*(2-3*$t) ) ), 18 | (int) ( 255*( $pulse($t,0,1/3)*3*$t + $pulse($t,1/3,2/3) + $pulse($t,2/3,1)*(3-3*$t) ) ), 19 | (int) ( 255*( $pulse($t,1/3,2/3)*(3*$t-1) + $pulse($t,2/3,1) ) ) 20 | ); 21 | } 22 | 23 | /** 24 | * Return a gd handle with a visualization of the clustering or null in case gd is not present. 
/**
 * Return a gd image handle with a visualization of the clustering,
 * or null in case the gd extension is not present.
 *
 * @param mixed      $tset      Array-accessible set of documents whose
 *                              getDocumentData() exposes 'x'/'y' keys
 * @param array      $clusters  Arrays of document indices, one per cluster
 * @param array|null $centroids Optional per-cluster arrays with 'x','y' keys
 * @param bool       $lines     Draw each centroid as a thick line from the
 *                              origin (for cosine-style centroids) instead
 *                              of a small circle (euclidean)
 * @param int        $emphasize If >0, draw each point as a filled disc of
 *                              this diameter instead of a single pixel
 * @param int        $w         Image width in pixels
 * @param int        $h         Image height in pixels
 * @return resource|null The gd handle or null when gd is unavailable
 */
protected function drawClusters($tset, $clusters, $centroids=null, $lines=False,$emphasize=0,$w=300,$h=200)
{
    if (!function_exists('imagecreate'))
        return null;

    $im = imagecreatetruecolor($w,$h);
    $white = imagecolorallocate($im,255,255,255);
    $colors = array();
    $NC = count($clusters);
    // one distinct color per cluster, sampled evenly along the color map
    for ($i=1;$i<=$NC;$i++) {
        list($r,$g,$b) = $this->getColor($i/$NC);
        $colors[] = imagecolorallocate($im,$r,$g,$b);
    }

    imagefill($im,0,0,$white);
    foreach ($clusters as $cid=>$cluster) {
        foreach ($cluster as $idx) {
            $data = $tset[$idx]->getDocumentData();
            if ($emphasize>0)
                imagefilledarc($im,$data['x'],$data['y'],$emphasize,$emphasize,0,360,$colors[$cid],0);
            else
                imagesetpixel($im,$data['x'],$data['y'],$colors[$cid]);
        }
        if (is_array($centroids)) {
            $x = $centroids[$cid]['x'];
            $y = $centroids[$cid]['y'];
            if ($lines) {
                // draw line
                // for cosine similarity
                imagesetthickness($im,5);
                imageline($im,0,0,$x*400,$y*400,$colors[$cid]);
            } else {
                // draw circle for euclidean
                imagefilledarc($im,$x,$y,10,10,0,360,$colors[$cid],0);
            }
        }
    }

    return $im;
}
/**
 * Return a gd image handle with a visualization of the given
 * dendrogram, or null if gd is not present.
 *
 * Leaves are laid out evenly along the bottom edge; each internal
 * node is drawn one vertical step above its deepest child, with
 * bracket-shaped blue lines joining the two subclusters.
 *
 * @param mixed $tset       The clustered set (only its count is used,
 *                          to space the leaves)
 * @param array $dendrogram Nested pair-arrays of leaf labels
 * @param int   $w          Image width in pixels
 * @param int   $h          Image height in pixels
 * @return resource|null The gd handle or null when gd is unavailable
 */
protected function drawDendrogram($tset, $dendrogram, $w=300, $h=200)
{
    if (!function_exists('imagecreate'))
        return null;

    $im = imagecreatetruecolor($w,$h);
    $white = imagecolorallocate($im, 255,255,255);
    $black = imagecolorallocate($im, 0,0,0);
    $blue = imagecolorallocate($im, 0,0,255);
    imagefill($im, 0,0, $white);

    // padding 5%
    $padding = round(0.05*$w);
    // equally distribute the leaves along the horizontal axis
    $d = ($w-2*$padding)/count($tset);
    // recursive closure: depth of the nested-array dendrogram
    // (a bare leaf counts as depth 1)
    $count_depth = function ($a) use (&$depth, &$count_depth) {
        if (is_array($a)) {
            return max(
                array_map(
                    $count_depth,
                    $a
                )
            ) + 1;
        } else {
            return 1;
        }
    };
    $depth = $count_depth($dendrogram)-1;
    // vertical step per merge level
    $d_v = ($h-2*$padding)/$depth;

    // offset from bottom
    $y = $h-$padding;
    $left = $padding;

    // recursively draw a subcluster; returns the (x, y) anchor where
    // the parent bracket should attach. $left advances one slot per leaf.
    $draw_subcluster = function ($dendrogram, &$left) use (&$im, $d, $y, $d_v, $black, &$draw_subcluster,$blue) {
        if (!is_array($dendrogram)) {
            imagestring($im, 1, $left-(2 * strlen($dendrogram)), $y, $dendrogram, $black);
            $left += $d;

            return array($left - $d,$y-5);
        }
        list($l,$yl) = $draw_subcluster($dendrogram[0],$left);
        list($r,$yr) = $draw_subcluster($dendrogram[1],$left);
        // the joining bar sits one step above the higher child
        $ym = min($yl,$yr)-$d_v;
        imageline($im, $l, $yl, $l, $ym, $blue);
        imageline($im, $r, $yr, $r, $ym, $blue);
        imageline($im, $l, $ym, $r, $ym, $blue);

        return array($l+($r-$l)/2,$ym);
    };

    // a single-element outer array wraps the real root
    if (count($dendrogram)==1)
        $draw_subcluster($dendrogram[0],$left);
    else
        $draw_subcluster($dendrogram,$left);

    return $im;
}
/**
 * Cluster the points (0,1),(1,1),(2,1),(3,1),(4,1),(7,1) with the
 * complete link strategy and check that the merges are produced in
 * the expected order; once every merge has been consumed the
 * strategy's heap must be empty and throw.
 */
public function testCompleteLink()
{
    $docs = array(
        array('x'=>0,'y'=>1),
        array('x'=>1,'y'=>1),
        array('x'=>2,'y'=>1),
        array('x'=>3,'y'=>1),
        array('x'=>4,'y'=>1),
        array('x'=>7,'y'=>1)
    );

    $cl = new CompleteLink();
    $cl->initializeStrategy(new Euclidean(), $docs);

    $expectedMerges = array(
        array(0,1),
        array(2,3),
        array(2,4),
        array(0,2),
        array(0,5),
    );
    foreach ($expectedMerges as $expected) {
        $this->assertEquals($expected, $cl->getNextMerge());
    }

    $this->setExpectedException(
        "RuntimeException",
        "Can't extract from an empty heap"
    );
    $cl->getNextMerge();
}
/**
 * dendrogramToClusters() should flatten a nested dendrogram into
 * exactly the requested number of flat clusters.
 */
public function testDendrogramToClusters()
{
    $cases = array(
        array(
            array(array(0,1),array(array(2,3),4)),
            array(array(0,1),array(2,3,4))
        ),
        array(
            array(array(0,array(1,array(2,array(3,array(4,array(5,array(6,7)))))))),
            array(array(0),array(1),array(2),array(3,4,5,6,7))
        )
    );

    foreach ($cases as $i=>$case) {
        list($dendrogram, $expected) = $case;
        $actual = Hierarchical::dendrogramToClusters(
            $dendrogram,
            count($expected)
        );
        $this->assertEquals(
            $expected,
            $actual,
            "Error transforming dendrogram $i"
        );
    }
}
$hc->cluster($tset,new DataAsFeatures()); 332 | $dg = $this->drawDendrogram( 333 | $tset, 334 | $dendrogram, 335 | 600 // width 336 | ); 337 | 338 | $clusters = Hierarchical::dendrogramToClusters($dendrogram,2); 339 | $im = $this->drawClusters( 340 | $tset, 341 | $clusters, 342 | null, // no centroids 343 | false, // no lines 344 | 10 // emphasize points (for little points) 345 | ); 346 | 347 | if ($dg) 348 | imagepng($dg, TEST_DATA_DIR."/Clustering/HierarchicalTest/dendrogram.png"); 349 | if ($im) 350 | imagepng($im, TEST_DATA_DIR."/Clustering/HierarchicalTest/clusters.png"); 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /tests/NlpTools/Clustering/KmeansTest.php: -------------------------------------------------------------------------------- 1 | addDocument( 35 | 'A', 36 | EuclideanPoint::getRandomPointAround(100,100,45) 37 | ); 38 | } 39 | for ($i=0;$i<500;$i++) { 40 | $tset->addDocument( 41 | 'B', 42 | EuclideanPoint::getRandomPointAround(200,100,45) 43 | ); 44 | } 45 | 46 | list($clusters,$centroids,$distances) = $clust->cluster($tset,new DataAsFeatures()); 47 | 48 | $im = $this->drawClusters( 49 | $tset, 50 | $clusters, 51 | $centroids, 52 | false // lines or not 53 | ); 54 | 55 | if ($im) 56 | imagepng($im,TEST_DATA_DIR."/Clustering/KmeansTest/clusters.png"); 57 | 58 | // since the dataset is artificial and clearly separated, the kmeans 59 | // algorithm should always cluster it correctly 60 | foreach ($clusters as $clust) { 61 | $classes = array(); 62 | foreach ($clust as $point_idx) { 63 | $class = $tset[$point_idx]->getClass(); 64 | if (!isset($classes[$class])) 65 | $classes[$class] = true; 66 | } 67 | // assert that all the documents (points) in this cluster belong 68 | // in the same class 69 | $this->assertCount( 70 | 1, 71 | $classes 72 | ); 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /tests/NlpTools/Documents/EuclideanPoint.php: 
-------------------------------------------------------------------------------- 1 | x = $x; 15 | $this->y = $y; 16 | } 17 | public function getDocumentData() 18 | { 19 | return array( 20 | 'x'=>$this->x, 21 | 'y'=>$this->y 22 | ); 23 | } 24 | 25 | public static function getRandomPointAround($x,$y,$R) 26 | { 27 | return new EuclideanPoint( 28 | $x+mt_rand(-$R,$R), 29 | $y+mt_rand(-$R,$R) 30 | ); 31 | } 32 | 33 | public function applyTransformation(TransformationInterface $transform) 34 | { 35 | $this->x = $transform->transform($this->x); 36 | $this->y = $transform->transform($this->y); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/NlpTools/Documents/TransformationsTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 24 | $tokens, 25 | $doc->getDocumentData() 26 | ); 27 | $doc->applyTransformation($transformer); 28 | $this->assertEquals( 29 | $tokens, 30 | $doc->getDocumentData() 31 | ); 32 | 33 | $tdoc = new TrainingDocument("", new TokensDocument($tokens)); 34 | $tdoc->applyTransformation($transformer); 35 | $this->assertEquals( 36 | $tokens, 37 | $tdoc->getDocumentData() 38 | ); 39 | } 40 | 41 | /** 42 | * @dataProvider provideTokens 43 | */ 44 | public function testWordDocument($tokens) 45 | { 46 | $transformer = new IdentityTransformer(); 47 | $doc = new WordDocument($tokens,count($tokens)/2, 2); 48 | $correct = $doc->getDocumentData(); 49 | $doc->applyTransformation($transformer); 50 | $this->assertEquals( 51 | $correct, 52 | $doc->getDocumentData() 53 | ); 54 | 55 | $tdoc = new TrainingDocument("", new WordDocument($tokens,count($tokens)/2, 2)); 56 | $tdoc->applyTransformation($transformer); 57 | $this->assertEquals( 58 | $correct, 59 | $tdoc->getDocumentData() 60 | ); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /tests/NlpTools/Documents/WordDocumentTest.php: 
-------------------------------------------------------------------------------- 1 | tokens = array("The","quick","brown","fox","jumped","over","the","lazy","dog"); 15 | } 16 | 17 | /** 18 | * Test that the WordDocument correctly represents the ith token 19 | */ 20 | public function testTokenSelection() 21 | { 22 | foreach ($this->tokens as $i=>$t) { 23 | // no context 24 | $doc = new WordDocument($this->tokens, $i, 0); 25 | list($w,$prev,$next) = $doc->getDocumentData(); 26 | 27 | $this->assertEquals( 28 | $t, 29 | $w, 30 | "The {$i}th token should be $t not $w" 31 | ); 32 | 33 | // no context means prev,next are empty 34 | $this->assertCount( 35 | 0, 36 | $prev 37 | ); 38 | $this->assertCount( 39 | 0, 40 | $next 41 | ); 42 | } 43 | } 44 | 45 | /** 46 | * Start with the 5th word and increase the amount of context 47 | * until it reaches the edges of the token list. Check the 48 | * previous tokens. 49 | */ 50 | public function testPrevContext() 51 | { 52 | for ($i=0;$i<5;$i++) { 53 | $doc = new WordDocument($this->tokens, 4, $i); 54 | list($_,$prev,$_) = $doc->getDocumentData(); 55 | 56 | $this->assertCount( 57 | $i, 58 | $prev, 59 | "With $i words context prev should be $i words long" 60 | ); 61 | for ( 62 | $j=3,$y=$i-1; 63 | $j>=4-$i; 64 | $y--,$j--) { 65 | $this->assertEquals( 66 | $this->tokens[$j], 67 | $prev[$y] 68 | ); 69 | } 70 | } 71 | } 72 | 73 | /** 74 | * Start with the 5th word and increase the amount of context 75 | * until it reaches the edges of the token list. Check the 76 | * next tokens. 
77 | */ 78 | public function testNextContext() 79 | { 80 | for ($i=0;$i<5;$i++) { 81 | $doc = new WordDocument($this->tokens, 4, $i); 82 | list($_,$_,$next) = $doc->getDocumentData(); 83 | 84 | $this->assertCount( 85 | $i, 86 | $next, 87 | "With $i words context next should be $i words long" 88 | ); 89 | for ($j=5; $j<5+$i; $j++) { 90 | $this->assertEquals( 91 | $this->tokens[$j], 92 | $next[$j-5] 93 | ); 94 | } 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /tests/NlpTools/Models/LdaTest.php: -------------------------------------------------------------------------------- 1 | markTestSkipped("The gd library is not available"); 28 | } 29 | 30 | $this->path = TEST_DATA_DIR."/Models/LdaTest"; 31 | if (!file_exists($this->path)) { 32 | if (!file_exists(TEST_DATA_DIR."/Models")) 33 | mkdir(TEST_DATA_DIR."/Models"); 34 | mkdir($this->path); 35 | } 36 | 37 | if (!file_exists("{$this->path}/topics")) { 38 | mkdir("{$this->path}/topics"); 39 | } 40 | $this->createTopics(); 41 | 42 | if (!file_exists("{$this->path}/data")) { 43 | mkdir("{$this->path}/data"); 44 | } 45 | if (count(new \DirectoryIterator("{$this->path}/data"))<502) { 46 | $this->createData(); 47 | } 48 | 49 | if (!file_exists("{$this->path}/results")) { 50 | mkdir("{$this->path}/results"); 51 | } 52 | 53 | $this->loadData(); 54 | } 55 | 56 | /** 57 | * @group Slow 58 | * @group VerySlow 59 | */ 60 | public function testLda() 61 | { 62 | $lda = new Lda( 63 | new DataAsFeatures(), // feature factory 64 | 10, // number of topics 65 | 1, // dirichlet prior per doc topic dist 66 | 1 // dirichlet prior per word topic dist 67 | ); 68 | 69 | $this->assertInstanceOf( 70 | "NlpTools\Models\Lda", 71 | $lda 72 | ); 73 | 74 | $docs = $lda->generateDocs($this->tset); 75 | $this->assertCount( 76 | count($this->tset), 77 | $docs 78 | ); 79 | 80 | $lda->initialize($docs); 81 | 82 | for ($i=0;$i<100;$i++) { 83 | $lda->gibbsSample($docs); 84 | $topics = $lda->getPhi(); 
85 | echo $lda->getLogLikelihood(),PHP_EOL; 86 | foreach ($topics as $t=>$topic) { 87 | $name = sprintf("{$this->path}/results/topic-%04d-%04d",$i,$t); 88 | $max = max($topic); 89 | $this->createImage( 90 | array_map( 91 | function ($x) use ($topic,$max) { 92 | return array_map( 93 | function ($y) use ($x,$topic,$max) { 94 | return (int) (($topic[$y*5+$x]/$max)*255); 95 | }, 96 | range(0,4) 97 | ); 98 | }, 99 | range(0,4) 100 | ), 101 | $name 102 | ); 103 | } 104 | } 105 | 106 | // TODO: assert the resemblance of the inferred topics 107 | // with the actual topics 108 | } 109 | 110 | // WARNING: Massive set up code follows 111 | // Lda is one of the hardest models to test. 112 | // This functional test is the test the creators of Lda 113 | // performed themselves. 114 | // 115 | // TODO: Unit testing for lda is needed 116 | 117 | protected function createTopics() 118 | { 119 | $topics = array( 120 | array( 121 | array(1,1,1,1,1), 122 | array(0,0,0,0,0), 123 | array(0,0,0,0,0), 124 | array(0,0,0,0,0), 125 | array(0,0,0,0,0) 126 | ), 127 | array( 128 | array(0,0,0,0,0), 129 | array(1,1,1,1,1), 130 | array(0,0,0,0,0), 131 | array(0,0,0,0,0), 132 | array(0,0,0,0,0) 133 | ), 134 | array( 135 | array(0,0,0,0,0), 136 | array(0,0,0,0,0), 137 | array(1,1,1,1,1), 138 | array(0,0,0,0,0), 139 | array(0,0,0,0,0) 140 | ), 141 | array( 142 | array(0,0,0,0,0), 143 | array(0,0,0,0,0), 144 | array(0,0,0,0,0), 145 | array(1,1,1,1,1), 146 | array(0,0,0,0,0) 147 | ), 148 | array( 149 | array(0,0,0,0,0), 150 | array(0,0,0,0,0), 151 | array(0,0,0,0,0), 152 | array(0,0,0,0,0), 153 | array(1,1,1,1,1) 154 | ), 155 | array( 156 | array(0,0,0,0,1), 157 | array(0,0,0,0,1), 158 | array(0,0,0,0,1), 159 | array(0,0,0,0,1), 160 | array(0,0,0,0,1) 161 | ), 162 | array( 163 | array(0,0,0,1,0), 164 | array(0,0,0,1,0), 165 | array(0,0,0,1,0), 166 | array(0,0,0,1,0), 167 | array(0,0,0,1,0) 168 | ), 169 | array( 170 | array(0,0,1,0,0), 171 | array(0,0,1,0,0), 172 | array(0,0,1,0,0), 173 | 
array(0,0,1,0,0), 174 | array(0,0,1,0,0) 175 | ), 176 | array( 177 | array(0,1,0,0,0), 178 | array(0,1,0,0,0), 179 | array(0,1,0,0,0), 180 | array(0,1,0,0,0), 181 | array(0,1,0,0,0) 182 | ), 183 | array( 184 | array(1,0,0,0,0), 185 | array(1,0,0,0,0), 186 | array(1,0,0,0,0), 187 | array(1,0,0,0,0), 188 | array(1,0,0,0,0) 189 | ) 190 | ); 191 | 192 | $this->topics = array_map( 193 | function ($topic) { 194 | $t = call_user_func_array( 195 | "array_merge", 196 | $topic 197 | ); 198 | 199 | $s = array_sum($t); 200 | 201 | return array_map( 202 | function ($ti) use ($s) { 203 | return $ti/$s; 204 | }, 205 | $t 206 | ); 207 | }, 208 | $topics 209 | ); 210 | 211 | // multiply by 255 to make gray-scale images of 212 | // the above arrays 213 | $topics = array_map( 214 | function ($topic) { 215 | return array_map( 216 | function ($row) { 217 | return array_map( 218 | function ($pixel) { 219 | return (int) (255*$pixel); 220 | }, 221 | $row 222 | ); 223 | }, 224 | $topic 225 | ); 226 | }, 227 | $topics 228 | ); 229 | 230 | // save them to disk 231 | foreach ($topics as $key=>$topic) { 232 | $this->createImage($topic, "{$this->path}/topics/topic-$key"); 233 | } 234 | } 235 | 236 | protected function createData() 237 | { 238 | $dir = new Dirichlet(1, count($this->topics)); 239 | 240 | for ($i=0;$i<500;$i++) { 241 | $d = $this->createDocument($this->topics, $dir->sample(), 100); 242 | $this->createImage($d, "{$this->path}/data/$i"); 243 | } 244 | } 245 | 246 | protected function loadData() 247 | { 248 | $this->tset = new TrainingSet(); 249 | foreach (new \DirectoryIterator("{$this->path}/data") as $f) { 250 | if ($f->isDir()) 251 | continue; 252 | 253 | $this->tset->addDocument( 254 | "", 255 | new TokensDocument( 256 | $this->fromImg($f->getRealPath()) 257 | ) 258 | ); 259 | } 260 | } 261 | 262 | /** 263 | * Save a two dimensional array as a grey-scale image 264 | */ 265 | protected function createImage(array $img,$filename) 266 | { 267 | $im = 
imagecreate(count($img),count(current($img))); 268 | imagecolorallocate($im,0,0,0); 269 | foreach ($img as $y=>$row) { 270 | foreach ($row as $x=>$color) { 271 | $color = min(255,max(0,$color)); 272 | $c = imagecolorallocate($im,$color,$color,$color); 273 | imagesetpixel($im,$x,$y,$c); 274 | } 275 | } 276 | imagepng($im,$filename); 277 | } 278 | 279 | /** 280 | * Draw once from a multinomial distribution 281 | */ 282 | protected function draw($d) 283 | { 284 | $mt = MersenneTwister::get(); // simply mt_rand but in the interval [0,1) 285 | $x = $mt->generate(); 286 | $p = 0.0; 287 | foreach ($d as $i=>$v) { 288 | $p+=$v; 289 | if ($p > $x) 290 | return $i; 291 | } 292 | } 293 | 294 | /** 295 | * Create a document sticking to the model's assumptions 296 | * and hypotheses 297 | */ 298 | public function createDocument($topic_dists,$theta,$length) 299 | { 300 | $doc = array_fill_keys(range(0,24),0); 301 | while ($length-- > 0) { 302 | $topic = $this->draw($theta); 303 | $word = $this->draw($topic_dists[$topic]); 304 | $doc[$word] += 1; 305 | } 306 | 307 | return array_map( 308 | function ($start) use ($doc) { 309 | return array_slice($doc,$start,5); 310 | }, 311 | range(0,24,5) 312 | ); 313 | } 314 | 315 | /** 316 | * Load a document from an image saved to disk 317 | */ 318 | public function fromImg($file) 319 | { 320 | $im = imagecreatefrompng($file); 321 | $d = array(); 322 | for ($w=0;$w<25;$w++) { 323 | $x = (int) ($w%5); 324 | $y = (int) ($w/5); 325 | 326 | $c = imagecolorsforindex($im,imagecolorat($im,$x,$y)); 327 | $c = $c['red']; 328 | if ($c>0) { 329 | $d = array_merge( 330 | $d, 331 | array_fill_keys( 332 | range(0,$c-1), 333 | $w 334 | ) 335 | ); 336 | } 337 | } 338 | 339 | return $d; 340 | } 341 | 342 | } 343 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/CosineSimilarityTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 16 | 1, 17 | 
$sim->similarity($A,$A), 18 | "The cosine similarity of a set/vector with itsself should be 1" 19 | ); 20 | 21 | $this->assertEquals( 22 | 1, 23 | $sim->similarity($A,$A_times_2), 24 | "The cosine similarity of a vector with a linear combination of itsself should be 1" 25 | ); 26 | 27 | $this->assertEquals( 28 | 0, 29 | $sim->similarity($A,$B)-$sim->similarity($A_times_2,$B), 30 | "Parallel vectors should have the same angle with any vector B" 31 | ); 32 | } 33 | 34 | public function testProducedAngles() 35 | { 36 | $sim = new CosineSimilarity(); 37 | 38 | $ba = array(1,1,2,2,2,2); // ba = (2,4) 39 | $bc = array(1,1,1,2,2); // bc = (3,2) 40 | $bba = array('a'=>2,'b'=>4); 41 | $bbc = array('a'=>3,'b'=>2); 42 | $ba_to_bc = cos(0.5191461142); // approximately 30 deg 43 | 44 | $this->assertEquals( 45 | $ba_to_bc, 46 | $sim->similarity($ba,$bc) 47 | ); 48 | 49 | $this->assertEquals( 50 | $ba_to_bc, 51 | $sim->similarity($bba,$bbc) 52 | ); 53 | } 54 | 55 | public function testInvalidArgumentException() 56 | { 57 | $sim = new CosineSimilarity(); 58 | $a = array(1); 59 | $zero = array(); 60 | try { 61 | $sim->similarity( 62 | $a, 63 | $zero 64 | ); 65 | $this->fail("Cosine similarity with the zero vector should trigger an exception"); 66 | } catch (\InvalidArgumentException $e) { 67 | $this->assertEquals( 68 | "Vector \$B is the zero vector", 69 | $e->getMessage() 70 | ); 71 | } 72 | try { 73 | $sim->similarity( 74 | $zero, 75 | $a 76 | ); 77 | $this->fail("Cosine similarity with the zero vector should trigger an exception"); 78 | } catch (\InvalidArgumentException $e) { 79 | $this->assertEquals( 80 | "Vector \$A is the zero vector", 81 | $e->getMessage() 82 | ); 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/DiceSimilarityTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 16 | 1, 17 | $sim->similarity($A,$A), 18 | "The similarity 
of a set with itsself is 1" 19 | ); 20 | 21 | $this->assertEquals( 22 | 0, 23 | $sim->similarity($A,$e), 24 | "The similarity of any set with the empty set is 0" 25 | ); 26 | 27 | $this->assertEquals( 28 | 0.75, 29 | $sim->similarity($A,$B), 30 | "similarity({'my','name','is','john'},{'my','name','is','joe'}) = 0.75" 31 | ); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/HammingDistanceTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 17 | max(strlen($A),strlen($B)), 18 | $dist->dist($A,$B), 19 | "Two completely dissimilar strings should have distance equal to max(strlen(\$A),strlen(\$B))" 20 | ); 21 | 22 | $this->assertEquals( 23 | 2, 24 | $dist->dist($C,$D), 25 | "10101 ~ 11111 have a hamming distance = 2" 26 | ); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/JaccardIndexTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 16 | 1, 17 | $sim->similarity($A,$A), 18 | "The similarity of a set with itsself is 1" 19 | ); 20 | 21 | $this->assertEquals( 22 | 0, 23 | $sim->similarity($A,$e), 24 | "The similarity of any set with the empty set is 0" 25 | ); 26 | 27 | $this->assertEquals( 28 | 0.5, 29 | $sim->similarity($A,$B), 30 | "J({1,2,3},{1,2,3,4,5,6}) = 0.5" 31 | ); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/OverlapCoefficientTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 16 | 1, 17 | $sim->similarity($A,$A), 18 | "The similarity of a set with itsself is 1" 19 | ); 20 | 21 | $this->assertEquals( 22 | 0, 23 | $sim->similarity($A,$e), 24 | "The similarity of any set with the empty set is 0" 25 | ); 26 | 27 | $this->assertEquals( 28 | 0.5, 29 | 
$sim->similarity($A,$B), 30 | "similarity({'my','name','is','john'},{'your','name','is','joe'}) = 0.5" 31 | ); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/SimhashTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 17 | 1, 18 | $sim->similarity($A,$A), 19 | "Two identical sets should have the same hash therefore a similarity of 1" 20 | ); 21 | 22 | $this->assertGreaterThan( 23 | $sim->similarity($A,$B), 24 | $sim->similarity($b,$B), 25 | "The more elements in common the more similar the two sets should be" 26 | ); 27 | } 28 | 29 | public function testWeightedSets() 30 | { 31 | $sim = new Simhash(64); 32 | 33 | $A = array("a","a","a","b","b",); 34 | $B = array("a"=>3,"b"=>2); 35 | 36 | $this->assertEquals( 37 | 1, 38 | $sim->similarity($A,$B), 39 | "The two sets are identical given that one is the weighted version of the other" 40 | ); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/TverskyIndexTest.php: -------------------------------------------------------------------------------- 1 | similarity($A, $B); 12 | } 13 | 14 | public function testTverskyIndex() 15 | { 16 | $sim = new TverskyIndex(); 17 | 18 | $A = array("my","name","is","john"); 19 | $B = array("my","name","is","joe"); 20 | $C = array(1,2,3); 21 | $D = array(1,2,3,4,5,6); 22 | $e = array(); 23 | 24 | $this->assertEquals( 25 | 1, 26 | $this->sim($A,$A, 0.5, 1), 27 | "The similarity of a set with itsself is 1" 28 | ); 29 | 30 | $this->assertEquals( 31 | 0, 32 | $this->sim($A,$e, 0.5, 2), 33 | "The similarity of any set with the empty set is 0" 34 | ); 35 | 36 | $this->assertEquals( 37 | 0.75, 38 | $this->sim($A,$B, 0.5, 1), 39 | "similarity({'my','name','is','john'},{'my','name','is','joe'}) = 0.75" 40 | ); 41 | 42 | $this->assertEquals( 43 | 0.5, 44 | $this->sim($C,$D, 0.5, 2), 45 | 
"similarity({1,2,3},{1,2,3,4,5,6}) = 0.5" 46 | ); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/NlpTools/Stemmers/GreekStemmerTest.php: -------------------------------------------------------------------------------- 1 | setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY); 19 | $stems->setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY); 20 | $stems->rewind(); 21 | 22 | $stemmer = new GreekStemmer(); 23 | $this->checkStemmer($stemmer, $words, $stems); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tests/NlpTools/Stemmers/LancasterStemmerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('maxim', $stemmer->stem('maximum')); 15 | $this->assertEquals('presum', $stemmer->stem('presumably')); 16 | $this->assertEquals('multiply', $stemmer->stem('multiply')); 17 | $this->assertEquals('provid', $stemmer->stem('provision')); 18 | $this->assertEquals('ow', $stemmer->stem('owed')); 19 | $this->assertEquals('ear', $stemmer->stem('ear')); 20 | $this->assertEquals('say', $stemmer->stem('saying')); 21 | $this->assertEquals('cry', $stemmer->stem('crying')); 22 | $this->assertEquals('string', $stemmer->stem('string')); 23 | $this->assertEquals('meant', $stemmer->stem('meant')); 24 | $this->assertEquals('cem', $stemmer->stem('cement')); 25 | } 26 | 27 | /** 28 | * Added to cover issue #34 29 | */ 30 | public function testEmptyStringForWord() 31 | { 32 | $stemmer = new LancasterStemmer(); 33 | $this->assertEquals("", $stemmer->stem("")); 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /tests/NlpTools/Stemmers/PorterStemmerTest.php: -------------------------------------------------------------------------------- 1 | setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY); 24 | 
$stems->setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY); 25 | $stems->rewind(); 26 | 27 | $stemmer = new PorterStemmer(); 28 | $this->checkStemmer($stemmer, $words, $stems); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tests/NlpTools/Stemmers/StemmerTestBase.php: -------------------------------------------------------------------------------- 1 | current(); 16 | $this->assertEquals( 17 | $stemmer->stem($word), 18 | $stem, 19 | "The stem for '$word' should be '$stem' not '{$stemmer->stem($word)}'" 20 | ); 21 | $stems->next(); 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/NlpTools/Stemmers/TransformationTest.php: -------------------------------------------------------------------------------- 1 | stemAll($tokens); 24 | $doc = new TokensDocument($tokens); 25 | 26 | $this->assertNotEquals( 27 | $stemmed, 28 | $doc->getDocumentData() 29 | ); 30 | 31 | $doc->applyTransformation($stemmer); 32 | $this->assertEquals( 33 | $stemmed, 34 | $doc->getDocumentData() 35 | ); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tests/NlpTools/Tokenizers/ClassifierBasedTokenizerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 20 | array( 21 | "We are what we repeatedly do.", 22 | "Excellence, then, is not an act, but a habit." 23 | ), 24 | $tok->tokenize($text) 25 | ); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tests/NlpTools/Tokenizers/PennTreeBankTokenizerTest.php: -------------------------------------------------------------------------------- 1 | tokenize("Good muffins cost $3.88\nin New York. 
Please buy me\ntwo of them.\nThanks."); 16 | $this->assertCount(16, $tokens); 17 | } 18 | 19 | public function testTokenizer2() 20 | { 21 | $tokenizer = new PennTreeBankTokenizer(); 22 | $this->assertCount(7, $tokenizer->tokenize("They'll save and invest more.")); 23 | } 24 | 25 | public function testTokenizer3() 26 | { 27 | $tokenizer = new PennTreeBankTokenizer(); 28 | $this->assertCount(4, $tokenizer->tokenize("I'm some text")); 29 | } 30 | 31 | public function testAgainstOriginalSedImplementation() 32 | { 33 | $tokenizer = new PennTreeBankTokenizer(); 34 | $tokenized = new \SplFileObject(TEST_DATA_DIR."/Tokenizers/PennTreeBankTokenizerTest/tokenized"); 35 | $tokenized->setFlags(\SplFileObject::DROP_NEW_LINE); 36 | $sentences = new \SplFileObject(TEST_DATA_DIR."/Tokenizers/PennTreeBankTokenizerTest/test.txt"); 37 | $sentences->setFlags(\SplFileObject::DROP_NEW_LINE); 38 | 39 | $tokenized->rewind(); 40 | foreach ($sentences as $sentence) { 41 | if ($sentence) // skip empty lines 42 | { 43 | $this->assertEquals( 44 | $tokenized->current(), 45 | implode(" ",$tokenizer->tokenize($sentence)), 46 | "Sentence: '$sentence' was not tokenized correctly" 47 | ); 48 | } 49 | $tokenized->next(); 50 | } 51 | 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /tests/NlpTools/Tokenizers/RegexTokenizerTest.php: -------------------------------------------------------------------------------- 1 | tokenize("0 1 2 3 4 5 6 7 8 9"); 18 | $this->assertCount(10, $tokens); 19 | $this->assertEquals("0123456789",implode("",$tokens)); 20 | 21 | // check split2 22 | $tok = new RegexTokenizer(array( 23 | "/\n+/" 24 | )); 25 | 26 | $tokens = $tok->tokenize("0 1 2 3 4\n5 6 7 8 9"); 27 | $this->assertCount(2, $tokens); 28 | $this->assertEquals("0 1 2 3 45 6 7 8 9",implode("",$tokens)); 29 | 30 | $tokens = $tok->tokenize("0 1 2 3 4\n\n5 6 7 8 9"); 31 | $this->assertCount(2, $tokens); 32 | $this->assertEquals("0 1 2 3 45 6 7 8 
9",implode("",$tokens)); 33 | 34 | } 35 | 36 | /** 37 | * Test a pattern that captures instead of splits 38 | */ 39 | public function testMatches() 40 | { 41 | // check keep matches 42 | $tok = new RegexTokenizer(array( 43 | array("/(\s+)?(\w+)(\s+)?/",2) 44 | )); 45 | 46 | $tokens = $tok->tokenize("0 1 2 3 4 5 6 7 8 9"); 47 | $this->assertCount(10, $tokens); 48 | $this->assertEquals("0123456789",implode("",$tokens)); 49 | } 50 | 51 | /** 52 | * Test a pattern that firsts replaces all digits with themselves separated 53 | * by a space and then tokenizes on whitespace. 54 | */ 55 | public function testReplace() 56 | { 57 | // check keep matches 58 | $tok = new RegexTokenizer(array( 59 | array("/\d/",'$0 '), 60 | WhitespaceTokenizer::PATTERN 61 | )); 62 | 63 | $tokens = $tok->tokenize("0123456789"); 64 | $this->assertCount(10, $tokens); 65 | $this->assertEquals("0123456789",implode("",$tokens)); 66 | } 67 | 68 | /** 69 | * Test a simple pattern meant to split the full stop from the last 70 | * word of a sentence. 71 | */ 72 | public function testSplitWithManyPatterns() 73 | { 74 | $tok = new RegexTokenizer(array( 75 | WhitespaceTokenizer::PATTERN, // split on whitespace 76 | array("/([^\.])\.$/",'$1 .'), // replace . with . 77 | "/ /" // split on 78 | )); 79 | 80 | // example text stolen from NLTK :-) 81 | $str = "Good muffins cost $3.88\nin New York. 
Please buy me\ntwo of them.\n\nThanks."; 82 | 83 | $tokens = $tok->tokenize($str); 84 | $this->assertCount(17, $tokens); 85 | $this->assertEquals($tokens[3], "$3.88"); 86 | $this->assertEquals($tokens[7], "."); 87 | $this->assertEquals($tokens[14], "."); 88 | $this->assertEquals($tokens[16], "."); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /tests/NlpTools/Tokenizers/WhitespaceAndPuntuationTokenizerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 19 | $tokens, 20 | $tok->tokenize($s) 21 | ); 22 | } 23 | 24 | public function testTokenizerOnUtf8() 25 | { 26 | $tok = new WhitespaceAndPunctuationTokenizer(); 27 | 28 | $s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων"; 29 | $tokens = array('Ελληνικό','κείμενο','για','παράδειγμα','utf','-','8','χαρακτήρων'); 30 | // test tokenization of multibyte non-whitespace characters 31 | $this->assertEquals( 32 | $tokens, 33 | $tok->tokenize($s) 34 | ); 35 | 36 | $s = "Here exists non-breaking space   "; 37 | $tokens = array('Here','exists','non','-','breaking','space'); 38 | // test tokenization of multibyte whitespace 39 | $this->assertEquals( 40 | $tokens, 41 | $tok->tokenize($s) 42 | ); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tests/NlpTools/Tokenizers/WhitespaceTokenizerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 19 | $tokens, 20 | $tok->tokenize($s) 21 | ); 22 | } 23 | 24 | public function testTokenizerOnUtf8() 25 | { 26 | $tok = new WhitespaceTokenizer(); 27 | 28 | $s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων"; 29 | $tokens = array('Ελληνικό','κείμενο','για','παράδειγμα','utf-8','χαρακτήρων'); 30 | // test tokenization of multibyte non-whitespace characters 31 | $this->assertEquals( 32 | $tokens, 33 | $tok->tokenize($s) 34 | ); 35 | 36 | $s = "Here exists non-breaking 
space   "; 37 | $tokens = array('Here','exists','non-breaking','space'); 38 | // test tokenization of multibyte whitespace 39 | $this->assertEquals( 40 | $tokens, 41 | $tok->tokenize($s) 42 | ); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tests/NlpTools/Utils/ClassifierBasedTransformationTest.php: -------------------------------------------------------------------------------- 1 | getDocumentData() % count($classes)]; 13 | } 14 | 15 | public function testEvenAndOdd() 16 | { 17 | $stubEven = $this->getMock("NlpTools\\Utils\\TransformationInterface"); 18 | $stubEven->expects($this->any()) 19 | ->method('transform') 20 | ->will($this->returnValue('even')); 21 | $stubOdd = $this->getMock("NlpTools\\Utils\\TransformationInterface"); 22 | $stubOdd->expects($this->any()) 23 | ->method('transform') 24 | ->will($this->returnValue('odd')); 25 | 26 | $transform = new ClassifierBasedTransformation($this); 27 | $transform->register("even", $stubEven); 28 | $transform->register("odd", $stubOdd); 29 | 30 | $this->assertEquals( 31 | "odd", 32 | $transform->transform(3) 33 | ); 34 | $this->assertEquals( 35 | "even", 36 | $transform->transform(4) 37 | ); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /tests/NlpTools/Utils/EnglishVowelsTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($vowelChecker->isVowel("man", 1)); 14 | } 15 | 16 | public function testYIsVowel() 17 | { 18 | $vowelChecker = VowelsAbstractFactory::factory("English"); 19 | $this->assertTrue($vowelChecker->isVowel("try", 2)); 20 | } 21 | } 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/NlpTools/Utils/IdentityTransformer.php: -------------------------------------------------------------------------------- 1 | assertEquals( 13 | explode(" ","ο μορφωμενοσ διαφερει απο τον αμορφωτο οσο ο ζωντανοσ απο 
τον νεκρο"), 14 | $greek->normalizeAll( 15 | explode(" ","Ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό") 16 | ) 17 | ); 18 | 19 | $this->assertEquals( 20 | explode(" ","ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό"), 21 | $english->normalizeAll( 22 | explode(" ","Ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό") 23 | ) 24 | ); 25 | 26 | $this->assertEquals( 27 | explode(" ","when a father gives to his son both laugh when a son gives to his father both cry" ), 28 | $english->normalizeAll( 29 | explode(" ","When a father gives to his son both laugh when a son gives to his father both cry" ) 30 | ) 31 | ); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/NlpTools/Utils/StopWordsTest.php: -------------------------------------------------------------------------------- 1 | applyTransformation($stopwords); 21 | $this->assertEquals( 22 | array( 23 | "if", "you", "tell", "truth", "you", "do", "not", "have", "remember", "anything" 24 | ), 25 | $doc->getDocumentData() 26 | ); 27 | } 28 | 29 | public function testStopwordsWithTransformation() 30 | { 31 | $stopwords = new StopWords( 32 | array( 33 | "to", 34 | "the" 35 | ), 36 | Normalizer::factory("English") 37 | ); 38 | 39 | $doc = new TokensDocument(explode(" ", "If you Tell The truth You do not have To remember Anything")); 40 | $doc->applyTransformation($stopwords); 41 | $this->assertEquals( 42 | array( 43 | "If", "you", "Tell", "truth", "You", "do", "not", "have", "remember", "Anything" 44 | ), 45 | $doc->getDocumentData() 46 | ); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/README.markdown: -------------------------------------------------------------------------------- 1 | Testing information 2 | =================== 3 | 4 | This readme contains a bit of information regarding writing tests for NlpTools and executing them. 
5 | 6 | Writing Tests 7 | ------------- 8 | 9 | * Test classes should be in the same namespace as the class that is being tested 10 | * Any data needed for the test or produced by the test should be in the 'data' directory 11 | under the same folder as the namespace. Only data needed (not produced) are committed to 12 | the repository. 13 | * Tests should be marked with the groups **Slow** and **VerySlow** if they require more than 14 | 10 seconds and 1 minute respectively. If a test is marked as VerySlow it should also be marked 15 | as Slow. 16 | * Both functional and unit tests are welcome. 17 | 18 | Executing Tests 19 | --------------- 20 | 21 | Currently only one testsuite is defined (all tests). Because some tests take a long time to 22 | run you can try running `phpunit --exclude-group Slow` or `phpunit --exclude-group VerySlow` 23 | to avoid some slow tests. 24 | 25 | PHPUnit should be run from inside the tests folder or the phpunit.xml file should be provided 26 | as config. 27 | -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | ./NlpTools/ 4 | 5 | 6 | -------------------------------------------------------------------------------- /tests/sentiment_maxent.php: -------------------------------------------------------------------------------- 1 | >/tmp/imdb.list 15 | * for f in `ls neg`; do echo `pwd`/neg/$f; done >>/tmp/imdb.list 16 | * shuf /tmp/imdb.list >/tmp/imdb-shuffled.list 17 | * head -n 1800 /tmp/imdb-shuffled.list > train 18 | * tail -n 200 /tmp/imdb-shuffled.list > test 19 | * 20 | * Then call the script like this: 21 | * php -d memory_limit=300M sentiment_maxent.php train test 22 | * 23 | */ 24 | 25 | // include the autoloader 26 | include '../autoloader.php'; 27 | 28 | use NlpTools\Tokenizers\WhitespaceTokenizer; 29 | use NlpTools\FeatureFactories\FunctionFeatures; 30 | use NlpTools\Documents\Document;
31 | use NlpTools\Documents\TokensDocument; 32 | use NlpTools\Documents\TrainingSet; 33 | use NlpTools\Optimizers\ExternalMaxentOptimizer; 34 | use NlpTools\Models\Maxent; 35 | use NlpTools\Classifiers\FeatureBasedLinearClassifier; 36 | 37 | // create needed reusable objects, a tokenizer and a feature factory 38 | $tok = new WhitespaceTokenizer(); 39 | $ff = new FunctionFeatures(); 40 | $ff->add(function ($class, DocumentInterface $d) { 41 | $r = array(); 42 | foreach ($d->getDocumentData() as $tok) 43 | $r[] = $class.$tok; 44 | 45 | return $r; 46 | }); 47 | 48 | // create 49 | // 1. an empty training set 50 | // 2. an optimizer 51 | // 3. an empty model 52 | $tset = new TrainingSet(); 53 | $OPTIMIZER_PATH = isset($_ENV["GD_OPTIMIZER"]) ? $_ENV["GD_OPTIMIZER"] : 'gradient-descent'; 54 | $optimizer = new ExternalMaxentOptimizer($OPTIMIZER_PATH); 55 | $model = new Maxent(array()); 56 | 57 | // argv[1] and argv[2] are paths to files that contain the paths 58 | // to the actual documents. 59 | $train = new SplFileObject($argv[1]); 60 | $test = new SplFileObject($argv[2]); 61 | 62 | // fill in the training set 63 | foreach ($train as $f) { 64 | $f = substr($f,0,-1); 65 | if (strlen($f)==0) 66 | continue; 67 | $class = "neg"; 68 | if (strpos($f,"pos")!==false) { 69 | $class = "pos"; 70 | } 71 | $tset->addDocument( 72 | $class, 73 | new TokensDocument($tok->tokenize(file_get_contents($f))) 74 | ); 75 | } 76 | 77 | // train the model 78 | $model->train($ff,$tset,$optimizer); 79 | 80 | // to use the model we need a classifier 81 | $cls = new FeatureBasedLinearClassifier($ff,$model); 82 | 83 | // evaluate the model 84 | $correct = 0; 85 | $total = 0; 86 | foreach ($test as $f) { 87 | $f = substr($f,0,-1); 88 | if (strlen($f)==0) 89 | continue; 90 | $class = "neg"; 91 | if (strpos($f,"pos")!==false) { 92 | $class = "pos"; 93 | } 94 | $doc = new TokensDocument($tok->tokenize(file_get_contents($f))); 95 | $predicted = $cls->classify(array("pos","neg"),$doc); 96 | if ($predicted 
== $class) { 97 | $correct++; 98 | } 99 | $total++; 100 | } 101 | 102 | printf("Acc: %.2f%%\n",(100*$correct/$total)); 103 | --------------------------------------------------------------------------------