├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.markdown ├── autoloader.php ├── composer.json ├── src └── NlpTools │ ├── Analysis │ ├── FreqDist.php │ └── Idf.php │ ├── Classifiers │ ├── ClassifierInterface.php │ ├── FeatureBasedLinearClassifier.php │ └── MultinomialNBClassifier.php │ ├── Clustering │ ├── CentroidFactories │ │ ├── CentroidFactoryInterface.php │ │ ├── Euclidean.php │ │ ├── Hamming.php │ │ └── MeanAngle.php │ ├── Clusterer.php │ ├── Hierarchical.php │ ├── KMeans.php │ └── MergeStrategies │ │ ├── CompleteLink.php │ │ ├── GroupAverage.php │ │ ├── HeapLinkage.php │ │ ├── MergeStrategyInterface.php │ │ └── SingleLink.php │ ├── Documents │ ├── DocumentInterface.php │ ├── RawDocument.php │ ├── TokensDocument.php │ ├── TrainingDocument.php │ ├── TrainingSet.php │ └── WordDocument.php │ ├── Exceptions │ └── InvalidExpression.php │ ├── FeatureFactories │ ├── DataAsFeatures.php │ ├── FeatureFactoryInterface.php │ └── FunctionFeatures.php │ ├── Models │ ├── FeatureBasedNB.php │ ├── Lda.php │ ├── LinearModel.php │ ├── Maxent.php │ └── MultinomialNBModelInterface.php │ ├── Optimizers │ ├── ExternalMaxentOptimizer.php │ ├── FeatureBasedLinearOptimizerInterface.php │ ├── GradientDescentOptimizer.php │ ├── MaxentGradientDescent.php │ └── MaxentOptimizerInterface.php │ ├── Random │ ├── Distributions │ │ ├── AbstractDistribution.php │ │ ├── Dirichlet.php │ │ ├── Gamma.php │ │ └── Normal.php │ └── Generators │ │ ├── FromFile.php │ │ ├── GeneratorInterface.php │ │ └── MersenneTwister.php │ ├── Similarity │ ├── CosineSimilarity.php │ ├── DiceSimilarity.php │ ├── DistanceInterface.php │ ├── Euclidean.php │ ├── HammingDistance.php │ ├── JaccardIndex.php │ ├── OverlapCoefficient.php │ ├── Simhash.php │ ├── SimilarityInterface.php │ └── TverskyIndex.php │ ├── Stemmers │ ├── GreekStemmer.php │ ├── LancasterStemmer.php │ ├── PorterStemmer.php │ ├── RegexStemmer.php │ └── Stemmer.php │ ├── Tokenizers │ ├── ClassifierBasedTokenizer.php │ ├── PennTreeBankTokenizer.php │ 
├── RegexTokenizer.php │ ├── TokenizerInterface.php │ ├── WhitespaceAndPunctuationTokenizer.php │ └── WhitespaceTokenizer.php │ └── Utils │ ├── ClassifierBasedTransformation.php │ ├── EnglishVowels.php │ ├── Normalizers │ ├── English.php │ ├── Greek.php │ └── Normalizer.php │ ├── StopWords.php │ ├── TransformationInterface.php │ └── VowelsAbstractFactory.php └── tests ├── NlpTools ├── Analysis │ ├── FreqDistTest.php │ └── IdfTest.php ├── Classifiers │ └── EndOfSentenceRules.php ├── Clustering │ ├── ClusteringTestBase.php │ ├── HierarchicalTest.php │ └── KmeansTest.php ├── Documents │ ├── EuclideanPoint.php │ ├── TransformationsTest.php │ └── WordDocumentTest.php ├── Models │ └── LdaTest.php ├── Similarity │ ├── CosineSimilarityTest.php │ ├── DiceSimilarityTest.php │ ├── HammingDistanceTest.php │ ├── JaccardIndexTest.php │ ├── OverlapCoefficientTest.php │ ├── SimhashTest.php │ └── TverskyIndexTest.php ├── Stemmers │ ├── GreekStemmerTest.php │ ├── LancasterStemmerTest.php │ ├── PorterStemmerTest.php │ ├── StemmerTestBase.php │ └── TransformationTest.php ├── Tokenizers │ ├── ClassifierBasedTokenizerTest.php │ ├── PennTreeBankTokenizerTest.php │ ├── RegexTokenizerTest.php │ ├── WhitespaceAndPuntuationTokenizerTest.php │ └── WhitespaceTokenizerTest.php └── Utils │ ├── ClassifierBasedTransformationTest.php │ ├── EnglishVowelsTest.php │ ├── IdentityTransformer.php │ ├── Normalizers │ └── NormalizerTest.php │ └── StopWordsTest.php ├── README.markdown ├── bootstrap.php ├── data ├── .gitignore ├── Stemmers │ ├── GreekStemmerTest │ │ ├── appendix-a-stems │ │ └── appendix-a-words │ └── PorterStemmerTest │ │ ├── stems.txt │ │ └── words.txt └── Tokenizers │ └── PennTreeBankTokenizerTest │ ├── test.txt │ └── tokenized ├── phpunit.xml └── sentiment_maxent.php /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | /nbproject/private/ 3 | nbproject 4 | 
-------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contribution guidelines 2 | =================== 3 | 4 | This document contains guidelines for contributing to NlpTools. 5 | 6 | Coding style 7 | ------------------ 8 | 9 | NlpTools adheres to the [psr-2][1] standard. It also follows the convention of 10 | appending the word *Interface* to any interface. 11 | 12 | To enforce the psr-2 style it is suggested to use the [php-cs-fixer][2] tool. 13 | While you 're at it why not enforce some more styles as well. The fixers used 14 | are the **default** (which are more than the psr-2 level uses) but they will be 15 | explicitly listed here in case they change in the future. 16 | 17 | * indentation 18 | * linefeed 19 | * trailing_spaces 20 | * unused_use 21 | * phpdoc_params 22 | * visibility 23 | * return 24 | * short_tag 25 | * braces 26 | * include 27 | * php_closing_tag 28 | * extra_empty_lines 29 | * psr0 30 | * control_spaces 31 | * elseif 32 | * eof_ending 33 | 34 | The above fixers are the default. 35 | 36 | Commenting Style 37 | -------------------------- 38 | 39 | Every public method must have comments that follow the php doc convention. 40 | @param and @return annotations are mandatory. The comments should be 41 | explanatory not simply rewriting the method's name in a sentence. If the method 42 | is too simple or the name explains the actions sufficiently then just add the 43 | @param and @return annotations. 44 | 45 | Examples of bad commenting currently in the develop branch: 46 | 47 | ``` php 48 | /** 49 | * Calls internal functions to handle data processing 50 | * @param type $string 51 | */ 52 | public function tokenize($str) 53 | { 54 | ...... 55 | } 56 | ``` 57 | 58 | It should be something along the lines of: 59 | 60 | ``` php 61 | /** 62 | * Splits $str to smaller strings according to Penn Treebank tokenization rules. 
63 | * 64 | * You can see the regexes in function initPatternAndReplacement() 65 | * @param string $str The string to be tokenized 66 | * @return array An array of smaller strings (the tokens) 67 | */ 68 | .... 69 | ``` 70 | 71 | Equally necessary are class comments. The class comment should be explaining 72 | what the class does from a high point of view. Redirections to online resources 73 | like wikipedia are welcome. A good example that also contains a reference to an 74 | external resource is the following: 75 | 76 | ``` php 77 | /** 78 | * Implement a gradient descent algorithm that maximizes the conditional 79 | * log likelihood of the training data. 80 | * 81 | * See page 24 - 28 of http://nlp.stanford.edu/pubs/maxent-tutorial-slides.pdf 82 | * @see NlpTools\Models\Maxent 83 | */ 84 | class MaxentGradientDescent extends GradientDescentOptimizer implements MaxentOptimizerInterface 85 | ``` 86 | 87 | Pull Requests 88 | -------------------- 89 | 90 | ### Find something to work on ### 91 | 92 | If it is your first contribution try to find something straightforward and 93 | concise to implement without many design decisions as much as development 94 | decisions. You could first submit an issue, if you like, and state your will to 95 | correct this issue yourself. 96 | 97 | ### Branch off ### 98 | 99 | When you 've found something to develop, create a new branch off of the develop 100 | branch. Make your changes, add your tests (see below for testing) and then make 101 | a pull request. Always keep your develop branch in sync with the remote and 102 | before you create a pull request **rebase** your local branch to develop. 103 | 104 | If you rebased but there has been a change pushed since, you don't have to 105 | remove the pull request, rebase and recreate it. I will pull your changes 106 | rebase them, merge them and then close the pull request. 
This will have the 107 | effect of showing some merged pull requests as simply closed but it is worth 108 | keeping the commit history clean. 109 | 110 | So in two small sentences: Always create a new branch to develop on. Always 111 | rebase before making a pull request. 112 | 113 | ### Tests ### 114 | 115 | If you are implementing a new feature always include tests in your pull request. 116 | 117 | Also contributing just tests is extremely welcome. 118 | 119 | Testing 120 | ----------- 121 | 122 | A bit of information can be found in the tests folder in the README file. 123 | 124 | Tests should test the implementation thoroughly. You can test your 125 | implementation like a black box, based only on the outputs given some inputs, 126 | or you can test every small part for how it works. Either is acceptable. I will 127 | make my point clear with an example. 128 | 129 | The PorterStemmer implementation has 5 steps and some even have sub steps. One 130 | way to write the test would be to expose those steps (maybe by extending the 131 | PorterStemmer class) and write tests for each one. One other way would be to 132 | simply take a big list of English words and their stems according to the 133 | canonical implementation and check if your code produces the same results. 134 | 135 | While the second is a lot easier to implement, in case of failure, it gives 136 | very little information regarding the cause of the error. Both are acceptable 137 | (in the case of the example the second is implemented). 
138 | 139 | [1]: https://github.com/php-fig/fig-standards/blob/master/accepted/PSR-2-coding-style-guide.md 140 | [2]: http://cs.sensiolabs.org/ 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 2 | Version 2, December 2004 3 | 4 | Copyright (C) 2004 Sam Hocevar 5 | 6 | Everyone is permitted to copy and distribute verbatim or modified 7 | copies of this license document, and changing it is allowed as long 8 | as the name is changed. 9 | 10 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 11 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 12 | 13 | 0. You just DO WHAT THE FUCK YOU WANT TO. 14 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | [PHP NlpTools](http://php-nlp-tools.com/) 2 | ============= 3 | 4 | NlpTools is a set of php 5.3+ classes for beginner to 5 | semi advanced natural language processing work. 6 | 7 | Documentation 8 | ------------- 9 | 10 | You can find documentation and code examples at the project's [homepage](http://php-nlp-tools.com/documentation/). 11 | 12 | Contents 13 | --------- 14 | 15 | ### Classification Models ### 16 | 17 | 1. [Multinomial Naive Bayes](http://php-nlp-tools.com/documentation/bayesian-model.html) 18 | 2. [Maximum Entropy (Conditional Exponential model)](http://php-nlp-tools.com/documentation/maximum-entropy-model.html) 19 | 20 | ### Topic Modeling ### 21 | 22 | Lda is still experimental and quite slow but it works. [See an example](http://php-nlp-tools.com/posts/introducing-latent-dirichlet-allocation.html). 23 | 24 | 1. [Latent Dirichlet Allocation](http://php-nlp-tools.com/documentation/api/#NlpTools/Models/Lda) 25 | 26 | ### Clustering ### 27 | 28 | 1. 
[K-Means](http://php-nlp-tools.com/documentation/clustering.html) 29 | 2. [Hierarchical Agglomerative Clustering](http://php-nlp-tools.com/documentation/clustering.html) 30 | * SingleLink 31 | * CompleteLink 32 | * GroupAverage 33 | 34 | ### Tokenizers ### 35 | 36 | 1. [WhitespaceTokenizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Tokenizers/WhitespaceTokenizer) 37 | 2. [WhitespaceAndPunctuationTokenizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Tokenizers/WhitespaceAndPunctuationTokenizer) 38 | 3. [PennTreebankTokenizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Tokenizers/PennTreebankTokenizer) 39 | 4. [RegexTokenizer](http://php-nlp-tools.com/documentation/api/#NlpTools\Tokenizers\RegexTokenizer) 40 | 5. [ClassifierBasedTokenizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Tokenizers/ClassifierBasedTokenizer) 41 | This tokenizer allows us to build a lot more complex tokenizers 42 | than the previous ones 43 | 44 | ### Documents ### 45 | 46 | 1. [TokensDocument](http://php-nlp-tools.com/documentation/api/#NlpTools/Documents/TokensDocument) 47 | represents a bag of words model for a document. 48 | 2. [WordDocument](http://php-nlp-tools.com/documentation/api/#NlpTools/Documents/WordDocument) 49 | represents a single word with the context of a larger document. 50 | 3. [TrainingDocument](http://php-nlp-tools.com/documentation/api/#NlpTools/Documents/TrainingDocument) 51 | represents a document whose class is known. 52 | 4. [TrainingSet](http://php-nlp-tools.com/documentation/api/#NlpTools/Documents/TrainingSet) 53 | a collection of TrainingDocuments 54 | 55 | ### Feature factories ### 56 | 57 | 1. [FunctionFeatures](http://php-nlp-tools.com/documentation/api/#NlpTools/FeatureFactories/FunctionFeatures) 58 | Allows the creation of a feature factory from a number of callables 59 | 2. [DataAsFeatures](http://php-nlp-tools.com/documentation/api/#NlpTools/FeatureFactories/DataAsFeatures) 60 | Simply return the data as features. 
61 | 62 | ### Similarity ### 63 | 64 | 1. [Jaccard Index](http://php-nlp-tools.com/documentation/api/#NlpTools/Similarity/JaccardIndex) 65 | 2. [Cosine similarity](http://php-nlp-tools.com/documentation/api/#NlpTools/Similarity/CosineSimilarity) 66 | 3. [Simhash](http://php-nlp-tools.com/documentation/api/#NlpTools/Similarity/Simhash) 67 | 4. [Euclidean](http://php-nlp-tools.com/documentation/api/#NlpTools/Similarity/Euclidean) 68 | 5. [HammingDistance](http://php-nlp-tools.com/documentation/api/#NlpTools/Similarity/HammingDistance) 69 | 70 | ### Stemmers ### 71 | 72 | 1. [PorterStemmer](http://php-nlp-tools.com/documentation/api/#NlpTools/Stemmers/PorterStemmer) 73 | 2. [RegexStemmer](http://php-nlp-tools.com/documentation/api/#NlpTools/Stemmers/RegexStemmer) 74 | 3. [LancasterStemmer](http://php-nlp-tools.com/documentation/api/#NlpTools/Stemmers/LancasterStemmer) 75 | 4. [GreekStemmer](http://php-nlp-tools.com/documentation/api/#NlpTools/Stemmers/GreekStemmer) 76 | 77 | ### Optimizers (MaxEnt only) ### 78 | 79 | 1. [A gradient descent optimizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Optimizers/MaxentGradientDescent) 80 | (written in php) for educational use. 81 | It is a simple implementation for anyone wanting to know a bit 82 | more about either GD or MaxEnt models 83 | 2. A fast (faster than nltk-scipy), parallel gradient descent 84 | optimizer written in [Go](http://golang.org/). This optimizer 85 | resides in another [repo](https://github.com/angeloskath/nlp-maxent-optimizer), 86 | it is used via the [external optimizer](http://php-nlp-tools.com/documentation/api/#NlpTools/Optimizers/ExternalMaxentOptimizer). 87 | TODO: At least write a readme for the optimizer written in Go. 88 | 89 | ### Other ### 90 | 91 | 1. Idf Inverse document frequency 92 | 2. Stop words 93 | 3. Language based normalizers 94 | 4. 
Classifier based transformation for creating flexible preprocessing pipelines 95 | -------------------------------------------------------------------------------- /autoloader.php: -------------------------------------------------------------------------------- 1 | =5.3" 14 | }, 15 | "autoload": { 16 | "psr-0": { 17 | "NlpTools\\": "src/" 18 | } 19 | }, 20 | "extra": { 21 | "branch-alias": { 22 | "dev-master": "1.0.x-dev" 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/NlpTools/Analysis/FreqDist.php: -------------------------------------------------------------------------------- 1 | preCompute($tokens); 33 | $this->totalTokens = count($tokens); 34 | } 35 | 36 | /** 37 | * Get the total number of tokens in this tokensDocument 38 | * @return int 39 | */ 40 | public function getTotalTokens() 41 | { 42 | return $this->totalTokens; 43 | } 44 | 45 | /** 46 | * Internal function for summarizing all the data into a key value store 47 | * @param array $tokens The set of tokens passed into the constructor 48 | */ 49 | protected function preCompute(array &$tokens) 50 | { 51 | //count all the tokens up and put them in a key value store 52 | $this->keyValues = array_count_values($tokens); 53 | arsort($this->keyValues); 54 | } 55 | 56 | /** 57 | * Return the weight of a single token 58 | * @return float 59 | */ 60 | public function getWeightPerToken() 61 | { 62 | return 1 / $this->getTotalTokens(); 63 | } 64 | 65 | /** 66 | * Return get the total number of unique tokens 67 | * @return int 68 | */ 69 | public function getTotalUniqueTokens() 70 | { 71 | return count($this->keyValues); 72 | } 73 | 74 | /** 75 | * Return the sorted keys by frequency desc 76 | * @return array 77 | */ 78 | public function getKeys() 79 | { 80 | return array_keys($this->keyValues); 81 | } 82 | 83 | /** 84 | * Return the sorted values by frequency desc 85 | * @return array 86 | */ 87 | public function getValues() 88 | { 89 | return 
array_values($this->keyValues); 90 | } 91 | 92 | /** 93 | * Return the full key value store 94 | * @return array 95 | */ 96 | public function getKeyValues() 97 | { 98 | return $this->keyValues; 99 | } 100 | 101 | /** 102 | * Return a token's count 103 | * @param string $string 104 | * @return mixed 105 | */ 106 | public function getTotalByToken($string) 107 | { 108 | $array = $this->keyValues; 109 | if(array_key_exists($string, $array)) { 110 | return $array[$string]; 111 | } else { 112 | return false; 113 | } 114 | } 115 | 116 | /** 117 | * Return a token's weight (for user's own tf-idf/pdf/iduf implem) 118 | * @param string $string 119 | * @return mixed 120 | */ 121 | public function getTokenWeight($string) 122 | { 123 | if($this->getTotalByToken($string)){ 124 | return $this->getTotalByToken($string)/$this->getTotalTokens(); 125 | } else { 126 | return false; 127 | } 128 | } 129 | 130 | /** 131 | * 132 | * Returns an array of tokens that occurred once 133 | * @todo This is an inefficient approach 134 | * @return array 135 | */ 136 | public function getHapaxes() 137 | { 138 | $samples = array(); 139 | foreach ($this->getKeyValues() as $sample => $count) { 140 | if ($count == 1) { 141 | $samples[] = $sample; 142 | } 143 | } 144 | return $samples; 145 | } 146 | 147 | } 148 | -------------------------------------------------------------------------------- /src/NlpTools/Analysis/Idf.php: -------------------------------------------------------------------------------- 1 | setAsKey(TrainingSet::CLASS_AS_KEY); 33 | foreach ($tset as $class=>$doc) { 34 | $tokens = $ff->getFeatureArray($class,$doc); // extract tokens from the document 35 | $tokens = array_fill_keys($tokens,1); // make them occur once 36 | foreach ($tokens as $token=>$v) { 37 | if (isset($this->idf[$token])) 38 | $this->idf[$token]++; 39 | else 40 | $this->idf[$token] = 1; 41 | } 42 | } 43 | 44 | // this idf so far contains the doc frequency 45 | // we will now inverse it and take the log 46 | $D = 
count($tset); 47 | foreach ($this->idf as &$v) { 48 | $v = log($D/$v); 49 | } 50 | $this->logD = log($D); 51 | } 52 | 53 | /** 54 | * Implements the array access interface. Return the computed idf or 55 | * the logarithm of the count of the documents for a token we have not 56 | * seen before. 57 | * 58 | * @param string $token The token to return the idf for 59 | * @return float The idf 60 | */ 61 | public function offsetGet($token) 62 | { 63 | if (isset($this->idf[$token])) { 64 | return $this->idf[$token]; 65 | } else { 66 | return $this->logD; 67 | } 68 | } 69 | 70 | /** 71 | * Implements the array access interface. Return true if the token exists 72 | * in the corpus. 73 | * 74 | * @param string $token The token to check if it exists in the corpus 75 | * @return bool 76 | */ 77 | public function offsetExists($token) 78 | { 79 | return isset($this->idf[$token]); 80 | } 81 | 82 | /** 83 | * Will not be implemented. Throws \BadMethodCallException because 84 | * one should not be able to alter the idf values directly. 85 | */ 86 | public function offsetSet($token, $value) 87 | { 88 | throw new \BadMethodCallException("The idf of a specific token cannot be set explicitly"); 89 | } 90 | 91 | /** 92 | * Will not be implemented. Throws \BadMethodCallException because 93 | * one should not be able to alter the idf values directly. 94 | */ 95 | public function offsetUnset($token) 96 | { 97 | throw new \BadMethodCallException("The idf of a specific token cannot be unset"); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/NlpTools/Classifiers/ClassifierInterface.php: -------------------------------------------------------------------------------- 1 | feature_factory = $ff; 23 | $this->model = $m; 24 | } 25 | 26 | /** 27 | * Compute the vote for every class. Return the class that 28 | * receive the maximum vote. 
29 | * 30 | * @param array $classes A set of classes 31 | * @param DocumentInterface $d A Document 32 | * @return string A class 33 | */ 34 | public function classify(array $classes, DocumentInterface $d) 35 | { 36 | $maxclass = current($classes); 37 | $maxvote = $this->getVote($maxclass,$d); 38 | while ($class = next($classes)) { 39 | $v = $this->getVote($class,$d); 40 | if ($v>$maxvote) { 41 | $maxclass = $class; 42 | $maxvote = $v; 43 | } 44 | } 45 | 46 | return $maxclass; 47 | } 48 | 49 | /** 50 | * Compute the features that fire for the Document $d. The sum of 51 | * the weights of the features is the vote. 52 | * 53 | * @param string $class The vote for class $class 54 | * @param DocumentInterface $d The vote for Document $d 55 | * @return float The vote of the model for class $class and Document $d 56 | */ 57 | public function getVote($class, DocumentInterface $d) 58 | { 59 | $v = 0; 60 | $features = $this->feature_factory->getFeatureArray($class,$d); 61 | foreach ($features as $f) { 62 | $v += $this->model->getWeight($f); 63 | } 64 | 65 | return $v; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/NlpTools/Classifiers/MultinomialNBClassifier.php: -------------------------------------------------------------------------------- 1 | feature_factory = $ff; 22 | $this->model = $m; 23 | } 24 | 25 | /** 26 | * Compute the probability of $d belonging to each class 27 | * successively and return that class that has the maximum 28 | * probability. 
29 | * 30 | * @param array $classes The classes from which to choose 31 | * @param DocumentInterface $d The document to classify 32 | * @return string $class The class that has the maximum probability 33 | */ 34 | public function classify(array $classes, DocumentInterface $d) 35 | { 36 | $maxclass = current($classes); 37 | $maxscore = $this->getScore($maxclass,$d); 38 | while ($class=next($classes)) { 39 | $score = $this->getScore($class,$d); 40 | if ($score>$maxscore) { 41 | $maxclass = $class; 42 | $maxscore = $score; 43 | } 44 | } 45 | 46 | return $maxclass; 47 | } 48 | 49 | /** 50 | * Compute the log of the probability of the Document $d belonging 51 | * to class $class. We compute the log so that we can sum over the 52 | * logarithms instead of multiplying each probability. 53 | * 54 | * @todo perhaps MultinomialNBModel should have precomputed the logs 55 | * ex.: getLogPrior() and getLogCondProb() 56 | * 57 | * @param string $class The class for which we are getting a score 58 | * @param DocumentInterface The document whose score we are getting 59 | * @return float The log of the probability of $d belonging to $class 60 | */ 61 | public function getScore($class, DocumentInterface $d) 62 | { 63 | $score = log($this->model->getPrior($class)); 64 | $features = $this->feature_factory->getFeatureArray($class,$d); 65 | if (is_int(key($features))) 66 | $features = array_count_values($features); 67 | foreach ($features as $f=>$fcnt) { 68 | $score += $fcnt*log($this->model->getCondProb($f,$class)); 69 | } 70 | 71 | return $score; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/CentroidFactories/CentroidFactoryInterface.php: -------------------------------------------------------------------------------- 1 | getVector($docs[$idx]); 43 | foreach ($doc as $k=>$w) { 44 | if (!isset($v[$k])) 45 | $v[$k] = $w; 46 | else 47 | $v[$k] += $w; 48 | } 49 | } 50 | foreach ($v as &$w) { 51 | $w /= 
$cnt; 52 | } 53 | 54 | return $v; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/CentroidFactories/Hamming.php: -------------------------------------------------------------------------------- 1 | &$v) { 32 | if ($s[$i]=='1') 33 | $v += 1; 34 | else 35 | $v -= 1; 36 | } 37 | } 38 | 39 | return implode( 40 | '', 41 | array_map( 42 | function ($v) { 43 | return ($v>0) ? '1' : '0'; 44 | }, 45 | $buckets 46 | ) 47 | ); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/CentroidFactories/MeanAngle.php: -------------------------------------------------------------------------------- 1 | normalize($this->getVector($docs[$idx])); 38 | foreach ($d as $i=>$vi) { 39 | if (!isset($v[$i])) 40 | $v[$i] = $vi; 41 | else 42 | $v[$i] += $vi; 43 | } 44 | } 45 | 46 | return array_map( 47 | function ($vi) use ($cnt) { 48 | return $vi/$cnt; 49 | }, 50 | $v 51 | ); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/Clusterer.php: -------------------------------------------------------------------------------- 1 | getFeatureArray('',$d); 27 | } 28 | 29 | return $docs; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/Hierarchical.php: -------------------------------------------------------------------------------- 1 | strategy = $ms; 22 | $this->dist = $d; 23 | } 24 | 25 | /** 26 | * Iteratively merge documents together to create an hierarchy of clusters. 27 | * While hierarchical clustering only returns one element, it still wraps it 28 | * in an array to be consistent with the rest of the clustering methods. 
29 | * 30 | * @return array An array containing one element which is the resulting dendrogram 31 | */ 32 | public function cluster(TrainingSet $documents, FeatureFactoryInterface $ff) 33 | { 34 | // what a complete waste of memory here ... 35 | // the same data exists in $documents, $docs and 36 | // the only useful parts are in $this->strategy 37 | $docs = $this->getDocumentArray($documents, $ff); 38 | $this->strategy->initializeStrategy($this->dist,$docs); 39 | unset($docs); // perhaps save some memory 40 | 41 | // start with all the documents being in their 42 | // own cluster we 'll merge later 43 | $clusters = range(0,count($documents)-1); 44 | $c = count($clusters); 45 | while ($c>1) { 46 | // ask the strategy which to merge. The strategy 47 | // will assume that we will indeed merge the returned clusters 48 | list($i,$j) = $this->strategy->getNextMerge(); 49 | $clusters[$i] = array($clusters[$i],$clusters[$j]); 50 | unset($clusters[$j]); 51 | $c--; 52 | } 53 | $clusters = array($clusters[$i]); 54 | 55 | // return the dendrogram 56 | return array($clusters); 57 | } 58 | 59 | /** 60 | * Flatten a dendrogram to an almost specific 61 | * number of clusters (the closest power of 2 larger than 62 | * $NC) 63 | * 64 | * @param array $tree The dendrogram to be flattened 65 | * @param integer $NC The number of clusters to cut to 66 | * @return array The flat clusters 67 | */ 68 | public static function dendrogramToClusters($tree,$NC) 69 | { 70 | $clusters = $tree; 71 | while (count($clusters)<$NC) { 72 | $tmpc = array(); 73 | foreach ($clusters as $subclust) { 74 | if (!is_array($subclust)) 75 | $tmpc[] = $subclust; 76 | else { 77 | foreach ($subclust as $c) 78 | $tmpc[] = $c; 79 | } 80 | } 81 | $clusters = $tmpc; 82 | } 83 | foreach ($clusters as &$c) { 84 | $c = iterator_to_array( 85 | new \RecursiveIteratorIterator( 86 | new \RecursiveArrayIterator( 87 | array($c) 88 | ) 89 | ), 90 | false // do not use keys 91 | ); 92 | } 93 | 94 | return $clusters; 95 | } 96 | } 
97 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/KMeans.php: -------------------------------------------------------------------------------- 1 | dist = $d; 37 | $this->n = $n; 38 | $this->cutoff = $cutoff; 39 | $this->centroidF = $cf; 40 | } 41 | 42 | /** 43 | * Apply the feature factory to the documents and then cluster the resulting array 44 | * using the provided distance metric and centroid factory. 45 | */ 46 | public function cluster(TrainingSet $documents, FeatureFactoryInterface $ff) 47 | { 48 | // transform the documents according to the FeatureFactory 49 | $docs = $this->getDocumentArray($documents,$ff); 50 | 51 | // choose N centroids at random 52 | $centroids = array(); 53 | foreach (array_rand($docs,$this->n) as $key) { 54 | $centroids[] = $docs[$key]; 55 | } 56 | 57 | // cache the distance and centroid factory functions for use 58 | // with closures 59 | $dist = array($this->dist,'dist'); 60 | $cf = array($this->centroidF,'getCentroid'); 61 | 62 | // looooooooop 63 | while (true) { 64 | // compute the distance each document has from our centroids 65 | // the array is MxN where M = count($docs) and N = count($centroids) 66 | $distances = array_map( 67 | function ($doc) use (&$centroids,$dist) { 68 | return array_map( 69 | function ($c) use ($dist,$doc) { 70 | // it is passed with an array because dist expects references 71 | // and it failed when run with phpunit. 
72 | // see http://php.net/manual/en/function.call-user-func.php 73 | // for the solution used below 74 | return call_user_func_array( 75 | $dist, 76 | array( 77 | &$c, 78 | &$doc 79 | ) 80 | ); 81 | }, 82 | $centroids 83 | ); 84 | }, 85 | $docs 86 | ); 87 | 88 | // initialize the empty clusters 89 | $clusters = array_fill_keys( 90 | array_keys($centroids), 91 | array() 92 | ); 93 | foreach ($distances as $idx=>$d) { 94 | // assign document idx to the closest centroid 95 | $clusters[array_search(min($d),$d)][] = $idx; 96 | } 97 | 98 | // compute the new centroids from the assigned documents 99 | // using the centroid factory function 100 | $new_centroids = array_map( 101 | function ($cluster) use (&$docs,$cf) { 102 | return call_user_func_array( 103 | $cf, 104 | array( 105 | &$docs, 106 | $cluster 107 | ) 108 | ); 109 | }, 110 | $clusters 111 | ); 112 | 113 | // compute the change each centroid had from the previous one 114 | $changes = array_map( 115 | $dist, 116 | $new_centroids, 117 | $centroids 118 | ); 119 | 120 | // if the largest change is small enough we are done 121 | if (max($changes)<$this->cutoff) { 122 | // return the clusters, the centroids and the distances 123 | return array($clusters,$centroids,$distances); 124 | } 125 | 126 | // update the centroids and loooooop again 127 | $centroids = $new_centroids; 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/MergeStrategies/CompleteLink.php: -------------------------------------------------------------------------------- 1 | dm[$xi],$this->dm[$yi]); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/MergeStrategies/GroupAverage.php: -------------------------------------------------------------------------------- 1 | cluster_size = array_fill_keys( 24 | range(0,$this->L-1), 25 | 1 26 | ); 27 | } 28 | 29 | protected function 
newDistance($xi,$yi,$x,$y) 30 | { 31 | $size_x = $this->cluster_size[$x]; 32 | $size_y = $this->cluster_size[$y]; 33 | 34 | return ($this->dm[$xi]*$size_x + $this->dm[$yi]*$size_y)/($size_x + $size_y); 35 | } 36 | 37 | public function getNextMerge() 38 | { 39 | $r = parent::getNextMerge(); 40 | 41 | $this->cluster_size[$r[0]] += $this->cluster_size[$r[1]]; 42 | unset($this->cluster_size[$r[1]]); 43 | 44 | return $r; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/NlpTools/Clustering/MergeStrategies/HeapLinkage.php: -------------------------------------------------------------------------------- 1 | y swap x,y 20 | * 2. index = y*(y-1)/2 + x 21 | */ 22 | abstract class HeapLinkage implements MergeStrategyInterface 23 | { 24 | protected $L; 25 | protected $queue; 26 | protected $dm; 27 | protected $removed; 28 | 29 | /** 30 | * Calculate the distance of the merged cluster x,y with cluster i 31 | * based on a merge strategy (SingleLink, CompleteLink, GroupAverage, ...) 32 | * Ex.: for single link this function would be 33 | * return min($this->dm[$xi],$this->dm[$yi]); 34 | */ 35 | abstract protected function newDistance($xi,$yi,$x,$y); 36 | 37 | /** 38 | * Initialize the distance matrix and any other data structure needed 39 | * to calculate the merges later. 
40 | * 41 | * @param DistanceInterface $d The distance metric used to calculate the distance matrix 42 | * @param array $docs The docs to be clustered 43 | */ 44 | public function initializeStrategy(DistanceInterface $d, array &$docs) 45 | { 46 | // the number of documents and the dimensions of the matrix 47 | $this->L = count($docs); 48 | // just to hold which document has been removed 49 | $this->removed = array_fill_keys(range(0, $this->L-1), false); 50 | // how many distances we must compute 51 | $elements = (int) ($this->L*($this->L-1))/2; 52 | // the containers that will hold the distances 53 | $this->dm = new \SplFixedArray($elements); 54 | $this->queue = new \SplPriorityQueue(); 55 | $this->queue->setExtractFlags(\SplPriorityQueue::EXTR_BOTH); 56 | 57 | // for each unique pair of documents calculate the distance and 58 | // save it in the heap and distance matrix 59 | for ($x=0;$x<$this->L;$x++) { 60 | for ($y=$x+1;$y<$this->L;$y++) { 61 | $index = $this->packIndex($y,$x); 62 | $tmp_d = $d->dist($docs[$x],$docs[$y]); 63 | $this->dm[$index] = $tmp_d; 64 | $this->queue->insert($index, -$tmp_d); 65 | } 66 | } 67 | } 68 | 69 | /** 70 | * Return the pair of clusters x,y to be merged. 71 | * 1. Extract the pair with the smallest distance 72 | * 2. Recalculate the distance of the merged cluster with every other cluster 73 | * 3. Merge the clusters (by labeling one as removed) 74 | * 4. 
Reheap 75 | * 76 | * @return array The pair (x,y) to be merged 77 | */ 78 | public function getNextMerge() 79 | { 80 | // extract the pair with the smallest distance 81 | $tmp = $this->queue->extract(); 82 | $index = $tmp["data"]; 83 | $d = -$tmp["priority"]; 84 | list($y,$x) = $this->unravelIndex($index); 85 | // check if it is invalid 86 | while ($this->removed[$y] || $this->removed[$x] || $this->dm[$index]!=$d) { 87 | $tmp = $this->queue->extract(); 88 | $index = $tmp["data"]; 89 | $d = -$tmp["priority"]; 90 | list($y,$x) = $this->unravelIndex($index); 91 | } 92 | 93 | // Now that we have a valid pair to be merged 94 | // calculate the distances of the merged cluster with any 95 | // other cluster 96 | $yi = $this->packIndex($y,0); 97 | $xi = $this->packIndex($x,0); 98 | 99 | // for every cluster with index inewDistance($xi,$yi,$x,$y); 102 | if ($d!=$this->dm[$xi]) { 103 | $this->dm[$xi] = $d; 104 | $this->queue->insert($xi, -$d); 105 | } 106 | } 107 | // for every cluster with index xpackIndex($i,$x); 110 | $d = $this->newDistance($xi,$yi,$x,$y); 111 | if ($d!=$this->dm[$xi]) { 112 | $this->dm[$xi] = $d; 113 | $this->queue->insert($xi, -$d); 114 | } 115 | } 116 | // for every cluster xL;$i++) { 118 | $xi = $this->packIndex($i,$x); 119 | $yi = $this->packIndex($i,$y); 120 | $d = $this->newDistance($xi,$yi,$x,$y); 121 | if ($d!=$this->dm[$xi]) { 122 | $this->dm[$xi] = $d; 123 | $this->queue->insert($xi, -$d); 124 | } 125 | } 126 | 127 | // mark y as removed 128 | $this->removed[$y] = true; 129 | 130 | return array($x,$y); 131 | } 132 | 133 | /** 134 | * Use binary search to unravel the index to its coordinates x,y 135 | * return them in the order y,x . This operation is to be done only 136 | * once per merge so it doesn't add much overhead. 
137 | * 138 | * Note: y will always be larger than x 139 | * 140 | * @param integer $index The index to be unraveled 141 | * @return array An array containing (y,x) 142 | */ 143 | protected function unravelIndex($index) 144 | { 145 | $a = 0; 146 | $b = $this->L-1; 147 | $y = 0; 148 | while ($b-$a > 1) { 149 | // the middle row in the interval [a,b] 150 | $y = (int) (($a+$b)/2); 151 | // the candidate index aka how many points until this row 152 | $i = $y*($y-1)/2; 153 | 154 | // if we need an offset les then the wanted y will be in the offset [a,y] 155 | if ($i > $index) { 156 | $b = $y; 157 | } else { 158 | // else it will be in the offset [y,b] 159 | $a = $y; 160 | } 161 | } 162 | // we have finished searching it is either a or b 163 | $x = $index - $i; 164 | 165 | // this means that it is b and we have a 166 | if ($y <= $x) { 167 | $y++; 168 | $x = $index - $y*($y-1)/2; 169 | } elseif ($x < 0) { 170 | // this means that it is a and we have b 171 | $y--; 172 | $x = $index - $y*($y-1)/2; 173 | } 174 | 175 | return array( 176 | (int) $y, 177 | (int) $x 178 | ); 179 | } 180 | 181 | /** 182 | * Pack the coordinates x and y to an integer offset from 0. 183 | * The first line (y=0) contains 0 elements, the 2nd 1 the 3rd 2 ... 
<?php

namespace NlpTools\Documents;

use NlpTools\Utils\TransformationInterface;

/**
 * A RawDocument wraps a single value (usually a string of raw text)
 * without imposing any further structure, so that plain data can be
 * passed wherever a DocumentInterface is expected.
 */
class RawDocument implements DocumentInterface
{
    protected $data;

    public function __construct($data)
    {
        $this->data = $data;
    }

    /**
     * Return the wrapped value untouched.
     */
    public function getDocumentData()
    {
        return $this->data;
    }

    /**
     * Replace the wrapped value with its transformed version.
     *
     * @param TransformationInterface $transform The transformation to be applied
     */
    public function applyTransformation(TransformationInterface $transform)
    {
        $this->data = $transform->transform($this->data);
    }
}
<?php

namespace NlpTools\Documents;

use NlpTools\Utils\TransformationInterface;

/**
 * A TrainingDocument decorates any DocumentInterface instance with a
 * known class (label) so it can be used for supervised training.
 */
class TrainingDocument implements DocumentInterface
{
    protected $d;
    protected $class;

    /**
     * @param string $class The actual class of the document
     * @param DocumentInterface $d The document to be decorated
     */
    public function __construct($class, DocumentInterface $d)
    {
        $this->d = $d;
        $this->class = $class;
    }

    /**
     * Delegate to the decorated document.
     */
    public function getDocumentData()
    {
        return $this->d->getDocumentData();
    }

    /**
     * The label given at construction time.
     */
    public function getClass()
    {
        return $this->class;
    }

    /**
     * Pass the transformation through to the decorated document.
     *
     * @param TransformationInterface $transform The transformation to be applied
     */
    public function applyTransformation(TransformationInterface $transform)
    {
        $this->d->applyTransformation($transform);
    }
}
32 | * 33 | * @param $class The documents actual class 34 | * @param $d The Document 35 | * @return void 36 | */ 37 | public function addDocument($class, DocumentInterface $d) 38 | { 39 | $this->documents[] = new TrainingDocument($class,$d); 40 | $this->classSet[$class] = 1; 41 | } 42 | // return the classset 43 | public function getClassSet() 44 | { 45 | return array_keys($this->classSet); 46 | } 47 | 48 | /** 49 | * Decide what should be returned as key when iterated upon 50 | */ 51 | public function setAsKey($what) 52 | { 53 | switch ($what) { 54 | case self::CLASS_AS_KEY: 55 | case self::OFFSET_AS_KEY: 56 | $this->keytype = $what; 57 | break; 58 | default: 59 | $this->keytype = self::CLASS_AS_KEY; 60 | break; 61 | } 62 | } 63 | 64 | /** 65 | * Apply an array of transformations to all documents in this container. 66 | * 67 | * @param array An array of TransformationInterface instances 68 | */ 69 | public function applyTransformations(array $transforms) 70 | { 71 | foreach ($this->documents as $doc) { 72 | foreach ($transforms as $transform) { 73 | $doc->applyTransformation($transform); 74 | } 75 | } 76 | } 77 | 78 | // ====== Implementation of \Iterator interface ========= 79 | public function rewind() 80 | { 81 | reset($this->documents); 82 | $this->currentDocument = current($this->documents); 83 | } 84 | public function next() 85 | { 86 | $this->currentDocument = next($this->documents); 87 | } 88 | public function valid() 89 | { 90 | return $this->currentDocument!=false; 91 | } 92 | public function current() 93 | { 94 | return $this->currentDocument; 95 | } 96 | public function key() 97 | { 98 | switch ($this->keytype) { 99 | case self::CLASS_AS_KEY: 100 | return $this->currentDocument->getClass(); 101 | case self::OFFSET_AS_KEY: 102 | return key($this->documents); 103 | default: 104 | // we should never be here 105 | throw new \Exception("Undefined type as key"); 106 | } 107 | } 108 | // === Implementation of \Iterator interface finished === 109 | 110 | // 
====== Implementation of \ArrayAccess interface ========= 111 | public function offsetSet($key,$value) 112 | { 113 | throw new \Exception("Shouldn't add documents this way, add them through addDocument()"); 114 | } 115 | public function offsetUnset($key) 116 | { 117 | throw new \Exception("Cannot unset any document"); 118 | } 119 | public function offsetGet($key) 120 | { 121 | return $this->documents[$key]; 122 | } 123 | public function offsetExists($key) 124 | { 125 | return isset($this->documents[$key]); 126 | } 127 | // === Implementation of \ArrayAccess interface finished === 128 | 129 | // implementation of \Countable interface 130 | public function count() 131 | { 132 | return count($this->documents); 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/NlpTools/Documents/WordDocument.php: -------------------------------------------------------------------------------- 1 | word = $tokens[$index]; 19 | 20 | $this->before = array(); 21 | for ($start = max($index-$context,0);$start<$index;$start++) { 22 | $this->before[] = $tokens[$start]; 23 | } 24 | 25 | $this->after = array(); 26 | $end = min($index+$context+1,count($tokens)); 27 | for ($start = $index+1;$start<$end;$start++) { 28 | $this->after[] = $tokens[$start]; 29 | } 30 | } 31 | 32 | /** 33 | * It returns an array with the first element being the actual word, 34 | * the second element being an array of previous words, and the 35 | * third an array of following words 36 | * 37 | * @return array 38 | */ 39 | public function getDocumentData() 40 | { 41 | return array($this->word,$this->before,$this->after); 42 | } 43 | 44 | /** 45 | * Apply the transformation to the token and the surrounding context. 46 | * Filter out the null tokens from the context. If the word is transformed 47 | * to null it is for the feature factory to decide what to do. 
48 | * 49 | * @param TransformationInterface $transform The transformation to be applied 50 | */ 51 | public function applyTransformation(TransformationInterface $transform) 52 | { 53 | $null_filter = function ($token) { 54 | return $token!==null; 55 | }; 56 | 57 | $this->word = $transform->transform($this->word); 58 | // array_values for re-indexing 59 | $this->before = array_values( 60 | array_filter( 61 | array_map( 62 | array($transform,"transform"), 63 | $this->before 64 | ), 65 | $null_filter 66 | ) 67 | ); 68 | $this->after = array_values( 69 | array_filter( 70 | array_map( 71 | array($transform,"transform"), 72 | $this->after 73 | ), 74 | $null_filter 75 | ) 76 | ); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/NlpTools/Exceptions/InvalidExpression.php: -------------------------------------------------------------------------------- 1 | getDocumentData(); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/NlpTools/FeatureFactories/FeatureFactoryInterface.php: -------------------------------------------------------------------------------- 1 | functions=$f; 27 | $this->frequency=false; 28 | } 29 | /** 30 | * Set the feature factory to model frequency instead of presence 31 | */ 32 | public function modelFrequency() 33 | { 34 | $this->frequency = true; 35 | } 36 | /** 37 | * Set the feature factory to model presence instead of frequency 38 | */ 39 | public function modelPresence() 40 | { 41 | $this->frequency = false; 42 | } 43 | /** 44 | * Add a function as a feature 45 | * 46 | * @param callable $feature 47 | */ 48 | public function add( $feature ) 49 | { 50 | $this->functions[] = $feature; 51 | } 52 | 53 | /** 54 | * Compute the features that "fire" for a given class,document pair. 55 | * 56 | * Call each function one by one. Eliminate each return value that 57 | * evaluates to false. 
If the return value is a string add it to 58 | * the feature set. If the return value is an array iterate over it 59 | * and add each value to the feature set. 60 | * 61 | * @param string $class The class for which we are calculating features 62 | * @param DocumentInterface $d The document for which we are calculating features 63 | * @return array 64 | */ 65 | public function getFeatureArray($class, DocumentInterface $d) 66 | { 67 | $features = array_filter( 68 | array_map( function ($feature) use ($class,$d) { 69 | return call_user_func($feature, $class, $d); 70 | }, 71 | $this->functions 72 | )); 73 | $set = array(); 74 | foreach ($features as $f) { 75 | if (is_array($f)) { 76 | foreach ($f as $ff) { 77 | if (!isset($set[$ff])) 78 | $set[$ff] = 0; 79 | $set[$ff]++; 80 | } 81 | } else { 82 | if (!isset($set[$f])) 83 | $set[$f] = 0; 84 | $set[$f]++; 85 | } 86 | } 87 | if ($this->frequency) 88 | return $set; 89 | else 90 | return array_keys($set); 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/NlpTools/Models/FeatureBasedNB.php: -------------------------------------------------------------------------------- 1 | priors = array(); 24 | $this->condprob = array(); 25 | $this->unknown = array(); 26 | } 27 | 28 | /** 29 | * Return the prior probability of class $class 30 | * P(c) as computed by the training data 31 | * 32 | * @param string $class 33 | * @return float prior probability 34 | */ 35 | public function getPrior($class) 36 | { 37 | return isset($this->priors[$class]) 38 | ? $this->priors[$class] 39 | : 0; 40 | } 41 | 42 | /** 43 | * Return the conditional probability of a term for a given class. 44 | * 45 | * @param string $term The term (word, feature id, ...) 46 | * @param string $class The class 47 | * @return float 48 | */ 49 | public function getCondProb($term,$class) 50 | { 51 | if (!isset($this->condprob[$term][$class])) { 52 | 53 | return isset($this->unknown[$class]) 54 | ? 
$this->unknown[$class] 55 | : 0; 56 | 57 | } else { 58 | return $this->condprob[$term][$class]; 59 | } 60 | } 61 | 62 | /** 63 | * Train on the given set and fill the model's variables. Use the 64 | * training context provided to update the counts as if the training 65 | * set was appended to the previous one that provided the context. 66 | * 67 | * It can be used for incremental training. It is not meant to be used 68 | * with the same training set twice. 69 | * 70 | * @param array $train_ctx The previous training context 71 | * @param FeatureFactoryInterface $ff A feature factory to compute features from a training document 72 | * @param TrainingSet The training set 73 | * @param integer $a_smoothing The parameter for additive smoothing. Defaults to add-one smoothing. 74 | * @return array Return a training context to be used for further incremental training, 75 | * although this is not necessary since the changes also happen in place 76 | */ 77 | public function train_with_context(array &$train_ctx, FeatureFactoryInterface $ff, TrainingSet $tset, $a_smoothing=1) 78 | { 79 | $this->countTrainingSet( 80 | $ff, 81 | $tset, 82 | $train_ctx['termcount_per_class'], 83 | $train_ctx['termcount'], 84 | $train_ctx['ndocs_per_class'], 85 | $train_ctx['voc'], 86 | $train_ctx['ndocs'] 87 | ); 88 | 89 | $voccount = count($train_ctx['voc']); 90 | 91 | $this->computeProbabilitiesFromCounts( 92 | $tset->getClassSet(), 93 | $train_ctx['termcount_per_class'], 94 | $train_ctx['termcount'], 95 | $train_ctx['ndocs_per_class'], 96 | $train_ctx['ndocs'], 97 | $voccount, 98 | $a_smoothing 99 | ); 100 | 101 | return $train_ctx; 102 | } 103 | 104 | /** 105 | * Train on the given set and fill the models variables 106 | * 107 | * priors[c] = NDocs[c]/NDocs 108 | * condprob[t][c] = count( t in c) + 1 / sum( count( t' in c ) + 1 , for every t' ) 109 | * unknown[c] = condbrob['word that doesnt exist in c'][c] ( so that count(t in c)==0 ) 110 | * 111 | * More information on the algorithm can be 
found at 112 | * http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html 113 | * 114 | * @param FeatureFactoryInterface A feature factory to compute features from a training document 115 | * @param TrainingSet The training set 116 | * @param integer $a_smoothing The parameter for additive smoothing. Defaults to add-one smoothing. 117 | * @return array Return a training context to be used for incremental training 118 | */ 119 | public function train(FeatureFactoryInterface $ff, TrainingSet $tset, $a_smoothing=1) 120 | { 121 | $class_set = $tset->getClassSet(); 122 | 123 | $ctx = array( 124 | 'termcount_per_class'=>array_fill_keys($class_set,0), 125 | 'termcount'=>array_fill_keys($class_set,array()), 126 | 'ndocs_per_class'=>array_fill_keys($class_set,0), 127 | 'voc'=>array(), 128 | 'ndocs'=>0 129 | ); 130 | 131 | return $this->train_with_context($ctx,$ff,$tset,$a_smoothing); 132 | } 133 | 134 | /** 135 | * Count all the features for each document. All parameters are passed 136 | * by reference and they are filled in this function. Useful for not 137 | * making copies of big arrays. 
138 | * 139 | * @param FeatureFactoryInterface $ff A feature factory to create the features for each document in the set 140 | * @param TrainingSet $tset The training set (collection of labeled documents) 141 | * @param array $termcount_per_class The count of occurences of each feature in each class 142 | * @param array $termcount The total count of occurences of each term 143 | * @param array $ndocs_per_class The total number of documents per class 144 | * @param array $voc A set of the found features 145 | * @param integer $ndocs The number of documents 146 | * @return void 147 | */ 148 | protected function countTrainingSet(FeatureFactoryInterface $ff, TrainingSet $tset, array &$termcount_per_class, array &$termcount, array &$ndocs_per_class, array &$voc, &$ndocs) 149 | { 150 | foreach ($tset as $tdoc) { 151 | $ndocs++; 152 | $c = $tdoc->getClass(); 153 | $ndocs_per_class[$c]++; 154 | $features = $ff->getFeatureArray($c,$tdoc); 155 | if (is_int(key($features))) 156 | $features = array_count_values($features); 157 | foreach ($features as $f=>$fcnt) { 158 | if (!isset($voc[$f])) 159 | $voc[$f] = 0; 160 | 161 | $termcount_per_class[$c]+=$fcnt; 162 | if (isset($termcount[$c][$f])) 163 | $termcount[$c][$f]+=$fcnt; 164 | else 165 | $termcount[$c][$f] = $fcnt; 166 | } 167 | } 168 | } 169 | 170 | /** 171 | * Compute the probabilities given the counts of the features in the 172 | * training set. 
173 | * 174 | * @param array $class_set Just the array that contains the classes 175 | * @param array $termcount_per_class The count of occurences of each feature in each class 176 | * @param array $termcount The total count of occurences of each term 177 | * @param array $ndocs_per_class The total number of documents per class 178 | * @param integer $ndocs The total number of documents 179 | * @param integer $voccount The total number of features found 180 | * @return void 181 | */ 182 | protected function computeProbabilitiesFromCounts(array $class_set, array &$termcount_per_class, array &$termcount, array &$ndocs_per_class, $ndocs, $voccount, $a_smoothing=1) 183 | { 184 | $denom_smoothing = $a_smoothing*$voccount; 185 | foreach ($class_set as $class) { 186 | $this->priors[$class] = $ndocs_per_class[$class] / $ndocs; 187 | foreach ($termcount[$class] as $term=>$count) { 188 | $this->condprob[$term][$class] = ($count + $a_smoothing) / ($termcount_per_class[$class] + $denom_smoothing); 189 | } 190 | } 191 | foreach ($class_set as $class) { 192 | $this->unknown[$class] = $a_smoothing / ($termcount_per_class[$class] + $denom_smoothing); 193 | } 194 | } 195 | 196 | /** 197 | * Just save the probabilities for reuse 198 | */ 199 | public function __sleep() 200 | { 201 | return array('priors','condprob','unknown'); 202 | } 203 | } 204 | -------------------------------------------------------------------------------- /src/NlpTools/Models/LinearModel.php: -------------------------------------------------------------------------------- 1 | l = $l; 21 | } 22 | /** 23 | * Get the weight for a given feature 24 | * 25 | * @param string $feature The feature for which the weight will be returned 26 | * @return float The weight 27 | */ 28 | public function getWeight($feature) 29 | { 30 | if (!isset($this->l[$feature])) return 0; 31 | else return $this->l[$feature]; 32 | } 33 | 34 | /** 35 | * Get all the weights as an array. 
36 | * 37 | * @return array The weights as an associative array 38 | */ 39 | public function getWeights() 40 | { 41 | return $this->l; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/NlpTools/Models/Maxent.php: -------------------------------------------------------------------------------- 1 | getClassSet(); 32 | 33 | $features = $this->calculateFeatureArray($classSet,$tset,$ff); 34 | $this->l = $opt->optimize($features); 35 | } 36 | 37 | /** 38 | * Calculate all the features for each possible class of each 39 | * document. This is done so that we can optimize without the need 40 | * of the FeatureFactory. 41 | * 42 | * We do not want to use the FeatureFactoryInterface both because it would 43 | * be slow to calculate the features over and over again, but also 44 | * because we want to be able to optimize externally to 45 | * gain speed (PHP is slow!). 46 | * 47 | * @param $classes A set of the classes in the training set 48 | * @param $tset A collection of training documents 49 | * @param $ff The feature factory 50 | * @return array An array that contains every feature for every possible class of every document 51 | */ 52 | protected function calculateFeatureArray(array $classes, TrainingSet $tset, FeatureFactoryInterface $ff) 53 | { 54 | $features = array(); 55 | $tset->setAsKey(TrainingSet::OFFSET_AS_KEY); 56 | foreach ($tset as $offset=>$doc) { 57 | $features[$offset] = array(); 58 | foreach ($classes as $class) { 59 | $features[$offset][$class] = $ff->getFeatureArray($class,$doc); 60 | } 61 | $features[$offset]['__label__'] = $doc->getClass(); 62 | } 63 | 64 | return $features; 65 | } 66 | 67 | /** 68 | * Calculate the probability that document $d belongs to the class 69 | * $class given a set of possible classes, a feature factory and 70 | * the model's weights l[i] 71 | * 72 | * @param $classes The set of possible classes 73 | * @param $ff The feature factory 74 | * @param $d The document 75 | * 
@param string $class A class for which we calculate the probability 76 | * @return float The probability that document $d belongs to class $class 77 | */ 78 | public function P(array $classes,FeatureFactoryInterface $ff,DocumentInterface $d,$class) 79 | { 80 | $exps = array(); 81 | foreach ($classes as $cl) { 82 | $tmp = 0.0; 83 | foreach ($ff->getFeatureArray($cl,$d) as $i) { 84 | $tmp += $this->l[$i]; 85 | } 86 | $exps[$cl] = exp($tmp); 87 | } 88 | 89 | return $exps[$class]/array_sum($exps); 90 | } 91 | 92 | /** 93 | * Not implemented yet. 94 | * Simply put: 95 | * result += log( $this->P(..., ..., ...) ) for every doc in TrainingSet 96 | * 97 | * @throws \Exception 98 | */ 99 | public function CLogLik(TrainingSet $tset,FeatureFactoryInterface $ff) 100 | { 101 | throw new \Exception("Unimplemented"); 102 | } 103 | 104 | /** 105 | * Simply print_r weights. Usefull for some kind of debugging when 106 | * working with small training sets and few features 107 | */ 108 | public function dumpWeights() 109 | { 110 | print_r($this->l); 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/NlpTools/Models/MultinomialNBModelInterface.php: -------------------------------------------------------------------------------- 1 | optimizer = $optimizer; 54 | } 55 | 56 | /** 57 | * Open a pipe to the optimizer, send him the data encoded in json 58 | * and then read the stdout to get the results encoded in json 59 | * 60 | * @param array $feature_array The features that fired for any document for any class @see NlpTools\Models\Maxent 61 | * @return array The optimized weights 62 | */ 63 | public function optimize(array &$feature_array) 64 | { 65 | // whete we will read from where we will write to 66 | $desrciptorspec = array( 67 | 0=>array('pipe','r'), 68 | 1=>array('pipe','w'), 69 | 2=>STDERR // Should that be redirected to /dev/null or like? 
70 | ); 71 | 72 | // Run the optimizer 73 | $process = proc_open($this->optimizer,$desrciptorspec,$pipes); 74 | if (!is_resource($process)) { 75 | return array(); 76 | } 77 | 78 | // send the data 79 | fwrite($pipes[0],json_encode($feature_array)); 80 | fclose($pipes[0]); 81 | 82 | // get the weights 83 | $json = stream_get_contents($pipes[1]); 84 | 85 | // decode as an associative array 86 | $l = json_decode( $json , true ); 87 | 88 | // close up the optimizer 89 | fclose($pipes[1]); 90 | proc_close($process); 91 | 92 | return $l; 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /src/NlpTools/Optimizers/FeatureBasedLinearOptimizerInterface.php: -------------------------------------------------------------------------------- 1 | precision = $precision; 25 | $this->step = $step; 26 | $this->maxiter = $maxiter; 27 | } 28 | 29 | /** 30 | * Should initialize the weights and compute any constant 31 | * expressions needed for the fprime calculation. 32 | * 33 | * @param $feature_array All the data known about the training set 34 | * @param $l The current set of weights to be initialized 35 | * @return void 36 | */ 37 | abstract protected function initParameters(array &$feature_array, array &$l); 38 | /** 39 | * Should calculate any parameter needed by Fprime that cannot be 40 | * calculated by initParameters because it is not constant. 41 | * 42 | * @param $feature_array All the data known about the training set 43 | * @param $l The current set of weights to be initialized 44 | * @return void 45 | */ 46 | abstract protected function prepareFprime(array &$feature_array, array &$l); 47 | /** 48 | * Actually compute the fprime_vector. 
Set for each $l[$i] the 49 | * value of the partial derivative of f for delta $l[$i] 50 | * 51 | * @param $feature_array All the data known about the training set 52 | * @param $l The current set of weights to be initialized 53 | * @return void 54 | */ 55 | abstract protected function Fprime(array &$feature_array, array &$l); 56 | 57 | /** 58 | * Actually do the gradient descent algorithm. 59 | * l[i] = l[i] - learning_rate*( theta f/delta l[i] ) for each i 60 | * Could possibly benefit from a vetor add/scale function. 61 | * 62 | * @param $feature_array All the data known about the training set 63 | * @return array The parameters $l[$i] that minimize F 64 | */ 65 | public function optimize(array &$feature_array) 66 | { 67 | $itercount = 0; 68 | $optimized = false; 69 | $maxiter = $this->maxiter; 70 | $prec = $this->precision; 71 | $step = $this->step; 72 | $l = array(); 73 | $this->initParameters($feature_array,$l); 74 | while (!$optimized && $itercount++!=$maxiter) { 75 | //$start = microtime(true); 76 | $optimized = true; 77 | $this->prepareFprime($feature_array,$l); 78 | $this->Fprime($feature_array,$l); 79 | foreach ($this->fprime_vector as $i=>$fprime_i_val) { 80 | $l[$i] -= $step*$fprime_i_val; 81 | if (abs($fprime_i_val) > $prec) { 82 | $optimized = false; 83 | } 84 | } 85 | //fprintf(STDERR,"%f\n",microtime(true)-$start); 86 | if ($this->verbose>0) 87 | $this->reportProgress($itercount); 88 | } 89 | 90 | return $l; 91 | } 92 | 93 | public function reportProgress($itercount) 94 | { 95 | if ($itercount == 1) { 96 | echo "#\t|Fprime|\n------------------\n"; 97 | } 98 | $norm = 0; 99 | foreach ($this->fprime_vector as $fprime_i_val) { 100 | $norm += $fprime_i_val*$fprime_i_val; 101 | } 102 | $norm = sqrt($norm); 103 | printf("%d\t%.3f\n",$itercount,$norm); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/NlpTools/Optimizers/MaxentGradientDescent.php: 
-------------------------------------------------------------------------------- 1 | numerators = array(); 33 | $this->fprime_vector = array(); 34 | foreach ($feature_array as $doc) { 35 | foreach ($doc as $class=>$features) { 36 | if (!is_array($features)) continue; 37 | foreach ($features as $fi) { 38 | $l[$fi] = 0; 39 | $this->fprime_vector[$fi] = 0; 40 | if (!isset($this->numerators[$fi])) { 41 | $this->numerators[$fi] = 0; 42 | } 43 | } 44 | } 45 | foreach ($doc[$doc['__label__']] as $fi) { 46 | $this->numerators[$fi]++; 47 | } 48 | } 49 | } 50 | 51 | /** 52 | * Compute the denominators which is the predicted expectation of 53 | * each feature given a set of weights L and a set of features for 54 | * each document for each class. 55 | * 56 | * @param $feature_array All the data known about the training set 57 | * @param $l The current set of weights to be initialized 58 | * @return void 59 | */ 60 | protected function prepareFprime(array &$feature_array, array &$l) 61 | { 62 | $this->denominators = array(); 63 | foreach ($feature_array as $offset=>$doc) { 64 | $numerator = array_fill_keys(array_keys($doc),0.0); 65 | $denominator = 0.0; 66 | foreach ($doc as $cl=>$f) { 67 | if (!is_array($f)) continue; 68 | $tmp = 0.0; 69 | foreach ($f as $i) { 70 | $tmp += $l[$i]; 71 | } 72 | $tmp = exp($tmp); 73 | $numerator[$cl] += $tmp; 74 | $denominator += $tmp; 75 | } 76 | foreach ($doc as $class=>$features) { 77 | if (!is_array($features)) continue; 78 | foreach ($features as $fi) { 79 | if (!isset($this->denominators[$fi])) { 80 | $this->denominators[$fi] = 0; 81 | } 82 | $this->denominators[$fi] += $numerator[$class]/$denominator; 83 | } 84 | } 85 | } 86 | } 87 | 88 | /** 89 | * The partial Fprime for each i is 90 | * empirical expectation - predicted expectation . We need to 91 | * maximize the CLogLik (CLogLik is the f whose Fprime we calculate) 92 | * so we instead minimize the -CLogLik. 
93 | * 94 | * See page 28 of http://nlp.stanford.edu/pubs/maxent-tutorial-slides.pdf 95 | * 96 | * @param $feature_array All the data known about the training set 97 | * @param $l The current set of weights to be initialized 98 | * @return void 99 | */ 100 | protected function Fprime(array &$feature_array, array &$l) 101 | { 102 | foreach ($this->fprime_vector as $i=>&$fprime_i_val) { 103 | $fprime_i_val = $this->denominators[$i] - $this->numerators[$i]; 104 | } 105 | } 106 | 107 | } 108 | -------------------------------------------------------------------------------- /src/NlpTools/Optimizers/MaxentOptimizerInterface.php: -------------------------------------------------------------------------------- 1 | rnd = MersenneTwister::get(); 16 | else 17 | $this->rnd = $rnd; 18 | } 19 | 20 | abstract public function sample(); 21 | } 22 | -------------------------------------------------------------------------------- /src/NlpTools/Random/Distributions/Dirichlet.php: -------------------------------------------------------------------------------- 1 | rnd; 25 | $this->gamma = array_map( 26 | function ($a) use ($rnd) { 27 | return new Gamma($a,1,$rnd); 28 | }, 29 | $a 30 | ); 31 | } 32 | 33 | public function sample() 34 | { 35 | $y = array(); 36 | foreach ($this->gamma as $g) { 37 | $y[] = $g->sample(); 38 | } 39 | $sum = array_sum($y); 40 | 41 | return array_map( 42 | function ($y) use ($sum) { 43 | return $y/$sum; 44 | }, 45 | $y 46 | ); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/NlpTools/Random/Distributions/Gamma.php: -------------------------------------------------------------------------------- 1 | scale = $scale; 24 | $this->shape = abs($shape); 25 | if ($this->shape >= 1) 26 | $this->normal = new Normal(0,1,$this->rnd); 27 | else 28 | $this->gamma = new Gamma($this->shape + 1, 1, $this->rnd); 29 | 30 | } 31 | 32 | public function sample() 33 | { 34 | if ($this->shape >= 1) { 35 | $d = $this->shape - 1/3; 
/**
 * Draw a sample from the normal distribution using the Box-Muller
 * transform.
 *
 * The underlying generator is documented to produce numbers in the
 * interval [0,1); $u1 is passed to log() so it must be strictly
 * positive — we redraw until it is, instead of risking log(0) = -INF.
 *
 * @return float A normally distributed value with mean $this->m and
 *               standard deviation $this->sigma
 */
public function sample()
{
    do {
        $u1 = $this->rnd->generate();
    } while ($u1 <= 0);
    $u2 = $this->rnd->generate();

    $r = sqrt(-2*log($u1));
    $theta = 2.0*M_PI*$u2;

    return $this->m + $this->sigma*$r*sin($theta);
}
/**
 * Read a float from the file and return it. Nothing is done to
 * ensure the returned value actually lies in (0,1); non-numeric
 * lines simply cast to 0.0.
 *
 * When the end of the file is reached, the file pointer is rewound
 * so the sequence of numbers repeats.
 *
 * @return float The next number read from the file
 */
public function generate()
{
    // feof() only becomes true *after* a read attempt fails, so the
    // old "check feof, then read" order returned a spurious 0.0 (the
    // cast of a false fgets) once per pass over the file. Check the
    // read result itself instead.
    $line = fgets($this->h);
    if ($line === false) {
        rewind($this->h);
        $line = fgets($this->h);
    }

    return (float) $line;
}
/**
 * Cosine distance is simply 1 - cosine similarity.
 *
 * Since similarity() returns a value in [0,1] (see the class comment:
 * token counts are never negative), the distance is also in [0,1].
 *
 * @param array $A Either a feature vector or a simple token vector
 * @param array $B Either a feature vector or a simple token vector
 * @return float The cosine distance between the two vectors
 * @throws \InvalidArgumentException If either argument is not an array
 *                                   or is the zero vector (propagated
 *                                   from similarity())
 */
public function dist(&$A, &$B)
{
    return 1-$this->similarity($A,$B);
}
/**
 * Compute the locality sensitive hash for this set.
 *
 * Maintain a vector ($boxes) of length $this->length initialized to
 * 0. Each member of the set is hashed to a {$this->length}-bit
 * vector. For each of these bits we either increment or decrement
 * the corresponding $boxes dimension depending on the bit being
 * either 1 or 0. Finally the signs of each dimension of the boxes
 * vector form the locality sensitive hash.
 *
 * We have departed from the original implementation at the
 * following points:
 *  1. Each feature has a weight of 1, but feature duplication is
 *     allowed (duplicates become integer weights via
 *     array_count_values()).
 *
 * @param array $set Either a plain token list (integer keys) or a
 *                   member => weight map (string keys)
 * @return string The bits of the hash as a string of '0'/'1' chars
 */
public function simhash(array &$set)
{
    // One vote accumulator ("box") per output bit.
    $boxes = array_fill(0,$this->length,0);
    // Integer keys mean a plain token list: collapse duplicates into
    // counts so multiplicity acts as the member's weight. String keys
    // mean the caller already supplied member => weight pairs.
    if (is_int(key($set)))
        $dict = array_count_values($set);
    else
        $dict = &$set;
    foreach ($dict as $m=>$w) {
        // $this->h hashes the member to a bit string of length
        // $this->length (see constructor default).
        $h = call_user_func($this->h,$m);
        for ($bit_idx=0;$bit_idx<$this->length;$bit_idx++) {
            // Vote: a 1 bit pushes the box up by the weight, a 0 bit
            // pushes it down.
            $boxes[$bit_idx] += ($h[$bit_idx]=='1') ? $w : -$w;
        }
    }
    // The hash is the sign of each box: positive => '1', else '0'.
    $s = '';
    foreach ($boxes as $box) {
        if ($box>0)
            $s .= '1';
        else
            $s .= '0';
    }

    return $s;
}
/**
 * Compute the hamming distance between the simhashes of two sets,
 * i.e. the number of bit positions at which the two hashes differ.
 *
 * @param array $A
 * @param array $B
 * @return int In the range [0, $this->length]
 */
public function dist(&$A, &$B)
{
    $hashA = $this->simhash($A);
    $hashB = $this->simhash($B);

    $differing = 0;
    foreach (str_split($hashA) as $pos => $bit) {
        if ($bit != $hashB[$pos]) {
            $differing++;
        }
    }

    return $differing;
}
/**
 * Compute the symmetric Tversky index of the two sets using the
 * alpha and beta parameters given in the constructor:
 *
 *   |A ∩ B| / ( |A ∩ B| + beta*( alpha*min + (1-alpha)*max ) )
 *
 * where min/max are taken over the sizes of the two set differences.
 *
 * @param array $A
 * @param array $B
 * @return float
 */
public function similarity(&$A, &$B)
{
    $setA = array_fill_keys($A, 1);
    $setB = array_fill_keys($B, 1);

    $onlyInA = count(array_diff_key($setA, $setB));
    $onlyInB = count(array_diff_key($setB, $setA));
    $smaller = min($onlyInA, $onlyInB);
    $larger = max($onlyInA, $onlyInB);

    $common = count(array_intersect_key($setA, $setB));

    $weightedDiff = $this->beta * ($this->alpha * $smaller + (1 - $this->alpha) * $larger);

    return $common / ($common + $weightedDiff);
}
/**
 * Tokenize the string.
 *
 * 1. Break up the string in subtokens using the initial tokenizer
 * 2. Classify each subtoken as ending a word (EOW) or not (O)
 * 3. Join every run of O subtokens with the following EOW subtoken
 *    using the configured separator
 *
 * @param string $str The character sequence to be broken in tokens
 * @return array The token array
 */
public function tokenize($str)
{
    // split the string in subtokens and create a document for each
    // one (with 5 subtokens of surrounding context) to be classified
    $tokens = $this->tok->tokenize($str);
    $docs = array();
    foreach ($tokens as $offset=>$tok) {
        $docs[] = new WordDocument($tokens,$offset,5);
    }

    // classify each subtoken as an EOW or O
    $tags = array();
    foreach ($docs as $doc) {
        $tags[] = $this->classifier->classify(self::$classSet, $doc);
    }

    // merge O and EOW subtokens into real tokens
    $realtokens = array();
    $currentToken = array();
    foreach ($tokens as $offset=>$tok) {
        $currentToken[] = $tok;
        if ($tags[$offset] == self::EOW) {
            $realtokens[] = implode($this->sep,$currentToken);
            $currentToken = array();
        }
    }

    // if the classifier never emitted a final EOW, the trailing
    // partial token used to be silently dropped — flush it instead
    if (!empty($currentToken)) {
        $realtokens[] = implode($this->sep,$currentToken);
    }

    // return real tokens
    return $realtokens;
}
/**
 * Initializes the patterns and their replacements.
 *
 * The rules are a PCRE port of the Penn Treebank tokenizer sed
 * script: punctuation is split off, double quotes become `` / ''
 * pairs, and common English contractions/clitics are separated.
 */
protected function initPatternReplacement()
{
    $this->addPatternAndReplacement('^"', '``');
    // NOTE(review): the sed-style escaped parens below match *literal*
    // "(" and ")" characters under PCRE rather than grouping; verify
    // this rule against the original sed script before changing it.
    $this->addPatternAndReplacement("\([ ([{<]\)","$1 `` ");
    $this->addPatternAndReplacement("\.\.\."," ... ");
    $this->addPatternAndReplacement("([,;:@#$%&])", " $1 ");
    $this->addPatternAndReplacement("([^.])([.])([])}>\"\']*)[ ]*$","\${1} \${2}\${3}");
    $this->addPatternAndReplacement("[?!]"," $0 ");
    $this->addPatternAndReplacement("[][(){}<>]"," $0 ");
    $this->addPatternAndReplacement("--"," -- ");
    $this->addPatternAndReplacement("\""," '' ");

    $this->addPatternAndReplacement("([^'])' ","\${1} ' ");
    $this->addPatternAndReplacement("'([sSmMdD]) "," '\${1} ");
    $this->addPatternAndReplacement("'ll "," 'll ");
    $this->addPatternAndReplacement("'re "," 're ");
    $this->addPatternAndReplacement("'ve "," 've ");
    $this->addPatternAndReplacement("n't "," n't ");
    $this->addPatternAndReplacement("'LL "," 'LL ");
    $this->addPatternAndReplacement("'RE "," 'RE ");
    $this->addPatternAndReplacement("'VE "," 'VE ");
    $this->addPatternAndReplacement("N'T "," N'T ");

    // "\1" inside a double-quoted PHP string is the octal escape
    // chr(1), NOT a backreference, so the old replacement inserted a
    // control character; use the escaped \${1} form like every
    // sibling rule does.
    $this->addPatternAndReplacement(" ([Cc])annot "," \${1}an not ");
    $this->addPatternAndReplacement(" ([Dd])'ye "," \${1}' ye ");
    $this->addPatternAndReplacement(" ([Gg])imme "," \${1}im me ");
    $this->addPatternAndReplacement(" ([Gg])onna "," \${1}on na ");
    $this->addPatternAndReplacement(" ([Gg])otta "," \${1}ot ta ");
    $this->addPatternAndReplacement(" ([Ll])emme "," \${1}em me ");
    $this->addPatternAndReplacement(" ([Mm])ore'n "," \${1}ore 'n ");
    $this->addPatternAndReplacement(" '([Tt])is "," '\${1} is ");
    $this->addPatternAndReplacement(" '([Tt])was "," '\${1} was ");
    $this->addPatternAndReplacement(" ([Ww])anna "," \${1}an na ");

    // collapse runs of spaces and strip leading whitespace
    $this->addPatternAndReplacement(" *"," ");
    $this->addPatternAndReplacement("^ *","");

}
/**
 * Execute the SPLIT mode: each current token is split on $pattern
 * and the pieces (minus empty strings) become the new token list.
 *
 * @param array  &$str    The tokens to be further tokenized
 * @param string $pattern A preg_split() compatible pattern
 */
protected function split(array &$str, $pattern)
{
    $tokens = array();
    foreach ($str as $s) {
        $tokens = array_merge(
            $tokens,
            // -1 means "no limit"; passing null for the int $limit
            // parameter is deprecated as of PHP 8.1
            preg_split($pattern, $s, -1, PREG_SPLIT_NO_EMPTY)
        );
    }

    $str = $tokens;
}
/**
 * Register a set of transformations for a given class.
 *
 * All candidates are validated before any of them is stored, so a
 * bad entry can never leave a partially registered set behind.
 *
 * @param string $class
 * @param array|TransformationInterface $transforms Either an array of transformations or a single transformation
 * @throws \InvalidArgumentException If any candidate is not a TransformationInterface
 */
public function register($class, $transforms)
{
    $candidates = is_array($transforms) ? $transforms : array($transforms);

    // validate first ...
    foreach ($candidates as $candidate) {
        if (!($candidate instanceof TransformationInterface)) {
            throw new \InvalidArgumentException("Only instances of TransformationInterface can be registered");
        }
    }

    if (!isset($this->transforms[$class])) {
        $this->classes[] = $class;
        $this->transforms[$class] = array();
    }

    // ... commit afterwards
    $this->transforms[$class] = array_merge($this->transforms[$class], $candidates);
}
abstract class Normalizer implements TransformationInterface
{
    /**
     * Transform the word to its canonical (normalized) form, e.g.
     * "The" -> "the", "WhAtEvEr" -> "whatever".
     *
     * @param string $w The word to normalize
     * @return string
     */
    abstract public function normalize($w);

    /**
     * {@inheritdoc}
     */
    public function transform($w)
    {
        return $this->normalize($w);
    }

    /**
     * Apply the normalize function to every item in the array.
     *
     * @param array $items
     * @return array
     */
    public function normalizeAll(array $items)
    {
        $normalized = array();
        foreach ($items as $key => $item) {
            $normalized[$key] = $this->normalize($item);
        }

        return $normalized;
    }

    /**
     * Convenience factory that instantiates a normalizer by language
     * name. Using it is NOT required — the constructors remain
     * visible.
     *
     * @param string $language
     * @return Normalizer
     */
    public static function factory($language = "English")
    {
        $classname = __NAMESPACE__."\\$language";

        return new $classname();
    }
}
/**
 * A token's weight should be its count divided by the total number
 * of tokens: 'an' occurs once among 9 tokens, so 1/9.
 *
 * NOTE(review): 0.111 is compared without an explicit delta —
 * presumably FreqDist::getTokenWeight() rounds to three decimals;
 * confirm against the FreqDist implementation.
 */
public function testSimpleFreqWeight()
{
    $freqDist = new FreqDist(array("time", "flies", "like", "an", "arrow", "time", "flies", "like", "what"));
    $this->assertEquals(1, $freqDist->getTotalByToken('an'));
    $this->assertEquals(0.111, $freqDist->getTokenWeight('an'));
}
0.001 39 | ); 40 | $this->assertEquals( 41 | 1.098, 42 | $idf["non-existing"], 43 | null, 44 | 0.001 45 | ); 46 | $this->assertEquals( 47 | 0, 48 | $idf["a"] 49 | ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tests/NlpTools/Classifiers/EndOfSentenceRules.php: -------------------------------------------------------------------------------- 1 | getDocumentData(); 12 | 13 | $dotcnt = count(explode('.',$token))-1; 14 | $lastdot = substr($token,-1)=='.'; 15 | 16 | if (!$lastdot) // assume that all sentences end in full stops 17 | return 'O'; 18 | 19 | if ($dotcnt>1) // to catch some naive abbreviations (e.g.: U.S.A.) 20 | return 'O'; 21 | 22 | return 'EOW'; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/NlpTools/Clustering/ClusteringTestBase.php: -------------------------------------------------------------------------------- 1 | 0) ? 1 : 0; }; 14 | $pulse = function ($x,$a,$b) use ($u) { return $u($x-$a)-$u($x-$b); }; 15 | 16 | return array( 17 | (int) ( 255*( $pulse($t,0,1/3) + $pulse($t,1/3,2/3)*(2-3*$t) ) ), 18 | (int) ( 255*( $pulse($t,0,1/3)*3*$t + $pulse($t,1/3,2/3) + $pulse($t,2/3,1)*(3-3*$t) ) ), 19 | (int) ( 255*( $pulse($t,1/3,2/3)*(3*$t-1) + $pulse($t,2/3,1) ) ) 20 | ); 21 | } 22 | 23 | /** 24 | * Return a gd handle with a visualization of the clustering or null in case gd is not present. 
/**
 * Return a gd image handle with a visualization of the clustering,
 * or null in case the gd extension is not present.
 *
 * @param mixed      $tset      Array-accessible set of documents whose
 *                              getDocumentData() exposes 'x'/'y' keys
 * @param array      $clusters  Arrays of document indices, one per cluster
 * @param array|null $centroids Optional per-cluster arrays with 'x','y' keys
 * @param bool       $lines     Draw each centroid as a thick line from the
 *                              origin (for cosine-style centroids) instead
 *                              of a small circle (euclidean)
 * @param int        $emphasize If >0, draw each point as a filled disc of
 *                              this diameter instead of a single pixel
 * @param int        $w         Image width in pixels
 * @param int        $h         Image height in pixels
 * @return resource|null The gd handle or null when gd is unavailable
 */
protected function drawClusters($tset, $clusters, $centroids=null, $lines=False,$emphasize=0,$w=300,$h=200)
{
    if (!function_exists('imagecreate'))
        return null;

    $im = imagecreatetruecolor($w,$h);
    $white = imagecolorallocate($im,255,255,255);
    $colors = array();
    $NC = count($clusters);
    // one distinct color per cluster, sampled evenly along the color map
    for ($i=1;$i<=$NC;$i++) {
        list($r,$g,$b) = $this->getColor($i/$NC);
        $colors[] = imagecolorallocate($im,$r,$g,$b);
    }

    imagefill($im,0,0,$white);
    foreach ($clusters as $cid=>$cluster) {
        foreach ($cluster as $idx) {
            $data = $tset[$idx]->getDocumentData();
            if ($emphasize>0)
                imagefilledarc($im,$data['x'],$data['y'],$emphasize,$emphasize,0,360,$colors[$cid],0);
            else
                imagesetpixel($im,$data['x'],$data['y'],$colors[$cid]);
        }
        if (is_array($centroids)) {
            $x = $centroids[$cid]['x'];
            $y = $centroids[$cid]['y'];
            if ($lines) {
                // draw line
                // for cosine similarity
                imagesetthickness($im,5);
                imageline($im,0,0,$x*400,$y*400,$colors[$cid]);
            } else {
                // draw circle for euclidean
                imagefilledarc($im,$x,$y,10,10,0,360,$colors[$cid],0);
            }
        }
    }

    return $im;
}
/**
 * Return a gd image handle with a visualization of the given
 * dendrogram, or null if gd is not present.
 *
 * Leaves are laid out evenly along the bottom edge; each internal
 * node is drawn one vertical step above its deepest child, with
 * bracket-shaped blue lines joining the two subclusters.
 *
 * @param mixed $tset       The clustered set (only its count is used,
 *                          to space the leaves)
 * @param array $dendrogram Nested pair-arrays of leaf labels
 * @param int   $w          Image width in pixels
 * @param int   $h          Image height in pixels
 * @return resource|null The gd handle or null when gd is unavailable
 */
protected function drawDendrogram($tset, $dendrogram, $w=300, $h=200)
{
    if (!function_exists('imagecreate'))
        return null;

    $im = imagecreatetruecolor($w,$h);
    $white = imagecolorallocate($im, 255,255,255);
    $black = imagecolorallocate($im, 0,0,0);
    $blue = imagecolorallocate($im, 0,0,255);
    imagefill($im, 0,0, $white);

    // padding 5%
    $padding = round(0.05*$w);
    // equally distribute the leaves along the horizontal axis
    $d = ($w-2*$padding)/count($tset);
    // recursive closure: depth of the nested-array dendrogram
    // (a bare leaf counts as depth 1)
    $count_depth = function ($a) use (&$depth, &$count_depth) {
        if (is_array($a)) {
            return max(
                array_map(
                    $count_depth,
                    $a
                )
            ) + 1;
        } else {
            return 1;
        }
    };
    $depth = $count_depth($dendrogram)-1;
    // vertical step per merge level
    $d_v = ($h-2*$padding)/$depth;

    // offset from bottom
    $y = $h-$padding;
    $left = $padding;

    // recursively draw a subcluster; returns the (x, y) anchor where
    // the parent bracket should attach. $left advances one slot per leaf.
    $draw_subcluster = function ($dendrogram, &$left) use (&$im, $d, $y, $d_v, $black, &$draw_subcluster,$blue) {
        if (!is_array($dendrogram)) {
            imagestring($im, 1, $left-(2 * strlen($dendrogram)), $y, $dendrogram, $black);
            $left += $d;

            return array($left - $d,$y-5);
        }
        list($l,$yl) = $draw_subcluster($dendrogram[0],$left);
        list($r,$yr) = $draw_subcluster($dendrogram[1],$left);
        // the joining bar sits one step above the higher child
        $ym = min($yl,$yr)-$d_v;
        imageline($im, $l, $yl, $l, $ym, $blue);
        imageline($im, $r, $yr, $r, $ym, $blue);
        imageline($im, $l, $ym, $r, $ym, $blue);

        return array($l+($r-$l)/2,$ym);
    };

    // a single-element outer array wraps the real root
    if (count($dendrogram)==1)
        $draw_subcluster($dendrogram[0],$left);
    else
        $draw_subcluster($dendrogram,$left);

    return $im;
}
/**
 * Cluster the points (0,1),(1,1),(2,1),(3,1),(4,1),(7,1) with the
 * complete link strategy and check that the merges are produced in
 * the expected order; once every merge has been consumed the
 * strategy's heap must be empty and throw.
 */
public function testCompleteLink()
{
    $docs = array(
        array('x'=>0,'y'=>1),
        array('x'=>1,'y'=>1),
        array('x'=>2,'y'=>1),
        array('x'=>3,'y'=>1),
        array('x'=>4,'y'=>1),
        array('x'=>7,'y'=>1)
    );

    $cl = new CompleteLink();
    $cl->initializeStrategy(new Euclidean(), $docs);

    $expectedMerges = array(
        array(0,1),
        array(2,3),
        array(2,4),
        array(0,2),
        array(0,5),
    );
    foreach ($expectedMerges as $expected) {
        $this->assertEquals($expected, $cl->getNextMerge());
    }

    $this->setExpectedException(
        "RuntimeException",
        "Can't extract from an empty heap"
    );
    $cl->getNextMerge();
}
/**
 * dendrogramToClusters() should flatten a nested dendrogram into
 * exactly the requested number of flat clusters.
 */
public function testDendrogramToClusters()
{
    $cases = array(
        array(
            array(array(0,1),array(array(2,3),4)),
            array(array(0,1),array(2,3,4))
        ),
        array(
            array(array(0,array(1,array(2,array(3,array(4,array(5,array(6,7)))))))),
            array(array(0),array(1),array(2),array(3,4,5,6,7))
        )
    );

    foreach ($cases as $i=>$case) {
        list($dendrogram, $expected) = $case;
        $actual = Hierarchical::dendrogramToClusters(
            $dendrogram,
            count($expected)
        );
        $this->assertEquals(
            $expected,
            $actual,
            "Error transforming dendrogram $i"
        );
    }
}
$hc->cluster($tset,new DataAsFeatures()); 332 | $dg = $this->drawDendrogram( 333 | $tset, 334 | $dendrogram, 335 | 600 // width 336 | ); 337 | 338 | $clusters = Hierarchical::dendrogramToClusters($dendrogram,2); 339 | $im = $this->drawClusters( 340 | $tset, 341 | $clusters, 342 | null, // no centroids 343 | false, // no lines 344 | 10 // emphasize points (for little points) 345 | ); 346 | 347 | if ($dg) 348 | imagepng($dg, TEST_DATA_DIR."/Clustering/HierarchicalTest/dendrogram.png"); 349 | if ($im) 350 | imagepng($im, TEST_DATA_DIR."/Clustering/HierarchicalTest/clusters.png"); 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /tests/NlpTools/Clustering/KmeansTest.php: -------------------------------------------------------------------------------- 1 | addDocument( 35 | 'A', 36 | EuclideanPoint::getRandomPointAround(100,100,45) 37 | ); 38 | } 39 | for ($i=0;$i<500;$i++) { 40 | $tset->addDocument( 41 | 'B', 42 | EuclideanPoint::getRandomPointAround(200,100,45) 43 | ); 44 | } 45 | 46 | list($clusters,$centroids,$distances) = $clust->cluster($tset,new DataAsFeatures()); 47 | 48 | $im = $this->drawClusters( 49 | $tset, 50 | $clusters, 51 | $centroids, 52 | false // lines or not 53 | ); 54 | 55 | if ($im) 56 | imagepng($im,TEST_DATA_DIR."/Clustering/KmeansTest/clusters.png"); 57 | 58 | // since the dataset is artificial and clearly separated, the kmeans 59 | // algorithm should always cluster it correctly 60 | foreach ($clusters as $clust) { 61 | $classes = array(); 62 | foreach ($clust as $point_idx) { 63 | $class = $tset[$point_idx]->getClass(); 64 | if (!isset($classes[$class])) 65 | $classes[$class] = true; 66 | } 67 | // assert that all the documents (points) in this cluster belong 68 | // in the same class 69 | $this->assertCount( 70 | 1, 71 | $classes 72 | ); 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /tests/NlpTools/Documents/EuclideanPoint.php: 
-------------------------------------------------------------------------------- 1 | x = $x; 15 | $this->y = $y; 16 | } 17 | public function getDocumentData() 18 | { 19 | return array( 20 | 'x'=>$this->x, 21 | 'y'=>$this->y 22 | ); 23 | } 24 | 25 | public static function getRandomPointAround($x,$y,$R) 26 | { 27 | return new EuclideanPoint( 28 | $x+mt_rand(-$R,$R), 29 | $y+mt_rand(-$R,$R) 30 | ); 31 | } 32 | 33 | public function applyTransformation(TransformationInterface $transform) 34 | { 35 | $this->x = $transform->transform($this->x); 36 | $this->y = $transform->transform($this->y); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/NlpTools/Documents/TransformationsTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 24 | $tokens, 25 | $doc->getDocumentData() 26 | ); 27 | $doc->applyTransformation($transformer); 28 | $this->assertEquals( 29 | $tokens, 30 | $doc->getDocumentData() 31 | ); 32 | 33 | $tdoc = new TrainingDocument("", new TokensDocument($tokens)); 34 | $tdoc->applyTransformation($transformer); 35 | $this->assertEquals( 36 | $tokens, 37 | $tdoc->getDocumentData() 38 | ); 39 | } 40 | 41 | /** 42 | * @dataProvider provideTokens 43 | */ 44 | public function testWordDocument($tokens) 45 | { 46 | $transformer = new IdentityTransformer(); 47 | $doc = new WordDocument($tokens,count($tokens)/2, 2); 48 | $correct = $doc->getDocumentData(); 49 | $doc->applyTransformation($transformer); 50 | $this->assertEquals( 51 | $correct, 52 | $doc->getDocumentData() 53 | ); 54 | 55 | $tdoc = new TrainingDocument("", new WordDocument($tokens,count($tokens)/2, 2)); 56 | $tdoc->applyTransformation($transformer); 57 | $this->assertEquals( 58 | $correct, 59 | $tdoc->getDocumentData() 60 | ); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /tests/NlpTools/Documents/WordDocumentTest.php: 
-------------------------------------------------------------------------------- 1 | tokens = array("The","quick","brown","fox","jumped","over","the","lazy","dog"); 15 | } 16 | 17 | /** 18 | * Test that the WordDocument correctly represents the ith token 19 | */ 20 | public function testTokenSelection() 21 | { 22 | foreach ($this->tokens as $i=>$t) { 23 | // no context 24 | $doc = new WordDocument($this->tokens, $i, 0); 25 | list($w,$prev,$next) = $doc->getDocumentData(); 26 | 27 | $this->assertEquals( 28 | $t, 29 | $w, 30 | "The {$i}th token should be $t not $w" 31 | ); 32 | 33 | // no context means prev,next are empty 34 | $this->assertCount( 35 | 0, 36 | $prev 37 | ); 38 | $this->assertCount( 39 | 0, 40 | $next 41 | ); 42 | } 43 | } 44 | 45 | /** 46 | * Start with the 5th word and increase the amount of context 47 | * until it reaches the edges of the token list. Check the 48 | * previous tokens. 49 | */ 50 | public function testPrevContext() 51 | { 52 | for ($i=0;$i<5;$i++) { 53 | $doc = new WordDocument($this->tokens, 4, $i); 54 | list($_,$prev,$_) = $doc->getDocumentData(); 55 | 56 | $this->assertCount( 57 | $i, 58 | $prev, 59 | "With $i words context prev should be $i words long" 60 | ); 61 | for ( 62 | $j=3,$y=$i-1; 63 | $j>=4-$i; 64 | $y--,$j--) { 65 | $this->assertEquals( 66 | $this->tokens[$j], 67 | $prev[$y] 68 | ); 69 | } 70 | } 71 | } 72 | 73 | /** 74 | * Start with the 5th word and increase the amount of context 75 | * until it reaches the edges of the token list. Check the 76 | * next tokens. 
77 | */ 78 | public function testNextContext() 79 | { 80 | for ($i=0;$i<5;$i++) { 81 | $doc = new WordDocument($this->tokens, 4, $i); 82 | list($_,$_,$next) = $doc->getDocumentData(); 83 | 84 | $this->assertCount( 85 | $i, 86 | $next, 87 | "With $i words context next should be $i words long" 88 | ); 89 | for ($j=5; $j<5+$i; $j++) { 90 | $this->assertEquals( 91 | $this->tokens[$j], 92 | $next[$j-5] 93 | ); 94 | } 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /tests/NlpTools/Models/LdaTest.php: -------------------------------------------------------------------------------- 1 | markTestSkipped("The gd library is not available"); 28 | } 29 | 30 | $this->path = TEST_DATA_DIR."/Models/LdaTest"; 31 | if (!file_exists($this->path)) { 32 | if (!file_exists(TEST_DATA_DIR."/Models")) 33 | mkdir(TEST_DATA_DIR."/Models"); 34 | mkdir($this->path); 35 | } 36 | 37 | if (!file_exists("{$this->path}/topics")) { 38 | mkdir("{$this->path}/topics"); 39 | } 40 | $this->createTopics(); 41 | 42 | if (!file_exists("{$this->path}/data")) { 43 | mkdir("{$this->path}/data"); 44 | } 45 | if (count(new \DirectoryIterator("{$this->path}/data"))<502) { 46 | $this->createData(); 47 | } 48 | 49 | if (!file_exists("{$this->path}/results")) { 50 | mkdir("{$this->path}/results"); 51 | } 52 | 53 | $this->loadData(); 54 | } 55 | 56 | /** 57 | * @group Slow 58 | * @group VerySlow 59 | */ 60 | public function testLda() 61 | { 62 | $lda = new Lda( 63 | new DataAsFeatures(), // feature factory 64 | 10, // number of topics 65 | 1, // dirichlet prior per doc topic dist 66 | 1 // dirichlet prior per word topic dist 67 | ); 68 | 69 | $this->assertInstanceOf( 70 | "NlpTools\Models\Lda", 71 | $lda 72 | ); 73 | 74 | $docs = $lda->generateDocs($this->tset); 75 | $this->assertCount( 76 | count($this->tset), 77 | $docs 78 | ); 79 | 80 | $lda->initialize($docs); 81 | 82 | for ($i=0;$i<100;$i++) { 83 | $lda->gibbsSample($docs); 84 | $topics = $lda->getPhi(); 
85 | echo $lda->getLogLikelihood(),PHP_EOL; 86 | foreach ($topics as $t=>$topic) { 87 | $name = sprintf("{$this->path}/results/topic-%04d-%04d",$i,$t); 88 | $max = max($topic); 89 | $this->createImage( 90 | array_map( 91 | function ($x) use ($topic,$max) { 92 | return array_map( 93 | function ($y) use ($x,$topic,$max) { 94 | return (int) (($topic[$y*5+$x]/$max)*255); 95 | }, 96 | range(0,4) 97 | ); 98 | }, 99 | range(0,4) 100 | ), 101 | $name 102 | ); 103 | } 104 | } 105 | 106 | // TODO: assert the resemblance of the inferred topics 107 | // with the actual topics 108 | } 109 | 110 | // WARNING: Massive set up code follows 111 | // Lda is one of the hardest models to test. 112 | // This functional test is the test the creators of Lda 113 | // performed themselves. 114 | // 115 | // TODO: Unit testing for lda is needed 116 | 117 | protected function createTopics() 118 | { 119 | $topics = array( 120 | array( 121 | array(1,1,1,1,1), 122 | array(0,0,0,0,0), 123 | array(0,0,0,0,0), 124 | array(0,0,0,0,0), 125 | array(0,0,0,0,0) 126 | ), 127 | array( 128 | array(0,0,0,0,0), 129 | array(1,1,1,1,1), 130 | array(0,0,0,0,0), 131 | array(0,0,0,0,0), 132 | array(0,0,0,0,0) 133 | ), 134 | array( 135 | array(0,0,0,0,0), 136 | array(0,0,0,0,0), 137 | array(1,1,1,1,1), 138 | array(0,0,0,0,0), 139 | array(0,0,0,0,0) 140 | ), 141 | array( 142 | array(0,0,0,0,0), 143 | array(0,0,0,0,0), 144 | array(0,0,0,0,0), 145 | array(1,1,1,1,1), 146 | array(0,0,0,0,0) 147 | ), 148 | array( 149 | array(0,0,0,0,0), 150 | array(0,0,0,0,0), 151 | array(0,0,0,0,0), 152 | array(0,0,0,0,0), 153 | array(1,1,1,1,1) 154 | ), 155 | array( 156 | array(0,0,0,0,1), 157 | array(0,0,0,0,1), 158 | array(0,0,0,0,1), 159 | array(0,0,0,0,1), 160 | array(0,0,0,0,1) 161 | ), 162 | array( 163 | array(0,0,0,1,0), 164 | array(0,0,0,1,0), 165 | array(0,0,0,1,0), 166 | array(0,0,0,1,0), 167 | array(0,0,0,1,0) 168 | ), 169 | array( 170 | array(0,0,1,0,0), 171 | array(0,0,1,0,0), 172 | array(0,0,1,0,0), 173 | 
array(0,0,1,0,0), 174 | array(0,0,1,0,0) 175 | ), 176 | array( 177 | array(0,1,0,0,0), 178 | array(0,1,0,0,0), 179 | array(0,1,0,0,0), 180 | array(0,1,0,0,0), 181 | array(0,1,0,0,0) 182 | ), 183 | array( 184 | array(1,0,0,0,0), 185 | array(1,0,0,0,0), 186 | array(1,0,0,0,0), 187 | array(1,0,0,0,0), 188 | array(1,0,0,0,0) 189 | ) 190 | ); 191 | 192 | $this->topics = array_map( 193 | function ($topic) { 194 | $t = call_user_func_array( 195 | "array_merge", 196 | $topic 197 | ); 198 | 199 | $s = array_sum($t); 200 | 201 | return array_map( 202 | function ($ti) use ($s) { 203 | return $ti/$s; 204 | }, 205 | $t 206 | ); 207 | }, 208 | $topics 209 | ); 210 | 211 | // multiply by 255 to make gray-scale images of 212 | // the above arrays 213 | $topics = array_map( 214 | function ($topic) { 215 | return array_map( 216 | function ($row) { 217 | return array_map( 218 | function ($pixel) { 219 | return (int) (255*$pixel); 220 | }, 221 | $row 222 | ); 223 | }, 224 | $topic 225 | ); 226 | }, 227 | $topics 228 | ); 229 | 230 | // save them to disk 231 | foreach ($topics as $key=>$topic) { 232 | $this->createImage($topic, "{$this->path}/topics/topic-$key"); 233 | } 234 | } 235 | 236 | protected function createData() 237 | { 238 | $dir = new Dirichlet(1, count($this->topics)); 239 | 240 | for ($i=0;$i<500;$i++) { 241 | $d = $this->createDocument($this->topics, $dir->sample(), 100); 242 | $this->createImage($d, "{$this->path}/data/$i"); 243 | } 244 | } 245 | 246 | protected function loadData() 247 | { 248 | $this->tset = new TrainingSet(); 249 | foreach (new \DirectoryIterator("{$this->path}/data") as $f) { 250 | if ($f->isDir()) 251 | continue; 252 | 253 | $this->tset->addDocument( 254 | "", 255 | new TokensDocument( 256 | $this->fromImg($f->getRealPath()) 257 | ) 258 | ); 259 | } 260 | } 261 | 262 | /** 263 | * Save a two dimensional array as a grey-scale image 264 | */ 265 | protected function createImage(array $img,$filename) 266 | { 267 | $im = 
imagecreate(count($img),count(current($img))); 268 | imagecolorallocate($im,0,0,0); 269 | foreach ($img as $y=>$row) { 270 | foreach ($row as $x=>$color) { 271 | $color = min(255,max(0,$color)); 272 | $c = imagecolorallocate($im,$color,$color,$color); 273 | imagesetpixel($im,$x,$y,$c); 274 | } 275 | } 276 | imagepng($im,$filename); 277 | } 278 | 279 | /** 280 | * Draw once from a multinomial distribution 281 | */ 282 | protected function draw($d) 283 | { 284 | $mt = MersenneTwister::get(); // simply mt_rand but in the interval [0,1) 285 | $x = $mt->generate(); 286 | $p = 0.0; 287 | foreach ($d as $i=>$v) { 288 | $p+=$v; 289 | if ($p > $x) 290 | return $i; 291 | } 292 | } 293 | 294 | /** 295 | * Create a document sticking to the model's assumptions 296 | * and hypotheses 297 | */ 298 | public function createDocument($topic_dists,$theta,$length) 299 | { 300 | $doc = array_fill_keys(range(0,24),0); 301 | while ($length-- > 0) { 302 | $topic = $this->draw($theta); 303 | $word = $this->draw($topic_dists[$topic]); 304 | $doc[$word] += 1; 305 | } 306 | 307 | return array_map( 308 | function ($start) use ($doc) { 309 | return array_slice($doc,$start,5); 310 | }, 311 | range(0,24,5) 312 | ); 313 | } 314 | 315 | /** 316 | * Load a document from an image saved to disk 317 | */ 318 | public function fromImg($file) 319 | { 320 | $im = imagecreatefrompng($file); 321 | $d = array(); 322 | for ($w=0;$w<25;$w++) { 323 | $x = (int) ($w%5); 324 | $y = (int) ($w/5); 325 | 326 | $c = imagecolorsforindex($im,imagecolorat($im,$x,$y)); 327 | $c = $c['red']; 328 | if ($c>0) { 329 | $d = array_merge( 330 | $d, 331 | array_fill_keys( 332 | range(0,$c-1), 333 | $w 334 | ) 335 | ); 336 | } 337 | } 338 | 339 | return $d; 340 | } 341 | 342 | } 343 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/CosineSimilarityTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 16 | 1, 17 | 
$sim->similarity($A,$A), 18 | "The cosine similarity of a set/vector with itsself should be 1" 19 | ); 20 | 21 | $this->assertEquals( 22 | 1, 23 | $sim->similarity($A,$A_times_2), 24 | "The cosine similarity of a vector with a linear combination of itsself should be 1" 25 | ); 26 | 27 | $this->assertEquals( 28 | 0, 29 | $sim->similarity($A,$B)-$sim->similarity($A_times_2,$B), 30 | "Parallel vectors should have the same angle with any vector B" 31 | ); 32 | } 33 | 34 | public function testProducedAngles() 35 | { 36 | $sim = new CosineSimilarity(); 37 | 38 | $ba = array(1,1,2,2,2,2); // ba = (2,4) 39 | $bc = array(1,1,1,2,2); // bc = (3,2) 40 | $bba = array('a'=>2,'b'=>4); 41 | $bbc = array('a'=>3,'b'=>2); 42 | $ba_to_bc = cos(0.5191461142); // approximately 30 deg 43 | 44 | $this->assertEquals( 45 | $ba_to_bc, 46 | $sim->similarity($ba,$bc) 47 | ); 48 | 49 | $this->assertEquals( 50 | $ba_to_bc, 51 | $sim->similarity($bba,$bbc) 52 | ); 53 | } 54 | 55 | public function testInvalidArgumentException() 56 | { 57 | $sim = new CosineSimilarity(); 58 | $a = array(1); 59 | $zero = array(); 60 | try { 61 | $sim->similarity( 62 | $a, 63 | $zero 64 | ); 65 | $this->fail("Cosine similarity with the zero vector should trigger an exception"); 66 | } catch (\InvalidArgumentException $e) { 67 | $this->assertEquals( 68 | "Vector \$B is the zero vector", 69 | $e->getMessage() 70 | ); 71 | } 72 | try { 73 | $sim->similarity( 74 | $zero, 75 | $a 76 | ); 77 | $this->fail("Cosine similarity with the zero vector should trigger an exception"); 78 | } catch (\InvalidArgumentException $e) { 79 | $this->assertEquals( 80 | "Vector \$A is the zero vector", 81 | $e->getMessage() 82 | ); 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/DiceSimilarityTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 16 | 1, 17 | $sim->similarity($A,$A), 18 | "The similarity 
of a set with itsself is 1" 19 | ); 20 | 21 | $this->assertEquals( 22 | 0, 23 | $sim->similarity($A,$e), 24 | "The similarity of any set with the empty set is 0" 25 | ); 26 | 27 | $this->assertEquals( 28 | 0.75, 29 | $sim->similarity($A,$B), 30 | "similarity({'my','name','is','john'},{'my','name','is','joe'}) = 0.75" 31 | ); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/HammingDistanceTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 17 | max(strlen($A),strlen($B)), 18 | $dist->dist($A,$B), 19 | "Two completely dissimilar strings should have distance equal to max(strlen(\$A),strlen(\$B))" 20 | ); 21 | 22 | $this->assertEquals( 23 | 2, 24 | $dist->dist($C,$D), 25 | "10101 ~ 11111 have a hamming distance = 2" 26 | ); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/JaccardIndexTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 16 | 1, 17 | $sim->similarity($A,$A), 18 | "The similarity of a set with itsself is 1" 19 | ); 20 | 21 | $this->assertEquals( 22 | 0, 23 | $sim->similarity($A,$e), 24 | "The similarity of any set with the empty set is 0" 25 | ); 26 | 27 | $this->assertEquals( 28 | 0.5, 29 | $sim->similarity($A,$B), 30 | "J({1,2,3},{1,2,3,4,5,6}) = 0.5" 31 | ); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/OverlapCoefficientTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 16 | 1, 17 | $sim->similarity($A,$A), 18 | "The similarity of a set with itsself is 1" 19 | ); 20 | 21 | $this->assertEquals( 22 | 0, 23 | $sim->similarity($A,$e), 24 | "The similarity of any set with the empty set is 0" 25 | ); 26 | 27 | $this->assertEquals( 28 | 0.5, 29 | 
$sim->similarity($A,$B), 30 | "similarity({'my','name','is','john'},{'your','name','is','joe'}) = 0.5" 31 | ); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/SimhashTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 17 | 1, 18 | $sim->similarity($A,$A), 19 | "Two identical sets should have the same hash therefore a similarity of 1" 20 | ); 21 | 22 | $this->assertGreaterThan( 23 | $sim->similarity($A,$B), 24 | $sim->similarity($b,$B), 25 | "The more elements in common the more similar the two sets should be" 26 | ); 27 | } 28 | 29 | public function testWeightedSets() 30 | { 31 | $sim = new Simhash(64); 32 | 33 | $A = array("a","a","a","b","b",); 34 | $B = array("a"=>3,"b"=>2); 35 | 36 | $this->assertEquals( 37 | 1, 38 | $sim->similarity($A,$B), 39 | "The two sets are identical given that one is the weighted version of the other" 40 | ); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tests/NlpTools/Similarity/TverskyIndexTest.php: -------------------------------------------------------------------------------- 1 | similarity($A, $B); 12 | } 13 | 14 | public function testTverskyIndex() 15 | { 16 | $sim = new TverskyIndex(); 17 | 18 | $A = array("my","name","is","john"); 19 | $B = array("my","name","is","joe"); 20 | $C = array(1,2,3); 21 | $D = array(1,2,3,4,5,6); 22 | $e = array(); 23 | 24 | $this->assertEquals( 25 | 1, 26 | $this->sim($A,$A, 0.5, 1), 27 | "The similarity of a set with itsself is 1" 28 | ); 29 | 30 | $this->assertEquals( 31 | 0, 32 | $this->sim($A,$e, 0.5, 2), 33 | "The similarity of any set with the empty set is 0" 34 | ); 35 | 36 | $this->assertEquals( 37 | 0.75, 38 | $this->sim($A,$B, 0.5, 1), 39 | "similarity({'my','name','is','john'},{'my','name','is','joe'}) = 0.75" 40 | ); 41 | 42 | $this->assertEquals( 43 | 0.5, 44 | $this->sim($C,$D, 0.5, 2), 45 | 
"similarity({1,2,3},{1,2,3,4,5,6}) = 0.5" 46 | ); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/NlpTools/Stemmers/GreekStemmerTest.php: -------------------------------------------------------------------------------- 1 | setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY); 19 | $stems->setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY); 20 | $stems->rewind(); 21 | 22 | $stemmer = new GreekStemmer(); 23 | $this->checkStemmer($stemmer, $words, $stems); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tests/NlpTools/Stemmers/LancasterStemmerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('maxim', $stemmer->stem('maximum')); 15 | $this->assertEquals('presum', $stemmer->stem('presumably')); 16 | $this->assertEquals('multiply', $stemmer->stem('multiply')); 17 | $this->assertEquals('provid', $stemmer->stem('provision')); 18 | $this->assertEquals('ow', $stemmer->stem('owed')); 19 | $this->assertEquals('ear', $stemmer->stem('ear')); 20 | $this->assertEquals('say', $stemmer->stem('saying')); 21 | $this->assertEquals('cry', $stemmer->stem('crying')); 22 | $this->assertEquals('string', $stemmer->stem('string')); 23 | $this->assertEquals('meant', $stemmer->stem('meant')); 24 | $this->assertEquals('cem', $stemmer->stem('cement')); 25 | } 26 | 27 | /** 28 | * Added to cover issue #34 29 | */ 30 | public function testEmptyStringForWord() 31 | { 32 | $stemmer = new LancasterStemmer(); 33 | $this->assertEquals("", $stemmer->stem("")); 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /tests/NlpTools/Stemmers/PorterStemmerTest.php: -------------------------------------------------------------------------------- 1 | setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY); 24 | 
$stems->setFlags(\SplFileObject::DROP_NEW_LINE | \SplFileObject::SKIP_EMPTY); 25 | $stems->rewind(); 26 | 27 | $stemmer = new PorterStemmer(); 28 | $this->checkStemmer($stemmer, $words, $stems); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tests/NlpTools/Stemmers/StemmerTestBase.php: -------------------------------------------------------------------------------- 1 | current(); 16 | $this->assertEquals( 17 | $stemmer->stem($word), 18 | $stem, 19 | "The stem for '$word' should be '$stem' not '{$stemmer->stem($word)}'" 20 | ); 21 | $stems->next(); 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/NlpTools/Stemmers/TransformationTest.php: -------------------------------------------------------------------------------- 1 | stemAll($tokens); 24 | $doc = new TokensDocument($tokens); 25 | 26 | $this->assertNotEquals( 27 | $stemmed, 28 | $doc->getDocumentData() 29 | ); 30 | 31 | $doc->applyTransformation($stemmer); 32 | $this->assertEquals( 33 | $stemmed, 34 | $doc->getDocumentData() 35 | ); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tests/NlpTools/Tokenizers/ClassifierBasedTokenizerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 20 | array( 21 | "We are what we repeatedly do.", 22 | "Excellence, then, is not an act, but a habit." 23 | ), 24 | $tok->tokenize($text) 25 | ); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tests/NlpTools/Tokenizers/PennTreeBankTokenizerTest.php: -------------------------------------------------------------------------------- 1 | tokenize("Good muffins cost $3.88\nin New York. 
Please buy me\ntwo of them.\nThanks."); 16 | $this->assertCount(16, $tokens); 17 | } 18 | 19 | public function testTokenizer2() 20 | { 21 | $tokenizer = new PennTreeBankTokenizer(); 22 | $this->assertCount(7, $tokenizer->tokenize("They'll save and invest more.")); 23 | } 24 | 25 | public function testTokenizer3() 26 | { 27 | $tokenizer = new PennTreeBankTokenizer(); 28 | $this->assertCount(4, $tokenizer->tokenize("I'm some text")); 29 | } 30 | 31 | public function testAgainstOriginalSedImplementation() 32 | { 33 | $tokenizer = new PennTreeBankTokenizer(); 34 | $tokenized = new \SplFileObject(TEST_DATA_DIR."/Tokenizers/PennTreeBankTokenizerTest/tokenized"); 35 | $tokenized->setFlags(\SplFileObject::DROP_NEW_LINE); 36 | $sentences = new \SplFileObject(TEST_DATA_DIR."/Tokenizers/PennTreeBankTokenizerTest/test.txt"); 37 | $sentences->setFlags(\SplFileObject::DROP_NEW_LINE); 38 | 39 | $tokenized->rewind(); 40 | foreach ($sentences as $sentence) { 41 | if ($sentence) // skip empty lines 42 | { 43 | $this->assertEquals( 44 | $tokenized->current(), 45 | implode(" ",$tokenizer->tokenize($sentence)), 46 | "Sentence: '$sentence' was not tokenized correctly" 47 | ); 48 | } 49 | $tokenized->next(); 50 | } 51 | 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /tests/NlpTools/Tokenizers/RegexTokenizerTest.php: -------------------------------------------------------------------------------- 1 | tokenize("0 1 2 3 4 5 6 7 8 9"); 18 | $this->assertCount(10, $tokens); 19 | $this->assertEquals("0123456789",implode("",$tokens)); 20 | 21 | // check split2 22 | $tok = new RegexTokenizer(array( 23 | "/\n+/" 24 | )); 25 | 26 | $tokens = $tok->tokenize("0 1 2 3 4\n5 6 7 8 9"); 27 | $this->assertCount(2, $tokens); 28 | $this->assertEquals("0 1 2 3 45 6 7 8 9",implode("",$tokens)); 29 | 30 | $tokens = $tok->tokenize("0 1 2 3 4\n\n5 6 7 8 9"); 31 | $this->assertCount(2, $tokens); 32 | $this->assertEquals("0 1 2 3 45 6 7 8 
9",implode("",$tokens)); 33 | 34 | } 35 | 36 | /** 37 | * Test a pattern that captures instead of splits 38 | */ 39 | public function testMatches() 40 | { 41 | // check keep matches 42 | $tok = new RegexTokenizer(array( 43 | array("/(\s+)?(\w+)(\s+)?/",2) 44 | )); 45 | 46 | $tokens = $tok->tokenize("0 1 2 3 4 5 6 7 8 9"); 47 | $this->assertCount(10, $tokens); 48 | $this->assertEquals("0123456789",implode("",$tokens)); 49 | } 50 | 51 | /** 52 | * Test a pattern that firsts replaces all digits with themselves separated 53 | * by a space and then tokenizes on whitespace. 54 | */ 55 | public function testReplace() 56 | { 57 | // check keep matches 58 | $tok = new RegexTokenizer(array( 59 | array("/\d/",'$0 '), 60 | WhitespaceTokenizer::PATTERN 61 | )); 62 | 63 | $tokens = $tok->tokenize("0123456789"); 64 | $this->assertCount(10, $tokens); 65 | $this->assertEquals("0123456789",implode("",$tokens)); 66 | } 67 | 68 | /** 69 | * Test a simple pattern meant to split the full stop from the last 70 | * word of a sentence. 71 | */ 72 | public function testSplitWithManyPatterns() 73 | { 74 | $tok = new RegexTokenizer(array( 75 | WhitespaceTokenizer::PATTERN, // split on whitespace 76 | array("/([^\.])\.$/",'$1 .'), // replace . with . 77 | "/ /" // split on 78 | )); 79 | 80 | // example text stolen from NLTK :-) 81 | $str = "Good muffins cost $3.88\nin New York. 
Please buy me\ntwo of them.\n\nThanks."; 82 | 83 | $tokens = $tok->tokenize($str); 84 | $this->assertCount(17, $tokens); 85 | $this->assertEquals($tokens[3], "$3.88"); 86 | $this->assertEquals($tokens[7], "."); 87 | $this->assertEquals($tokens[14], "."); 88 | $this->assertEquals($tokens[16], "."); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /tests/NlpTools/Tokenizers/WhitespaceAndPuntuationTokenizerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 19 | $tokens, 20 | $tok->tokenize($s) 21 | ); 22 | } 23 | 24 | public function testTokenizerOnUtf8() 25 | { 26 | $tok = new WhitespaceAndPunctuationTokenizer(); 27 | 28 | $s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων"; 29 | $tokens = array('Ελληνικό','κείμενο','για','παράδειγμα','utf','-','8','χαρακτήρων'); 30 | // test tokenization of multibyte non-whitespace characters 31 | $this->assertEquals( 32 | $tokens, 33 | $tok->tokenize($s) 34 | ); 35 | 36 | $s = "Here exists non-breaking space   "; 37 | $tokens = array('Here','exists','non','-','breaking','space'); 38 | // test tokenization of multibyte whitespace 39 | $this->assertEquals( 40 | $tokens, 41 | $tok->tokenize($s) 42 | ); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tests/NlpTools/Tokenizers/WhitespaceTokenizerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals( 19 | $tokens, 20 | $tok->tokenize($s) 21 | ); 22 | } 23 | 24 | public function testTokenizerOnUtf8() 25 | { 26 | $tok = new WhitespaceTokenizer(); 27 | 28 | $s = "Ελληνικό κείμενο για παράδειγμα utf-8 χαρακτήρων"; 29 | $tokens = array('Ελληνικό','κείμενο','για','παράδειγμα','utf-8','χαρακτήρων'); 30 | // test tokenization of multibyte non-whitespace characters 31 | $this->assertEquals( 32 | $tokens, 33 | $tok->tokenize($s) 34 | ); 35 | 36 | $s = "Here exists non-breaking 
space   "; 37 | $tokens = array('Here','exists','non-breaking','space'); 38 | // test tokenization of multibyte whitespace 39 | $this->assertEquals( 40 | $tokens, 41 | $tok->tokenize($s) 42 | ); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tests/NlpTools/Utils/ClassifierBasedTransformationTest.php: -------------------------------------------------------------------------------- 1 | getDocumentData() % count($classes)]; 13 | } 14 | 15 | public function testEvenAndOdd() 16 | { 17 | $stubEven = $this->getMock("NlpTools\\Utils\\TransformationInterface"); 18 | $stubEven->expects($this->any()) 19 | ->method('transform') 20 | ->will($this->returnValue('even')); 21 | $stubOdd = $this->getMock("NlpTools\\Utils\\TransformationInterface"); 22 | $stubOdd->expects($this->any()) 23 | ->method('transform') 24 | ->will($this->returnValue('odd')); 25 | 26 | $transform = new ClassifierBasedTransformation($this); 27 | $transform->register("even", $stubEven); 28 | $transform->register("odd", $stubOdd); 29 | 30 | $this->assertEquals( 31 | "odd", 32 | $transform->transform(3) 33 | ); 34 | $this->assertEquals( 35 | "even", 36 | $transform->transform(4) 37 | ); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /tests/NlpTools/Utils/EnglishVowelsTest.php: -------------------------------------------------------------------------------- 1 | assertTrue($vowelChecker->isVowel("man", 1)); 14 | } 15 | 16 | public function testYIsVowel() 17 | { 18 | $vowelChecker = VowelsAbstractFactory::factory("English"); 19 | $this->assertTrue($vowelChecker->isVowel("try", 2)); 20 | } 21 | } 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/NlpTools/Utils/IdentityTransformer.php: -------------------------------------------------------------------------------- 1 | assertEquals( 13 | explode(" ","ο μορφωμενοσ διαφερει απο τον αμορφωτο οσο ο ζωντανοσ απο 
τον νεκρο"), 14 | $greek->normalizeAll( 15 | explode(" ","Ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό") 16 | ) 17 | ); 18 | 19 | $this->assertEquals( 20 | explode(" ","ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό"), 21 | $english->normalizeAll( 22 | explode(" ","Ο μορφωμένος διαφέρει από τον αμόρφωτο όσο ο ζωντανός από τον νεκρό") 23 | ) 24 | ); 25 | 26 | $this->assertEquals( 27 | explode(" ","when a father gives to his son both laugh when a son gives to his father both cry" ), 28 | $english->normalizeAll( 29 | explode(" ","When a father gives to his son both laugh when a son gives to his father both cry" ) 30 | ) 31 | ); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/NlpTools/Utils/StopWordsTest.php: -------------------------------------------------------------------------------- 1 | applyTransformation($stopwords); 21 | $this->assertEquals( 22 | array( 23 | "if", "you", "tell", "truth", "you", "do", "not", "have", "remember", "anything" 24 | ), 25 | $doc->getDocumentData() 26 | ); 27 | } 28 | 29 | public function testStopwordsWithTransformation() 30 | { 31 | $stopwords = new StopWords( 32 | array( 33 | "to", 34 | "the" 35 | ), 36 | Normalizer::factory("English") 37 | ); 38 | 39 | $doc = new TokensDocument(explode(" ", "If you Tell The truth You do not have To remember Anything")); 40 | $doc->applyTransformation($stopwords); 41 | $this->assertEquals( 42 | array( 43 | "If", "you", "Tell", "truth", "You", "do", "not", "have", "remember", "Anything" 44 | ), 45 | $doc->getDocumentData() 46 | ); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/README.markdown: -------------------------------------------------------------------------------- 1 | Testing information 2 | =================== 3 | 4 | This readme contains a bit of information regarding writing tests for NlpTools and executing them. 
5 | 6 | Writing Tests 7 | ------------- 8 | 9 | * Test classes should be in the same namespace as the class that is being tested 10 | * Any data needed for the test or produced by the test should be in the 'data' directory 11 | under the same folder as the namespace. Only data needed (not produced) are committed to 12 | the repository. 13 | * Tests should be marked with the groups **Slow** and **VerySlow** if they require more than 14 | 10 seconds and 1 minute respectively. If a test is marked as VerySlow it should also be marked 15 | as Slow. 16 | * Both functional and unit tests are welcome. 17 | 18 | Executing Tests 19 | --------------- 20 | 21 | Currently only one testsuite is defined (all tests). Because some tests take a long time to 22 | run you can try running `phpunit --exclude-group Slow` or `phpunit --exclude-group VerySlow` 23 | to avoid some slow tests. 24 | 25 | PHPUnit should be run from inside the tests folder or the phpunit.xml file should be provided 26 | as config. 27 | -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | ./NlpTools/ 4 | 5 | 6 | -------------------------------------------------------------------------------- /tests/sentiment_maxent.php: -------------------------------------------------------------------------------- 1 | >/tmp/imdb.list 15 | * for f in `ls neg`; do echo `pwd`/neg/$f; done >>/tmp/imdb.list 16 | * shuf /tmp/imdb.list >/tmp/imdb-shuffled.list 17 | * head -n 1800 /tmp/imdb-shuffled.list > train 18 | * tail -n 200 /tmp/imdb-shuffled.list > test 19 | * 20 | * Then call the script like this: 21 | * php -d memory_limit=300M sentiment_maxent.php train test 22 | * 23 | */ 24 | 25 | // include the autoloader 26 | include '../autoloader.php'; 27 | 28 | use NlpTools\Tokenizers\WhitespaceTokenizer; 29 | use NlpTools\FeatureFactories\FunctionFeatures; 30 | use NlpTools\Documents\Document;
31 | use NlpTools\Documents\TokensDocument; 32 | use NlpTools\Documents\TrainingSet; 33 | use NlpTools\Optimizers\ExternalMaxentOptimizer; 34 | use NlpTools\Models\Maxent; 35 | use NlpTools\Classifiers\FeatureBasedLinearClassifier; 36 | 37 | // create needed reusable objects, a tokenizer and a feature factory 38 | $tok = new WhitespaceTokenizer(); 39 | $ff = new FunctionFeatures(); 40 | $ff->add(function ($class, DocumentInterface $d) { 41 | $r = array(); 42 | foreach ($d->getDocumentData() as $tok) 43 | $r[] = $class.$tok; 44 | 45 | return $r; 46 | }); 47 | 48 | // create 49 | // 1. an empty training set 50 | // 2. an optimizer 51 | // 3. an empty model 52 | $tset = new TrainingSet(); 53 | $OPTIMIZER_PATH = isset($_ENV["GD_OPTIMIZER"]) ? $_ENV["GD_OPTIMIZER"] : 'gradient-descent'; 54 | $optimizer = new ExternalMaxentOptimizer($OPTIMIZER_PATH); 55 | $model = new Maxent(array()); 56 | 57 | // argv[1] and argv[2] are paths to files that contain the paths 58 | // to the actual documents. 59 | $train = new SplFileObject($argv[1]); 60 | $test = new SplFileObject($argv[2]); 61 | 62 | // fill in the training set 63 | foreach ($train as $f) { 64 | $f = substr($f,0,-1); 65 | if (strlen($f)==0) 66 | continue; 67 | $class = "neg"; 68 | if (strpos($f,"pos")!==false) { 69 | $class = "pos"; 70 | } 71 | $tset->addDocument( 72 | $class, 73 | new TokensDocument($tok->tokenize(file_get_contents($f))) 74 | ); 75 | } 76 | 77 | // train the model 78 | $model->train($ff,$tset,$optimizer); 79 | 80 | // to use the model we need a classifier 81 | $cls = new FeatureBasedLinearClassifier($ff,$model); 82 | 83 | // evaluate the model 84 | $correct = 0; 85 | $total = 0; 86 | foreach ($test as $f) { 87 | $f = substr($f,0,-1); 88 | if (strlen($f)==0) 89 | continue; 90 | $class = "neg"; 91 | if (strpos($f,"pos")!==false) { 92 | $class = "pos"; 93 | } 94 | $doc = new TokensDocument($tok->tokenize(file_get_contents($f))); 95 | $predicted = $cls->classify(array("pos","neg"),$doc); 96 | if ($predicted 
== $class) { 97 | $correct++; 98 | } 99 | $total++; 100 | } 101 | 102 | printf("Acc: %.2f%%\n",(100*$correct/$total)); 103 | --------------------------------------------------------------------------------