├── .editorconfig ├── LICENSE ├── README.md ├── bin ├── code-coverage.sh └── libsvm │ ├── svm-predict │ ├── svm-predict-osx │ ├── svm-predict.exe │ ├── svm-scale │ ├── svm-scale-osx │ ├── svm-scale.exe │ ├── svm-train │ ├── svm-train-osx │ └── svm-train.exe ├── composer.json ├── composer.lock ├── data ├── glass.csv ├── iris.csv └── wine.csv ├── src ├── Association │ ├── Apriori.php │ └── Associator.php ├── Classification │ ├── Classifier.php │ ├── DecisionTree.php │ ├── DecisionTree │ │ └── DecisionTreeLeaf.php │ ├── Ensemble │ │ ├── AdaBoost.php │ │ ├── Bagging.php │ │ └── RandomForest.php │ ├── KNearestNeighbors.php │ ├── Linear │ │ ├── Adaline.php │ │ ├── DecisionStump.php │ │ ├── LogisticRegression.php │ │ └── Perceptron.php │ ├── MLPClassifier.php │ ├── NaiveBayes.php │ ├── SVC.php │ └── WeightedClassifier.php ├── Clustering │ ├── Clusterer.php │ ├── DBSCAN.php │ ├── FuzzyCMeans.php │ ├── KMeans.php │ └── KMeans │ │ ├── Cluster.php │ │ ├── Point.php │ │ └── Space.php ├── CrossValidation │ ├── RandomSplit.php │ ├── Split.php │ └── StratifiedRandomSplit.php ├── Dataset │ ├── ArrayDataset.php │ ├── CsvDataset.php │ ├── Dataset.php │ ├── Demo │ │ ├── GlassDataset.php │ │ ├── IrisDataset.php │ │ └── WineDataset.php │ ├── FilesDataset.php │ ├── MnistDataset.php │ └── SvmDataset.php ├── DimensionReduction │ ├── EigenTransformerBase.php │ ├── KernelPCA.php │ ├── LDA.php │ └── PCA.php ├── Estimator.php ├── Exception │ ├── DatasetException.php │ ├── FileException.php │ ├── InvalidArgumentException.php │ ├── InvalidOperationException.php │ ├── LibsvmCommandException.php │ ├── MatrixException.php │ ├── NormalizerException.php │ └── SerializeException.php ├── FeatureExtraction │ ├── StopWords.php │ ├── StopWords │ │ ├── English.php │ │ ├── French.php │ │ ├── German.php │ │ ├── Polish.php │ │ └── Russian.php │ ├── TfIdfTransformer.php │ └── TokenCountVectorizer.php ├── FeatureSelection │ ├── ScoringFunction.php │ ├── ScoringFunction │ │ ├── ANOVAFValue.php │ │ └── 
UnivariateLinearRegression.php │ ├── SelectKBest.php │ └── VarianceThreshold.php ├── FeatureUnion.php ├── Helper │ ├── OneVsRest.php │ ├── Optimizer │ │ ├── ConjugateGradient.php │ │ ├── GD.php │ │ ├── Optimizer.php │ │ └── StochasticGD.php │ ├── Predictable.php │ └── Trainable.php ├── IncrementalEstimator.php ├── Math │ ├── Comparison.php │ ├── Distance.php │ ├── Distance │ │ ├── Chebyshev.php │ │ ├── Distance.php │ │ ├── Euclidean.php │ │ ├── Manhattan.php │ │ └── Minkowski.php │ ├── Kernel.php │ ├── Kernel │ │ └── RBF.php │ ├── LinearAlgebra │ │ ├── EigenvalueDecomposition.php │ │ └── LUDecomposition.php │ ├── Matrix.php │ ├── Product.php │ ├── Set.php │ └── Statistic │ │ ├── ANOVA.php │ │ ├── Correlation.php │ │ ├── Covariance.php │ │ ├── Gaussian.php │ │ ├── Mean.php │ │ ├── StandardDeviation.php │ │ └── Variance.php ├── Metric │ ├── Accuracy.php │ ├── ClassificationReport.php │ ├── ConfusionMatrix.php │ └── Regression.php ├── ModelManager.php ├── NeuralNetwork │ ├── ActivationFunction.php │ ├── ActivationFunction │ │ ├── BinaryStep.php │ │ ├── Gaussian.php │ │ ├── HyperbolicTangent.php │ │ ├── PReLU.php │ │ ├── Sigmoid.php │ │ └── ThresholdedReLU.php │ ├── Layer.php │ ├── Network.php │ ├── Network │ │ ├── LayeredNetwork.php │ │ └── MultilayerPerceptron.php │ ├── Node.php │ ├── Node │ │ ├── Bias.php │ │ ├── Input.php │ │ ├── Neuron.php │ │ └── Neuron │ │ │ └── Synapse.php │ └── Training │ │ ├── Backpropagation.php │ │ └── Backpropagation │ │ └── Sigma.php ├── Pipeline.php ├── Preprocessing │ ├── ColumnFilter.php │ ├── Imputer.php │ ├── Imputer │ │ ├── Strategy.php │ │ └── Strategy │ │ │ ├── MeanStrategy.php │ │ │ ├── MedianStrategy.php │ │ │ └── MostFrequentStrategy.php │ ├── LabelEncoder.php │ ├── LambdaTransformer.php │ ├── Normalizer.php │ ├── NumberConverter.php │ ├── OneHotEncoder.php │ └── Preprocessor.php ├── Regression │ ├── DecisionTreeRegressor.php │ ├── LeastSquares.php │ ├── Regression.php │ └── SVR.php ├── SupportVectorMachine │ ├── 
DataTransformer.php │ ├── Kernel.php │ ├── SupportVectorMachine.php │ └── Type.php ├── Tokenization │ ├── NGramTokenizer.php │ ├── NGramWordTokenizer.php │ ├── Tokenizer.php │ ├── WhitespaceTokenizer.php │ └── WordTokenizer.php ├── Transformer.php └── Tree │ ├── CART.php │ ├── Node.php │ └── Node │ ├── AverageNode.php │ ├── BinaryNode.php │ ├── DecisionNode.php │ ├── LeafNode.php │ └── PurityNode.php └── var └── .gitkeep /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | charset = utf-8 6 | max_line_length = 80 7 | indent_style = space 8 | indent_size = 4 9 | insert_final_newline = true 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016-2019 Arkadiusz Kondas 4 | Copyright (c) 2018 Andrew DalPino 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /bin/code-coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Run PHPUnit with code coverage" 3 | bin/phpunit --coverage-html .coverage 4 | google-chrome .coverage/index.html 5 | -------------------------------------------------------------------------------- /bin/libsvm/svm-predict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorgecasas/php-ml/de9b3a6d1c16e256e47d16bc0a384c01624d77cb/bin/libsvm/svm-predict -------------------------------------------------------------------------------- /bin/libsvm/svm-predict-osx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorgecasas/php-ml/de9b3a6d1c16e256e47d16bc0a384c01624d77cb/bin/libsvm/svm-predict-osx -------------------------------------------------------------------------------- /bin/libsvm/svm-predict.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorgecasas/php-ml/de9b3a6d1c16e256e47d16bc0a384c01624d77cb/bin/libsvm/svm-predict.exe -------------------------------------------------------------------------------- /bin/libsvm/svm-scale: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorgecasas/php-ml/de9b3a6d1c16e256e47d16bc0a384c01624d77cb/bin/libsvm/svm-scale -------------------------------------------------------------------------------- /bin/libsvm/svm-scale-osx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorgecasas/php-ml/de9b3a6d1c16e256e47d16bc0a384c01624d77cb/bin/libsvm/svm-scale-osx -------------------------------------------------------------------------------- /bin/libsvm/svm-scale.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorgecasas/php-ml/de9b3a6d1c16e256e47d16bc0a384c01624d77cb/bin/libsvm/svm-scale.exe -------------------------------------------------------------------------------- /bin/libsvm/svm-train: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorgecasas/php-ml/de9b3a6d1c16e256e47d16bc0a384c01624d77cb/bin/libsvm/svm-train -------------------------------------------------------------------------------- /bin/libsvm/svm-train-osx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorgecasas/php-ml/de9b3a6d1c16e256e47d16bc0a384c01624d77cb/bin/libsvm/svm-train-osx -------------------------------------------------------------------------------- /bin/libsvm/svm-train.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jorgecasas/php-ml/de9b3a6d1c16e256e47d16bc0a384c01624d77cb/bin/libsvm/svm-train.exe -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "php-ai/php-ml", 3 | "type": "library", 4 | "description": "PHP-ML - Machine Learning library for PHP", 5 | "keywords": [ 6 | "machine learning", 7 | "pattern recognition", 8 | "neural network", 9 | "computational learning theory", 10 | "artificial intelligence", 11 | "data science", 12 | "feature extraction" 13 | ], 14 | "homepage": "https://github.com/php-ai/php-ml", 15 | 
"license": "MIT", 16 | "authors": [ 17 | { 18 | "name": "Arkadiusz Kondas", 19 | "email": "arkadiusz.kondas@gmail.com" 20 | } 21 | ], 22 | "require": { 23 | "php": ">=7.2" 24 | }, 25 | "require-dev": { 26 | "phpbench/phpbench": "^0.16.0", 27 | "phpstan/phpstan-phpunit": "^0.12", 28 | "phpstan/phpstan": "^0.12", 29 | "phpstan/phpstan-strict-rules": "^0.12", 30 | "phpunit/phpunit": "^8.0", 31 | "symplify/easy-coding-standard": "^6.0" 32 | }, 33 | "config": { 34 | "preferred-install": "dist", 35 | "sort-packages": true 36 | }, 37 | "autoload": { 38 | "psr-4": { 39 | "Phpml\\": "src/" 40 | } 41 | }, 42 | "autoload-dev": { 43 | "psr-4": { 44 | "Phpml\\Tests\\": "tests/" 45 | } 46 | }, 47 | "scripts": { 48 | "check-cs": "vendor/bin/ecs check src tests bin", 49 | "fix-cs": "vendor/bin/ecs check src tests bin --fix", 50 | "phpstan": "vendor/bin/phpstan.phar analyse src tests bin --level max --configuration phpstan.neon" 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /data/iris.csv: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,species 2 | 5.1,3.5,1.4,0.2,setosa 3 | 4.9,3,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,3.1,1.5,0.2,setosa 6 | 5,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5,3.4,1.5,0.2,setosa 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3,1.4,0.1,setosa 15 | 4.3,3,1.1,0.1,setosa 16 | 5.8,4,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | 5.1,3.5,1.4,0.3,setosa 20 | 5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | 4.8,3.4,1.9,0.2,setosa 27 | 5,3,1.6,0.2,setosa 28 | 5,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 
| 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,4.1,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.1,setosa 37 | 5,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.1,1.5,0.1,setosa 40 | 4.4,3,1.3,0.2,setosa 41 | 5.1,3.4,1.5,0.2,setosa 42 | 5,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5,3.3,1.4,0.2,setosa 52 | 7,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 4.9,2.4,3.3,1,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5,2,3.5,1,versicolor 63 | 5.9,3,4.2,1.5,versicolor 64 | 6,2.2,4,1,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3,5,1.7,versicolor 80 | 6,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6,2.7,5.1,1.6,versicolor 86 | 5.4,3,4.5,1.5,versicolor 87 | 6,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3,4.1,1.3,versicolor 91 | 5.5,2.5,4,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3,4.6,1.4,versicolor 94 | 5.8,2.6,4,1.2,versicolor 95 | 5,2.3,3.3,1,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 
6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3,5.8,2.2,virginica 107 | 7.6,3,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3,5.5,2.1,virginica 115 | 5.7,2.5,5,2,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 6.4,3.2,5.3,2.3,virginica 118 | 6.5,3,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2,virginica 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6,2.2,5,1.5,virginica 122 | 6.9,3.2,5.7,2.3,virginica 123 | 5.6,2.8,4.9,2,virginica 124 | 7.7,2.8,6.7,2,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6,1.8,virginica 128 | 6.2,2.8,4.8,1.8,virginica 129 | 6.1,3,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 6.3,2.8,5.1,1.5,virginica 136 | 6.1,2.6,5.6,1.4,virginica 137 | 7.7,3,6.1,2.3,virginica 138 | 6.3,3.4,5.6,2.4,virginica 139 | 6.4,3.1,5.5,1.8,virginica 140 | 6,3,4.8,1.8,virginica 141 | 6.9,3.1,5.4,2.1,virginica 142 | 6.7,3.1,5.6,2.4,virginica 143 | 6.9,3.1,5.1,2.3,virginica 144 | 5.8,2.7,5.1,1.9,virginica 145 | 6.8,3.2,5.9,2.3,virginica 146 | 6.7,3.3,5.7,2.5,virginica 147 | 6.7,3,5.2,2.3,virginica 148 | 6.3,2.5,5,1.9,virginica 149 | 6.5,3,5.2,2,virginica 150 | 6.2,3.4,5.4,2.3,virginica 151 | 5.9,3,5.1,1.8,virginica -------------------------------------------------------------------------------- /src/Association/Associator.php: -------------------------------------------------------------------------------- 1 | getHTML(); 80 | } 81 | 82 | public function evaluate(array $record): bool 83 | { 84 | $recordField = $record[$this->columnIndex]; 85 | 86 | if 
($this->isContinuous) {
            // Continuous columns are compared through the configured operator.
            return Comparison::compare((string) $recordField, $this->numericValue, $this->operator);
        }

        // Nominal columns: loose equality, so e.g. '1' matches 1.
        return $recordField == $this->value;
    }

    /**
     * Returns Mean Decrease Impurity (MDI) in the node.
     * For terminal nodes, this value is equal to 0
     *
     * @param int $parentRecordCount Number of records the result is scaled by
     *
     * @return float Gini index of this node minus the record-weighted Gini index of
     *               its existing children, scaled by (node records / $parentRecordCount)
     */
    public function getNodeImpurityDecrease(int $parentRecordCount): float
    {
        if ($this->isTerminal) {
            return 0.0;
        }

        $nodeSampleCount = (float) count($this->records);
        $iT = $this->giniIndex;

        // Subtract each child's impurity, weighted by its share of this node's records.
        if ($this->leftLeaf !== null) {
            $pL = count($this->leftLeaf->records) / $nodeSampleCount;
            $iT -= $pL * $this->leftLeaf->giniIndex;
        }

        if ($this->rightLeaf !== null) {
            $pR = count($this->rightLeaf->records) / $nodeSampleCount;
            $iT -= $pR * $this->rightLeaf->giniIndex;
        }

        return $iT * $nodeSampleCount / $parentRecordCount;
    }

    /**
     * Returns HTML representation of the node including children nodes
     *
     * @param array|null $columnNames Optional display names indexed by column index;
     *                                when omitted, columns are shown as "col_<index>"
     */
    public function getHTML(?array $columnNames = null): string
    {
        if ($this->isTerminal) {
            // Interpolate the classValue property. "{$this}->classValue" would
            // string-cast the node itself and append the literal "->classValue".
            $value = "{$this->classValue}";
        } else {
            $value = $this->value;
            if ($columnNames !== null) {
                $col = $columnNames[$this->columnIndex];
            } else {
                $col = "col_$this->columnIndex";
            }

            // Prefix a bare value with "=" unless it already starts with a comparison operator.
            if ((bool) preg_match('/^[<>=]{1,2}/', (string) $value) === false) {
                $value = "={$value}";
            }

            $value = "{$col} {$value}
Gini: ".number_format($this->giniIndex, 2); 139 | } 140 | 141 | $str = ""; 142 | 143 | if ($this->leftLeaf !== null || $this->rightLeaf !== null) { 144 | $str .= ''; 145 | if ($this->leftLeaf !== null) { 146 | $str .= ''; 147 | } else { 148 | $str .= ''; 149 | } 150 | 151 | $str .= ''; 152 | if ($this->rightLeaf !== null) { 153 | $str .= ''; 154 | } else { 155 | $str .= ''; 156 | } 157 | 158 | $str .= ''; 159 | } 160 | 161 | $str .= '
{$value}
| Yes
'.$this->leftLeaf->getHTML($columnNames).'
 No |
'.$this->rightLeaf->getHTML($columnNames).'
'; 162 | 163 | return $str; 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/Classification/Ensemble/Bagging.php: -------------------------------------------------------------------------------- 1 | 20]; 43 | 44 | /** 45 | * @var array 46 | */ 47 | protected $classifiers = []; 48 | 49 | /** 50 | * @var float 51 | */ 52 | protected $subsetRatio = 0.7; 53 | 54 | /** 55 | * Creates an ensemble classifier with given number of base classifiers 56 | * Default number of base classifiers is 50. 57 | * The more number of base classifiers, the better performance but at the cost of procesing time 58 | */ 59 | public function __construct(int $numClassifier = 50) 60 | { 61 | $this->numClassifier = $numClassifier; 62 | } 63 | 64 | /** 65 | * This method determines the ratio of samples used to create the 'bootstrap' subset, 66 | * e.g., random samples drawn from the original dataset with replacement (allow repeats), 67 | * to train each base classifier. 68 | * 69 | * @return $this 70 | * 71 | * @throws InvalidArgumentException 72 | */ 73 | public function setSubsetRatio(float $ratio) 74 | { 75 | if ($ratio < 0.1 || $ratio > 1.0) { 76 | throw new InvalidArgumentException('Subset ratio should be between 0.1 and 1.0'); 77 | } 78 | 79 | $this->subsetRatio = $ratio; 80 | 81 | return $this; 82 | } 83 | 84 | /** 85 | * This method is used to set the base classifier. Default value is 86 | * DecisionTree::class, but any class that implements the Classifier 87 | * can be used.
* While giving the parameters of the classifier, the values should be
     * given in the order they are in the constructor of the classifier and parameter
     * names are neglected.
     *
     * @param string $classifier        Class name of the base classifier
     * @param array  $classifierOptions Positional constructor arguments for the base classifier
     *
     * @return $this
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        $this->classifier = $classifier;
        $this->classifierOptions = $classifierOptions;

        return $this;
    }

    /**
     * Appends the given samples/targets to the ones already stored, then creates
     * a fresh set of base classifiers and trains each of them on its own
     * bootstrap subset of the accumulated data.
     */
    public function train(array $samples, array $targets): void
    {
        $this->samples = array_merge($this->samples, $samples);
        $this->targets = array_merge($this->targets, $targets);
        $this->featureCount = count($samples[0]);
        $this->numSamples = count($this->samples);

        // Init classifiers and train them with bootstrap samples
        $this->classifiers = $this->initClassifiers();
        $index = 0;
        foreach ($this->classifiers as $classifier) {
            [$subsetSamples, $subsetTargets] = $this->getRandomSubset($index);
            $classifier->train($subsetSamples, $subsetTargets);
            ++$index;
        }
    }

    /**
     * Draws a bootstrap subset (sampling with replacement) from the stored
     * samples and targets.
     *
     * The generator is seeded with $index so that the subset drawn for a given
     * classifier position is repeatable. The draw therefore has to use
     * mt_rand(): random_int() reads from the system CSPRNG and ignores the
     * seed, which turned the srand() call into a no-op.
     *
     * @param int $index Position of the base classifier, used as the seed
     *
     * @return array Two-element list: [samples, targets]
     */
    protected function getRandomSubset(int $index): array
    {
        $samples = [];
        $targets = [];

        srand($index);
        $bootstrapSize = $this->subsetRatio * $this->numSamples;
        $maxIndex = $this->numSamples - 1;
        for ($i = 0; $i < $bootstrapSize; ++$i) {
            $rand = mt_rand(0, $maxIndex);
            $samples[] = $this->samples[$rand];
            $targets[] = $this->targets[$rand];
        }

        return [$samples, $targets];
    }

    /**
     * Instantiates numClassifier base classifiers via reflection, passing the
     * configured options as positional constructor arguments when any are set.
     */
    protected function initClassifiers(): array
    {
        $classifiers = [];
        for ($i = 0; $i < $this->numClassifier; ++$i) {
            $ref = new ReflectionClass($this->classifier);
            /** @var Classifier $obj */
            $obj = count($this->classifierOptions) === 0 ?
$ref->newInstance() : $ref->newInstanceArgs($this->classifierOptions); 141 | 142 | $classifiers[] = $this->initSingleClassifier($obj); 143 | } 144 | 145 | return $classifiers; 146 | } 147 | 148 | protected function initSingleClassifier(Classifier $classifier): Classifier 149 | { 150 | return $classifier; 151 | } 152 | 153 | /** 154 | * @return mixed 155 | */ 156 | protected function predictSample(array $sample) 157 | { 158 | $predictions = []; 159 | foreach ($this->classifiers as $classifier) { 160 | /** @var Classifier $classifier */ 161 | $predictions[] = $classifier->predict($sample); 162 | } 163 | 164 | $counts = array_count_values($predictions); 165 | arsort($counts); 166 | reset($counts); 167 | 168 | return key($counts); 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/Classification/Ensemble/RandomForest.php: -------------------------------------------------------------------------------- 1 | setSubsetRatio(1.0); 33 | } 34 | 35 | /** 36 | * This method is used to determine how many of the original columns (features) 37 | * will be used to construct subsets to train base classifiers.
38 | * 39 | * Allowed values: 'sqrt', 'log' or any float number between 0.1 and 1.0
*
     * Default value for the ratio is 'log' which results in log(numFeatures, 2) + 1
     * features to be taken into consideration while selecting subspace of features
     *
     * @param mixed $ratio
     */
    public function setFeatureSubsetRatio($ratio): self
    {
        if (is_float($ratio)) {
            // Numeric form: must lie inside [0.1, 1.0].
            if ($ratio < 0.1 || $ratio > 1.0) {
                throw new InvalidArgumentException('When a float is given, feature subset ratio should be between 0.1 and 1.0');
            }
        } elseif (is_string($ratio)) {
            // Symbolic form: only the two named strategies are accepted.
            if (!in_array($ratio, ['sqrt', 'log'], true)) {
                throw new InvalidArgumentException("When a string is given, feature subset ratio can only be 'sqrt' or 'log'");
            }
        } else {
            throw new InvalidArgumentException('Feature subset ratio must be a string or a float');
        }

        $this->featureSubsetRatio = $ratio;

        return $this;
    }

    /**
     * RandomForest algorithm is usable *only* with DecisionTree
     *
     * @return $this
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        if ($classifier === DecisionTree::class) {
            parent::setClassifer($classifier, $classifierOptions);

            return $this;
        }

        throw new InvalidArgumentException('RandomForest can only use DecisionTree as base classifier');
    }

    /**
     * This will return an array including an importance value for
     * each column in the given dataset.
* Importance values for a column
     * is the average importance of that column in all trees in the forest
     *
     * @return array Map of column => importance, sorted in descending order.
     *               Values are normalized to sum to 1; when the summed
     *               importance is zero they are returned un-normalized.
     */
    public function getFeatureImportances(): array
    {
        // Traverse each tree and sum importance of the columns
        $sum = [];
        foreach ($this->classifiers as $tree) {
            /** @var DecisionTree $tree */
            foreach ($tree->getFeatureImportances() as $column => $importance) {
                $sum[$column] = ($sum[$column] ?? 0) + $importance;
            }
        }

        // Normalize & sort the importance values. Skip the division when the
        // total is zero (e.g. every tree reports zero importance) to avoid a
        // division by zero.
        $total = array_sum($sum);
        if ((float) $total !== 0.0) {
            foreach ($sum as $column => $importance) {
                $sum[$column] = $importance / $total;
            }
        }

        arsort($sum);

        return $sum;
    }

    /**
     * A string array to represent the columns is given. They are useful
     * when trying to print some information about the trees such as feature importances
     *
     * @return $this
     */
    public function setColumnNames(array $names)
    {
        $this->columnNames = $names;

        return $this;
    }

    /**
     * @return DecisionTree
     */
    protected function initSingleClassifier(Classifier $classifier): Classifier
    {
        if (!$classifier instanceof DecisionTree) {
            throw new InvalidArgumentException(
                sprintf('Classifier %s expected, got %s', DecisionTree::class, get_class($classifier))
            );
        }

        // Translate the configured ratio ('sqrt', 'log' or a float fraction)
        // into a number of features per tree.
        if (is_float($this->featureSubsetRatio)) {
            $featureCount = (int) ($this->featureSubsetRatio * $this->featureCount);
        } elseif ($this->featureSubsetRatio === 'sqrt') {
            $featureCount = (int) ($this->featureCount ** .5) + 1;
        } else {
            $featureCount = (int) log($this->featureCount, 2) + 1;
        }

        // Never ask for more features than the dataset has.
        if ($featureCount >= $this->featureCount) {
            $featureCount = $this->featureCount;
        }

        if
($this->columnNames === null) { 150 | $this->columnNames = range(0, $this->featureCount - 1); 151 | } 152 | 153 | return $classifier 154 | ->setColumnNames($this->columnNames) 155 | ->setNumFeatures($featureCount); 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/Classification/KNearestNeighbors.php: -------------------------------------------------------------------------------- 1 | k = $k; 37 | $this->samples = []; 38 | $this->targets = []; 39 | $this->distanceMetric = $distanceMetric; 40 | } 41 | 42 | /** 43 | * @return mixed 44 | */ 45 | protected function predictSample(array $sample) 46 | { 47 | $distances = $this->kNeighborsDistances($sample); 48 | $predictions = (array) array_combine(array_values($this->targets), array_fill(0, count($this->targets), 0)); 49 | 50 | foreach (array_keys($distances) as $index) { 51 | ++$predictions[$this->targets[$index]]; 52 | } 53 | 54 | arsort($predictions); 55 | reset($predictions); 56 | 57 | return key($predictions); 58 | } 59 | 60 | /** 61 | * @throws \Phpml\Exception\InvalidArgumentException 62 | */ 63 | private function kNeighborsDistances(array $sample): array 64 | { 65 | $distances = []; 66 | 67 | foreach ($this->samples as $index => $neighbor) { 68 | $distances[$index] = $this->distanceMetric->distance($sample, $neighbor); 69 | } 70 | 71 | asort($distances); 72 | 73 | return array_slice($distances, 0, $this->k, true); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/Classification/Linear/Adaline.php: -------------------------------------------------------------------------------- 1 | 31 | * 32 | * Learning rate should be a float value between 0.0(exclusive) and 1.0 (inclusive)
33 | * Maximum number of iterations can be an integer value greater than 0
34 | * If normalizeInputs is set to true, then every input given to the algorithm will be standardized 35 | * by use of standard deviation and mean calculation 36 | * 37 | * @throws InvalidArgumentException 38 | */ 39 | public function __construct( 40 | float $learningRate = 0.001, 41 | int $maxIterations = 1000, 42 | bool $normalizeInputs = true, 43 | int $trainingType = self::BATCH_TRAINING 44 | ) { 45 | if (!in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING], true)) { 46 | throw new InvalidArgumentException('Adaline can only be trained with batch and online/stochastic gradient descent algorithm'); 47 | } 48 | 49 | $this->trainingType = $trainingType; 50 | 51 | parent::__construct($learningRate, $maxIterations, $normalizeInputs); 52 | } 53 | 54 | /** 55 | * Adapts the weights with respect to given samples and targets 56 | * by use of gradient descent learning rule 57 | */ 58 | protected function runTraining(array $samples, array $targets): void 59 | { 60 | // The cost function is the sum of squares 61 | $callback = function ($weights, $sample, $target): array { 62 | $this->weights = $weights; 63 | 64 | $output = $this->output($sample); 65 | $gradient = $output - $target; 66 | $error = $gradient ** 2; 67 | 68 | return [$error, $gradient]; 69 | }; 70 | 71 | $isBatch = $this->trainingType == self::BATCH_TRAINING; 72 | 73 | parent::runGradientDescent($samples, $targets, $callback, $isBatch); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/Classification/MLPClassifier.php: -------------------------------------------------------------------------------- 1 | classes, true)) { 20 | throw new InvalidArgumentException( 21 | sprintf('Target with value "%s" is not part of the accepted classes', $target) 22 | ); 23 | } 24 | 25 | return array_search($target, $this->classes, true); 26 | } 27 | 28 | /** 29 | * @return mixed 30 | */ 31 | protected function predictSample(array $sample) 32 | { 33 | $output 
= $this->setInput($sample)->getOutput(); 34 | 35 | $predictedClass = null; 36 | $max = 0; 37 | foreach ($output as $class => $value) { 38 | if ($value > $max) { 39 | $predictedClass = $class; 40 | $max = $value; 41 | } 42 | } 43 | 44 | return $predictedClass; 45 | } 46 | 47 | /** 48 | * @param mixed $target 49 | */ 50 | protected function trainSample(array $sample, $target): void 51 | { 52 | // Feed-forward. 53 | $this->setInput($sample); 54 | 55 | // Back-propagate. 56 | $this->backpropagation->backpropagate($this->getLayers(), $this->getTargetClass($target)); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/Classification/SVC.php: -------------------------------------------------------------------------------- 1 | weights = $weights; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/Clustering/Clusterer.php: -------------------------------------------------------------------------------- 1 | epsilon = $epsilon; 36 | $this->minSamples = $minSamples; 37 | $this->distanceMetric = $distanceMetric; 38 | } 39 | 40 | public function cluster(array $samples): array 41 | { 42 | $labels = []; 43 | $n = 0; 44 | 45 | foreach ($samples as $index => $sample) { 46 | if (isset($labels[$index])) { 47 | continue; 48 | } 49 | 50 | $neighborIndices = $this->getIndicesInRegion($sample, $samples); 51 | 52 | if (count($neighborIndices) < $this->minSamples) { 53 | $labels[$index] = self::NOISE; 54 | 55 | continue; 56 | } 57 | 58 | $labels[$index] = $n; 59 | 60 | $this->expandCluster($samples, $neighborIndices, $labels, $n); 61 | 62 | ++$n; 63 | } 64 | 65 | return $this->groupByCluster($samples, $labels, $n); 66 | } 67 | 68 | private function expandCluster(array $samples, array $seeds, array &$labels, int $n): void 69 | { 70 | while (($index = array_pop($seeds)) !== null) { 71 | if (isset($labels[$index])) { 72 | if ($labels[$index] === self::NOISE) { 73 | $labels[$index] = $n; 
74 | } 75 | 76 | continue; 77 | } 78 | 79 | $labels[$index] = $n; 80 | 81 | $sample = $samples[$index]; 82 | $neighborIndices = $this->getIndicesInRegion($sample, $samples); 83 | 84 | if (count($neighborIndices) >= $this->minSamples) { 85 | $seeds = array_unique(array_merge($seeds, $neighborIndices)); 86 | } 87 | } 88 | } 89 | 90 | private function getIndicesInRegion(array $center, array $samples): array 91 | { 92 | $indices = []; 93 | 94 | foreach ($samples as $index => $sample) { 95 | if ($this->distanceMetric->distance($center, $sample) < $this->epsilon) { 96 | $indices[] = $index; 97 | } 98 | } 99 | 100 | return $indices; 101 | } 102 | 103 | private function groupByCluster(array $samples, array $labels, int $n): array 104 | { 105 | $clusters = array_fill(0, $n, []); 106 | 107 | foreach ($samples as $index => $sample) { 108 | if ($labels[$index] !== self::NOISE) { 109 | $clusters[$labels[$index]][$index] = $sample; 110 | } 111 | } 112 | 113 | // Reindex (i.e. to 0, 1, 2, ...) integer indices for backword compatibility 114 | foreach ($clusters as $index => $cluster) { 115 | $clusters[$index] = array_merge($cluster, []); 116 | } 117 | 118 | return $clusters; 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/Clustering/KMeans.php: -------------------------------------------------------------------------------- 1 | clustersNumber = $clustersNumber; 33 | $this->initialization = $initialization; 34 | } 35 | 36 | public function cluster(array $samples): array 37 | { 38 | $space = new Space(count(reset($samples))); 39 | foreach ($samples as $key => $sample) { 40 | $space->addPoint($sample, $key); 41 | } 42 | 43 | $clusters = []; 44 | foreach ($space->cluster($this->clustersNumber, $this->initialization) as $cluster) { 45 | $clusters[] = $cluster->getPoints(); 46 | } 47 | 48 | return $clusters; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- 
/src/Clustering/KMeans/Cluster.php: -------------------------------------------------------------------------------- 1 | space = $space; 27 | $this->points = new SplObjectStorage(); 28 | } 29 | 30 | public function getPoints(): array 31 | { 32 | $points = []; 33 | foreach ($this->points as $point) { 34 | if ($point->label === null) { 35 | $points[] = $point->toArray(); 36 | } else { 37 | $points[$point->label] = $point->toArray(); 38 | } 39 | } 40 | 41 | return $points; 42 | } 43 | 44 | public function toArray(): array 45 | { 46 | return [ 47 | 'centroid' => parent::toArray(), 48 | 'points' => $this->getPoints(), 49 | ]; 50 | } 51 | 52 | public function attach(Point $point): Point 53 | { 54 | if ($point instanceof self) { 55 | throw new LogicException('Cannot attach a cluster to another'); 56 | } 57 | 58 | $this->points->attach($point); 59 | 60 | return $point; 61 | } 62 | 63 | public function detach(Point $point): Point 64 | { 65 | $this->points->detach($point); 66 | 67 | return $point; 68 | } 69 | 70 | public function attachAll(SplObjectStorage $points): void 71 | { 72 | $this->points->addAll($points); 73 | } 74 | 75 | public function detachAll(SplObjectStorage $points): void 76 | { 77 | $this->points->removeAll($points); 78 | } 79 | 80 | public function updateCentroid(): void 81 | { 82 | $count = count($this->points); 83 | if ($count === 0) { 84 | return; 85 | } 86 | 87 | $centroid = $this->space->newPoint(array_fill(0, $this->dimension, 0)); 88 | 89 | foreach ($this->points as $point) { 90 | for ($n = 0; $n < $this->dimension; ++$n) { 91 | $centroid->coordinates[$n] += $point->coordinates[$n]; 92 | } 93 | } 94 | 95 | for ($n = 0; $n < $this->dimension; ++$n) { 96 | $this->coordinates[$n] = $centroid->coordinates[$n] / $count; 97 | } 98 | } 99 | 100 | /** 101 | * @return Point[]|SplObjectStorage 102 | */ 103 | public function getIterator() 104 | { 105 | return $this->points; 106 | } 107 | 108 | public function count(): int 109 | { 110 | return 
count($this->points); 111 | } 112 | 113 | public function setCoordinates(array $newCoordinates): void 114 | { 115 | $this->coordinates = $newCoordinates; 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/Clustering/KMeans/Point.php: -------------------------------------------------------------------------------- 1 | dimension = count($coordinates); 32 | $this->coordinates = $coordinates; 33 | $this->label = $label; 34 | } 35 | 36 | public function toArray(): array 37 | { 38 | return $this->coordinates; 39 | } 40 | 41 | /** 42 | * @return float|int 43 | */ 44 | public function getDistanceWith(self $point, bool $precise = true) 45 | { 46 | $distance = 0; 47 | for ($n = 0; $n < $this->dimension; ++$n) { 48 | $difference = $this->coordinates[$n] - $point->coordinates[$n]; 49 | $distance += $difference * $difference; 50 | } 51 | 52 | return $precise ? $distance ** .5 : $distance; 53 | } 54 | 55 | /** 56 | * @param Point[] $points 57 | */ 58 | public function getClosest(array $points): ?self 59 | { 60 | $minPoint = null; 61 | 62 | foreach ($points as $point) { 63 | $distance = $this->getDistanceWith($point, false); 64 | 65 | if (!isset($minDistance)) { 66 | $minDistance = $distance; 67 | $minPoint = $point; 68 | 69 | continue; 70 | } 71 | 72 | if ($distance < $minDistance) { 73 | $minDistance = $distance; 74 | $minPoint = $point; 75 | } 76 | } 77 | 78 | return $minPoint; 79 | } 80 | 81 | public function getCoordinates(): array 82 | { 83 | return $this->coordinates; 84 | } 85 | 86 | /** 87 | * @param mixed $offset 88 | */ 89 | public function offsetExists($offset): bool 90 | { 91 | return isset($this->coordinates[$offset]); 92 | } 93 | 94 | /** 95 | * @param mixed $offset 96 | * 97 | * @return mixed 98 | */ 99 | public function offsetGet($offset) 100 | { 101 | return $this->coordinates[$offset]; 102 | } 103 | 104 | /** 105 | * @param mixed $offset 106 | * @param mixed $value 107 | */ 108 | public function 
offsetSet($offset, $value): void 109 | { 110 | $this->coordinates[$offset] = $value; 111 | } 112 | 113 | /** 114 | * @param mixed $offset 115 | */ 116 | public function offsetUnset($offset): void 117 | { 118 | unset($this->coordinates[$offset]); 119 | } 120 | 121 | public function count(): int 122 | { 123 | return count($this->coordinates); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/CrossValidation/RandomSplit.php: -------------------------------------------------------------------------------- 1 | getSamples(); 14 | $labels = $dataset->getTargets(); 15 | $datasetSize = count($samples); 16 | $testCount = count($this->testSamples); 17 | 18 | for ($i = $datasetSize; $i > 0; --$i) { 19 | $key = mt_rand(0, $datasetSize - 1); 20 | $setName = (count($this->testSamples) - $testCount) / $datasetSize >= $testSize ? 'train' : 'test'; 21 | 22 | $this->{$setName.'Samples'}[] = $samples[$key]; 23 | $this->{$setName.'Labels'}[] = $labels[$key]; 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/CrossValidation/Split.php: -------------------------------------------------------------------------------- 1 | = 1) { 35 | throw new InvalidArgumentException('testsize must be between 0.0 and 1.0'); 36 | } 37 | 38 | $this->seedGenerator($seed); 39 | 40 | $this->splitDataset($dataset, $testSize); 41 | } 42 | 43 | public function getTrainSamples(): array 44 | { 45 | return $this->trainSamples; 46 | } 47 | 48 | public function getTestSamples(): array 49 | { 50 | return $this->testSamples; 51 | } 52 | 53 | public function getTrainLabels(): array 54 | { 55 | return $this->trainLabels; 56 | } 57 | 58 | public function getTestLabels(): array 59 | { 60 | return $this->testLabels; 61 | } 62 | 63 | abstract protected function splitDataset(Dataset $dataset, float $testSize): void; 64 | 65 | protected function seedGenerator(?int $seed = null): void 66 | { 67 | if ($seed === 
null) { 68 | mt_srand(); 69 | } else { 70 | mt_srand($seed); 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/CrossValidation/StratifiedRandomSplit.php: -------------------------------------------------------------------------------- 1 | splitByTarget($dataset); 15 | 16 | foreach ($datasets as $targetSet) { 17 | parent::splitDataset($targetSet, $testSize); 18 | } 19 | } 20 | 21 | /** 22 | * @return Dataset[] 23 | */ 24 | private function splitByTarget(Dataset $dataset): array 25 | { 26 | $targets = $dataset->getTargets(); 27 | $samples = $dataset->getSamples(); 28 | 29 | $uniqueTargets = array_unique($targets); 30 | /** @var array $split */ 31 | $split = array_combine($uniqueTargets, array_fill(0, count($uniqueTargets), [])); 32 | 33 | foreach ($samples as $key => $sample) { 34 | $split[$targets[$key]][] = $sample; 35 | } 36 | 37 | return $this->createDatasets($uniqueTargets, $split); 38 | } 39 | 40 | private function createDatasets(array $uniqueTargets, array $split): array 41 | { 42 | $datasets = []; 43 | foreach ($uniqueTargets as $target) { 44 | $datasets[$target] = new ArrayDataset($split[$target], array_fill(0, count($split[$target]), $target)); 45 | } 46 | 47 | return $datasets; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/Dataset/ArrayDataset.php: -------------------------------------------------------------------------------- 1 | samples = $samples; 31 | $this->targets = $targets; 32 | } 33 | 34 | public function getSamples(): array 35 | { 36 | return $this->samples; 37 | } 38 | 39 | public function getTargets(): array 40 | { 41 | return $this->targets; 42 | } 43 | 44 | /** 45 | * @param int[] $columns 46 | */ 47 | public function removeColumns(array $columns): void 48 | { 49 | foreach ($this->samples as &$sample) { 50 | $this->removeColumnsFromSample($sample, $columns); 51 | } 52 | } 53 | 54 | private function 
removeColumnsFromSample(array &$sample, array $columns): void 55 | { 56 | foreach ($columns as $index) { 57 | unset($sample[$index]); 58 | } 59 | 60 | $sample = array_values($sample); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/Dataset/CsvDataset.php: -------------------------------------------------------------------------------- 1 | columnNames = array_slice((array) $data, 0, $features); 33 | } else { 34 | $this->columnNames = range(0, $features - 1); 35 | } 36 | 37 | $samples = $targets = []; 38 | while ($data = fgetcsv($handle, $maxLineLength, $delimiter)) { 39 | $samples[] = array_slice($data, 0, $features); 40 | $targets[] = $data[$features]; 41 | } 42 | 43 | fclose($handle); 44 | 45 | parent::__construct($samples, $targets); 46 | } 47 | 48 | public function getColumnNames(): array 49 | { 50 | return $this->columnNames; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/Dataset/Dataset.php: -------------------------------------------------------------------------------- 1 | scanRootPath($rootPath); 18 | } 19 | 20 | private function scanRootPath(string $rootPath): void 21 | { 22 | $dirs = glob($rootPath.DIRECTORY_SEPARATOR.'*', GLOB_ONLYDIR); 23 | 24 | if ($dirs === false) { 25 | throw new DatasetException(sprintf('An error occurred during directory "%s" scan', $rootPath)); 26 | } 27 | 28 | foreach ($dirs as $dir) { 29 | $this->scanDir($dir); 30 | } 31 | } 32 | 33 | private function scanDir(string $dir): void 34 | { 35 | $target = basename($dir); 36 | 37 | $files = glob($dir.DIRECTORY_SEPARATOR.'*'); 38 | if ($files === false) { 39 | return; 40 | } 41 | 42 | foreach (array_filter($files, 'is_file') as $file) { 43 | $this->samples[] = file_get_contents($file); 44 | $this->targets[] = $target; 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/Dataset/MnistDataset.php: 
-------------------------------------------------------------------------------- 1 | samples = $this->readImages($imagePath); 26 | $this->targets = $this->readLabels($labelPath); 27 | 28 | if (count($this->samples) !== count($this->targets)) { 29 | throw new InvalidArgumentException('Must have the same number of images and labels'); 30 | } 31 | } 32 | 33 | private function readImages(string $imagePath): array 34 | { 35 | $stream = fopen($imagePath, 'rb'); 36 | 37 | if ($stream === false) { 38 | throw new InvalidArgumentException('Could not open file: '.$imagePath); 39 | } 40 | 41 | $images = []; 42 | 43 | try { 44 | $header = fread($stream, 16); 45 | 46 | $fields = unpack('Nmagic/Nsize/Nrows/Ncols', (string) $header); 47 | 48 | if ($fields['magic'] !== self::MAGIC_IMAGE) { 49 | throw new InvalidArgumentException('Invalid magic number: '.$imagePath); 50 | } 51 | 52 | if ($fields['rows'] != self::IMAGE_ROWS) { 53 | throw new InvalidArgumentException('Invalid number of image rows: '.$imagePath); 54 | } 55 | 56 | if ($fields['cols'] != self::IMAGE_COLS) { 57 | throw new InvalidArgumentException('Invalid number of image cols: '.$imagePath); 58 | } 59 | 60 | for ($i = 0; $i < $fields['size']; $i++) { 61 | $imageBytes = fread($stream, $fields['rows'] * $fields['cols']); 62 | 63 | // Convert to float between 0 and 1 64 | $images[] = array_map(function ($b) { 65 | return $b / 255; 66 | }, array_values(unpack('C*', (string) $imageBytes))); 67 | } 68 | } finally { 69 | fclose($stream); 70 | } 71 | 72 | return $images; 73 | } 74 | 75 | private function readLabels(string $labelPath): array 76 | { 77 | $stream = fopen($labelPath, 'rb'); 78 | 79 | if ($stream === false) { 80 | throw new InvalidArgumentException('Could not open file: '.$labelPath); 81 | } 82 | 83 | $labels = []; 84 | 85 | try { 86 | $header = fread($stream, 8); 87 | 88 | $fields = unpack('Nmagic/Nsize', (string) $header); 89 | 90 | if ($fields['magic'] !== self::MAGIC_LABEL) { 91 | throw new 
InvalidArgumentException('Invalid magic number: '.$labelPath); 92 | } 93 | 94 | $labels = fread($stream, $fields['size']); 95 | } finally { 96 | fclose($stream); 97 | } 98 | 99 | return array_values(unpack('C*', (string) $labels)); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/Dataset/SvmDataset.php: -------------------------------------------------------------------------------- 1 | $maxIndex) { 69 | $maxIndex = $index; 70 | $sample = array_pad($sample, $maxIndex + 1, 0); 71 | } 72 | 73 | $sample[$index] = $value; 74 | } 75 | 76 | return [$sample, $target, $maxIndex]; 77 | } 78 | 79 | private static function parseLine(string $line): array 80 | { 81 | $line = explode('#', $line, 2)[0]; 82 | $line = rtrim($line); 83 | $line = str_replace("\t", ' ', $line); 84 | 85 | return explode(' ', $line); 86 | } 87 | 88 | private static function parseTargetColumn(string $column): float 89 | { 90 | if (!is_numeric($column)) { 91 | throw new DatasetException(sprintf('Invalid target "%s".', $column)); 92 | } 93 | 94 | return (float) $column; 95 | } 96 | 97 | private static function parseFeatureColumn(string $column): array 98 | { 99 | $feature = explode(':', $column, 2); 100 | if (count($feature) !== 2) { 101 | throw new DatasetException(sprintf('Invalid value "%s".', $column)); 102 | } 103 | 104 | $index = self::parseFeatureIndex($feature[0]); 105 | $value = self::parseFeatureValue($feature[1]); 106 | 107 | return [$index, $value]; 108 | } 109 | 110 | private static function parseFeatureIndex(string $index): int 111 | { 112 | if (!is_numeric($index) || !ctype_digit($index)) { 113 | throw new DatasetException(sprintf('Invalid index "%s".', $index)); 114 | } 115 | 116 | if ((int) $index < 1) { 117 | throw new DatasetException(sprintf('Invalid index "%s".', $index)); 118 | } 119 | 120 | return (int) $index - 1; 121 | } 122 | 123 | private static function parseFeatureValue(string $value): float 124 | { 125 | if 
(!is_numeric($value)) { 126 | throw new DatasetException(sprintf('Invalid value "%s".', $value)); 127 | } 128 | 129 | return (float) $value; 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/DimensionReduction/EigenTransformerBase.php: -------------------------------------------------------------------------------- 1 | getRealEigenvalues(); 55 | $eigVects = $eig->getEigenvectors(); 56 | 57 | $totalEigVal = array_sum($eigVals); 58 | // Sort eigenvalues in descending order 59 | arsort($eigVals); 60 | 61 | $explainedVar = 0.0; 62 | $vectors = []; 63 | $values = []; 64 | foreach ($eigVals as $i => $eigVal) { 65 | $explainedVar += $eigVal / $totalEigVal; 66 | $vectors[] = $eigVects[$i]; 67 | $values[] = $eigVal; 68 | 69 | if ($this->numFeatures !== null) { 70 | if (count($vectors) == $this->numFeatures) { 71 | break; 72 | } 73 | } else { 74 | if ($explainedVar >= $this->totalVariance) { 75 | break; 76 | } 77 | } 78 | } 79 | 80 | $this->eigValues = $values; 81 | $this->eigVectors = $vectors; 82 | } 83 | 84 | /** 85 | * Returns the reduced data 86 | */ 87 | protected function reduce(array $data): array 88 | { 89 | $m1 = new Matrix($data); 90 | $m2 = new Matrix($this->eigVectors); 91 | 92 | return $m1->multiply($m2->transpose())->toArray(); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/DimensionReduction/PCA.php: -------------------------------------------------------------------------------- 1 | 31 | * 32 | * @param float $totalVariance Total explained variance to be preserved 33 | * @param int $numFeatures Number of features to be preserved 34 | * 35 | * @throws InvalidArgumentException 36 | */ 37 | public function __construct(?float $totalVariance = null, ?int $numFeatures = null) 38 | { 39 | if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) { 40 | throw new InvalidArgumentException('Total variance can be a value between 0.1 
and 0.99'); 41 | } 42 | 43 | if ($numFeatures !== null && $numFeatures <= 0) { 44 | throw new InvalidArgumentException('Number of features to be preserved should be greater than 0'); 45 | } 46 | 47 | if (($totalVariance !== null) === ($numFeatures !== null)) { 48 | throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm'); 49 | } 50 | 51 | if ($numFeatures !== null) { 52 | $this->numFeatures = $numFeatures; 53 | } 54 | 55 | if ($totalVariance !== null) { 56 | $this->totalVariance = $totalVariance; 57 | } 58 | } 59 | 60 | /** 61 | * Takes a dataset and returns a lower-dimensional version 62 | * of this data while preserving $totalVariance or $numFeatures.
63 | * $data is an n-by-m matrix and returned array is 64 | * n-by-k matrix where k <= m 65 | */ 66 | public function fit(array $data): array 67 | { 68 | $n = count($data[0]); 69 | 70 | $data = $this->normalize($data, $n); 71 | 72 | $covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0)); 73 | 74 | $this->eigenDecomposition($covMatrix); 75 | 76 | $this->fit = true; 77 | 78 | return $this->reduce($data); 79 | } 80 | 81 | /** 82 | * Transforms the given sample to a lower dimensional vector by using 83 | * the eigenVectors obtained in the last run of fit. 84 | * 85 | * @throws InvalidOperationException 86 | */ 87 | public function transform(array $sample): array 88 | { 89 | if (!$this->fit) { 90 | throw new InvalidOperationException('PCA has not been fitted with respect to original dataset, please run PCA::fit() first'); 91 | } 92 | 93 | if (!is_array($sample[0])) { 94 | $sample = [$sample]; 95 | } 96 | 97 | $sample = $this->normalize($sample, count($sample[0])); 98 | 99 | return $this->reduce($sample); 100 | } 101 | 102 | protected function calculateMeans(array $data, int $n): void 103 | { 104 | // Calculate means for each dimension 105 | $this->means = []; 106 | for ($i = 0; $i < $n; ++$i) { 107 | $column = array_column($data, $i); 108 | $this->means[] = Mean::arithmetic($column); 109 | } 110 | } 111 | 112 | /** 113 | * Normalization of the data includes subtracting mean from 114 | * each dimension therefore dimensions will be centered to zero 115 | */ 116 | protected function normalize(array $data, int $n): array 117 | { 118 | if (count($this->means) === 0) { 119 | $this->calculateMeans($data, $n); 120 | } 121 | 122 | // Normalize data 123 | foreach (array_keys($data) as $i) { 124 | for ($k = 0; $k < $n; ++$k) { 125 | $data[$i][$k] -= $this->means[$k]; 126 | } 127 | } 128 | 129 | return $data; 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/Estimator.php: 
-------------------------------------------------------------------------------- 1 | stopWords = array_fill_keys($stopWords, true); 19 | } 20 | 21 | public function isStopWord(string $token): bool 22 | { 23 | return isset($this->stopWords[$token]); 24 | } 25 | 26 | public static function factory(string $language = 'English'): self 27 | { 28 | $className = __NAMESPACE__."\\StopWords\\{$language}"; 29 | 30 | if (!class_exists($className)) { 31 | throw new InvalidArgumentException(sprintf('Can\'t find "%s" language for StopWords', $language)); 32 | } 33 | 34 | return new $className(); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/FeatureExtraction/StopWords/English.php: -------------------------------------------------------------------------------- 1 | stopWords); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/FeatureExtraction/StopWords/French.php: -------------------------------------------------------------------------------- 1 | stopWords); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/FeatureExtraction/StopWords/German.php: -------------------------------------------------------------------------------- 1 | stopWords); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/FeatureExtraction/StopWords/Polish.php: -------------------------------------------------------------------------------- 1 | stopWords); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/FeatureExtraction/StopWords/Russian.php: -------------------------------------------------------------------------------- 1 | stopWords); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/FeatureExtraction/TfIdfTransformer.php: 
-------------------------------------------------------------------------------- 1 | 0) { 19 | $this->fit($samples); 20 | } 21 | } 22 | 23 | public function fit(array $samples, ?array $targets = null): void 24 | { 25 | $this->countTokensFrequency($samples); 26 | 27 | $count = count($samples); 28 | foreach ($this->idf as &$value) { 29 | $value = log((float) ($count / $value), 10.0); 30 | } 31 | } 32 | 33 | public function transform(array &$samples, ?array &$targets = null): void 34 | { 35 | foreach ($samples as &$sample) { 36 | foreach ($sample as $index => &$feature) { 37 | $feature *= $this->idf[$index]; 38 | } 39 | } 40 | } 41 | 42 | private function countTokensFrequency(array $samples): void 43 | { 44 | $this->idf = array_fill_keys(array_keys($samples[0]), 0); 45 | 46 | foreach ($samples as $sample) { 47 | foreach ($sample as $index => $count) { 48 | if ($count > 0) { 49 | ++$this->idf[$index]; 50 | } 51 | } 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/FeatureExtraction/TokenCountVectorizer.php: -------------------------------------------------------------------------------- 1 | tokenizer = $tokenizer; 40 | $this->stopWords = $stopWords; 41 | $this->minDF = $minDF; 42 | } 43 | 44 | public function fit(array $samples, ?array $targets = null): void 45 | { 46 | $this->buildVocabulary($samples); 47 | } 48 | 49 | public function transform(array &$samples, ?array &$targets = null): void 50 | { 51 | array_walk($samples, function (string &$sample): void { 52 | $this->transformSample($sample); 53 | }); 54 | 55 | $this->checkDocumentFrequency($samples); 56 | } 57 | 58 | public function getVocabulary(): array 59 | { 60 | return array_flip($this->vocabulary); 61 | } 62 | 63 | private function buildVocabulary(array &$samples): void 64 | { 65 | foreach ($samples as $sample) { 66 | $tokens = $this->tokenizer->tokenize($sample); 67 | foreach ($tokens as $token) { 68 | $this->addTokenToVocabulary($token); 69 | } 
70 | } 71 | } 72 | 73 | private function transformSample(string &$sample): void 74 | { 75 | $counts = []; 76 | $tokens = $this->tokenizer->tokenize($sample); 77 | 78 | foreach ($tokens as $token) { 79 | $index = $this->getTokenIndex($token); 80 | if ($index !== false) { 81 | $this->updateFrequency($token); 82 | if (!isset($counts[$index])) { 83 | $counts[$index] = 0; 84 | } 85 | 86 | ++$counts[$index]; 87 | } 88 | } 89 | 90 | foreach ($this->vocabulary as $index) { 91 | if (!isset($counts[$index])) { 92 | $counts[$index] = 0; 93 | } 94 | } 95 | 96 | ksort($counts); 97 | 98 | $sample = $counts; 99 | } 100 | 101 | /** 102 | * @return int|bool 103 | */ 104 | private function getTokenIndex(string $token) 105 | { 106 | if ($this->isStopWord($token)) { 107 | return false; 108 | } 109 | 110 | return $this->vocabulary[$token] ?? false; 111 | } 112 | 113 | private function addTokenToVocabulary(string $token): void 114 | { 115 | if ($this->isStopWord($token)) { 116 | return; 117 | } 118 | 119 | if (!isset($this->vocabulary[$token])) { 120 | $this->vocabulary[$token] = count($this->vocabulary); 121 | } 122 | } 123 | 124 | private function isStopWord(string $token): bool 125 | { 126 | return $this->stopWords !== null && $this->stopWords->isStopWord($token); 127 | } 128 | 129 | private function updateFrequency(string $token): void 130 | { 131 | if (!isset($this->frequencies[$token])) { 132 | $this->frequencies[$token] = 0; 133 | } 134 | 135 | ++$this->frequencies[$token]; 136 | } 137 | 138 | private function checkDocumentFrequency(array &$samples): void 139 | { 140 | if ($this->minDF > 0) { 141 | $beyondMinimum = $this->getBeyondMinimumIndexes(count($samples)); 142 | foreach ($samples as &$sample) { 143 | $this->resetBeyondMinimum($sample, $beyondMinimum); 144 | } 145 | } 146 | } 147 | 148 | private function resetBeyondMinimum(array &$sample, array $beyondMinimum): void 149 | { 150 | foreach ($beyondMinimum as $index) { 151 | $sample[$index] = 0; 152 | } 153 | } 154 | 155 | 
private function getBeyondMinimumIndexes(int $samplesCount): array 156 | { 157 | $indexes = []; 158 | foreach ($this->frequencies as $token => $frequency) { 159 | if (($frequency / $samplesCount) < $this->minDF) { 160 | $indexes[] = $this->getTokenIndex((string) $token); 161 | } 162 | } 163 | 164 | return $indexes; 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/FeatureSelection/ScoringFunction.php: -------------------------------------------------------------------------------- 1 | $sample) { 16 | $grouped[$targets[$index]][] = $sample; 17 | } 18 | 19 | return ANOVA::oneWayF(array_values($grouped)); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/FeatureSelection/ScoringFunction/UnivariateLinearRegression.php: -------------------------------------------------------------------------------- 1 | center = $center; 36 | } 37 | 38 | public function score(array $samples, array $targets): array 39 | { 40 | if ($this->center) { 41 | $this->centerTargets($targets); 42 | $this->centerSamples($samples); 43 | } 44 | 45 | $correlations = []; 46 | foreach (array_keys($samples[0]) as $index) { 47 | $featureColumn = array_column($samples, $index); 48 | $correlations[$index] = 49 | Matrix::dot($targets, $featureColumn)[0] / (new Matrix($featureColumn, false))->transpose()->frobeniusNorm() 50 | / (new Matrix($targets, false))->frobeniusNorm(); 51 | } 52 | 53 | $degreesOfFreedom = count($targets) - ($this->center ? 
2 : 1); 54 | 55 | return array_map(function (float $correlation) use ($degreesOfFreedom): float { 56 | return $correlation ** 2 / (1 - $correlation ** 2) * $degreesOfFreedom; 57 | }, $correlations); 58 | } 59 | 60 | private function centerTargets(array &$targets): void 61 | { 62 | $mean = Mean::arithmetic($targets); 63 | array_walk($targets, function (&$target) use ($mean): void { 64 | $target -= $mean; 65 | }); 66 | } 67 | 68 | private function centerSamples(array &$samples): void 69 | { 70 | $means = []; 71 | foreach ($samples[0] as $index => $feature) { 72 | $means[$index] = Mean::arithmetic(array_column($samples, $index)); 73 | } 74 | 75 | foreach ($samples as &$sample) { 76 | foreach ($sample as $index => &$feature) { 77 | $feature -= $means[$index]; 78 | } 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/FeatureSelection/SelectKBest.php: -------------------------------------------------------------------------------- 1 | scoringFunction = $scoringFunction; 41 | $this->k = $k; 42 | } 43 | 44 | public function fit(array $samples, ?array $targets = null): void 45 | { 46 | if ($targets === null || count($targets) === 0) { 47 | throw new InvalidArgumentException('The array has zero elements'); 48 | } 49 | 50 | $this->scores = $sorted = $this->scoringFunction->score($samples, $targets); 51 | if ($this->k >= count($sorted)) { 52 | return; 53 | } 54 | 55 | arsort($sorted); 56 | $this->keepColumns = array_slice($sorted, 0, $this->k, true); 57 | } 58 | 59 | public function transform(array &$samples, ?array &$targets = null): void 60 | { 61 | if ($this->keepColumns === null) { 62 | return; 63 | } 64 | 65 | foreach ($samples as &$sample) { 66 | $sample = array_values(array_intersect_key($sample, $this->keepColumns)); 67 | } 68 | } 69 | 70 | public function scores(): array 71 | { 72 | if ($this->scores === null) { 73 | throw new InvalidOperationException('SelectKBest require to fit first to get scores'); 74 | 
} 75 | 76 | return $this->scores; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/FeatureSelection/VarianceThreshold.php: -------------------------------------------------------------------------------- 1 | threshold = $threshold; 36 | } 37 | 38 | public function fit(array $samples, ?array $targets = null): void 39 | { 40 | $this->variances = array_map(static function (array $column): float { 41 | return Variance::population($column); 42 | }, Matrix::transposeArray($samples)); 43 | 44 | foreach ($this->variances as $column => $variance) { 45 | if ($variance > $this->threshold) { 46 | $this->keepColumns[$column] = true; 47 | } 48 | } 49 | } 50 | 51 | public function transform(array &$samples, ?array &$targets = null): void 52 | { 53 | foreach ($samples as &$sample) { 54 | $sample = array_values(array_intersect_key($sample, $this->keepColumns)); 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/FeatureUnion.php: -------------------------------------------------------------------------------- 1 | pipelines = array_map(static function (Pipeline $pipeline): Pipeline { 26 | return $pipeline; 27 | }, $pipelines); 28 | } 29 | 30 | public function fit(array $samples, ?array $targets = null): void 31 | { 32 | $originSamples = $samples; 33 | foreach ($this->pipelines as $pipeline) { 34 | foreach ($pipeline->getTransformers() as $transformer) { 35 | $transformer->fit($samples, $targets); 36 | $transformer->transform($samples, $targets); 37 | } 38 | $samples = $originSamples; 39 | } 40 | } 41 | 42 | public function transform(array &$samples, ?array &$targets = null): void 43 | { 44 | $this->transformSamples($samples, $targets); 45 | } 46 | 47 | public function fitAndTransform(array &$samples, ?array &$targets = null): void 48 | { 49 | $this->transformSamples($samples, $targets, true); 50 | } 51 | 52 | private function transformSamples(array &$samples, ?array 
&$targets = null, bool $fit = false): void 53 | { 54 | $union = []; 55 | $originSamples = $samples; 56 | foreach ($this->pipelines as $pipeline) { 57 | foreach ($pipeline->getTransformers() as $transformer) { 58 | if ($fit) { 59 | $transformer->fit($samples, $targets); 60 | } 61 | $transformer->transform($samples, $targets); 62 | } 63 | 64 | foreach ($samples as $index => $sample) { 65 | $union[$index] = array_merge($union[$index] ?? [], is_array($sample) ? $sample : [$sample]); 66 | } 67 | $samples = $originSamples; 68 | } 69 | 70 | $samples = $union; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/Helper/OneVsRest.php: -------------------------------------------------------------------------------- 1 | reset(); 35 | 36 | $this->trainByLabel($samples, $targets); 37 | } 38 | 39 | /** 40 | * Resets the classifier and the vars internally used by OneVsRest to create multiple classifiers. 41 | */ 42 | public function reset(): void 43 | { 44 | $this->classifiers = []; 45 | $this->allLabels = []; 46 | $this->costValues = []; 47 | 48 | $this->resetBinary(); 49 | } 50 | 51 | protected function trainByLabel(array $samples, array $targets, array $allLabels = []): void 52 | { 53 | // Overwrites the current value if it exists. $allLabels must be provided for each partialTrain run. 54 | $this->allLabels = count($allLabels) === 0 ? array_keys(array_count_values($targets)) : $allLabels; 55 | sort($this->allLabels, SORT_STRING); 56 | 57 | // If there are only two targets, then there is no need to perform OvR 58 | if (count($this->allLabels) === 2) { 59 | // Init classifier if required. 
60 | if (count($this->classifiers) === 0) { 61 | $this->classifiers[0] = $this->getClassifierCopy(); 62 | } 63 | 64 | $this->classifiers[0]->trainBinary($samples, $targets, $this->allLabels); 65 | } else { 66 | // Train a separate classifier for each label and memorize them 67 | 68 | foreach ($this->allLabels as $label) { 69 | // Init classifier if required. 70 | if (!isset($this->classifiers[$label])) { 71 | $this->classifiers[$label] = $this->getClassifierCopy(); 72 | } 73 | 74 | [$binarizedTargets, $classifierLabels] = $this->binarizeTargets($targets, $label); 75 | $this->classifiers[$label]->trainBinary($samples, $binarizedTargets, $classifierLabels); 76 | } 77 | } 78 | 79 | // If the underlying classifier is capable of giving the cost values 80 | // during the training, then assign it to the relevant variable 81 | // Adding just the first classifier cost values to avoid complex average calculations. 82 | $classifierref = reset($this->classifiers); 83 | if (method_exists($classifierref, 'getCostValues')) { 84 | $this->costValues = $classifierref->getCostValues(); 85 | } 86 | } 87 | 88 | /** 89 | * Returns an instance of the current class after cleaning up OneVsRest stuff. 
/**
 * Predicts the label of a single sample.
 *
 * @return mixed The binary classifier's result when exactly two labels are known,
 *               otherwise the label whose classifier reports the highest probability
 */
protected function predictSample(array $sample)
{
    $isMultiClass = count($this->allLabels) !== 2;

    if (!$isMultiClass) {
        return $this->classifiers[0]->predictSampleBinary($sample);
    }

    $scores = [];
    foreach ($this->classifiers as $candidate => $classifier) {
        $scores[$candidate] = $classifier->predictProbability($sample, $candidate);
    }

    // Highest score first; the key now at the front is the winning label.
    arsort($scores, SORT_NUMERIC);

    return key($scores);
}
/**
 * Groups all targets into two groups: targets equal to the given label
 * and all the others (renamed to "not_<label>").
 *
 * $targets is received by value, so the caller's array is left untouched.
 *
 * @param mixed $label
 *
 * @return array Two-element list: the binarized targets, then [$label, "not_<label>"]
 */
private function binarizeTargets(array $targets, $label): array
{
    $restLabel = "not_{$label}";

    // array_map() with a single input array keeps the original keys.
    $binarized = array_map(
        static function ($target) use ($label, $restLabel) {
            // Loose (==) comparison: e.g. 1 and '1' count as the same label.
            return $target == $label ? $label : $restLabel;
        },
        $targets
    );

    return [$binarized, [$label, $restLabel]];
}
/**
 * Applies one batch update to every weight held in $this->theta.
 *
 * @param array $updates Per-sample update terms, indexed like the rows of $this->samples
 * @param float $penalty Penalty term applied to every weight except theta[0]
 */
protected function updateWeightsWithUpdates(array $updates, float $penalty): void
{
    for ($weightIndex = 0; $weightIndex <= $this->dimensions; ++$weightIndex) {
        if ($weightIndex === 0) {
            // theta[0] moves by the plain sum of the updates; no penalty is applied.
            $this->theta[0] -= $this->learningRate * array_sum($updates);

            continue;
        }

        // theta[$weightIndex] pairs with sample column ($weightIndex - 1).
        $featureValues = array_column($this->samples, $weightIndex - 1);

        $weightedSum = 0;
        foreach ($featureValues as $row => $featureValue) {
            $weightedSum += $featureValue * $updates[$row];
        }

        $regularization = $penalty * $this->theta[$weightIndex];
        $this->theta[$weightIndex] -= $this->learningRate * ($weightedSum + $regularization);
    }
}
61 | } 62 | -------------------------------------------------------------------------------- /src/Helper/Predictable.php: -------------------------------------------------------------------------------- 1 | predictSample($samples); 16 | } 17 | 18 | $predicted = []; 19 | foreach ($samples as $index => $sample) { 20 | $predicted[$index] = $this->predictSample($sample); 21 | } 22 | 23 | return $predicted; 24 | } 25 | 26 | /** 27 | * @return mixed 28 | */ 29 | abstract protected function predictSample(array $sample); 30 | } 31 | -------------------------------------------------------------------------------- /src/Helper/Trainable.php: -------------------------------------------------------------------------------- 1 | samples = array_merge($this->samples, $samples); 22 | $this->targets = array_merge($this->targets, $targets); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/IncrementalEstimator.php: -------------------------------------------------------------------------------- 1 | ': 21 | return $a > $b; 22 | case '>=': 23 | return $a >= $b; 24 | case '=': 25 | case '==': 26 | return $a == $b; 27 | case '===': 28 | return $a === $b; 29 | case '<=': 30 | return $a <= $b; 31 | case '<': 32 | return $a < $b; 33 | case '!=': 34 | case '<>': 35 | return $a != $b; 36 | case '!==': 37 | return $a !== $b; 38 | default: 39 | throw new InvalidArgumentException(sprintf('Invalid operator "%s" provided', $operator)); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/Math/Distance.php: -------------------------------------------------------------------------------- 1 | deltas($a, $b)); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/Math/Distance/Distance.php: -------------------------------------------------------------------------------- 1 | norm = $norm; 26 | } 27 | 28 | /** 29 | * @throws 
/**
 * Returns the absolute element-wise differences between two vectors.
 *
 * Elements are paired by position, so arrays with non-sequential or string
 * keys are accepted as well as plain lists.
 *
 * @param array $a First vector
 * @param array $b Second vector; must hold as many elements as $a
 *
 * @return array List of |a - b| values, one per position
 *
 * @throws InvalidArgumentException when the arrays differ in size
 */
protected function deltas(array $a, array $b): array
{
    $count = count($a);

    if ($count !== count($b)) {
        throw new InvalidArgumentException('Size of given arrays does not match');
    }

    // Re-index both inputs: reading offsets 0..n-1 directly fails on arrays
    // whose keys are not a 0-based list (e.g. rows that kept their keys after filtering).
    $a = array_values($a);
    $b = array_values($b);

    $deltas = [];

    for ($i = 0; $i < $count; ++$i) {
        $deltas[] = abs($a[$i] - $b[$i]);
    }

    return $deltas;
}
/**
 * Creates the Cartesian product of A and B.
 *
 * @return Set[] One Set per (element of A, element of B) pair; A is iterated outermost
 */
public static function cartesian(self $a, self $b): array
{
    $pairs = [];

    foreach ($a as $fromA) {
        foreach ($b as $fromB) {
            // Same element order as merging [$fromB] with [$fromA].
            $pairs[] = new self([$fromB, $fromA]);
        }
    }

    return $pairs;
}
/**
 * Creates the power set of A.
 *
 * @return Set[] The subsets of A, beginning with the empty set
 */
public static function power(self $a): array
{
    $subsets = [new self()];

    foreach ($a as $element) {
        // Extend every subset collected so far with the current element.
        // Working on a snapshot keeps this round from visiting the subsets it adds.
        $known = $subsets;

        foreach ($known as $subset) {
            $members = $subset->toArray();
            array_unshift($members, $element);

            $subsets[] = new self($members);
        }
    }

    return $subsets;
}
/**
 * Removes duplicates and rewrites index.
 *
 * The elements are sorted first, then de-duplicated, and finally re-indexed
 * so the result is a 0-based list.
 *
 * SORT_REGULAR is passed explicitly: SORT_ASC is an ordering constant meant for
 * array_multisort() and is not one of the documented `flags` values of sort()
 * or array_unique().
 *
 * @param string[]|int[]|float[]|bool[] $elements
 *
 * @return string[]|int[]|float[]|bool[]
 */
private static function sanitize(array $elements): array
{
    sort($elements, SORT_REGULAR);

    return array_values(array_unique($elements, SORT_REGULAR));
}
/**
 * Computes, for every index of $ssAllSamples:
 * ($ssAllSamples[i] - $squareSumSamples[i] / $allSamples) - $ssbn[i].
 *
 * NOTE(review): the names suggest this is the within-group sum of squares derived
 * from the total and between-group sums — confirm against the caller.
 *
 * @return array Results keyed like $ssAllSamples
 */
private static function calculateSswn(array $ssbn, array $ssAllSamples, array $squareSumSamples, int $allSamples): array
{
    $within = [];

    foreach (array_keys($ssAllSamples) as $feature) {
        $total = $ssAllSamples[$feature] - $squareSumSamples[$feature] / $allSamples;
        $within[$feature] = $total - $ssbn[$feature];
    }

    return $within;
}
/**
 * Calculates covariance of two dimensions, i and k in the given data.
 *
 * @param array      $data   Rows of the dataset; each row holds one value per dimension
 * @param int        $i      Zero-based index of the first dimension (column)
 * @param int        $k      Zero-based index of the second dimension (column)
 * @param bool       $sample When true the sum is divided by (n - 1), otherwise by n
 * @param float|null $meanX  Precomputed mean of dimension $i
 * @param float|null $meanY  Precomputed mean of dimension $k
 *
 * @throws InvalidArgumentException when $data is empty, too small for a sample
 *                                  covariance, or $i / $k are not valid column indices
 * @throws \Exception
 */
public static function fromDataset(array $data, int $i, int $k, bool $sample = true, ?float $meanX = null, ?float $meanY = null): float
{
    $n = count($data);
    if ($n === 0) {
        throw new InvalidArgumentException('The array has zero elements');
    }

    if ($sample && $n === 1) {
        throw new InvalidArgumentException('The array must have at least 2 elements');
    }

    // $i and $k address columns, so they are checked against the width of a row.
    // Checking them against the number of rows rejects valid columns of wide
    // datasets and lets invalid columns of tall datasets through.
    $firstRow = reset($data);
    $dimensions = is_array($firstRow) ? count($firstRow) : 0;
    if ($i < 0 || $k < 0 || $i >= $dimensions || $k >= $dimensions) {
        throw new InvalidArgumentException('Given indices i and k do not match with the dimensionality of data');
    }

    if ($meanX === null || $meanY === null) {
        // At least one mean is missing: extract both columns and compute both means.
        $x = array_column($data, $i);
        $y = array_column($data, $k);

        $meanX = Mean::arithmetic($x);
        $meanY = Mean::arithmetic($y);
        $sum = 0.0;
        foreach ($x as $index => $xi) {
            $yi = $y[$index];
            $sum += ($xi - $meanX) * ($yi - $meanY);
        }
    } else {
        // Both means are known: walk the rows directly instead of copying the two
        // columns with array_column(), trading a little CPU time for less memory.
        $sum = 0.0;
        foreach ($data as $row) {
            $val = [0, 0];
            foreach ($row as $index => $col) {
                if ($index == $i) {
                    $val[0] = $col - $meanX;
                }

                if ($index == $k) {
                    $val[1] = $col - $meanY;
                }
            }

            $sum += $val[0] * $val[1];
        }
    }

    if ($sample) {
        --$n;
    }

    return $sum / $n;
}
/**
 * Builds a square matrix filled with zeros, with one row and one column per label.
 *
 * @param array $labels Only the number of entries is used
 *
 * @return array count($labels) rows, each a list of count($labels) zeros
 */
private static function generateMatrixWithZeros(array $labels): array
{
    $size = count($labels);
    $zeroRow = array_fill(0, $size, 0);

    // Arrays are copied by value, so every row is independent of the others.
    return array_fill(0, $size, $zeroRow);
}
/**
 * Mean of the absolute differences between each target and its prediction.
 *
 * Predictions are looked up under the same keys as the targets.
 *
 * @throws InvalidArgumentException when the two arrays differ in size
 */
public static function meanAbsoluteError(array $targets, array $predictions): float
{
    self::assertCountEquals($targets, $predictions);

    $absoluteErrors = [];
    foreach (array_keys($targets) as $position) {
        $absoluteErrors[] = abs($targets[$position] - $predictions[$position]);
    }

    return Mean::arithmetic($absoluteErrors);
}
/**
 * Derivative expressed through the already computed activation: 1 - output^2.
 *
 * NOTE(review): compute() evaluates tanh(beta * value), whose derivative with
 * respect to value carries an extra factor beta that is not applied here. The two
 * agree only for beta = 1 — confirm whether that is intended before changing it.
 *
 * @param float|int $value         Not used by this implementation
 * @param float|int $computedvalue Activation output for $value
 */
public function differentiate($value, $computedvalue): float
{
    $outputSquared = $computedvalue ** 2;

    return 1 - $outputSquared;
}
/**
 * Derivative expressed through the already computed activation: output * (1 - output).
 *
 * NOTE(review): compute() evaluates 1 / (1 + exp(-beta * value)), whose derivative
 * with respect to value carries an extra factor beta that is not applied here. The
 * two agree only for beta = 1 — confirm whether that is intended before changing it.
 *
 * @param float|int $value         Not used by this implementation
 * @param float|int $computedvalue Activation output for $value
 */
public function differentiate($value, $computedvalue): float
{
    $complement = 1 - $computedvalue;

    return $computedvalue * $complement;
}
/**
 * Removes every layer from the network.
 *
 * The property is reset to an empty array rather than unset(): after an unset()
 * the property no longer exists, so getLayers() (declared to return an array)
 * and getOutputLayer() cannot work until a layer is added again.
 */
public function removeLayers(): void
{
    $this->layers = [];
}
/**
 * Returns the neuron's activation, computing it on first use.
 *
 * A stored output of exactly 0.0 means "not computed yet" (reset() stores that
 * value), so in that state the weighted input is summed over all synapses and
 * run through the activation function.
 */
public function getOutput(): float
{
    if ($this->output !== 0.0) {
        // Already computed since the last reset().
        return $this->output;
    }

    $this->z = 0;
    foreach ($this->synapses as $incoming) {
        $this->z += $incoming->getOutput();
    }

    $this->output = $this->activationFunction->compute($this->z);

    return $this->output;
}
/**
 * Picks the initial weight of a synapse at random.
 *
 * The result is always -1 or 1: the product below is positive exactly when the
 * second draw is 1, and only its sign is used.
 */
protected function generateRandomWeight(): float
{
    $fraction = 1 / random_int(5, 25);
    $coin = random_int(0, 1);

    if ($fraction * $coin > 0) {
        return -1;
    }

    return 1;
}
/**
 * Computes the sigma (error term) of a neuron and records it in $this->sigmas.
 *
 * For the last layer the derivative is scaled by (expected - output), where the
 * expected value is 1 for the neuron whose key equals the target class and 0 for
 * every other neuron. For any other layer it is scaled by getPrevSigma().
 *
 * @param Neuron $neuron      Neuron being processed
 * @param int    $targetClass Key of the output neuron that should fire
 * @param int    $key         Key of $neuron within its layer
 * @param bool   $lastLayer   Whether $neuron belongs to the last layer
 */
private function getSigma(Neuron $neuron, int $targetClass, int $key, bool $lastLayer): float
{
    // The output is requested before the derivative, in that order.
    $output = $neuron->getOutput();
    $derivative = $neuron->getDerivative();

    if ($lastLayer) {
        $expected = $targetClass === $key ? 1 : 0;
        $factor = $expected - $output;
    } else {
        $factor = $this->getPrevSigma($neuron);
    }

    $sigma = $derivative * $factor;

    $this->sigmas[] = new Sigma($neuron, $sigma);

    return $sigma;
}
function getTransformers(): array 36 | { 37 | return $this->transformers; 38 | } 39 | 40 | public function getEstimator(): ?Estimator 41 | { 42 | return $this->estimator; 43 | } 44 | 45 | /** Fits and applies each transformer in order to local copies of the data, then trains the estimator on the result. Throws InvalidOperationException when no estimator is set. */ public function train(array $samples, array $targets): void 46 | { 47 | if ($this->estimator === null) { 48 | throw new InvalidOperationException('Pipeline without estimator can\'t use train method'); 49 | } 50 | 51 | foreach ($this->transformers as $transformer) { 52 | $transformer->fit($samples, $targets); 53 | $transformer->transform($samples, $targets); 54 | } 55 | 56 | $this->estimator->train($samples, $targets); 57 | } 58 | 59 | /** 60 | * @return mixed 61 | */ 62 | public function predict(array $samples) 63 | { 64 | /* Check for an estimator before running the transformers, so a pipeline that cannot predict fails without doing the transformation work first. */ if ($this->estimator === null) { 65 | throw new InvalidOperationException('Pipeline without estimator can\'t use predict method'); 66 | } 67 | 68 | $this->transform($samples); 69 | 70 | return $this->estimator->predict($samples); 71 | } 72 | 73 | /** Fits each transformer in order; every transformer is fitted on data already transformed by the ones before it. The caller's arrays are not modified (parameters are taken by value). */ public function fit(array $samples, ?array $targets = null): void 74 | { 75 | foreach ($this->transformers as $transformer) { 76 | $transformer->fit($samples, $targets); 77 | $transformer->transform($samples, $targets); 78 | } 79 | } 80 | 81 | /** Applies every transformer, in order, to $samples and $targets in place. */ public function transform(array &$samples, ?array &$targets = null): void 82 | { 83 | foreach ($this->transformers as $transformer) { 84 | $transformer->transform($samples, $targets); 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/Preprocessing/ColumnFilter.php: -------------------------------------------------------------------------------- 1 | datasetColumns = array_map(static function (string $column): string { 22 | return $column; 23 | }, $datasetColumns); 24 | $this->filterColumns = array_map(static function (string $column): string { 25 | return $column; 26 | }, $filterColumns); 27 | } 28 | 29 | public function fit(array $samples, ?array $targets = null): void 30 | { 31 | //nothing to do 32 | } 33 | 34 | public 
function transform(array &$samples, ?array &$targets = null): void 35 | { 36 | /* array_intersect keeps the keys of datasetColumns, so $keys maps column positions to the names that are also listed in filterColumns. */ $keys = array_intersect($this->datasetColumns, $this->filterColumns); 37 | 38 | foreach ($samples as &$sample) { 39 | $sample = array_values(array_intersect_key($sample, $keys)); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/Preprocessing/Imputer.php: -------------------------------------------------------------------------------- 1 | missingValue = $missingValue; 42 | $this->strategy = $strategy; 43 | $this->axis = $axis; 44 | $this->samples = $samples; 45 | } 46 | 47 | /** Stores the given samples as the training data used for column-axis replacement. */ public function fit(array $samples, ?array $targets = null): void 48 | { 49 | $this->samples = $samples; 50 | } 51 | 52 | /** Replaces, in place, every value strictly equal to the configured missing value. Throws InvalidOperationException when no training samples are stored. */ public function transform(array &$samples, ?array &$targets = null): void 53 | { 54 | if ($this->samples === []) { 55 | throw new InvalidOperationException('Missing training samples for Imputer.'); 56 | } 57 | 58 | foreach ($samples as &$sample) { 59 | $this->preprocessSample($sample); 60 | } 61 | } 62 | 63 | private function preprocessSample(array &$sample): void 64 | { 65 | foreach ($sample as $column => &$value) { 66 | if ($value === $this->missingValue) { 67 | $value = $this->strategy->replaceValue($this->getAxis($column, $sample)); 68 | } 69 | } 70 | } 71 | 72 | /** Returns the values handed to the strategy: for AXIS_ROW the current sample without its missing entries (keys preserved), otherwise the given column of the stored training samples without missing entries. */ private function getAxis(int $column, array $currentSample): array 73 | { 74 | if ($this->axis === self::AXIS_ROW) { 75 | /* Strict filtering, matching the === / !== checks used everywhere else in this class; array_diff compares string casts and would also drop values that merely stringify like the missing value. */ $missingValue = $this->missingValue; return array_filter($currentSample, static function ($value) use ($missingValue): bool { return $value !== $missingValue; }); 76 | } 77 | 78 | $axis = []; 79 | foreach ($this->samples as $sample) { 80 | if ($sample[$column] !== $this->missingValue) { 81 | $axis[] = $sample[$column]; 82 | } 83 | } 84 | 85 | return $axis; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/Preprocessing/Imputer/Strategy.php: -------------------------------------------------------------------------------- 1 | classes = []; 17 | 18 | foreach ($samples as $sample) { 19 | if 
(!isset($this->classes[(string) $sample])) { 20 | /* First occurrence of a label gets the next free integer (the current number of known classes). */ $this->classes[(string) $sample] = count($this->classes); 21 | } 22 | } 23 | } 24 | 25 | /** Replaces each entry of $samples, in place, with the integer stored for its string form in $this->classes. $targets is not touched. */ public function transform(array &$samples, ?array &$targets = null): void 26 | { 27 | foreach ($samples as &$sample) { 28 | $sample = $this->classes[(string) $sample]; 29 | } 30 | } 31 | 32 | /** Maps encoded integers in $samples back to their labels, in place, using the flipped class map. */ public function inverseTransform(array &$samples): void 33 | { 34 | $classes = array_flip($this->classes); 35 | foreach ($samples as &$sample) { 36 | $sample = $classes[$sample]; 37 | } 38 | } 39 | 40 | /** 41 | * @return string[] 42 | */ 43 | public function classes(): array 44 | { 45 | return array_keys($this->classes); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/Preprocessing/LambdaTransformer.php: -------------------------------------------------------------------------------- 1 | lambda = $lambda; 17 | } 18 | 19 | public function fit(array $samples, ?array $targets = null): void 20 | { 21 | // nothing to do 22 | } 23 | 24 | /** Calls the stored callable with each sample and writes its return value back in place. */ public function transform(array &$samples, ?array &$targets = null): void 25 | { 26 | foreach ($samples as &$sample) { 27 | $sample = call_user_func($this->lambda, $sample); 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/Preprocessing/Normalizer.php: -------------------------------------------------------------------------------- 1 | norm = $norm; 49 | } 50 | 51 | /** Runs once: for NORM_STD it records, per column index, the population standard deviation and arithmetic mean of $samples; later calls return immediately. */ public function fit(array $samples, ?array $targets = null): void 52 | { 53 | if ($this->fitted) { 54 | return; 55 | } 56 | 57 | if ($this->norm === self::NORM_STD) { 58 | /* Column indices are derived from the first sample's length. */ $features = range(0, count($samples[0]) - 1); 59 | foreach ($features as $i) { 60 | $values = array_column($samples, $i); 61 | $this->std[$i] = StandardDeviation::population($values); 62 | $this->mean[$i] = Mean::arithmetic($values); 63 | } 64 | } 65 | 66 | $this->fitted = true; 67 | } 68 | 69 | public function transform(array &$samples, ?array &$targets = null): 
void 70 | { 71 | /* Dispatch table: norm constant => name of the private method implementing it. */ $methods = [ 72 | self::NORM_L1 => 'normalizeL1', 73 | self::NORM_L2 => 'normalizeL2', 74 | self::NORM_STD => 'normalizeSTD', 75 | ]; 76 | $method = $methods[$this->norm]; 77 | 78 | /* fit() is a no-op when statistics were already collected. */ $this->fit($samples); 79 | 80 | foreach ($samples as &$sample) { 81 | $this->{$method}($sample); 82 | } 83 | } 84 | 85 | /** Divides every feature by the sum of absolute values of the sample; when that sum is zero the sample is replaced by equal shares of 1 / count. */ private function normalizeL1(array &$sample): void 86 | { 87 | $norm1 = 0; 88 | foreach ($sample as $feature) { 89 | $norm1 += abs($feature); 90 | } 91 | 92 | if ($norm1 == 0) { 93 | $count = count($sample); 94 | $sample = array_fill(0, $count, 1.0 / $count); 95 | } else { 96 | array_walk($sample, function (&$feature) use ($norm1): void { 97 | $feature /= $norm1; 98 | }); 99 | } 100 | } 101 | 102 | /** Divides every feature by the square root of the sum of squared features; when that is zero every feature is set to 1. */ private function normalizeL2(array &$sample): void 103 | { 104 | $norm2 = 0; 105 | foreach ($sample as $feature) { 106 | $norm2 += $feature * $feature; 107 | } 108 | 109 | $norm2 **= .5; 110 | 111 | if ($norm2 == 0) { 112 | $sample = array_fill(0, count($sample), 1); 113 | } else { 114 | array_walk($sample, function (&$feature) use ($norm2): void { 115 | $feature /= $norm2; 116 | }); 117 | } 118 | } 119 | 120 | /** Standardizes each feature with the mean and standard deviation stored for its index. */ private function normalizeSTD(array &$sample): void 121 | { 122 | foreach (array_keys($sample) as $i) { 123 | if ($this->std[$i] != 0) { 124 | $sample[$i] = ($sample[$i] - $this->mean[$i]) / $this->std[$i]; 125 | } else { 126 | // Same value for all samples. 
127 | $sample[$i] = 0; 128 | } 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/Preprocessing/NumberConverter.php: -------------------------------------------------------------------------------- 1 | transformTargets = $transformTargets; 25 | $this->nonNumericPlaceholder = $nonNumericPlaceholder; 26 | } 27 | 28 | public function fit(array $samples, ?array $targets = null): void 29 | { 30 | //nothing to do 31 | } 32 | 33 | /** Casts every numeric feature to float in place and substitutes the configured placeholder for non-numeric ones; targets get the same treatment only when transformTargets is set and $targets is an array. */ public function transform(array &$samples, ?array &$targets = null): void 34 | { 35 | foreach ($samples as &$sample) { 36 | foreach ($sample as &$feature) { 37 | $feature = is_numeric($feature) ? (float) $feature : $this->nonNumericPlaceholder; 38 | } 39 | } 40 | 41 | if ($this->transformTargets && is_array($targets)) { 42 | foreach ($targets as &$target) { 43 | $target = is_numeric($target) ? (float) $target : $this->nonNumericPlaceholder; 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/Preprocessing/OneHotEncoder.php: -------------------------------------------------------------------------------- 1 | ignoreUnknown = $ignoreUnknown; 24 | } 25 | 26 | /** Builds the category maps: for each column position of the first sample, the distinct values found in that column across $samples are passed to fitColumn(). */ public function fit(array $samples, ?array $targets = null): void 27 | { 28 | foreach (array_keys(array_values(current($samples))) as $column) { 29 | $this->fitColumn($column, array_values(array_unique(array_column($samples, $column)))); 30 | } 31 | } 32 | 33 | /** Replaces each sample, in place, with its encoded form; samples are re-indexed with array_values() first. */ public function transform(array &$samples, ?array &$targets = null): void 34 | { 35 | foreach ($samples as &$sample) { 36 | $sample = $this->transformSample(array_values($sample)); 37 | } 38 | } 39 | 40 | /** Stores, for every value of the column, a vector of count($values) zeros with a single 1 at the value's index. */ private function fitColumn(int $column, array $values): void 41 | { 42 | $count = count($values); 43 | foreach ($values as $index => $value) { 44 | $map = array_fill(0, $count, 0); 45 | $map[$index] = 1; 46 | $this->categories[$column][$value] = $map; 47 | } 48 | } 49 | 50 | private function 
transformSample(array $sample): array 51 | { 52 | /* Concatenates the stored vector of every feature; an unknown category either throws or, when ignoreUnknown is set, contributes an all-zero vector of the column's width. */ $encoded = []; 53 | foreach ($sample as $column => $feature) { 54 | if (!isset($this->categories[$column][$feature]) && !$this->ignoreUnknown) { 55 | throw new InvalidArgumentException(sprintf('Missing category "%s" for column %s in trained encoder', $feature, $column)); 56 | } 57 | 58 | $encoded = array_merge( 59 | $encoded, 60 | $this->categories[$column][$feature] ?? array_fill(0, count($this->categories[$column]), 0) 61 | ); 62 | } 63 | 64 | return $encoded; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/Preprocessing/Preprocessor.php: -------------------------------------------------------------------------------- 1 | maxFeatures = $maxFeatures; 49 | $this->tolerance = $tolerance; 50 | 51 | parent::__construct($maxDepth, $maxLeafSize, $minPurityIncrease); 52 | } 53 | 54 | /** Grows the tree from the given data. The column list is derived from the first sample's length and cleared again afterwards; maxFeatures defaults to round(sqrt(feature count)) when it was not set. */ public function train(array $samples, array $targets): void 55 | { 56 | $features = count($samples[0]); 57 | 58 | $this->columns = range(0, $features - 1); 59 | $this->maxFeatures = $this->maxFeatures ?? (int) round(sqrt($features)); 60 | 61 | $this->grow($samples, $targets); 62 | 63 | $this->columns = []; 64 | } 65 | 66 | /** Returns one prediction per sample: the outcome of the node found by search() when it is an AverageNode, otherwise null. Throws InvalidOperationException when bare() reports an untrained tree. */ public function predict(array $samples) 67 | { 68 | if ($this->bare()) { 69 | throw new InvalidOperationException('Regressor must be trained first'); 70 | } 71 | 72 | $predictions = []; 73 | 74 | foreach ($samples as $sample) { 75 | $node = $this->search($sample); 76 | 77 | $predictions[] = $node instanceof AverageNode 78 | ? 
$node->outcome() 79 | : null; 80 | } 81 | 82 | return $predictions; 83 | } 84 | 85 | /** Tries every distinct value of up to maxFeatures shuffled columns as a split point and returns a DecisionNode for the candidate with the lowest splitImpurity(); the search stops early once a candidate's impurity is at or below the tolerance. */ protected function split(array $samples, array $targets): DecisionNode 86 | { 87 | $bestVariance = INF; 88 | $bestColumn = $bestValue = null; 89 | $bestGroups = []; 90 | 91 | shuffle($this->columns); 92 | 93 | foreach (array_slice($this->columns, 0, $this->maxFeatures) as $column) { 94 | $values = array_unique(array_column($samples, $column)); 95 | 96 | foreach ($values as $value) { 97 | $groups = $this->partition($column, $value, $samples, $targets); 98 | 99 | $variance = $this->splitImpurity($groups); 100 | 101 | if ($variance < $bestVariance) { 102 | $bestColumn = $column; 103 | $bestValue = $value; 104 | $bestGroups = $groups; 105 | $bestVariance = $variance; 106 | } 107 | 108 | if ($variance <= $this->tolerance) { 109 | break 2; 110 | } 111 | } 112 | } 113 | 114 | return new DecisionNode($bestColumn, $bestValue, $bestGroups, $bestVariance); 115 | } 116 | 117 | /** Creates an AverageNode from the arithmetic mean, population variance and count of the targets. */ protected function terminate(array $targets): BinaryNode 118 | { 119 | return new AverageNode(Mean::arithmetic($targets), Variance::population($targets), count($targets)); 120 | } 121 | 122 | /** Returns the sum, over groups, of the group's population variance of targets weighted by its share of all samples; groups with fewer than two targets contribute nothing. */ protected function splitImpurity(array $groups): float 123 | { 124 | $samplesCount = (int) array_sum(array_map(static function (array $group): int { 125 | return count($group[0]); 126 | }, $groups)); 127 | 128 | $impurity = 0.; 129 | 130 | foreach ($groups as $group) { 131 | $k = count($group[1]); 132 | 133 | if ($k < 2) { 134 | continue 1; 135 | } 136 | 137 | $variance = Variance::population($group[1]); 138 | 139 | $impurity += ($k / $samplesCount) * $variance; 140 | } 141 | 142 | return $impurity; 143 | } 144 | 145 | /** 146 | * @param int|float $value 147 | */ 148 | private function partition(int $column, $value, array $samples, array $targets): array 149 | { 150 | /* Samples whose column value is below $value go left, all others go right; targets follow their sample by index. */ $leftSamples = $leftTargets = $rightSamples = $rightTargets = []; 151 | foreach ($samples as $index => $sample) { 152 | if ($sample[$column] < $value) { 153 | 
$leftSamples[] = $sample; 154 | $leftTargets[] = $targets[$index]; 155 | } else { 156 | $rightSamples[] = $sample; 157 | $rightTargets[] = $targets[$index]; 158 | } 159 | } 160 | 161 | /* Shape: [[left samples, left targets], [right samples, right targets]]. */ return [ 162 | [$leftSamples, $leftTargets], 163 | [$rightSamples, $rightTargets], 164 | ]; 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/Regression/LeastSquares.php: -------------------------------------------------------------------------------- 1 | samples = array_merge($this->samples, $samples); 37 | $this->targets = array_merge($this->targets, $targets); 38 | 39 | /* Coefficients are recomputed from everything accumulated so far. */ $this->computeCoefficients(); 40 | } 41 | 42 | /** 43 | * @return mixed 44 | */ 45 | public function predictSample(array $sample) 46 | { 47 | /* intercept + sum of coefficient * feature, matching features to coefficients by index. */ $result = $this->intercept; 48 | foreach ($this->coefficients as $index => $coefficient) { 49 | $result += $coefficient * $sample[$index]; 50 | } 51 | 52 | return $result; 53 | } 54 | 55 | public function getCoefficients(): array 56 | { 57 | return $this->coefficients; 58 | } 59 | 60 | public function getIntercept(): float 61 | { 62 | return $this->intercept; 63 | } 64 | 65 | /** 66 | * coefficient(b) = (X'X)-1X'Y. 67 | */ 68 | private function computeCoefficients(): void 69 | { 70 | $samplesMatrix = $this->getSamplesMatrix(); 71 | $targetsMatrix = $this->getTargetsMatrix(); 72 | 73 | $ts = $samplesMatrix->transpose()->multiply($samplesMatrix)->inverse(); 74 | $tf = $samplesMatrix->transpose()->multiply($targetsMatrix); 75 | 76 | $this->coefficients = $ts->multiply($tf)->getColumnValues(0); 77 | /* The first solved value is taken off the coefficient list and kept as the intercept. */ $this->intercept = array_shift($this->coefficients); 78 | } 79 | 80 | /** 81 | * Add one dimension for intercept calculation. 
82 | */ 83 | private function getSamplesMatrix(): Matrix 84 | { 85 | $samples = []; 86 | foreach ($this->samples as $sample) { 87 | /* Prepend a constant 1 to each (copied) sample row. */ array_unshift($sample, 1); 88 | $samples[] = $sample; 89 | } 90 | 91 | return new Matrix($samples); 92 | } 93 | 94 | /** Wraps the stored targets in a Matrix: directly when the first target is itself an array, otherwise via Matrix::fromFlatArray(). */ private function getTargetsMatrix(): Matrix 95 | { 96 | if (is_array($this->targets[0])) { 97 | return new Matrix($this->targets); 98 | } 99 | 100 | return Matrix::fromFlatArray($this->targets); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/Regression/Regression.php: -------------------------------------------------------------------------------- 1 | $label) { 21 | $set .= sprintf('%s %s %s', ($targets ? $label : $numericLabels[$label]), self::sampleRow($samples[$index]), PHP_EOL); 22 | } 23 | 24 | return $set; 25 | } 26 | 27 | /** Formats samples as one row each, with a leading label of 0; a single flat sample is wrapped into a one-element list first. Throws InvalidArgumentException for an empty array. */ public static function testSet(array $samples): string 28 | { 29 | if (count($samples) === 0) { 30 | throw new InvalidArgumentException('The array has zero elements'); 31 | } 32 | 33 | if (!is_array($samples[0])) { 34 | $samples = [$samples]; 35 | } 36 | 37 | $set = ''; 38 | foreach ($samples as $sample) { 39 | $set .= sprintf('0 %s %s', self::sampleRow($sample), PHP_EOL); 40 | } 41 | 42 | return $set; 43 | } 44 | 45 | /** Converts each non-empty line of the raw output back to the label whose numeric index equals the line's integer value. */ public static function predictions(string $rawPredictions, array $labels): array 46 | { 47 | $numericLabels = self::numericLabels($labels); 48 | $results = []; 49 | foreach (explode(PHP_EOL, $rawPredictions) as $result) { 50 | if (isset($result[0])) { 51 | $results[] = array_search((int) $result, $numericLabels, true); 52 | } 53 | } 54 | 55 | return $results; 56 | } 57 | 58 | /** Parses raw output whose first line is a header of numeric labels; returns, per following line, a map of label => probability. The first space-separated field of the header and of every row is discarded. */ public static function probabilities(string $rawPredictions, array $labels): array 59 | { 60 | $numericLabels = self::numericLabels($labels); 61 | 62 | $predictions = explode(PHP_EOL, trim($rawPredictions)); 63 | 64 | $header = array_shift($predictions); 65 | $headerColumns = explode(' ', (string) $header); 66 | array_shift($headerColumns); 67 | 68 | 
$columnLabels = []; 69 | foreach ($headerColumns as $numericLabel) { 70 | $columnLabels[] = array_search((int) $numericLabel, $numericLabels, true); 71 | } 72 | 73 | $results = []; 74 | foreach ($predictions as $rawResult) { 75 | $probabilities = explode(' ', $rawResult); 76 | array_shift($probabilities); 77 | 78 | $result = []; 79 | foreach ($probabilities as $i => $prob) { 80 | $result[$columnLabels[$i]] = (float) $prob; 81 | } 82 | 83 | $results[] = $result; 84 | } 85 | 86 | return $results; 87 | } 88 | 89 | /** Assigns each distinct label, in order of first appearance, the next integer starting at 0 and returns the label => integer map. */ public static function numericLabels(array $labels): array 90 | { 91 | $numericLabels = []; 92 | foreach ($labels as $label) { 93 | if (isset($numericLabels[$label])) { 94 | continue; 95 | } 96 | 97 | $numericLabels[$label] = count($numericLabels); 98 | } 99 | 100 | return $numericLabels; 101 | } 102 | 103 | /** Renders a sample as space-separated "index:value" pairs, where index is the array key plus one and value is formatted with %F. */ private static function sampleRow(array $sample): string 104 | { 105 | $row = []; 106 | foreach ($sample as $index => $feature) { 107 | $row[] = sprintf('%s:%F', $index + 1, $feature); 108 | } 109 | 110 | return implode(' ', $row); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/SupportVectorMachine/Kernel.php: -------------------------------------------------------------------------------- 1 | $maxGram) { 24 | throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram)); 25 | } 26 | 27 | $this->minGram = $minGram; 28 | $this->maxGram = $maxGram; 29 | } 30 | 31 | /** 32 | * {@inheritdoc} 33 | */ 34 | public function tokenize(string $text): array 35 | { 36 | $words = []; 37 | /* Words are runs of at least two \w characters, matched in UTF-8 mode. */ preg_match_all('/\w\w+/u', $text, $words); 38 | 39 | $nGrams = []; 40 | foreach ($words[0] as $word) { 41 | $this->generateNGrams($word, $nGrams); 42 | } 43 | 44 | return $nGrams; 45 | } 46 | 47 | /** Appends to $nGrams every substring of $word whose length lies between minGram and maxGram. */ private function generateNGrams(string $word, array &$nGrams): void 48 | { 49 | $length = mb_strlen($word); 50 | 51 | for ($j = 1; $j <= $this->maxGram; $j++) { 52 | for 
($k = 0; $k < $length - $j + 1; $k++) { 53 | if ($j >= $this->minGram) { 54 | $nGrams[] = mb_substr($word, $k, $j); 55 | } 56 | } 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/Tokenization/NGramWordTokenizer.php: -------------------------------------------------------------------------------- 1 | $maxGram) { 24 | throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram)); 25 | } 26 | 27 | $this->minGram = $minGram; 28 | $this->maxGram = $maxGram; 29 | } 30 | 31 | /** 32 | * {@inheritdoc} 33 | */ 34 | public function tokenize(string $text): array 35 | { 36 | preg_match_all('/\w\w+/u', $text, $words); 37 | 38 | $words = $words[0]; 39 | 40 | /* Collect word n-grams for every size from minGram to maxGram, smaller sizes first. */ $nGrams = []; 41 | for ($j = $this->minGram; $j <= $this->maxGram; $j++) { 42 | $nGrams = array_merge($nGrams, $this->getNgrams($words, $j)); 43 | } 44 | 45 | return $nGrams; 46 | } 47 | 48 | /** Returns every run of $n consecutive entries of $match joined by single spaces; a run is emitted at each index that has at least $n - 1 predecessors. */ private function getNgrams(array $match, int $n = 2): array 49 | { 50 | $ngrams = []; 51 | $len = count($match); 52 | for ($i = 0; $i < $len; $i++) { 53 | if ($i > ($n - 2)) { 54 | $ng = ''; 55 | for ($j = $n - 1; $j >= 0; $j--) { 56 | $ng .= ' '.$match[$i - $j]; 57 | } 58 | $ngrams[] = trim($ng); 59 | } 60 | } 61 | 62 | return $ngrams; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/Tokenization/Tokenizer.php: -------------------------------------------------------------------------------- 1 | maxDepth = $maxDepth; 54 | $this->maxLeafSize = $maxLeafSize; 55 | $this->minPurityIncrease = $minPurityIncrease; 56 | } 57 | 58 | public function root(): ?DecisionNode 59 | { 60 | return $this->root; 61 | } 62 | 63 | /** Height reported by the root node, or 0 when there is no root. */ public function height(): int 64 | { 65 | return $this->root !== null ? $this->root->height() : 0; 66 | } 67 | 68 | /** Balance reported by the root node, or 0 when there is no root. */ public function balance(): int 69 | { 70 | return $this->root !== null ? 
$this->root->balance() : 0; 71 | } 72 | 73 | /** True while no root node exists. */ public function bare(): bool 74 | { 75 | return $this->root === null; 76 | } 77 | 78 | /** Builds the tree iteratively with an explicit stack of [node, depth] pairs, starting from a split of the full data set. */ public function grow(array $samples, array $targets): void 79 | { 80 | $this->featureCount = count($samples[0]); 81 | $depth = 1; 82 | $this->root = $this->split($samples, $targets); 83 | $stack = [[$this->root, $depth]]; 84 | 85 | while ($stack) { 86 | [$current, $depth] = array_pop($stack) ?? []; 87 | 88 | [$left, $right] = $current->groups(); 89 | 90 | /* The groups have been read out above; cleanup() empties the copy kept on the node. */ $current->cleanup(); 91 | 92 | $depth++; 93 | 94 | /* One side is empty: both children become the same terminal node built from all targets. */ if ($left[1] === [] || $right[1] === []) { 95 | $node = $this->terminate(array_merge($left[1], $right[1])); 96 | 97 | $current->attachLeft($node); 98 | $current->attachRight($node); 99 | 100 | continue 1; 101 | } 102 | 103 | /* Depth limit reached: terminate both sides. */ if ($depth >= $this->maxDepth) { 104 | $current->attachLeft($this->terminate($left[1])); 105 | $current->attachRight($this->terminate($right[1])); 106 | 107 | continue 1; 108 | } 109 | 110 | /* A side is split further only when it holds more than maxLeafSize targets and the new split's purityIncrease() (plus 1e-8) exceeds minPurityIncrease; otherwise it is terminated. */ if (count($left[1]) > $this->maxLeafSize) { 111 | $node = $this->split($left[0], $left[1]); 112 | 113 | if ($node->purityIncrease() + 1e-8 > $this->minPurityIncrease) { 114 | $current->attachLeft($node); 115 | 116 | $stack[] = [$node, $depth]; 117 | } else { 118 | $current->attachLeft($this->terminate($left[1])); 119 | } 120 | } else { 121 | $current->attachLeft($this->terminate($left[1])); 122 | } 123 | 124 | if (count($right[1]) > $this->maxLeafSize) { 125 | $node = $this->split($right[0], $right[1]); 126 | 127 | if ($node->purityIncrease() + 1e-8 > $this->minPurityIncrease) { 128 | $current->attachRight($node); 129 | 130 | $stack[] = [$node, $depth]; 131 | } else { 132 | $current->attachRight($this->terminate($right[1])); 133 | } 134 | } else { 135 | $current->attachRight($this->terminate($right[1])); 136 | } 137 | } 138 | } 139 | 140 | /** Walks down from the root, following decision nodes until a leaf is reached (or no node is left), and returns where the walk stopped. */ public function search(array $sample): ?BinaryNode 141 | { 142 | $current = $this->root; 143 | 144 | while ($current) { 145 | if ($current instanceof DecisionNode) { 146 | $value = 
$current->value(); 147 | 148 | /* String split values go left on strict equality; any other split value goes left when the sample's value is smaller. */ if (is_string($value)) { 149 | if ($sample[$current->column()] === $value) { 150 | $current = $current->left(); 151 | } else { 152 | $current = $current->right(); 153 | } 154 | } else { 155 | if ($sample[$current->column()] < $value) { 156 | $current = $current->left(); 157 | } else { 158 | $current = $current->right(); 159 | } 160 | } 161 | 162 | continue 1; 163 | } 164 | 165 | if ($current instanceof LeafNode) { 166 | break 1; 167 | } 168 | } 169 | 170 | return $current; 171 | } 172 | 173 | /** Implemented by subclasses: choose a split of the given data and return it as a DecisionNode. */ abstract protected function split(array $samples, array $targets): DecisionNode; 174 | 175 | /** Implemented by subclasses: build the terminal node for the given targets. */ abstract protected function terminate(array $targets): BinaryNode; 176 | } 177 | -------------------------------------------------------------------------------- /src/Tree/Node.php: -------------------------------------------------------------------------------- 1 | outcome = $outcome; 27 | $this->impurity = $impurity; 28 | $this->samplesCount = $samplesCount; 29 | } 30 | 31 | public function outcome(): float 32 | { 33 | return $this->outcome; 34 | } 35 | 36 | public function impurity(): float 37 | { 38 | return $this->impurity; 39 | } 40 | 41 | public function samplesCount(): int 42 | { 43 | return $this->samplesCount; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/Tree/Node/BinaryNode.php: -------------------------------------------------------------------------------- 1 | parent; 29 | } 30 | 31 | public function left(): ?self 32 | { 33 | return $this->left; 34 | } 35 | 36 | public function right(): ?self 37 | { 38 | return $this->right; 39 | } 40 | 41 | /** Returns 1 plus the larger of the two child heights; a missing child counts as 0. */ public function height(): int 42 | { 43 | return 1 + max($this->left !== null ? $this->left->height() : 0, $this->right !== null ? $this->right->height() : 0); 44 | } 45 | 46 | /** Returns the right child's height minus the left child's height; a missing child counts as 0. */ public function balance(): int 47 | { 48 | return ($this->right !== null ? $this->right->height() : 0) - ($this->left !== null ? 
$this->left->height() : 0); 49 | } 50 | 51 | /** Sets the parent reference; called without an argument it clears it. */ public function setParent(?self $node = null): void 52 | { 53 | $this->parent = $node; 54 | } 55 | 56 | /** Makes $node the left child and sets this node as its parent. */ public function attachLeft(self $node): void 57 | { 58 | $node->setParent($this); 59 | $this->left = $node; 60 | } 61 | 62 | /** Removes the left child, if any, clearing its parent reference first. */ public function detachLeft(): void 63 | { 64 | if ($this->left !== null) { 65 | $this->left->setParent(); 66 | $this->left = null; 67 | } 68 | } 69 | 70 | /** Makes $node the right child and sets this node as its parent. */ public function attachRight(self $node): void 71 | { 72 | $node->setParent($this); 73 | $this->right = $node; 74 | } 75 | 76 | /** Removes the right child, if any, clearing its parent reference first. */ public function detachRight(): void 77 | { 78 | if ($this->right !== null) { 79 | $this->right->setParent(); 80 | $this->right = null; 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/Tree/Node/DecisionNode.php: -------------------------------------------------------------------------------- 1 | column = $column; 50 | $this->value = $value; 51 | $this->groups = $groups; 52 | $this->impurity = $impurity; 53 | /* Total sample count is the sum of the sizes of each group's first element (its samples). */ $this->samplesCount = (int) array_sum(array_map(static function (array $group): int { 54 | return count($group[0]); 55 | }, $groups)); 56 | } 57 | 58 | public function column(): int 59 | { 60 | return $this->column; 61 | } 62 | 63 | /** 64 | * @return mixed 65 | */ 66 | public function value() 67 | { 68 | return $this->value; 69 | } 70 | 71 | public function groups(): array 72 | { 73 | return $this->groups; 74 | } 75 | 76 | public function impurity(): float 77 | { 78 | return $this->impurity; 79 | } 80 | 81 | public function samplesCount(): int 82 | { 83 | return $this->samplesCount; 84 | } 85 | 86 | /** Returns this node's impurity minus the impurity of each child that is a PurityNode, weighted by that child's share of this node's samples. */ public function purityIncrease(): float 87 | { 88 | $impurity = $this->impurity; 89 | 90 | if ($this->left() instanceof PurityNode) { 91 | $impurity -= $this->left()->impurity() 92 | * ($this->left()->samplesCount() / $this->samplesCount); 93 | } 94 | 95 | if ($this->right() instanceof PurityNode) { 96 | $impurity -= $this->right()->impurity() 97 | * 
($this->right()->samplesCount() / $this->samplesCount); 98 | } 99 | 100 | return $impurity; 101 | } 102 | 103 | /** Replaces the stored groups with two empty arrays, dropping the references to the partitioned data. */ public function cleanup(): void 104 | { 105 | $this->groups = [[], []]; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/Tree/Node/LeafNode.php: -------------------------------------------------------------------------------- 1 |