├── KMeansClustering └── cluster.php ├── LICENSE ├── README ├── boolean.php ├── data └── webstats.csv ├── followrank.php ├── holtwinters.php ├── linreg.php ├── multivagraddec.php └── rareevents.php
/KMeansClustering/cluster.php:
--------------------------------------------------------------------------------
1 | $d) { 18 | $data[$key] = normaliseValue($d, sqrt($d[0]*$d[0] + $d[1] * $d[1])); 19 | } 20 | 21 | var_dump(kMeans($data, 3)); 22 |

/**
 * Create $k random starting centroids inside the bounding box of the data.
 *
 * The per-dimension minimum and maximum over all documents are collected
 * first; every centroid is then drawn from within those bounds.
 *
 * @param array $data list of documents, each an array of numbers keyed by dimension
 * @param int   $k    number of centroids to create
 * @return array centroids keyed 0 .. $k-1
 */
function initialiseCentroids(array $data, $k) {
    // Take the dimensionality from the first document, whatever its key is.
    $first = reset($data);
    $dimensions = is_array($first) ? count($first) : 0;

    $dimmax = array();
    $dimmin = array();
    foreach ($data as $document) {
        foreach ($document as $dim => $val) {
            if (!isset($dimmax[$dim]) || $val > $dimmax[$dim]) {
                $dimmax[$dim] = $val;
            }
            if (!isset($dimmin[$dim]) || $val < $dimmin[$dim]) {
                $dimmin[$dim] = $val;
            }
        }
    }

    $centroids = array();
    for ($i = 0; $i < $k; $i++) {
        $centroids[$i] = initialiseCentroid($dimensions, $dimmax, $dimmin);
    }
    return $centroids;
}

/**
 * Draw one random centroid and scale it to unit (Euclidean) length.
 *
 * @param int   $dimensions number of dimensions, indexed 0 .. $dimensions-1
 * @param array $dimmax     per-dimension upper bound
 * @param array $dimmin     per-dimension lower bound
 * @return array the normalised centroid
 */
function initialiseCentroid($dimensions, $dimmax, $dimmin) {
    $total = 0;
    $centroid = array();
    for ($j = 0; $j < $dimensions; $j++) {
        // rand() takes integers: scale the bounds by 1000 and truncate
        // explicitly rather than relying on an implicit float-to-int conversion.
        $centroid[$j] = rand((int) ($dimmin[$j] * 1000), (int) ($dimmax[$j] * 1000));
        $total += $centroid[$j] * $centroid[$j];
    }
    return normaliseValue($centroid, sqrt($total));
}

/**
 * Cluster $data into $k groups.
 *
 * Documents are assigned to their nearest centroid and the centroids are
 * recomputed until the assignment stops changing or $maxIterations
 * assignment passes have been made.
 *
 * @param array $data          list of documents (arrays of numbers)
 * @param int   $k             number of clusters
 * @param int   $maxIterations safety cap on the number of passes
 * @return array see formatResults()
 */
function kMeans($data, $k, $maxIterations = 1000) {
    $centroids = initialiseCentroids($data, $k);
    $mapping = array();

    for ($iteration = 0; $iteration < $maxIterations; $iteration++) {
        $new_mapping = assignCentroids($data, $centroids);
        if ($new_mapping == $mapping) {
            // Converged: no document moved to a different centroid.
            break;
        }
        $mapping = $new_mapping;
        $centroids = updateCentroids($mapping, $data, $k);
    }

    return formatResults($mapping, $data, $centroids);
}

/**
 * Shape the clustering outcome for display.
 *
 * @param array $mapping   documentID => centroidID
 * @param array $data      the clustered documents
 * @param array $centroids final centroids
 * @return array 'centroids' => centroid list, plus one entry per centroid ID
 *               holding its documents as comma-joined strings
 */
function formatResults($mapping, $data, $centroids) {
    $result = array();
    $result['centroids'] = $centroids;
    foreach ($mapping as $documentID => $centroidID) {
        $result[$centroidID][] = implode(',', $data[$documentID]);
    }
    return $result;
}

/**
 * Assign every document to its closest centroid (sum of absolute
 * per-dimension differences).
 *
 * @param array $data      list of documents
 * @param array $centroids centroidID => centroid vector
 * @return array documentID => centroidID
 */
function assignCentroids($data, $centroids) {
    $mapping = array();

    foreach ($data as $documentID => $document) {
        // Start from infinity so the first centroid always wins; a fixed
        // ceiling would leave far-away documents without a centroid.
        $minDist = INF;
        $minCentroid = null;
        foreach ($centroids as $centroidID => $centroid) {
            $dist = 0;
            foreach ($centroid as $dim => $value) {
                $dist += abs($value - $document[$dim]);
            }
            if ($dist < $minDist) {
                $minDist = $dist;
                $minCentroid = $centroidID;
            }
        }
        $mapping[$documentID] = $minCentroid;
    }

    return $mapping;
}

/**
 * Recompute each centroid as the mean of the documents assigned to it.
 *
 * Clusters that ended up with no documents are re-seeded with a fresh
 * random centroid under their original ID.
 *
 * @param array $mapping documentID => centroidID
 * @param array $data    list of documents
 * @param int   $k       number of clusters
 * @return array centroidID => centroid vector, ordered by ID
 */
function updateCentroids($mapping, $data, $k) {
    $centroids = array();
    $counts = array_count_values($mapping);

    foreach ($mapping as $documentID => $centroidID) {
        foreach ($data[$documentID] as $dim => $value) {
            // Initialise each accumulator once only, so contributions add up.
            if (!isset($centroids[$centroidID][$dim])) {
                $centroids[$centroidID][$dim] = 0;
            }
            $centroids[$centroidID][$dim] += ($value / $counts[$centroidID]);
        }
    }

    $missing = $k - count($centroids);
    if ($missing > 0) {
        // Fill the unused IDs explicitly; array_merge() would renumber the
        // surviving centroids and shuffle their IDs.
        $replacements = initialiseCentroids($data, $missing);
        for ($id = 0; $id < $k; $id++) {
            if (!isset($centroids[$id])) {
                $centroids[$id] = array_shift($replacements);
            }
        }
    }

    ksort($centroids);
    return $centroids;
}

/**
 * Divide every component of $vector by $total.
 *
 * @param array $vector numeric vector
 * @param float $total  divisor (typically the vector's length)
 * @return array the scaled vector; returned unchanged when $total is 0
 */
function normaliseValue(array $vector, $total) {
    if ($total == 0) {
        // A zero-length vector cannot be scaled.
        return $vector;
    }
    foreach ($vector as $dim => $value) {
        $vector[$dim] = $value / $total;
    }
    return $vector;
}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2011, Ian Barber 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 10 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Various code samples from http://phpir.com 2 |
--------------------------------------------------------------------------------
/boolean.php:
--------------------------------------------------------------------------------
1 | index[$token]) ) { 8 | $this->index[$token] = array(); 9 | } 10 | $this->index[$token][] = $documentId; 11 | } 12 | 13 | public function getPostings($token) { 14 | return isset($this->index[$token]) ? $this->index[$token] : array(); 15 | } 16 | } 17 |

/**
 * A boolean search expression (AND / OR / NOT with parentheses) that can be
 * evaluated against an index exposing getPostings($token).
 *
 * The query is lower-cased and split into alphabetic words and parentheses;
 * everything else is ignored. Operators are parsed right-recursively: the
 * right operand of an operator is whatever the rest of the expression
 * parses to.
 */
class BooleanQuery {
    /** Position of the next token to consume while parsing. */
    private $index = 0;
    private $tokens = array();
    private $count;
    private $tree;

    /**
     * @param string $query e.g. 'PHP AND (Information OR Retrieval) NOT Spoons'
     */
    public function __construct($query) {
        preg_match_all('/[a-zA-Z]+|[\(\)]/', strtolower($query), $matches);
        $this->tokens = $matches[0];
        $this->count = count($this->tokens);
        $this->tree = $this->buildQueryTree();
    }

    /**
     * Consume tokens from the current position and build a query tree.
     *
     * @return array|string|null a term (string), an operator node
     *         array('action', 'left', 'right'), or null when no operand was read
     */
    private function buildQueryTree() {
        // Start with an explicit "no operand yet" so an empty query, or one
        // beginning with an operator or ')', never reads an undefined variable.
        $tree = null;

        while ($this->index < $this->count) {
            $token = $this->tokens[$this->index];
            $this->index++;

            if ('(' == $token) {
                $tree = $this->buildQueryTree();
            } else if (')' == $token) {
                return $tree;
            } else if (in_array($token, array('and', 'or', 'not'), true)) {
                $tree = array(
                    'action' => $token,
                    'left'   => $tree,
                    'right'  => $this->buildQueryTree(),
                );
            } else {
                $tree = $token;
            }
        }

        return $tree;
    }

    /**
     * Evaluate the query.
     *
     * @param object $index anything with a getPostings($token) method returning an array
     * @return array matching document IDs
     */
    public function search($index) {
        return $this->processQuery($this->tree, $index);
    }

    private function union($postings1, $postings2) {
        return array_unique(array_merge($postings1, $postings2));
    }

    private function intersect($postings1, $postings2) {
        return array_unique(array_intersect($postings1, $postings2));
    }

    /** Documents in $postings1 that are not in $postings2. */
    private function complement($postings1, $postings2) {
        return array_unique(array_diff($postings1, $postings2));
    }

    /**
     * Recursively evaluate a query tree built by buildQueryTree().
     *
     * @param array|string|null $queryTree
     * @param object            $index
     * @return array document IDs; empty for a missing operand or unknown action
     */
    private function processQuery($queryTree, $index) {
        if ($queryTree === null) {
            // Missing operand (e.g. query started with an operator).
            return array();
        }
        if (!is_array($queryTree)) {
            return $index->getPostings($queryTree);
        }

        $left = $this->processQuery($queryTree['left'], $index);
        $right = $this->processQuery($queryTree['right'], $index);

        switch ($queryTree['action']) {
            case 'and':
                return $this->intersect($left, $right);
            case 'or':
                return $this->union($left, $right);
            case 'not':
                return $this->complement($left, $right);
        }

        return array();
    }
}

// Build an index over a few pages, one posting per distinct word per page.
$index = new Index();
$documents = array(
    "http://phpir.com/simple-searching-boolean-retrieval",
    "http://phpir.com/presentation-tips-from-benelux",
    "http://phpir.com/linear-regression-in-php-part-2",
);

foreach($documents as $documentID => $document) {
    $contents = strtolower(strip_tags(file_get_contents($document)));
    preg_match_all('/[a-zA-Z]+/', $contents, $matches);
    $matches = array_unique($matches[0]);
    foreach($matches as $match) {
        $index->storeToken($documentID, $match);
    }
    unset($contents);
}

$query = 'PHP AND (Information OR Retrieval) NOT Spoons';
$q = new BooleanQuery($query);
var_dump($q->search($index));
--------------------------------------------------------------------------------
/data/webstats.csv:
--------------------------------------------------------------------------------
1 | 12/1/11,120 2 | 12/2/11,83 3 | 12/3/11,86 4 | 12/4/11,81 5 | 12/5/11,99 6 | 12/6/11,113 7 | 12/7/11,107 8 | 12/8/11,128 9 | 12/9/11,126 10 | 12/10/11,72 11 | 12/11/11,79 12 | 12/12/11,135 13 | 12/13/11,151 14 | 12/14/11,123 15 | 12/15/11,123 16 | 12/16/11,76 17 | 12/17/11,81 18 | 12/18/11,76 19 | 12/19/11,121 20 | 12/20/11,105 21 | 12/21/11,79 22 | 12/22/11,89 23 | 12/23/11,63 24 | 12/24/11,53 25 | 12/25/11,52 26 | 12/26/11,73 27 | 12/27/11,75 28 | 12/28/11,72 29 | 12/29/11,65 30 | 12/30/11,96 31 | 12/31/11,38 32 | 1/1/12,52 33 | 1/2/12,73 34 | 1/3/12,81 35 | 1/4/12,90 36 | 1/5/12,96 37 | 1/6/12,106 38 | 1/7/12,70 39 | 1/8/12,75 40 | 1/9/12,114 41 | 1/10/12,121 42 | 1/11/12,119 43 | 1/12/12,100 44 | 1/13/12,91 45 | 1/14/12,106 46 | 1/15/12,66 47 | 1/16/12,106 48 | 1/17/12,114 49 | 1/18/12,86 50 | 1/19/12,79 51 | 1/20/12,88 52 | 1/21/12,69 53 | 1/22/12,62 54 | 1/23/12,107 55 | 1/24/12,174 56 | 1/25/12,112 57 | 1/26/12,116 58 | 1/27/12,106 59 | 1/28/12,89 60 | 1/29/12,103 61 | 1/30/12,129 62 | 1/31/12,391 63 | 2/1/12,225 64 | 2/2/12,168 65 | 2/3/12,149 66 | 2/4/12,103 67 | 2/5/12,100 68 | 
2/6/12,157 69 | 2/7/12,157 70 | 2/8/12,160 71 | 2/9/12,148 72 | 2/10/12,122 73 | 2/11/12,93 74 | 2/12/12,87 75 | 2/13/12,150 76 | 2/14/12,34 77 | 2/15/12,111 78 | 2/16/12,130 79 | 2/17/12,122 80 | 2/18/12,78 81 | 2/19/12,69 82 | 2/20/12,128 83 | 2/21/12,145 84 | 2/22/12,130 85 | 2/23/12,172 86 | 2/24/12,143 87 | 2/25/12,98 88 | 2/26/12,98 89 | 2/27/12,154 90 | 2/28/12,185 91 | 2/29/12,138 92 | 3/1/12,282 93 | 3/2/12,160 94 | 3/3/12,94 95 | 3/4/12,89 96 | 3/5/12,153 97 | 3/6/12,153 98 | 3/7/12,145 99 | 3/8/12,122 100 | 3/9/12,36
--------------------------------------------------------------------------------
/followrank.php:
--------------------------------------------------------------------------------
1 | ids ); 12 | $matrix = array(); 13 | $followercount = count( $following->ids ); 14 | foreach( $following->ids as $key => $id ) { 15 | $user = array_fill( 0, $followercount, 0 ); 16 | $their_following = getFromCache( $id ); 17 | $intersect = array_intersect( $following->ids, $their_following->ids ); 18 | 19 | if( count( $intersect ) ) { 20 | $divisor = 1 / count( $intersect ); 21 | foreach( $intersect as $shared_id ) { 22 | $user[$userlookup[$shared_id]] = $divisor; 23 | } 24 | } 25 | 26 | $matrix[$key] = $user; 27 | } 28 | file_put_contents( $matrix_file, ' $value ) { 36 | $result[$following->ids[$key]] = $value[0]; 37 | } 38 | arsort( $result ); 39 | 40 | $url = "https://api.twitter.com/1/users/lookup.json?user_id=" . 41 | implode(",", array_slice( array_keys( $result ), 0, 10 ) ); 42 | $contents = json_decode( file_get_contents( $url ) ); 43 | foreach( $contents as $i => $user ) { 44 | echo $i, ": ", $user->screen_name, " ", "\n"; 45 | 46 | } 47 | } 48 |

/**
 * Return the decoded friends/ids response for a Twitter user, using a
 * per-user JSON file under cache/ to avoid repeated API calls.
 *
 * A 401 response is treated as a private account and cached as an empty
 * id list. Any response other than 200 or 401 terminates the script.
 *
 * @param int|string $id numeric user id, or a screen name
 * @return mixed the json_decode()d response (an object with an ids list on
 *               the 401 path)
 */
function getFromCache( $id ) {
    $cacheFile = 'cache/' . $id . ".json";
    if( file_exists( $cacheFile ) ) {
        return json_decode( file_get_contents( $cacheFile ) );
    }

    // Screen names go into the query string, so encode them.
    $access = is_numeric( $id ) ? 'user_id=' . $id : "screen_name=" . rawurlencode( $id );

    // Using cURL so we can get the error code
    $ch = curl_init();
    curl_setopt ( $ch, CURLOPT_URL, "http://api.twitter.com/1/friends/ids.json?" . $access );
    curl_setopt ( $ch, CURLOPT_RETURNTRANSFER, 1 );
    $file = curl_exec( $ch );
    $status = curl_getinfo( $ch, CURLINFO_HTTP_CODE );
    // Release the handle before branching so the die() path does not leak it.
    curl_close( $ch );

    if( $status == 401 ) {
        // account for private users
        $data = new stdClass();
        $data->ids = array();
        $file = json_encode($data);
    } else if( $status == 200 ) {
        $data = json_decode($file);
    } else {
        die( "Twitter rate limit hit, try again later");
    }

    file_put_contents( $cacheFile, $file );
    return $data;
}

--------------------------------------------------------------------------------
/holtwinters.php:
--------------------------------------------------------------------------------
1 | $val) { 39 | $index[$key] = $val / ($initial_level + ($key + 1) * $initial_trend); 40 | } 41 | 42 | // Build season buffer 43 | $season = array_fill(0, count($data), 0); 44 | for($i = 0; $i < $season_length; $i++) { 45 | $season[$i] = ($index[$i] + $index[$i+$season_length]) / 2; 46 | } 47 | 48 | // Normalise season 49 | $season_factor = $season_length / array_sum($season); 50 | foreach($season as $key => $val) { 51 | $season[$key] *= $season_factor; 52 | } 53 | 54 | 55 | $holt_winters = array(); 56 | $deviations = array(); 57 | $alpha_level = $initial_level; 58 | $beta_trend = $initial_trend; 59 | foreach($data as $key => $value) { 60 | $temp_level = $alpha_level; 61 | $temp_trend = $beta_trend; 62 | 63 | $alpha_level = $alpha * $value / $season[$key] + (1.0 - $alpha) * ($temp_level + $temp_trend); 64 | $beta_trend = $beta * ($alpha_level - $temp_level) + ( 1.0 - $beta ) * $temp_trend; 65 | 66 | $season[$key + $season_length] = $gamma * $value / $alpha_level + (1.0 - $gamma) * $season[$key]; 67 | 68 | $holt_winters[$key] = ($alpha_level + $beta_trend * ($key + 1)) * $season[$key]; 69 | $deviations[$key] = $dev_gamma * abs($value - $holt_winters[$key]) + (1-$dev_gamma) 70 | * (isset($deviations[$key - $season_length]) ? $deviations[$key - $season_length] : 0); 71 | } 72 | 73 | /* Could forecast a bit! 74 | for($i = 1; $i <= $season_length; $i++) { 75 | $holt_winters[$key + $i] = $alpha_level + $beta_trend * $season[$key + $i]; 76 | } 77 | */ 78 | 79 | return array($holt_winters, $deviations); 80 | } 81 |

/*************************/
/*** Use the function  ***/
/*************************/

// Load in some web stats data
$fh = fopen("data/webstats.csv", 'r');
if($fh === false) {
    die("Unable to open data/webstats.csv\n");
}
$data = array();
while(($csv = fgetcsv($fh)) !== false) {
    // Skip blank or short rows instead of appending an undefined column.
    if(!isset($csv[1])) {
        continue;
    }
    $data[] = (float) $csv[1];
}
fclose($fh);

list($newdata, $deviations) = holt_winters($data, 30);

// Echo it out in a format to paste into the charts JS
foreach($newdata as $key => $d) {
    echo "data.addRow([" .
        $key . ", " .
        (isset($data[$key]) ? $data[$key] : 0) . ", " .
        $d . ", " .
        ($d + 3*$deviations[$key]) .", " .
        ($d - 3*$deviations[$key]) .", " .
        "]);\n";
}
--------------------------------------------------------------------------------
/linreg.php:
--------------------------------------------------------------------------------
1 | (x1, x2, x3, x4), 1 => y 11 | */
    public function set_data($data) {
        // Rows are stored already scaled; see scale_data().
        $this->data = $this->scale_data($data);
    }

    /**
     * Set the rate at which the algorithm updates.
     * Normal values are 0.1 - 0.001
     *
     * @param float $rate
     * @return void
     */
    public function set_learning_rate($rate) {
        $this->learning_rate = $rate;
    }

27 | /** 28 | * Normalise variance and scale data to: 29 | * xi - avg(xi) / range(max-min) 30 | * so we get in a -0.5 - 0.5 range with an 31 | * avg of 0 32 | * - this is a bit of clunky method! 
33 | */
    protected function scale_data($data) {
        $minmax = array();
        $rows = count($data);

        foreach($data as $row) {
            foreach($row[0] as $id => $val) {
                /* Seed min/max with the first value seen, so a genuine 0 is
                   never mistaken for "not set yet" */
                if(!isset($minmax[$id])) {
                    $minmax[$id] = array(
                        'min'   => $val,
                        'max'   => $val,
                        'total' => 0,
                    );
                }

                /* Get stats */
                if($val < $minmax[$id]['min']) {
                    $minmax[$id]['min'] = $val;
                }
                if($val > $minmax[$id]['max']) {
                    $minmax[$id]['max'] = $val;
                }

                $minmax[$id]['total'] += $val;
            }
        }

        /* Compute average and range ('var') per feature */
        foreach($minmax as $id => $row) {
            $minmax[$id]['var'] = $row['max'] - $row['min'];
            $minmax[$id]['avg'] = $row['total'] / $rows;
        }

        foreach($data as $key => $row) {
            foreach($row[0] as $id => $val) {
                $range = $minmax[$id]['var'];
                /* A constant feature has no spread: centre it at 0 rather
                   than dividing by zero */
                $data[$key][0][$id] = ($range == 0)
                    ? 0
                    : ( $val - $minmax[$id]['avg'] ) / $range;
            }
        }

        return $data;
    }

    /**
     * Update the parameters, including using a dummy row value
     * of 1 for the first parameter.
     *
     * Each parameter is adjusted row by row, and every adjustment is
     * visible to the hypothesis evaluated for the next row.
     *
     * @param array $params
     * @return array the updated parameters (unchanged when there is no data)
     */
    protected function learn($params) {
        $row_count = count($this->data);
        if($row_count == 0) {
            return $params;
        }
        $data_rate = 1 / $row_count;

        foreach(array_keys($params) as $id) {
            foreach($this->data as $row) {
                $error = $this->mv_hypothesis($row[0], $params) - $row[1];
                $feature = ($id == 0) ? 1 : $row[0][$id-1];

                // Update parameters
                $params[$id] -= $this->learning_rate *
                    ($data_rate * ( $error * $feature ));
            }
        }

        return $params;
    }

    /**
     * Generate a score based on the data and passed parameters:
     * params[0] + sum(rowdata[i] * params[i+1])
     *
     * @param array $rowdata feature values keyed 0..n-1
     * @param array $params
     * @return float
     */
    protected function mv_hypothesis($rowdata, $params) {
        $score = $params[0];
        foreach($rowdata as $id => $value) {
            $score += $value * $params[$id+1];
        }
        return $score;
    }

    /**
     * Return the sum of squared error score
     *
     * @param array $params
     * @return float
     */
    public function score($params) {
        $score = 0;
        foreach($this->data as $row) {
            $score += pow($this->mv_hypothesis($row[0], $params) - $row[1], 2);
        }
        return $score;
    }

    /**
     * Take one learning step.
     *
     * @param array $parameters
     * @return array|false the new parameters, or false when the step made
     *                     the squared error worse
     */
    public function mv_gradient($parameters) {
        $score = $this->score($parameters);

        // Create a new hypothesis to test our score
        $parameters = $this->learn($parameters);

        if($score < $this->score($parameters)) {
            return false;
        }

        return $parameters;
    }

    /**
     * Find the parameters that best fit the data
     *
     * @param int $iterations - max iterations to run
     * @param array $defaults - optional starting params
     * @return array - best fit parameters
     */
    public function find_params($iterations = 5000, $defaults = null) {
        if(!$defaults) {
            $defaults = array_fill(0, count($this->data[0][0]) + 1, 0);
        }

        $parameters = $defaults;
        // Run at most $iterations steps, stopping early once a step makes the
        // fit worse; the last good parameters are kept.
        for($iters = 0; $iters < $iterations; $iters++) {
            $next = $this->mv_gradient($parameters);
            if($next === false) {
                break;
            }
            $parameters = $next;
        }

        return $parameters;
    }

}

/* Nice regular data for testing */
$data = array(
    array(array(2, 4000, 0.5), 2+2+(2*4)+(3*5)),
    array(array(2, 4000, 0.4), 2+2+(2*4)+(3*4)),
    array(array(2, 4000, 0.6), 2+2+(2*4)+(3*6)),
    array(array(1, 5000, 0.5), 2+1+(2*5)+(3*5)),
    array(array(2, 5000, 0.1), 2+2+(2*5)+(3*1)),
);

/**
 * Polynomial variant: feature i (0-based) enters the hypothesis raised to
 * the power i+2.
 */
class PolyMV extends MVGradient {

    /**
     * Skip scaling just for the example
     */
    protected function scale_data($data) {
        return $data;
    }

    /**
     * Generate a score based on the data and passed parameters:
     * params[0] + sum(pow(rowdata[i], i+2) * params[i+1])
     *
     * @param array $rowdata feature values keyed 0..n-1
     * @param array $params
     * @return float
     */
    protected function mv_hypothesis($rowdata, $params) {
        $total = $params[0];
        foreach($rowdata as $position => $x) {
            $total += pow($x, $position + 2) * $params[$position + 1];
        }
        return $total;
    }

    /**
     * Update the parameters, including using a dummy row value
     * of 1 for the first parameter.
     *
     * @param array $params
     * @return array
     */
    protected function learn($params) {
        $data_rate = 1/count($this->data);

        foreach(array_keys($params) as $id) {
            foreach($this->data as $row) {
                $error = $this->mv_hypothesis($row[0], $params) - $row[1];

                // We have to multiply by an appropriate power as part of the
                // partial derivative
                $term = ($id == 0) ? 1 : pow($row[0][$id-1], $id+1);

                // Update parameters
                $params[$id] -= $this->learning_rate *
                    ($data_rate * ( $error * $term ));
            }
        }

        return $params;
    }
}
/*


$iterations = array(10, 100, 500, 1000, 2000, 5000, 10000);
$mvg = new MVGradient();
$mvg->set_data($data);
foreach(array(0.1, 0.01, 0.001, 0.001) as $rate) {
    $mvg->set_learning_rate($rate);
    foreach($iterations as $i) {
        $params = $mvg->find_params($i);
        echo $mvg->score($params), "\n";
    }
    echo "\n";
}
die();


// We have a polynomial example here

$data = array(
    array(array(2, 2), 1+(3*pow(2, 2))+(2*pow(2, 3))),
    array(array(3, 3), 1+(3*pow(3, 2))+(2*pow(3, 3))),
    array(array(4, 4), 1+(3*pow(4, 2))+(2*pow(4, 3))),
    array(array(5, 5), 1+(3*pow(5, 2))+(2*pow(5, 3))),
);

$iterations = array(10000);
$mvg = new PolyMV();
$mvg->set_data($data);
$mvg->set_learning_rate(0.001);
foreach($iterations as $i) {
    $params = $mvg->find_params($i);
    echo $mvg->score($params), "\n";
    var_dump($params);
}
echo "\n";
*/
--------------------------------------------------------------------------------
/rareevents.php:
--------------------------------------------------------------------------------
1 | $count) { 51 | if(!isset($w1[$word])) { 52 | $w1[$word] = $count; 53 | } else { 54 | $w1[$word] += $count; 55 | } 56 | } 57 | return $w1; 58 | } 59 |

/**
 * Pick a random word from $w1, estimate its probability from the counts in
 * $w1 and dump that estimate next to what the second sample $w2 contains.
 *
 * NOTE(review): zval() is defined outside the visible part of this file;
 * from the formula comment it presumably yields n_r, the number of words
 * seen r times - confirm.
 *
 * @param array $w1 word => count for the first sample
 * @param array $w2 word => count for the second sample
 * @return void output only
 */
function rand_prob($w1, $w2) {
    echo "Probability of random word:\n";
    $word = array_rand($w1);
    $r = $w1[$word];

    // r* = (r+1)*nr+1/nr
    // prob = r*/N
    $adjusted_count = ($r+1) * ( zval($r+1, $w1) / zval($r, $w1) );
    $rare_word_probability = $adjusted_count / array_sum($w1);

    var_dump($word);
    var_dump($rare_word_probability);
    var_dump("Expected: " . ($rare_word_probability * array_sum($w2)));
    if(isset($w2[$word])) {
        var_dump("Count: " . $w2[$word]);
    } else {
        var_dump('not present in 2nd set');
    }
    echo "\n";
}

$words1 = generate_stats("http://blackroses.textfiles.com/fun/search.txt");
$words2 = generate_stats("http://blackroses.textfiles.com/fun/wdw4-92.txt");

rand_prob($words1, $words2);

// 1−n1/N
// num_types / 1−n1/N
//$expected_new_types = round( count($words1) / ( 1 - ( rwords(1, $words1) / array_sum($words1) ) ) );
$expected_new_types = count($words2) * (rwords(1, $words1) / count($words1));
echo "Expected New Types:\n";
var_dump($expected_new_types);
echo "Actual New Types:\n";
var_dump(newtypes($words1, $words2));
echo "\n";

$words1 = merge_results($words1, $words2);
unset($words2);

$words3 = generate_stats("http://blackroses.textfiles.com/fun/w-fact-1.txt");

$expected_new_types = round( count($words1) / ( 1 - ( rwords(1, $words1) / array_sum($words1) ) ) );
echo "Expected New Types:\n";
var_dump($expected_new_types);
echo "Actual New Types:\n";
var_dump(newtypes($words1, $words3));
echo "\n";

rand_prob($words1, $words3);
--------------------------------------------------------------------------------