17 | 25 | 29 |

30 |

31 | 32 | 33 | 34 | 35 | $pat2

"; 68 | while(preg_match($pat2, $word)){ 69 | array_push($res2, substr($word, -1)); 70 | $word = substr($word, 0, -1); 71 | 72 | } 73 | 74 | $pos = strpos($word, "'"); 75 | if ($pos === false){ 76 | array_push($result, $word); 77 | } 78 | else { 79 | array_push($result, substr($word, 0, $pos)); 80 | array_push($result, substr($word, $pos)); 81 | } 82 | $res2 = array_reverse($res2); 83 | foreach($res2 as $item){ 84 | array_push($result, $item); 85 | } 86 | 87 | return $result; 88 | 89 | } 90 | function deletePun($word){ 91 | 92 | $orig = $word; 93 | # separate punctuation from word. so treat punctuation as separate tokens 94 | $pat1 = "/^[\"'$\[<]/"; 95 | $pat2 = "/[\"'$\]\.\!\?;\:,]$/"; 96 | $result = array(); 97 | while(preg_match($pat1, $word)){ 98 | $word = substr($word, 1); 99 | } 100 | while(preg_match($pat2, $word)){ 101 | $word = substr($word, 0, -1); 102 | 103 | } 104 | if ($word == ''){ 105 | $word = $orig; 106 | } 107 | $result = array($word); 108 | return $result; 109 | 110 | } 111 | 112 | 113 | 114 | function processLine($line){ 115 | global $ngrams; 116 | global $previous; 117 | global $n; 118 | global $total; 119 | global $unigrams; 120 | global $punc; 121 | $words = preg_split('/\s+/', $line); 122 | #$ngrams = array(); 123 | foreach($words as $word){ 124 | if ($punc == 'checked'){ 125 | $wordcomponents = separatePun($word); 126 | } 127 | else { 128 | $wordcomponents = deletePun($word); 129 | } 130 | #$current = $word; 131 | foreach($wordcomponents as $current){ 132 | #echo "X$current "; 133 | if (isset($unigrams[$current])){ 134 | $unigrams[$current] = $unigrams[$current] + 1; 135 | } 136 | else { 137 | $unigrams[$current] = 1; 138 | } 139 | $gram = ''; 140 | $total++; 141 | for($i = $n - 2; $i >= 0; $i--){ 142 | $gram = "$gram$previous[$i] "; 143 | } 144 | $gram = "$gram$current"; 145 | #$ngrams[$gram] = 1; 146 | #if (array_key_exists($gram, $ngrams)) 147 | if (isset($ngrams[$gram])) 148 | { 149 | $ngrams[$gram] = $ngrams[$gram] + 1; 150 | } 151 | else { 152 | $ngrams[$gram] = 1; 153 | # #echo "..."; 154 | } 155 | $previous[4] = $previous[3]; 156 | $previous[3] = $previous[2]; 157 | $previous[2] = $previous[1]; 158 | $previous[1] = $previous[0]; 159 | $previous[0] = $current; 160 | 161 | } 162 | } 163 | 164 | } 165 | 166 | ### 167 | ### BY FREQUENCY 168 | ### 169 | function byFrequency(){ 170 | global $ngrams; 171 | global $unigrams; 172 | global $previous; 173 | global $n; 174 | global $total; 175 | global $thetext; 176 | global $cutoff; 177 | if (is_array($thetext) == true) 178 | { 179 | foreach($thetext as $aline){ 180 | processLine($aline); 181 | } 182 | } 183 | else { 184 | if (substr($thetext, 0, 4) == 'http') { 185 | $urls = preg_split('/\s+/', $thetext); 186 | foreach($urls as $url){ 187 | processLine(file_get_contents($url)); 188 | } 189 | } 190 | else{ 191 | processLine($thetext); 192 | } 193 | } 194 | 195 | } 196 | 197 | 198 | ### 199 | ### USING LOG LIKELIHOOD 200 | ### 201 | function ll($w2, $w1, $w1w2, $total){ 202 | #echo "

$w2 $w1 $w1w2 $total

"; 203 | $e1 = $w1 * ($w2 + $w1w2) / ($w1 + $total); 204 | 205 | $e2 = $total * ($w1w2 + $w2) / ($w1 + $total); 206 | #echo"

E1: $e1

E2: $e2

"; 207 | $g2 = 2 * (($w1w2 * log(($w1w2 / $e1), 2)) + ($w2 * log(($w2 / $e2), 2))); 208 | return $g2; 209 | } 210 | 211 | ### 212 | ### 213 | ### M A I N 214 | ### 215 | ### 216 | 217 | 218 | if (isset($_GET['ex'])) { 219 | #$server = 'http://localhost/ngramAnalyzer'; 220 | $server = 'http://guidetodatamining.com/ngramAnalyzer'; 221 | $text = $_GET['ex']; 222 | if ($text== 'walden'){ 223 | 224 | $thetext = "$server/walden.txt"; 225 | $cutoff = 3; 226 | $n = 2; 227 | $punc = 'no'; 228 | $method = 'byFreq'; 229 | } 230 | elseif ($text == 'moby'){ 231 | 232 | $thetext = "$server/moby.txt"; 233 | $cutoff = 3; 234 | $n = 2; 235 | $punc = 'no'; 236 | $method = 'logLikelihood'; 237 | 238 | } 239 | elseif ($text == 'lotus'){ 240 | 241 | $thetext = "$server/lotus.txt"; 242 | $cutoff = 3; 243 | $n = 1; 244 | $punc = 'no'; 245 | $method = 'byFreq'; 246 | 247 | } 248 | elseif ($text == 'lotus2'){ 249 | 250 | $thetext = "$server/lotus.txt"; 251 | $cutoff = 3; 252 | $n = 2; 253 | $punc = 'no'; 254 | $method = 'byFreq'; 255 | 256 | } 257 | elseif ($text == 'lotus3'){ 258 | 259 | $thetext = "$server/lotus.txt"; 260 | $cutoff = 3; 261 | $n = 2; 262 | $punc = 'no'; 263 | $method = 'loglikelihood'; 264 | 265 | } 266 | elseif ($text == 'moby2'){ 267 | 268 | $thetext = "$server/moby.txt"; 269 | $cutoff = 3; 270 | $n = 2; 271 | $punc = 'no'; 272 | $method = 'byFreq'; 273 | 274 | } 275 | elseif ($text == 'moby3'){ 276 | 277 | $thetext = "$server/moby.txt"; 278 | $cutoff = 3; 279 | $n = 3; 280 | $punc = 'no'; 281 | $method = 'byFreq'; 282 | } 283 | elseif ($text == 'super'){ 284 | 285 | $thetext = "$server/miserables.txt"; 286 | $cutoff = 3; 287 | $n = 2; 288 | $punc = 'no'; 289 | $method = 'logLikelihood'; 290 | 291 | } 292 | elseif ($text == 'super2'){ 293 | 294 | $thetext = "$server/miserables.txt"; 295 | $cutoff = 3; 296 | $n = 3; 297 | $punc = 'no'; 298 | $method = 'byFreq'; 299 | 300 | } 301 | 302 | } 303 | else { 304 | $thetext = $_POST['mytext']; 305 | $cutoff = $_POST['cutoff']; 306 | $n = $_POST['gram']; 307 | $punc = $_POST['punctuation']; 308 | $method = $_POST['method'] ; 309 | } 310 | #echo $thetext; 311 | $ngrams = array(); 312 | $previous = array('', '', '', ''); 313 | $total = 0; 314 | if ($method == 'byFreq'){ 315 | byFrequency(); 316 | $len = count($ngrams); 317 | echo "

Ngrams Ranked by Frequency

"; 318 | echo "

Total number of tokens: $total Types: $len

"; 319 | arsort($ngrams, SORT_NUMERIC); 320 | echo "\n"; 321 | foreach($ngrams as $gram =>$count){ 322 | if ($count < $cutoff){ 323 | break; 324 | } 325 | $freq = ($count * 100) / $total; 326 | echo "\n"; 327 | 328 | } 329 | 330 | 331 | echo "

ngram	count	frequency
$gram	$count	$freq

"; 332 | 333 | } 334 | else{ 335 | $n = 2; 336 | byFrequency(); 337 | $LOGGER = array(); 338 | #echo "

OKAY BEFORE ARSORT

"; 339 | arsort($ngrams, SORT_NUMERIC); 340 | $len = count($ngrams); 341 | #echo "

$len

"; 342 | foreach($ngrams as $gram=>$k11){ 343 | if ($k11 < 3){ 344 | break; 345 | } 346 | $words = preg_split('/\s+/', $gram); 347 | #echo "

$words[1], $words[0], $gram

"; 348 | $k12 = $unigrams[$words[1]] - $k11; 349 | $k21 = $unigrams[$words[0]] - $k11; 350 | $k22 = $total + $k11 - $k12 - $k21; 351 | $LOGGER[$gram] = logLikelihoodRatio($k11, $k12, $k21, $k22); 352 | #$LOGGER[$gram] = ll($unigrams[$words[1]], $unigrams[$words[0]], $count, $total); 353 | } 354 | #echo "

$LOG

"; 355 | 356 | arsort($LOGGER, SORT_NUMERIC); 357 | #$len = count($LOGGER); 358 | #echo "

$len

";; 359 | echo "

Ngrams Ranked by Log Likelihood

"; 360 | echo "

Total number of tokens: $total Types: $len

"; 361 | echo "\n"; 362 | foreach($LOGGER as $gram =>$log){ 363 | $count = $ngrams[$gram]; 364 | echo "\n"; 365 | 366 | } 367 | echo "

bigram	count	Log Likelihood
$gram	$count	$log

"; 368 | 369 | 370 | } 371 | 372 | 373 | 374 | 375 | ##### 376 | ##### 377 | ##### END ANALYSIS FUNCTIONS 378 | ##### 379 | ##### 380 | 381 | 382 | ?> 383 |

384 |

385 | 388 |

17 | 26 | 30 |

31 |

32 |

Compare the words in this text:

33 | 34 | 35 | 47 | 48 |

49 |

50 | 53 | 54 |

32 |

Analysis Complete

33 | 34 | 35 | 36 | $pat2

"; 71 | while(preg_match($pat2, $word)){ 72 | array_push($res2, substr($word, -1)); 73 | $word = substr($word, 0, -1); 74 | 75 | } 76 | 77 | $pos = strpos($word, "'"); 78 | if ($pos === false){ 79 | array_push($result, $word); 80 | } 81 | else { 82 | array_push($result, substr($word, 0, $pos)); 83 | array_push($result, substr($word, $pos)); 84 | } 85 | $res2 = array_reverse($res2); 86 | foreach($res2 as $item){ 87 | array_push($result, $item); 88 | } 89 | 90 | return $result; 91 | 92 | } 93 | function deletePun($word){ 94 | 95 | $orig = $word; 96 | # separate punctuation from word. so treat punctuation as separate tokens 97 | $pat1 = "/^[\"'$\[<]/"; 98 | $pat2 = "/[\"'$\]\.\!\?;\:,]$/"; 99 | $result = array(); 100 | while(preg_match($pat1, $word)){ 101 | $word = substr($word, 1); 102 | } 103 | while(preg_match($pat2, $word)){ 104 | $word = substr($word, 0, -1); 105 | 106 | } 107 | if ($word == ''){ 108 | $word = $orig; 109 | } 110 | $result = array($word); 111 | return $result; 112 | 113 | } 114 | 115 | function processLine2($line){ 116 | global $uni2; 117 | global $total2; 118 | global $unigrams; 119 | $words = preg_split('/\s+/', $line); 120 | #$ngrams = array(); 121 | foreach($words as $word){ 122 | $wordcomponents = deletePun($word); 123 | 124 | #$current = $word; 125 | foreach($wordcomponents as $current){ 126 | #echo "X$current "; 127 | $total2 = $total2 + 1; 128 | if (isset($uni2[$current])){ 129 | $uni2[$current] = $uni2[$current] + 1; 130 | } 131 | else { 132 | $uni2[$current] = 1; 133 | } 134 | 135 | } 136 | } 137 | 138 | } 139 | 140 | 141 | function processLine($line){ 142 | global $uni1; 143 | global $total; 144 | global $unigrams; 145 | $words = preg_split('/\s+/', $line); 146 | #$ngrams = array(); 147 | foreach($words as $word){ 148 | $wordcomponents = deletePun($word); 149 | 150 | #$current = $word; 151 | foreach($wordcomponents as $current){ 152 | #echo "X$current "; 153 | $total = $total + 1; 154 | if (isset($uni1[$current])){ 155 | $uni1[$current] = $uni1[$current] + 1; 156 | } 157 | else { 158 | $uni1[$current] = 1; 159 | } 160 | 161 | } 162 | } 163 | 164 | } 165 | 166 | ### 167 | ### BY FREQUENCY 168 | ### 169 | function byFrequency(){ 170 | 171 | global $uni1; 172 | global $total; 173 | global $thetext; 174 | 175 | if (is_array($thetext) == true) 176 | { 177 | foreach($thetext as $aline){ 178 | processLine($aline); 179 | } 180 | } 181 | else { 182 | if (substr($thetext, 0, 4) == 'http') { 183 | $urls = preg_split('/\s+/', $thetext); 184 | foreach($urls as $url){ 185 | if (substr($url, 0, 4) == 'http') { 186 | #echo "

URL: $url

"; 187 | processLine(file_get_contents($url)); 188 | } 189 | } 190 | } 191 | else{ 192 | processLine($thetext); 193 | } 194 | } 195 | 196 | } 197 | 198 | ### 199 | ### BY FREQUENCY 200 | ### 201 | function byFrequency2(){ 202 | 203 | global $uni2; 204 | global $total2; 205 | global $reference; 206 | 207 | if (is_array($reference) == true) 208 | { 209 | foreach($reference as $aline){ 210 | processLine2($aline); 211 | } 212 | } 213 | else { 214 | if (substr($reference, 0, 4) == 'http') { 215 | $urls = preg_split('/\s+/', $reference); 216 | foreach($urls as $url){ 217 | #echo "

$url

"; 218 | processLine2(file_get_contents($url)); 219 | } 220 | } 221 | else{ 222 | processLine2($reference); 223 | } 224 | } 225 | 226 | } 227 | 228 | ### 229 | ### USING LOG LIKELIHOOD 230 | ### 231 | function ll($w2, $w1, $w1w2, $total){ 232 | #echo "

$w2 $w1 $w1w2 $total

"; 233 | $e1 = $w1 * ($w2 + $w1w2) / ($w1 + $total); 234 | 235 | $e2 = $total * ($w1w2 + $w2) / ($w1 + $total); 236 | #echo"

E1: $e1

E2: $e2

"; 237 | if (($e1 != 0) and ($e2 != 0)) { 238 | $g2 = 2 * (($w1w2 * log(($w1w2 / $e1), 2)) + ($w2 * log(($w2 / $e2), 2))); 239 | return $g2; 240 | } 241 | else{ 242 | return 0; 243 | } 244 | } 245 | 246 | 247 | ### 248 | ### MUTUAL INFORMATION 249 | ### 250 | function mi($x, $y, $xy, $tot){ 251 | if ($xy < 5){ 252 | return 0; 253 | } 254 | $nom = $xy / $tot; 255 | $det = ($x / $tot) * ($y / $tot); 256 | return log(($non / $det), 2); 257 | 258 | } 259 | 260 | 261 | ### 262 | ### 263 | ### M A I N 264 | ### 265 | ### 266 | 267 | if (isset($_GET['ex'])) { 268 | #$server = 'http://localhost/ngramAnalyzer'; 269 | $server = 'http://guidetodatamining.com/ngramAnalyzer'; 270 | $text = $_GET['ex']; 271 | if ($text== 'walden'){ 272 | 273 | $thetext = "$server/walden.txt"; 274 | $reference = "$server/moby.txt"; 275 | } 276 | elseif ($text== 'lotus'){ 277 | 278 | $thetext = "$server/lotus.txt"; 279 | $reference = "$server/walden.txt"; 280 | } 281 | elseif ($text== 'lotus2'){ 282 | 283 | $thetext = "$server/lotus.txt"; 284 | $reference = "$server/moby.txt"; 285 | } 286 | } 287 | else { 288 | #echo "

"; 289 | $thetext = $_POST['mytext']; 290 | $reference = $_POST['reference']; 291 | } 292 | $total2 = 0; 293 | ########## 294 | ## I am editing here 295 | #### 296 | $n = $_POST['gram']; 297 | $punc = $_POST['punctuation']; 298 | $method = $_POST['method'] ; 299 | 300 | #echo $thetext; 301 | $uni1 = array(); 302 | $uni2 = array(); 303 | $dir = array(); 304 | $cuny = array(); # used to indicate word not in reference corpus 305 | $total = 0; 306 | 307 | byFrequency(); 308 | byFrequency2(); 309 | $len1 = count($uni1); 310 | $len2 = count($uni2); 311 | 312 | #echo "

LEN $len1 $len2

"; 313 | // echo "

TOTAL $total $total2

"; 314 | // echo "

the ".$uni1['the'].' '.$uni2['the']."

"; 315 | // echo "

Buddha ".$uni1['Buddha'].' '.$uni2['Buddha']."

"; 316 | // echo "

Compassion ".$uni1['compassion'].' '.$uni2['compassion']."

"; 317 | // echo "

".ll($uni2['compassion'], $total, $uni1['compassion'], $total2)."

"; 318 | // echo "

".ll($uni2['compassion'], $total, 0, $total2)."

"; 319 | // echo "

".ll($uni2['the'], $total, $uni1['the'], $total2)."

"; 320 | echo "

Total number of tokens in first text: $total In second: $total2

"; 321 | echo "

Count column refers to number of occurrences of the word in the first text. "; 322 | echo "Log Likelihood in a black font indicates the word was used more frequently in the first corpus than in the second; red font indicates it was less frequently used. An asterisk indicates that the word does not appear in the second corpus.

"; 323 | arsort($uni1, SORT_NUMERIC); 324 | $len1 = count($uni1); 325 | 326 | #echo "

LEN $len1 $len2

"; 327 | # okay now compute log likelihood. 328 | $log = array(); 329 | # 330 | # ok. adding kludge here 331 | if ($len1 < 2000){ 332 | $limit = 4; 333 | } 334 | else { 335 | $limit = 6; 336 | } 337 | 338 | echo "\n"; 339 | foreach($uni1 as $gram =>$count){ 340 | if ($count < $limit){ 341 | break; 342 | } 343 | $c2 = $uni2[$gram]; 344 | if ($c2 != 0){ 345 | #echo '.'; 346 | $loglike = logLikelihoodRatio($count, $c2, $total - $count, $total2 - $c2); 347 | #$loglike = logLikelihoodRatio($count + 1, $c2 + 1, $total - $count, $total2 - $c2); 348 | #echo $loglike; 349 | if ($loglike != 0){ 350 | $log[$gram] =$loglike; 351 | $cuny[$gram] = ''; 352 | if (($count / $total) > ($c2 / $total2)){ 353 | $dir[$gram] = ''; 354 | } 355 | else{ 356 | $dir[$gram] = 'neg'; 357 | } 358 | } 359 | 360 | 361 | 362 | 363 | 364 | } 365 | elseif($total2 > 2) { # c2 is zero - word doesn't appear in reference corpus 366 | #$loglike = logLikelihoodRatio($count, $c2, $total - $count, $total2 - $c2); 367 | $loglike = logLikelihoodRatio($count + 1, $c2 + 1, $total - $count, $total2 - $c2); 368 | #echo $loglike; 369 | if ($loglike != 0){ 370 | $log[$gram] =$loglike; 371 | $cuny[$gram] = '*'; 372 | if (($count / $total) > ($c2 / $total2)){ 373 | $dir[$gram] = ''; 374 | } 375 | else{ 376 | $dir[$gram] = 'neg'; 377 | }} 378 | } 379 | } 380 | $c3 = count($log); 381 | #echo "

LOG LEN $c3

"; 382 | arsort($log, SORT_NUMERIC); 383 | foreach($log as $gram=>$val){ 384 | $count = $uni1[$gram]; 385 | $direction = $dir[$gram]; 386 | $wordInRef = $cuny[$gram]; 387 | #$direction = '+'; 388 | echo "\n"; 389 | } 390 | 391 | 392 | 393 | 394 | echo "

Word	Count	Log Likelihood
$gram	$count	$val$wordInRef

"; 395 | 396 | 397 | 398 | 399 | 400 | 401 | ##### 402 | ##### 403 | ##### END ANALYSIS FUNCTIONS 404 | ##### 405 | ##### 406 | 407 | 408 | ?> 409 |

17 | 25 | 29 |

30 |

31 |

Welcome

32 | 33 | 34 | 73 | 74 |

75 |

76 | 79 | 80 |

Online NGram Analyzer

analyze your texts.

Ngrams Ranked by Frequency

Ngrams Ranked by Log Likelihood

Punctuation

Open Source

Examples

Description

Open Source

Examples

Open Source

Online Text Comparator

analyze your texts.

Compare the words in this text:

To those in this text:

Comparator Results

analyze your texts.

Analysis Complete

Online NGram Analyzer

analyze your texts.

Welcome