32 |
Analysis Complete
33 |
34 |
35 |
36 | $pat2";
71 | while(preg_match($pat2, $word)){
72 | array_push($res2, substr($word, -1));
73 | $word = substr($word, 0, -1);
74 |
75 | }
76 |
77 | $pos = strpos($word, "'");
78 | if ($pos === false){
79 | array_push($result, $word);
80 | }
81 | else {
82 | array_push($result, substr($word, 0, $pos));
83 | array_push($result, substr($word, $pos));
84 | }
85 | $res2 = array_reverse($res2);
86 | foreach($res2 as $item){
87 | array_push($result, $item);
88 | }
89 |
90 | return $result;
91 |
92 | }
# Strip surrounding punctuation from a single token.
#
# Opening characters (" ' ( [ <) are removed from the front and closing
# characters (" ' ) ] . ! ? ; : ,) from the back, one character per pass,
# until neither pattern matches any more.
#
# Returns a one-element array holding the stripped token. If stripping
# leaves nothing (the token was punctuation only), the token is returned
# exactly as it was passed in.
function deletePun($word){
    $untouched = $word;
    $openers = "/^[\"'\(\[<]/";
    $closers = "/[\"'\)\]\.\!\?;\:,]$/";

    # peel opening punctuation off the front
    for (; preg_match($openers, $word); ) {
        $word = substr($word, 1);
    }

    # peel closing punctuation off the back
    for (; preg_match($closers, $word); ) {
        $word = substr($word, 0, -1);
    }

    # a punctuation-only token would vanish; keep it as typed instead
    return array($word == '' ? $untouched : $word);
}
114 |
# Count the tokens of one chunk of reference text.
#
# $line is split on runs of whitespace; each piece is passed through
# deletePun() and every resulting token increments the global counter
# $total2 and its per-token tally in the global array $uni2.
#
# Empty pieces (produced by leading or trailing whitespace, or by an
# empty / false / null $line) are discarded so they are not counted as
# tokens.
function processLine2($line){
    global $uni2;
    global $total2;

    $words = preg_split('/\s+/', (string) $line, -1, PREG_SPLIT_NO_EMPTY);
    foreach ($words as $word) {
        $wordcomponents = deletePun($word);
        foreach ($wordcomponents as $current) {
            $total2 = $total2 + 1;
            if (isset($uni2[$current])) {
                $uni2[$current] = $uni2[$current] + 1;
            }
            else {
                $uni2[$current] = 1;
            }
        }
    }
}
139 |
140 |
# Count the tokens of one chunk of the primary text.
#
# $line is split on runs of whitespace; each piece is passed through
# deletePun() and every resulting token increments the global counter
# $total and its per-token tally in the global array $uni1.
#
# Empty pieces (produced by leading or trailing whitespace, or by an
# empty / false / null $line) are discarded so they are not counted as
# tokens.
function processLine($line){
    global $uni1;
    global $total;

    $words = preg_split('/\s+/', (string) $line, -1, PREG_SPLIT_NO_EMPTY);
    foreach ($words as $word) {
        $wordcomponents = deletePun($word);
        foreach ($wordcomponents as $current) {
            $total = $total + 1;
            if (isset($uni1[$current])) {
                $uni1[$current] = $uni1[$current] + 1;
            }
            else {
                $uni1[$current] = 1;
            }
        }
    }
}
165 |
166 | ###
167 | ### BY FREQUENCY
168 | ###
# Feed the primary text (global $thetext) to processLine().
#
# Three input shapes are handled:
#   - an array: every element is processed as a line;
#   - a string starting with "http": it is split on whitespace and every
#     entry that itself starts with "http" is downloaded and processed;
#   - any other string: processed directly.
#
# A download that fails (file_get_contents() returns false) is skipped
# instead of being counted as text.
#
# NOTE(review): the URLs can come from request input, so this fetches
# arbitrary user-chosen addresses from the server; consider restricting
# the allowed hosts.
function byFrequency(){
    global $thetext;

    if (is_array($thetext)) {
        foreach ($thetext as $aline) {
            processLine($aline);
        }
        return;
    }

    if (substr((string) $thetext, 0, 4) != 'http') {
        processLine($thetext);
        return;
    }

    $urls = preg_split('/\s+/', $thetext);
    foreach ($urls as $url) {
        if (substr($url, 0, 4) != 'http') {
            continue;
        }
        $contents = file_get_contents($url);
        if ($contents === false) {
            # download failed; nothing to count for this entry
            continue;
        }
        processLine($contents);
    }
}
197 |
198 | ###
199 | ### BY FREQUENCY
200 | ###
# Feed the reference text (global $reference) to processLine2().
#
# Three input shapes are handled:
#   - an array: every element is processed as a line;
#   - a string starting with "http": it is split on whitespace and every
#     entry that itself starts with "http" is downloaded and processed;
#   - any other string: processed directly.
#
# Entries that do not start with "http" are skipped, matching
# byFrequency(): without that check an empty entry (trailing whitespace)
# or a local path would be handed to file_get_contents(). A download
# that fails (returns false) is skipped as well.
#
# NOTE(review): the URLs can come from request input, so this fetches
# arbitrary user-chosen addresses from the server; consider restricting
# the allowed hosts.
function byFrequency2(){
    global $reference;

    if (is_array($reference)) {
        foreach ($reference as $aline) {
            processLine2($aline);
        }
        return;
    }

    if (substr((string) $reference, 0, 4) != 'http') {
        processLine2($reference);
        return;
    }

    $urls = preg_split('/\s+/', $reference);
    foreach ($urls as $url) {
        if (substr($url, 0, 4) != 'http') {
            continue;
        }
        $contents = file_get_contents($url);
        if ($contents === false) {
            # download failed; nothing to count for this entry
            continue;
        }
        processLine2($contents);
    }
}
227 |
228 | ###
229 | ### USING LOG LIKELIHOOD
230 | ###
# Log-likelihood (G2) score from two observed counts and two totals,
# using base-2 logarithms.
#
#   e1 = $w1    * ($w2 + $w1w2) / ($w1 + $total)
#   e2 = $total * ($w2 + $w1w2) / ($w1 + $total)
#   G2 = 2 * ( $w1w2 * log2($w1w2 / e1) + $w2 * log2($w2 / e2) )
#
# NOTE(review): the commented-out calls in this file pass
# (count in reference, total of first text, count in first text,
# total of reference) - confirm the intended argument meaning.
#
# A zero observed count contributes 0 to the sum (the 0 * log(0) limit)
# rather than producing NAN. Returns 0 when either expected value is 0
# or when $w1 + $total is 0.
function ll($w2, $w1, $w1w2, $total){
    $denominator = $w1 + $total;
    if ($denominator == 0) {
        # no data at all; avoid dividing by zero
        return 0;
    }

    $e1 = $w1 * ($w2 + $w1w2) / $denominator;
    $e2 = $total * ($w1w2 + $w2) / $denominator;

    if (($e1 != 0) and ($e2 != 0)) {
        # log(0) is -INF and 0 * -INF is NAN, so skip terms with a zero count
        $term1 = ($w1w2 != 0) ? $w1w2 * log(($w1w2 / $e1), 2) : 0;
        $term2 = ($w2 != 0) ? $w2 * log(($w2 / $e2), 2) : 0;
        $g2 = 2 * ($term1 + $term2);
        return $g2;
    }
    else {
        return 0;
    }
}
245 |
246 |
247 | ###
248 | ### MUTUAL INFORMATION
249 | ###
# Mutual information score, base 2:
#
#   log2( ($xy / $tot) / ( ($x / $tot) * ($y / $tot) ) )
#
# Returns 0 when $xy is below 5, and also when $x, $y or $tot is 0
# (the ratio would otherwise divide by zero).
function mi($x, $y, $xy, $tot){
    if ($xy < 5) {
        return 0;
    }
    if ($x == 0 || $y == 0 || $tot == 0) {
        return 0;
    }
    $nom = $xy / $tot;
    $det = ($x / $tot) * ($y / $tot);
    # the original read the undefined variable $non here instead of $nom
    return log(($nom / $det), 2);
}
259 |
260 |
261 | ###
262 | ###
263 | ### M A I N
264 | ###
265 | ###
266 |
# Choose the two texts to compare.
# With ?ex=<name> a pair of bundled example texts is selected;
# otherwise the texts come from the posted form fields.
if (isset($_GET['ex'])) {
    #$server = 'http://localhost/ngramAnalyzer';
    $server = 'http://guidetodatamining.com/ngramAnalyzer';
    $text = $_GET['ex'];
    if ($text == 'walden') {
        $thetext = "$server/walden.txt";
        $reference = "$server/moby.txt";
    }
    elseif ($text == 'lotus') {
        $thetext = "$server/lotus.txt";
        $reference = "$server/walden.txt";
    }
    elseif ($text == 'lotus2') {
        $thetext = "$server/lotus.txt";
        $reference = "$server/moby.txt";
    }
}
else {
    # missing form fields fall back to an empty text instead of raising
    # an undefined-index notice
    $thetext = isset($_POST['mytext']) ? $_POST['mytext'] : '';
    $reference = isset($_POST['reference']) ? $_POST['reference'] : '';
}

# an unrecognised ?ex= value selects nothing above; make sure both
# texts are defined before they are read
if (!isset($thetext)) {
    $thetext = '';
}
if (!isset($reference)) {
    $reference = '';
}

$total2 = 0;

# optional form fields; absent when the page is opened through ?ex=
$n = isset($_POST['gram']) ? $_POST['gram'] : null;
$punc = isset($_POST['punctuation']) ? $_POST['punctuation'] : null;
$method = isset($_POST['method']) ? $_POST['method'] : null;

$uni1 = array();    # token => count for the first text
$uni2 = array();    # token => count for the reference text
$dir = array();
$cuny = array(); # used to indicate word not in reference corpus
$total = 0;

# fill $uni1/$total and $uni2/$total2
byFrequency();
byFrequency2();
$len1 = count($uni1);
$len2 = count($uni2);
311 |
312 | #echo "
LEN $len1 $len2
";
313 | // echo "
TOTAL $total $total2
";
314 | // echo "
the ".$uni1['the'].' '.$uni2['the']."
";
315 | // echo "
Buddha ".$uni1['Buddha'].' '.$uni2['Buddha']."
";
316 | // echo "
Compassion ".$uni1['compassion'].' '.$uni2['compassion']."
";
317 | // echo "
".ll($uni2['compassion'], $total, $uni1['compassion'], $total2)."
";
318 | // echo "
".ll($uni2['compassion'], $total, 0, $total2)."
";
319 | // echo "
".ll($uni2['the'], $total, $uni1['the'], $total2)."
";
# Print the summary and legend, score every sufficiently frequent token
# of the first text against the reference text, and print the tokens
# ordered by descending score.
echo "
Total number of tokens in first text: $total In second: $total2
";
echo "
Count column refers to number of occurrences of the word in the first text. ";
echo "Log Likelihood in a black font indicates the word was used more frequently in the first corpus than in the second; red font indicates it was less frequently used. An asterisk indicates that the word does not appear in the second corpus.
";
arsort($uni1, SORT_NUMERIC);
$len1 = count($uni1);

# okay now compute log likelihood.
$log = array();
#
# ok. adding kludge here
# minimum count a token needs in the first text to be scored
if ($len1 < 2000) {
    $limit = 4;
}
else {
    $limit = 6;
}

echo "
Word | Count | Log Likelihood |
\n";
foreach ($uni1 as $gram => $count) {
    # $uni1 is sorted by descending count, so nothing after this qualifies
    if ($count < $limit) {
        break;
    }

    # a token missing from the reference text has a count of 0 there
    $c2 = isset($uni2[$gram]) ? $uni2[$gram] : 0;

    if ($c2 != 0) {
        $loglike = logLikelihoodRatio($count, $c2, $total - $count, $total2 - $c2);
        $marker = '';
    }
    elseif ($total2 > 2) { # c2 is zero - word doesn't appear in reference corpus
        # both counts are bumped by one before scoring
        $loglike = logLikelihoodRatio($count + 1, $c2 + 1, $total - $count, $total2 - $c2);
        $marker = '*';
    }
    else {
        # reference text too small to score an unseen token
        continue;
    }

    if ($loglike != 0) {
        $log[$gram] = $loglike;
        $cuny[$gram] = $marker;
        # '' = relatively more frequent in the first text, 'neg' = less
        if (($count / $total) > ($c2 / $total2)) {
            $dir[$gram] = '';
        }
        else {
            $dir[$gram] = 'neg';
        }
    }
}
$c3 = count($log);
arsort($log, SORT_NUMERIC);
foreach ($log as $gram => $val) {
    $count = $uni1[$gram];
    # NOTE(review): $direction is looked up but the row printed below does
    # not use it - confirm whether the row markup was meant to include it
    $direction = $dir[$gram];
    $wordInRef = $cuny[$gram];
    # tokens come from user-supplied text, so escape them before output
    $safeGram = htmlspecialchars((string) $gram, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    echo "$safeGram | $count | $val$wordInRef |
\n";
}

echo "
";
395 |
396 |
397 |
398 |
399 |
400 |
401 | #####
402 | #####
403 | ##### END ANALYSIS FUNCTIONS
404 | #####
405 | #####
406 |
407 |
408 | ?>
409 |