├── data ├── whiteip.txt ├── spamvertignore.txt ├── bayesignore.txt ├── scalpel.txt └── blockisp.txt ├── admin ├── tpl │ ├── exception.inc │ ├── scorexml.inc │ ├── plonker_blocked.inc │ ├── tables.inc │ ├── bayes.inc │ ├── bayeslinks.inc │ ├── apc.inc │ ├── test.inc │ ├── bayeskill.inc │ ├── updatebayes.inc │ ├── daily.inc │ ├── stats.inc │ ├── proxies.inc │ ├── bayesadd.inc │ ├── accs.inc │ ├── main.inc │ ├── accmsg.inc │ ├── layout.inc │ ├── bayesinfo.inc │ ├── live.inc │ └── plonker.inc ├── .htaccess ├── phpinfo.php ├── debug.php ├── autologin.php ├── bayeslinks.php ├── apc.php ├── daily.php ├── classifyips.php ├── bayesadd.php ├── test.php ├── postformatter.php ├── proxies.php ├── cleanup.php ├── live.php ├── main.php ├── score.php ├── stats.php ├── updatebayes.php ├── accs.php ├── plonker.php └── index.php ├── README.md ├── class ├── sblamservices.php ├── sblamtest.php ├── sblambasepost.php ├── sblampdo.php ├── domainmatch.php ├── sblamhttp.php ├── plonker.php ├── asyncsocket.php ├── sblambase.php ├── interfaces.php ├── sblam.php ├── sblampost.php └── asyncdns.php ├── tests ├── optimist.php ├── latenight.php ├── hashes.php ├── mixedformatting.php ├── whiteip.php ├── scalpel.php ├── plonker.php ├── sorbs.php ├── phphttpbl.php ├── linkmania.php ├── surbl.php ├── correctfields.php ├── domains.php ├── networks.php ├── keywords.php ├── dronebl.php ├── dnsbl.php ├── dedupe.php ├── linksleeve.php ├── mailexploit.php ├── challenge.php ├── http.php ├── throttle.php ├── spamvertises.php └── bayes.php ├── config.ini ├── dbconn.php └── index.php /data/whiteip.txt: -------------------------------------------------------------------------------- 1 | 89.239.105.61 #biuro Koszalin -------------------------------------------------------------------------------- /admin/tpl/exception.inc: -------------------------------------------------------------------------------- 1 | 2 |

Zonk

3 |
4 | 


--------------------------------------------------------------------------------
/admin/tpl/scorexml.inc:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/admin/.htaccess:
--------------------------------------------------------------------------------
1 | 
2 | 	RewriteEngine On
3 | 	RewriteRule !\.(php|html)$ index.php
4 | 
5 | ErrorDocument 404 index.php
6 | php_value memory_limit 64M
7 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Sblam!
2 | ======
3 | 
4 | Sblam! is a transparent server-side anti-spam filter for HTTP web forms (blog comments, forums, guestbooks, etc.)
5 | 
6 | [More info](http://sblam.com/en.html)


--------------------------------------------------------------------------------
/admin/phpinfo.php:
--------------------------------------------------------------------------------
 1 | ob_get_clean());
10 | 	}
11 | }
12 | 


--------------------------------------------------------------------------------
/admin/tpl/plonker_blocked.inc:
--------------------------------------------------------------------------------
1 | 

${php:added?'Added':'Removed'} ${php:count(ips)} IP${php:count(ips) gt 1?'s':''}

2 | 5 | -------------------------------------------------------------------------------- /admin/tpl/tables.inc: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 |
7 | -------------------------------------------------------------------------------- /admin/debug.php: -------------------------------------------------------------------------------- 1 | db = $db; 10 | } 11 | 12 | function getDB() 13 | { 14 | return $this->db; 15 | } 16 | 17 | function getHTTP() 18 | { 19 | return new SblamHTTP(); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /admin/tpl/bayes.inc: -------------------------------------------------------------------------------- 1 |

Triggers

2 | 3 |
4 | 5 | 6 | 8 | 9 | 10 |
7 |
11 |

12 |
13 | -------------------------------------------------------------------------------- /admin/tpl/bayeslinks.inc: -------------------------------------------------------------------------------- 1 |

Banned

2 | 3 | 4 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /admin/tpl/apc.inc: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 |
5 |

File cache

6 |

Used ${info/num_entries} of ${info/num_slots} slots, ${info/expunges} expunges, ${info/num_hits} hits / ${info/num_misses} 7 | misses.

8 |

Allocated ${sma/num_seg} * ${sma/seg_size}, unused ${sma/avail_mem}

9 | 10 | -------------------------------------------------------------------------------- /tests/optimist.php: -------------------------------------------------------------------------------- 1 | score = isset($settings['score']) ? $settings['score'] : -0.35; 12 | } 13 | 14 | function testPost(ISblamPost $p) 15 | { 16 | return array($this->score, self::CERTAINITY_LOW/2, "Optimist"); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /class/sblamtest.php: -------------------------------------------------------------------------------- 1 | services = $services; 11 | } 12 | 13 | static function info() {return array();} 14 | } 15 | 16 | abstract class SblamTestPost extends SblamTest implements ISblamTestPost 17 | { 18 | function preTestPost(ISblamPost $p) {} 19 | function reportResult(ISblamPost $post,$score,$cert) {} 20 | } 21 | -------------------------------------------------------------------------------- /admin/autologin.php: -------------------------------------------------------------------------------- 1 | 'http://sblam.com/key.html?autologin='.md5("^&$@$2\n$apikey@@").":$time:".md5($time.$apikey)); 9 | } 10 | 11 | function account($acc) 12 | { 13 | $apikey = $this->services->getDB()->query('SELECT apikey FROM accounts WHERE id='.intval($acc))->fetchAll(); 14 | $apikey = reset($apikey); 15 | $apikey = reset($apikey); 16 | return $this->apikey($apikey); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /admin/tpl/test.inc: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |

Test

5 |

Scored ${php:round(score[0],2)} with cert ${php:round(score[1],2)}

6 |

7 | 8 | 9 | 10 | 11 | 12 |

Test posts

13 |
14 |

15 | 16 | 17 |

18 |
19 | -------------------------------------------------------------------------------- /admin/tpl/bayeskill.inc: -------------------------------------------------------------------------------- 1 | 8 | 9 | 10 |

is spam vs 11 | ham.

12 | 13 | 14 |

normalized spam vs ham. ${spammy}% spammy

15 | 16 | -------------------------------------------------------------------------------- /tests/latenight.php: -------------------------------------------------------------------------------- 1 | getPostTime()) 9 | { 10 | $hour = date("G",$t); 11 | if ($hour >= 2 and $hour <= 5) return array(0.15,self::CERTAINITY_LOW,"Late-night posting ({$hour}h)"); 12 | if ($hour >= 1 and $hour <= 7) return array(0.09,self::CERTAINITY_LOW,"Late-night posting ({$hour}h)"); 13 | } 14 | } 15 | 16 | 17 | static function info() 18 | { 19 | return array( 20 | 'name'=>'Late-night posting', 21 | 'desc'=>'Bots spam 24h/day, but humans usually don\'t', 22 | 'remote'=>false, 23 | ); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /admin/tpl/updatebayes.inc: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 |
5 | 6 |
7 | 8 |
9 | 10 |
11 | 12 |
13 | 14 | 15 |

Done

16 |

Processed ${done} posts. Failed to process ${failed} posts.

17 |

Because of server load, pauses between posts took ${waited} seconds, which is ${waitperpost}s per post.

18 |
19 | -------------------------------------------------------------------------------- /class/sblambasepost.php: -------------------------------------------------------------------------------- 1 | reason = $r;} 9 | function getSpamReason() {return $this->reason;} 10 | 11 | protected $spam_score = array(); 12 | function setSpamScore(array $s) 13 | { 14 | assert('is_numeric($s[0])'); 15 | assert('is_numeric($s[1]) && $s[1]>=0'); 16 | $this->spam_score = $s; 17 | } 18 | 19 | /** 20 | * @return array(score, certainity) in scale 0-1 21 | */ 22 | function getSpamScore() {return $this->spam_score;} 23 | 24 | protected $post_id; 25 | function setPostId($p) {$this->post_id = $p;} 26 | function getPostId() {return $this->post_id;} 27 | } 28 | -------------------------------------------------------------------------------- /tests/hashes.php: -------------------------------------------------------------------------------- 1 | getRawContent(),$res)) 10 | { 11 | if (!preg_match('![a-f][0-9]!',$res[0])) return NULL; 12 | if (!empty($res[1]) && preg_match('![a-f][0-9]!',$res[1])) return array(0.3,self::CERTAINITY_NORMAL,"Hash marks (2)"); 13 | return array(0.2,self::CERTAINITY_LOW,"Hash marks (1)"); 14 | } 15 | return NULL; 16 | } 17 | 18 | 19 | 20 | static function info() 21 | { 22 | return array( 23 | 'name'=>'Hashes marking messages', 24 | 'desc'=>'Stupid spammers mark their messages with unique hashes, probably to find their own successful spammings later', 25 | 'remote'=>false, 26 | ); 27 | } 28 | } 29 | 30 | -------------------------------------------------------------------------------- /admin/tpl/daily.inc: -------------------------------------------------------------------------------- 1 |

Spams per hour – daily

2 | 3 | 5 | 6 |
4 |
${r/hour}
7 | 8 |

Spam congestion – per 30s

9 | 10 | 11 |
${php:date('H i s',r['timestamp'])}
12 | -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | 2 | tlds = data/tlds.txt 3 | 4 | dns = 127.0.0.1 5 | 6 | [db] 7 | dsn = "mysql:host=localhost;dbname=sblam" 8 | user = sblam 9 | ; pass = "change me and uncomment this line" 10 | 11 | [optimist] 12 | phase=1 13 | 14 | [challenge] 15 | phase=1 16 | 17 | [plonker] 18 | phase=1 19 | add=1 20 | 21 | [throttle] 22 | phase=2 23 | 24 | [latenight] 25 | phase=1 26 | 27 | [scalpel] 28 | phase=1 29 | 30 | [http] 31 | phase=1 32 | 33 | [dedupe] 34 | phase=2 35 | 36 | [correctfields] 37 | phase=1 38 | 39 | [linkmania] 40 | phase=1 41 | 42 | [mailexploit] 43 | phase=1 44 | 45 | [mixedformatting] 46 | phase=1 47 | 48 | [hashes] 49 | phase=1 50 | 51 | [keywords] 52 | blocklist2 = data/sampleblocklist.txt 53 | phase=2 54 | 55 | [spamvertises] 56 | add=1 57 | 58 | [networks] 59 | 60 | [bayes] 61 | add=1 62 | 63 | [phphttpbl] 64 | enabled=0 65 | key = ??? 
66 | 67 | [surbl] 68 | 69 | [linksleeve] 70 | -------------------------------------------------------------------------------- /dbconn.php: -------------------------------------------------------------------------------- 1 | setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); 21 | $pdo->initConnection(); 22 | } 23 | catch(PDOException $e) 24 | { 25 | if (!$max) throw $e; 26 | usleep(250000); 27 | } 28 | } 29 | return $pdo; 30 | } 31 | -------------------------------------------------------------------------------- /data/spamvertignore.txt: -------------------------------------------------------------------------------- 1 | about.com 2 | adobe.com 3 | amazon.com 4 | aol.com 5 | apple.com 6 | baidu.com 7 | bbc.co.uk 8 | blogger.com 9 | blogspot.com 10 | cnn.com 11 | co.uk 12 | facebook.com 13 | flickr.com 14 | fotolog.net 15 | geocities.com 16 | globo.com 17 | go.com 18 | google.ca 19 | google.cl 20 | google.cn 21 | google.co.il 22 | google.co.in 23 | google.co.jp 24 | google.co.uk 25 | google.com 26 | google.com.ar 27 | google.com.au 28 | google.com.br 29 | google.com.mx 30 | google.com.sa 31 | google.com.tr 32 | google.de 33 | google.es 34 | google.fr 35 | google.it 36 | google.pl 37 | imageshack.us 38 | imagevenue.com 39 | imdb.com 40 | live.com 41 | livejournal.com 42 | microsoft.com 43 | msn.com 44 | naver.com 45 | onet.pl 46 | org.uk 47 | orkut.com 48 | passport.net 49 | photobucket.com 50 | rapidshare.com 51 | rapidshare.de 52 | rediff.com 53 | sourceforge.net 54 | starware.com 55 | w3.org 56 | wikipedia.org 57 | wp.pl 58 | yahoo.com 59 | yandex.ru 60 | youtube.com 61 | -------------------------------------------------------------------------------- /tests/mixedformatting.php: -------------------------------------------------------------------------------- 1 | getRawContent().' '.$p->getAuthorName().' 
'.$p->getAuthorEmail(); 10 | 11 | $rawlinks = preg_match("!(?:^|\s)https?://!mi",$txt); 12 | $bbcode = preg_match("!\[url\s*[\]=]\s*http!i",$txt); 13 | $html = preg_match("!<]*href[^>]!i",$txt); 14 | $textile = preg_match("!\":https?://!i",$txt); 15 | 16 | if ($bbcode && $html && ($textile || $rawlinks)) return array(1,self::CERTAINITY_NORMAL,"Mixed BBcode, HTML and other links"); 17 | if ($bbcode && $html) return array(0.7,self::CERTAINITY_NORMAL,"Mixed BBcode and HTML"); 18 | return NULL; 19 | } 20 | 21 | static function info() 22 | { 23 | return array( 24 | 'name'=>'Don\'t allow different link formatting styles', 25 | 'desc'=>'Spammers sometimes try all kinds of formatting in case any of them works', 26 | 'remote'=>false, 27 | ); 28 | } 29 | } 30 | 31 | -------------------------------------------------------------------------------- /tests/whiteip.php: -------------------------------------------------------------------------------- 1 | whitelist = @file( isset($settings['whitelist'])?$settings['whitelist']:"data/whiteip.txt" ); 11 | } 12 | 13 | function testPost(ISblamPost $p) 14 | { 15 | $isWhiteIP = false; 16 | $out = array(); 17 | 18 | foreach($p->getAuthorIPs() as $ip) 19 | { 20 | if( array_search($ip, $this->whitelist) !== false ) { 21 | $isWhiteIP = true; 22 | } else { 23 | $isWhiteIP = false; 24 | } 25 | } 26 | 27 | if( $isWhiteIP ) { 28 | $out[] = array(-1.0, self::CERTAINITY_HIGH, "Sent from whitelisted IP"); 29 | } 30 | 31 | return $out; 32 | } 33 | 34 | static function info() 35 | { 36 | return array( 37 | 'name'=>'Author IP', 38 | 'desc'=>'Mark posts as HAM when sent from whitelisted IP', 39 | 'remote'=>false, 40 | ); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /admin/bayeslinks.php: -------------------------------------------------------------------------------- 1 | getSblam(); // init tlds 17 | $spamverts = new SblamTestSpamvertises(array(), $this->services); 18 | 19 | $linkstoadd = array(); 
20 | 21 | if (false !== strpos($url,'@')) 22 | { 23 | d('adding email'); 24 | $spamverts->addEmail($linkstoadd,$url); 25 | } 26 | else 27 | { 28 | $spamverts->addURI($linkstoadd, new SblamURI($url),''); // split subdomains, etc. 29 | } 30 | 31 | $linkstoadd = array_keys($linkstoadd); 32 | 33 | $bayesbase = $this->getBayesStats(); 34 | 35 | $res = array( 36 | 'title'=>'Banned domains', 37 | 'result'=>$bayesbase->banWords($linkstoadd), 38 | 'linksadded'=>$linkstoadd, 39 | ); 40 | return $res; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /admin/apc.php: -------------------------------------------------------------------------------- 1 | 1000) break; 25 | 26 | if ($key && is_scalar($key)) $tmp[$label][$key] = $val.' '.$i['num_hits']; 27 | else $tmp['other'][$i['info']] = $val.' '.$i['num_hits']; 28 | } 29 | $usercache=NULL; 30 | foreach($tmp as &$tmp2) 31 | { 32 | arsort($tmp2); 33 | } 34 | */ 35 | return array( 36 | 'info'=>apc_cache_info(), 37 | 'sma'=>apc_sma_info(true), 38 | 'entrycount'=>0,//$entrycount, 39 | ); 40 | } 41 | 42 | function post_clear() 43 | { 44 | apc_clear_cache('user'); 45 | } 46 | } 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /admin/tpl/stats.inc: -------------------------------------------------------------------------------- 1 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
${graph/totalhams} hams
${l/time}s${l/percent}%
12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
Number of posts: ${total} (${tempo}/day)
Unverified posts: ${unverified} (${tough} tough ones)
Unadded posts: ${unadded} unadded
Hams: ${hams} (${hamsprc}%)
false negatives${fhams} (${phams}%)
Spams: ${spams} (${spamsprc}%)
false positives ${fspams} (${pspams}%)
Accuracy:${accuracy}% (${unsure}% unsure)
23 | -------------------------------------------------------------------------------- /admin/tpl/proxies.inc: -------------------------------------------------------------------------------- 1 |

Trusted proxies

2 |

To be trusted, a proxy must pass an X-Forwarded-For header (or similar) and must have a reverse DNS name that is on this list.

3 |
4 |

For the sake of performance, only cached lookups are checked. If any proxies listed below have 0 IPs, you have to fill the cache.

5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | 13 | 14 | 15 | 16 | 17 |
HostnameIPsAction
18 |
19 | -------------------------------------------------------------------------------- /admin/tpl/bayesadd.inc: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Added spam

4 |

Added ham

5 |

Test result

6 |

Added words

7 |

Added links: ${l}

8 | 9 |

Links scored ${php:round(spamvertresult[0],2)} with cert ${php:round(spamvertresult[1],2)} for ${spamvertresult/3}

10 |

Words scored ${php:round(bayesresult[0],2)} with cert ${php:round(bayesresult[1],2)} for ${bayesresult/3}

11 |
12 | 13 |

Add words/links

14 | 15 |
16 |
17 |
18 |
19 |
20 | 21 | 22 | 23 |
24 |
25 | -------------------------------------------------------------------------------- /index.php: -------------------------------------------------------------------------------- 1 | process(new ServerRequest($services->getDB())); 39 | } 40 | catch(ServerException $e) 41 | { 42 | if (!headers_sent()){ 43 | header("HTTP/1.1 ".$e->getCode()." ".$e->getMessage()); 44 | header("Content-Type: text/plain;charset=UTF-8"); 45 | } 46 | die($e->getMessage()); 47 | } 48 | catch(Exception $e) 49 | { 50 | if (!headers_sent()){ 51 | header("HTTP/1.1 500 err"); 52 | header("Content-Type: text/plain;charset=UTF-8");} 53 | if (ini_get('display_errors')) echo $e; else echo "Error"; 54 | error_log($e->getMessage());//." in ".$e->getSourceFile().':'.$e->getSourceLine()); 55 | } 56 | -------------------------------------------------------------------------------- /admin/tpl/accs.inc: -------------------------------------------------------------------------------- 1 | 2 | a.msg {font-size:1.35em;line-height:0.8;text-decoration:none} 3 | 4 | 5 |
6 |

7 | See detailed infoBack to brief info. 8 |

9 |

10 |
11 | 12 | 13 | 15 | 16 | 17 | 19 | 24 | 27 |
#emailuSpamsCntHamHostsDateJS
20 | 21 | 22 | 23 | 25 | 26 |
28 | 29 | -------------------------------------------------------------------------------- /admin/daily.php: -------------------------------------------------------------------------------- 1 | services->getDB(); 8 | 9 | $hours = $pdo->query("/*maxtime=20*/". 10 | "SELECT count(*)/greatest(1,count(distinct day(from_unixtime(\"timestamp\")))) as cnt, 11 | HOUR(from_unixtime(\"timestamp\")) as \"hour\" 12 | FROM posts_meta 13 | GROUP BY HOUR(from_unixtime(\"timestamp\")) 14 | ORDER BY \"hour\"")->fetchAll(PDO::FETCH_ASSOC); 15 | 16 | $max=1; 17 | foreach($hours as $h) 18 | { 19 | $max = max($h['cnt'],$max); 20 | } 21 | $scalefactor = 300 / $max; 22 | 23 | $top = $pdo->query("/*maxtime=20*/". 24 | "SELECT count(*) as cnt,max(\"timestamp\") >> 5 as \"slot\", max(\"timestamp\") as \"timestamp\" 25 | FROM posts_meta 26 | WHERE \"timestamp\" > unix_timestamp(NOW())-3600*24 27 | GROUP BY \"timestamp\" >> 5 28 | ORDER BY cnt DESC 29 | LIMIT 50")->fetchAll(PDO::FETCH_ASSOC); 30 | 31 | $max=1; 32 | foreach($top as $h) 33 | { 34 | $max = max($h['cnt'],$max); 35 | } 36 | $topscalefactor = 200 / $max; 37 | 38 | return array( 39 | 'scalefactor' => $scalefactor, 40 | 'topscalefactor' => $topscalefactor, 41 | 'hours'=>$hours, 42 | 'top'=>$top, 43 | ); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /admin/classifyips.php: -------------------------------------------------------------------------------- 1 | services->getDB(); 8 | $sblam = $this->getSblam(); // inits urls 9 | 10 | $table = 'plonker'; // FIXME: read config! 
11 | $accumulate = array(); 12 | 13 | $max=10; 14 | while($max--) foreach($pdo->query("SELECT ip,added from $table where flags=0 order by rand() limit 2000") as $r) 15 | { 16 | try 17 | { 18 | $ip = long2ip($r['ip']); 19 | 20 | $rev = preg_replace('!(\d+)\.(\d+)\.(\d+)\.(\d+)!','\4.\3.\2.\1.dul.dnsbl.sorbs.net',$ip); 21 | $r['rev1'] = $rev; 22 | SblamURI::gethostbynameasync($rev); 23 | 24 | $rev = preg_replace('!(\d+)\.(\d+)\.(\d+)\.(\d+)!','\4.\3.\2.\1.korea.services.net',$ip); 25 | $r['rev2'] = $rev; 26 | SblamURI::gethostbynameasync($rev); 27 | 28 | $accumulate[] = $r; 29 | 30 | usleep(50000); 31 | 32 | if (count($accumulate)>=20) 33 | { 34 | foreach($accumulate as $r) 35 | { 36 | $res = SblamURI::gethostbyname($r['rev1']) ? 'dul':'nodul'; 37 | $res .= ',' . (SblamURI::gethostbyname($r['rev2']) ? 'wild':'nowild'); 38 | 39 | $q = "update $table set flags = '$res', added = added where ip = {$r['ip']}"; 40 | d($q); 41 | if (!$pdo->query($q)) warn($pdo->errorInfo()); 42 | } 43 | $accumulate = array(); 44 | } 45 | } 46 | catch(Exception $e){} 47 | } 48 | 49 | return array('redirect'=>'/admin/plonker'); 50 | } 51 | 52 | function post_index() 53 | { 54 | return $this->index(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/scalpel.php: -------------------------------------------------------------------------------- 1 | patterns = array(); 14 | foreach(file($name) as $line) 15 | { 16 | $line = trim($line); if (!$line || $line[0]=='#') continue; 17 | 18 | if (!preg_match('!^\s*(-?\d+(?:\.\d+)?)\s*[=:] ?(/.+/i?$|[^/].*)!',$line,$res)) 19 | { 20 | throw new Exception("Syntax error in patterns: $line"); 21 | } 22 | if ($res[2][0] !=='/') $res[2] = '/'.preg_quote($res[2],'/').'/'; 23 | 24 | $this->patterns[] = array($res[2]."u", (float)$res[1]); 25 | } 26 | //d($this->patterns,'scalpel patterns'); 27 | } 28 | 29 | function testPost(ISblamPost $p) 30 | { 31 | $matched=array(); 32 | $score = 0; 33 | $post = 
$p->getRawContent()."\n".$p->getAuthorName()."\n".$p->getAuthorEmail()."\n".$p->getAuthorURI(); 34 | 35 | foreach($this->patterns as $pattern) 36 | { 37 | if (preg_match($pattern[0], $post)) 38 | { 39 | $matched[] = $pattern[0]; 40 | $score += $pattern[1]; 41 | } 42 | } 43 | 44 | if ($score) return array($score, self::CERTAINITY_NORMAL, "Exact spam matches (".implode(', ',$matched).")"); 45 | } 46 | 47 | static function info() 48 | { 49 | return array( 50 | 'name'=>'Scalpel', 51 | 'desc'=>'Checks for exact patterns', 52 | 'remote'=>false, 53 | ); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /tests/plonker.php: -------------------------------------------------------------------------------- 1 | add = isset($settings['add'])?$settings['add']:1; 14 | $table = (isset($settings['table'])) ? $settings['table'] : 'plonker'; 15 | 16 | $this->plonker = new Plonker($this->services->getDB(), $table); 17 | } 18 | 19 | function testPost(ISblamPost $p) 20 | { 21 | $res = $this->plonker->testIPs($p->getAuthorIPs(), sprintf('%u',$p->getPostTime())); 22 | 23 | if (!$res) {return NULL;} 24 | list($total,$count) = $res; 25 | 26 | if ($total <0.1) return NULL; 27 | 28 | $rawtotal = round($total,1); 29 | 30 | $total = sqrt($total)/2 + $total/800; 31 | $total = max(0,$total-0.28); 32 | 33 | if ($total > 0.4) {$total = 0.4+($total-0.4)/2;} 34 | if ($total > 0.7) {$total = 0.7+($total-0.7)/2;} 35 | 36 | $total = min(7.5,$total+0.15); 37 | 38 | return array($total,$total>1.5?self::CERTAINITY_HIGH:self::CERTAINITY_NORMAL,"Automatically banned IPs/range ($count ips, $rawtotal R = ".round($total,1).")"); 39 | } 40 | 41 | function reportResult(ISblamPost $post, $score, $cert) 42 | { 43 | if (!$this->add) return; 44 | 45 | if ($score > 0.66 && $cert > 0.75) 46 | { 47 | $this->plonker->addIPs($post->getAuthorIPs(), $score); 48 | } 49 | else if ($score < -0.6 && $cert > 0.7) 50 | { 51 | $this->plonker->removeIPs($post->getAuthorIPs()); 52 | } 
53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tests/sorbs.php: -------------------------------------------------------------------------------- 1 | rblhost = 'dnsbl.sorbs.net'; 10 | self::$dnsrbls = array( 11 | '127.0.0.2'=>array('link' =>array(0.4,self::CERTAINITY_NORMAL, "Link in SORBS http"), 12 | 'sender'=>array(0.1,self::CERTAINITY_LOW, "Sender in SORBS http")), 13 | '127.0.0.3'=>array('link' =>array(0.3,self::CERTAINITY_NORMAL, "Link in SORBS socks"), 14 | 'sender'=>array(0.4,self::CERTAINITY_NORMAL, "Sender in SORBS socks")), 15 | '127.0.0.4'=>array('link' =>array(0.4,self::CERTAINITY_LOW, "Link in SORBS misc proxy"), 16 | 'sender'=>array(0.5,self::CERTAINITY_NORMAL, "Sender in SORBS misc proxy")), 17 | '127.0.0.7'=>array('link' =>array(0.5,self::CERTAINITY_LOW, "Link in SORBS web"), 18 | 'sender'=>array(0.5,self::CERTAINITY_LOW, "Sender in SORBS web")), 19 | '127.0.0.9'=>array('link' =>array(0.7,self::CERTAINITY_NORMAL, "Link in SORBS zombie"), 20 | 'sender'=>array(0.5,self::CERTAINITY_LOW, "Sender in SORBS zombie")), 21 | ); 22 | } 23 | 24 | function score($ip, $resip, $reason, $scorefactor) 25 | { 26 | if (isset(self::$dnsrbls[$resip]) && isset(self::$dnsrbls[$resip][$reason])) 27 | { 28 | $tempout = self::$dnsrbls[$resip][$reason]; 29 | $tempout[2] .= " ($ip * ".round($scorefactor,2).")"; 30 | $tempout[0] *= $scorefactor; 31 | return $tempout; 32 | } 33 | } 34 | 35 | static function info() 36 | { 37 | return array( 38 | 'name'=>'SORBS DNS RBL', 39 | 'desc'=>'Checks for banned IPs in Sorbs Realtime Blackhole List', 40 | 'remote'=>true, 41 | ); 42 | } 43 | } 44 | 45 | -------------------------------------------------------------------------------- /admin/tpl/main.inc: -------------------------------------------------------------------------------- 1 | 2 | 3 | form p abbr {font-variant:small-caps; text-transform:lowercase} 4 | 5 | 6 | 7 |

Load: ${load}. 8 | Last update ping: ${php:round((time()-apc_fetch('update_active'))/60,1)}m.

9 | 10 |
11 |

Posts archive

12 |

~${php:round(stats['posts']/1000)}K posts available. ${php:round(stats['posts_archive']/1000)}K posts archived.

13 |

14 | Moved ${archive/moved} posts to archive. Not all moved posts were added to bayes base. 15 |

16 |

17 |
18 | 19 |

MySQL Status

20 |
21 | 22 | 23 | 24 | 25 | 27 | 28 | 29 |
26 |
30 |
31 | 32 |

Tables

33 | 34 | 35 |

Bayes triggers.

36 |

PHP Info.

37 | -------------------------------------------------------------------------------- /class/sblampdo.php: -------------------------------------------------------------------------------- 1 | prepare($q); 14 | if (!$statement) throw new PDOException(implode(',',$this->errorInfo().'// '.$q)); 15 | if (!$statement->execute($data)) throw new PDOException(implode(',',$statement->errorInfo().'// '.$q)); 16 | return $statement; 17 | } 18 | 19 | function initConnection() {} 20 | 21 | abstract function getTables(); 22 | abstract function getProcesslist(); 23 | 24 | abstract function timestampdiff($interval,$arg1,$arg2); 25 | } 26 | 27 | class SblamMysqlPDO extends SblamPDO 28 | { 29 | function initConnection() 30 | { 31 | $this->exec("SET SESSION sql_mode='ANSI'"); 32 | $this->exec("SET NAMES utf8"); 33 | } 34 | 35 | function getTables() 36 | { 37 | return $this->query("/*maxtime10*/SHOW table status")->fetchAll(PDO::FETCH_ASSOC); 38 | } 39 | 40 | function getProcesslist() 41 | { 42 | return $this->query("/*maxtime2*/SHOW processlist")->fetchAll(PDO::FETCH_ASSOC); 43 | } 44 | 45 | function timestampdiff($interval,$arg1,$arg2) {return "timestampdiff($interval, $arg1, $arg2)";} 46 | } 47 | 48 | class SblamPgsqlPDO extends SblamPDO 49 | { 50 | function getTables() 51 | { 52 | return array(); 53 | } 54 | 55 | function getProcesslist() 56 | { 57 | return array(); 58 | } 59 | 60 | function timestampdiff($interval,$arg1,$arg2) {return "timestampdiff('$interval', $arg1, $arg2)";} 61 | } 62 | -------------------------------------------------------------------------------- /admin/bayesadd.php: -------------------------------------------------------------------------------- 1 | 'Add to bayes base'); 16 | } 17 | 18 | function post_index() 19 | { 20 | $sblam = $this->getSblam(); 21 | $bayes = new SblamTestBayes(array(), $this->services); 22 | $spamverts = new SblamTestSpamvertises(array(), $this->services); 23 | 24 | $isspam = NULL; 25 | if (!empty($_POST['ham'])) $isspam = false; 26 | 
else if (!empty($_POST['spam'])) $isspam = true; 27 | 28 | $addtext = empty($_POST['nowords']); 29 | $linkstoadd = array(); 30 | 31 | $spamvertresult = NULL; 32 | $bayesresult = NULL; 33 | 34 | if (!empty($_POST['stuff']) && NULL !== $isspam) 35 | { 36 | if ($addtext) 37 | { 38 | $bayes->addText($_POST['stuff'], $isspam, (int)$_POST['howmuch']); 39 | } 40 | 41 | if (preg_match_all('@(?:https?://|www\.)([a-z0-9.-]+\.[a-z]{2,4}(?:/[^\s]{1,15})?)@',$_POST['stuff'],$links)) 42 | { 43 | 44 | foreach($links[0] as &$l) if (!preg_match('@^https?://@',$l)) $l = 'http://'.$l; 45 | 46 | $spamverts->addURIs($links[0], $isspam, (int)$_POST['howmuch']); 47 | $spamvertresult = $spamverts->testURIs($links[0]); 48 | } 49 | else d("no links found"); 50 | } 51 | 52 | if (isset($_POST['stuff']) && $addtext) 53 | { 54 | $bayesresult = $bayes->testText($_POST['stuff']); 55 | } 56 | else $bayesresult = NULL; 57 | 58 | return array( 59 | 'title'=>'Added to bayes base', 60 | 'isspam'=>$isspam, 61 | 'addtext'=>$addtext, 62 | 'linksadded'=>$linkstoadd, 63 | 'spamvertresult'=>$spamvertresult, 64 | 'bayesresult'=>$bayesresult, 65 | ); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tests/phphttpbl.php: -------------------------------------------------------------------------------- 1 | key = $settings['key']; 13 | } 14 | 15 | protected function reverse($ip) 16 | { 17 | return preg_replace('!(\d+)\.(\d+)\.(\d+)\.(\d+)!',$this->key.'.\4.\3.\2.\1.dnsbl.httpbl.org',$ip); 18 | } 19 | 20 | 21 | function score($ip, $resip, $reason, $scorefactor) 22 | { 23 | if (preg_match('!127\.(\d+)\.(\d+)\.(\d+)!', $resip, $m)) 24 | { 25 | list(,$days,$threat,$type) = $m; 26 | 27 | if ($type == 0) // search engine bots!? 
http:BL doesn't calculate threat/age for them 28 | { 29 | $score = 0.1; 30 | $cert = self::CERTAINITY_LOW; 31 | } 32 | else 33 | { 34 | $score = 0.1 + (255+$threat)/350 * 10/($days+9); 35 | 36 | if ($threat > 80) $cert = self::CERTAINITY_HIGH; 37 | elseif ($threat > 20) $cert = self::CERTAINITY_NORMAL; 38 | else $cert = self::CERTAINITY_LOW; 39 | 40 | if ($type & 4) $score = $score*1.2 + 0.1; // comment spammer 41 | if ($type & 1) $score += 0.1; 42 | if (!($type & 6)) $score /=2; // wtf? no type? 43 | } 44 | $score = min(1.5,$score); 45 | 46 | $typename = ''; 47 | if ($type & 4) $typename .= 'C'; 48 | if ($type & 2) $typename .= 'H'; 49 | if ($type & 1) $typename .= '?'; 50 | 51 | if ($score < 0.8) return NULL; 52 | return array($score/2 * $scorefactor, $cert, "HoneypotBL (".round($score,2).'*'.round($scorefactor,2)."; $ip = $typename, ^$threat, {$days}d old)"); 53 | } 54 | } 55 | 56 | static function info() 57 | { 58 | return array( 59 | 'name'=>'Project Honeypot DNS RBL', 60 | 'desc'=>'Checks for banned IPs in Project Honeypot http:BL', 61 | 'remote'=>true, 62 | ); 63 | } 64 | } 65 | 66 | -------------------------------------------------------------------------------- /tests/linkmania.php: -------------------------------------------------------------------------------- 1 | getLinks(); if ($links === NULL) return NULL; 9 | 10 | $linkcount = count($links); $authorlink = ($p->getAuthorURI())?1:0; // count separately, because this link may be unrelated to post's contents, so shouldn't skew link/words ratio 11 | 12 | if (($linkcount+$authorlink) == 0) 13 | { 14 | if (strlen($p->getText()) > 20) return array(-0.5,self::CERTAINITY_NORMAL, "No links"); 15 | return NULL; // don't give nolinks bonus to posts with no content (no content is abnormal and it may be another way to spam) 16 | } 17 | if (($linkcount+$authorlink) == 1) return array(0.1,self::CERTAINITY_LOW, "Single link"); 18 | if (($linkcount+$authorlink) == 2) return array(0.2,self::CERTAINITY_LOW, "Two links"); 
19 | 20 | $numwords = count(preg_split('![^a-z0-9\x7F-\xFF-]+|https?://[^\]\[\s\'"<>]+!i',$p->getText(),500,PREG_SPLIT_NO_EMPTY)); 21 | 22 | // long posts may legitimately have more links. can't set any limits, because wiki pages may contain lots of links. 23 | $ratio = round($linkcount*100 / (10+$numwords)); 24 | 25 | if ($ratio > 22) return array(0.45, self::CERTAINITY_NORMAL, "Flooded with links (A$ratio: $linkcount per {$numwords} words)"); 26 | if ($ratio > 17) return array(0.35, self::CERTAINITY_NORMAL, "Flooded with links (B$ratio: $linkcount per {$numwords} words)"); 27 | if ($ratio > 12) return array(0.25, self::CERTAINITY_NORMAL, "Flooded with links (C$ratio: $linkcount per {$numwords} words)"); 28 | if ($ratio > 6) return array(0.25, self::CERTAINITY_NORMAL, "Lots of links (D$ratio: $linkcount per {$numwords} words)"); 29 | return array(0.25, self::CERTAINITY_LOW,"Some links (E$ratio: $linkcount per {$numwords} words)"); 30 | } 31 | 32 | static function info() 33 | { 34 | return array( 35 | 'name'=>'LinkMania', 36 | 'desc'=>'Assumes that posts flooded with links are spam', 37 | 'remote'=>false, 38 | ); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /class/domainmatch.php: -------------------------------------------------------------------------------- 1 | importFile($file)) throw new Exception("Unable to import domains from file $file"); 11 | } 12 | } 13 | 14 | function importFile($file) 15 | { 16 | $key = "DomainMatch.$file"; 17 | 18 | if (function_exists('apc_fetch') && ($res = apc_fetch($key))) 19 | { 20 | $this->domains = unserialize($res); 21 | return true; 22 | } 23 | 24 | $lines = @file($file); if (!$lines) return false; 25 | 26 | foreach($lines as $line) 27 | { 28 | $line = trim(preg_replace('!^\s*(?:https?://)?([a-z0-9.-]*)(?:\s*\#.*)?!','\1',$line)); 29 | if (!$line) continue; 30 | $this->add($line); 31 | } 32 | 33 | if (function_exists('apc_store')) 34 | { 35 | 
apc_store($key,serialize($this->domains), 3600*3); 36 | } 37 | 38 | return true; 39 | } 40 | 41 | function add($domain) 42 | { 43 | $domain = explode('.',trim($domain,'.')); 44 | $this->addArray($this->domains,$domain); 45 | } 46 | 47 | protected function addArray(&$to, &$from) 48 | { 49 | $key = array_pop($from); 50 | if (!isset($to[$key])) $to[$key] = array(); 51 | if (count($from)) $this->addArray($to[$key],$from); 52 | } 53 | 54 | function check($uri) 55 | { 56 | $parts = explode('.',trim($uri,'.')); 57 | 58 | return $this->checkPart($this->domains, $parts); 59 | } 60 | 61 | protected function checkPart(array $in, array &$what, $level=0) 62 | { 63 | if (!count($what)) {d($in,'no what? at '.$level);return NULL;} 64 | 65 | $key = array_pop($what); 66 | if (!isset($in[$key])) {d($key,"part not found"); return $level;} 67 | return $this->checkPart($in[$key],$what,$level+1); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /admin/tpl/accmsg.inc: -------------------------------------------------------------------------------- 1 | 2 | 3 | #preview {float:right; width:45%; margin-right:5%; border:1px solid #ddd; background:#fafafa; padding:1em 1em 3em; min-height:15em;} 4 | #textarea {width:44%;} 5 | .read {color:#aaa;} 6 | 7 | td.action,td.sent {width:3em;white-space:nowrap;} 8 | td a {font-size:1.3em; line-height:0.7; text-decoration:none;} 9 | 10 | 11 |

Sending to account id #${account/id}

12 |
13 |

Editing old message. Write a new one.

14 |

15 | New

18 |

27 | 28 |
29 | 30 |
31 |
32 | 33 |
34 | 35 | '.htmlspecialchars($caption).''; 42 | echo ''; 43 | if ($cnt) echo 'IPs'; 44 | echo 'IPScoreAge'; 45 | 46 | $odd = 0; 47 | 48 | $now = time(); 49 | foreach($tab as $r) 50 | { 51 | $odd = 1-$odd; 52 | if (empty($r['ip'])) continue; 53 | 54 | $ip = long2ip($r['ip']); 55 | 56 | echo ''; 57 | 58 | if ($cnt) echo ''.$r['cnt'].''; 59 | 60 | $rev = SblamURI::gethostbyaddr($ip); 61 | 62 | echo ''.$rev; 78 | 79 | echo ''.' '. 80 | round(sqrt($r['spampoints'])/3 + $r['spampoints']/300).''; 81 | 82 | $age = $now - strtotime($r['added']); 83 | if ($age < 0) {echo '-'; $age = -$age;} 84 | if ($age > 3*60*60*24) echo round($age/(60*60*24))."d"; 85 | else if ($age > 60*60*24) echo round($age/(60*60*24),1)."d"; 86 | else if ($age > 3*60*60) echo round($age/(60*60))."h"; 87 | else if ($age > 60*60) echo round($age/(60*60),1)."h"; 88 | else if ($age > 3*60) echo round($age/60)."m"; /* whole minutes above 3 min, same scheme as the hour/day branches */ 89 | else if ($age > 60) echo round($age/60,1)."m"; /* one decimal only in the 1-3 minute range */ 90 | else echo round($age)."s"; 91 | 92 | echo ''; 93 | 94 | if (isset($r['flags'])) 95 | { 96 | echo ''; 97 | if (preg_match('/(^|,)dul/',$r['flags'])) echo '☎'; 98 | if (preg_match('/(^|,)wild/',$r['flags'])) echo '☢'; 99 | if ($r['flags']=='') echo '?'; 100 | echo ''; 101 | } 102 | 103 | echo ''; 104 | } 105 | echo ''; 106 | } 107 | 108 | ?> 109 |
110 | 111 | 112 | 113 | 114 | 134 |
135 | -------------------------------------------------------------------------------- /tests/http.php: -------------------------------------------------------------------------------- 1 | getHeaders(); if (!$h || count($h)<2) return NULL; // HTTP_HOST is hardcoded! :/// 9 | 10 | $out = array(); 11 | 12 | if (!empty($h['HTTP_MOD_SECURITY_MESSAGE'])) $out[] = array(1,self::CERTAINITY_HIGH,"mod_security warning"); 13 | 14 | // Buggy .Net always adds header which is only needed for large forms (and browsers tend not to use it) 15 | if (!empty($h["HTTP_EXPECT"]) && false!==strpos($h['HTTP_EXPECT'],'100-') && strlen($p->getRawContent()) < 5000) 16 | $out[] = array(0.3,self::CERTAINITY_NORMAL,"100-expect .Net header"); 17 | 18 | // Bots tend to send these 19 | if (!empty($h["HTTP_PRAGMA"])) $out[] = array(empty($h["HTTP_VIA"])?0.3:0.1,self::CERTAINITY_LOW,"Pragma header"); 20 | if (!empty($h["HTTP_RANGE"])) $out[] = array(0.5,self::CERTAINITY_HIGH,"Range header"); 21 | if (!empty($h["HTTP_PROXY_CONNECTION"])) $out[] = array(0.2,self::CERTAINITY_LOW,"Proxy-Connection header"); 22 | if (!empty($h["HTTP_REFERER"]) && ($cnt=substr_count($h["HTTP_REFERER"],"http://")) > 1) 23 | $out[] = array(min(1.5,0.5 + $cnt/6),self::CERTAINITY_HIGH,"Multiple links in referrer"); 24 | 25 | if (($cnt = count($p->getAuthorIPs())) > 4) $out[] = array(($cnt-2)/10, $cnt>7?self::CERTAINITY_HIGH:self::CERTAINITY_NORMAL, "Insane number of relays ($cnt)"); 26 | 27 | // Unpatched IE!? 
28 | if (!empty($h["HTTP_USER_AGENT"]) && preg_match('/MSIE [456]\.[0-9]; Windows (?:9|NT 5)/', $h['HTTP_USER_AGENT'])) { 29 | $out[] = array(0.3,self::CERTAINITY_NORMAL,"Unpatched IE"); 30 | } 31 | 32 | // Browsers almost always send these 33 | if (empty($h["HTTP_ACCEPT"])) $out[] = array(0.7,self::CERTAINITY_NORMAL,"Missing Accept header"); 34 | if (empty($h["HTTP_USER_AGENT"])) $out[] = array(1,self::CERTAINITY_NORMAL,"Missing UA header"); 35 | if (empty($h["HTTP_ACCEPT_LANGUAGE"])) $out[] = array(0.5,self::CERTAINITY_NORMAL,"Missing Accept-Language header"); 36 | if (empty($h["HTTP_ACCEPT_ENCODING"]) && empty($h["HTTP_VIA"]) && (empty($h["HTTP_USER_AGENT"]) || false===strpos($h["HTTP_USER_AGENT"],'Mozilla/4.0 (compatible; MSIE '))) 37 | $out[] = array(0.4,self::CERTAINITY_LOW,"Missing Accept-Encoding header"); 38 | if (!empty($h["HTTP_ACCEPT_CHARSET"])) $out[] = array(-0.2,self::CERTAINITY_LOW,"Has Accept-Charset header"); 39 | 40 | // Non-transparent proxy must add Via header 41 | if (empty($h["HTTP_VIA"]) && (!empty($h['HTTP_X_FORWARDED_FOR']) || !empty($h['HTTP_MAX_FORWARDS']))) 42 | $out[] = array(0.2,self::CERTAINITY_LOW,"Lame proxy"); 43 | 44 | // TE: requires Connection:TE 45 | if (!empty($h["HTTP_TE"]) && (empty($h['HTTP_CONNECTION']) || !preg_match('!\bTE\b!',$h['HTTP_CONNECTION']))) 46 | $out[] = array(0.2,self::CERTAINITY_NORMAL,"Invalid TE header"); 47 | 48 | // Googlebot doesn't post comments! 
49 | if (!empty($h['HTTP_USER_AGENT']) && preg_match('!Googlebot[/ -]|Slurp|Wget/|W3C_Validator|Advertise\.com|nicebot|MMCrawler/|MSIECrawler|ia_archiver|WebaltBot/|nutbot\.com|\+http://search\.!',$h['HTTP_USER_AGENT'])) { 50 | $out[] = array(1,self::CERTAINITY_NORMAL,"Bots don't post comments"); 51 | } 52 | 53 | // Headless browsers no thanks 54 | if (!empty($h['HTTP_USER_AGENT']) && preg_match('!PhantomJS|CasperJS!',$h['HTTP_USER_AGENT'])) { 55 | $out[] = array(1,self::CERTAINITY_HIGH,"Nice try, PhantomJS"); 56 | } 57 | 58 | if (!empty($h['HTTP_USERAGENT']) || 59 | (!empty($h['HTTP_USER_AGENT']) && preg_match('!^User-Agent!i',$h['HTTP_USER_AGENT'])) 60 | ) $out[] = array(1,self::CERTAINITY_NORMAL,"Really badly written bot"); 61 | 62 | // I assume multipart forms are too tricky for most bots 63 | if (!empty($h['HTTP_CONTENT_LENGTH']) && !empty($h['HTTP_CONTENT_TYPE']) && preg_match('!^\s*multipart/form-data\s*;\s*boundary\s*=!i',$h['HTTP_CONTENT_TYPE'])) 64 | { 65 | $out[] = array(-0.2, self::CERTAINITY_LOW, "Multipart form"); 66 | } 67 | 68 | // browsers nicely decode and normalize paths, remove fragment part 69 | if (($path = $p->getPath()) && preg_match('!&|^https?://|^//|/%7e|#|\.\./!i',$path)) 70 | { 71 | $out[] = array(0.3, self::CERTAINITY_NORMAL, "Improperly encoded path"); 72 | } 73 | 74 | if (!empty($h["HTTP_REFERER"]) && preg_match('!&|/%7e|\.\./!i',$h["HTTP_REFERER"])) 75 | { 76 | $out[] = array(0.25, self::CERTAINITY_LOW, "Improperly encoded referer"); 77 | } 78 | 79 | if (count($out)) return $out; 80 | } 81 | 82 | 83 | static function info() 84 | { 85 | return array( 86 | 'name'=>'Catch buggy HTTP implementations', 87 | 'desc'=>'Invalid HTTP headers, fake Googlebots, etc.', 88 | 'remote'=>false, 89 | ); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /tests/throttle.php: -------------------------------------------------------------------------------- 1 | 30) $key = 'h!'.md5($key); 12 | 13 | $key = 
$label.':'.$key; 14 | 15 | // race condition, eh 16 | $val = apc_fetch($key); 17 | 18 | if ($val) {list($time,$val) = explode("\t",$val,2);} else {$time=time();$val=0;} 19 | 20 | if ($time + $mins*60 < time()) {d($key,"expired"); $val = 0; $time = time();} 21 | elseif ($time + $mins*45 < time()) {d($key,"halflife"); $val /= 2; $time = time();} 22 | 23 | apc_store($key, $time."\t".($val+1), $mins*60); 24 | // race finishes here! 25 | 26 | $this->tests[] = array($label, $key, $min, $max, $mins); 27 | } 28 | 29 | function kill($score = 15) 30 | { 31 | $this->kill += $score; 32 | } 33 | 34 | function getResult($maxbasescore = 4) 35 | { 36 | $labels = array(); 37 | $score=0; 38 | 39 | foreach($this->tests as $test) 40 | { 41 | list($label, $key, $min, $max, $mins) = $test; 42 | 43 | $val = apc_fetch($key); if (!$val) continue; 44 | list($time,$val) = explode("\t",$val,2); 45 | 46 | if ($val > $min) 47 | { 48 | $res = min(($val - $min/2) / $max, 3) + 0.1; 49 | 50 | d(round($res,2)." points for $key which is at $val of $max+$min per $mins mins"); 51 | 52 | $score += $res; 53 | $labels[$label] = $res; 54 | } 55 | else d("no points for $key which is at $val of $max+$min per $mins mins"); 56 | } 57 | 58 | $score = min($score,$maxbasescore); 59 | 60 | if ($this->kill) {$score += $this->kill; $labels['kill'] = $this->kill;} 61 | 62 | if (!$score) return array(0,''); 63 | 64 | arsort($labels); 65 | $res = implode(';',array_keys($labels)); 66 | if ($this->waittime) $res .= ' + w'.(round($this->waittime/1000000,2)); 67 | return array($score, $res); 68 | } 69 | 70 | 71 | function antiConcurrency(ISblamPost $p) 72 | { 73 | return; // slows down server under heavy load :( 74 | 75 | $ip = $p->getAuthorIP(); 76 | if (!apc_add('ip.lock:'.$ip,1,2) && apc_fetch('ip-ban:'.$ip) < time()) 77 | { 78 | $wait = 500000 + 100000 * (mt_rand()%10); 79 | usleep($wait); 80 | $this->waittime += $wait; 81 | d("*Throttled*"); 82 | } 83 | } 84 | 85 | } 86 | 87 | class SblamTestThrottle extends 
SblamTestPost 88 | { 89 | protected $accumulator; 90 | 91 | function preTestPost(ISblamPost $p) 92 | { 93 | $acc = new ThrottleAccumulator(); 94 | $acc->antiConcurrency($p); 95 | 96 | $mainIP = $p->getAuthorIP(); 97 | $authorEmail = $p->getAuthorEmail(); 98 | if ($mainIP && $authorEmail) { 99 | $lastEmailKey = "lastemail:$mainIP"; 100 | 101 | if ($lastEmail = apc_fetch($lastEmailKey)) { 102 | if ($lastEmail != $authorEmail) { 103 | $acc->increment("ip.emailchange", $mainIP, 3, 9, 60); 104 | } 105 | } 106 | apc_store($lastEmailKey, $authorEmail, 2*3600); 107 | } 108 | 109 | $isRegistration = false !== strpos($p->getPath(), 'mode=register'); 110 | 111 | foreach($p->getAuthorIPs() as $ip) 112 | { 113 | if (apc_fetch('ip-ban:'.$ip) > time()) $acc->kill(); 114 | 115 | if ($isRegistration) { 116 | $acc->increment('ip.register', $ip, 3, 9, 6*60); 117 | } 118 | 119 | $acc->increment("ip.sec",$ip, 2, 3, 10/60); 120 | $acc->increment("ip",$ip, 15, 38, 60); 121 | $acc->increment("ip.day",$ip, 6*15, 6*38, 24*60); 122 | $acc->increment("ip.range",ip2long($ip)>>8, 20*10, 20*30, 120); 123 | } 124 | if ($email = $p->getAuthorEmail()) 125 | { 126 | $acc->increment("email",$email, 5, 40, 10*60); 127 | $acc->increment("email.short",$email, 5, 15, 10); 128 | 129 | $domain = preg_replace('/^.*@/', '', $email); 130 | 131 | if ($isRegistration) { 132 | $acc->increment('email.register', $email, 2, 5, 9*60); 133 | if ($domain) { 134 | $acc->increment('email.domain.register', $domain, 10, 25, 9*60); 135 | } 136 | } 137 | } 138 | if ($name = $p->getAuthorName()) 139 | { 140 | $acc->increment("name",$name, 20, 70, 5*60); 141 | $acc->increment("name.short",$name, 10, 35, 5); 142 | } 143 | $domains = array(); 144 | foreach($p->getLinks() as $link) 145 | { 146 | if (($hostname = $link->getHostname()) && ($domain = $link->getDomain()) && $domain != $hostname) // if domain=hostname, it's known neutral domain 147 | { 148 | $domains[$domain] = true; 149 | } 150 | } 151 | foreach($domains as $domain 
=> $x) 152 | { 153 | $acc->increment("link.domain.short",$domain, 15, 25, 40); 154 | $acc->increment("link.domain",$domain, 35, 85, 5*60); 155 | } 156 | $this->accumulator = $acc; 157 | } 158 | 159 | function testPost(ISblamPost $p) 160 | { 161 | if (!$this->accumulator) return NULL; 162 | 163 | $this->accumulator->antiConcurrency($p); 164 | 165 | list($points,$desc) = $this->accumulator->getResult(); 166 | 167 | if ($points > 0) return array($points/5, self::CERTAINITY_NORMAL, "Throttle ".round($points,1)." $desc"); 168 | } 169 | 170 | function reportResult(ISblamPost $p, $score, $cert) 171 | { 172 | if (!function_exists('apc_store')) throw new Exception("NO APC"); 173 | 174 | if ($score > 1.2 && $cert > 0.95) 175 | { 176 | foreach($p->getAuthorIPs() as $ip) 177 | { 178 | apc_store('ip-ban:'.$ip,time()+5,5); // block for 5 sec 179 | } 180 | } 181 | } 182 | 183 | static function info() 184 | { 185 | return array( 186 | 'name'=>'Throttle requests', 187 | 'desc'=>'Limit rate of posting', 188 | 'remote'=>false, 189 | 'unsupported'=>!function_exists('apc_store'), 190 | ); 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /tests/spamvertises.php: -------------------------------------------------------------------------------- 1 | add = !empty($settings['add']); 16 | $tableprefix = !isset($settings['prefix'])?'links':$settings['prefix']; 17 | $ignorefile = !isset($settings['ignore'])?'data/spamvertignore.txt':$settings['ignore']; 18 | 19 | $this->db = new BayesBase($this->services->getDB(), $tableprefix, $ignorefile); 20 | } 21 | 22 | function reportResult(ISblamPost $p, $score, $cert) 23 | { 24 | if ($this->add && abs($score) > 0.6 && $cert > 0.8)//0.81) 25 | { 26 | $this->addPost($p, $score > 0); 27 | } 28 | } 29 | 30 | 31 | function testPost(ISblamPost $p) 32 | { 33 | $uris = $this->extractURIsFromPost($p); 34 | return $this->testURIs($uris); 35 | } 36 | 37 | function testURIs(array $uris) 38 | { 39 | if 
(!count($uris)) {return NULL;} 40 | 41 | list($totalspam, $totalham) = $this->db->getTotalPosts(); if (!$totalham || !$totalspam) {return;} 42 | $totalspam /= 10; $totalham /= 10; // totals are too inflated 43 | 44 | $wordlist = $this->db->getWordList($this->db->hashWords($uris)); if (!$wordlist || !count($wordlist)) {return;} 45 | $wordlist = $wordlist->fetchAll(PDO::FETCH_ASSOC); 46 | $spamness = 0; 47 | $maxspamness = 0; 48 | $spampop = 0; $hampop=0; 49 | foreach($wordlist as $r) 50 | { 51 | // make spam/ham in range 0-100 52 | $spam = min(100, $r['spam'] / ($totalspam/100)); 53 | $ham = min(100, $r['ham'] / ($totalham/100)); 54 | 55 | // and now make it 1-150 with nonlinear skew 56 | $spam += 5*sqrt($spam); 57 | $ham += 5*sqrt($ham); 58 | 59 | $spampop += max(0,$spam-$ham) * $spam/($ham+$spam) + ($ham<=0.001?min(max(1,$spam/2),3):0); 60 | $hampop += max(0,$ham-$spam) * $ham/($ham + 3*$spam); 61 | $spamness += $spam/($ham+$spam) - $ham/2/($ham+$spam); 62 | $maxspamness = max($maxspamness, $spam/($ham+$spam)); 63 | } 64 | 65 | $hampop = max(0, $hampop - $spampop); 66 | 67 | if ($hampop > 1 && $spamness < 0 && $maxspamness < 0.3) 68 | { 69 | $score = (($spamness - $hampop) / (count($wordlist)+1) * (0.3 - $maxspamness))/3; 70 | if ($score < -0.3) $score = max(-0.6, ($score+0.3)/2 - 0.3); 71 | return $score > -0.1 ? NULL : array($score/3, self::CERTAINITY_NORMAL - $maxspamness, "Clean websites (".round($score,2)." = ".round($spamness,2).", max ".round($maxspamness,2).")"); 72 | } 73 | 74 | $maxspamness *= $maxspamness; // if in doubt, don't punish 75 | $maxspamness *= $maxspamness; // if in doubt, don't punish 76 | 77 | $score = (($spampop+3) * $maxspamness)/18; 78 | if ($score > 0.7) $score = ($score-0.7)/2+0.7; 79 | if ($score > 1.1) $score = ($score-1.1)/3+1.1; 80 | 81 | return $score < 0.1 ? NULL : array($score, $maxspamness * self::CERTAINITY_NORMAL ,"Spamvertised websites (".round($score,2)." 
= ".round($spamness,2).", max ".round($maxspamness,2).")"); 82 | } 83 | 84 | function addPost(ISblamPost $p, $isspam, $howmuch = 1) 85 | { 86 | $this->db->add($this->extractURIsFromPost($p),$isspam, $howmuch); 87 | } 88 | 89 | function addURIs(array $links, $isspam, $howmuch = 1) 90 | { 91 | $parsed = array(); 92 | foreach($links as $l) 93 | { 94 | try { 95 | $this->addURI($parsed, new SblamURI($l)); 96 | } 97 | catch(Exception $e){warn($e);} 98 | } 99 | if (count($parsed)) 100 | { 101 | return $this->db->add(array_keys($parsed), $isspam, $howmuch); 102 | } 103 | return false; 104 | } 105 | 106 | function addURI(array &$urls, SblamURI $link, $prefix = '') 107 | { 108 | if ($link->isTLD()) {return;} 109 | 110 | if ($hostname = $link->getHostname()) 111 | { 112 | $hostname = preg_replace(array('!^www\.!','!\d\d+!'),array('','D'),$hostname); // normalise digits! (block bulk registrations) 113 | $urls[$prefix.$hostname] = true; 114 | 115 | if ($domain = $link->getDomain()) 116 | { 117 | $urls[$prefix.$domain] = true; 118 | } 119 | 120 | if ($p = $link->getPath()) 121 | { 122 | $p = preg_replace('!^(/[^#]{1,7}[^#/\?]{0,5}).*$!','\1',$p); // shorten path. its mainly for getting real tinyurl adresses, not every spammy subpage out there 123 | if ($p !== '/') $urls[$prefix.$hostname . $p] = true; 124 | } 125 | } 126 | 127 | if (preg_match('!\b(?:site:|https?://)([a-zA-Z0-9.-]+)!',urldecode($link->getPath()),$m)) 128 | { 129 | $this->addURI($urls, new SblamURI('http://'.$m[1]), $prefix); 130 | } 131 | } 132 | 133 | function addEmail(array &$uris, $email) 134 | { 135 | // adds e-mail in "@example.com" format. 
136 | if (preg_match('/([^\s:\/#@!;]+)@([a-z0-9.-]+\.[a-z]{2,6})/',strtolower($email),$r)) 137 | { 138 | $r = array(new SblamURI('http://'.$r[2].'/'), $r[1]); // it's a hack to re-use SblamURI logic 139 | $this->addURI($uris, $r[0], '@'); 140 | $r[1] = str_replace('.','',$r[1]); // gmail 141 | $this->addURI($uris, $r[0], preg_replace('/\d+/','D',$r[1]).'@'); 142 | return true; 143 | } 144 | return false; 145 | } 146 | 147 | protected function extractURIsFromPost(ISblamPost $p) 148 | { 149 | $uris = array(); 150 | if ($uri = $p->getAuthorURI()) 151 | { 152 | $this->addURI($uris, new SblamURI($uri)); 153 | } 154 | foreach($p->getLinks() as $link) 155 | { 156 | $this->addURI($uris,$link); 157 | } 158 | $this->addEmail($uris, $p->getAuthorEmail()); 159 | 160 | return array_keys($uris); 161 | } 162 | 163 | 164 | static function info() 165 | { 166 | return array( 167 | 'name'=>'Spamvertised links database', 168 | 'desc'=>'Bayesian auto-learning filter for spamvertised domains', 169 | 'remote'=>false, 170 | ); 171 | } 172 | } 173 | 174 | -------------------------------------------------------------------------------- /admin/index.php: -------------------------------------------------------------------------------- 1 | config = $config; 32 | $this->services = $services; 33 | } 34 | 35 | function getSblam() 36 | { 37 | if (!$this->sblam) 38 | { 39 | $this->sblam = new Sblam($this->config, $this->services); 40 | } 41 | return $this->sblam; 42 | } 43 | 44 | function getSblamBase() 45 | { 46 | return new SblamBase($this->services->getDB()); 47 | } 48 | 49 | function execute(array $args) 50 | { 51 | $method = count($args) ? 
array_shift($args) : 'index'; 52 | 53 | //if (!ctype_alnum($method)) throw new Exception("Invalid method $method"); 54 | 55 | if ($_SERVER['REQUEST_METHOD'] != 'GET') $method = strtolower($_SERVER['REQUEST_METHOD']).'_'.$method; 56 | 57 | if (!method_exists($this, $method)) throw new Exception("There is no method $method"); 58 | 59 | $res = call_public_func_array(array($this,$method), $args); 60 | if (!isset($res['title'])) 61 | { 62 | $res['title'] = ucwords(preg_replace('/Page$/','',get_class($this)).': '.strtr($method,'_',' ')); 63 | } 64 | return $res; 65 | } 66 | } 67 | 68 | function call_public_func_array($callback, array $args) 69 | { 70 | return call_user_func_array($callback, $args); 71 | } 72 | 73 | class Admin 74 | { 75 | private static $baseuri = '/admin/'; 76 | private static function parseURI($uri) 77 | { 78 | $l = strlen(self::$baseuri); 79 | 80 | if (strlen($uri) < $l || substr($uri, 0, $l) !== self::$baseuri) throw new Exception("Not admin URI"); 81 | 82 | $components = explode('/',substr($uri, $l)); 83 | 84 | $pagename = array_shift($components); if ($pagename==='') $pagename = 'main'; 85 | 86 | foreach($components as &$c) $c = urldecode($c); 87 | 88 | return array('pagename'=>$pagename, 'args'=>$components); 89 | } 90 | 91 | public static function process(array $config, ISblamServices $services) 92 | { 93 | $pageinf = self::parseURI($_SERVER['REQUEST_URI']); 94 | 95 | try 96 | { 97 | $page = self::loadPage($pageinf['pagename'], $config, $services); 98 | 99 | $res = $page->execute($pageinf['args']); 100 | d($res,'res'); 101 | assert(is_array($res)); /* expression form: string assertions are deprecated since PHP 7.2 and are not evaluated by PHP 8 */ 102 | } 103 | catch(Exception $e) 104 | { 105 | header('HTTP/1.1 500 argh'); 106 | self::display(array('exception'=>$e,'title'=>'Error: '.get_class($e), 'page_template'=>'exception'),$pageinf); 107 | return; 108 | } 109 | 110 | self::display($res, $pageinf); 111 | } 112 | 113 | private static function display(array $res, array $pageinf) 114 | { 115 | if (isset($res['redirect'])) 116 | { 117 | if 
($_SERVER['REQUEST_METHOD'] != 'GET') header('HTTP/1.1 303 see'); 118 | 119 | if (preg_match('!^https?://!',$res['redirect'])) $url = $res['redirect']; 120 | else $url = 'http://'.$_SERVER['HTTP_HOST'].self::$baseuri.$res['redirect']; 121 | 122 | header("Location: $url"); 123 | die($url); 124 | } 125 | 126 | $phptal = new PHPTAL(); 127 | $phptal->set('POST',$_POST); 128 | 129 | foreach($res as $k => $v) 130 | { 131 | $phptal->set($k,$v); 132 | } 133 | 134 | if (!isset($res['page_template'])) $res['page_template'] = $pageinf['pagename']; 135 | if (!isset($res['page_content']) && $res['page_template']) 136 | { 137 | $phptal->setTemplate('admin/tpl/'.$res['page_template'].'.inc'); 138 | $res['page_content'] = $phptal->execute(); 139 | $phptal->set('page_content', $res['page_content']); 140 | } 141 | 142 | if (!isset($res['content_type'])) $res['content_type'] = 'text/html;charset=UTF-8'; 143 | header("Content-Type: ".$res['content_type']); 144 | 145 | if (!isset($res['layout_template'])) $res['layout_template'] = 'layout'; 146 | if ($res['layout_template']) 147 | { 148 | $phptal->setTemplate('admin/tpl/'.$res['layout_template'].'.inc'); 149 | echo $phptal->execute(); 150 | } 151 | else 152 | { 153 | echo $res['page_content']; 154 | } 155 | } 156 | 157 | private static function loadPage($name, array $config, ISblamServices $services) 158 | { 159 | if (!ctype_alnum($name)) throw new Exception("Invalid page name"); 160 | 161 | $basepath = dirname(__FILE__).'/'; 162 | $pagefile = $basepath.$name . 
'.php'; 163 | 164 | if (!file_exists($pagefile)) 165 | { 166 | throw new Exception("No file $pagefile"); 167 | } 168 | 169 | // ob_start(); 170 | require_once $pagefile; 171 | // ob_end_clean(); 172 | 173 | $class = ucfirst($name).'Page'; 174 | if (!class_exists($class)) throw new Exception("Class $class not found"); 175 | 176 | $page = new $class($config, $services); 177 | 178 | if (!$page instanceof AdminPage) throw new Exception("Not an admin page"); 179 | 180 | return $page; 181 | } 182 | } 183 | 184 | 185 | 186 | 187 | try 188 | { 189 | $config = Server::getDefaultConfig(); 190 | $services = new SblamServices(sblambaseconnect($config)); 191 | 192 | Admin::process($config, $services); 193 | } 194 | catch(Exception $e) 195 | { 196 | /* last-resort handler: Exception exposes getLine()/getFile(); it has no getSourceLine()/getSourceFile(), which made this handler itself fatal */ header('HTTP/1.1 500 ERR'); 197 | header("Content-Type: text/plain;charset=UTF-8"); 198 | if (ini_get('display_errors')) echo $e; else echo "Error ".$e->getLine(); 199 | error_log($e->getMessage()." in ".$e->getFile().':'.$e->getLine()); 200 | warn($e,"Died"); 201 | } 202 | -------------------------------------------------------------------------------- /class/interfaces.php: -------------------------------------------------------------------------------- 1 | 'value'); 89 | @return array 90 | function getHTTPHeaders(); */ 91 | 92 | /* if you track visitors using cookies and/or session IDs, report it using this method. 93 | @return number of pages known to be requested by this poster (int) or true (bool) if session does exist, but has unknown length or false (bool) when no valid session initiated (no cookie, session id posted) 94 | 95 | function getSessionLength();*/ 96 | 97 | /** convert post to HTML fragment (// tags are not allowed). Must use UTF-8 encoding, don't add . 98 | @return string 99 | */ 100 | // function getHTML(); 101 | 102 | } 103 | 104 | interface ISblamTrackback 105 | { 106 | /** return associative array with trackback information, as per specification. 
107 | \li title - title of the pinging entry 108 | \li excerpt - quoted fragment of the entry 109 | \li url - url of the pinging entry 110 | \li blog_name - name of pinging blog 111 | 112 | exception from trackback specification is that all strings must be UTF-8 113 | 114 | @see http://www.sixapart.com/pronet/docs/trackback_spec 115 | */ 116 | function getTrackbackInfo(); 117 | 118 | /** post that trackback refers to, NULL if not known. 119 | @return ISblamPost */ 120 | function getReferredPost(); 121 | } 122 | 123 | interface ISblamTest 124 | { 125 | const CERTAINITY_LOW = 0.5; 126 | const CERTAINITY_NORMAL = 0.75; 127 | const CERTAINITY_HIGH = 1; 128 | const CERTAINITY_SURE = 2; 129 | 130 | static function info(); 131 | 132 | public function __construct(array $settings, ISblamServices $services); 133 | } 134 | 135 | interface ISblamTestPost extends ISblamTest 136 | { 137 | /** perform full check and return array with three elements: 138 | \li 0 (probability) - how big chance is that this message is spam (or not spam). negative=not spam, positive=spam. scalar (float) between 0 and 1; 0 = dunno, ±0.5 = maybe, ±1 = surely. 139 | \li 1 (certainity) - how certain is that method; use predefined constants. 140 | \li 2 name of the test (should be constant) - to track accuracy, may be used for bayesian filtering 141 | */ 142 | function testPost(ISblamPost $p); 143 | 144 | /** FYI post that will be passed to testPost later. This allows launching tests asynchronously. 
*/ 145 | function preTestPost(ISblamPost $p); 146 | 147 | 148 | // notify about final score (post,score,cert) - for auto blacklists, bayes 149 | // notify about moderated post (basepost, t/f) - for reporting/correcting mistakes 150 | } 151 | 152 | interface ISblamTestTrackback extends ISblamTest 153 | { 154 | /** @see ISblamTestPost::testPost() */ 155 | function testTrackback(ISblamTrackback $t); 156 | } 157 | 158 | interface ISblamHttp 159 | { 160 | function setPost($payload, $content_type); 161 | function setPath($path, array $query_string = array()); 162 | function setHost($host); 163 | function setTimeout($timeout); 164 | function requestAsync(); 165 | } 166 | 167 | interface ISblamHTTPAsyncResponse 168 | { 169 | function getStatus(); 170 | function getResponseBody(); 171 | } 172 | -------------------------------------------------------------------------------- /class/sblam.php: -------------------------------------------------------------------------------- 1 | services = $services; 16 | $this->readConfig($config); 17 | } 18 | 19 | protected function readConfig(array $ini) 20 | { 21 | if (!empty($ini['tlds'])) SblamURI::init($ini['tlds'], $this->services->getDB()); else warn('tlds not given!'); 22 | if (!empty($ini['dns'])) AsyncDNS::init(preg_split('![\s,]+!',$ini['dns'],NULL,PREG_SPLIT_NO_EMPTY)); else warn('dns not given!'); 23 | 24 | foreach($ini as $name => $settings) 25 | { 26 | if (!is_array($settings) || $name == 'db') continue; 27 | 28 | if (!empty($settings['disabled']) || (isset($settings['enabled']) && !$settings['enabled'])) {/*d($name,'disabled');*/continue;} 29 | 30 | try { 31 | include_once "tests/".strtolower($name).".php"; 32 | $classname = "SblamTest".ucfirst($name); 33 | if (!class_exists($classname)) warn($name,"Problem loading test plugin"); 34 | 35 | $info = call_user_func(array($classname,'info')); 36 | if (!empty($info['remote']) && isset($ini['remote']) && !$ini['remote']) {d($info,'Its a remote service, remote disabled, skipping'); 
continue;} 37 | if (!empty($info['unsupported'])) {d($info,'unsupported in this configuration'); continue;} 38 | 39 | $test = new $classname($settings, $this->services); 40 | 41 | if (!$test instanceof ISblamTest) {warn($test,'Not a test');continue;} 42 | //d($classname,"instantiated"); 43 | $this->addTest($test, isset($settings['phase'])? $settings['phase']:10); 44 | } 45 | catch(Exception $e) 46 | { 47 | warn($e,"Failed to initialize plugin $name"); 48 | } 49 | } 50 | 51 | 52 | return true; 53 | } 54 | 55 | protected $testPhases; 56 | function addTest(ISblamTest $t, $phase) 57 | { 58 | $this->testPhases[$phase][] = $t; 59 | d(get_class($t),"added to $phase"); 60 | } 61 | 62 | const EARLY_ESCAPE_LIMIT = 2.5; // maximum score 63 | 64 | function testPost(ISblamPost $p) 65 | { 66 | if (!$this->testPhases) return array(0,0,"No tests"); 67 | 68 | $profiling = array(); $asyncpolltime = 0; 69 | $results = array(); $totalspam=0; 70 | 71 | ksort($this->testPhases,SORT_NUMERIC); 72 | foreach($this->testPhases as $phase => $phaseTests) 73 | { 74 | foreach($phaseTests as $test) 75 | { 76 | if (!$test instanceof ISblamTestPost) continue; 77 | 78 | $start = microtime(true); 79 | $test->preTestPost($p); 80 | $profiling["p$phase:".get_class($test)] = (microtime(true)-$start); 81 | } 82 | 83 | foreach($phaseTests as $test) 84 | { 85 | if (!$test instanceof ISblamTestPost) continue; 86 | 87 | $start = microtime(true); 88 | AsyncSocket::poll(0); // get those queued DNS queries 89 | $asyncpolltime += microtime(true)-$start; 90 | 91 | $start = microtime(true); 92 | $tmpres = $test->testPost($p); 93 | $profiling["t$phase:".get_class($test)] = (microtime(true)-$start); 94 | 95 | $results[] = $tmpres; 96 | if ($tmpres && is_numeric($tmpres[0])) $totalspam += $tmpres[0]; 97 | if ($totalspam > self::EARLY_ESCAPE_LIMIT) 98 | { 99 | $results[] = array(6,1,"Early escape",$profiling); 100 | break 2; 101 | } 102 | } 103 | } 104 | 105 | $profiling['tst:AsyncSocket'] = $asyncpolltime; 106 | 
$results = $this->sumResults($results); 107 | $results[2] = implode('; ',$results[2]); 108 | $results[3] = $profiling; 109 | return $results; 110 | } 111 | 112 | /** Passes the final verdict ($results[0], $results[1] and $force) to every registered test implementing ISblamTestPost. Time spent in each test is stored under the key 'rep:' followed by the test's class name and merged into $results[3]. @return array $results with profiling data in index 3 */ function reportResult(ISblamPost $p, array $results, $force=false) 113 | { 114 | $profiling = array(); 115 | foreach($this->testPhases as $phaseTests) 116 | foreach($phaseTests as $test) 117 | { 118 | if (!$test instanceof ISblamTestPost) continue; 119 | 120 | $start = microtime(true); 121 | $test->reportResult($p, $results[0], $results[1], $force); 122 | $profiling['rep:'.get_class($test)] = microtime(true) - $start; 123 | } 124 | if (isset($results[3]) && is_array($results[3])) $results[3] = array_merge($results[3],$profiling); else $results[3] = $profiling; 125 | return $results; 126 | } 127 | 128 | /** Runs testTrackback() of every registered test implementing ISblamTestTrackback and returns the combined result from sumResults(). */ function testTrackback(ISblamTrackback $p) 129 | { 130 | $results = array(); 131 | foreach($this->testPhases as $phaseTests) 132 | foreach($phaseTests as $test) 133 | { 134 | if (!$test instanceof ISblamTestTrackback) continue; 135 | $results[] = $test->testTrackback($p); 136 | } 137 | return $this->sumResults($results); 138 | } 139 | 140 | 141 | /** Combines individual test results into one. Each result is array(score, certainty, name or array of names); a negative score counts towards ham, any other score towards spam. Entries that are not arrays or are empty are skipped, and a result whose first element is itself an array is summed recursively. @return array array(score, certainty, names) */ static function sumResults($results) 142 | { 143 | $probHam=0; 144 | $probSpam=0; 145 | $certHam=0; 146 | $certSpam=0; 147 | 148 | $names = array(); 149 | 150 | foreach($results as $r) 151 | { 152 | if (!is_array($r) || !count($r)) continue; 153 | if (is_array($r[0])) {$r = self::sumResults($r); d($r,'got result from parent');} 154 | 155 | if (!empty($r[2])) { 156 | if (is_array($r[2])) $names = array_merge($names,$r[2]); 157 | else $names[] = $r[2]; 158 | } 159 | 160 | if ($r[0] < 0) { 161 | $probHam -= $r[0]; 162 | $certHam -= $r[0] * $r[1]; 163 | } 164 | else 165 | { 166 | $probSpam += $r[0]; 167 | $certSpam += $r[0] * $r[1]; 168 | } 169 | } 170 | 171 | d("sum is: ham $probHam with $certHam cert, spam $probSpam with $certSpam cert - tested ".implode(';',$names)); 172 | 173 | $larger = max($certHam,$certSpam); 174 | $smaller = min($certHam,$certSpam); 175 | 176 |
if ($larger) 177 | { 178 | // high certainty for ham and spam should cancel each other (smaller/larger). 179 | // if both were low, don't increase them (min). 180 | // if certainty was huge, preserve at least a bit of it ($larger/10) 181 | $endcert = ($larger/10) + min($larger,1-($smaller/$larger)); 182 | } 183 | else $endcert=0; 184 | 185 | /* the weighted average below divides by the total certainty, so bail out when it is (almost) zero */ if (abs($certSpam+$certHam) < 0.01) return array(0,0,$names); 186 | 187 | return array( (-$probHam*$certHam + $probSpam*$certSpam) / ($certSpam+$certHam),$endcert,$names); 188 | } 189 | 190 | /** Formats profiling data (name => seconds) as text lines of "milliseconds name", slowest first. Output stops after an entry shorter than 1ms or once the $limit counter reaches zero. @return string */ public static function formatProfiling(array $profiling) 191 | { 192 | $profilingres = ''; 193 | arsort($profiling); $limit = 10; 194 | foreach($profiling as $k => $v) 195 | { 196 | $profilingres .= sprintf("% 5d %s\n",$v*1000,$k); 197 | if (!$limit-- || $v < 0.001) break; 198 | } 199 | return $profilingres; 200 | } 201 | 202 | /** 203 | * return ID that is unique to this server/installation 204 | */ 205 | static function getInstallationID() 206 | { 207 | return @md5(ini_get('extension_dir') . phpversion() . $_SERVER['HTTP_HOST'] . $_SERVER['SERVER_SOFTWARE'] . 
__FILE__); 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /class/sblampost.php: -------------------------------------------------------------------------------- 1 | setRawContent($rawcontent); 13 | $this->setAuthor($name,$mail,$uri,$ip); 14 | $this->headers = $_SERVER; 15 | $this->post = $_POST; 16 | $this->posttime = time(); 17 | } 18 | 19 | protected $headers; 20 | function setHeaders(array $h) {$this->headers = $h;} 21 | function getHeaders() {return $this->headers;} 22 | 23 | protected $post; 24 | function setPost(array $p) {$this->post = $p;} 25 | function getPost() {return $this->post;} 26 | 27 | protected $path; 28 | function setPath($p) {$this->path = $p;} 29 | function getPath() {return $this->path;} 30 | 31 | private $html,$text,$dom,$links; 32 | /** Stores the raw content and resets the cached DOM, text and links so that they are rebuilt on next use. */ protected function setRawContent($raw) 33 | { 34 | $this->raw = $raw; $this->dom = $this->text = $this->links = NULL; 35 | } 36 | function getRawContent() {return $this->raw;} 37 | /** Parses the raw content with DOMDocument::loadHTML() on first use and caches the document. @return DOMDocument|NULL NULL when loadHTML() fails */ protected function getDOM() 38 | { 39 | if (!$this->dom) 40 | { 41 | $this->dom = new DOMDocument(); 42 | if (!@$this->dom->loadHTML( 43 | ''. 
44 | $this->getRawContent() 45 | )) {$this->dom = NULL;} 46 | } 47 | return $this->dom; 48 | } 49 | 50 | function _addBBlink($bblink) 51 | { 52 | //d($match,'bblink'); 53 | $this->links[] = new SblamURI($bblink[1],$bblink[2]); 54 | } 55 | 56 | function addLink($uri, $label = '') 57 | { 58 | $this->getLinks(); // prefill links array 59 | $this->links[] = new SblamURI($uri,$label); 60 | } 61 | 62 | /** Collects the links of the post; the list is built once and then cached. Sources, in order: href attributes of anchor elements in the parsed HTML, BBcode [url=...] tags in the remaining text, bare http(s)/www URLs left after BBcode tags are stripped from the text, and the HTTP referer when its host part does not contain HTTP_HOST. @return array list of SblamURI objects */ function getLinks() 63 | { 64 | $seenlinks = array(); 65 | 66 | if ($this->links === NULL) 67 | { 68 | // find all links that are in HTML (DOM should be used to parse according to HTML rules) 69 | $this->links = array(); 70 | if ($dom = $this->getDOM()) 71 | { 72 | foreach($dom->getElementsByTagName('a') as $a) 73 | { 74 | if ($uri = $a->getAttribute('href')) 75 | { 76 | $seenlinks[preg_replace('!\#[^#]*$!','',$uri)] = true; 77 | $this->links[] = new SblamURI($uri,$a->textContent); 78 | } 79 | } 80 | } 81 | 82 | $nonlinks = $this->getText(); 83 | if (preg_match_all('!\[url\s*=\s*[\'\"]?((?:https?|www|//)[^]<>\s\'\"]+)\s*\]([^]\[]*?)\[/url!is',$nonlinks,$bb,PREG_SET_ORDER)) 84 | { 85 | foreach($bb as $bblink) 86 | { 87 | $this->links[] = new SblamURI($bblink[1],$bblink[2]); 88 | } 89 | } 90 | 91 | //d($nonlinks,'new text before debb'); 92 | $this->updateText(preg_replace('!\[url\s*=\s*[\'\"]?((?:https?|www|//)[^]<>\s\'\"]+)\s*\]([^]\[]*?)\[/url\]'. 
93 | '|\[/?(?:url|b|u|i|quote|color|size|list|img|code|bi|pre|s|attach)(?:=[^\]]{1,12})?\]!is','',$nonlinks)); 94 | 95 | // find all links outside HTML 96 | if (preg_match_all('!https?://[^\s)#\'"\!]+|\bwww\.(?:[a-z0-9][a-z0-9-]+\.)+[a-z]{2,6}(?:/[^]\[\s()#\'"\!\*]*)?!',$this->getText(),$matches)) 97 | { 98 | foreach($matches[0] as $uri) 99 | { 100 | if (!isset($seenlinks[$uri])) 101 | $this->links[] = new SblamURI($uri);//,'label'=>NULL); 102 | } 103 | } 104 | 105 | // ignore links pointing to the site itself 106 | /*$headers = $this->getHeaders(); 107 | if (!empty($headers['HTTP_HOST'])) 108 | { 109 | $hosturi = new SblamURI('http://'.$headers['HTTP_HOST'].'/'); 110 | $domain = $hosturi->getDomain(); 111 | 112 | foreach($this->links as $key => $link) 113 | { 114 | if ($domain === $link->getDomain()) unset($this->links[$key]); 115 | } 116 | }*/ 117 | 118 | /* a referer whose host part does not contain HTTP_HOST is added as one more link */ if (($headers = $this->getHeaders()) && !empty($headers['HTTP_HOST']) && 119 | !empty($headers['HTTP_REFERER']) && preg_match('!(?:https?:)?//([^/?#]+)[^\s]*!i',$headers['HTTP_REFERER'],$r)) 120 | { 121 | if (false === strpos($r[1],$headers['HTTP_HOST'])) 122 | { 123 | $this->links[] = new SblamURI($r[0],$headers['HTTP_REFERER']); 124 | } 125 | } 126 | } 127 | return $this->links; 128 | } 129 | 130 | /** Returns the text content of the parsed document with all anchor elements removed. The value is cached while it is non-empty; getLinks() later replaces it via updateText(). */ function getText() 131 | { 132 | if (!$this->text) 133 | { 134 | if ($origdom = $this->getDOM()) 135 | { 136 | $doc = $origdom->documentElement->cloneNode(true); 137 | 138 | $temp = array(); 139 | foreach($doc->getElementsByTagName('a') as $a) $temp[] = $a; // live collections suck when removing things 140 | foreach($temp as $a) 141 | { 142 | $a->parentNode->removeChild($a); 143 | } 144 | $this->text = $doc->textContent; 145 | } 146 | } 147 | return $this->text; 148 | } 149 | 150 | private function updateText($text) 151 | { 152 | //d($text,"new text!"); 153 | $this->text = $text; 154 | } 155 | 156 | private $authorname,$authormail,$authoruri,$authorips; 157 | 158 | /** @param ip IP either single IP or array of IPs 
(proxy forwarded hosts). IPs should be in dot notation (11.22.33.44) 159 | */ 160 | function setAuthor($name,$mail=NULL,$uri=NULL,$ip=NULL) 161 | { 162 | if ($ip === NULL) {warn("No ip given for sblampost, taking from env!");$ip = $_SERVER['REMOTE_ADDR'];} 163 | /* a numeric IP is converted to dot notation */ else if (is_numeric($ip)) $ip = long2ip($ip); 164 | /* IPs are always stored as an array */ if (!is_array($ip)) $ip = array($ip); 165 | 166 | $this->authorname = $name; 167 | $this->authormail = $mail; 168 | $this->authoruri = $uri; 169 | $this->authorips = $ip; 170 | } 171 | 172 | function getAuthorName() {return $this->authorname;} 173 | function getAuthorEmail() {return $this->authormail;} 174 | function getAuthorURI() {return $this->authoruri !== 'http://'?$this->authoruri:NULL;} /** @todo should check if link looks valid. now just excludes one popular default */ 175 | /** @return string|NULL first stored IP, or NULL when none is stored */ function getAuthorIP() {return count($this->authorips)?$this->authorips[0]:NULL;} 176 | function getAuthorIPs() {return $this->authorips;} 177 | 178 | protected $signature; 179 | function setSignature($s) {$this->signature = $s;} 180 | function getSignature() {return $this->signature;} 181 | 182 | private $dates = array(); 183 | function getDates() {return $this->dates;} 184 | 185 | protected $posttime; 186 | function getPostTime() {return $this->posttime;} 187 | function setTime($t) {$this->posttime = $t;} 188 | 189 | protected $serverinstallid; 190 | function setInstallId($s) {$this->serverinstallid = $s;} 191 | function getInstallId() {return $this->serverinstallid;} 192 | } 193 | 194 | /** SblamPost built from the current request: each constructor argument is replaced by the $_POST value of that name when such a key exists, otherwise it is passed on unchanged. */ class SblamPostAuto extends SblamPost 195 | { 196 | function __construct($contentfield=NULL,$namefield=NULL,$mailfield=NULL,$urifield=NULL) 197 | { 198 | if ($contentfield && isset($_POST[$contentfield])) $contentfield = $_POST[$contentfield]; 199 | if ($namefield && isset($_POST[$namefield])) $namefield = $_POST[$namefield]; 200 | if ($mailfield && isset($_POST[$mailfield])) $mailfield = $_POST[$mailfield]; 201 | if ($urifield && isset($_POST[$urifield])) $urifield = $_POST[$urifield]; 202 | 
203 | parent::__construct($contentfield, $namefield, $mailfield, $urifield, ServerRequest::getRequestIPs()); 204 | } 205 | } 206 | 207 | -------------------------------------------------------------------------------- /class/asyncdns.php: -------------------------------------------------------------------------------- 1 | owner = $owner; 17 | } 18 | 19 | function destroy() 20 | { 21 | $this->owner = NULL; // break circular reference 22 | } 23 | 24 | protected function result($resconst,$dat = NULL) 25 | { 26 | $this->owner->setResult($resconst, $dat); 27 | } 28 | 29 | protected $buffer = ''; 30 | 31 | /** Appends the received bytes to the buffer and tries to parse the buffer as a DNS packet. A packet that is not a response (qr flag) or whose ID differs from the query's ID is reported as RES_ERROR and nothing else is reported for it. Otherwise an empty answer section yields RES_NOTFOUND (RES_ERROR when rcode is FORMERR) and a non-empty one RES_FOUND. Nothing is reported while the buffer does not parse. @param string $buf data read from the socket */ function read($buf) 32 | { 33 | $this->buffer .= $buf; 34 | 35 | $ans = new Net_DNS_Packet(); 36 | if ($ans->parse($this->buffer)) 37 | { 38 | if ($ans->header->qr != '1') $this->result(AsyncDNS::RES_ERROR, "Not an answer"); 39 | else if ($ans->header->id != $this->packet->header->id) $this->result(AsyncDNS::RES_ERROR, "Invalid ID"); 40 | /* chained with else: a rejected packet must not also be reported as FOUND/NOTFOUND */ else if ($ans->header->ancount <= 0) 41 | { 42 | if ($ans->header->rcode === 'FORMERR') 43 | { 44 | $this->result(AsyncDNS::RES_ERROR,"FormERR!?"); 45 | return; 46 | } 47 | else $this->result(AsyncDNS::RES_NOTFOUND, $ans); 48 | } 49 | else $this->result(AsyncDNS::RES_FOUND, $ans); 50 | } 51 | // unparseable, but maybe next time? 
52 | } 53 | 54 | function ping() 55 | { 56 | $this->owner->ping(); 57 | } 58 | } 59 | 60 | /** DNS query task over UDP: the constructor connects to the first reachable nameserver, sends the packet and registers the read/error/ping callbacks on the socket. Throws Exception when no nameserver can be reached or sending fails. */ class AsyncDNSTaskUDP extends AsyncDNSTask 61 | { 62 | protected $packet, $sock; 63 | 64 | function __construct(AsyncDNS $owner, Net_DNS_Packet $packet, $nextping = NULL) 65 | { 66 | parent::__construct($owner); 67 | 68 | $this->packet = $packet; 69 | 70 | if (!($sock = $this->connect(AsyncDNS::getNameservers()))) throw new Exception("Nameservers down"); 71 | 72 | if (!$sock->send($packet->data())) throw new Exception("Send error"); 73 | 74 | $sock->onRead(array($this,'read')); 75 | $sock->onError(array($this,'error')); 76 | $sock->onPing(array($this,'ping'), $nextping); 77 | $this->sock = $sock; 78 | } 79 | 80 | /** Destroys the socket and drops the references to socket, packet and owner. */ function destroy() 81 | { 82 | $this->sock->destroy(); 83 | $this->sock = NULL; 84 | $this->packet = NULL; 85 | parent::destroy(); 86 | } 87 | 88 | function error($msg) 89 | { 90 | $this->result(AsyncDNS::RES_ERROR,$msg); 91 | } 92 | 93 | /** Tries the given nameservers in order on port 53 and returns the first socket that could be created; failures are logged with warn(). @return AsyncSocketUDP|NULL NULL when every nameserver failed */ protected function connect(array $nameservers) 94 | { 95 | foreach($nameservers as $nameserver) 96 | { 97 | try { 98 | $s = new AsyncSocketUDP($nameserver, 53); 99 | // d("Connected to $nameserver"); 100 | return $s; 101 | } 102 | catch(ExAsyncSocket $e) 103 | {warn($e,"connection to $nameserver failed");} 104 | } 105 | //d($nameservers, "no nameservers available!"); 106 | return NULL; 107 | } 108 | } 109 | 110 | 111 | /** queries number of DNS servers asynchronously 112 | this class is used statically as a factory. instances are 'resolvers' holding particular queries. 
113 | */ 114 | class AsyncDNS 115 | { 116 | const RES_FOUND = 1; 117 | const RES_NOTFOUND = 2; 118 | const RES_ERROR = 4; 119 | 120 | static function supported() 121 | { 122 | return function_exists('socket_create'); 123 | } 124 | 125 | static protected $nameservers = array(); 126 | 127 | /** set up nameservers' IPs */ 128 | static function init(array $nameservers) 129 | { 130 | self::$nameservers = $nameservers; 131 | } 132 | 133 | /** Returns the configured nameservers in random order (the stored list itself is shuffled). */ static function getNameservers() 134 | { 135 | shuffle(self::$nameservers); 136 | return self::$nameservers; 137 | } 138 | 139 | static protected $resolvers = array(); 140 | 141 | /** query nameserver 142 | @return instance of AsyncDNS that will return actual result 143 | */ 144 | static function query($host, $type = 'A', $class = 'IN') 145 | { 146 | static $queries; 147 | 148 | /* a dotted-quad host is turned into a reverse (PTR) lookup under in-addr.arpa */ if (preg_match('/^(\d+)\.(\d+)\.(\d+)\.(\d+)$/', $host, $regs)) { 149 | $host = $regs[4].'.'.$regs[3].'.'.$regs[2].'.'.$regs[1].'.in-addr.arpa.'; 150 | $type = 'PTR'; 151 | } 152 | 153 | /* identical queries share one resolver, keyed by class, type and host */ $key = $class . $type . 
$host; 154 | 155 | if (!isset(self::$resolvers[$key])) 156 | { 157 | self::$resolvers[$key] = new AsyncDNS($host, $type, $class, $key); 158 | $queries++; if ($queries%10==0) AsyncSocket::poll(); // prevents queue from getting too large 159 | } 160 | return self::$resolvers[$key]; 161 | } 162 | 163 | 164 | protected $packet; 165 | protected $tasks = array(); 166 | 167 | protected $finaltime, $nexttry, $retries = 4; 168 | 169 | protected $resolverskey; 170 | 171 | /** each logical query can actually consist of more than one network-level query AKA task (because retries don't kill already-open sockets in case late reply comes) */ 172 | function __construct($host, $type, $class, $resolverskey = NULL) 173 | { 174 | $this->resolverskey = $resolverskey; 175 | $this->packet = new Net_DNS_Packet(); 176 | $this->packet->buildQuestion($host, $type, $class); 177 | 178 | $this->finaltime = microtime(true) + 6; // when to give up 179 | $this->nexttry = $this->finaltime + 100; // this is not a mistake, initial nexttry must be impossible to reach 180 | $this->newTask(); 181 | } 182 | 183 | /** Starts one more UDP task for the same packet and schedules the next retry 1.5 seconds from now. */ protected function newTask() 184 | { 185 | $this->nexttry = microtime(true) + 1.5; 186 | $this->tasks[] = new AsyncDNSTaskUDP($this, $this->packet, $this->nexttry); 187 | } 188 | 189 | protected $answer; 190 | protected $answerpositive; 191 | /** Receives the outcome of a task. RES_ERROR starts a new task while retries remain, otherwise the answer becomes false. RES_FOUND/RES_NOTFOUND store $dat as the answer and destroy all tasks. Everything is ignored once a positive (FOUND) answer has been stored. */ function setResult($resconst, $dat) 192 | { 193 | if ($this->answerpositive) return; // not interested 194 | 195 | if ($resconst == self::RES_ERROR) 196 | { 197 | //d($dat,"Reported error"); 198 | if ($this->retries > 0) 199 | { 200 | $this->retries--; 201 | $this->newTask(); 202 | } 203 | else {$this->answer = false;} 204 | } 205 | elseif ($resconst == self::RES_FOUND || $resconst == self::RES_NOTFOUND) 206 | { 207 | $this->answerpositive = ($resconst == self::RES_FOUND); 208 | $this->answer = $dat; 209 | foreach($this->tasks as $task) 210 | { 211 | $task->destroy(); 212 | } 213 | $this->tasks = array(); 214 | } 215 | } 216 | 217 | /** @return 0 on 
not-found, false on error, str or array on success. */ 218 | /* NOTE(review): for a not-found answer this code appears to return an empty array rather than 0, and NULL is possible when $blocking is false - confirm against callers */ function getResult($blocking = true) 219 | { 220 | $res = $this->getRawResult($blocking); 221 | 222 | if ($res) 223 | { 224 | $out = array(); 225 | foreach($res->answer as $rr) 226 | { 227 | if ($rr instanceof Net_DNS_RR_A) $out[] = $rr->address; 228 | /* NOTE(review): this early return skips the resolver-cache cleanup below - confirm that is intended */ elseif ($rr instanceof Net_DNS_RR_PTR) return $rr->ptrdname; 229 | elseif ($rr instanceof Net_DNS_RR_NS) $out[] = $rr->nsdname; 230 | elseif ($rr instanceof Net_DNS_RR_CNAME) 231 | { 232 | $temp = gethostbynamel($rr->cname); 233 | if ($temp) $out = array_merge($out,$temp); 234 | } 235 | else 236 | { 237 | warn($rr,"Unusable record type"); 238 | } 239 | } 240 | 241 | if ($this->resolverskey) unset(self::$resolvers[$this->resolverskey]); 242 | return $out; 243 | } 244 | return $res; 245 | } 246 | 247 | /** Starts a retry task when no answer has been stored yet, retries remain and the scheduled retry time has passed. */ function ping() 248 | { 249 | if ($this->answer === NULL && $this->retries > 0 && microtime(true) > $this->nexttry) 250 | { 251 | $this->retries--; 252 | $this->newTask(); 253 | } 254 | } 255 | 256 | /** Returns the stored answer if there is one; otherwise polls sockets (once when not blocking, repeatedly until an answer arrives or the give-up time passes when blocking). A blocking call that ends without an answer stores and returns false. Throws Exception when poll() reports failure. */ function getRawResult($blocking) 257 | { 258 | if ($this->answer !== NULL) {return $this->answer;} 259 | 260 | do 261 | { 262 | $sleeptime = min(($this->retries?$this->nexttry:$this->finaltime),$this->finaltime) - microtime(true); 263 | 264 | if (!AsyncDNSTask::poll(max(0,$sleeptime + 0.1))) throw new Exception("Empty socket list!?"); 265 | $this->ping(); 266 | } 267 | while(($this->answer === NULL) && $blocking && $this->finaltime > microtime(true)); 268 | 269 | if (!$blocking || $this->answer !== NULL) return $this->answer; 270 | 271 | return $this->answer = false; 272 | } 273 | 274 | } 275 | -------------------------------------------------------------------------------- /tests/bayes.php: -------------------------------------------------------------------------------- 1 | add = !empty($settings['add']); 15 | $tableprefix = !isset($settings['prefix'])?'bayes':$settings['prefix']; 16 | $ignorefile = 
!isset($settings['ignore'])?'data/bayesignore.txt':$settings['ignore']; 17 | 18 | $this->db = new BayesBase($this->services->getDB(), $tableprefix, $ignorefile, $this->add ? 0.2 : 0); // FIXME: hardcoded value (this comment used to say 0.3, the code passes 0.2) 19 | } 20 | 21 | /** Adds the post to the Bayesian database when $force is set, or when learning is enabled ($this->add), abs($score) > 1.2 and $cert > 0.85. Unforced learning is skipped when apc_add() exists and the 'spambayesaddlock' entry cannot be added, or when the first load average value exceeds 1. The post is added as spam when $score > 0. */ function reportResult(ISblamPost $p, $score, $cert, $force=false) 22 | { 23 | if ($force || ($this->add && abs($score) > 1.2 && $cert > 0.85)) 24 | { 25 | if (!$force) 26 | { 27 | if (function_exists('apc_add') && !apc_add('spambayesaddlock',1,1)) 28 | { 29 | return; 30 | } 31 | 32 | $load = sys_getloadavg(); 33 | if ($load[0]>1) 34 | { 35 | return; 36 | } 37 | } 38 | 39 | $p->bayesadded = 1; 40 | $this->addPost($p, $score > 0); 41 | } 42 | } 43 | 44 | /** Builds pseudo-words from the request headers: "NAME: value" for every HTTP_* header not excluded by the pattern below, the user agent with each run of digits and dots cut down to its first digit, and that user agent concatenated with other header values (a 'no-...' placeholder stands in for a missing header). @return array list of strings */ private function extractWordsFromHeaders(ISblamPost $p) { 45 | $words = array(); 46 | 47 | $headers = $p->getHeaders(); 48 | foreach($headers as $name => $val) { 49 | if (!preg_match('/^HTTP_(?!REFERER|HOST|ORIGIN|CACHE|CONNECTION|X_FORWARD|X_REAL|X_CLIENT|CONTENT|PRAGMA|ACCEPT)/', $name)) continue; 50 | $words[] = "$name: $val"; 51 | } 52 | 53 | $ua = isset($headers['HTTP_USER_AGENT']) ? preg_replace('/(\d)[.\d]+/','\\1', $headers['HTTP_USER_AGENT']) : 'no-ua'; 54 | $words[] = $ua; 55 | 56 | $words[] = $ua . (isset($headers['HTTP_ACCEPT']) ? $headers['HTTP_ACCEPT'] : 'no-a'); 57 | $words[] = $ua . (isset($headers['HTTP_ACCEPT_LANGUAGE']) ? $headers['HTTP_ACCEPT_LANGUAGE'] : 'no-al'); 58 | $words[] = $ua . (isset($headers['HTTP_ACCEPT_ENCODING']) ? $headers['HTTP_ACCEPT_ENCODING'] : 'no-ae'); 59 | $words[] = $ua . (isset($headers['HTTP_ACCEPT_CHARSET']) ? $headers['HTTP_ACCEPT_CHARSET'] : 'no-ac'); 60 | $words[] = $ua . (isset($headers['HTTP_TE']) ? $headers['HTTP_TE'] : 'no-te') . 61 | (isset($headers['HTTP_PRAGMA']) ? $headers['HTTP_PRAGMA'] : 'no-p'). 62 | (isset($headers['HTTP_CACHE_CONTROL']) ? $headers['HTTP_CACHE_CONTROL'] : 'no-cc'); 63 | $words[] = $ua . (isset($headers['HTTP_CONNECTION']) ? $headers['HTTP_CONNECTION'] : 'no-c') . 64 | (isset($headers['HTTP_EXPECT']) ? 
$headers['HTTP_EXPECT'] : 'no-e') . 65 | (isset($headers['HTTP_VIA']) ? $headers['HTTP_VIA'] : 'no-v'); 66 | 67 | return $words; 68 | } 69 | 70 | /** Scores a post with the Bayesian database in up to three runs: words from headers and content, the same words plus the signature (used only when it scores higher), and words from link labels (used only when they are spammier than the content). $spammiestword ends up as the word with the highest per-word score (4th value returned by testWords()) seen in any run; it is included in the returned description. @return array|NULL array(score, certainty, description), or NULL when the result is too weak (abs(score) <= 0.1 or certainty <= 0.2) */ function testPost(ISblamPost $p) 71 | { 72 | $spammiestword = ''; $spammiestwordnudge = 0; 73 | 74 | // test usual post content 75 | $postwords = array_merge($this->extractWordsFromHeaders($p), $this->extractWordsFromPost($p)); 76 | 77 | list($score,$cert, $newword, $newscore) = $this->db->testWords($postwords); 78 | /* remember the new maximum so that later runs only replace the word when they beat it */ if ($newscore > $spammiestwordnudge) {$spammiestwordnudge = $newscore; $spammiestword = $newword;} 79 | 80 | // test post content with signature 81 | if ($sig = $p->getSignature()) 82 | { 83 | $words = array_merge($postwords, self::extractWords($sig, $this->db->ignore)); 84 | list($score3, $cert3, $newword, $newscore) = $this->db->testWords($words); 85 | if ($newscore > $spammiestwordnudge) {$spammiestwordnudge = $newscore; $spammiestword = $newword;} 86 | 87 | // and use signature only if it's spammy 88 | if ($score3 > $score) 89 | { 90 | //d("bayes: signature is spammy"); 91 | $score = ($score3*2 + $score)/3 + 0.1; 92 | $cert = ($cert3*2 + $cert)/3; 93 | } 94 | } 95 | 96 | list($score2,$cert2, $newword, $newscore) = $this->db->testWords($this->extractWordsFromLinks($p)); 97 | if ($newscore > $spammiestwordnudge) {$spammiestwordnudge = $newscore; $spammiestword = $newword;} 98 | 99 | // if link labels are spammier, use that score (protects against stuffing innocent content + spammy link) 100 | if (count($postwords) > 2 && $cert2 > 0.5 && $score2 > 0.4 && abs($cert2*$score2) > abs($cert*$score)) 101 | { 102 | //d("bayes: link labels are spammier"); 103 | $score = ($score2*2 + $score)/3 + 0.1; 104 | $cert = ($cert2*2 + $cert)/3; 105 | } 106 | 107 | 108 | /* compress extreme scores: the part beyond +/-0.8 is halved, then the part beyond +/-1.2 is cut to a third */ if ($score < -0.8) $score = ($score+0.8)/2-0.8; 109 | elseif ($score > 0.8) $score = ($score-0.8)/2+0.8; 110 | if ($score < -1.2) $score = ($score+1.2)/3-1.2; 111 | elseif ($score > 1.2) $score = ($score-1.2)/3+1.2; 112 | 113 | $scorecert = 
round((abs($score*$cert) + abs($score))/2,1); 114 | 115 | /* ham scores are scaled down to 80% */ if ($score < 0) $score *= 0.8; 116 | 117 | if (abs($score) > 0.1 && $cert > 0.2) return array($score, ($cert + self::CERTAINITY_NORMAL)/2 , $score>0?"Bayesian filter spam ($scorecert $spammiestword)":"Bayesian filter ham ($scorecert $spammiestword)"); 118 | return NULL; 119 | } 120 | 121 | /** Adds the words extracted from the post's headers and content to the database as spam or ham and returns the result of the database's add(). */ function addPost(ISblamPost $p, $isspam) 122 | { 123 | /** @todo add signature as well, but only if its spammy */ 124 | return $this->db->add(array_merge($this->extractWordsFromHeaders($p), $this->extractWordsFromPost($p)), $isspam); 125 | } 126 | 127 | /** Adds the words extracted from plain text to the database; $howmuch is passed through to the database's add(). */ function addText($txt, $isspam, $howmuch=1) 128 | { 129 | $this->db->add(self::extractWords($txt, $this->db->ignore),$isspam,$howmuch); 130 | } 131 | 132 | /** Tests the words extracted from plain text and returns the raw result of the database's testWords(). */ function testText($txt) 133 | { 134 | return $this->db->testWords(self::extractWords($txt, $this->db->ignore)); 135 | } 136 | 137 | /** Extracts words from the labels of all links of the post, joined with spaces. */ protected function extractWordsFromLinks(ISblamPost $p) 138 | { 139 | // test link labels specifically 140 | $labels = ''; 141 | foreach($p->getLinks() as $link) 142 | { 143 | $labels .= ' '.$link->getLabel(); 144 | } 145 | 146 | return self::extractWords($labels, $this->db->ignore); 147 | } 148 | 149 | protected function extractWordsFromPost(ISblamPost $p) 150 | { 151 | // get both raw and stripped text, to find more phrases (word count doesn't matter) 152 | $txt = $p->getRawContent().' '.rawurldecode($p->getText()).' '.$p->getAuthorName().' '.$p->getAuthorEmail().' 
'.$p->getAuthorURI(); 153 | return self::extractWords($txt, $this->db->ignore); 154 | } 155 | 156 | /** Normalises text and splits it into lower-case tokens: tabs and line breaks become spaces, control/format/unassigned/private-use characters and modifier marks are removed, runs of three "other letter"/"other symbol" characters are separated by spaces, and the text is then split on non-alphanumeric runs (together with 1-2 character fragments between them). @return array list of tokens, without empty strings */ protected static function splitStringUnicode($words) 157 | { 158 | $words = preg_replace(array("![\t\n\r]+!", // all other low ascii characters are removed 159 | '![\p{Cc}\p{Cf}\p{Cn}\p{Co}\p{Mn}\p{Me}\p{Lm}]!u', // remove all modifiers, private/reserved chars 160 | '![\p{Lo}\p{So}]{3}!u' // split CJK characters (unfortunately preg_split is used to remove 1- and 2-letter 'words', so they're made groups of 3) 161 | ),array(' ','',' \0 '),$words); 162 | 163 | 164 | if (function_exists('mb_strtolower')) 165 | { 166 | $words = mb_strtolower($words,'UTF-8'); 167 | } 168 | else 169 | { 170 | $words = strtolower($words); 171 | } 172 | 173 | /* limit -1 means "no limit"; passing NULL for this int parameter is deprecated since PHP 8.1 */ return preg_split("![^a-z0-9\pN\pL]+(?:..?[^a-z0-9\pN\pL]+)*!u",$words,-1,PREG_SPLIT_NO_EMPTY); 174 | } 175 | 176 | /** Turns text into a de-duplicated list of words and two-word phrases. Every token and every pair "previous current" is collected, with '^' standing in as the token before the first one; entries shorter than 2 bytes, entries without any letter and entries present as keys in $ignore are dropped. @param string $words text to analyse @param array $ignore words to skip, given as array keys @return array list of words and phrases */ static function extractWords($words, array $ignore = array()) 177 | { 178 | $words = self::splitStringUnicode($words); 179 | 180 | $c = count($words); if (!$c) return array(); 181 | 182 | array_unshift($words, '^'); 183 | 184 | $tmp = array(); 185 | /* after the unshift the real tokens sit at indexes 1..$c, so the bound must be inclusive or the last token is lost */ for($i=1;$i<=$c;$i++) 186 | { 187 | $tmp[ $words[$i] ] = true; 188 | $tmp[ $words[$i-1].' '.$words[$i] ] = true; 189 | } 190 | $words = NULL; 191 | 192 | $final = array(); 193 | foreach($tmp as $v => $ignoredword) 194 | { 195 | if (strlen($v) >= 2 && preg_match('![a-z\pL]!u',$v) && !isset($ignore[$v])) 196 | { 197 | $final[] = $v; 198 | } 199 | } 200 | return $final; 201 | } 202 | 203 | static function info() 204 | { 205 | return array( 206 | 'name'=>'Bayesian Filter', 207 | 'desc'=>'Auto-learning filter judges by looking for words and phrases seen in spam/ham', 208 | 'remote'=>false, 209 | ); 210 | } 211 | } 212 | 213 | --------------------------------------------------------------------------------