├── .htaccess
├── LICENSE.md
├── README.md
├── api.php
├── cron.php
├── inc
├── .htaccess
├── crawl.php
├── db.php
├── index.php
├── info.php
├── instance.php
├── query.php
├── read.php
├── rest.php
├── skin.php
├── user.php
└── utilities.php
├── index.php
├── instance.php
├── join.php
├── privacy.php
└── site
├── .htaccess
├── configuration.php
├── feeds
└── .htaccess
├── files
├── .htaccess
├── elefant1.jpg
├── elefant1.png
└── style20230218.css
├── profiles
└── .htaccess
└── rejected
└── .htaccess
/.htaccess:
--------------------------------------------------------------------------------
1 | DirectoryIndex index.php
2 |
3 | ExpiresActive On
4 | ExpiresDefault "access plus 1 seconds"
5 | ExpiresByType image/gif "access plus 700000 seconds"
6 | ExpiresByType image/jpeg "access plus 700000 seconds"
7 | ExpiresByType image/png "access plus 700000 seconds"
8 | ExpiresByType audio/mp3 "access plus 700000 seconds"
9 | ExpiresByType text/css "access plus 700000 seconds"
10 | ExpiresByType text/javascript "access plus 700000 seconds"
11 | ExpiresByType application/javascript "access plus 700000 seconds"
12 |
13 |
Startup (%0d sec)',$ende - $start); 41 | 42 | 43 | if ($userlabel && isset($_GET['submitjoin'])) 44 | { 45 | $tfDebug = ''; 46 | $msg = addUser($userlabel); 47 | 48 | if (stristr($msg,'class="error"')) 49 | $echo .= $msg.$tfDebug; 50 | else 51 | { 52 | $echo .= '
ok'; 53 | 54 | 55 | 56 | $echo .= $tfDebug; 57 | 58 | } 59 | } 60 | elseif ($userlabel && isset($_GET['submitaddinstance'])) 61 | { 62 | $tfDebug = ''; 63 | $msg = addInstance($userlabel); 64 | 65 | if (stristr($msg,'class="error"')) 66 | $echo .= $msg.$tfDebug; 67 | else 68 | { 69 | $echo .= '
ok'; 70 | 71 | 72 | 73 | $echo .= $tfDebug; 74 | 75 | } 76 | $db = init(true); 77 | $echo .= '
'.$sql; 115 | 116 | // delete bak file 117 | @unlink($tfRoot.'/site/bak/'.$label.'.json'); 118 | @unlink($tfRoot.'/site/bak/'.$label.'.rss'); 119 | // delete profile files 120 | @unlink($tfRoot.'/site/profile/'.$label.'.json'); 121 | @unlink($tfRoot.'/site/profile/'.$label.'.html'); 122 | 123 | 124 | } 125 | elseif (isset($_GET['submitdvacuum'])) 126 | { 127 | $db = init(); 128 | $echo .= sqlTable($db,"VACUUM;"); 129 | $db->close(); 130 | $db = initQuery(); 131 | $echo .= sqlTable($db,"VACUUM;"); 132 | $db->close(); 133 | } 134 | elseif (isset($_GET['submittruncate'])) 135 | { 136 | //$db = init(); 137 | //$echo .= sqlTable($db,"PRAGMA wal_checkpoint(TRUNCATE);"); 138 | //$db->close(); 139 | 140 | $echo .= cleanDb(); 141 | } 142 | else 143 | { 144 | $echo .= '
index.db '.round(filesize($tfRoot.'/site/index.db')/1024/1024).' MB';
145 | $echo .= '
queries.db '.round(filesize($tfRoot.'/site/queries.db')/1024/1024).' MB';
146 |
147 | $ende = $start = time();
148 |
149 | $tfDebug = '';
150 |
151 | crawl();
152 |
153 | $echo .= $tfDebug;
154 |
155 |
156 | $ende = time();
157 | $echo .= '
'.sprintf('%0d',$ende - $start).' seconds'; 158 | 159 | $tfDebug = ''; 160 | 161 | index(); 162 | 163 | $echo .= $tfDebug; 164 | 165 | $ende = time(); 166 | $echo .= '
'.sprintf('%0d',$ende - $start).' seconds'; 167 | 168 | $db = init(); 169 | //$db->exec('DELETE FROM posts'); 170 | 171 | $echo .= '
'.sprintf('%0d',$ende - $start).' seconds'; 176 | 177 | $echo .= '
'.sprintf('%0d',$ende - $start).' seconds'; 182 | 183 | if ($userlabel && isset($_GET['submitindex'])) 184 | { 185 | 186 | $db2 = initQueries(true); 187 | $echo .= '
'.sprintf('%0d',$ende - $start).' seconds'; 193 | 194 | $echo .= '
'.sprintf('%0d',$ende - $start).' seconds'; 199 | 200 | $echo .= '
'.sprintf('%0d',$ende - $start).' seconds'; 206 | 207 | $echo .= '
'.sprintf('%0d',$ende - $start).' seconds'; 212 | 213 | if (rand(0,1000)<10) popularQueries(true); 214 | if (rand(0,1000)>990) trendingWords(true); 215 | 216 | $echo .= '
'.sprintf('%0d',$ende - $start).' seconds';
217 | }
218 | file_put_contents('site/job.txt',$echo);
219 | }
220 |
221 | echo $echo;
222 |
223 | ?>
224 |
225 |
226 |
227 |
--------------------------------------------------------------------------------
/inc/.htaccess:
--------------------------------------------------------------------------------
1 |
crawl empty profile '.$label); 44 | 45 | // common error 46 | if (strstr($label,'https://') or strstr($label,'http://')) 47 | $journal []= "DELETE FROM users WHERE label = '$label'; "; 48 | else 49 | $priority = time()+86400; 50 | $journal []= "UPDATE users SET priority = $priority WHERE label = '".$label."' ; "; 51 | 52 | continue; } 53 | 54 | $localpath = $tfRoot.'/site/feeds/'.$label.'.json'; 55 | 56 | $url = getOutboxLink($profile); 57 | 58 | debugLog(' url '.$url.''); 59 | 60 | if ($url) 61 | { 62 | if (substr($url,-4,4) == '.rss') $localpath = $tfRoot.'/site/feeds/'.$profile['label'].'.rss'; 63 | $shoppinglist[$url]=$localpath; 64 | if (!file_exists($localpath)) touch($localpath); 65 | $fc++; 66 | 67 | } 68 | } 69 | 70 | $journal []= 'COMMIT; '; 71 | $sql = join(PHP_EOL,$journal); 72 | if (count($journal)>2) 73 | { 74 | $db = init(); 75 | if ($db) 76 | { 77 | @$db->exec($sql); 78 | $db->close(); 79 | } 80 | } 81 | 82 | $ende = time(); 83 | 84 | 85 | debugLog('
crawl profiles: '.$pc. ' feeds: '.$fc.''); 86 | debugLog(sprintf(' (%0d sec)',$ende - $start)); 87 | 88 | getRemoteFiles($shoppinglist); 89 | 90 | } 91 | 92 | -------------------------------------------------------------------------------- /inc/db.php: -------------------------------------------------------------------------------- 1 | init errror index.db '.$err->getMessage(); return null; 33 | } 34 | 35 | if (!$db->busyTimeout(5000)) // sql tries to connect during 5000ms 36 | { 37 | echo '
db busy errror '.$db->lastErrorMsg(); return null; 38 | } 39 | 40 | $db->createFunction('score', 'score'); // must be also for readonly! 41 | $db->createFunction('time2date', 'time2date'); // must be also for readonly! 42 | 43 | 44 | 45 | if ($readonly) return $db; 46 | 47 | if (!$db->exec('CREATE VIRTUAL TABLE IF NOT EXISTS posts USING fts3(link, user, description, pubdate, image, media, soundex, followers, indexdate)')) 48 | { 49 | echo '
create table posts error '.$db->lastErrorMsg(); 50 | } 51 | 52 | 53 | 54 | if (!$db->exec('CREATE TABLE IF NOT EXISTS users (user, host, label, id, priority)')) 55 | { 56 | echo '
create table users error '.$db->lastErrorMsg(); 57 | } 58 | 59 | 60 | if (!$db->exec('CREATE INDEX IF NOT EXISTS users_label ON users (label)')) 61 | { 62 | echo '
index table users error '.$db->lastErrorMsg(); 63 | } 64 | if (!$db->exec('CREATE INDEX IF NOT EXISTS users_priority ON users (priority)')) 65 | { 66 | echo '
index table users error '.$db->lastErrorMsg(); 67 | } 68 | /* 69 | if (!$db->exec('CREATE INDEX IF NOT EXISTS posts_indexdate ON posts (indexdate)')) 70 | { 71 | echo '
index table posts error '.$db->lastErrorMsg(); 72 | } 73 | */ 74 | 75 | return $db; 76 | 77 | } 78 | 79 | function initQueries($readonly = false) 80 | { 81 | global $tfRoot; 82 | $path = $tfRoot.'/site/queries.db'; 83 | 84 | if (!file_exists($path)) $readonly = false; 85 | 86 | try 87 | { 88 | if ($readonly) 89 | $db = new SQLite3($path, SQLITE3_OPEN_READONLY); 90 | else 91 | $db = new SQLite3($path); 92 | } 93 | catch (Exception $err) 94 | { 95 | echo '
init errror queries.db '.$err->getMessage(); return null; 96 | } 97 | 98 | if (!$db->busyTimeout(5000)) // sql tries to connect during 5000ms 99 | { 100 | echo '
db busy errror '.$db->lastErrorMsg(); return null; 101 | } 102 | 103 | if ($readonly) return $db; 104 | 105 | if (!$db->exec('CREATE TABLE IF NOT EXISTS queries (query, date, results)')) 106 | { 107 | echo '
create table queries error '.$db->lastErrorMsg(); 108 | } 109 | 110 | if (!$db->exec('CREATE INDEX IF NOT EXISTS queries_query ON queries (query)')) 111 | { 112 | echo '
create table queries error '.$db->lastErrorMsg(); 113 | } 114 | 115 | 116 | return $db; 117 | 118 | } 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /inc/index.php: -------------------------------------------------------------------------------- 1 | index '.$usr.''); 20 | else debugLog('
index '); 21 | 22 | 23 | if ($usr) $files = glob($tfRoot.'/site/feeds/'.$usr.'.*'); 24 | else $files = glob($tfRoot.'/site/feeds/*.*'); 25 | 26 | $okfiles = array(); 27 | 28 | $labels = array(); 29 | $deletelist = array(); 30 | $fc = 0; 31 | $pc = 0; 32 | $uc = 0; 33 | $prioritystring = ''; 34 | 35 | $journal = array(); 36 | $journal []= 'BEGIN TRANSACTION; '; 37 | 38 | 39 | foreach($files as $file) 40 | { 41 | $start = time(); 42 | 43 | debugLog('
file '.basename($file) ); 44 | 45 | $fc++; 46 | $format = ''; 47 | if (substr(basename($file),-5)=='.json') 48 | { 49 | $format = 'json'; 50 | $label = preg_replace('/\.json$/','',basename($file)); 51 | } 52 | if (substr(basename($file),-4)=='.rss') 53 | { 54 | $format = 'rss'; 55 | $label = preg_replace('/\.rss$/','',basename($file)); 56 | } 57 | if (!$format) continue; 58 | 59 | preg_match('/@([a-zA-Z0-9_]+)@([a-zA-Z0-9_-]+\.[a-zA-Z0-9.-_]+)/',$label,$matches); // @user@example.com // can have two . in domain! 60 | if (count($matches)<3) continue; 61 | $host = $matches[2]; 62 | $user = $matches[1]; 63 | 64 | $bak = $tfRoot.'/site/bak/'.basename($file); 65 | if (file_exists($bak)) 66 | { 67 | if (@filesize($file) == @filesize($bak)) // did not change 68 | { 69 | $uc++; 70 | $priority = time()+86400; 71 | $journal []= "UPDATE users SET priority = $priority WHERE label = '".$label."' ; "; 72 | // delete feed file 73 | @unlink($file); 74 | // delete bak file 75 | @unlink($tfRoot.'/site/bak/'.$label.'.json'); 76 | @unlink($tfRoot.'/site/bak/'.$label.'.rss'); 77 | } 78 | 79 | } 80 | 81 | $profile = getProfile($label); 82 | $followers = $profile['followercount']; 83 | $magic = validUser($profile); 84 | 85 | if (!@$magic) 86 | { 87 | debugLog('
Invalid user '.$label.''); 88 | // we do not delete the user immediately, because the profile page may also have been on error. 89 | // we wait until there is no new post 90 | 91 | $sql = "SELECT count(link) as c FROM posts where user = '$label';"; 92 | $db2 = init(true); 93 | if ($db2) 94 | { 95 | $up = $db2->querySingle($sql); 96 | $db2->close(); 97 | } 98 | if (!$up) 99 | { 100 | $journal []= "DELETE FROM users WHERE label = '$label'; "; 101 | @unlink($file); 102 | } 103 | 104 | @rename($file,$tfRoot.'/site/rejected/invaliduser-'.basename($file)); 105 | $priority = time()+86400; 106 | $journal []= "UPDATE users SET priority = $priority WHERE label = '".$label."' ; "; 107 | $ende = time(); 108 | debugLog(sprintf(' invaliduser (%0d sec)',$ende - $start)); 109 | continue; 110 | } 111 | 112 | 113 | 114 | if (!file_exists($file)) 115 | { 116 | debugLog(' nofile '); 117 | $ende = time(); 118 | debugLog(sprintf(' (%0d sec)',$ende - $start)); 119 | continue; 120 | 121 | } 122 | $s = file_get_contents($file); 123 | debugLog(' read '.$format); 124 | 125 | if (!$s) 126 | { 127 | if ($usr) debugLog('rejected: '.basename($file)); 128 | 129 | @rename($file,$tfRoot.'/site/rejected/empty-'.basename($file)); 130 | $priority = time()+86400; 131 | $journal []= "UPDATE users SET priority = $priority WHERE label = '".$label."' ; "; 132 | debugLog(' empty '); 133 | continue; 134 | } 135 | 136 | if ($usr) debugLog(expandableSnippet($s)); 137 | 138 | if ($format == 'json') $feed = readJSONfeed($s,$label, $host, $user,$file); 139 | else 140 | { 141 | $feed = readRSSFeed($s,$file); 142 | if (!count($feed)) 143 | { 144 | // it didn't work. It may be that it was a gotosocial site, so we try the other url 145 | // it's a hack using ressources. should we add the URL to the database? 
146 | $localpath = $tfRoot.'/site/feeds/'.$label.'.rss'; 147 | $url = 'https://'.$host.'/@'.$user.'/feed.rss'; 148 | $shoppinglist = array($url=>$localpath); 149 | getRemoteFiles($shoppinglist); 150 | 151 | if ($usr) 152 | { 153 | $s = getRemoteString($url); 154 | file_put_contents($localpath,$s); 155 | $feed = readRSSFeed($s,$file); 156 | 157 | debugLog('
Trying other URL '.$url.' '.count($feed)); 158 | debugLog(expandableSnippet(json_encode($feed))); 159 | 160 | } 161 | 162 | } 163 | } 164 | 165 | debugLog(' '.count($feed)); 166 | 167 | $generalfound = 0; 168 | $oldposts = 0; 169 | 170 | 171 | $minpubdate = 91676409379; 172 | $maxpubdate = 0; 173 | $postcount = 0; 174 | 175 | // we get a list of current posts. if some more recent than the oldest post may have disappeared, we can discard them 176 | // we cannot take link as orderes, because some instances do use md5 as id. 177 | // so we use pubdate and assume that second is unique for users that post 178 | 179 | 180 | 181 | $sql = "SELECT pubdate, link FROM posts where user = '$label';"; 182 | $db2 = init(true); 183 | 184 | if ($db2) 185 | { 186 | $res = $db2->query($sql); 187 | $currentlinks = array(); 188 | while ($d = $res->fetchArray(SQLITE3_ASSOC)) 189 | { 190 | $currentlinks[$d['link']] = $d['pubdate']; 191 | } 192 | $db2->close(); 193 | } 194 | 195 | // we get the oldest post 196 | 197 | $oldestid = false; 198 | if (count($feed)) 199 | { 200 | $post = end($feed); 201 | $oldestid = $post['pubdate']; 202 | 203 | foreach($currentlinks as $k=>$v) 204 | { 205 | if ($v < $oldestid) unset($currentlinks[$k]); 206 | } 207 | } 208 | else 209 | { 210 | // we ignore 211 | $currentlinks = array(); 212 | } 213 | $k=0; 214 | foreach($feed as $post) 215 | { 216 | debugLog(' '.$k.''); $k++; 217 | 218 | $postcount++; 219 | $found = false; 220 | 221 | $link = SQLite3::escapeString($post['link']); 222 | $edited = @$post['edited_at']; 223 | $minpubdate = min($minpubdate,$post['pd']); 224 | $maxpubdate = max($maxpubdate,$post['pd']); 225 | $pubdate = $post['pubdate']; 226 | $datelimit = date('Y-m-d',strtotime('-3 month', time())); 227 | if ($pubdate < $datelimit) continue; // too old 228 | $indexdate = date('Y-m-d H:i:s'); 229 | 230 | // if the post exists and is not edited, we go no further 231 | if (array_key_exists($link,$currentlinks)) 232 | { 233 | if ($edited) 234 | { 235 | 
$link = $currentlinks[$pubdate]; 236 | $journal []= "DELETE FROM posts WHERE link = '$link' ;"; 237 | } 238 | else 239 | { 240 | $found = true; 241 | $oldposts++; 242 | continue; 243 | } 244 | } 245 | unset($currentlinks[$link]); 246 | 247 | $avatar = SQLite3::escapeString($post['avatar']); 248 | 249 | 250 | $description = handleMentions($post['description']); 251 | 252 | $description = encodeSpacelessLanguage($description); 253 | 254 | if(is_array($post['medias'])) 255 | $media = join('::',$post['medias']); 256 | else 257 | $media = ''; 258 | $media = encodeSpacelessLanguage($media); 259 | 260 | $soundex = SQLite3::escapeString(soundexLong($description.' '.$label.' '.$media.' ')); 261 | $media = SQLite3::escapeString($media); 262 | $description = SQLite3::escapeString($description); 263 | 264 | 265 | 266 | $db2->close(); 267 | $sql = ""; 268 | if (!$found || true) // we think the posts are immutable 269 | { 270 | $sql = $journal []= "INSERT INTO posts (link, user, description, pubdate, image, media, soundex, followers, indexdate) VALUES ('".$link."','".$label."','".$description."','".$pubdate."','".$avatar."','".$media."','".$soundex."', ".max(intval($followers),1).",'".$indexdate."'); "; 271 | $generalfound++; 272 | $pc++; 273 | } 274 | 275 | } 276 | 277 | // remove posts that have been deleted. 
278 | foreach($currentlinks as $k=>$v) 279 | { 280 | $localpath = $tfRoot.'/site/deleted/'.bin2hex($k); 281 | $deletelist[$k] = $localpath; 282 | 283 | } 284 | 285 | // we calculated when the next post is likely to happen, but we wait at most 1 day 286 | $period = round(($maxpubdate - $minpubdate ) / ($postcount+1)); 287 | if (!$generalfound) $period *= 2; 288 | if ($postcount == 0) $period = 86400; 289 | if ($postcount == 1) $period = time() - $maxpubdate; 290 | if (!$generalfound) $period *= 2; 291 | $period *= 2; // safety, if we ask too early we risk not to have a message 292 | $period = min($period,86400); 293 | $period = max($period,300); 294 | $priority = time() + $period; 295 | $journal []= "UPDATE users SET priority = $priority WHERE label = '".$label."' ; "; 296 | 297 | $okfiles []= $file; 298 | 299 | $ende = time(); 300 | debugLog(sprintf(' (%0d sec)',$ende - $start)); 301 | 302 | 303 | } 304 | 305 | $journal []= 'COMMIT; '; 306 | 307 | $limit = date('Y-m-d',strtotime('-3 month', time())); 308 | if (rand(0,100)>-98) $journal []= "DELETE FROM posts WHERE pubdate < '".$limit."'; "; 309 | 310 | // remove all deleted files 311 | $files = glob($tfRoot.'/site/deleted/*'); 312 | foreach($files as $file) 313 | { 314 | $link = hex2bin(basename($file)); 315 | if (@filesize($file)<2000) 316 | { 317 | $journal []= "DELETE FROM posts WHERE link = '$link' ;"; 318 | debugLog('
deleted post '.$link.''); 319 | } 320 | @unlink($file); 321 | } 322 | 323 | $ende = $start = time(); 324 | 325 | if (count($journal)) 326 | { 327 | 328 | $db = init(); 329 | if ($db) 330 | { 331 | $q = join(PHP_EOL,$journal); 332 | if (!@$db->exec($q)) 333 | { 334 | debugLog('index error '.$db->lastErrorMsg().' '.$q); 335 | return; 336 | } 337 | $db->close(); 338 | } 339 | } 340 | 341 | foreach($okfiles as $file) 342 | { 343 | $bak = str_replace('/site/feeds/','/site/bak/',$file); 344 | if (file_exists($bak)) @unlink($bak); 345 | @rename($file,$bak); 346 | } 347 | 348 | $time = sprintf(' (%0d sec)',$ende - $start); 349 | 350 | debugLog('
index feeds: '.$fc. ' Unchanged: '.$uc. ' Posts: '.$pc.''.$time); 351 | 352 | getRemoteFiles($deletelist); 353 | 354 | } 355 | -------------------------------------------------------------------------------- /inc/info.php: -------------------------------------------------------------------------------- 1 | querySingle($sql); 23 | $sql = 'SELECT count(user) FROM users'; 24 | $users = $db->querySingle($sql); 25 | $db->close(); 26 | $sql = 'SELECT count(query) FROM queries'; 27 | $db = initQueries(true); 28 | $queries = $db->querySingle($sql); 29 | $db->close(); 30 | } 31 | 32 | return "$users users, $posts posts, $queries queries"; 33 | 34 | } 35 | 36 | function indexStatus() 37 | { 38 | $db = init(true); 39 | if ($db) 40 | { 41 | $sql = 'SELECT min(priority) FROM users WHERE priority > 0'; 42 | $minpriority = $db->querySingle($sql); 43 | $db->close(); 44 | 45 | $delay = round(($minpriority-time())/60); 46 | if (!$delay) return 'in time'; 47 | elseif ($delay == 1) return '1 minute ahead'; 48 | elseif ($delay == -1) return '1 minute behind'; 49 | elseif ($delay > 0) return $delay.' minutes ahead'; 50 | elseif ($delay < 0) return -$delay.' minutes behind'; 51 | } 52 | } 53 | 54 | 55 | function popularQueries($refresh = false) 56 | { 57 | 58 | global $tfRoot; 59 | $file = $tfRoot.'/site/cache/popularQueries.txt'; 60 | 61 | // if (file_exists($file) && time() - filemtime($file) < 3600) 62 | if (file_exists($file) && ! 
$refresh) 63 | { 64 | $s = file_get_contents($file); 65 | $result = json_decode($s,true); 66 | return $result; 67 | } 68 | 69 | $limit = date('Y-m-d',strtotime('-2 day', time())); 70 | $sql = "SELECT DISTINCT query, count(query) as c FROM queries WHERE results > 0 and date > '$limit' GROUP BY query ORDER BY c DESC limit 5"; 71 | 72 | $db = initQueries(true); 73 | if ($db) 74 | { 75 | $list = $db->query($sql); 76 | 77 | $result = array(); 78 | while ($d = $list->fetchArray(SQLITE3_ASSOC)) $result[]=$d; 79 | $db->close(); 80 | $s = json_encode($result); 81 | file_put_contents($file, $s); 82 | return $result; 83 | } 84 | return array(); 85 | 86 | 87 | 88 | } 89 | 90 | function trendingWords($refresh = false) 91 | { 92 | global $tfRoot; 93 | $file = $tfRoot.'/site/cache/trendingWords.txt'; 94 | 95 | if (file_exists($file) && ! $refresh) return file_get_contents($file); 96 | 97 | 98 | $today = date('Y-m-d'); 99 | $q = "SELECT description, followers, user FROM posts ORDER BY pubdate DESC limit 10000"; 100 | 101 | $db = init(); 102 | $users = array(); 103 | if ($db) 104 | { 105 | $list = $db->query($q); 106 | $descriptions = array(); 107 | if (!$list) return ''; 108 | $i = 0; 109 | while ($d = $list->fetchArray(SQLITE3_ASSOC)) 110 | { 111 | if (isset($users[$d['user']])) $users[$d['user']] -= 0.3; // divide by 2 with log 10 112 | else $users[$d['user']] = log10($d['followers']+1); 113 | $descriptions[$d['description']] = $users[$d['user']]; // count only once per user 114 | $i++; 115 | } 116 | $db->close(); 117 | 118 | $s = relatedSearches($descriptions,''); 119 | file_put_contents($file, $s); 120 | return $s; 121 | } 122 | } 123 | 124 | 125 | function relatedSearches($descriptions,$query) 126 | { 127 | 128 | 129 | $dict = array(); 130 | foreach($descriptions as $p=>$n) 131 | { 132 | // clean 133 | $p = str_replace('
/','',$p);
136 | $p = preg_replace('#https?://\S*#','',$p);
137 | $p = preg_replace('/[!-)\+-,\.\/:-@[-`\{-~]|\*|#|’|-/',' ',$p);
138 |
139 | // [!-) ascii 33-41
140 | // \. ascii 46
141 | // \/ ascii 47
142 | // :-@ ascii 58-64
143 | // [-` ascii 91-96
144 | // \{-~ ascii 123-126
145 |
146 | $pwords = array_unique(explode(' ',$p));
147 |
148 | foreach($pwords as $t)
149 | {
150 | if (!$t) continue;
151 | if (preg_match('#\d#',$t)) continue;
152 | $t = strtolower($t);
153 | if (!isset($dict[$t])) $dict[$t] = $n; else $dict[$t]+=$n;
154 | }
155 | }
156 |
157 | $db = init();
158 | $doccount = $db->querysingle('SELECT count(link) FROM posts');
159 | $db->exec("CREATE VIRTUAL TABLE temp.terms USING fts4aux(main,posts);");
160 | $res = $db->query("SELECT DISTINCT term, documents FROM temp.terms WHERE col = '*' ");
161 |
162 | $dict2 = array();
163 | while ($d = $res->fetchArray(SQLITE3_ASSOC))
164 | {
165 | $dict2[$d['term']] = log10($doccount/$d['documents']) ; // IDF
166 | }
167 |
168 | $dict3 = array();
169 | foreach($dict as $k=>$v)
170 | {
171 | if (isset($dict2[$k])) $dict3[$k] = $dict[$k] * $dict2[$k]; // TF*IDF
172 | }
173 |
174 | arsort($dict3);
175 |
176 | $stopwords = array('about', 'after','anyone', 'because', 'being','better', 'check', 'diese','doesn', 'einem', 'einen', 'einfach', 'everything', 'every', 'everyone', 'found', 'getting', 'gerade', 'gewesen', 'going', 'gonna', 'great', 'haben', 'having', 'jetzt', 'meine', 'message','might','never', 'nicht', 'other', 'people', 'playing', 'really','right', 'region', 'seems', 'schon', 'should', 'someone', 'source', 'start', 'still', 'stuff','there', 'these', 'their', 'thing', 'things', 'think', 'those', 'times', 'today', 'translated','translation','users','using', 'video', 'watching', 'werden','which', 'while', 'would');
177 |
178 | $i = 0;
179 | $links = array();
180 | foreach($dict3 as $k=>$v)
181 | {
182 | if ($i>4) break;
183 | if (!stristr($query,$k) && mb_strlen($k,'UTF-8')>4 && !in_array($k, $stopwords) )
184 | {
185 | $links[] = ''.$k.'';
186 | $i++;
187 | }
188 | }
189 |
190 | return join('
',$links);
191 |
192 | }
193 |
/**
 * Ranks the most recent posts by how many words they share with posts of
 * other users, weighted by the follower count of the post's own author.
 *
 * The ranking is cached as JSON in site/cache/trendingPosts.txt and served
 * from there while the cache file is less than one hour old.
 *
 * @return array|string Up to 100 posts keyed by link; each row holds the
 *                      selected columns plus 'wordlist', 'pd' and 'score'.
 *                      An empty string is returned when the database handle
 *                      or the query is not available.
 */
function trendingPosts()
{
    global $tfRoot;
    $file = $tfRoot.'/site/cache/trendingPosts.txt';

    // Serve the cached ranking while it is fresh.
    if (file_exists($file) && time() - filemtime($file) < 3600)
    {
        return json_decode(file_get_contents($file), true);
    }

    $db = init();
    if (!$db) return ''; // same "nothing available" value as a failed query below

    $q = "SELECT link, description, followers, user, 2 as found, image, media, pubdate, indexdate FROM posts ORDER BY pubdate DESC limit 2000";

    $list = $db->query($q);
    if (!$list)
    {
        $db->close();
        return '';
    }

    // Load the posts and attach a sorted list of their distinct words (5+ bytes).
    $dict = array();
    while ($d = $list->fetchArray(SQLITE3_ASSOC))
    {
        $wl = array_unique(wordList($d['description']));
        $wl = array_filter($wl, function($w) { return strlen($w) >= 5; });
        sort($wl);
        $d['wordlist'] = $wl;
        $d['pd'] = intval(strtotime($d['pubdate']));
        $dict[$d['link']] = $d;
    }
    $db->close();

    // Score each post by its word overlap with the posts of OTHER users.
    foreach($dict as $k=>$x)
    {
        $norm = max(100,count($x['wordlist'])); // invariant for the inner loop
        $score = 0;
        foreach($dict as $y)
        {
            if ($x['user'] == $y['user']) continue;
            $score += count(array_intersect($x['wordlist'], $y['wordlist'])) / $norm;
        }
        // Weight by the followers of the post being scored ($x). The former
        // code read $y here, i.e. whichever row the inner loop visited last.
        $dict[$k]['score'] = $score * log10(intval($x['followers'])+1);
    }

    // Scores are small floats: a comparator returning their difference is cast
    // to int by uasort(), which makes nearly every pair compare as equal.
    $byScoreDesc = function($a, $b)
    {
        if ($a['score'] == $b['score']) return 0;
        return ($a['score'] < $b['score']) ? 1 : -1;
    };

    uasort($dict, $byScoreDesc);

    // Down-vote the 2nd, 3rd, ... post of the same user. The result has to be
    // written back into $dict, otherwise the second sort sees unchanged scores.
    $seen = array();
    foreach($dict as $k=>$x)
    {
        $u = $x['user'];
        if (isset($seen[$u]))
        {
            $seen[$u]++;
            $dict[$k]['score'] = $x['score'] / pow($seen[$u],1.5);
        }
        else
        {
            $seen[$u] = 1;
        }
    }

    uasort($dict, $byScoreDesc);

    $dict = array_slice($dict,0,100);

    file_put_contents($file, json_encode($dict));

    return $dict;
}
272 |
273 |
274 |
275 |
276 |
--------------------------------------------------------------------------------
/inc/instance.php:
--------------------------------------------------------------------------------
1 | addInstance '.$host);
14 |
15 | // check the format of the user label
16 |
17 | $host = trim($host);
18 | if (!$host) return '
The instance field is empty. Please submit a username.'; 19 | 20 | preg_match('/[a-zA-Z0-9_-]+\.[a-zA-Z0-9.-]+/',$host,$matches); // @example.com // can have two . in domain! 21 | 22 | if (!count($matches)) return '
The instance domain '.$host.' is invalid. Please submit a complete domain (eg example.com) .'; 23 | 24 | // check first if there was a rules 25 | $magic = validInstance($host, true); 26 | 27 | if ($magic === false || stristr($magic,'error')) 28 | { 29 | $sql = "BEGIN TRANSACTION; 30 | DELETE FROM instances WHERE host = '$host'; 31 | COMMIT;"; // echo $sql; 32 | $db = init(); 33 | if ($db) 34 | { 35 | if (!@$db->exec($sql)) return '
Database error '.$db->lastErrorMsg().''; 36 | $db->close(); 37 | } 38 | else 39 | { 40 | 41 | return '
Database was not available. Please try later.'; 42 | } 43 | 44 | 45 | return '
Your instance rules are missing the magic sentence. Please update the instance rules first and then join again.'; 46 | 47 | 48 | } 49 | 50 | debugLog('
Valid instance. Going to add users. Magic: '.$magic); 51 | 52 | addInstanceUsers($host); 53 | 54 | 55 | $sql = "BEGIN TRANSACTION; 56 | DELETE FROM instances WHERE host = '$host'; 57 | REPLACE INTO instances (host) VALUES ('$host'); 58 | COMMIT;"; 59 | $db = init(); 60 | if ($db) 61 | { 62 | if (!@$db->exec($sql)) 63 | { 64 | $db->close(); 65 | return '
Your instance is ot but the database was not available. Please try later.'; 66 | } 67 | $db->close(); 68 | } 69 | 70 | $searchlink = 'Search for '.$host.''; 71 | 72 | return '
Magic sentence '.$magic.' found. From now on, the instance is indexed and listed below. '.$searchlink; 73 | 74 | } 75 | 76 | function validInstance($host, $forcerefresh=false) 77 | { 78 | 79 | $q = "SELECT host FROM instances WHERE host = '".$host."';"; 80 | $db = init(true); 81 | if ($db) 82 | { 83 | $list = $db->query($q); 84 | $result = array(); 85 | while($d = $list->fetchArray(SQLITE3_ASSOC)) $result[] = $d['host']; 86 | $db->close(); 87 | if (!count($result)) return false; 88 | } 89 | else return false; 90 | 91 | $rules = getInstanceRules($host, $forcerefresh); 92 | if (!count($rules)) return '
I cannot find the rules of the instance '.$host.''; 93 | $s = json_encode($rules); 94 | $validrules = array('Public posts from this instance are indexed by tootfinder.ch'); 95 | foreach($validrules as $v) 96 | 97 | if (stristr($s,$v)) 98 | { 99 | if (rand(0,100)> 95) addInstanceUsers($host); 100 | return $v; 101 | } 102 | 103 | 104 | debugLog('
'.$user); debugLog(' added'); 228 | 229 | $priority = time(); 230 | $label = '@'.$user.'@'.$host; 231 | $host = $host; 232 | 233 | $sql [] = "INSERT INTO users (user, host, label, priority) VALUES ('$user','$host','$label',$priority);" ; 234 | $found = true; 235 | } 236 | else 237 | { 238 | // debugLog(' existing'); 239 | } 240 | } 241 | $offset +=40; 242 | } 243 | 244 | $sql []= "COMMIT;"; 245 | $sql = join(PHP_EOL,$sql); 246 | 247 | $db = init(); 248 | if ($db) 249 | { 250 | if (!@$db->exec($sql)) 251 | { 252 | $db->close(); 253 | return '
Database was not available. Please try later.';
254 | }
255 | $db->close();
256 | }
257 |
258 | $ende = time();
259 | debugLog(sprintf(' (%0d sec)',$ende - $start));
260 |
261 | }
262 |
263 |
/**
 * Builds the list of stored instances, ordered by host.
 *
 * @return string One entry per instance joined with PHP_EOL, or "-" when no
 *                instance is stored or the database cannot be read.
 */
function instanceList()
{
    $q = "SELECT host FROM instances ORDER BY host";

    $result = array();
    $db = init(true);
    if ($db)
    {
        $list = $db->query($q);
        // query() yields false when the statement fails; calling fetchArray()
        // on that would be a fatal error, so treat it like an empty list.
        if ($list)
        {
            // NOTE(review): the literals around the host are kept exactly as they
            // appear in this copy of the file - confirm no markup is missing.
            while($d = $list->fetchArray(SQLITE3_ASSOC)) $result[] = ''.$d['host'].'
';
        }
        $db->close();
    }

    if (count($result)) return join(PHP_EOL,$result); else return "-";
}
282 |
283 |
--------------------------------------------------------------------------------
/inc/query.php:
--------------------------------------------------------------------------------
1 | '.$q;
47 |
48 | $q = encodeSpacelessLanguage($q);
49 | $q = SQLite3::escapeString($q);
50 | $planb = false;
51 | $order = ' ORDER BY score DESC '; if ($newposts) $order = ' ORDER BY pubdate DESC ';
52 | $limit = ' LIMIT 100 '; if ($allposts) $limit = ' ';
53 |
54 |
55 | $sql = "SELECT '2' as found, score(offsets(posts), description, followers, pubdate) as score, link, user, description, pubdate, image, media, followers FROM posts
56 | WHERE posts MATCH '$q' $hastagquery
57 | $order
58 | $limit ";
59 |
60 | debugLog('
'.$sql); 61 | 62 | $start = $ende = time(); 63 | 64 | $db = init(true); 65 | if ($db) 66 | { 67 | $list = $db->query($sql); 68 | 69 | if (NULL == $list->fetchArray(SQLITE3_ASSOC)) 70 | { 71 | $planb = true; 72 | 73 | $q = queryStar($query0); 74 | $q = SQLite3::escapeString($q); 75 | 76 | $sql = "SELECT '1' as found, score(offsets(posts), description, followers, pubdate) as score, link, user, description, pubdate, image, media, followers FROM posts 77 | WHERE posts MATCH '$q' 78 | $order 79 | limit 10 "; 80 | debugLog('
Starred: '.$sql); 81 | 82 | $list = $db->query($sql); 83 | 84 | if (NULL == $list->fetchArray(SQLITE3_ASSOC)) 85 | { 86 | 87 | $q = QuerySoundex($query0); 88 | 89 | $sql = "SELECT '0' as found, score(offsets(posts), description, followers, pubdate) as score, link, user, description, pubdate, image, media, followers FROM posts 90 | WHERE soundex MATCH '$q' 91 | $order 92 | limit 10 "; 93 | debugLog('
Soundex: '.$sql); 94 | 95 | $list = $db->query($sql); 96 | 97 | } 98 | } 99 | 100 | $list->reset(); 101 | 102 | $rc = 0; 103 | $result = array(); 104 | $validusers = array(); 105 | 106 | while ($d = $list->fetchArray(SQLITE3_ASSOC)) 107 | { 108 | $profile = getProfile($d['user']); 109 | if (!isset($validusers[$d['user']]) && !validUser($profile)) continue; // WE MUST GET VALIDUSER BACK HERE HOTFIX 110 | $validusers[$d['user']] = 1; 111 | 112 | $rc++; 113 | 114 | $d['description-jp'] = $d['description']; 115 | $d['description'] = decodeSpacelessLanguage($d['description']); 116 | $d['media'] = decodeSpacelessLanguage($d['media']); 117 | 118 | $result[] = $d; 119 | } 120 | $db->close(); 121 | 122 | // downvote following posts the same user 123 | 124 | if (!$newposts) 125 | { 126 | // echo 'sort'; 127 | 128 | $users = array(); 129 | foreach($result as $k=>$x) 130 | { 131 | if (isset($users[$x['user']])) 132 | { 133 | $users[$x['user']]++; 134 | $x['score'] = round($x['score']/$users[$x['user']]); 135 | $result[$k] = $x; 136 | } 137 | else 138 | { 139 | $users[$x['user']] = 1; 140 | } 141 | } 142 | 143 | uasort($result, function($x,$y) { return $y['score'] - $x['score']; }); 144 | } 145 | 146 | } 147 | 148 | $ende = time(); 149 | debugLog(sprintf('
SQL (%0d sec)',$ende - $start)); 150 | $start = time(); 151 | 152 | if ($doindex && ! stristr($query0,'@')) 153 | { 154 | 155 | if ($planb) $rc = 0; 156 | $date = date("Y-m-d H:i"); // remove seconds to discourage clickbait 157 | $q = SQLite3::escapeString($query0); 158 | $sql2 = "INSERT INTO queries (query, date, results) VALUES ('".$q."','".$date."',".$rc.");"; 159 | $db = initQueries(); 160 | if ($db) 161 | { 162 | if (!$db->exec($sql2)) echo '
index error '.$db->lastErrorMsg(); 163 | $datelimit = date('Y-m-d',strtotime('-3 month', time())); 164 | $sql2 = "DELETE FROM queries WHERE date < '".$datelimit."' ;"; 165 | if (rand(0,1000)>998) $db->exec($sql2); 166 | $db->close(); 167 | } 168 | } 169 | $ende = time(); 170 | debugLog(sprintf('
Update queries (%0d sec)',$ende - $start));
171 | $start = time();
172 |
173 | return $result;
174 | }
175 |
176 | function score($s, $description, $followers, $pubdate)
177 | {
178 | // occurencies, the earlier the more
179 |
180 | $offsets = explode(' ',$s);
181 | $r = 0;
182 | for ($i=0;$i ";
179 |
180 | }
181 |
182 |
183 |
184 |
--------------------------------------------------------------------------------
/inc/user.php:
--------------------------------------------------------------------------------
1 | addUser '.$label);
13 |
14 | // check the format of the user label
15 |
16 | $label = trim($label);
17 | if (!$label) return ' The username field is empty. Please submit a username.';
18 |
19 | preg_match('/@([a-zA-Z0-9_]+)@([a-zA-Z0-9_-]+\.[a-zA-Z0-9.-]+)/',$label,$matches); // @user@example.com // can have two . in domain!
20 |
21 | if (!count($matches)) return ' The username '.$label.' is invalid. Please submit a complete username (eg @user@example.com) .';
22 |
23 | $profile = getProfile($label, true);
24 | getOutboxLink($profile, $label, true);
25 |
26 | // check first if there was a profile
27 | if (!count($profile)) return ' I cannot find the profile page';
28 |
29 | if (!$magic = validUser($profile)) return ' Your fediverse profile is missing the magic word. Please proceed to step 1 first and then join again.';
30 |
31 | debugLog(' Valid user. Going to index. Magic '.$magic);
32 |
33 |
34 | // user has magic word in profile. we can add it to the db and index.
35 | $user = $profile['user'];
36 | $host = $profile['host'];
37 | $priority = time();
38 |
39 |
40 | $sql = "BEGIN TRANSACTION;
41 | DELETE FROM users WHERE label = '$label';
42 | REPLACE INTO users (user, host, label, priority) VALUES ('$user','$host','$label',$priority);
43 | COMMIT;";
44 | $db = init();
45 | if ($db)
46 | {
47 | if (!@$db->exec($sql))
48 | {
49 | $db->close();
50 | return ' Database was not available. Please try later.';
51 | }
52 | $db->close();
53 | }
54 |
55 | crawl($label);
56 | index($label);
57 |
58 | $searchlink = 'Search for '.$label.'';
59 |
60 | return ' Magic word '.$magic.' found. From now on, you are indexed. '.$searchlink;
61 |
62 | }
63 |
/**
 * Decides whether a profile has consented to being indexed.
 *
 * The bio and attachment fields of the profile (keys `summary`/`attachment`
 * and `note`/`fields`) are JSON-encoded into one string that is searched
 * case-insensitively for a magic word. If none is present, the user's host
 * is checked with validInstance(): on such a host the user is accepted
 * unless the profile text contains "noindex".
 *
 * @param array $profile Profile dictionary; may be empty or lack keys.
 * @return string|false  'tootfinder', 'tfr', 'searchable',
 *                       'instance opt-in <host>', or false when not valid.
 */
function validUser($profile)
{
	if (!is_array($profile)) $profile = array();

	// ActivityPub profile
	$bio = isset($profile['summary']) ? $profile['summary'] : null;
	$attachment = isset($profile['attachment']) ? $profile['attachment'] : null;
	$head1 = json_encode($bio).json_encode($attachment);

	// Mastodon profile
	$bio = isset($profile['note']) ? $profile['note'] : null;
	$attachment = isset($profile['fields']) ? $profile['fields'] : null;
	$head2 = json_encode($bio).json_encode($attachment);

	$head = $head1.$head2;

	// individual opt-in: first magic word found wins
	if (stristr($head,'tootfinder')) return 'tootfinder';
	if (stristr($head,'tfr')) return 'tfr';
	if (stristr($head,'searchable')) return 'searchable';

	// Without a host there is no instance to look up. This replaces an
	// undefined-key read that passed null to validInstance().
	// NOTE(review): assumes validInstance(null) was never truthy - confirm.
	if (!isset($profile['host']) || $profile['host'] === '') return false;

	$test = validInstance($profile['host']);

	if ($test !== false)
	{
		// instance-wide opt-in; the user may still opt out individually
		if (stristr($head,'noindex')) return false;

		return 'instance opt-in '.$profile['host'];
	}

	return false;
}
98 |
/**
 * Returns up to ten user labels to process next.
 *
 * Normally the ten users with the lowest positive priority are selected;
 * when rand(0,100) exceeds 90 a random sample of ten users is taken instead.
 *
 * @return array List of user labels; empty when the database cannot be
 *               opened or the query fails.
 */
function randomUsers()
{
	// Defined up front so the function returns an array on every path.
	$result = array();

	$q = "SELECT user, host, label, id, priority FROM users WHERE priority > 0 ORDER BY priority LIMIT 10;";

	if (rand(0,100)> 90)
		$q = "SELECT user, host, label, id, priority FROM users ORDER BY RANDOM() DESC LIMIT 10;";

	$db = init(true);
	if (!$db) return $result;

	$list = $db->query($q);
	if ($list)
	{
		while($d = $list->fetchArray(SQLITE3_ASSOC)) $result[] = $d['label'];
	}
	$db->close();

	return $result;
}
119 |
/**
 * Resolves the host that serves WebFinger for a given domain.
 *
 * Reads https://<host>/.well-known/host-meta, cached under
 * site/hostmeta/<host>.xml. The cached copy is reused unless a refresh is
 * forced, the file is missing, older than seven days or smaller than 50 bytes.
 * If the document contains a WebFinger template, the host named there is
 * returned; otherwise the input host is returned unchanged.
 *
 * @param string $host         Domain part of the user label.
 * @param bool   $forcerefresh Bypass the cache when truthy.
 * @return string Host to use for the WebFinger request.
 */
function getHostMeta($host, $forcerefresh=false)
{
	global $tfRoot;

	$cacheFile = $tfRoot.'/site/hostmeta/'.$host.'.xml';
	$remote = 'https://'.$host.'/.well-known/host-meta';

	// evaluation order matters: filemtime/filesize only run on an existing file
	$usable = !$forcerefresh
		&& file_exists($cacheFile)
		&& !(time()-filemtime($cacheFile) > 3600*24*7)
		&& !(filesize($cacheFile) < 50);

	if ($usable)
	{
		$xml = file_get_contents($cacheFile);
	}
	else
	{
		debugLog(' '.$remote.'');
		$xml = getRemoteString($remote);
		file_put_contents($cacheFile,$xml);
		debugLog(expandableSnippet($xml));
	}

	// a delegating host points its template at another domain
	preg_match('/template="https:\/\/(.*?)\/\.well-known\/webfinger/',$xml,$found);
	$resolved = isset($found[1]) ? $found[1] : $host;

	debugLog(' host-meta '.$resolved);

	return $resolved;
}
145 |
/**
 * Looks up the actor link of a user through WebFinger.
 *
 * The WebFinger host is resolved via getHostMeta(); the JSON response is
 * cached under site/webfinger/<label>.json and reused unless a refresh is
 * forced, the file is missing, older than seven days or smaller than 50 bytes.
 *
 * The result is taken from the `self` link whose href has the form
 * https://<host>/users/<user> (the last such link wins). If the response
 * yields no such link - it is not JSON, has no `links`, or no `self` link
 * matches - the link is guessed as https://<host>/users/<user>, so callers
 * always receive a `link` entry.
 *
 * @param string $host         Domain part of the label.
 * @param string $label        Full label, e.g. @user@example.com.
 * @param bool   $forcerefresh Bypass the caches when truthy.
 * @return array Dictionary with the keys host, user, link and label.
 */
function getWebfinger($host, $label, $forcerefresh=false)
{
	$host = getHostMeta($host, $forcerefresh);

	global $tfRoot;
	$url = 'https://'.$host.'/.well-known/webfinger?resource='.substr($label,1); // remove first @
	$localpath = $tfRoot.'/site/webfinger/'.$label.'.json';

	if ($forcerefresh || !file_exists($localpath) || time()-filemtime($localpath) > 3600*24*7 || filesize($localpath) < 50 )
	{
		debugLog(' '.$url.'');
		$s = getRemoteString($url);
		file_put_contents($localpath,$s);
		debugLog(expandableSnippet($s));
	}
	else
		$s = file_get_contents($localpath);

	$dict = array();
	$j = json_decode($s,true);
	if (is_array($j) && isset($j['links']) && is_array($j['links']))
	{
		foreach($j['links'] as $l )
		{
			// only well-formed "self" links are candidates
			if (!is_array($l) || !isset($l['rel']) || $l['rel'] != 'self') continue;
			if (!isset($l['href']) || !is_string($l['href'])) continue;

			$link = $l['href'];
			if (preg_match('/https:\/\/(.*?)\/users\/(.*?)$/',$link,$matches))
			{
				$dict['host'] = $matches[1];
				$dict['user'] = $matches[2];
				$dict['link'] = $link;
				$dict['label'] = $label;
			}
		}
	}

	if (!isset($dict['link']))
	{
		// fallback to host/users/user; also used when the JSON had links but
		// none of them was a usable self link
		$dict['host'] = $host;
		$dict['label'] = $label;
		preg_match('/@([a-zA-Z0-9_]+)@([a-zA-Z0-9_-]+\.[a-zA-Z0-9.-]+)/',$label,$matches); // @user@example.com // can have two . in domain!
		$dict['user'] = isset($matches[1]) ? $matches[1] : null;
		$dict['link'] = 'https://'.$host.'/users/'.$dict['user'];
	}

	debugLog(' webfinger '.$label);
	return $dict;
}
196 |
197 | function getProfile($label, $forcerefresh=false)
198 | {
199 |
200 | preg_match('/@([a-zA-Z0-9_]+)@([a-zA-Z0-9_-]+\.[a-zA-Z0-9.-]+)/',$label,$matches); // @user@example.com // can have two . in domain!
201 | if (count($matches)<3) return array(); // error
202 |
203 | $user = $matches[1];
204 | $host = $matches[2];
205 |
206 | global $tfRoot;
207 |
208 | $localpath = $tfRoot.'/site/profiles/'.$label.'.json';
209 |
210 | if ($forcerefresh || !file_exists($localpath) || time()-filemtime($localpath) > 3600*24 || filesize($localpath) < 50 )
211 | {
212 | $dict = getWebfinger($host, $label, $forcerefresh);
213 | if (isset($dict['link']))
214 | {
215 | $url = $dict['link'];
216 | debugLog(' '.$url.'');
217 | $s = getRemoteString($url,$localpath);
218 | file_put_contents($localpath,$s);
219 | debugLog(expandableSnippet($s));
220 |
221 | $j = json_decode($s,true);
222 | if (!isset($j['error']))
223 | {
224 | $profile['followercount'] = 20;
225 | $profile['format'] = 'json';
226 | $profile['label'] = $label;
227 | $profile['user'] = $user;
228 | $profile['host'] = $host;
229 | $profile['summary'] = @$j['summary'];
230 | $profile['attachment'] = @json_encode($j['attachment']);
231 | $profile['outbox'] = @$j['outbox'];
232 |
233 | if (@$j['icon'])
234 | {
235 | $profile['avatar'] = @$j['icon']['url'];
236 |
237 | }
238 |
239 | if (isset($j['followers']))
240 | {
241 | $url2 = $j['followers'];
242 | $s2 = getRemoteString($url2,'.json');
243 | $j2 = json_decode($s2,true);
244 | if (isset($j2['totalItems']))
245 | $profile['followercount'] = $j2['totalItems'];
246 | }
247 |
248 | // get Mastodon ID
249 | $url = 'https://'.$profile['host'].'/api/v1/accounts/lookup?acct='.$profile['label'];
250 | $s = getRemoteString($url,'.json');
251 | $j = json_decode($s,true);
252 | $id = @$j['id'];
253 |
254 | if ($id)
255 | {
256 | $profile['id'] = $id;
257 | }
258 | else
259 | {
260 | $profile['id'] = '';
261 | }
262 |
263 |
264 | $s = json_encode($profile);
265 | file_put_contents($localpath,$s);
266 |
267 | }
268 |
269 | }
270 | else
271 | {
272 | $s = '';
273 | }
274 | }
275 | else
276 | $s = file_get_contents($localpath);
277 |
278 | if ($s) $profile = json_decode($s,true);
279 |
280 | if (isset($profile['error']))
281 | {
282 | unset($profile);
283 | @unlink($localpath);
284 | }
285 |
286 | if (!isset($profile))
287 | {
288 | // fallback HTML
289 | if (!isset($dict)) $dict = getWebfinger($host, $label, $forcerefresh);
290 |
291 | $url = $dict['link'];
292 |
293 | $s = getRemoteString($url,'.html');
294 | debugLog(expandableSnippet($s));
295 |
296 | // now we search also for followers
297 | // /i",$s,$matches);
318 | foreach($matches[1] as $m)
319 | {
320 | preg_match("/content=['\"](.*)['\"].+property=['\"]og:image['\"]/",$m, $matches2);
321 | if (@$matches2[1])
322 | {
323 | $profile['avatar'] = $matches2[1];
324 | }
325 | }
326 |
327 |
328 | // we create a json profile
329 | $s = json_encode($profile);
330 | $localpath = $tfRoot.'/site/profiles/'.$label.'.json';
331 | file_put_contents($localpath,$s);
332 |
333 |
334 | }
335 |
336 | return $profile;
337 |
338 | }
339 |
/**
 * Determines the feed URL to crawl for a profile.
 *
 * Preference order:
 *  1. the Mastodon statuses API, when the profile has a non-empty `id`;
 *  2. the `first` page of the ActivityPub outbox, when the profile has an
 *     `outbox` URL and the (cached) outbox document names one;
 *  3. the RSS feed at https://<host>/users/<user>.rss.
 *
 * The outbox document is cached under site/outbox/<label>.json and reused
 * unless a refresh is forced, the file is missing, older than seven days or
 * smaller than 50 bytes.
 *
 * @param array $profile      Profile dictionary; `host` and `user` are read,
 *                            `id`, `outbox` and `label` are optional.
 * @param mixed $forcerefresh Truthy to bypass the outbox cache.
 * @return string Feed URL.
 */
function getOutboxLink($profile, $forcerefresh=false)
{
	global $tfRoot;

	// we prefer Mastodon API because the feed is richer than fediverse

	if (!empty($profile['id'])) return 'https://'.$profile['host'].'/api/v1/accounts/'.$profile['id'].'/statuses/?limit=40';

	if (!empty($profile['outbox'])) // fediverse standard
	{
		$url = $profile['outbox'];
		$localpath = $tfRoot.'/site/outbox/'.$profile['label'].'.json';

		if ($forcerefresh || !file_exists($localpath) || time()-filemtime($localpath) > 3600*24*7 || filesize($localpath) < 50 )
		{
			debugLog(' '.$url.'');
			$s = getRemoteString($url);
			file_put_contents($localpath,$s);
			debugLog(expandableSnippet($s));
		}
		else
			$s = file_get_contents($localpath);

		$j = json_decode($s,true);
		if (is_array($j) && isset($j['first'])) return $j['first'];
	}

	// RSS fallback
	return 'https://'.$profile['host'].'/users/'.$profile['user'].'.rss';
}
378 |
379 |
--------------------------------------------------------------------------------
/inc/utilities.php:
--------------------------------------------------------------------------------
1 | $v)
17 | {
18 | $f = fopen($v,"wb");
19 | $c = curl_init();
20 |
21 | curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
22 | curl_setopt($c, CURLOPT_SSL_VERIFYPEER, 0);
23 | curl_setopt($c, CURLOPT_URL, $k);
24 | curl_setopt($c, CURLOPT_TIMEOUT,5);
25 | curl_setopt($c, CURLOPT_USERAGENT, 'Tootfinder/1.1 (+https://www.tootfinder.ch/index.php)');
26 | if ($fv = @filemtime($v))
27 | {
28 | curl_setopt($c, CURLOPT_TIMEVALUE, $fv);
29 | curl_setopt($c, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE);
30 | }
31 | curl_setopt($c, CURLOPT_FOLLOWLOCATION, true);
32 | curl_setopt($c, CURLOPT_FILE, $f);
33 |
34 | if (substr($v,-5)=='.json') curl_setopt($c, CURLOPT_HTTPHEADER, array('Accept: application/activity+json'));
35 |
36 | curl_multi_add_handle($mh,$c);
37 | }
38 | do {
39 | curl_multi_exec($mh, $running);
40 | curl_multi_select($mh);
41 | } while ($running > 0);
42 |
43 | }
44 |
45 |
/**
 * Fetches a URL with cURL and returns the response body.
 *
 * Redirects are followed, the request times out after five seconds, and an
 * ActivityPub Accept header is sent when $v ends in ".json".
 *
 * Some URLs passed in come from remote documents, so the transfer is limited
 * to HTTP(S), including after redirects, and the TLS certificate is verified.
 *
 * @param string $url URL to fetch.
 * @param string $v   Hint whose ".json" suffix selects the Accept header.
 * @return string|false Response body, or false when the transfer fails.
 */
function getRemoteString($url,$v = '')
{
	$c = curl_init();
	curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
	// verify the peer so profile content cannot be forged in transit
	curl_setopt($c, CURLOPT_SSL_VERIFYPEER, 1);
	curl_setopt($c, CURLOPT_SSL_VERIFYHOST, 2);
	// never let a remote-supplied URL or redirect reach file://, gopher:// etc.
	curl_setopt($c, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
	curl_setopt($c, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
	curl_setopt($c, CURLOPT_TIMEOUT,5);
	curl_setopt($c, CURLOPT_USERAGENT, 'Tootfinder/1.1 (+https://www.tootfinder.ch/index.php)');
	curl_setopt($c, CURLOPT_URL, $url);
	curl_setopt($c, CURLOPT_FOLLOWLOCATION, true);
	curl_setopt($c, CURLOPT_MAXREDIRS, 5);
	if (substr($v,-5)=='.json') curl_setopt($c, CURLOPT_HTTPHEADER, array('Accept: application/activity+json'));
	$contents = curl_exec($c);
	curl_close($c);
	return $contents;
}
60 |
/**
 * Parses a raw header block into a dictionary of value lists.
 *
 * Each line is split at its first colon; lines without a colon are skipped.
 * Repeated header names accumulate in order. Names and values are stored as
 * found (a value keeps the whitespace that followed the colon).
 *
 * Lines are split on CRLF, CR or LF. Splitting on PHP_EOL left a trailing
 * "\r" on every value of a CRLF-terminated block on systems where PHP_EOL
 * is "\n".
 *
 * @param string $s Raw header text.
 * @return array Map of header name to list of values.
 */
function header2dict($s)
{
	$result = array();
	$lines = preg_split('/\r\n|\r|\n/', (string) $s);
	foreach($lines as $line)
	{
		// limit 2: everything after the first colon belongs to the value
		$fields = explode(':',$line,2);
		if (count($fields) < 2) continue;
		$result[$fields[0]][] = $fields[1];
	}
	return $result;
}
75 |
/**
 * Converts a text into a space-separated sequence of soundex keys.
 *
 * Every character other than ASCII letters, digits, "&" and space is replaced
 * by a space, then each space-separated token is passed through soundex().
 * Empty tokens are kept, so runs of separators yield runs of spaces.
 *
 * @param string $a Input text.
 * @return string Soundex keys joined by single spaces.
 */
function soundexLong($a)
{
	$cleaned = preg_replace('/[^a-zA-Z&0-9 ]/',' ',$a);
	$keys = array_map('soundex', explode(' ',$cleaned));
	return implode(' ',$keys);
}
86 |
87 |
/**
 * Recursively converts an object (or array) into nested plain arrays.
 *
 * The input is cast to an array; every member that is itself an object or
 * array is converted the same way, every other member is copied as is.
 *
 * @param mixed $xmlObject Object or array to convert.
 * @param array $out       Array the converted members are written into.
 * @return array The converted structure.
 */
function xml2array( $xmlObject, $out = array () )
{
	foreach ( (array) $xmlObject as $key => $child )
	{
		if ( is_object ( $child ) || is_array ( $child ) )
		{
			$out[$key] = xml2array ( $child );
		}
		else
		{
			$out[$key] = $child;
		}
	}

	return $out;
}
95 |
/**
 * Appends a message to the global debug buffer $tfDebug.
 *
 * Nothing is appended once the buffer has reached 10,000,000 bytes; the
 * message that crosses the limit is still stored in full.
 *
 * @param string $s Text to append.
 * @return void
 */
function debugLog($s)
{
	global $tfDebug;
	// An unset global is null; strlen(null) is deprecated since PHP 8.1.
	if ($tfDebug === null) $tfDebug = '';
	if (strlen($tfDebug) < 10000000) $tfDebug .= $s; // prevent overflow
}
101 |
102 | function wordList($s)
103 | {
104 | $p = str_replace(' /','',$p);
107 | $p = preg_replace('#https?://\S*#','',$p);
108 | $p = preg_replace('/[!-)\+-,\.\/:-@[-`\{-~]|\*|#|’|-/',' ',$p);
109 |
110 | // [!-) ascii 33-41
111 | // \. ascii 46
112 | // \/ ascii 47
113 | // :-@ ascii 58-64
114 | // [-` ascii 91-96
115 | // \{-~ ascii 123-126
116 |
117 | return array_unique(explode(' ',$p));
118 | }
119 |
120 |
--------------------------------------------------------------------------------
/index.php:
--------------------------------------------------------------------------------
1 |
42 |
43 |
44 | /",' Full text search on Mastodon
142 | Imagine searching any post on Mastodon. This is now possible - at least for the posts of users who opt in.
143 | Tootfinder indexes all public posts of consenting users and makes them searchable for 3 months. If you want to be part of it, join the index.
144 | Search syntax
149 |
150 | The search is case-insensitive. You can append * to the end of a word. You can use NEAR, OR and the prefix -.
151 | More about search
161 | If the crawler does not find an exact result, it looks for similar results. Click on the avatar to access the user, click on the date to access the post on Mastodon. Click on the image to access the original image. Privacy note
164 | This is pure opt-in: If you are not interested, just do not join the index. If you quit the index, your posts will be removed from the index after 3 months. Implementation
169 | Tootfinder uses the public Mastodon API for the profile and the JSON feed. The feeds are consulted on an optimized frequency, indexed in a SQLite database and deleted after 3 months. Check out the Tootfinder Wiki Contact
173 | @buercher@tooting.ch
174 | v'.$tfVersion.' '.$tfVersionDate.'
175 | ';
176 | echo getinfo();
177 | echo " Index ".indexStatus();
178 | echo " Mastodon instance administrators can opt in their instance on this page. If the instance opts in, all users of the instance are indexed, except if the opt-out (see below).
53 | Only the Mastodon API is supported. Instances must have the following open API endpoints: /api/v1/instance/rules and /api/v1/directory?.
58 | Instances must add an instance rule with the text All public posts from this instance are indexed by tootfinder.ch. The purpose of this rule is that all users of the instance are aware of the opt-in. Instances commit themselves to actively communicate the opt-in to their users (rule of honor: this is something tootfinder.ch cannot verify).
63 | Submit the domain name of the host
69 | The rules of the instance are verified on a regular basis. If the rules do not contain the magic sentence any more, the indexing stops. However, the instances are not removed from the list below. To be explicitly unlisted, instance administrators must submit join again after removing the rule. Users that individually opt in will stay on the index.
80 | Individual users of these instances can opt out, if they have the magic word ("noindex") on their Mastodon profile.
92 | Join the index (step 1)
64 | You need first to provide consent via your profile.
65 | Place the magic word anywhere in your profile (either bio or part of a well-formed link in a label). Possible values:
66 | Wait some minutes, to let the server cache update your profile. Join the index (step 2)
74 | Submit us your full username.
75 | Quit the index If you change your mind, just remove the magic word in your profile. Tootfinder will stop indexing your account and your toots will eventually disappear from our database (after 3 months).
81 | Instance opt-in
84 | Instances can opt-in globally on this page.
85 | The page lists the instances that have opted in. These instances must declare the indexing in their ruleset. Users on these instances can still opt out having the magic word "noindex" in their profile.
86 | Privacy note
93 | This is pure opt-in: If you are not interested, just do not join the index. If you quit the index, your posts will be removed from the index after 3 months. Contact
98 | @buercher@tooting.ch
99 | The application does only record data as far as it is needed for the purpose of the application. It does only index data that is public. Data that is obsolete will be deleted within two weeks.
34 | The purpose of the application is to make recent posts of consenting Mastodon users searchable.
39 | The application does not record visits. The underlying webserver does create logfiles of metadata that include the IP address, a timestamp and the URI. The logfiles are deleted by the webserver after a month.
44 | For each query that does give results, the query, the timestamp and the number of results are recorded in the database. The data is used for statistical purposes and deleted after 3 months.
49 | Users that opt in are only recorded if they manifest their consent with a magic word ("tootfinder","tfr" or "searchable") on their Mastodon profile. For each verified user host, username, label and id are recorded and the public feed is indexed. A priority field is calculated for each user to optimise the frequency of the crawler.
54 | The application indexes the feeds of the consenting users. Posts that are public and that are neither replies nor boosts are indexed. For each post, the link, the content, the links and the descriptions of the attachments (media, card) and a timestamp are indexed, as well as the user's name, avatar and current follower count.
59 | Mentions in posts are removed before indexing. It might be possible that a post cites names that are not mentions.
64 | The application uses the public API of Mastodon to get the profile of the user and the feed. If the API is not available, it may use the HTML source of the user page and the RSS feed.
69 | Users might share the URL of the result page. The application may provide a REST API to provide the search functionality to third party applications, eg. Mastodon clients.
74 | The user can revoke the consent at any time by removing the magic word on the Mastodon profile. The application checks the profile on a daily basis. If the magic word is missing, the user becomes inactive and the posts of the user will not be searchable any more.
79 | Post and query data will be deleted after 3 months. User data will be deleted when the user has revoked consent and there are no posts from the user.
84 | The application does not use cookies, neither its own cookies nor third-party cookies.
91 | The source code of the application is available on GitHub. The production environment of the application may have hot fixes which are ahead of the source code.
96 | The result page depends on the user posts and the visitor query. The application has no influence on either factor, both of which are entirely user-driven, and may not be held liable for the content of the page.
103 | If you have a privacy issue, you can contact @buercher@tooting.ch with a direct message.
108 | Version 2.0 2023-04-23
109 |
') $m = substr($m,4);
20 | $m = str_replace('
','',$m);
21 | $m = str_replace('','',$m);
22 | if ($matches[1]) $matches[1] = ': '.$matches[1];
23 | $line = '
24 | ';
85 |
86 | $lines []= ''.join(' ',$keys).' ';
87 | }
88 |
89 | return ''.join(' ',$values).' '.join(PHP_EOL,$lines).'
';
90 |
91 | }
92 |
93 | function handleMedia($media)
94 | {
95 | if (!$media) return '';
96 |
97 | $result = '';
98 |
99 | foreach(explode('::',$media) as $m)
100 | {
101 | $fields = explode('|',$m);
102 |
103 | if (count($fields)==4)
104 | {
105 | // card
106 | // $cardimage.'|'.$cardurl.'|'.$cardtitle.'|'.$carddescription;
107 | $thumb = $fields[0];
108 | $orig = $fields[1];
109 | $cardtitle = $fields[2];
110 | $carddescription = $fields[3];
111 |
112 | // remove single quotes in card description
113 | $carddescription = preg_replace("/^'/",'',$carddescription);
114 | $carddescription = preg_replace("/'$/",'',$carddescription);
115 | if (strlen($carddescription)>500) $carddescription = substr($carddescription,0,499).'…';
116 | if ($thumb || $carddescription)
117 | {
118 | if ($thumb) $thumb = '';
119 | $result .= '';
120 |
121 | }
122 | }
123 | else // 1-3 fields
124 | {
125 | $thumb = $fields[0];
126 | if (count($fields)>1) $orig = $fields[1]; else $orig = $thumb;
127 | if (count($fields)>2) $alt = 'alt="'.str_replace('"','"',$fields[2]).'"'; else $alt = '';
128 |
129 | $result .= '';
130 | }
131 | }
132 |
133 | return $result;
134 | }
135 |
136 | function handleHTMLHeader($s)
137 | {
138 | $s = preg_replace('//','',$s);
139 | $s = str_ireplace('','',$s);
140 | $s = str_ireplace('','',$s);
141 | $s = str_ireplace('','',$s);
142 | $s = str_ireplace('','',$s);
143 | return $s;
144 | }
145 |
/**
 * Inserts a space after every Japanese character so that each one becomes a
 * separate space-delimited token.
 *
 * Covers the kanji, hiragana and katakana ranges (including the long-vowel
 * dash) and a few iteration/abbreviation marks.
 * Character ranges: https://gist.github.com/terrancesnyder/1345094
 *
 * @param string $s Text to encode.
 * @return string Encoded text. The input is returned unchanged when it
 *                cannot be processed (e.g. invalid UTF-8).
 */
function encodeSpacelessLanguage($s)
{
	// alphanumeric ranges of the source snippet are intentionally left out
	$encoded = preg_replace('/([一-龠]|[ぁ-ゔ]|[ァ-ヴー]|[々〆〤ヶ])/u','$1 ',(string) $s);

	// with /u, preg_replace returns null on malformed UTF-8; keep the text
	if ($encoded === null) return $s;

	return $encoded;
}
159 |
/**
 * Removes one whitespace character that directly follows a Japanese
 * character, undoing the spacing added by encodeSpacelessLanguage().
 *
 * Covers the kanji, hiragana and katakana ranges (including the long-vowel
 * dash) and a few iteration/abbreviation marks.
 * Character ranges: https://gist.github.com/terrancesnyder/1345094
 *
 * @param string $s Text to decode.
 * @return string Decoded text. The input is returned unchanged when it
 *                cannot be processed (e.g. invalid UTF-8).
 */
function decodeSpacelessLanguage($s)
{
	// alphanumeric ranges of the source snippet are intentionally left out
	$decoded = preg_replace('/([一-龠]|[ぁ-ゔ]|[ァ-ヴー]|[々〆〤ヶ])\s/u','$1',(string) $s);

	// with /u, preg_replace returns null on malformed UTF-8; keep the text
	if ($decoded === null) return $s;

	return $decoded;
}
173 |
174 | function expandableSnippet($s)
175 | {
176 |
177 | $script = 'ch = this.parentNode.children; for (var i = 1; i < ch.length; i++) { if (ch[i].style.display == "block") ch[i].style.display = "none"; else ch[i].style.display = "block"; }';
178 | return "
Tootfinder
53 | Opt-in global Mastodon full text search. Join the index!
54 |
55 | '.$msg.'
'.$row['pubdate'].'';
96 | $line = $row['description'];
97 |
98 |
99 | $line = handleMentions($line);
100 | $line = handleHashtags($line);
101 | // fix paragraphs
102 | $line = preg_replace("/<\/p>.*?
',$line);
103 |
104 |
105 | $line .= handleMedia(@$row['media']);
106 |
107 | $line = handleContentWarning($line);
108 |
109 | $line = '
152 |
157 | Principles
52 | Technical conditions
57 | Instance rules (step 1)
62 | Instance opt-in (step 2)
68 | Instance opt-out
79 | List of instances that have opted in
85 |
87 | Opt-out for users
91 | Join Tootfinder
45 | Opt-in global Mastodon full text search.
46 |
47 | '.$msg.'
70 | Principles
33 | Purpose
38 | Visitor data
43 | Query data
48 | Consenting user data
53 | Feed data
58 | Mentions
63 | Data sources
68 | Reuse of results
73 | Revoking consent
78 | Deleting data
83 | Cookies
90 | Source code
95 | Liability
102 | Contact
107 |