├── README.txt ├── TODO.txt ├── browse.php ├── config.php ├── crawl.php ├── create-tables.sql ├── export.php ├── includes ├── cookie.txt ├── functions.php └── mysql_functions.php ├── query.php ├── sitemap.php └── stats.php /README.txt: -------------------------------------------------------------------------------- 1 | TO USE: 2 | 3 | 1. Edit config.php with appropriate database and domain information 4 | 2. (for now) in phpMyAdmin insert the seed URL into the urls table. 5 | * URL should be something like: www.fcc.gov/ 6 | * URL should have a trailing slash 7 | * (for now) May also want to set clicks to '0' to avoid problems 8 | 3. Open crawl.php 9 | 4. (optional) open stats.php to watch progress 10 | 11 | TIPS: 12 | Changes to php.ini 13 | 1. Increase memory limit (1GB) 14 | 2. Remove execution time limit 15 | Changes to the MySQL configuration (my.ini / my.cnf) 16 | * Increase max query size, i.e. max_allowed_packet (to avoid "MySQL server has gone away" error) 17 | 18 | Additional documentation (source code) in (/source) -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | # TO DO 2 | 3 | - Review/improve handling of confirmed in/out domains and also silly links like ?font=large 4 | - Check that all queries for urls include the crawl_tag -------------------------------------------------------------------------------- /browse.php: -------------------------------------------------------------------------------- 1 | 2 | 
3 | 6 | 7 | 8 | 18 | 19 | 20 |No Links on this page
81 | 82 | -------------------------------------------------------------------------------- /config.php: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawl.php: -------------------------------------------------------------------------------- 1 | 23 | 24 | 25 | STARTED: " . date('Y-m-d H:i:s') . ""; 27 | echo "Domains: $domains
"; 28 | echo "crawl_tag: $crawl_tag
"; 29 | echo "database: $mysql_db
"; 30 | echo "Crawling...
"; 31 | 32 | /* 33 | * Grab list of uncrawled URLs, repeat while there are still URLs to crawl 34 | */ 35 | while ($urls = uncrawled_urls($crawl_tag)) { 36 | 37 | /** 38 | * Loop through the array of uncrawled URLs 39 | */ 40 | foreach ($urls as $id=>$url_data) { 41 | 42 | /** 43 | * If we're in debug mode, indicate that we are begining to crawl a new URL 44 | */ 45 | if (isset($_GET['debug'])) 46 | echo "Starting to crawl " . urldecode($url_data['url']) . "
No URLs to crawl!
"; 200 | echo "FINISHED: " . date('Y-m-d H:i:s') . "
"; 201 | ?> 202 | 203 | -------------------------------------------------------------------------------- /create-tables.sql: -------------------------------------------------------------------------------- 1 | SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO"; 2 | 3 | CREATE TABLE IF NOT EXISTS `links` ( 4 | `from` int(10) NOT NULL, 5 | `to` int(10) NOT NULL, 6 | KEY `from` (`from`,`to`), 7 | KEY `to` (`to`) 8 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1; 9 | 10 | CREATE TABLE IF NOT EXISTS `urls` ( 11 | `ID` int(10) NOT NULL AUTO_INCREMENT, 12 | `url` varchar(264) NOT NULL, 13 | `title` varchar(128) DEFAULT NULL, 14 | `crawled` int(1) NOT NULL DEFAULT '0', 15 | `clicks` int(3) DEFAULT NULL, 16 | `http_code` int(3) DEFAULT NULL, 17 | `size` int(11) DEFAULT NULL, 18 | `type` varchar(64) DEFAULT NULL, 19 | `modified` int(15) DEFAULT NULL, 20 | `md5` varchar(32) DEFAULT NULL, 21 | `crawl_tag` varchar(32) DEFAULT NULL, 22 | `html` text DEFAULT NULL, 23 | PRIMARY KEY (`ID`), 24 | UNIQUE KEY `url` (`url`) 25 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1 AUTO_INCREMENT=1 ; 26 | 27 | CREATE INDEX crawl_tag ON urls (crawl_tag); 28 | -------------------------------------------------------------------------------- /export.php: -------------------------------------------------------------------------------- 1 | 65,000 URLs) Excel cannot open the file, however Access can 5 | * 6 | * @package Crawler 7 | */ 8 | 9 | /** 10 | * Include necessary files 11 | */ 12 | include('config.php'); 13 | include('includes/functions.php'); 14 | include('includes/mysql_functions.php'); 15 | 16 | /** 17 | * Set the content headers 18 | */ 19 | header('Content-type: application/CSV'); 20 | header("Content-Disposition: attachment; filename=export.csv"); 21 | 22 | /** 23 | * SQL query to generate our results 24 | */ 25 | $pages = mysql_query("SELECT url, title, clicks, http_code, crawl_tag, size, type, modified, (SELECT count(*) FROM links WHERE `to` = urls.ID) as incoming, (SELECT count(*) FROM links WHERE `to` = 
urls.ID) as outgoing from urls"); 26 | 27 | /** 28 | * Count the number of pages in our dataset 29 | */ 30 | $count = mysql_num_rows($pages); 31 | 32 | /** 33 | * Get array of fields by parsing keys of first row array 34 | * 35 | * NOTE TO SELF: There is a better way to do this 36 | */ 37 | $fields = array_keys(mysql_fetch_assoc($pages)); 38 | 39 | /** 40 | * Print the header row and a new line charecter 41 | */ 42 | foreach ($fields as $field) { 43 | echo "$field\t"; 44 | } 45 | echo "\n"; 46 | 47 | /** 48 | * When we looped through to grab the field names, we moevd the internal pointer. 49 | * Reset internal pointer so our loop includes the first row 50 | */ 51 | mysql_data_seek($pages,0); 52 | 53 | 54 | /** 55 | * Loop through the rows (pages) 56 | */ 57 | for ($i=0; $i < $count; $i++) { 58 | 59 | /** 60 | * Fetch the row as an associative array 61 | */ 62 | $page = mysql_fetch_assoc($pages); 63 | 64 | /** 65 | * Loop through each field within the row 66 | */ 67 | foreach ($page as $key=>$field) { 68 | 69 | /** 70 | * If it the 'size', or 'modified' field, make it human readible, otherwise just output 71 | */ 72 | switch($key) { 73 | case 'size': 74 | echo file_size($field); 75 | break; 76 | case 'modified': 77 | if (!is_null($field)) echo date('Y-m-d H:i:s',$field); 78 | break; 79 | default: 80 | echo $field; 81 | break; 82 | } //End switch 83 | echo "\t"; 84 | } //End Field 85 | echo "\n"; 86 | } //End Row 87 | 88 | ?> -------------------------------------------------------------------------------- /includes/cookie.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FCC/Crawler/cf4684e7637d4d133c64df32fac08604b4650a69/includes/cookie.txt -------------------------------------------------------------------------------- /includes/functions.php: -------------------------------------------------------------------------------- 1 | false, 24 | CURLOPT_COOKIEJAR => 'cookie.txt', 25 | 
CURLOPT_COOKIEFILE => 'cookie.txt', 26 | CURLOPT_USERAGENT => 'Mozilla/5.0 (FCC New Media Web Crawler)', 27 | CURLOPT_FOLLOWLOCATION => true, 28 | CURLOPT_RETURNTRANSFER => true, 29 | CURLOPT_FILETIME => true, 30 | CURLOPT_TIMEOUT => 15 31 | ); 32 | curl_setopt_array($ch, $options); 33 | $output['html'] = curl_exec($ch); 34 | $output['md5'] = md5($output['html']); 35 | $output['http_code'] = curl_getinfo($ch,CURLINFO_HTTP_CODE); 36 | $output['reported_size'] = curl_getinfo($ch,CURLINFO_CONTENT_LENGTH_DOWNLOAD); 37 | $output['actual_size'] = curl_getinfo($ch,CURLINFO_SIZE_DOWNLOAD); 38 | $output['type'] = curl_getinfo($ch,CURLINFO_CONTENT_TYPE); 39 | $output['modified'] = curl_getinfo($ch,CURLINFO_FILETIME); 40 | curl_close($ch); 41 | return $output; 42 | } 43 | 44 | /** 45 | * Function to parse page for title tags 46 | * 47 | * @params string $data HTML of page 48 | * @return string|bool title of page of null if not found 49 | */ 50 | function parse_title($data) { 51 | if (preg_match('#matched exclude pattern $pattern in ".urldecode($link)."
"; 150 | return TRUE; 151 | } 152 | } 153 | return FALSE; 154 | } 155 | 156 | /** 157 | * Checks to see if a given link is in fact a mailto: link 158 | * 159 | * @params string $link Link to check 160 | * @return bool true on mailto:, false on everything else 161 | */ 162 | function is_mailto($link) { 163 | if (stripos($link,'mailto:')===FALSE) return false; 164 | else return true; 165 | } 166 | 167 | /* 168 | * Data storage and retrieval functions 169 | */ 170 | 171 | /** 172 | * Adds a URL to the URLs table upon discovery in a link 173 | * 174 | * @params string $link URL to add 175 | * @params int $clicks number of clicks from initial page 176 | * @return bool true on sucess, false on fail 177 | */ 178 | function add_url($link,$clicks,$crawl_tag) { 179 | return mysql_insert('urls',array('url'=>urldecode($link),'clicks'=>$clicks,'crawl_tag'=>$crawl_tag)); 180 | } 181 | 182 | /** 183 | * Adds a link to the links table 184 | * 185 | * @params int $form ID of linking page 186 | * @params int $to ID of target page 187 | * @return int|bool LinkID on sucess, false on fail 188 | */ 189 | function add_link($from,$to) { 190 | if ($from == $to) return false; 191 | if (mysql_exists('links',array('from'=>$from,'to'=>$to))) return false; 192 | else return mysql_insert('links',array('from'=>$from,'to'=>$to)); 193 | } 194 | 195 | /** 196 | * Grab all links on a given page, optionally for a specific depth 197 | * 198 | * @params int $pageID pageID 199 | * @params int $click optionally the number of clicks from the homepage to restrict results 200 | * @return array Multidimensional array keyed by target pageID with page data 201 | */ 202 | function get_links($pageID,$click = '') { 203 | $links = mysql_array(mysql_select('links',array('from'=>$pageID)),FALSE); 204 | foreach ($links as $link) $output[$link['to']] = get_page($link['to']); 205 | return $output; 206 | } 207 | 208 | /** 209 | * Shorthand MySQL function to count links in or out of a given page 210 | * 211 | * @params 
int $pageID subject page 212 | * @params string $direction Direction to retrieve (either "to" or "from") 213 | * @return int Number of links 214 | */ 215 | function count_links($pageID,$direction) { 216 | $result = mysql_select('links',array($direction=>$pageID)); 217 | return mysql_num_rows($result); 218 | } 219 | 220 | /** 221 | * Shorthand MySQL function to get a particular page's row 222 | * 223 | * @params int $pageID target page 224 | * @return array Associative array of page data 225 | */ 226 | function get_page($pageID) { 227 | return mysql_row_array(mysql_select('urls',array('ID'=>$pageID))); 228 | } 229 | 230 | 231 | /** 232 | * Shorthand MySQL function to to get the first 100 uncrawled URLs 233 | * 234 | * @return array Associative array of uncrawled URLs & page data 235 | */ 236 | function uncrawled_urls($crawl_tag) { 237 | return mysql_array(mysql_query("SELECT * FROM `urls` WHERE `crawled` = '0' AND `crawl_tag`=\"$crawl_tag\" LIMIT 100")); 238 | } 239 | 240 | /** 241 | * Checks to see if a given URL is already in the pages table 242 | * 243 | * @params string $link URL to check 244 | * @return bool true if URL exists, false if not found 245 | */ 246 | function have_url($url,$crawl_tag) { 247 | $url = mysql_row_array(mysql_select('urls',array('url'=>urldecode($url)))); 248 | if (sizeof($url)==0) return false; 249 | else return $url['ID']; 250 | } 251 | 252 | /* Depreciated (I think) 253 | 254 | function count_slashes($url) { 255 | if (strlen($url)<7) return 0; 256 | return substr_count($url,'/',7); 257 | } 258 | 259 | function get_slashes($url) { 260 | if (preg_match_all('#/#',$url,$matches,PREG_OFFSET_CAPTURE,7)) return $matches[0]; 261 | else return array(); 262 | } 263 | */ 264 | 265 | /** 266 | * Converts a relative URL (/bar) to an absolute URL (http://www.foo.com/bar) 267 | * 268 | * Inspired from code available at http://nadeausoftware.com/node/79, 269 | * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php) 270 | 
* 271 | * @params string $baseUrl Directory of linking page 272 | * @params string $relativeURL URL to convert to absolute 273 | * @return string Absolute URL 274 | */ 275 | function url_to_absolute( $baseUrl, $relativeUrl ) { 276 | // If relative URL has a scheme, clean path and return. 277 | $r = split_url( $relativeUrl ); 278 | if ( $r === FALSE ) 279 | return FALSE; 280 | if ( !empty( $r['scheme'] ) ) 281 | { 282 | if ( !empty( $r['path'] ) && $r['path'][0] == '/' ) 283 | $r['path'] = url_remove_dot_segments( $r['path'] ); 284 | return join_url( $r ); 285 | } 286 | 287 | // Make sure the base URL is absolute. 288 | $b = split_url( $baseUrl ); 289 | if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) ) 290 | return FALSE; 291 | $r['scheme'] = $b['scheme']; 292 | 293 | // If relative URL has an authority, clean path and return. 294 | if ( isset( $r['host'] ) ) 295 | { 296 | if ( !empty( $r['path'] ) ) 297 | $r['path'] = url_remove_dot_segments( $r['path'] ); 298 | return join_url( $r ); 299 | } 300 | unset( $r['port'] ); 301 | unset( $r['user'] ); 302 | unset( $r['pass'] ); 303 | 304 | // Copy base authority. 305 | $r['host'] = $b['host']; 306 | if ( isset( $b['port'] ) ) $r['port'] = $b['port']; 307 | if ( isset( $b['user'] ) ) $r['user'] = $b['user']; 308 | if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass']; 309 | 310 | // If relative URL has no path, use base path 311 | if ( empty( $r['path'] ) ) 312 | { 313 | if ( !empty( $b['path'] ) ) 314 | $r['path'] = $b['path']; 315 | if ( !isset( $r['query'] ) && isset( $b['query'] ) ) 316 | $r['query'] = $b['query']; 317 | return join_url( $r ); 318 | } 319 | 320 | // If relative URL path doesn't start with /, merge with base path 321 | if ( $r['path'][0] != '/' ) 322 | { 323 | $base = mb_strrchr( $b['path'], '/', TRUE, 'UTF-8' ); 324 | if ( $base === FALSE ) $base = ''; 325 | $r['path'] = $base . '/' . 
$r['path']; 326 | } 327 | $r['path'] = url_remove_dot_segments( $r['path'] ); 328 | return join_url( $r ); 329 | } 330 | 331 | /** 332 | * Required function of URL to absolute 333 | * 334 | * Inspired from code available at http://nadeausoftware.com/node/79, 335 | * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php) 336 | * 337 | */ 338 | function url_remove_dot_segments( $path ) { 339 | // multi-byte character explode 340 | $inSegs = preg_split( '!/!u', $path ); 341 | $outSegs = array( ); 342 | foreach ( $inSegs as $seg ) 343 | { 344 | if ( $seg == '' || $seg == '.') 345 | continue; 346 | if ( $seg == '..' ) 347 | array_pop( $outSegs ); 348 | else 349 | array_push( $outSegs, $seg ); 350 | } 351 | $outPath = implode( '/', $outSegs ); 352 | if ( $path[0] == '/' ) 353 | $outPath = '/' . $outPath; 354 | // compare last multi-byte character against '/' 355 | if ( $outPath != '/' && 356 | (mb_strlen($path)-1) == mb_strrpos( $path, '/', 'UTF-8' ) ) 357 | $outPath .= '/'; 358 | return $outPath; 359 | } 360 | 361 | /** 362 | * Required function of URL to absolute 363 | * 364 | * Inspired from code available at http://nadeausoftware.com/node/79, 365 | * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php) 366 | * 367 | */ 368 | function split_url( $url, $decode=TRUE ) 369 | { 370 | $xunressub = 'a-zA-Z\d\-._~\!$&\'()*+,;='; 371 | $xpchar = $xunressub . ':@%'; 372 | 373 | $xscheme = '([a-zA-Z][a-zA-Z\d+-.]*)'; 374 | 375 | $xuserinfo = '(([' . $xunressub . '%]*)' . 376 | '(:([' . $xunressub . ':%]*))?)'; 377 | 378 | $xipv4 = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'; 379 | 380 | $xipv6 = '(\[([a-fA-F\d.:]+)\])'; 381 | 382 | $xhost_name = '([a-zA-Z\d-.%]+)'; 383 | 384 | $xhost = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')'; 385 | $xport = '(\d*)'; 386 | $xauthority = '((' . $xuserinfo . '@)?' . $xhost . 387 | '?(:' . $xport . ')?)'; 388 | 389 | $xslash_seg = '(/[' . $xpchar . 
']*)'; 390 | $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))'; 391 | $xpath_rel = '([' . $xpchar . ']+' . $xslash_seg . '*)'; 392 | $xpath_abs = '(/(' . $xpath_rel . ')?)'; 393 | $xapath = '(' . $xpath_authabs . '|' . $xpath_abs . 394 | '|' . $xpath_rel . ')'; 395 | 396 | $xqueryfrag = '([' . $xpchar . '/?' . ']*)'; 397 | 398 | $xurl = '^(' . $xscheme . ':)?' . $xapath . '?' . 399 | '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$'; 400 | 401 | 402 | // Split the URL into components. 403 | if ( !preg_match( '!' . $xurl . '!', $url, $m ) ) 404 | return FALSE; 405 | 406 | if ( !empty($m[2]) ) $parts['scheme'] = strtolower($m[2]); 407 | 408 | if ( !empty($m[7]) ) { 409 | if ( isset( $m[9] ) ) $parts['user'] = $m[9]; 410 | else $parts['user'] = ''; 411 | } 412 | if ( !empty($m[10]) ) $parts['pass'] = $m[11]; 413 | 414 | if ( !empty($m[13]) ) $h=$parts['host'] = $m[13]; 415 | else if ( !empty($m[14]) ) $parts['host'] = $m[14]; 416 | else if ( !empty($m[16]) ) $parts['host'] = $m[16]; 417 | else if ( !empty( $m[5] ) ) $parts['host'] = ''; 418 | if ( !empty($m[17]) ) $parts['port'] = $m[18]; 419 | 420 | if ( !empty($m[19]) ) $parts['path'] = $m[19]; 421 | else if ( !empty($m[21]) ) $parts['path'] = $m[21]; 422 | else if ( !empty($m[25]) ) $parts['path'] = $m[25]; 423 | 424 | if ( !empty($m[27]) ) $parts['query'] = $m[28]; 425 | if ( !empty($m[29]) ) $parts['fragment']= $m[30]; 426 | 427 | if ( !$decode ) 428 | return $parts; 429 | if ( !empty($parts['user']) ) 430 | $parts['user'] = rawurldecode( $parts['user'] ); 431 | if ( !empty($parts['pass']) ) 432 | $parts['pass'] = rawurldecode( $parts['pass'] ); 433 | if ( !empty($parts['path']) ) 434 | $parts['path'] = rawurldecode( $parts['path'] ); 435 | if ( isset($h) ) 436 | $parts['host'] = rawurldecode( $parts['host'] ); 437 | if ( !empty($parts['query']) ) 438 | $parts['query'] = rawurldecode( $parts['query'] ); 439 | if ( !empty($parts['fragment']) ) 440 | $parts['fragment'] = rawurldecode( 
$parts['fragment'] ); 441 | return $parts; 442 | } 443 | 444 | /** 445 | * Required function of URL to absolute 446 | * 447 | * Inspired from code available at http://nadeausoftware.com/node/79, 448 | * Code distributed under OSI BSD (http://www.opensource.org/licenses/bsd-license.php) 449 | * 450 | */ 451 | function join_url( $parts, $encode=TRUE ) 452 | { 453 | if ( $encode ) 454 | { 455 | if ( isset( $parts['user'] ) ) 456 | $parts['user'] = rawurlencode( $parts['user'] ); 457 | if ( isset( $parts['pass'] ) ) 458 | $parts['pass'] = rawurlencode( $parts['pass'] ); 459 | if ( isset( $parts['host'] ) && 460 | !preg_match( '!^(\[[\da-f.:]+\]])|([\da-f.:]+)$!ui', $parts['host'] ) ) 461 | $parts['host'] = rawurlencode( $parts['host'] ); 462 | if ( !empty( $parts['path'] ) ) 463 | $parts['path'] = preg_replace( '!%2F!ui', '/', 464 | rawurlencode( $parts['path'] ) ); 465 | if ( isset( $parts['query'] ) ) 466 | $parts['query'] = rawurlencode( $parts['query'] ); 467 | if ( isset( $parts['fragment'] ) ) 468 | $parts['fragment'] = rawurlencode( $parts['fragment'] ); 469 | } 470 | 471 | $url = ''; 472 | if ( !empty( $parts['scheme'] ) ) 473 | $url .= $parts['scheme'] . ':'; 474 | if ( isset( $parts['host'] ) ) 475 | { 476 | $url .= '//'; 477 | if ( isset( $parts['user'] ) ) 478 | { 479 | $url .= $parts['user']; 480 | if ( isset( $parts['pass'] ) ) 481 | $url .= ':' . $parts['pass']; 482 | $url .= '@'; 483 | } 484 | if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) ) 485 | $url .= '[' . $parts['host'] . ']'; // IPv6 486 | else 487 | $url .= $parts['host']; // IPv4 or name 488 | if ( isset( $parts['port'] ) ) 489 | $url .= ':' . $parts['port']; 490 | if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' ) 491 | $url .= '/'; 492 | } 493 | if ( !empty( $parts['path'] ) ) 494 | $url .= $parts['path']; 495 | if ( isset( $parts['query'] ) ) 496 | $url .= '?' . $parts['query']; 497 | if ( isset( $parts['fragment'] ) ) 498 | $url .= '#' . 
$parts['fragment']; 499 | return $url; 500 | } 501 | 502 | /** 503 | * Returns filesize in human readable terms 504 | * 505 | * Inspired by code available at http://stackoverflow.com/questions/1222245/calculating-script-memory-usages-in-php 506 | * Code distributed under CC-Wiki License (http://creativecommons.org/licenses/by-sa/2.5/) 507 | * 508 | * @params int $size filesize in bytes 509 | */ 510 | function file_size($size) { 511 | $filesizename = array(" Bytes", " KB", " MB", " GB", " TB", " PB", " EB", " ZB", " YB"); 512 | return $size ? round($size/pow(1024, ($i = floor(log($size, 1024)))), 2) . $filesizename[$i] : '0 Bytes'; 513 | } 514 | 515 | ?> -------------------------------------------------------------------------------- /includes/mysql_functions.php: -------------------------------------------------------------------------------- 1 | 10 | * $db=mysql_connect (MYSQL_SERVER, MYSQL_USER, MYSQL_PASSWORD) or die ('I cannot connect to the database because: ' . mysql_error()); 11 | * mysql_select_db (MYSQL_DATABASE); 12 | * 13 | * 14 | * Changes: 1.1 Added option bool to mysql_array to toggle assoc., 1.2 switch mysql_real_escape_string for addslashes 15 | * 16 | * @author Benjamin J. Balter
39 | * array(
40 | * [1] => array(
41 | * ['ID'] => 1,
42 | * ['Name'] => "Kevin",
43 | * ['Position'] => '1B'
44 | * ),
45 | * [2] => array(
46 | * ['ID'] => 2,
47 | * ['Name'] => "Tom",
48 | * ['Position'] => 'LF'
49 | * ),
50 | *
51 | * [3] => array(
52 | * ['ID'] => 3,
53 | * ['Name'] => "Sally",
54 | * ['Position'] => 'SS'
55 | * )
56 | * )
57 | *
58 | * @param resource $result MySQL resource object (either output of mysql_query($sql) or mysql_select('table',$query))
59 | * @param bool $assoc makes Associate array optional (added 1.1)
60 | * @return array Multi-dimensional Associative array keyed to first field in table, returns empty array if no results
61 | *
62 | */
function mysql_array($result,$assoc = TRUE) {

    // A failed query hands us FALSE rather than a result resource. Return the
    // documented "empty array" straight away instead of passing a non-resource to
    // mysql_field_name()/mysql_fetch_assoc(); mysql_row_array() guards the same way.
    if (!$result) return array();

    //Start with an empty results set
    $results = array();

    if ($assoc) {
        //Grab the first fieldname to key the array with
        $first = mysql_field_name($result,0);

        //Loop through each row and build an assoc. array keyed by the first field.
        //Array union (+=) keeps the existing entry, so if two rows share a key the first row wins.
        while ($row = mysql_fetch_assoc($result)) $results += array($row[$first] => $row);
    } else {
        //Loop through each row and build a plain, sequentially indexed array
        while ($row = mysql_fetch_assoc($result)) $results[] = $row;
    }

    //Strip slashes from every value and return
    return stripslashes_deep($results);
}
82 |
/**
 * Fetches a single MySQL result row as an associative array.
 *
 * Works like mysql_fetch_assoc(), except that the row's values are passed through
 * stripslashes_deep() and an empty array is returned when the result is falsy or holds no rows.
 *
 * @param resource $result MySQL resource object (either output of mysql_query($sql) or mysql_select('table',$query))
 * @return array Associative array of the row, or an empty array if there is nothing to return
 * @package mysql_functions
 */
function mysql_row_array($result) {

    // Nothing to fetch: either the result is not valid or the query matched zero rows
    if (!$result || mysql_num_rows($result) == 0) {
        return array();
    }

    // Fetch the row, then strip slashes from each of its values
    $row = mysql_fetch_assoc($result);
    return stripslashes_deep($row);
}
103 |
/**
 * Generates an INSERT query, escapes the values, and inserts a row into the database.
 *
 * Example Usage
 *
 *     $data = array( 'Name'=>'Joan',
 *                    'Position'=>'2B'
 *     );
 *     mysql_insert('players',$data);
 *
 * Values are escaped with mysql_real_escape_string(); the table and field names are
 * used as given and must therefore come from trusted code.
 *
 * NOTE(review): per the PHP manual, mysql_insert_id() yields 0 when the insert did not
 * generate an AUTO_INCREMENT value, so callers should test the result with === false.
 *
 * @param string $table Name of table to operate on
 * @param array $data Associative array of data fields and values
 * @return int|bool ID reported by mysql_insert_id() on success, false if $data is empty or the query fails
 * @package mysql_functions
 */

function mysql_insert($table, $data) {

    // With no fields the generated statement is not valid SQL, so fail without querying
    if (!is_array($data) || sizeof($data) == 0) return false;

    //Build the field list and the matching list of escaped, quoted values
    $fields = array();
    $values = array();
    foreach ($data as $field => $value) {
        $fields[] = "`$field`";
        $values[] = "'" . mysql_real_escape_string($value) . "'";
    }

    //Assemble the query
    $sql = "INSERT INTO `$table` (" . implode(", ", $fields) . ") VALUES (" . implode(", ", $values) . ")";

    // Run query and return either ID or false (error)
    if (mysql_query($sql)) return mysql_insert_id();
    return false;
}
136 |
/**
 * Generates an UPDATE query, escapes the values, and updates matching rows in the database.
 *
 * Example Usage
 *
 *     //Updates Tom's row and moves him to Right Field
 *     $data = array( 'Position' => 'RF');
 *     $query = array( 'Name' => 'Tom' );
 *     mysql_update('players', $data, $query);
 *
 *     ----
 *
 *     //Updates all players named 'Tom' or in Left Field and moves them to Right Field
 *     $data = array( 'Position' => 'RF');
 *     $query = array( 'Name' => 'Tom', 'Position' => 'LF' );
 *     mysql_update('players', $data, $query, "OR");
 *
 * Both the new values and the WHERE values are escaped with mysql_real_escape_string().
 * An empty $data or $query is rejected: neither can produce a valid statement, and an
 * UPDATE without conditions is never issued.
 *
 * @param string $table Name of table to operate on
 * @param array $data Associative array of data fields and values
 * @param array $query Associative array of query fields and values
 * @param string $connector (Optional) connector for query ('AND' or 'OR')
 * @return bool true on success, false if an argument is empty or the query fails
 * @package mysql_functions
 */

function mysql_update($table, $data, $query, $connector = "AND") {

    // Without fields to set or conditions to match no valid UPDATE can be built
    if (!is_array($data) || sizeof($data) == 0) return false;
    if (!is_array($query) || sizeof($query) == 0) return false;

    //Build the SET list
    $assignments = array();
    foreach ($data as $field => $value) {
        $assignments[] = "`$field` = '" . mysql_real_escape_string($value) . "'";
    }

    //Build the WHERE conditions; these values are escaped too so they cannot break out of their quotes
    $conditions = array();
    foreach ($query as $field => $value) {
        $conditions[] = "`$field` = '" . mysql_real_escape_string($value) . "'";
    }

    //Format the SQL query
    $sql = "UPDATE `$table` SET " . implode(", ", $assignments) . " WHERE " . implode(" $connector ", $conditions);

    //return a bool with the query's result
    if (mysql_query($sql)) return true;
    return false;
}
/**
 * Builds a DELETE query, escapes the values, and removes matching rows from the database.
 *
 * EXAMPLE USAGE
 *
 *     $query = array('ID'=>'3');
 *     mysql_remove('players',$query);
 *
 * An empty $query (the default) is rejected and nothing is sent to the server: it cannot
 * produce a valid WHERE clause, and this helper never issues an unconditional DELETE.
 *
 * @param string $table Name of table to operate on
 * @param array $query Associative array of query field names and values
 * @param string $connector (Optional) query connector ('AND' or 'OR')
 * @return bool true on success, false if $query is empty or the query fails
 * @package mysql_functions
 *
 */
function mysql_remove($table, $query=array(), $connector = "AND") {

    // No conditions means no valid WHERE clause; refuse rather than send malformed SQL
    if (!is_array($query) || sizeof($query) == 0) return false;

    //Build the escaped WHERE conditions
    $conditions = array();
    foreach ($query as $field => $value) {
        $conditions[] = "`$field` = '" . mysql_real_escape_string($value) . "'";
    }

    //Build the SQL Query
    $sql = "DELETE FROM `$table` WHERE " . implode(" $connector ", $conditions);

    //return a bool with the query's result
    if (mysql_query($sql)) return true;
    return false;
}
207 |
/**
 * Builds a SELECT * query, escapes the values, and returns the MySQL result of running it.
 *
 * Typically used in conjunction with mysql_array or mysql_row_array to handle simple MySQL queries
 *
 * For example, to return an entire table:
 *
 *     mysql_select('Players');
 *
 * Or to return a select set of results:
 *
 *     $query = array('Name'=>'Tom');
 *     mysql_select('Players',$query);
 *
 * If MySQL reports an error after the query runs, the error text and the SQL are echoed.
 *
 * @param string $table Name of table to operate on
 * @param array $query Associative array of query field names and values
 * @param string $connector (Optional) query connector ('AND' or 'OR')
 * @return resource|bool Whatever mysql_query() returned for the generated statement
 * @package mysql_functions
 *
 */
function mysql_select($table, $query=array(), $connector = "AND") {

    //Every query starts out selecting the whole table
    $sql = "SELECT * FROM `$table` ";

    //Only add a WHERE clause when there are conditions to apply
    if (count($query) > 0) {

        //One escaped `field` = 'value' comparison per entry
        $conditions = array();
        foreach ($query as $field => $value) {
            $conditions[] = "`$field` = '" . mysql_real_escape_string($value) . "'";
        }

        //Join the comparisons with the connector (a single trailing space follows the last one)
        $sql .= "WHERE " . implode(" $connector ", $conditions) . " ";
    }

    //Run the query
    $result = mysql_query($sql);

    //Output the error and the offending SQL, if applicable
    $error = mysql_error();
    if ($error) echo "" . $error . ": $sql
";

    //Return the result (as a MySQL resource)
    return $result;
}
255 |
/**
 * Runs a simple MySQL SELECT query and reports whether any rows were found.
 *
 * Used to verify data (such as a username or password) when the existence of the fields (rather than their value) is what is sought
 *
 * @param string $table Name of table to operate on
 * @param array $query Associative array of query field names and values
 * @param string $connector (Optional) query connector ('AND' or 'OR')
 * @return bool true if one or more results were found, false if none were found or the query failed
 * @package mysql_functions
 *
 */
function mysql_exists($table,$query=array(),$connector="AND") {
    $result = mysql_select($table,$query,$connector);

    // mysql_select() hands back mysql_query()'s return value unchanged; when the query
    // failed that is not a resource, so do not pass it to mysql_num_rows()
    if (!$result) return false;

    return mysql_num_rows($result) != 0;
}
273 |
/**
 * Removes slashes from a value or from every value of a (possibly nested) array.
 *
 * Arrays are walked recursively with their keys kept as they are; anything that is not
 * an array is passed to stripslashes(). Used by mysql_array and mysql_row_array on the rows they return.
 *
 * @param array $value Array to be processed, may be single or multi-dimensional (a plain value is accepted too)
 * @return array The same structure with stripslashes() applied to every non-array value
 * @package mysql_functions
 *
 */
function stripslashes_deep($value) {

    // Leaf value: strip it directly
    if (!is_array($value)) {
        return stripslashes($value);
    }

    // Array: rebuild it key by key, recursing into each element
    $stripped = array();
    foreach ($value as $key => $item) {
        $stripped[$key] = stripslashes_deep($item);
    }
    return $stripped;
}
291 |
292 |
293 | ?>
--------------------------------------------------------------------------------
/query.php:
--------------------------------------------------------------------------------
1 | $value) $sql .= " `$field` = '". urldecode($value) . "'";
13 | } else {
14 | $sql .= " WHERE `crawl_tag` = '$crawl_tag'";
15 | }
16 |
17 | $sql .= " LIMIT 300";
18 |
19 | $pages = mysql_query($sql);
20 |
21 |
22 | /**
23 | * Count the number of pages in our dataset
24 | */
25 | $count = mysql_num_rows($pages);
26 |
27 | /**
28 | * Get array of fields by parsing keys of first row array
29 | *
30 | * NOTE TO SELF: There is a better way to do this
31 | */
32 | $fields = array_keys(mysql_fetch_assoc($pages));
33 |
34 | ?>\r\n";
39 | foreach ($fields as $field) {
40 | echo "\t\t$field \r\n";
41 | }
42 | echo "\t\r\n";
43 |
44 | /**
45 | * When we looped through to grab the field names, we moevd the internal pointer.
46 | * Reset internal pointer so our loop includes the first row
47 | */
48 | mysql_data_seek($pages,0);
49 |
50 |
51 | /**
52 | * Loop through the rows (pages)
53 | */
54 | for ($i=0; $i < $count; $i++) {
55 | echo "\t\r\n";
56 | /**
57 | * Fetch the row as an associative array
58 | */
59 | $page = mysql_fetch_assoc($pages);
60 |
61 | /**
62 | * Loop through each field within the row
63 | */
64 | foreach ($page as $key=>$field) {
65 | echo "\t\t";
66 | /**
67 | * If it the 'size', or 'modified' field, make it human readible, otherwise just output
68 | */
69 | switch($key) {
70 | case 'size':
71 | echo file_size($field);
72 | break;
73 | case 'modified':
74 | if ($field != '') echo date('Y-m-d H:i:s',$field);
75 | break;
76 | default:
77 | echo $field;
78 | break;
79 | } //End switch
80 | echo " \r\n";
81 | } //End Field
82 | echo "\t \r\n";
83 | } //End Row
84 | ?>
85 |
--------------------------------------------------------------------------------
/sitemap.php:
--------------------------------------------------------------------------------
1 |
2 | '?>
3 |
4 | ', '?' ),
12 | array ( '&' , '"', ''' , '<' , '>', ''' ),
13 | $string
14 | );
15 | }
16 |
17 | $pages = mysql_array(mysql_query("SELECT url, modified from urls"));
18 | foreach ($pages as $page) { ?>
19 |
20 |
21 | 0) { ?>
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/stats.php:
--------------------------------------------------------------------------------
1 |
8 |
9 |
10 |
24 |
25 | Crawler Statistics
26 |
63 |
64 |
65 |
66 | Pages Crawled
67 |
68 |
75 |
83 |
91 |
99 |
100 |
101 |
102 | Estimated Percent Complete
103 |
104 | %
105 |
106 |
107 |
108 |
109 |
110 | Pages Indexed
111 |
112 |
113 | # of Clicks
114 | Count
115 | Cumulative Count
116 |
117 | 0 )';
119 | $clicks = mysql_array(mysql_query($sql));
120 | $cumulative = 0;
121 | foreach ($clicks as $click) { $cumulative += $click['NumOccurrences']; ?>
122 |
123 | '>
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 | Response Codes
134 |
135 | 0 )';
137 | $codes = mysql_array(mysql_query($sql));
138 | ?>
139 |
140 |
141 | Response Code
142 | Count
143 |
144 |
145 |
146 |
147 | '>
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 | File Types
160 |
161 | 0 ) ORDER BY NumOccurrences DESC';
163 | $types = mysql_array(mysql_query($sql));
164 | ?>
165 |
166 |
167 | File Type
168 | Count
169 |
170 |
171 |
172 |
173 | '>
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 | File Sizes
185 |
186 |
192 |
193 |
194 | Largest File:
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 | Average File Size:
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 | Current Domains:
214 | Current Crawl Tag:
215 |
219 | Last Page Crawled: ({$last['url']}";?>)
220 |
221 |
222 |
223 |
--------------------------------------------------------------------------------