├── Controllers └── ScrapeController.php ├── LICENSE.md ├── README.md ├── Util ├── CurlUtil.php ├── Page.php └── Scraper.php ├── autoloader.php ├── index.php └── test ├── index.html ├── page1.html ├── page2.html ├── page3.html ├── page4.html └── page5.html /Controllers/ScrapeController.php: -------------------------------------------------------------------------------- 1 | cliRequest() : $this->webRequest(); 18 | } 19 | 20 | /** 21 | * handle request coming from the cli 22 | * @return void 23 | */ 24 | private function cliRequest(): void 25 | { 26 | //validate arguments / show usage 27 | global $argv; 28 | if (!in_array(count($argv), [3, 4])) { //[0] is script name 29 | echo "Usage: php scrape.php url pageLimit [json: true|false]\n"; 30 | echo "Eg: php scrape.php http://insecure.com 5\n"; 31 | echo "Eg: php scrape.php https://secure.com 5 true\n"; 32 | die('Invalid arguments.'); 33 | } 34 | 35 | //parse arguments 36 | $url = CurlUtil::prependScheme($argv[1]); 37 | $pageLimit = intval($argv[2]); 38 | 39 | //trigger scrape 40 | $result = $this->scrapeTarget($url, $pageLimit); 41 | 42 | //output 43 | if (empty($argv[3])) { //regular output 44 | print_r($result); 45 | } else { //json output 46 | echo json_encode($result, JSON_PRETTY_PRINT); 47 | } 48 | } 49 | 50 | /** 51 | * handle request coming from the web 52 | * @return void 53 | */ 54 | private function webRequest(): void 55 | { 56 | //deal w/ CORS 57 | header("Access-Control-Allow-Origin: *"); 58 | 59 | //validate arguments / show errors 60 | if (empty($_REQUEST['url'])) { 61 | die('You must provide a "url" argument (GET/POST)'); 62 | } 63 | if (empty($_REQUEST['limit'])) { 64 | die('You must provide a maximum page "limit" argument (GET/POST)'); 65 | } 66 | 67 | //parse arguments 68 | $url = CurlUtil::prependScheme($_REQUEST['url']); 69 | $pageLimit = intval($_REQUEST['limit']); 70 | 71 | //trigger scrape 72 | $result = $this->scrapeTarget($url, $pageLimit); 73 | 74 | //return response as JSON 75 | 
header('Content-type: application/json; charset=utf-8'); 76 | echo json_encode($result, JSON_PRETTY_PRINT); 77 | } 78 | 79 | 80 | /** 81 | * handle the scrape operation 82 | * @param string $url target to scrape 83 | * @param int $pageLimit max # of pages to scrape 84 | * @return array 85 | */ 86 | private function scrapeTarget(string $url, int $pageLimit): array 87 | { 88 | //init our scraper 89 | $scraper = new Scraper($url); 90 | 91 | //will contain [title, url, status_code, load_time, word_count] for client-side table 92 | $pageListing = []; 93 | 94 | //iterate scraped pages & track values 95 | $pages = $scraper->scrape($pageLimit); 96 | foreach ($pages as $page) { 97 | $pageListing[] = [ 98 | 'title' => $page->getTitle(), 99 | 'url' => $page->getUrl(), 100 | 'status_code' => $page->getStatusCode(), 101 | 'load_time' => $page->getLoadTime(), 102 | 'word_count' => $page->getWordCount() 103 | ]; 104 | } 105 | 106 | //format & return output, including pageListing & links 107 | $output = [ 108 | 'pages' => $pageListing, 109 | 'links' => [ 110 | 'internal' => $scraper->getResource('internal_links'), 111 | 'external' => $scraper->getResource('external_links') 112 | ] 113 | ]; 114 | return $output; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, Kevin Dawe 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 4 | 5 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 6 | 7 | Source: http://opensource.org/licenses/ISC -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # php-curl-scraper 2 | 3 | A simple web scraper written using PHP + cURL. Lets you crawl a website and extract info. Can be used via the CLI or web. 4 | 5 | This is essentially an MVP/starting point that could be used for a specific objective, given a bit of tweaking/expansion. 6 | 7 | ## Usage 8 | 9 | ### Generic 10 | 11 | * **url**: URL to start scraping from; http:// is assumed if no scheme is provided 12 | * **limit**: maximum number of pages to scrape before stopping 13 | 14 | ### CLI 15 | 16 | * php scrape.php url pageLimit [json: true|false] 17 | * php scrape.php http://insecure.com 5 18 | * php scrape.php https://secure.com 5 true 19 | 20 | Outputs via print_r by default; if a non-empty 3rd argument is given, it prints JSON instead (the script only checks that the argument is non-empty, so passing `false` also selects JSON). 21 | 22 | ### WEB 23 | 24 | * .../scrape.php?url=example.com&limit=5 25 | 26 | Outputs JSON. 
27 | 28 | ## Caveats 29 | 30 | * Link parsing ignores query params and hashes 31 | * "/" and "/index.ext" are currently treated as two different pages 32 | * Error handling is pretty minimal 33 | * *Page* class can be expanded to extract additional info from scraped pages, then accessed via the *ScrapeController* 34 | * You can set a specific user agent string in *CurlUtil* 35 | 36 | ## Testing 37 | 38 | test/ folder contains a set of basic pages to crawl 39 | 40 | ## License 41 | 42 | ISC -------------------------------------------------------------------------------- /Util/CurlUtil.php: -------------------------------------------------------------------------------- 1 | ..., info => ...] 11 | */ 12 | public static function requestPage(string $url): array 13 | { 14 | //create handler 15 | $ch = curl_init($url); 16 | 17 | //settings 18 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); 19 | curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30); 20 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); 21 | //spoof user agent (ie. as a browser), if desired 22 | //curl_setopt($ch, CURLOPT_USERAGENT, 'UA GOES HERE'); 23 | 24 | //send request 25 | $content = curl_exec($ch); 26 | 27 | //error handling? 28 | if ($content === false) { 29 | //TODO: be more graceful 30 | print_r(curl_error($ch)); 31 | die(' --- CURL ERROR'); 32 | } 33 | 34 | //get metadata like response code, load time, etc. 
35 | $info = curl_getinfo($ch); 36 | 37 | return [ 38 | 'content' => $content, 39 | 'info' => $info 40 | ]; 41 | } 42 | 43 | /** 44 | * Turn a URL into an absolute URL 45 | * @param string $url 46 | * @param string $relativeTo 47 | * @return string 48 | */ 49 | public static function absoluteUrl(string $url, string $relativeTo): string 50 | { 51 | //parse urls 52 | $url = parse_url($url); 53 | $relativeTo = parse_url($relativeTo); 54 | 55 | //fill in missing data about scheme/host re: relative urls 56 | if (empty($url['scheme'])) { //HTTP or HTTPS 57 | $url['scheme'] = $relativeTo['scheme']; 58 | } 59 | if (empty($url['host'])) { //relative link 60 | $url['host'] = $relativeTo['host']; 61 | 62 | //account for no leading slash, subfolders, etc. 63 | if (substr($url['path'], 0, 1) !== '/') { 64 | $parts = explode('/', $relativeTo['path']); 65 | if (count($parts) > 1) { 66 | array_pop($parts); 67 | } 68 | $base = implode('/', $parts); 69 | $url['path'] = $base . '/' . $url['path']; 70 | } 71 | } 72 | if (empty($url['path'])) { //page root 73 | $url['path'] = ''; 74 | } 75 | 76 | //TODO: handling for query params / hashes 77 | 78 | //format url and account for leading/trailing slashes 79 | $absolute = $url['host'] . '/' . trim($url['path'], '/'); 80 | return $url['scheme'] . '://' . $absolute; 81 | } 82 | 83 | /** 84 | * prepend http:// scheme if none is provided 85 | * @param string $url 86 | * @return string 87 | */ 88 | public static function prependScheme(string $url): string 89 | { 90 | if (!array_key_exists('scheme', parse_url($url))) { 91 | $url = 'http://' . 
$url; 92 | } 93 | return $url; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /Util/Page.php: -------------------------------------------------------------------------------- 1 | ..., info => ...], @see CurlUtil::requestPage() 13 | */ 14 | public function __construct(array $curlResult) 15 | { 16 | $this->DOM = new \DOMDocument; 17 | //suppressing warnings w/ '@' prefix 18 | //see: https://stackoverflow.com/questions/6090667/php-domdocument-errors-warnings-on-html5-tags 19 | @$this->DOM->loadHTML($curlResult['content']); 20 | $this->info = $curlResult['info']; 21 | } 22 | 23 | /** 24 | * return links (excluding blanks, anchors, and JS) from the page 25 | * @return array 26 | */ 27 | public function getLinks(): array 28 | { 29 | $links = []; 30 | $tags = $this->DOM->getElementsByTagName('a'); 31 | 32 | foreach ($tags as $tag) { 33 | $href = trim($tag->getAttribute('href')); 34 | 35 | //skip anchors, empty links, and javascript segments 36 | if (empty($href) || substr($href, 0, 1) === '#' || substr(strtolower($href), 0, 11) === 'javascript:') { 37 | continue; 38 | } 39 | 40 | $links[] = $href; 41 | } 42 | 43 | return $links; 44 | } 45 | 46 | /** 47 | * count the number of words in the page, for a given list of tags 48 | * @return int 49 | */ 50 | public function getWordCount(array $textTags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']): int 51 | { 52 | $words = 0; 53 | 54 | //find all our instances of the above target tags and get their word count 55 | foreach ($textTags as $textTag) { 56 | $tags = $this->DOM->getElementsByTagName($textTag); 57 | foreach ($tags as $tag) { 58 | $words += str_word_count($tag->textContent); 59 | } 60 | } 61 | 62 | return $words; 63 | } 64 | 65 | /** 66 | * return Page's title 67 | * @return string 68 | */ 69 | public function getTitle(): string 70 | { 71 | $title = $this->DOM->getElementsByTagName('title'); 72 | if ($title->length > 0) { 73 | return $title->item(0)->textContent; 74 | 
} 75 | return ''; 76 | } 77 | 78 | //various get... functions for metadata from the curl info object 79 | public function getStatusCode(): int 80 | { 81 | return $this->info['http_code']; 82 | } 83 | public function getLoadTime(): float 84 | { 85 | return round($this->info['total_time'], 2); //stored in seconds 86 | } 87 | public function getUrl(): string 88 | { 89 | return $this->info['url']; 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /Util/Scraper.php: -------------------------------------------------------------------------------- 1 | targetUrl = $targetUrl; 22 | } 23 | 24 | /** 25 | * Scrape up to a certain number of pages from the target 26 | * @param int $pageLimit 27 | * @return array 28 | */ 29 | public function scrape(int $pageLimit = 1): array 30 | { 31 | //init vars 32 | $pages = []; 33 | 34 | //loop our scraper until we hit $pageLimit, or run out of unique URLs to visit 35 | for ($i = 0; $i < $pageLimit; $i++) { 36 | //figure out our next page to scrape, being mindful of urls we've already visited 37 | $url = $this->getNextUrl(); 38 | 39 | //abort loop if there are no other internal links to scrape 40 | if (!$url) { 41 | break; 42 | } 43 | 44 | //scrape our page 45 | $pages[] = new Page(CurlUtil::requestPage($url)); 46 | 47 | //mark link as visited so we don't re-visit going forward 48 | $this->visited[rtrim($url, '/')] = true; 49 | 50 | //get internal/external links from the page 51 | //also lets future getNextUrl() calls work properly 52 | $this->siftLinks($pages[$i]->getLinks(), $url); 53 | } 54 | 55 | return $pages; 56 | } 57 | 58 | /** 59 | * get the next URL to scrape, being mindful to not re-visit URLs 60 | * will hit the target URL if it hasn't been visited yet 61 | * @return string|false 62 | */ 63 | private function getNextUrl() 64 | { 65 | //we haven't visited anywhere yet, so hit the target URL 66 | if (empty($this->visited)) { 67 | return $this->targetUrl; 68 | } 69 | 70 | //subsequent 
crawls 71 | foreach ($this->internalLinks as $link) { 72 | //TODO: compensate for ".../" vs ".../index.ext" being equivalent, it's a bit awkward 73 | if (empty($this->visited[rtrim($link, '/')])) { 74 | return $link; 75 | } 76 | } 77 | 78 | return false; 79 | } 80 | 81 | /** 82 | * sift links as internal/external 83 | * @param array $links 84 | * @param string $relativeTo 85 | */ 86 | private function siftLinks(array $links, string $relativeTo): void 87 | { 88 | $internal = []; 89 | $external = []; 90 | 91 | foreach ($links as $url) { 92 | //convert to absolute urls 93 | $url = CurlUtil::absoluteUrl($url, $relativeTo); 94 | 95 | //sift into internal/external links 96 | if (parse_url($url)['host'] === parse_url($this->targetUrl)['host']) { 97 | $internal[] = $url; 98 | } else { 99 | $external[] = $url; 100 | } 101 | } 102 | 103 | //record our findings, being mindful to not create duplicates 104 | //this could likely be optimized w/ hashes, we'll leave it explicit for now 105 | $this->internalLinks = array_unique(array_merge($this->internalLinks, $internal)); 106 | $this->externalLinks = array_unique(array_merge($this->externalLinks, $external)); 107 | } 108 | 109 | /** 110 | * return a sorted resource from scraping results 111 | * @param string $resource internal_links or external_links 112 | * @return array 113 | */ 114 | public function getResource(string $resource): array 115 | { 116 | switch ($resource) { 117 | case 'internal_links': 118 | $resource = $this->internalLinks; 119 | break; 120 | case 'external_links': 121 | $resource = $this->externalLinks; 122 | break; 123 | default: 124 | die("Invalid resource requested: $resource"); 125 | } 126 | 127 | sort($resource); 128 | return $resource; 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /autoloader.php: -------------------------------------------------------------------------------- 1 | ', $offset = $start); 9 | 10 | $length = $end - $start; 11 | 12 | 
$htmlSection = substr($html, $start, $length); 13 | 14 | preg_match_all('@
  • (.+)
  • @', $htmlSection, $matches); 15 | $listItems = $matches[1]; 16 | 17 | echo "Who was born on December 10th\n"; 18 | echo "=============================\n\n"; 19 | 20 | foreach ($listItems as $item) { 21 | preg_match('@(\d+)@', $item, $yearMatch); 22 | $year = (int) $yearMatch[0]; 23 | 24 | preg_match('@;\s]*>(.*?)@i', $item, $nameMatch); 25 | $name = $nameMatch[1]; 26 | 27 | echo "{$name} was born in {$year}\n"; 28 | } -------------------------------------------------------------------------------- /test/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Target Index 4 | 5 | 6 | 7 |

    Target Index

    8 | 9 |

    Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum et vehicula sapien. Sed non dignissim lorem, sit amet porta quam. Etiam dapibus diam id hendrerit molestie. In id nulla felis. Duis facilisis, dui et imperdiet volutpat, lectus urna eleifend leo, a aliquam risus magna eu nibh. Nulla sit amet sodales massa. Proin congue laoreet diam at sollicitudin. Ut facilisis mauris ut mauris varius euismod. Quisque ac lorem diam. Duis vestibulum, magna eget placerat blandit, nunc velit pulvinar nulla, accumsan pellentesque turpis leo a quam. Duis a felis ac libero suscipit egestas bibendum sit amet velit. Pellentesque a porttitor justo, eget egestas leo.

    10 |

    Donec faucibus urna vitae laoreet venenatis. Sed id aliquet elit. Duis placerat sit amet dolor quis auctor. Ut aliquet bibendum nisi. Nullam vulputate in metus vel condimentum. Mauris rutrum, libero nec aliquet facilisis, lacus ex rhoncus ligula, eget posuere ante ipsum a purus. Nulla accumsan est nec erat laoreet, eu rhoncus nisi malesuada. Maecenas condimentum ipsum mi, ut vulputate neque vehicula vel. Curabitur orci tortor, molestie vel velit nec, aliquet accumsan mauris.

    11 | 12 |

    Duplicated links

    13 |
      14 |
    1. P1 Instance 1
    2. 15 |
    3. P1 Instance 2
    4. 16 |
    5. P1 Instance 3
    6. 17 |
    7. P1 Instance 4
    8. 18 |
    9. P1 Instance 5
    10. 19 |
    20 | 21 |

    Other Links

    22 | 26 | 27 |

    External Links

    28 | 32 | 33 |

    Garbage

    34 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /test/page1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Page 1 4 | 5 | 6 | 7 |

    Page 1

    8 | 9 |

    This page should only be visited once. It also has a link page 2, which shouldn't be visited more than once (also referenced in the index). The images below also shouldn't be double-counted.

    10 | 11 | 12 | -------------------------------------------------------------------------------- /test/page2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Page 2 4 | 5 | 6 | 7 |

    Page 2

    8 | 9 |

    This is page 2, which should only be visited once.

    10 | 11 | 12 | -------------------------------------------------------------------------------- /test/page3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Page 3 4 | 5 | 6 | 7 |

    Page 3

    8 | 9 |

    This page exposes a link to page 4, which isn't listed anywhere else.

    10 | 11 | 12 | -------------------------------------------------------------------------------- /test/page4.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Page 4 4 | 5 | 6 | 7 |

    Page 4

    8 | 9 |

    This page should be visited via the link in P3, even though it isn't linked from the index. It links to page 5, which shouldn't be visited at a test depth of 5.

    10 | 11 | 12 | -------------------------------------------------------------------------------- /test/page5.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Page 5 4 | 5 | 6 | 7 |

    Page 5

    8 | 9 |

    This page should not be visited at a test depth of 5 or less.

    10 | 11 | 12 | --------------------------------------------------------------------------------