├── README.md ├── sitemap-config.php ├── sitemap-generator.php └── sitemap.xml /README.md: -------------------------------------------------------------------------------- 1 | ## Sitemap generator 2 | 3 | --- 4 | 5 | Object-based PHP script that generates an XML sitemap with the given config options. I made this script because I wanted to automate making a sitemap for Google indexing and because there were not a lot of open-source sitemap generators out there. 6 | 7 | Sitemap format: [http://www.sitemaps.org/protocol.html](http://www.sitemaps.org/protocol.html) 8 | 9 | ### Features 10 | 11 | Feel free to help me implement any of the missing features or to add extra features. 12 | 13 | - [x] Generate a sitemap for your website 14 | - [x] Multiple options for generating sitemaps 15 | - [ ] Option to only look through certain file types 16 | - [ ] Load client-side JavaScript content when crawling 17 | - [ ] Parse all relative link types (// , # , ?) and more 18 | 19 | ### Installation 20 | 21 | To install the script, download both `sitemap-config.php` and `sitemap-generator.php` and place them in the same directory of your project. 22 | 23 | ### Usage 24 | 25 | After installing the script, you can use it by including it in your own script: 26 | 27 | ```php 28 | include "/path/to/sitemap-generator.php"; 29 | ``` 30 | 31 | Then initialize the class by calling the constructor: 32 | 33 | ```php 34 | // Create an object of the generator class passing the config file 35 | $smg = new SitemapGenerator(include("sitemap-config.php")); 36 | // Run the generator 37 | $smg->GenerateSitemap(); 38 | ``` 39 | 40 | ### Config 41 | 42 | You can alter the generator's behaviour by changing the config values. 43 | 44 | ```php 45 | // Site to crawl and create a sitemap for. 46 | // https://www.your-domain-name.com/ or http://www.your-domain-name.com/ 47 | "SITE_URL" => "https://student-laptop.nl/", 48 | 49 | // Boolean for crawling external links. 
50 | // *Domain = https://www.student-laptop.nl* , *Link = https://www.google.com* 51 | "ALLOW_EXTERNAL_LINKS" => false, 52 | 53 | // Boolean for crawling element id links (hrefs that start with "#", for example "#contact"). 54 | // Such links will not be crawled when this option is set to false 55 | "ALLOW_ELEMENT_LINKS" => false, 56 | 57 | // If set, the crawler will only index the anchor tags with the given id. 58 | // If you wish to crawl all links, set the value to "" 59 | // When CRAWL_ANCHORS_WITH_ID is set to "internal-link", an anchor with id="internal-link" will be crawled 60 | // but an anchor with any other id (or no id at all) will not be crawled. 61 | "CRAWL_ANCHORS_WITH_ID" => "", 62 | 63 | // Array with absolute links or keywords for the pages to skip when crawling the given SITE_URL. 64 | // For example https://student-laptop.nl/info/laptops, or you can just input student-laptop.nl/info/ and it will not crawl anything in that directory 65 | // Try to be as specific as you can so you don't accidentally skip 300 pages 66 | "KEYWORDS_TO_SKIP" => array( 67 | "http://localhost/student-laptop/index", // I already have a href for root ("/") on my page so skip this page 68 | "/student-laptop/student-laptop.nl/", // Invalid link example 69 | ), 70 | 71 | // Location + filename where the sitemap will be saved. 
72 | "SAVE_LOC" => "sitemap.xml", 73 | 74 | // Static priority value for sitemap 75 | "PRIORITY" => 1, 76 | 77 | // Static update frequency 78 | "CHANGE_FREQUENCY" => "daily", 79 | 80 | // Date changed (today's date) 81 | "LAST_UPDATED" => date('Y-m-d'), 82 | ``` 83 | 84 | ### Output 85 | 86 | Example output when generating a sitemap using this script 87 | 88 | ```XML 89 | <?xml version="1.0" encoding="UTF-8"?> 90 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 91 | 92 | 93 | <url> 94 | <loc>https://student-laptop.nl/</loc> 95 | <lastmod>2021-03-10</lastmod> 96 | <changefreq>daily</changefreq> 97 | <priority>1</priority> 98 | </url> 99 | <url> 100 | <loc>https://student-laptop.nl/underConstruction</loc> 101 | <lastmod>2021-03-10</lastmod> 102 | <changefreq>daily</changefreq> 103 | <priority>1</priority> 104 | </url> 105 | <url> 106 | <loc>https://student-laptop.nl/article?article_id=1</loc> 107 | <lastmod>2021-03-10</lastmod> 108 | <changefreq>daily</changefreq> 109 | <priority>1</priority> 110 | </url> 111 | </urlset> 112 | ``` 113 | -------------------------------------------------------------------------------- /sitemap-config.php: -------------------------------------------------------------------------------- 1 | <?php 2 | 3 | return array( 4 | // Site to crawl and create a sitemap for. 5 | // https://www.your-domain-name.com/ or http://www.your-domain-name.com/ 6 | "SITE_URL" => "https://student-laptop.nl/", 7 | 8 | // Boolean for crawling external links. 9 | // *Domain = https://www.student-laptop.nl* , *Link = https://www.google.com* 10 | "ALLOW_EXTERNAL_LINKS" => false, 11 | 12 | // Boolean for crawling element id links (hrefs that start with "#", for example "#contact"). 13 | // Such links will not be crawled when this option is set to false 14 | "ALLOW_ELEMENT_LINKS" => false, 15 | 16 | // If set, the crawler will only index the anchor tags with the given id. 17 | // If you wish to crawl all links, set the value to "" 18 | // When CRAWL_ANCHORS_WITH_ID is set to "internal-link", an anchor with id="internal-link" will be crawled 19 | // but an anchor with any other id (or no id at all) will not be crawled. 20 | "CRAWL_ANCHORS_WITH_ID" => "", 21 | 22 | // Array with absolute links or keywords for the pages to skip when crawling the given SITE_URL. 
23 | // https://student-laptop.nl/info/laptops or you can just input student-laptop.nl/info/ and it will not crawl anything in that directory 24 | // Try to be as specific as you can so you dont skip 300 pages 25 | "KEYWORDS_TO_SKIP" => array(), 26 | 27 | // Location + filename where the sitemap will be saved. 28 | "SAVE_LOC" => "sitemap.xml", 29 | 30 | // Static priority value for sitemap 31 | "PRIORITY" => 1, 32 | 33 | // Static update frequency 34 | "CHANGE_FREQUENCY" => "daily", 35 | 36 | // Date changed (today's date) 37 | "LAST_UPDATED" => date('Y-m-d'), 38 | ); 39 | -------------------------------------------------------------------------------- /sitemap-generator.php: --------------------------------------------------------------------------------
<?php

/**
 * Crawls a website starting at SITE_URL and writes every page it reaches to an
 * XML sitemap (format: http://www.sitemaps.org/protocol.html).
 *
 * Usage:
 *   $smg = new SitemapGenerator(include("sitemap-config.php"));
 *   $smg->GenerateSitemap();
 *
 * NOTE(review): the class header, the property declarations and the XML string
 * literals of this file were lost in extraction; they are reconstructed here
 * from the README usage example and the sitemap protocol. Confirm against the
 * original file.
 *
 * Known limitations (unchanged from the original implementation):
 *  - Relative links without a leading "/" are resolved against the site root,
 *    not against the directory of the page they were found on.
 *  - When ALLOW_EXTERNAL_LINKS is true, relative links found on external pages
 *    are still resolved against SITE_URL.
 *  - The crawl is recursive, so memory use grows with the depth of the link chain.
 */
class SitemapGenerator
{
    /** @var array Configuration array as returned by sitemap-config.php. */
    private $config;

    /** @var array Visited URLs stored as keys (url => true); insertion order is the sitemap order. */
    private $scanned;

    /** @var string Scheme of SITE_URL ("http" or "https"). */
    private $site_scheme;

    /** @var string Host name of SITE_URL, used for the external-link check. */
    private $site_host;

    /** @var string "scheme://host[:port]" of SITE_URL, used to resolve relative links. */
    private $site_url_base;

    /**
     * @param array $conf Configuration array; see sitemap-config.php for the expected keys.
     *
     * @throws InvalidArgumentException When SITE_URL has no scheme or host.
     */
    public function __construct($conf)
    {
        $this->config = $conf;
        $this->scanned = [];

        // Parse SITE_URL once instead of once per property (and once per anchor later on).
        $parts = parse_url($this->config['SITE_URL']);
        if ($parts === false || !isset($parts['scheme'], $parts['host'])) {
            throw new InvalidArgumentException(
                'SITE_URL must be an absolute URL such as https://www.your-domain-name.com/'
            );
        }

        $this->site_scheme = $parts['scheme'];
        $this->site_host = $parts['host'];

        // Keep a non-default port (e.g. http://localhost:8080/), otherwise relative
        // links would be resolved against the wrong origin.
        $this->site_url_base = $parts['scheme'] . "://" . $parts['host']
            . (isset($parts['port']) ? ":" . $parts['port'] : "");

        // The output file is deliberately NOT opened here: opening it with mode "w"
        // would truncate an existing sitemap before the crawl has produced anything.
    }

    /**
     * Crawl the site starting at SITE_URL and write the sitemap to SAVE_LOC.
     *
     * @throws RuntimeException When the sitemap file cannot be written.
     */
    public function GenerateSitemap()
    {
        // Start from a clean slate so the method can be called more than once.
        $this->scanned = [];

        // Call the recursive crawl function with the start url.
        $this->crawlPage($this->config['SITE_URL']);

        // Generate a sitemap with the scanned pages.
        $this->generateFile(array_keys($this->scanned));
    }

    /**
     * Fetch a page and parse it into a DOM document.
     *
     * @param string $url Absolute URL to download.
     *
     * @return DOMDocument Parsed page; an empty document when the request fails or returns no body.
     */
    private function getHtml($url)
    {
        // Get html from the given page
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        $html = curl_exec($curl);
        curl_close($curl);

        $dom = new DOMDocument();

        // curl_exec() returns false on failure, and loadHTML('') throws a ValueError
        // on PHP 8 (which "@" cannot suppress), so bail out with an empty document.
        if (!is_string($html) || trim($html) === '') {
            return $dom;
        }

        // Real-world HTML is rarely valid; collect parser warnings instead of emitting them.
        $previous = libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $dom;
    }

    /**
     * Recursively crawl a page: record it as scanned, then follow every anchor on it
     * that passes the configured filters.
     *
     * @param string $page_url Absolute URL of the page to crawl.
     */
    private function crawlPage($page_url)
    {
        $url = filter_var($page_url, FILTER_SANITIZE_URL);

        // Stop if the url is invalid or the page is already scanned. The sanitized
        // url is used for the check AND for storage so the two cannot disagree.
        if ($url === false || isset($this->scanned[$url]) || !filter_var($url, FILTER_VALIDATE_URL)) {
            return;
        }

        // Add the page url to the scanned set
        $this->scanned[$url] = true;

        // Loop-invariant values, computed once per page instead of once per anchor.
        $base_page_url = explode("?", $url)[0];
        $required_id = (string) $this->config['CRAWL_ANCHORS_WITH_ID'];

        // Loop through all anchor tags on the page
        foreach ($this->getHtml($url)->getElementsByTagName('a') as $a) {
            $next_url = trim($a->getAttribute('href'));

            // Anchors without a href have nothing to follow.
            if ($next_url === '') {
                continue;
            }

            // If an anchor id is configured, only follow anchors carrying exactly that id.
            if ($required_id !== '' && $a->getAttribute('id') !== $required_id) {
                continue;
            }

            // Skip the url if it starts with a # or is equal to root.
            if (!$this->config['ALLOW_ELEMENT_LINKS'] && ($next_url[0] === "#" || $next_url === "/")) {
                continue;
            }

            // parse_url() returns false for seriously malformed urls; treat those as
            // "no components" and let the validation at the top of crawlPage() reject them.
            $parts = parse_url($next_url) ?: [];

            // mailto:, tel:, javascript: and friends are not crawlable pages.
            if (isset($parts['scheme']) && !in_array(strtolower($parts['scheme']), ['http', 'https'], true)) {
                continue;
            }

            // Skip external links unless ALLOW_EXTERNAL_LINKS is enabled (host names are case-insensitive).
            if (
                !$this->config['ALLOW_EXTERNAL_LINKS']
                && isset($parts['host'])
                && strcasecmp($parts['host'], $this->site_host) !== 0
            ) {
                continue;
            }

            // Links without a scheme are relative (or protocol-relative) and must be made absolute.
            if (!isset($parts['scheme'])) {
                $next_url = $this->convertRelativeToAbsolute($base_page_url, $next_url);
            }

            // Call the function again with the new URL unless it matches a keyword to skip.
            if (!$this->isSkipped($next_url)) {
                $this->crawlPage($next_url);
            }
        }
    }

    /**
     * Tell whether a url contains any of the KEYWORDS_TO_SKIP entries.
     *
     * @param string $url Absolute URL to test.
     *
     * @return bool True when the url must not be crawled.
     */
    private function isSkipped($url)
    {
        foreach ($this->config['KEYWORDS_TO_SKIP'] as $skip) {
            // strpos() returns 0 for a match at the very start, so compare against
            // false explicitly. An empty keyword would match everything; ignore it.
            if ($skip !== '' && strpos($url, $skip) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Convert a relative link to an absolute link.
     *
     * Example: Relative /articles
     *          Absolute https://student-laptop.nl/articles
     *
     * @param string $page_base_url URL of the current page without its query string.
     * @param string $link          Relative link as found in the href attribute.
     *
     * @return string Absolute URL.
     */
    private function convertRelativeToAbsolute($page_base_url, $link)
    {
        // Protocol-relative link (//host/path): only the scheme is missing.
        if (strncmp($link, "//", 2) === 0) {
            return $this->site_scheme . ":" . $link;
        }

        $first_character = substr($link, 0, 1);

        // Query-only and fragment-only links refer to the current page.
        if ($first_character === "?" || $first_character === "#") {
            return $page_base_url . $link;
        }

        // Root-relative link.
        if ($first_character === "/") {
            return $this->site_url_base . $link;
        }

        // Anything else is resolved against the site root.
        return $this->site_url_base . "/" . $link;
    }

    /**
     * Write the sitemap for the given pages to SAVE_LOC and print the number of pages.
     *
     * @param array $pages List of absolute page URLs.
     *
     * @throws RuntimeException When the file cannot be written.
     */
    private function generateFile($pages)
    {
        // Print the amount of pages
        echo count($pages);

        // Build the document through the DOM API so that every value is escaped
        // correctly (&, <, > ...) instead of patching "&" in a hand-built string.
        $namespace = 'http://www.sitemaps.org/schemas/sitemap/0.9';

        $dom = new DOMDocument('1.0', 'UTF-8');
        $dom->formatOutput = true;

        $urlset = $dom->appendChild($dom->createElementNS($namespace, 'urlset'));

        foreach ($pages as $page) {
            $entry = $urlset->appendChild($dom->createElementNS($namespace, 'url'));

            $fields = [
                'loc' => $page,
                'lastmod' => $this->config['LAST_UPDATED'],
                'changefreq' => $this->config['CHANGE_FREQUENCY'],
                'priority' => $this->config['PRIORITY'],
            ];

            foreach ($fields as $tag => $value) {
                $node = $entry->appendChild($dom->createElementNS($namespace, $tag));
                // A text node escapes special characters; createElement($tag, $value) would not.
                $node->appendChild($dom->createTextNode((string) $value));
            }
        }

        // Write the XML in one go so an existing sitemap is only replaced once the new one is complete.
        if (file_put_contents($this->config['SAVE_LOC'], $dom->saveXML()) === false) {
            throw new RuntimeException('Could not write sitemap to ' . $this->config['SAVE_LOC']);
        }
    }
}
-------------------------------------------------------------------------------- /sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | https://student-laptop.nl/ 7 | 2021-03-10 8 | daily 9 | 1 10 | 11 | 12 | https://student-laptop.nl/underConstruction 13 | 2021-03-10 14 | daily 15 | 1 16 | 17 | 18 | https://student-laptop.nl/article?article_id=1 19 | 2021-03-10 20 | daily 21 | 1 22 | 23 | 24 | https://student-laptop.nl/laptopInfo?laptop_id=7 25 | 2021-03-10 26 | daily 27 | 1 28 | 29 | 30 | https://student-laptop.nl/laptopInfo?laptop_id=6 31 | 2021-03-10 32 | daily 33 | 1 34 | 35 | 36 | https://student-laptop.nl/laptopInfo?laptop_id=5 37 | 2021-03-10 38 | daily 39 | 1 40 | 41 | 42 | https://student-laptop.nl/laptopInfo?laptop_id=4 43 | 2021-03-10 44 | daily 45 | 1 46 | 47 | 48 | https://student-laptop.nl/laptopInfo?laptop_id=3 49 | 2021-03-10 50 | daily 51 | 1 52 | 53 | 54 | https://student-laptop.nl/laptopInfo?laptop_id=2 55 | 2021-03-10 56 | daily 57 | 1 58 | 59 | 60 | --------------------------------------------------------------------------------