├── src
│   ├── helpers
│   │   ├── stringifyURL.js
│   │   ├── escapeUnsafe.js
│   │   ├── getCurrentDateTime.js
│   │   ├── extendFilename.js
│   │   ├── validChangeFreq.js
│   │   └── __tests__
│   │       ├── getCurrentDateTime.js
│   │       ├── validChangeFreq.js
│   │       ├── stringifyURL.js
│   │       ├── extendFilename.js
│   │       └── escapeUnsafe.js
│   ├── __tests__
│   │   ├── discoverResources.js
│   │   ├── index.js
│   │   ├── createCrawler.js
│   │   ├── createSitemapIndex.js
│   │   ├── SitemapStream.js
│   │   └── SitemapRotator.js
│   ├── createSitemapIndex.js
│   ├── SitemapStream.js
│   ├── discoverResources.js
│   ├── SitemapRotator.js
│   ├── createCrawler.js
│   └── index.js
├── .travis.yml
├── .github
│   ├── ISSUE_TEMPLATE.md
│   └── workflows
│       └── nodejs.yml
├── .editorconfig
├── .gitignore
├── LICENSE
├── package.json
└── README.md

/src/helpers/stringifyURL.js:
--------------------------------------------------------------------------------
1 | module.exports = parsed =>
2 |   `${parsed.protocol}://${parsed.host}${parsed.uriPath}`;
3 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | 
3 | node_js: 10
4 | 
5 | deploy:
6 |   provider: script
7 |   script:
8 |     - 'npx semantic-release'
9 |   skip_cleanup: true
10 | 
--------------------------------------------------------------------------------
/src/helpers/escapeUnsafe.js:
--------------------------------------------------------------------------------
1 | module.exports = unsafe =>
2 |   unsafe
3 |     .replace(/&/g, '&amp;')
4 |     .replace(/</g, '&lt;')
5 |     .replace(/>/g, '&gt;')
6 |     .replace(/"/g, '&quot;')
7 |     .replace(/'/g, '&apos;');
8 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | **Do you want to request a *feature* or report a *bug*?**
2 | 
3 | **What is the current behavior?**
4 | 
5 | **If the current behavior is a bug, please provide the steps to reproduce.**
6 | 
7 | **What is the expected behavior?**
8 | 
9 | 
--------------------------------------------------------------------------------
/src/__tests__/discoverResources.js:
--------------------------------------------------------------------------------
1 | const discoverResources = require('../discoverResources');
2 | 
3 | describe('#discoverResources', () => {
4 |   test('should be a function', () => {
5 |     expect(discoverResources).toBeInstanceOf(Function);
6 |   });
7 | });
8 | 
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # editorconfig.org
2 | 
3 | root = true
4 | 
5 | [*]
6 | indent_style = space
7 | indent_size = 2
8 | end_of_line = lf
9 | charset = utf-8
10 | trim_trailing_whitespace = true
11 | insert_final_newline = true
12 | 
13 | [*.md]
14 | trim_trailing_whitespace = false
--------------------------------------------------------------------------------
/src/helpers/getCurrentDateTime.js:
--------------------------------------------------------------------------------
1 | module.exports = () => {
2 |   const now = new Date();
3 |   const year = now.getFullYear();
4 |   const month =
5 |     now.getMonth() + 1 < 10 ? `0${now.getMonth() + 1}` : now.getMonth() + 1;
6 |   const date = now.getDate() < 10 ? `0${now.getDate()}` : now.getDate();
7 |   return `${year}-${month}-${date}`;
8 | };
9 | 
--------------------------------------------------------------------------------
/src/helpers/extendFilename.js:
--------------------------------------------------------------------------------
1 | /* eslint no-bitwise:0 */
2 | 
3 | module.exports = (fpath, str) => {
4 |   const ext = fpath.slice(((fpath.lastIndexOf('.') - 1) >>> 0) + 2); // ">>> 0" maps lastIndexOf's -1 (no dot) to a huge index, so extensionless paths yield ''
5 | 
6 |   let newFilename;
7 | 
8 |   if (ext) {
9 |     newFilename = fpath.replace(`.${ext}`, `${str}.${ext}`);
10 |   } else {
11 |     newFilename = `${fpath}${str}`;
12 |   }
13 | 
14 |   return newFilename;
15 | };
16 | 
--------------------------------------------------------------------------------
/src/helpers/validChangeFreq.js:
--------------------------------------------------------------------------------
1 | module.exports = desiredChangeFreq => {
2 |   const acceptedChangeFreqs = [
3 |     'always',
4 |     'hourly',
5 |     'daily',
6 |     'weekly',
7 |     'monthly',
8 |     'yearly',
9 |     'never',
10 |   ];
11 |   if (acceptedChangeFreqs.indexOf(desiredChangeFreq) === -1) {
12 |     // eslint-disable-next-line
13 |     console.warn('Desired change frequency is not a valid type. Ignoring.');
14 |     return '';
15 |   }
16 |   return desiredChangeFreq;
17 | };
18 | 
--------------------------------------------------------------------------------
/src/helpers/__tests__/getCurrentDateTime.js:
--------------------------------------------------------------------------------
1 | const getCurrentDateTime = require('../getCurrentDateTime');
2 | 
3 | describe('#getCurrentDateTime', () => {
4 |   test('should be a function', () => {
5 |     expect(getCurrentDateTime).toBeInstanceOf(Function);
6 |   });
7 | 
8 |   test('should return a string', () => {
9 |     expect(typeof getCurrentDateTime()).toBe('string');
10 |   });
11 | 
12 |   test('should match standard date string', () => {
13 |     expect(getCurrentDateTime()).toMatch(/\d{4}-\d{2}-\d{2}/);
14 |   });
15 | });
16 | 
--------------------------------------------------------------------------------
/src/helpers/__tests__/validChangeFreq.js:
--------------------------------------------------------------------------------
1 | const validChangeFreq = require('../validChangeFreq');
2 | 
3 | describe('#validChangeFreq', () => {
4 |   test('should be a function', () => {
5 |     expect(validChangeFreq).toBeInstanceOf(Function);
6 |   });
7 | 
8 |   test('should return string when valid', () => {
9 |     expect(typeof validChangeFreq('daily')).toBe('string');
10 |   });
11 | 
12 |   test('should return empty string when invalid', () => {
13 |     const changeFreq = validChangeFreq('invalid');
14 |     expect(typeof changeFreq).toBe('string');
15 |     expect(changeFreq).toBe('');
16 |   });
17 | });
18 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | 
5 | # Runtime data
6 | pids
7 | *.pid
8 | *.seed
9 | 
10 | # Directory for instrumented libs generated by jscoverage/JSCover
11 | lib-cov
12 | 
13 | # Coverage directory used by tools like istanbul
14 | coverage
15 | 
16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
17 | .grunt
18 | 
19 | # node-waf configuration
20 | .lock-wscript
21 | 
22 | # Compiled binary addons (http://nodejs.org/api/addons.html)
23 | build/Release
24 | 
25 | # Dependency directory
26 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git
27 | node_modules
28 | 
--------------------------------------------------------------------------------
/src/__tests__/index.js:
--------------------------------------------------------------------------------
1 | const SitemapGenerator = require('../');
2 | 
3 | describe('#SitemapGenerator', () => {
4 |   let gen;
5 | 
6 |   beforeEach(() => {
7 |     gen = SitemapGenerator('http://foo.bar');
8 |   });
9 | 
10 |   test('should be a function', () => {
11 |     expect(SitemapGenerator).toBeInstanceOf(Function);
12 |   });
13 | 
14 |   test('should have method start', () => {
15 |     expect(gen).toHaveProperty('start');
16 |   });
17 | 
18 |   test('should have method stop', () => {
19 |     expect(gen).toHaveProperty('stop');
20 |   });
21 | 
22 |   test('should have method queueURL', () => {
23 |     expect(gen).toHaveProperty('queueURL');
24 |   });
25 | });
26 | 
--------------------------------------------------------------------------------
/src/helpers/__tests__/stringifyURL.js:
--------------------------------------------------------------------------------
1 | const stringifyURL = require('../stringifyURL');
2 | 
3 | describe('#stringifyURL', () => {
4 |   const url = {
5 |     protocol: 'http',
6 |     host: 'example.com',
7 |     uriPath: '/test',
8 |   };
9 | 
10 |   test('should be a function', () => {
11 |     expect(stringifyURL).toBeInstanceOf(Function);
12 |   });
13 | 
14 |   test('should return a string', () => {
15 |     const str = stringifyURL(url);
16 | 
17 |     expect(typeof str).toBe('string');
18 |   });
19 | 
20 |   test('should create valid URL string', () => {
21 |     const str = stringifyURL(url);
22 | 
23 |     expect(str).toBe('http://example.com/test');
24 |   });
25 | });
26 | 
--------------------------------------------------------------------------------
/src/createSitemapIndex.js:
--------------------------------------------------------------------------------
1 | const extendFilename = require('./helpers/extendFilename');
2 | 
3 | module.exports = (url, filename, sitemapCount) => {
4 |   let sitemapIndex =
5 |     '<?xml version="1.0" encoding="UTF-8"?>\n<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
6 | 
7 |   for (let i = 1; i <= sitemapCount; i += 1) {
8 |     // generate sitemap part url
9 |     const newFilename = extendFilename(filename, `_part${i}`);
10 | 
11 |     const sitemapUrl = `${url.replace(/\/$/, '')}/${newFilename}`;
12 |     sitemapIndex += `\n  <sitemap>\n    <loc>${sitemapUrl}</loc>\n  </sitemap>`;
13 |   }
14 |   sitemapIndex += '\n</sitemapindex>';
15 | 
16 |   return sitemapIndex;
17 | };
18 | 
--------------------------------------------------------------------------------
/src/__tests__/createCrawler.js:
--------------------------------------------------------------------------------
1 | const createCrawler = require('../createCrawler');
2 | const Crawler = require('simplecrawler');
3 | const parse = require('url-parse');
4 | 
5 | describe('#createCrawler', () => {
6 |   test('should export a function', () => {
7 |     expect(createCrawler).toBeInstanceOf(Function);
8 |   });
9 | 
10 |   test('should return crawler instance', () => {
11 |     const crawler = createCrawler(parse('http://example.com'));
12 |     expect(crawler).toBeInstanceOf(Crawler);
13 |   });
14 | 
15 |   test('should apply options to crawler', () => {
16 |     const options = {
17 |       maxDepth: 2,
18 |     };
19 |     const crawler = createCrawler(parse('http://example.com'), options);
20 |     expect(crawler).toHaveProperty('maxDepth', 2);
21 |   });
22 | });
23 | 
--------------------------------------------------------------------------------
/src/helpers/__tests__/extendFilename.js:
--------------------------------------------------------------------------------
1 | const extendFilename = require('../extendFilename');
2 | 
3 | describe('#extendFilename', () => {
4 |   test('should be a function', () => {
5 |     expect(extendFilename).toBeInstanceOf(Function);
6 |   });
7 | 
8 |   test('should return a string', () => {
9 |     const newFilename = extendFilename('sitemap.xml', '_part1');
10 | 
11 |     expect(typeof newFilename).toBe('string');
12 |   });
13 | 
14 |   test('should extend filename with string', () => {
15 |     const newFilename = extendFilename('sitemap.xml', '_part1');
16 | 
17 |     expect(newFilename).toBe('sitemap_part1.xml');
18 |   });
19 | 
20 |   test('should extend filenames without extension', () => {
21 |     const newFilename = extendFilename('sitemap', '_part1');
22 | 
23 |     expect(newFilename).toBe('sitemap_part1');
24 |   });
25 | });
26 | 
--------------------------------------------------------------------------------
/src/__tests__/createSitemapIndex.js:
--------------------------------------------------------------------------------
1 | const createSitemapIndex = require('../createSitemapIndex');
2 | 
3 | describe('#createSitemapIndex', () => {
4 |   const url = 'http://example.com';
5 |   const filename = 'sitemap.xml';
6 |   const count = 2;
7 | 
8 |   test('should be a function', () => {
9 |     expect(createSitemapIndex).toBeInstanceOf(Function);
10 |   });
11 | 
12 |   test('should return string', () => {
13 |     const sitemapIndex = createSitemapIndex(url, filename, count);
14 |     expect(typeof sitemapIndex).toBe('string');
15 |   });
16 | 
17 |   test('should contain sitemap part url', () => {
18 |     const sitemapIndex = createSitemapIndex(url, filename, count);
19 |     const regex = new RegExp(
20 |       `${url.replace(/\/$/, '')}/sitemap_part${count}.xml`
21 |     );
22 |     expect(sitemapIndex).toMatch(regex);
23 |   });
24 | });
25 | 
--------------------------------------------------------------------------------
/.github/workflows/nodejs.yml:
--------------------------------------------------------------------------------
1 | # This workflow will do a clean install of node dependencies, build the source code and run tests across different versions of node
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions
3 | 
4 | name: Node.js CI
5 | 
6 | on:
7 |   push:
8 |     branches: [ master ]
9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 | 
17 |     strategy:
18 |       matrix:
19 |         node-version: [10.x, 12.x]
20 | 
21 |     steps:
22 |     - uses: actions/checkout@v2
23 |     - name: Use Node.js ${{ matrix.node-version }}
24 |       uses: actions/setup-node@v1
25 |       with:
26 |         node-version: ${{ matrix.node-version }}
27 |     - run: npm ci
28 |     - run: npm run build --if-present
29 |     - run: npm test
30 |       env:
31 |         CI: true
32 | 
--------------------------------------------------------------------------------
/src/__tests__/SitemapStream.js:
--------------------------------------------------------------------------------
1 | const SitemapStream = require('../SitemapStream');
2 | 
3 | describe('#SitemapStream', () => {
4 |   const stream = SitemapStream();
5 | 
6 |   test('should be a function', () => {
7 |     expect(SitemapStream).toBeInstanceOf(Function);
8 |   });
9 | 
10 |   describe('#getPath', () => {
11 |     test('should have getPath method', () => {
12 |       expect(stream).toHaveProperty('getPath');
13 |     });
14 | 
15 |     test('should return path string', () => {
16 |       const path = stream.getPath();
17 |       expect(typeof path).toBe('string');
18 |     });
19 |   });
20 | 
21 |   describe('#write', () => {
22 |     test('should have write method', () => {
23 |       expect(stream).toHaveProperty('write');
24 |     });
25 |   });
26 | 
27 |   describe('#end', () => {
28 |     test('should have end method', () => {
29 |       expect(stream).toHaveProperty('end');
30 |     });
31 |   });
32 | });
33 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2015 Lars Graubner
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 
--------------------------------------------------------------------------------
/src/SitemapStream.js:
--------------------------------------------------------------------------------
1 | const path = require('path');
2 | const rand = require('crypto-random-string');
3 | const os = require('os');
4 | const fs = require('fs');
5 | const escapeUnsafe = require('./helpers/escapeUnsafe');
6 | 
7 | module.exports = function SitemapStream() {
8 |   const tmpPath = path.join(os.tmpdir(), `sitemap_${rand(10)}`);
9 |   const stream = fs.createWriteStream(tmpPath);
10 | 
11 |   stream.write('<?xml version="1.0" encoding="UTF-8"?>');
12 |   stream.write(
13 |     '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
14 |   );
15 | 
16 |   const getPath = () => tmpPath;
17 | 
18 |   const write = (url, currentDateTime, changeFreq, priority) => {
19 |     const escapedUrl = escapeUnsafe(url);
20 |     stream.write('\n  <url>\n');
21 |     stream.write(`    <loc>${escapedUrl}</loc>\n`);
22 |     if (currentDateTime) {
23 |       stream.write(`    <lastmod>${currentDateTime}</lastmod>\n`);
24 |     }
25 |     if (changeFreq) {
26 |       stream.write(`    <changefreq>${changeFreq}</changefreq>\n`);
27 |     }
28 |     if (priority) {
29 |       stream.write(`    <priority>${priority}</priority>\n`);
30 |     }
31 |     stream.write('  </url>');
32 |   };
33 | 
34 |   const end = () => {
35 |     stream.write('\n</urlset>');
36 |     stream.end();
37 |   };
38 | 
39 |   return {
40 |     getPath,
41 |     write,
42 |     end,
43 |   };
44 | };
45 | 
--------------------------------------------------------------------------------
/src/__tests__/SitemapRotator.js:
--------------------------------------------------------------------------------
1 | const SitemapRotator = require('../SitemapRotator');
2 | 
3 | describe('#SitemapRotator', () => {
4 |   const rotator = SitemapRotator(2);
5 |   rotator.addURL('http://atest.com');
6 | 
7 |   afterAll(() => {
8 |     rotator.finish();
9 |   });
10 | 
11 |   test('should be a function', () => {
12 |     expect(SitemapRotator).toBeInstanceOf(Function);
13 |   });
14 | 
15 |   describe('#addURL', () => {
16 |     test('should have addURL method', () => {
17 |       expect(rotator).toHaveProperty('addURL');
18 |     });
19 |   });
20 | 
21 |   describe('#getPaths', () => {
22 |     test('should have getPaths method', () => {
23 |       expect(rotator).toHaveProperty('getPaths');
24 |     });
25 | 
26 |     test('should return array of paths', () => {
27 |       const paths = rotator.getPaths();
28 |       const expected = [expect.stringMatching(/.+/)];
29 |       expect(paths).toEqual(expect.arrayContaining(expected));
30 |     });
31 | 
32 |     test('should rotate sitemaps when max entries is reached', () => {
33 |       rotator.addURL('http://atest.com/a');
34 |       rotator.addURL('http://atest.com/b');
35 | 
36 |       expect(rotator.getPaths()).toHaveLength(2);
37 |     });
38 |   });
39 | 
40 |   describe('#finish', () => {
41 |     test('should have finish method', () => {
42 |       expect(rotator).toHaveProperty('finish');
43 |     });
44 |   });
45 | });
46 | 
--------------------------------------------------------------------------------
/src/helpers/__tests__/escapeUnsafe.js:
--------------------------------------------------------------------------------
1 | const escapeUnsafe = require('../escapeUnsafe');
2 | 
3 | describe('#escapeUnsafe', () => {
4 |   test('should be a function', () => {
5 |     expect(escapeUnsafe).toBeInstanceOf(Function);
6 |   });
7 | 
8 |   test('should escape < characters', () => {
9 |     const url = 'http://test.com/<>&\'"<>&\'"';
10 |     const escapedUrl = escapeUnsafe(url);
11 | 
12 |     expect(url).toMatch(/</);
13 |     expect(escapedUrl).not.toMatch(/</);
14 |   });
15 | 
16 |   test('should escape > characters', () => {
17 |     const url = 'http://test.com/<>&\'"<>&\'"';
18 |     const escapedUrl = escapeUnsafe(url);
19 | 
20 |     expect(url).toMatch(/>/);
21 |     expect(escapedUrl).not.toMatch(/>/);
22 |   });
23 | 
24 |   test('should escape & characters', () => {
25 |     const url = 'http://test.com/<>&\'"<>&\'"';
26 |     const escapedUrl = escapeUnsafe(url);
27 | 
28 |     expect(url).toMatch(/&/);
29 |     // Regex with negative lookahead, matches non escaping &'s
30 |     expect(escapedUrl).not.toMatch(/&(?!(?:apos|quot|[gl]t|amp);|#)/);
31 |   });
32 | 
33 |   test("should escape ' characters", () => {
34 |     const url = 'http://test.com/<>&\'"<>&\'"';
35 |     const escapedUrl = escapeUnsafe(url);
36 | 
37 |     expect(url).toMatch(/'/);
38 |     expect(escapedUrl).not.toMatch(/'/);
39 |   });
40 | 
41 |   test('should escape " characters', () => {
42 |     const url = 'http://test.com/<>&\'"<>&\'"';
43 |     const escapedUrl = escapeUnsafe(url);
44 | 
45 |     expect(url).toMatch(/"/);
46 |     expect(escapedUrl).not.toMatch(/"/);
47 |   });
48 | });
49 | 
--------------------------------------------------------------------------------
/src/discoverResources.js:
--------------------------------------------------------------------------------
1 | const url = require('url');
2 | const cheerio = require('cheerio');
3 | 
4 | module.exports = (buffer, queueItem) => {
5 |   const $ = cheerio.load(buffer.toString('utf8'));
6 | 
7 |   const metaRobots = $('meta[name="robots"]');
8 | 
9 |   if (metaRobots.length && /nofollow/i.test(metaRobots.attr('content'))) {
10 |     return [];
11 |   }
12 | 
13 |   const links = $('a[href]').map(function iteratee() {
14 |     let href = $(this).attr('href');
15 | 
16 |     // exclude "mailto:" etc
17 |     if (/^[a-z]+:(?!\/\/)/i.test(href)) {
18 |       return null;
19 |     }
20 | 
21 |     // exclude rel="nofollow" links
22 |     const rel = $(this).attr('rel');
23 |     if (/nofollow/i.test(rel)) {
24 |       return null;
25 |     }
26 | 
27 |     // remove anchors
28 |     href = href.replace(/(#.*)$/, '');
29 | 
30 |     // remove basic authentication
31 |     href = href.replace(/^\/?([^/]*@)/, '');
32 | 
33 |     // handle "//"
34 |     if (/^\/\//.test(href)) {
35 |       return `${queueItem.protocol}:${href}`;
36 |     }
37 | 
38 |     // check if link is relative
39 |     // (does not start with "http(s)" or "//")
40 |     if (!/^https?:\/\//.test(href)) {
41 |       const base = $('base').first();
42 |       if (base.length) {
43 |         // base tag is set, prepend it
44 |         if (base.attr('href') !== undefined) {
45 |           // base tags sometimes don't define href; they may only set target="_top" or target="_blank"
46 |           href = url.resolve(base.attr('href'), href);
47 |         }
48 |       }
49 | 
50 |       // handle links such as "./foo", "../foo", "/foo"
51 |       if (/^\.\.?\/.*/.test(href) || /^\/[^/].*/.test(href)) {
52 |         href = url.resolve(queueItem.url, href);
53 |       }
54 |     }
55 | 
56 |     return href;
57 |   });
58 | 
59 |   return links.get();
60 | };
61 | 
--------------------------------------------------------------------------------
/src/SitemapRotator.js:
--------------------------------------------------------------------------------
1 | const SitemapStream = require('./SitemapStream');
2 | const getCurrentDateTime = require('./helpers/getCurrentDateTime');
3 | 
4 | module.exports = function SitemapRotator(
5 |   maxEntries,
6 |   lastModEnabled,
7 |   changeFreq,
8 |   priorityMap
9 | ) {
10 |   const sitemaps = [];
11 |   let count = 0;
12 |   let current = null;
13 | 
14 |   // return temp sitemap paths
15 |   const getPaths = () =>
16 |     sitemaps.reduce((arr, map) => {
17 |       arr.push(map.getPath());
18 |       return arr;
19 |     }, []);
20 | 
21 |   // adds url to stream
22 |   const addURL = (url, depth, lastMod = getCurrentDateTime()) => {
23 |     const currentDateTime = lastModEnabled ? lastMod : null;
24 | 
25 |     // exclude existing sitemap.xml
26 |     if (/sitemap\.xml$/.test(url)) {
27 |       return;
28 |     }
29 | 
30 |     // create stream if none exists
31 |     if (current === null) {
32 |       current = SitemapStream();
33 |       sitemaps.push(current);
34 |     }
35 | 
36 |     // rotate stream
37 |     if (count === maxEntries) {
38 |       current.end();
39 |       current = SitemapStream();
40 |       sitemaps.push(current);
41 |       count = 0;
42 |     }
43 | 
44 |     let priority = '';
45 | 
46 |     // if priorityMap exists, set priority based on depth
47 |     // if depth is greater than map length, use the last value in the priorityMap
48 |     if (priorityMap && priorityMap.length > 0) {
49 |       priority = priorityMap[depth - 1]
50 |         ? priorityMap[depth - 1]
51 |         : priorityMap[priorityMap.length - 1];
52 |     }
53 | 
54 |     current.write(url, currentDateTime, changeFreq, priority);
55 | 
56 |     count += 1;
57 |   };
58 | 
59 |   // close stream
60 |   const finish = () => {
61 |     if (current) {
62 |       current.end();
63 |     }
64 |   };
65 | 
66 |   return {
67 |     getPaths,
68 |     addURL,
69 |     finish
70 |   };
71 | };
72 | 
--------------------------------------------------------------------------------
/src/createCrawler.js:
--------------------------------------------------------------------------------
1 | const Crawler = require('simplecrawler');
2 | const has = require('lodash/has');
3 | 
4 | const discoverResources = require('./discoverResources');
5 | const stringifyURL = require('./helpers/stringifyURL');
6 | 
7 | module.exports = (uri, options = {}) => {
8 |   // excluded filetypes
9 |   const exclude = [
10 |     'gif',
11 |     'jpg',
12 |     'jpeg',
13 |     'png',
14 |     'ico',
15 |     'bmp',
16 |     'ogg',
17 |     'webp',
18 |     'mp4',
19 |     'webm',
20 |     'mp3',
21 |     'ttf',
22 |     'woff',
23 |     'json',
24 |     'rss',
25 |     'atom',
26 |     'gz',
27 |     'zip',
28 |     'rar',
29 |     '7z',
30 |     'css',
31 |     'js',
32 |     'gzip',
33 |     'exe',
34 |     'svg'
35 |   ].join('|');
36 | 
37 |   const extRegex = new RegExp(`\\.(${exclude})$`, 'i');
38 | 
39 |   const crawler = new Crawler(uri.href);
40 | 
41 |   Object.keys(options).forEach(o => {
42 |     if (has(crawler, o)) {
43 |       crawler[o] = options[o];
44 |     } else if (o === 'crawlerMaxDepth') {
45 |       // eslint-disable-next-line
46 |       console.warn(
47 |         'Option "crawlerMaxDepth" is deprecated. Please use "maxDepth".'
48 |       );
49 |       if (!options.maxDepth) {
50 |         crawler.maxDepth = options.crawlerMaxDepth;
51 |       }
52 |     }
53 |   });
54 | 
55 |   // use custom discoverResources function
56 |   crawler.discoverResources = discoverResources;
57 | 
58 |   // set crawler options
59 |   // see https://github.com/cgiffard/node-simplecrawler#configuration
60 |   crawler.initialPath = uri.pathname !== '' ? uri.pathname : '/';
61 |   crawler.initialProtocol = uri.protocol.replace(':', '');
62 | 
63 |   // restrict to subpages if path is provided
64 |   crawler.addFetchCondition(parsedUrl => {
65 |     const initialURLRegex = new RegExp(`${uri.pathname}.*`);
66 |     return stringifyURL(parsedUrl).match(initialURLRegex);
67 |   });
68 | 
69 |   // file type exclusion
70 |   crawler.addFetchCondition(parsedUrl => !parsedUrl.path.match(extRegex));
71 | 
72 |   return crawler;
73 | };
74 | 
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "sitemap-generator",
3 |   "version": "0.0.0-semantically-released",
4 |   "description": "Easily create XML sitemaps for your website.",
5 |   "homepage": "https://github.com/lgraubner/sitemap-generator",
6 |   "author": "Lars Graubner (https://larsgraubner.com)",
7 |   "keywords": [
8 |     "sitemap",
9 |     "xml",
10 |     "sitemap.xml",
11 |     "generator",
12 |     "crawler",
13 |     "seo",
14 |     "google",
15 |     "ecosystem:node"
16 |   ],
17 |   "main": "src/index.js",
18 |   "repository": {
19 |     "type": "git",
20 |     "url": "https://github.com/lgraubner/sitemap-generator.git"
21 |   },
22 |   "bugs": {
23 |     "url": "https://github.com/lgraubner/sitemap-generator/issues"
24 |   },
25 |   "dependencies": {
26 |     "async": "2.6.1",
27 |     "cheerio": "1.0.0-rc.2",
28 |     "cp-file": "6.0.0",
29 |     "crypto-random-string": "1.0.0",
30 |     "date-fns": "1.29.0",
31 |     "lodash": "4.17.20",
32 |     "mitt": "1.1.3",
33 |     "normalize-url": "3.3.0",
34 |     "simplecrawler": "1.1.9",
35 |     "url-parse": "1.4.7"
36 |   },
37 |   "engines": {
38 |     "node": ">=10"
39 |   },
40 |   "license": "MIT",
41 |   "files": [
42 |     "src",
43 |     "!**/__tests__"
44 |   ],
45 |   "devDependencies": {
46 |     "eslint": "5.8.0",
47 |     "husky": "1.1.2",
48 |     "jest": "24.8.0",
49 |     "lint-staged": "7.3.0",
50 |     "prettier": "1.14.3"
51 |   },
52 |   "scripts": {
53 |     "lint": "eslint src",
54 |     "test": "jest",
55 |     "test:watch": "npm test -- --watch",
56 |     "flow": "flow"
57 |   },
58 |   "lint-staged": {
59 |     "*.js": [
60 |       "eslint --fix",
61 |       "prettier --write",
62 |       "git add"
63 |     ]
64 |   },
65 |   "prettier": {
66 |     "singleQuote": true
67 |   },
68 |   "eslintConfig": {
69 |     "parserOptions": {
70 |       "ecmaVersion": 6
71 |     },
72 |     "extends": "eslint:recommended",
73 |     "env": {
74 |       "node": true,
75 |       "jest": true
76 |     }
77 |   },
78 |   "husky": {
79 |     "hooks": {
80 |       "pre-commit": "lint-staged"
81 |     }
82 |   },
83 |   "release": {
84 |     "tagFormat": "${version}"
85 |   }
86 | }
87 | 
--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const http = require('http');
3 | const path = require('path');
4 | const parseURL = require('url-parse');
5 | const eachSeries = require('async/eachSeries');
6 | const cpFile = require('cp-file');
7 | const normalizeUrl = require('normalize-url');
8 | const mitt = require('mitt');
9 | const format = require('date-fns/format');
10 | 
11 | const createCrawler = require('./createCrawler');
12 | const SitemapRotator = require('./SitemapRotator');
13 | const createSitemapIndex = require('./createSitemapIndex');
14 | const extendFilename = require('./helpers/extendFilename');
15 | const validChangeFreq = require('./helpers/validChangeFreq');
16 | 
17 | module.exports = function SitemapGenerator(uri, opts) {
18 |   const defaultOpts = {
19 |     stripQuerystring: true,
20 |     maxEntriesPerFile: 50000,
21 |     maxDepth: 0,
22 |     filepath: path.join(process.cwd(), 'sitemap.xml'),
23 |     userAgent: 'Node/SitemapGenerator',
24 |     respectRobotsTxt: true,
25 |     ignoreInvalidSSL: true,
26 |     timeout: 30000,
27 |     decodeResponses: true,
28 |     lastMod: false,
29 |     changeFreq: '',
30 |     priorityMap: [],
31 |     ignoreAMP: true,
32 |     ignore: null
33 |   };
34 | 
35 |   if (!uri) {
36 |     throw new Error('Requires a valid URL.');
37 |   }
38 | 
39 |   const options = Object.assign({}, defaultOpts, opts);
40 | 
41 |   // if changeFreq option was passed, check to see if the value is valid
42 |   if (opts && opts.changeFreq) {
43 |     options.changeFreq = validChangeFreq(opts.changeFreq);
44 |   }
45 | 
46 |   const emitter = mitt();
47 | 
48 |   const parsedUrl = parseURL(
49 |     normalizeUrl(uri, {
50 |       stripWWW: false,
51 |       removeTrailingSlash: false
52 |     })
53 |   );
54 | 
55 |   // only resolve if sitemap path is truthy (a string preferably)
56 |   const sitemapPath = options.filepath && path.resolve(options.filepath);
57 | 
58 |   // we don't care about invalid certs
59 |   process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
60 | 
61 |   const crawler = createCrawler(parsedUrl, options);
62 | 
63 |   // create sitemap stream
64 |   const sitemap = SitemapRotator(
65 |     options.maxEntriesPerFile,
66 |     options.lastMod,
67 |     options.changeFreq,
68 |     options.priorityMap
69 |   );
70 | 
71 |   const emitError = (code, url) => {
72 |     emitter.emit('error', {
73 |       code,
74 |       message: http.STATUS_CODES[code],
75 |       url
76 |     });
77 |   };
78 | 
79 |   crawler.on('fetch404', ({ url }) => emitError(404, url));
80 |   crawler.on('fetchtimeout', ({ url }) => emitError(408, url));
81 |   crawler.on('fetch410', ({ url }) => emitError(410, url));
82 |   crawler.on('fetcherror', (queueItem, response) =>
83 |     emitError(response.statusCode, queueItem.url)
84 |   );
85 | 
86 |   crawler.on('fetchclienterror', (queueError, errorData) => {
87 |     if (errorData.code === 'ENOTFOUND') {
88 |       emitError(404, `Site ${JSON.stringify(queueError)} could not be found. REQUEST: ${JSON.stringify(errorData)}`);
89 |     } else {
90 |       emitError(400, errorData.message);
91 |     }
92 |   });
93 | 
94 |   crawler.on('fetchdisallowed', ({ url }) => emitter.emit('ignore', url));
95 | 
96 |   // fetch complete event
97 |   crawler.on('fetchcomplete', (queueItem, page) => {
98 |     const { url, depth } = queueItem;
99 | 
100 |     if (
101 |       (opts.ignore && opts.ignore(url)) ||
102 |       /(<meta(?=[^>]+noindex).*?>)/.test(page) || // check if robots noindex is present
103 |       (options.ignoreAMP && /<html[^>]+(amp|⚡)[^>]*>/.test(page)) // check if it's an amp page
104 |     ) {
105 |       emitter.emit('ignore', url);
106 |     } else {
107 |       emitter.emit('add', url);
108 | 
109 |       if (sitemapPath !== null) {
110 |         // eslint-disable-next-line
111 |         const lastMod = queueItem.stateData.headers['last-modified'];
112 |         sitemap.addURL(url, depth, lastMod && format(lastMod, 'YYYY-MM-DD'));
113 |       }
114 |     }
115 |   });
116 | 
117 |   crawler.on('complete', () => {
118 |     sitemap.finish();
119 | 
120 |     const sitemaps = sitemap.getPaths();
121 | 
122 |     const cb = () => emitter.emit('done');
123 | 
124 |     if (sitemapPath !== null) {
125 |       // move files
126 |       if (sitemaps.length > 1) {
127 |         // multiple sitemaps
128 |         let count = 1;
129 |         eachSeries(
130 |           sitemaps,
131 |           (tmpPath, done) => {
132 |             const newPath = extendFilename(sitemapPath, `_part${count}`);
133 | 
134 |             // copy and remove tmp file
135 |             cpFile(tmpPath, newPath).then(() => {
136 |               fs.unlink(tmpPath, () => {
137 |                 done();
138 |               });
139 |             });
140 | 
141 |             count += 1;
142 |           },
143 |           () => {
144 |             const filename = path.basename(sitemapPath);
145 |             fs.writeFile(
146 |               sitemapPath,
147 |               createSitemapIndex(
148 |                 parsedUrl.toString(),
149 |                 filename,
150 |                 sitemaps.length
151 |               ),
152 |               cb
153 |             );
154 |           }
155 |         );
156 |       } else if (sitemaps.length) {
157 |         cpFile(sitemaps[0], sitemapPath).then(() => {
158 |           fs.unlink(sitemaps[0], cb);
159 |         });
160 |       } else {
161 |         cb();
162 |       }
163 |     } else {
164 |       cb();
165 |     }
166 |   });
167 | 
168 |   return {
169 |     start: () => crawler.start(),
170 |     stop: () => crawler.stop(),
171 |     getCrawler: () => crawler,
172 |     getSitemap: () => sitemap,
173 |     queueURL: url => {
174 |       crawler.queueURL(url, undefined, false);
175 |     },
176 |     on: emitter.on,
177 |     off: emitter.off
178 |   };
179 | };
180 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sitemap Generator
2 | 
3 | [![Travis](https://img.shields.io/travis/lgraubner/sitemap-generator.svg)](https://travis-ci.org/lgraubner/sitemap-generator) [![David](https://img.shields.io/david/lgraubner/sitemap-generator.svg)](https://david-dm.org/lgraubner/sitemap-generator) [![npm](https://img.shields.io/npm/v/sitemap-generator.svg)](https://www.npmjs.com/package/sitemap-generator)
4 | 
5 | > Easily create XML sitemaps for your website.
6 | 
7 | Generates a sitemap by crawling your site. Uses streams to efficiently write the sitemap to your drive and runs asynchronously to avoid blocking the thread. Is capable of creating multiple sitemaps if a threshold is reached. Respects robots.txt and meta tags.
8 | 
9 | This package is not meant to be used in a production code base directly, but rather on the deployed product. This means you develop your app/website as usual, deploy it and create the sitemap with this tool _afterwards_. The simplest way is to use the [CLI](https://github.com/lgraubner/sitemap-generator-cli) (this is a different package!) to create the sitemap on the command line. If you have a more advanced use case or want to adjust the crawler behavior you should use the programmatic version (this package). Create the crawler as needed and simply run it via `node mycrawler.js`.
10 | 
11 | ## Table of contents
12 | 
13 | - [Install](#install)
14 | - [Usage](#usage)
15 | - [API](#api)
16 | - [Options](#options)
17 | - [Events](#events)
18 | - [FAQ](#faq)
19 | - [License](#license)
20 | 
21 | ## Install
22 | 
23 | This module is available on [npm](https://www.npmjs.com/).
24 | 
25 | ```
26 | $ npm install -S sitemap-generator
27 | ```
28 | 
29 | This module runs only with Node.js and is not meant to be used in the browser.
30 | 
31 | ## Usage
32 | 
33 | ```JavaScript
34 | const SitemapGenerator = require('sitemap-generator');
35 | 
36 | // create generator
37 | const generator = SitemapGenerator('http://example.com', {
38 |   stripQuerystring: false
39 | });
40 | 
41 | // register event listeners
42 | generator.on('done', () => {
43 |   // sitemaps created
44 | });
45 | 
46 | // start the crawler
47 | generator.start();
48 | ```
49 | 
50 | The crawler will fetch all folder URL pages and file types [parsed by Google](https://support.google.com/webmasters/answer/35287?hl=en). If present, the `robots.txt` will be taken into account and possible rules are applied for each URL to consider if it should be added to the sitemap. Also, the crawler will not fetch URLs from a page if the robots meta tag with the value `nofollow` is present, and will ignore them completely if the `noindex` rule is present. The crawler is able to apply the `base` value to found links.
51 | 
52 | ## API
53 | 
54 | The generator offers straightforward methods to start and stop it. You can also add URLs manually.
55 | 
56 | ### start()
57 | 
58 | Starts the crawler asynchronously and writes the sitemap to disk.
59 | 
60 | ### stop()
61 | 
62 | Stops the running crawler and halts the sitemap generation.
63 | 
64 | ### getCrawler()
65 | 
66 | Returns the crawler instance. For more information about the crawler check the [simplecrawler docs](https://github.com/simplecrawler/simplecrawler#readme).
67 | 
68 | This can be useful to ignore certain sites and not add them to the sitemap.
69 | 
70 | ```JavaScript
71 | const crawler = generator.getCrawler();
72 | crawler.addFetchCondition((queueItem, referrerQueueItem, callback) => {
73 |   callback(null, !queueItem.path.match(/myregex/));
74 | });
75 | ```
76 | 
77 | ### getSitemap()
78 | 
79 | Returns the sitemap instance (`SitemapRotator`).
80 | 
81 | This can be useful to add static URLs to the sitemap:
82 | 
83 | ```JavaScript
84 | const crawler = generator.getCrawler()
85 | const sitemap = generator.getSitemap()
86 | 
87 | // Add static URL on crawl init.
88 | crawler.on('crawlstart', () => {
89 |   sitemap.addURL('/my/static/url')
90 | })
91 | ```
92 | 
93 | ### queueURL(url)
94 | 
95 | Add a URL to the crawler's queue. Useful to help the crawler fetch pages it can't find itself, as shown below.
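For example, a page that is not linked from anywhere can be queued by hand before the crawl starts (a minimal sketch; the URL is hypothetical):

```JavaScript
// make sure the orphan page ends up in the sitemap
generator.queueURL('http://example.com/orphan/page.html');
generator.start();
```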
96 | 
97 | ## Options
98 | 
99 | There are a couple of options to adjust the sitemap output. In addition to the options below, the options of the underlying crawler can be changed. For a complete list please check its [official documentation](https://github.com/simplecrawler/simplecrawler#configuration).
100 | 
101 | ```JavaScript
102 | var generator = SitemapGenerator('http://example.com', {
103 |   maxDepth: 0,
104 |   filepath: './sitemap.xml',
105 |   maxEntriesPerFile: 50000,
106 |   stripQuerystring: true
107 | });
108 | ```
109 | 
110 | ### changeFreq
111 | 
112 | Type: `string`
113 | Default: `undefined`
114 | 
115 | If defined, adds a `<changefreq>` line to each URL in the sitemap. Possible values are `always`, `hourly`, `daily`, `weekly`, `monthly`, `yearly`, `never`. All other values are ignored.
116 | 
117 | ### filepath
118 | 
119 | Type: `string`
120 | Default: `./sitemap.xml`
121 | 
122 | Filepath for the new sitemap. If multiple sitemaps are created, "part\_$index" is appended to each filename. If you don't want to write a file at all you can pass `null` as filepath.
123 | 
124 | ### httpAgent
125 | 
126 | Type: `HTTPAgent`
127 | Default: `http.globalAgent`
128 | 
129 | Controls what HTTP agent to use. This is useful if you want to configure an HTTP connection through a HTTP/HTTPS proxy (see [http-proxy-agent](https://www.npmjs.com/package/http-proxy-agent)).
130 | 
131 | ### httpsAgent
132 | 
133 | Type: `HTTPAgent`
134 | Default: `https.globalAgent`
135 | 
136 | Controls what HTTPS agent to use. This is useful if you want to configure an HTTPS connection through a HTTP/HTTPS proxy (see [https-proxy-agent](https://www.npmjs.com/package/https-proxy-agent)).
137 | 
138 | Example:
139 | 
140 | ```JavaScript
141 | // don't forget to:
142 | // npm i http-proxy-agent https-proxy-agent
143 | const HttpProxyAgent = require("http-proxy-agent");
144 | const HttpsProxyAgent = require("https-proxy-agent");
145 | const proxyAddress = 'http://localhost:1234';
146 | const httpProxyAgent = new HttpProxyAgent(proxyAddress);
147 | const httpsProxyAgent = new HttpsProxyAgent(proxyAddress);
148 | options.httpAgent = httpProxyAgent;
149 | options.httpsAgent = httpsProxyAgent;
150 | ```
151 | 
152 | ### ignore(url)
153 | 
154 | Apply a test condition to a URL before it's added to the sitemap.
155 | 
156 | Type: `function`
157 | Default: `null`
158 | 
159 | Example:
160 | 
161 | ```JavaScript
162 | const generator = SitemapGenerator(url, {
163 |   ignore: url => {
164 |     // Prevent URLs from being added that contain `<pattern>`.
165 |     return /<pattern>/g.test(url)
166 |   }
167 | })
168 | ```
169 | 
170 | ### ignoreAMP
171 | 
172 | Type: `boolean`
173 | Default: `true`
174 | 
175 | Indicates whether [Google AMP pages](https://www.ampproject.org/) should be ignored and not be added to the sitemap.
176 | 
177 | ### lastMod
178 | 
179 | Type: `boolean`
180 | Default: `false`
181 | 
182 | Whether to add a `<lastmod>` line to each URL in the sitemap. If present, the response's `Last-Modified` header will be used. Otherwise today's date is added.
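With `lastMod` and `changeFreq` enabled, each entry ends up looking roughly like this (a sketch; the date shown is illustrative):

```JavaScript
const generator = SitemapGenerator('http://example.com', {
  lastMod: true,
  changeFreq: 'weekly'
});

// resulting <url> entry (illustrative):
// <url>
//   <loc>http://example.com/page.html</loc>
//   <lastmod>2020-01-01</lastmod>
//   <changefreq>weekly</changefreq>
// </url>
```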
183 | 
184 | ### maxEntriesPerFile
185 | 
186 | Type: `number`
187 | Default: `50000`
188 | 
189 | Google limits the maximum number of URLs in one sitemap to 50000. If this limit is reached the sitemap-generator creates another sitemap. A sitemap index file will be created as well.
190 | 
191 | ### priorityMap
192 | 
193 | Type: `array`
194 | Default: `[]`
195 | 
196 | If provided, adds a `<priority>` line to each URL in the sitemap. Each value in the priorityMap array corresponds to the depth of the URL being added. For example, the priority value given to a URL equals `priorityMap[depth - 1]`. If a URL's depth is greater than the length of the priorityMap array, the last value in the array will be used. Valid values are between `1.0` and `0.0`.
197 | 
198 | Example:
199 | 
200 | ```javascript
201 | [1.0, 0.8, 0.6, 0.4, 0.2, 0]
202 | ```
203 | 
204 | ### userAgent
205 | 
206 | Type: `string`
207 | Default: `Node/SitemapGenerator`
208 | 
209 | Change the default crawler user agent.
210 | 
211 | ## Events
212 | 
213 | The Sitemap Generator emits several events which can be listened to.
214 | 
215 | ### `add`
216 | 
217 | Triggered when the crawler successfully added a resource to the sitemap. Passes the URL as argument.
218 | 
219 | ```JavaScript
220 | generator.on('add', (url) => {
221 |   // log url
222 | });
223 | ```
224 | 
225 | ### `done`
226 | 
227 | Triggered when the crawler finished and the sitemap is created.
228 | 
229 | ```JavaScript
230 | generator.on('done', () => {
231 |   // sitemaps created
232 | });
233 | ```
234 | 
235 | ### `error`
236 | 
237 | Thrown if there was an error while fetching a URL. Passes an object with the HTTP status code, a message and the URL as argument.
238 | 
239 | ```JavaScript
240 | generator.on('error', (error) => {
241 |   console.log(error);
242 |   // => { code: 404, message: 'Not found.', url: 'http://example.com/foo' }
243 | });
244 | ```
245 | 
246 | ### `ignore`
247 | 
248 | If a URL matches a disallow rule in the `robots.txt` file or the meta robots `noindex` is present, this event is triggered. The URL will not be added to the sitemap. Passes the ignored URL as argument.
249 | 
250 | ```JavaScript
251 | generator.on('ignore', (url) => {
252 |   // log ignored url
253 | });
254 | ```
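Putting the events together, a typical listener setup might look like this (a sketch using only the events documented above):

```JavaScript
generator.on('add', url => console.log(`added ${url}`));
generator.on('ignore', url => console.log(`ignored ${url}`));
generator.on('error', ({ code, url }) => console.error(`${code} on ${url}`));
generator.on('done', () => console.log('sitemap created'));

generator.start();
```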
255 | 
256 | ## FAQ
257 | 
258 | <details>
259 |   <summary>Does this work with React, Angular, ...?</summary>
260 |   This package doesn't care what frameworks and technologies you are using under the hood. The only requirement is that your URLs return valid HTML. Therefore SSR (server-side rendering) is required for single page apps, as no JavaScript is executed.
261 | </details>
262 | 
263 | <details>
264 |   <summary>Where to put this code?</summary>
265 |   This is basically up to you. You can execute this code manually and upload your sitemap by hand, or you can put it on your server and run it periodically to keep your sitemap up to date, as sketched below.
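  A minimal sketch of the periodic approach (the filename, output path and schedule are assumptions):

  ```JavaScript
  // mycrawler.js — recreate the sitemap from scratch on every run
  const SitemapGenerator = require('sitemap-generator');

  const generator = SitemapGenerator('http://example.com', {
    filepath: '/var/www/static/sitemap.xml'
  });

  generator.on('done', () => console.log('sitemap updated'));
  generator.start();

  // run e.g. nightly via cron: 0 3 * * * node /path/to/mycrawler.js
  ```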
266 | </details>
267 | 
268 | <details>
269 |   <summary>Should I use this package or the CLI?</summary>
270 |   The CLI should suffice for most common use cases. It has several options to tweak in case you want it to behave differently. If your use case is more advanced and you need fine control over what the crawler should fetch, you should use this package and the programmatic API.
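  For reference, the CLI is invoked roughly like this (a sketch; check the [CLI repository](https://github.com/lgraubner/sitemap-generator-cli) for the exact command and flags):

  ```
  $ npx sitemap-generator-cli http://example.com
  ```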
271 | </details>
272 | 
273 | ## License
274 | 
275 | [MIT](https://github.com/lgraubner/sitemap-generator/blob/master/LICENSE) © [Lars Graubner](https://larsgraubner.com)
276 | 
--------------------------------------------------------------------------------