├── .gitignore ├── LICENSE.md ├── README.md ├── package.json ├── phantomjs └── bridge.js └── tasks └── htmlSnapshot.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | npm-debug.log -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 Christoph Burgdorf, contributors 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # grunt-html-snapshot 2 | 3 | > Makes it easy to provide html snapshots for client side applications so that they can be indexed by web crawlers 4 | 5 | 6 | 7 | ## Getting Started 8 | This plugin requires Grunt `~0.4.0` 9 | 10 | If you haven't used [Grunt](http://gruntjs.com/) before, be sure to check out the [Getting Started](http://gruntjs.com/getting-started) guide, as it explains how to create a [Gruntfile](http://gruntjs.com/sample-gruntfile) as well as install and use Grunt plugins. Once you're familiar with that process, you may install this plugin with this command: 11 | 12 | ```shell 13 | npm install grunt-html-snapshot --save-dev 14 | ``` 15 | 16 | Once the plugin has been installed, it may be enabled inside your Gruntfile with this line of JavaScript: 17 | 18 | ```js 19 | grunt.loadNpmTasks('grunt-html-snapshot'); 20 | ``` 21 | 22 | 23 | ## htmlSnapshot task 24 | _Run this task with the `grunt htmlSnapshot` command._ 25 | 26 | ## configuring the htmlSnapshot task 27 | 28 | ```js 29 | grunt.initConfig({ 30 | htmlSnapshot: { 31 | all: { 32 | options: { 33 | //that's the path where the snapshots should be placed 34 | //it's empty by default which means they will go into the directory 35 | //where your Gruntfile.js is placed 36 | snapshotPath: 'snapshots/', 37 | //This should be either the base path to your index.html file 38 | //or your base URL. Currently the task does not use its own 39 | //webserver. So if your site needs a webserver to be fully 40 | //functional configure it here. 41 | sitePath: 'http://localhost:8888/my-website/', 42 | //you can choose a prefix for your snapshots 43 | //by default it's 'snapshot_' 44 | fileNamePrefix: 'sp_', 45 | //by default the task waits 500ms before fetching the html. 46 | //this is to give the page enough time to assemble itself. 
47 | //if your page needs more time, tweak here. 48 | msWaitForPages: 1000, 49 | //sanitize function to be used for filenames. Converts '#!/' to '_' as default 50 | //has a filename argument, must have a return that is a sanitized string 51 | sanitize: function (requestUri) { 52 | //returns 'index.html' if the url is '/', otherwise a prefix 53 | if (/\/$/.test(requestUri)) { 54 | return 'index.html'; 55 | } else { 56 | return requestUri.replace(/\//g, 'prefix-'); 57 | } 58 | }, 59 | //if you would rather not keep the script tags in the html snapshots 60 | //set `removeScripts` to true. It's false by default 61 | removeScripts: true, 62 | //set `removeLinkTags` to true. It's false by default 63 | removeLinkTags: true, 64 | //set `removeMetaTags` to true. It's false by default 65 | removeMetaTags: true, 66 | //Replace arbitrary parts of the html 67 | replaceStrings:[ 68 | {'this': 'will get replaced by this'}, 69 | {'/old/path/': '/new/path'} 70 | ], 71 | // allow to add a custom attribute to the body 72 | bodyAttr: 'data-prerendered', 73 | //here goes the list of all urls that should be fetched 74 | urls: [ 75 | '', 76 | '#!/en-gb/showcase' 77 | ], 78 | // a list of cookies to be put into the phantomjs cookies jar for the visited page 79 | cookies: [ 80 | {"path": "/", "domain": "localhost", "name": "lang", "value": "en-gb"} 81 | ], 82 | // options for phantomJs' page object 83 | // see http://phantomjs.org/api/webpage/ for available options 84 | pageOptions: { 85 | viewportSize : { 86 | width: 1200, 87 | height: 800 88 | } 89 | } 90 | } 91 | } 92 | } 93 | }); 94 | ``` 95 | 96 | ## Release History 97 | 98 | - 0.6.1 - trigger warnings with grunt.warn(msg, 6) instead of grunt.log(msg) 99 | - 0.6.0 - Provide a function hook for the file name sanitization (by @mrgamer) 100 | - 0.5.0 - Add option to set cookies. Also fixed a bug for scenarios where multiple instances of the tasks are being used in parallel. 
101 | - 0.4.0 - Add more sophisticated replace functionality to transform the html output (thanks to @okcoker) 102 | - 0.3.0 - Escape tabs & introduced new option bodyAttr to place a custom attribute on the body 103 | - 0.2.1 - fixed a bug where quotes were missing from the html 104 | - 0.2.0 - added option to remove script tags from the output 105 | - 0.1.0 - Initial release 106 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "grunt-html-snapshot", 3 | "version": "0.6.1", 4 | "description": "A grunt task that fetches html snapshots of your web app for easier SEO", 5 | "author": { 6 | "name": "Christoph Burgdorf", 7 | "email": "christoph.burgdorf@bvsn.org", 8 | "url": "http://cburgdorf.wordpress.com" 9 | }, 10 | "main": "Gruntfile.js", 11 | "scripts": { 12 | "test": "echo \"Error: no test specified\" && exit 1" 13 | }, 14 | "repository": { 15 | "type": "git", 16 | "url": "https://github.com/cburgdorf/grunt-html-snapshot.git" 17 | }, 18 | "licenses": [ 19 | { 20 | "type": "MIT", 21 | "url": "https://github.com/cburgdorf/grunt-html-snapshot/blob/master/LICENSE-MIT" 22 | } 23 | ], 24 | "engines": { 25 | "node": ">= 0.8.0" 26 | }, 27 | "dependencies": { 28 | "grunt-lib-phantomjs": "0.3.0" 29 | }, 30 | "keywords": [ 31 | "SEO", 32 | "html", 33 | "snapshots", 34 | "gruntplugin" 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /phantomjs/bridge.js: -------------------------------------------------------------------------------- 1 | 2 | "use strict"; 3 | 4 | var fs = require("fs"); 5 | 6 | // The temporary file used for communications. 7 | var tmpfile = phantom.args[0]; 8 | // The page .html file to load. 9 | var url = phantom.args[1]; 10 | // Extra, optionally overridable stuff. 
11 | var options = JSON.parse(phantom.args[2] || {}); 12 | 13 | // Messages are sent to the parent by appending them to the tempfile. 14 | // NOTE, the tempfile appears to be shared between asynchronously running grunt tasks 15 | var sendMessage = function (arg) { 16 | var args = Array.isArray(arg) ? arg : [].slice.call(arguments); 17 | var channel = options.taskChannelPrefix + '.' + args[0]; 18 | args[0] = channel; 19 | fs.write(tmpfile, JSON.stringify(args) + "\n", "a"); 20 | }; 21 | 22 | var sanitizeHtml = function(html,options){ 23 | //remove weird pseudo new lines and tabs 24 | html = html.replace(/\\n|\\t/g,""); 25 | // add a custom attribute if so required 26 | if (options.bodyAttr) 27 | html = html.replace(/
)<[^<]*)*<\/script>/gi, ''); 70 | } 71 | 72 | if (options.removeLinkTags){ 73 | msg = msg.replace(//gi, ''); 74 | } 75 | 76 | if (options.removeMetaTags) { 77 | msg = msg.replace(//gi, ''); 78 | } 79 | 80 | options.replaceStrings.forEach(function(obj) { 81 | var key = Object.keys(obj); 82 | var value = obj[key]; 83 | var regex = new RegExp(key, 'g'); 84 | msg = msg.replace(regex, value); 85 | }); 86 | 87 | grunt.file.write(fileName, msg); 88 | grunt.log.writeln(fileName, 'written'); 89 | phantom.halt(); 90 | 91 | isLastUrl(plainUrl) && done(); 92 | }); 93 | 94 | var done = this.async(); 95 | 96 | var urls = options.urls; 97 | var sitePath = options.sitePath; 98 | 99 | grunt.util.async.forEachSeries(urls, function(url, next) { 100 | 101 | phantom.spawn(sitePath + url, { 102 | // Additional PhantomJS options. 103 | options: { 104 | phantomScript: asset('phantomjs/bridge.js'), 105 | msWaitForPages: options.msWaitForPages, 106 | bodyAttr: options.bodyAttr, 107 | cookies: options.cookies, 108 | taskChannelPrefix: taskChannelPrefix, 109 | pageOptions: options.pageOptions 110 | }, 111 | // Complete the task when done. 112 | done: function (err) { 113 | if (err) { 114 | // If there was an error, abort the series. 115 | done(); 116 | } 117 | else { 118 | // Otherwise, process next url. 119 | next(); 120 | } 121 | } 122 | }); 123 | }); 124 | grunt.log.writeln('running html-snapshot task...hold your horses'); 125 | }); 126 | }; 127 | 128 | }()); 129 | --------------------------------------------------------------------------------