├── .gitignore ├── README.md ├── node_modules └── website-scraper │ ├── .npmignore │ ├── LICENSE │ ├── README.md │ ├── app.js │ ├── index.js │ ├── lib │ ├── config │ │ ├── defaults.js │ │ ├── recursive-sources.js │ │ ├── resource-types-by-tag.js │ │ └── resource-types.js │ ├── file-handlers │ │ ├── css.js │ │ └── html.js │ ├── request.js │ ├── resource.js │ ├── scraper.js │ └── utils.js │ └── package.json ├── package.json ├── public └── index.html └── server ├── controllers ├── dbController.js ├── mdnJS.js └── updateModel.js ├── middleware ├── folderHandler.js ├── mdnCSS.js ├── mdnHTML.js ├── mdnJS.js ├── nodeparser_working.js ├── parseEntryPoint.js ├── parser.js ├── requestProps.js ├── rewrite.js ├── scrapeParseWrite.js └── versionCheck.js ├── server.js └── updater.js /.gitignore: -------------------------------------------------------------------------------- 1 | # npm dependencies 2 | node_modules/* 3 | 4 | # debug logs 5 | npm-debug.log 6 | JavaScript.tgz 7 | 8 | #docs 9 | docs 10 | test 11 | ignore_test_files 12 | website-scraper 13 | zips 14 | temp 15 | 16 | #include edited version of website_scraper 17 | !node_modules/website-scraper 18 | node_modules/website-scraper/node_modules 19 | 20 | # packaged application (end-user) 21 | Doc-tor-darwin-x64 22 | 23 | # front-end build 24 | build 25 | 26 | # misc 27 | scraper.bak.js 28 | server/middleware/nodeparser_working.js 29 | ..bfg-report 30 | JavaScript.tgz 31 | mdnFiles/ 32 | doc/ 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Doc-Server 2 | -------------------------------------------------------------------------------- /node_modules/website-scraper/.npmignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .gitignore 3 | .travis.yml 4 | coverage 5 | test 6 | -------------------------------------------------------------------------------- /node_modules/website-scraper/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Sophia Nepochataya 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /node_modules/website-scraper/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | Download website to a local directory (including all css, images, js, etc.) 3 | 4 | [![Build Status](https://img.shields.io/travis/s0ph1e/node-website-scraper/master.svg?style=flat)](https://travis-ci.org/s0ph1e/node-website-scraper) 5 | [![Test Coverage](https://codeclimate.com/github/s0ph1e/node-website-scraper/badges/coverage.svg)](https://codeclimate.com/github/s0ph1e/node-website-scraper/coverage) 6 | [![Code Climate](https://codeclimate.com/github/s0ph1e/node-website-scraper/badges/gpa.svg)](https://codeclimate.com/github/s0ph1e/node-website-scraper) 7 | [![Version](https://img.shields.io/npm/v/website-scraper.svg?style=flat)](https://www.npmjs.org/package/website-scraper) 8 | [![Downloads](https://img.shields.io/npm/dm/website-scraper.svg?style=flat)](https://www.npmjs.org/package/website-scraper) 9 | [![Dependency Status](https://david-dm.org/s0ph1e/node-website-scraper.svg?style=flat)](https://david-dm.org/s0ph1e/node-website-scraper) 10 | 11 | [![NPM Stats](https://nodei.co/npm/website-scraper.png?downloadRank=true&stars=true)](https://www.npmjs.org/package/website-scraper) 12 | 13 | You can try it in the [demo app](https://scraper.nepochataya.pp.ua/) ([source](https://github.com/s0ph1e/web-scraper)) 14 | 15 | ## Installation 16 | ``` 17 | npm install website-scraper 18 | ``` 19 | 20 | ## Usage 21 | ```javascript 22 | var scraper = require('website-scraper'); 23 | var options = { 24 | urls: ['http://nodejs.org/'], 25 | directory: '/path/to/save/', 26 | }; 27 | 28 | // with callback 29 | scraper.scrape(options, function (error, result) { 30 | /* some code here */ 31 | }); 32 | 33 | // or with promise 34 | scraper.scrape(options).then(function (result) { 35 | /* some code here */ 36 | }); 37 | ``` 38 | 39 | ## API 40 | ### scrape(options, callback) 41 | Makes requests to `urls` and saves all files found with `sources` to `directory`. 42 | 43 | **options** - object containing the following options: 44 | 45 | - `urls:` array of urls to load and filenames for them *(required, see example below)* 46 | - `directory:` path to save loaded files *(required)* 47 | - `defaultFilename:` filename for index page *(optional, default: 'index.html')* 48 | - `sources:` array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see default value in `lib/config/defaults.js`)* 49 | - `subdirectories:` array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)* 50 | - `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)* 51 | - `recursive`: boolean, if `true` scraper will follow anchors in html files. 
Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)* 52 | - `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)* 53 | 54 | 55 | **callback** - callback function *(optional)*, includes the following parameters: 56 | 57 | - `error:` if error - `Error` object, if success - `null` 58 | - `result:` if error - `null`, if success - array of objects containing: 59 | - `url:` url of loaded page 60 | - `filename:` filename where page was saved (relative to `directory`) 61 | 62 | 63 | ## Examples 64 | #### Example 1 65 | Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`. 66 | Imagine we want to load: 67 | - [Home page](http://nodejs.org/) to `index.html` 68 | - [About page](http://nodejs.org/about/) to `about.html` 69 | - [Blog](http://blog.nodejs.org/) to `blog.html` 70 | 71 | and separate files into directories: 72 | 73 | - `img` for .jpg, .png, .svg (full path `/path/to/save/img`) 74 | - `js` for .js (full path `/path/to/save/js`) 75 | - `css` for .css (full path `/path/to/save/css`) 76 | 77 | ```javascript 78 | var scraper = require('website-scraper'); 79 | scraper.scrape({ 80 | urls: [ 81 | 'http://nodejs.org/', // Will be saved with default filename 'index.html' 82 | {url: 'http://nodejs.org/about', filename: 'about.html'}, 83 | {url: 'http://blog.nodejs.org/', filename: 'blog.html'} 84 | ], 85 | directory: '/path/to/save', 86 | subdirectories: [ 87 | {directory: 'img', extensions: ['.jpg', '.png', '.svg']}, 88 | {directory: 'js', extensions: ['.js']}, 89 | {directory: 'css', extensions: ['.css']} 90 | ], 91 | sources: [ 92 | {selector: 'img', attr: 'src'}, 93 | {selector: 'link[rel="stylesheet"]', attr: 'href'}, 94 | {selector: 'script', attr: 'src'} 95 | ], 96 | request: { 97 | headers: { 98 | 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19' 99 | } 100 | } 101 | }).then(function (result) { 102 | console.log(result); 103 | }).catch(function(err){ 104 | console.log(err); 105 | }); 106 | ``` 107 | 108 | #### Example 2. 
Recursive downloading 109 | ```javascript 110 | // Links from example.com will be followed 111 | // Links found on those pages will be ignored because their depth = 2 is greater than maxDepth 112 | var scraper = require('website-scraper'); 113 | scraper.scrape({ 114 | urls: ['http://example.com/'], 115 | directory: '/path/to/save', 116 | recursive: true, 117 | maxDepth: 1 118 | }).then(console.log).catch(console.log); 119 | ``` 120 | -------------------------------------------------------------------------------- /node_modules/website-scraper/app.js: -------------------------------------------------------------------------------- 1 | var scraper = require('./index'); 2 | scraper.scrape({ 3 | urls: [ 4 | 'http://nodejs.org/', // Will be saved with default filename 'index.html' 5 | {url: 'http://nodejs.org/about', filename: 'about.html'}, 6 | {url: 'http://blog.nodejs.org/', filename: 'blog.html'} 7 | ], 8 | directory: './path/to/save', 9 | subdirectories: [ 10 | {directory: 'img', extensions: ['.jpg', '.png', '.svg']}, 11 | {directory: 'js', extensions: ['.js']}, 12 | {directory: 'css', extensions: ['.css']} 13 | ], 14 | sources: [ 15 | {selector: 'img', attr: 'src'}, 16 | {selector: 'link[rel="stylesheet"]', attr: 'href'}, 17 | {selector: 'script', attr: 'src'} 18 | ], 19 | request: { 20 | headers: { 21 | 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19' 22 | } 23 | } 24 | }).then(function (result) { 25 | console.log(result); 26 | }).catch(function(err){ 27 | console.log(err); 28 | }); -------------------------------------------------------------------------------- /node_modules/website-scraper/index.js: -------------------------------------------------------------------------------- 1 | var Scraper = require('./lib/scraper.js'); 2 | 3 | module.exports.scrape = function (options, callback) { 4 | return new Scraper(options).scrape(callback); 5 | }; -------------------------------------------------------------------------------- /node_modules/website-scraper/lib/config/defaults.js: -------------------------------------------------------------------------------- 1 | var config = { 2 | defaultFilename: 'index.html', 3 | sources: [ 4 | { 5 | selector: 'img', 6 | attr: 'src' 7 | }, 8 | { 9 | selector: 'input', 10 | attr: 'src' 11 | }, 12 | { 13 | selector: 'object', 14 | attr: 'data' 15 | }, 16 | { 17 | selector: 'embed', 18 | attr: 'src' 19 | }, 20 | { 21 | selector: 'param[name="movie"]', 22 | attr: 'value' 23 | }, 24 | { 25 | selector: 'script', 26 | attr: 'src' 27 | }, 28 | { 29 | selector: 'link[rel="stylesheet"]', 30 | attr: 'href' 31 | }, 32 | { 33 | selector: 'link[rel*="icon"]', 34 | attr: 'href' 35 | } 36 | ], 37 | subdirectories: [ 38 | { 39 | directory: 'images', 40 | extensions: ['.png', '.jpg', '.jpeg', '.gif'] 41 | }, 42 | { 43 | directory: 'js', 44 | extensions: ['.js'] 45 | }, 46 | { 47 | directory: 'css', 48 | extensions: ['.css'] 49 | }, 50 | { 51 | directory: 'fonts', 52 | extensions: ['.ttf', '.woff', '.eot', '.svg'] 53 | } 54 | ] 55 | }; 56 | 57 | module.exports = config; 58 | -------------------------------------------------------------------------------- /node_modules/website-scraper/lib/config/recursive-sources.js: -------------------------------------------------------------------------------- 1 | module.exports = [ 2 | { selector: 'a', attr: 'href' } 3 | ]; -------------------------------------------------------------------------------- 
/node_modules/website-scraper/lib/config/resource-types-by-tag.js: -------------------------------------------------------------------------------- 1 | var types = require('./resource-types'); 2 | 3 | var typesByHtmlTag = {}; 4 | 5 | typesByHtmlTag[types.css] = [ 6 | { tagName: 'link', attributeName: 'href' } 7 | ]; 8 | typesByHtmlTag[types.html] = [ 9 | { tagName: 'a', attributeName: 'href' }, 10 | { tagName: 'iframe', attributeName: 'src' } 11 | ]; 12 | 13 | module.exports = typesByHtmlTag; 14 | -------------------------------------------------------------------------------- /node_modules/website-scraper/lib/config/resource-types.js: -------------------------------------------------------------------------------- 1 | var types = { 2 | css: 'css', 3 | html: 'html', 4 | other: 'other' 5 | }; 6 | 7 | module.exports = types; 8 | -------------------------------------------------------------------------------- /node_modules/website-scraper/lib/file-handlers/css.js: -------------------------------------------------------------------------------- 1 | var _ = require('underscore'); 2 | var Promise = require('bluebird'); 3 | var getCssUrls = require('css-url-parser'); 4 | var utils = require('../utils'); 5 | 6 | function loadCss (context, resource) { 7 | var url = resource.getUrl(); 8 | var filename = resource.getFilename(); 9 | var text = resource.getText(); 10 | var cssUrls = getCssUrls(text); 11 | 12 | var promises = _.map(cssUrls, function loadResourceFromCssUrl (cssUrl) { 13 | var resourceUrl = utils.getUrl(url, cssUrl); 14 | var cssResource = resource.createChild(resourceUrl); 15 | 16 | return context.loadResource(cssResource).then(function handleLoadedSource (loadedResource) { 17 | var relativePath = utils.getRelativePath(filename, loadedResource.getFilename()); 18 | text = text.replace(cssUrl, relativePath); 19 | return Promise.resolve(); 20 | }); 21 | }); 22 | 23 | return utils.waitAllFulfilled(promises).then(function () { 24 | resource.setText(text); 25 | return resource; 26 | }); 27 | } 28 | 29 | module.exports = loadCss; 30 | -------------------------------------------------------------------------------- /node_modules/website-scraper/lib/file-handlers/html.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'); 2 | var Promise = require('bluebird'); 3 | var utils = require('../utils'); 4 | 5 | function loadHtml (context, resource) { 6 | var sources = context.getHtmlSources(); 7 | var handleResources = loadResources.bind(null, context, resource); 8 | 9 | var p = beforeHandle(resource); 10 | 11 | sources.forEach(function (src) { 12 | p = p.then(function loadSource () { 13 | return handleResources(src); 14 | }); 15 | }); 16 | return p; 17 | } 18 | 19 | function beforeHandle (resource) { 20 | var text = resource.getText(); 21 | var $ = cheerio.load(text); 22 | 23 | // Handle <base> tag 24 | $('base').each(function () { 25 | var el = $(this); 26 | var href = el.attr('href'); 27 | if (href) { 28 | var newUrl = utils.getUrl(resource.getUrl(), href); 29 | resource.setUrl(newUrl); 30 | el.remove(); 31 | } 32 | }); 33 | 34 | text = $.html(); 35 | resource.setText(text); 36 | 37 | return Promise.resolve(resource); 38 | } 39 | 40 | function loadResources (context, resource, source) { 41 | var url = resource.getUrl(); 42 | var text = resource.getText(); 43 | var filename = resource.getFilename(); 44 | var $ = cheerio.load(text); 45 | 46 | var promises = $(source.selector).map(function loadForSelector () { 47 | var el = $(this); 48 | var 
attr = el.attr(source.attr); 49 | 50 | if (attr) { 51 | var resourceUrl = utils.getUrl(url, attr); 52 | var htmlResource = resource.createChild(resourceUrl); 53 | htmlResource.setHtmlData({ tagName: el[0].name, attributeName: source.attr }); 54 | 55 | return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) { 56 | var relativePath = utils.getRelativePath(filename, loadedResource.getFilename()); 57 | el.attr(source.attr, relativePath); 58 | return Promise.resolve(); 59 | }); 60 | } 61 | return Promise.reject(); 62 | }); 63 | 64 | return utils.waitAllFulfilled(promises).then(function () { 65 | text = $.html(); 66 | resource.setText(text); 67 | return resource; 68 | }); 69 | } 70 | 71 | module.exports = loadHtml; 72 | -------------------------------------------------------------------------------- /node_modules/website-scraper/lib/request.js: -------------------------------------------------------------------------------- 1 | var _ = require('underscore'); 2 | var Promise = require('bluebird'); 3 | var request = require('request'); 4 | var get = Promise.promisify(request.get); 5 | 6 | var defaultOptions = { 7 | method: 'GET', 8 | encoding: 'binary', 9 | strictSSL: false, 10 | jar: true 11 | }; 12 | 13 | function getDefaultOptions() { 14 | return defaultOptions; 15 | } 16 | 17 | function getCustomOptions(options) { 18 | return _.extend({}, defaultOptions, options); 19 | } 20 | 21 | function makeRequest(options, url) { 22 | var requestOptions = getCustomOptions(options); 23 | requestOptions.url = url; 24 | 25 | return get(requestOptions).then(function handleResponse(data) { 26 | return { 27 | url: data.request.href, 28 | body: data.body 29 | }; 30 | }); 31 | } 32 | 33 | module.exports.makeRequest = makeRequest; 34 | module.exports.getDefaultOptions = getDefaultOptions; 35 | module.exports.getCustomOptions = getCustomOptions; 36 | -------------------------------------------------------------------------------- /node_modules/website-scraper/lib/resource.js: -------------------------------------------------------------------------------- 1 | var _ = require('underscore'); 2 | var path = require('path'); 3 | var types = require('./config/resource-types'); 4 | var typesByHtmlData = require('./config/resource-types-by-tag'); 5 | 6 | function getTypeByHtmlData (htmlData) { 7 | var type = _.findKey(typesByHtmlData, function containsHtmlData (rules) { 8 | return _.findWhere(rules, htmlData); 9 | }); 10 | return type || types.other; 11 | } 12 | 13 | function Resource (url, filename) { 14 | this.url = url; 15 | this.filename = filename; 16 | } 17 | 18 | Resource.prototype.createChild = function createChild (url, filename) { 19 | var child = new Resource(url, filename); 20 | 21 | var currentDepth = this.getDepth(); 22 | 23 | child.setParent(this); 24 | child.setDepth(++currentDepth); 25 | 26 | return child; 27 | }; 28 | 29 | Resource.prototype.getUrl = function getUrl () { 30 | return this.url; 31 | }; 32 | 33 | Resource.prototype.setUrl = function setUrl (url) { 34 | this.url = url; 35 | }; 36 | 37 | Resource.prototype.getFilename = function getFilename () { 38 | return this.filename; 39 | }; 40 | 41 | Resource.prototype.setFilename = function setFilename (filename) { 42 | this.filename = filename; 43 | }; 44 | 45 | Resource.prototype.getText = function getText () { 46 | return this.text; 47 | }; 48 | 49 | Resource.prototype.setText = function setText (text) { 50 | this.text = text; 51 | }; 52 | 53 | Resource.prototype.setParent = function setParent (parent) { 54 | 
this.parent = parent; 55 | }; 56 | 57 | Resource.prototype.getDepth = function getDepth () { 58 | return this.depth || 0; 59 | }; 60 | 61 | Resource.prototype.setDepth = function setDepth (depth) { 62 | this.depth = depth; 63 | }; 64 | 65 | /** 66 | * 67 | * @param {Object} data - html element data 68 | * @param {string} data.tagName - tag name which contain resource 69 | * @param {string} data.attributeName - attribute name with value of resource's url 70 | */ 71 | Resource.prototype.setHtmlData = function setHtmlData (data) { 72 | this.htmlData = data; 73 | }; 74 | 75 | Resource.prototype.getType = function getType () { 76 | var ext = path.extname(this.filename); 77 | var parentType = this.parent && this.parent.getType(); 78 | var hasHtmlData = !!this.htmlData; 79 | 80 | switch (true) { 81 | case ext == '.html' || ext == '.htm': 82 | return types.html; 83 | case ext == '.css': 84 | case !ext && parentType == types.css: 85 | return types.css; 86 | case !ext && parentType == types.html && hasHtmlData: 87 | return getTypeByHtmlData(this.htmlData); 88 | default: 89 | return types.other; 90 | } 91 | }; 92 | 93 | module.exports = Resource; 94 | -------------------------------------------------------------------------------- /node_modules/website-scraper/lib/scraper.js: -------------------------------------------------------------------------------- 1 | var Promise = require('bluebird'); 2 | 3 | var fs = require('fs-extra'); 4 | var existsAsync = Promise.promisify(fs.stat); 5 | var outputFileAsync = Promise.promisify(fs.outputFile); 6 | var ensureDirAsync = Promise.promisify(fs.ensureDir); 7 | 8 | var path = require('path'); 9 | var _ = require('underscore'); 10 | 11 | var defaults = require('./config/defaults'); 12 | var types = require('./config/resource-types'); 13 | var recursiveSources = require('./config/recursive-sources'); 14 | var utils = require('./utils.js'); 15 | var request = require('./request'); 16 | var Resource = require('./resource'); 17 | var compareUrls = require('compare-urls'); 18 | 19 | var loadHtml = require('./file-handlers/html'); 20 | var loadCss = require('./file-handlers/css'); 21 | function loadHtmlAndCss (context, po) { 22 | return loadHtml(context, po).then(function (loaded) { 23 | return loadCss(context, loaded); 24 | }); 25 | } 26 | 27 | function Scraper (options) { 28 | this.originalResources = []; 29 | this.loadedResources = []; 30 | 31 | this.options = _.extend({}, defaults, options); 32 | this.options.directory = path.resolve(process.cwd(), this.options.directory || ''); 33 | } 34 | 35 | Scraper.prototype.getLoadedResource = function getLoadedResource (resource) { 36 | return _.find(this.loadedResources, function(lr) { 37 | return compareUrls(resource.getUrl(), lr.getUrl()); 38 | }); 39 | }; 40 | 41 | Scraper.prototype.addLoadedResource = function addLoadedResource (resource) { 42 | this.loadedResources.push(resource); 43 | }; 44 | 45 | Scraper.prototype.getOccupiedFilenames = function getOccupiedFilenames () { 46 | var subdirectories = _.map(this.options.subdirectories, function (dir) { return dir.directory; }); 47 | var loadedFiles = _.map(this.loadedResources, function(r) { return r.getFilename(); }); 48 | return subdirectories.concat(loadedFiles); 49 | }; 50 | 51 | Scraper.prototype.getHtmlSources = function getHtmlSources () { 52 | return this.options.sources; 53 | }; 54 | 55 | Scraper.prototype.generateFilename = function generateFilename (resource) { 56 | var self = this; 57 | 58 | var occupiedFilenames = self.getOccupiedFilenames(); 59 | 60 | var 
preferredFilename = resource.getFilename(); // which was set in options 61 | var urlFilename = utils.getFilenameFromUrl(resource.getUrl()); // try to get filename from url 62 | var filename = preferredFilename || urlFilename || self.options.defaultFilename; 63 | 64 | var ext = path.extname(filename); 65 | var dir = self.getDirectoryByExtension(ext); 66 | var currentFilename = path.join(dir, filename); 67 | var basename = path.basename(filename, ext); 68 | var index = 1; 69 | 70 | while (_.contains(occupiedFilenames, currentFilename)) { 71 | currentFilename = path.join(dir, basename + '_' + index + ext); 72 | index++; 73 | } 74 | return currentFilename; 75 | }; 76 | 77 | Scraper.prototype.getDirectoryByExtension = function getDirectoryByExtension (ext) { 78 | return _.chain(this.options.subdirectories) 79 | .filter(function (dir) { return _.contains(dir.extensions, ext); }) 80 | .map(function (dir) { return dir.directory; }) 81 | .first() 82 | .value() || ''; 83 | }; 84 | 85 | Scraper.prototype.getResourceHandler = function getHandler (resource) { 86 | var self = this; 87 | var type = resource.getType(); 88 | var depth = resource.getDepth(); 89 | var depthGreaterThanMax = self.options.maxDepth && depth >= self.options.maxDepth; 90 | 91 | switch (true) { 92 | case depthGreaterThanMax: return _.noop; 93 | case type == types.css: return loadCss; 94 | case type == types.html: return loadHtmlAndCss; 95 | default: return _.noop; 96 | } 97 | }; 98 | 99 | Scraper.prototype.loadResource = function loadResource (resource) { 100 | var self = this; 101 | 102 | var loaded = self.getLoadedResource(resource); // try to find already loaded 103 | 104 | var url = resource.getUrl(); 105 | var filename; 106 | var handleFile; 107 | 108 | if (!loaded) { 109 | filename = self.generateFilename(resource); 110 | resource.setFilename(filename); 111 | 112 | self.addLoadedResource(resource); 113 | 114 | // Request -> processing -> save to fs 115 | return self.makeRequest(url).then(function requestCompleted(data) { 116 | resource.setUrl(data.url); // Url may be changed in redirects 117 | resource.setText(data.body); 118 | handleFile = self.getResourceHandler(resource); 119 | return handleFile(self, resource); 120 | }).then(function fileHandled() { 121 | var filename = path.join(self.options.directory, resource.getFilename()); 122 | var text = resource.getText(); 123 | return outputFileAsync(filename, text, { encoding: 'binary' }); 124 | }).then(function fileSaved() { 125 | return Promise.resolve(resource); 126 | }); 127 | } 128 | return Promise.resolve(loaded); 129 | }; 130 | 131 | Scraper.prototype.validate = function validate () { 132 | var dir = this.options.directory; 133 | return existsAsync(dir).then(function handleDirectoryExist () { 134 | return Promise.reject(new Error('Path ' + dir + ' exists')); 135 | }, function handleDirectoryNotExist () { 136 | return Promise.resolve(); 137 | }); 138 | }; 139 | 140 | Scraper.prototype.prepare = function prepare () { 141 | var self = this; 142 | 143 | // Create makeRequest function with custom request params 144 | self.makeRequest = request.makeRequest.bind(null, self.options.request); 145 | 146 | // Create array of Resource for downloading 147 | self.options.urls = _.isArray(self.options.urls) ? self.options.urls : [self.options.urls]; 148 | self.originalResources = _.map(self.options.urls, function createResource(obj) { 149 | var url = _.isObject(obj) && _.has(obj, 'url') ? obj.url : obj; 150 | var filename = _.isObject(obj) && _.has(obj, 'filename') ? 
obj.filename : self.options.defaultFilename; 151 | return new Resource(url, filename); 152 | }); 153 | 154 | if (self.options.recursive) { 155 | self.options.sources = _.union(self.options.sources, recursiveSources); 156 | } 157 | 158 | return ensureDirAsync(self.options.directory); 159 | }; 160 | 161 | Scraper.prototype.load = function load () { 162 | var self = this; 163 | return Promise.map(self.originalResources, function loadPage (po) { 164 | return self.loadResource(po).then(function pageLoaded (loaded) { 165 | return Promise.resolve({ 166 | url: loaded.getUrl(), 167 | filename: loaded.getFilename() 168 | }); 169 | }); 170 | }); 171 | }; 172 | 173 | Scraper.prototype.errorCleanup = function errorCleanup (error) { 174 | if (!_.isEmpty(this.loadedResources)) { 175 | fs.removeSync(this.options.directory); 176 | } 177 | throw error; 178 | }; 179 | 180 | Scraper.prototype.scrape = function scrape(callback) { 181 | var self = this; 182 | return Promise.bind(self) 183 | .then(self.validate) 184 | .then(self.prepare) 185 | .then(self.load) 186 | .catch(self.errorCleanup) 187 | .asCallback(callback); 188 | }; 189 | 190 | module.exports = Scraper; 191 | -------------------------------------------------------------------------------- /node_modules/website-scraper/lib/utils.js: -------------------------------------------------------------------------------- 1 | var url = require('url'); 2 | var path = require('path'); 3 | var Promise = require('bluebird'); 4 | 5 | function isUrl(path) { 6 | var urlRegexp = /^((http[s]?:)?\/\/)/; 7 | return urlRegexp.test(path); 8 | } 9 | 10 | function getUrl(currentUrl, path) { 11 | var pathObj = url.parse(path); 12 | if (isUrl(path) && !pathObj.protocol) { 13 | pathObj.protocol = 'http'; 14 | path = url.format(pathObj); 15 | } 16 | return url.resolve(currentUrl, path); 17 | } 18 | 19 | function getUnixPath(filepath) { 20 | return filepath.replace(/\\/g, '/'); 21 | } 22 | 23 | function getRelativePath(path1, path2) { 24 | var dirname = path.dirname(path1); 25 | var relativePath = path.relative(dirname, path2); 26 | return getUnixPath(relativePath); 27 | } 28 | 29 | function getFilenameFromUrl (u) { 30 | return path.basename(url.parse(u).pathname); 31 | } 32 | 33 | function waitAllFulfilled(promises) { 34 | return Promise.all(promises.map(function(promise) { 35 | return promise.reflect(); 36 | })); 37 | } 38 | 39 | module.exports = { 40 | isUrl: isUrl, 41 | getUrl: getUrl, 42 | getUnixPath: getUnixPath, 43 | getRelativePath: getRelativePath, 44 | getFilenameFromUrl: getFilenameFromUrl, 45 | waitAllFulfilled: waitAllFulfilled 46 | }; 47 | -------------------------------------------------------------------------------- /node_modules/website-scraper/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "_args": [ 3 | [ 4 | "website-scraper", 5 | "/Users/daniellevin/Projects/Doc-Server" 6 | ], 7 | [ 8 | "website-scraper", 9 | "/Users/daniellevin/Projects/Doc-tor" 10 | ] 11 | ], 12 | "_from": "website-scraper@latest", 13 | "_id": "website-scraper@0.3.1", 14 | "_inCache": true, 15 | "_installable": true, 16 | "_location": "/website-scraper", 17 | "_nodeVersion": "0.10.25", 18 | "_npmUser": { 19 | "email": "sophia.nepochataya@gmail.com", 20 | "name": "s0ph1e" 21 | }, 22 | "_npmVersion": "2.12.1", 23 | "_phantomChildren": { 24 | "cheerio-select": "0.0.3", 25 | "domelementtype": "1.3.0" 26 | }, 27 | "_requested": { 28 | "name": null, 29 | "raw": "website-scraper", 30 | "rawSpec": "website-scraper", 31 | "scope": null, 32 
| "spec": "/Users/daniellevin/Projects/Doc-Server/website-scraper", 33 | "type": "directory" 34 | }, 35 | "_requiredBy": [ 36 | "#USER", 37 | "/" 38 | ], 39 | "_resolved": "https://registry.npmjs.org/website-scraper/-/website-scraper-0.3.1.tgz", 40 | "_shasum": "fcad9a05e2155655e2226334bd9d6fe5ef1c8276", 41 | "_shrinkwrap": null, 42 | "_spec": "website-scraper", 43 | "_where": "/Users/daniellevin/Projects/Doc-Server", 44 | "author": { 45 | "name": "s0ph1e" 46 | }, 47 | "bugs": { 48 | "url": "https://github.com/s0ph1e/node-website-scraper/issues" 49 | }, 50 | "dependencies": { 51 | "bluebird": "^3.0.1", 52 | "cheerio": "0.11.0", 53 | "compare-urls": "^1.0.0", 54 | "css-url-parser": "^0.1.0", 55 | "fs-extra": "^0.26.0", 56 | "request": "^2.42.0", 57 | "underscore": "^1.7.0" 58 | }, 59 | "description": "Download website to a local directory (including all css, images, js, etc.)", 60 | "devDependencies": { 61 | "codeclimate-test-reporter": "^0.1.0", 62 | "istanbul": "^0.4.0", 63 | "mocha": "^2.2.5", 64 | "nock": "^2.9.1", 65 | "proxyquire": "^1.7.3", 66 | "should": "^7.0.2", 67 | "sinon": "^1.15.4", 68 | "sinon-as-promised": "^4.0.0" 69 | }, 70 | "directories": {}, 71 | "dist": { 72 | "shasum": "fcad9a05e2155655e2226334bd9d6fe5ef1c8276", 73 | "tarball": "http://registry.npmjs.org/website-scraper/-/website-scraper-0.3.1.tgz" 74 | }, 75 | "gitHead": "f1983f79ced795563ee964868e0048b9fbf431b0", 76 | "homepage": "https://github.com/s0ph1e/node-website-scraper", 77 | "keywords": [ 78 | "css", 79 | "download", 80 | "html", 81 | "image", 82 | "js", 83 | "page", 84 | "scrape", 85 | "scraper", 86 | "site", 87 | "url", 88 | "web" 89 | ], 90 | "license": "MIT", 91 | "main": "index.js", 92 | "maintainers": [ 93 | { 94 | "name": "s0ph1e", 95 | "email": "sophia.nepochataya@gmail.com" 96 | } 97 | ], 98 | "name": "website-scraper", 99 | "optionalDependencies": {}, 100 | "readme": "ERROR: No README data found!", 101 | "repository": { 102 | "type": "git", 103 | "url": "git://github.com/s0ph1e/node-website-scraper.git" 104 | }, 105 | "scripts": { 106 | "test": "istanbul cover ./node_modules/mocha/bin/_mocha --dir ./coverage --report lcov -- -R spec --recursive ./test" 107 | }, 108 | "version": "0.3.1" 109 | } 110 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "doc-server", 3 | "version": "1.0.0", 4 | "description": "JS doc repository server and scraper", 5 | "main": "server/server.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1", 8 | "start": "nodemon --ignore docs/ --ignore temp/ --ignore zips/ server/server.js" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/DocWave/Doc-Server.git" 13 | }, 14 | "author": "Sporks, Dan Levin, Cruz Welborn, Lea Fox", 15 | "license": "MIT", 16 | "dependencies": { 17 | "archiver": "^0.21.0", 18 | "bluebird": "^3.3.1", 19 | "body-parser": "^1.14.2", 20 | "cheerio": "^0.20.0", 21 | "express": "^4.13.4", 22 | "mongoose": "^4.4.2", 23 | "nightmare": "^2.1.6", 24 | "path": "^0.12.7", 25 | "phantom": "^0.9.0", 26 | "request": "^2.69.0", 27 | "tar": "^2.2.1", 28 | "tar.gz": "^1.0.3", 29 | "vo": "^1.0.3", 30 | "website-scraper": "file:website-scraper" 31 | }, 32 | "devDependencies": { 33 | "morgan": "^1.7.0", 34 | "single-line-log": "^1.0.1" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /public/index.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
Our site homepage
9 | 10 | 11 | -------------------------------------------------------------------------------- /server/controllers/dbController.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const Update = require('./updateModel'); 3 | const path = require('path'); 4 | const fs = require('fs'); 5 | 6 | module.exports = { 7 | latestVer: function(req, res, next){ 8 | let query = Update.where({sourceName: req.scrapeProps.sourceName}); 9 | console.log(req.scrapeProps.sourceName); 10 | query.findOne({},{},{ sort: { 'createdAt' : -1 } }, function(err, foundVer){ 11 | if(err) console.log(err); 12 | try{ 13 | let fileStats = fs.statSync(path.resolve(foundVer.filePath)); 14 | //If we find that we have the same version, send the version we already have 15 | req.scrapeProps = foundVer.filePath; 16 | return res.sendFile(path.resolve(foundVer.filePath)); 17 | // next(); 18 | } 19 | //We didn't find the file in the directory, so proceed as usual 20 | catch(e){ 21 | next(); 22 | } 23 | // console.log(foundVer); 24 | // req.scrapeProps = foundVer.filePath; 25 | // return res.sendFile(path.resolve(foundVer.filePath)); //would send a second response after the catch branch calls next() 26 | }); 27 | }, 28 | needUpdate : function(req, res, next){ 29 | if(!req.needUpdate) 30 | req.needUpdate = {}; 31 | let query = Update.where({versionNo: req.scrapeProps.versionNo, 32 | sourceName: req.scrapeProps.sourceName}); 33 | query.findOne( function (err, foundUpdate){ 34 | //takes in an err from findOne and the returned Doc 35 | if(err) console.log(err); 36 | // console.log("finding"); 37 | if(!foundUpdate){ 38 | //no update found, so continue the middleware chain! 39 | console.log("\n\n\t\tNew version, updating\n\n"); 40 | next(); 41 | } 42 | 43 | else if ( foundUpdate ){ // if the Doc exists update 44 | //Also check if we have the file right now, just in case it got deleted 45 | try{ 46 | console.log("found "); 47 | 48 | let fileStats = fs.statSync(path.resolve(foundUpdate.filePath)); 49 | //If we find that we have the same version, send the version we already have 50 | //break out of the middleware! 
51 | // console.log("\n\n\t\tFile Found, sending local copy\n\n"); 52 | // return res.sendFile(path.resolve(foundUpdate.filePath)); 53 | next(); 54 | } 55 | //We didn't find the file in the directory, so proceed as usual 56 | catch(e){ 57 | console.log("File not found...."); 58 | console.log(foundUpdate.filePath); 59 | req.needUpdate[req.scrapeProps.sourceName.replace(/\s/g, "_")] = true; 60 | 61 | next(); 62 | } 63 | 64 | } 65 | }); 66 | }, 67 | addToDB : function(req, res, next){ 68 | //assigns a new Update document to the variable update 69 | let update = new Update ({sourceName : req.scrapeProps.sourceName, 70 | versionNo : req.scrapeProps.versionNo, 71 | filePath : req.scrapeProps.filePath, 72 | retrieved : Date.now(), 73 | createdAt : Date.now()}); 74 | 75 | //store our query in a variable 76 | //fileName = the name of documentation 77 | let query = Update.where({versionNo: req.scrapeProps.versionNo, 78 | sourceName: req.scrapeProps.sourceName}); 79 | // console.log(res.fileName, res.versionNo, res.filePath); 80 | //Checks database to see if doc already exists 81 | // runs callback found(err,foundUpdate) 82 | 83 | 84 | 85 | query.findOne( function (err, foundUpdate){ 86 | //takes in an err from findOne and the returned Doc 87 | if(err)console.log(err); 88 | 89 | if(!foundUpdate){ 90 | update.save( function(err, update){ 91 | if(err) { 92 | console.error(err); 93 | } 94 | else { 95 | console.log (`${req.scrapeProps.sourceName} - versionNo:${req.scrapeProps.versionNo} has been added to the database.`); 96 | next(); 97 | } 98 | }); 99 | } 100 | 101 | if ( foundUpdate ){ // if the Doc exists update 102 | //currently only updating the Date - can handle version numbers at a later date 103 | query.findOneAndUpdate( {retrieved: Date.now()}, function(err, newInfo){ 104 | if (err) console.log(err); 105 | else{ 106 | console.log("NewInfo ", newInfo); 107 | next(); 108 | } 109 | }); 110 | } 111 | }); 112 | } 113 | }; 114 | -------------------------------------------------------------------------------- /server/controllers/mdnJS.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const cheerio = require( 'cheerio' ); 3 | const request = require( 'request' ); 4 | const fs = require( 'fs' ); 5 | const targz = require( 'tar.gz' ); 6 | const zlib = require( 'zlib' ); 7 | const path = require( 'path' ); 8 | const tar = require( 'tar' ); 9 | const SQL = require( 'sql.js' ); 10 | const archiver = require( 'archiver' ); 11 | 12 | let mdn = { 13 | 14 | /* 15 | * This function goes to kapeli.com, grabs the JavaScript link, 16 | * then attaches it to the req obj 17 | */ 18 | 19 | download: function ( req, res, next ) { 20 | request( 'https://kapeli.com/mdn_offline', function ( err, html ) { 21 | if ( err ) console.log( err ); 22 | let $ = cheerio.load( html.body ); 23 | 24 | //Only use the link that contains the text 'JavaScript.tgz' 25 | let downloadLink = "https://kapeli.com/" + $( ".download:contains('JavaScript.tgz')" ) 26 | .attr( "href" ); 27 | req.JSdownloadLink = downloadLink; 28 | next(); 29 | } ); 30 | }, 31 | //downloads tar file from kapeli.com 32 | getJavascript: function ( req, res, next ) { 33 | //downloading 116 MB .tar to disk 34 | 35 | //Check if js file exists 36 | 37 | let write = fs.createWriteStream( './JavaScript.tgz' ); 38 | 39 | /////////////////////////////////////////////////////// 40 | // using the request stream as a ReadStream 41 | // 
NOTE: req.JSdownloadLink initialized in mdn.download 42 | ////////////////////////////////////////////////////// 43 | let read = request( req.JSdownloadLink ) 44 | .on( 'error', function ( err ) { 45 | throw err; 46 | } ) 47 | .pipe( write ); 48 | 49 | //just to log bytes written - not necessary 50 | let watcher = fs.watch( './JavaScript.tgz' ) 51 | .on( 'change', function () { 52 | let bytes=(read.bytesWritten/1000000).toFixed(2); 53 | require('single-line-log').stdout('JS: ',bytes +' MB'); 54 | }); 55 | //close readStream and watcher 56 | read.on( 'finish', function () { 57 | read.close( function(){ 58 | watcher.close(); 59 | next(); 60 | }); 61 | } ); 62 | }, 63 | extract: function ( req, res, next ) { 64 | console.log( 'extracting...' ); 65 | let inflate = zlib.Unzip(); 66 | let extractor = tar.Extract( { 67 | path: './docs' 68 | } ) 69 | .on( 'error', function ( err ) { 70 | throw err; 71 | } ) 72 | .on( 'end', function () { 73 | console.log( 'extracted' ); 74 | } ); 75 | let extracting = fs.createReadStream( './JavaScript.tgz' ) 76 | .on( 'error', function ( err ) { 77 | throw err; 78 | } ) 79 | .pipe( inflate ) 80 | .pipe( extractor ); 81 | extracting.on( 'finish', function () { 82 | next(); 83 | } ); 84 | }, 85 | createClassObj: function ( req, res, next ) { 86 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/API/'; 87 | let classObj = {}; 88 | 89 | fs.readdir( './docs/' + base, function ( err, files ) { 90 | if ( err ) console.log( err ); 91 | files = files.filter( elem => { 92 | return elem.includes( '.html' ); 93 | } ); 94 | for ( let k of files ) { 95 | classObj[ k.replace( '.html', "" ) ] = base + k; 96 | } 97 | req.classObj = classObj; 98 | next(); 99 | } ); 100 | }, 101 | createMethodsObj: function ( req, res, next ) { 102 | function getDirectories( srcpath ) { 103 | return fs.readdirSync( srcpath ) 104 | .filter( function ( file ) { 105 | return fs.statSync( path.join( srcpath, file ) ) 106 | .isDirectory(); 107 | } ); 108 | } 109 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/API/'; 110 | let methodObj = {}; 111 | 112 | let directories = getDirectories( './docs/' + base ); 113 | directories.forEach( elem => { 114 | fs.readdir( `docs/${base}/${elem}`, function ( err, files ) { 115 | files.forEach( fileElem => { 116 | let key = `${elem}.${fileElem}`; 117 | methodObj[ key.replace( ".html", "" ) ] = `${base}/${elem}/${fileElem}`; 118 | } ); 119 | req.methodObj = methodObj; 120 | } ); 121 | } ); 122 | next(); 123 | }, 124 | createEventObj: function ( req, res, next ) { 125 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/Events/'; 126 | let eventsObj = {}; 127 | 128 | fs.readdir( './docs/' + base, function ( err, files ) { 129 | if ( err ) console.log( err ); 130 | files = files.filter( elem => { 131 | return elem.includes( '.html' ); 132 | } ); 133 | for ( let k of files ) { 134 | eventsObj[ k.replace( '.html', "" ) ] = base + k; 135 | } 136 | req.eventsObj = eventsObj; 137 | next(); 138 | } ); 139 | }, 140 | createKWObj: function ( req, res, next ) { 141 | let base1 = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/'; 142 | let base2 = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/'; 143 | let KWObj = {}; 144 | fs.readdir( './docs/' + base1, function ( err, files ) { 145 | if ( err ) console.log( err ); 146 | files = files.filter( elem => { 147 | return elem.includes( '.html' ); 148 | } ); 149 | for ( let k of files ) { 150 | KWObj[ k.replace( '.html', "" ) ] = base1 + k; 
151 | } 152 | } ); 153 | fs.readdir( './docs/' + base2, function ( err, files ) { 154 | if ( err ) console.log( err ); 155 | files = files.filter( elem => { 156 | return elem.includes( '.html' ); 157 | } ); 158 | for ( let k of files ) { 159 | KWObj[ k.replace( '.html', "" ) ] = base2 + k; 160 | } 161 | req.KWObj = KWObj; 162 | next(); 163 | } ); 164 | }, 165 | createFuncObj: function ( req, res, next ) { 166 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/'; 167 | let funcObj = {}; 168 | 169 | fs.readdir( './docs/' + base, function ( err, files ) { 170 | if ( err ) console.log( err ); 171 | files = files.filter( elem => { 172 | return elem.includes( '.html' ); 173 | } ); 174 | for ( let k of files ) { 175 | funcObj[ k.replace( '.html', "" ) ] = base + k; 176 | } 177 | req.funcObj = funcObj; 178 | next(); 179 | } ); 180 | }, 181 | sqlFile: function ( req, res, next ) { 182 | let i = 0; 183 | let objects = { 184 | function: req.funcObj, 185 | key_word: req.KWObj, 186 | events: req.eventsObj, 187 | methods: req.methodObj, 188 | class: req.classObj 189 | }; 190 | 191 | let db = new SQL.Database(); 192 | db.run( "CREATE TABLE docsearch (ID int, NAME char, TYPE char, LINK char);" ); 193 | 194 | for ( let k in objects ) { 195 | console.log( k ); 196 | for ( let j in objects[ k ] ) { 197 | db.run( "INSERT INTO docsearch VALUES (:ID, :NAME, :TYPE, :LINK)", { 198 | ':ID': i++, 199 | ':NAME': j, 200 | ':TYPE': k, 201 | ':LINK': objects[ k ][ j ] 202 | } ); 203 | } 204 | } 205 | let data = db.export(); 206 | let buffer = new Buffer( data ); 207 | 208 | fs.writeFileSync( "docs/mdn_javascript.sqlite", buffer ); 209 | 210 | next(); 211 | }, 212 | zip: function ( req, res, next ) { 213 | console.log('zipping'); 214 | let output = fs.createWriteStream( 'zips/mdn/javascript/mdn_javascript.zip'); 215 | let archive = archiver('zip'); 216 | 217 | output.on('close', function() { 218 | fs.unlink('./JavaScript.tgz', (err) => { 219 | if(err) console.log(err); 220 | console.log(archive.pointer() + ' total bytes'); 221 | console.log('archiver has been finalized and the output file descriptor has closed.'); 222 | } ); 223 | }); 224 | 225 | archive.on('error', function(err) { 226 | throw err; 227 | }); 228 | 229 | archive.pipe(output); 230 | 231 | archive.bulk([ 232 | { expand: true, cwd: 'docs/', src: ['**'], dest:'mdn_javascript.docs' } 233 | ]); 234 | 235 | archive.finalize(); 236 | next(); 237 | } 238 | }; 239 | 240 | 241 | module.exports = mdn; 242 | -------------------------------------------------------------------------------- /server/controllers/updateModel.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | var mongoose = require('mongoose'); 3 | var Schema = mongoose.Schema; 4 | 5 | var updateSchema = new Schema({ 6 | sourceName: String, 7 | versionNo: String, 8 | filePath: String, 9 | retrieved: { type: Date, default: Date.now }, 10 | createdAt: {type: Date, default: Date.now} 11 | }); 12 | 13 | module.exports = mongoose.model('Update', updateSchema); 14 | -------------------------------------------------------------------------------- /server/middleware/folderHandler.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'); 2 | 3 | var folderHandler = { 4 | checkOrCreateFolder: function(path){ 5 | if(this.checkFolders(path)){ 6 | console.log("Folder exists for zip file, continue"); 7 | } 8 | else{ 9 | fs.mkdir(path, err => { 10 | if(err){ 
console.error(err) }; 11 | console.log("Zip folder does not exist, creating"); 12 | }) 13 | } 14 | }, 15 | checkToDelete: function(path){ 16 | // if the directory exists, delete it 17 | if(this.checkFolders(path)){ 18 | //We need to delete the directory 19 | console.log("Temp folder exists, deleting") 20 | this.deleteFolderRecursive(path); 21 | } 22 | else{ 23 | console.log("Temp folder does not exist, continuing"); 24 | } 25 | }, 26 | //Generic function to check if folder exists 27 | checkFolders: function(path){ 28 | var that = this; 29 | // Use try, if dir does not exist, it will throw an error 30 | try{ 31 | var stats = fs.statSync(path) 32 | if(stats.isDirectory()){ 33 | return true; 34 | } 35 | } 36 | catch(err){ 37 | if(err){ 38 | // console.log(err, 'Folder does not exist'); 39 | return false 40 | } 41 | } 42 | }, 43 | //Recursively delete folders. Should this be made async? 44 | deleteFolderRecursive: function(path) { 45 | var that = this; 46 | if( fs.existsSync(path) ) { 47 | fs.readdirSync(path).forEach(function(file,index){ 48 | var curPath = path + "/" + file; 49 | if(fs.lstatSync(curPath).isDirectory()) { // recurse 50 | that.deleteFolderRecursive(curPath); 51 | } else { // delete file 52 | fs.unlinkSync(curPath); 53 | } 54 | }); 55 | fs.rmdirSync(path); 56 | } 57 | } 58 | } 59 | 60 | module.exports = folderHandler; 61 | -------------------------------------------------------------------------------- /server/middleware/mdnCSS.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | const cheerio = require( 'cheerio' ); 3 | const request = require( 'request' ); 4 | const fs = require( 'fs' ); 5 | const targz = require( 'tar.gz' ); 6 | const zlib = require( 'zlib' ); 7 | const path = require( 'path' ); 8 | const tar = require( 'tar' ); 9 | const archiver = require( 'archiver' ); 10 | const folderHandler = require('./folderHandler'); 11 | 12 | 13 | let mdnCSS = { 14 | /* 15 | * This function goes to kapeli.com, grabs the CSS link, 16 | * then attaches it to the req obj 17 | */ 18 | download: function ( req, res, next ) { 19 | request( 'https://kapeli.com/mdn_offline', function ( err, html ) { 20 | if ( err ) console.log( err ); 21 | let $ = cheerio.load( html.body ); 22 | 23 | //Only use the link that contains the text 'CSS.tgz' 24 | let CSSdownloadLink = "https://kapeli.com/" + $( ".download:contains('CSS.tgz')" ) 25 | .attr( "href" ); 26 | req.CSSdownloadLink = CSSdownloadLink; 27 | next(); 28 | } ); 29 | }, 30 | //downloads tar file from kapeli.com 31 | getCSS: function ( req, res, next ) { 32 | //NOTE:downloading 22 MB .tar to disk 33 | 34 | let write = fs.createWriteStream( './temp/CSS.tgz' ); 35 | 36 | /////////////////////////////////////////////////////// 37 | // using the request stream as a ReadStream 38 | // NOTE: req.CSSdownloadLink initialized in mdn.download 39 | ////////////////////////////////////////////////////// 40 | let read = request( req.CSSdownloadLink ) 41 | .on( 'error', function ( err ) { 42 | throw err; 43 | } ) 44 | .pipe( write ); 45 | 46 | //just to log bytes written - not necessary 47 | let watcher = fs.watch( './temp/CSS.tgz' ) 48 | .on( 'change', function () { 49 | let bytes=(read.bytesWritten/1000000).toFixed(2); 50 | require('single-line-log').stdout('CSS: ',bytes +' MB'); 51 | }); 52 | //close readStream and watcher 53 | read.on( 'finish', function () { 54 | read.close( function(){ 55 | watcher.close(); 56 | next(); 57 | }); 58 | } ); 59 | }, 60 | extract: function ( req, res, next ) { 
61 | console.log( 'extracting...' ); 62 | let inflate = zlib.Unzip(); 63 | let extractor = tar.Extract( { 64 | path: './docs/mdn/css/documents' 65 | } ) 66 | .on( 'error', function ( err ) { 67 | throw err; 68 | } ) 69 | .on( 'end', function () { 70 | console.log( 'extracted' ); 71 | next(); 72 | } ); 73 | let extracting = fs.createReadStream( './temp/CSS.tgz' ) 74 | .on( 'error', function ( err ) { 75 | throw err; 76 | } ) 77 | .pipe( inflate ) 78 | .pipe( extractor ); 79 | extracting.on( 'finish', function () { 80 | // next(); 81 | } ); 82 | }, 83 | getObjs: function(req, res, next){ 84 | let base = 'CSS/developer.mozilla.org/en-US/docs/Web/CSS/'; 85 | let $ = cheerio.load(fs.readFileSync('./docs/mdn/css/documents/CSS/developer.mozilla.org/en-US/docs/Web/CSS/Reference.html')); 86 | let classObj = {}; 87 | let elemObj = {}; 88 | let funcObj = {}; 89 | let typesObj = {}; 90 | let propObj = {}; 91 | let guideObj = {}; 92 | $('div .index a').each((i, el) => { 93 | let text = $(el).text(); 94 | let link = $(el).attr('href'); 95 | let classReg = new RegExp (/^:[^:].+/g ); 96 | let elemReg = new RegExp (/^::/g ); 97 | let funcReg = new RegExp (/^@|\(\)$/g ); 98 | let typeReg = new RegExp (/^</g ); 99 | if(classReg.test(text)){ 100 | classObj[text] = base + link; 101 | } 102 | else if(elemReg.test(text)){ 103 | elemObj[text] = base + link; 104 | } 105 | else if(funcReg.test(text)){ 106 | funcObj[text] = base + link; 107 | } 108 | else if(typeReg.test(text)){ 109 | typesObj[text] = base + link; 110 | } 111 | else propObj[text] = base + link; 112 | }); 113 | //guide links are listed in their own index section 114 | $('#Guides ~ div .index a').each((i, el) => { 115 | guideObj[$(el).text()] = base + $(el).attr('href'); 116 | }); 117 | req.classObj = classObj; 118 | req.elemObj = elemObj; 119 | req.funcObj = funcObj; 120 | req.typesObj = typesObj; 121 | req.propObj = propObj; 122 | req.guideObj = guideObj; 123 | next(); 124 | }, 125 | getMoz : function(req, res, next){ 126 | let base = 'CSS/developer.mozilla.org/en-US/docs/Web/CSS/'; 127 | let $ = cheerio.load(fs.readFileSync('./docs/mdn/css/documents/CSS/developer.mozilla.org/en-US/docs/Web/CSS/Mozilla_Extensions.html')); 128 | 129 | $('div .index a').each((i, el) => { 130 | let text = $(el).text(); 131 | let link = $(el).attr('href'); 132 | let classReg = new RegExp (/^:[^:].+/g ); 133 | let elemReg = new RegExp (/^::/g ); 134 | if(classReg.test(text)){ 135 | req.classObj[text] = base + link; 136 | } 137 | if(elemReg.test(text)){ 138 | req.elemObj[text] = base + link; 139 | } 140 | }); 141 | next(); 142 | }, 143 | sqlFile: function ( req, res, next ) { 144 | let i = 0; 145 | let jsonIndex = {"sourceName": req.scrapeProps.sourceName, 146 | "versionNo": req.scrapeProps.versionNo, "result": []}; 147 | let objects = { 148 | Classes:req.classObj , 149 | Elements:req.elemObj, 150 | Functions:req.funcObj , 151 | Types:req.typesObj , 152 | Properties:req.propObj, 153 | Guides:req.guideObj 154 | }; 155 | req.classObj = null; 156 | req.elemObj = null; 157 | req.funcObj = null; 158 | req.typesObj = null; 159 | req.propObj = null; 160 | req.guideObj = null; 161 | for ( let k in objects ) { 162 | // console.log( k ); 163 | for ( let j in objects[ k ] ) { 164 | jsonIndex.result.push({"NAME": j, "TYPE": k, "LINK": objects[k][j]}); 165 | } 166 | } 167 | jsonIndex = JSON.stringify(jsonIndex); 168 | fs.writeFileSync( "docs/mdn/css/index.json", jsonIndex ); 169 | //Null out jsonIndex 170 | jsonIndex = null; 171 | next(); 172 | }, 173 | zip: function ( req, res, next ) { 174 | let output = fs.createWriteStream( 'zips/mdn/mdn_css'+req.scrapeProps.versionNo+'.zip'); 175 | let archive = archiver('zip'); 176 | req.scrapeProps.filePath = './zips/mdn/mdn_css'+req.scrapeProps.versionNo+'.zip'; 177 | output.on('close', function() { 178 | fs.unlink('./temp/CSS.tgz', (err) => { 179 | //Null out jsonindex and req stuff 180 | req.classObj = null; 181 | req.elemObj = null; 182 | req.funcObj = null; 183 | req.typesObj = null; 184 | req.propObj = 
null; 185 | req.guideObj = null; 186 | console.log(archive.pointer() + ' total bytes'); 187 | folderHandler.deleteFolderRecursive(req.scrapeProps.baseDir); 188 | console.log('archiver has been finalized and the output file descriptor has closed.'); 189 | next(); 190 | }); 191 | }); 192 | 193 | archive.on('error', function(err) { 194 | throw err; 195 | }); 196 | 197 | archive.pipe(output); 198 | 199 | archive.bulk([ 200 | { expand: true, cwd: 'docs/mdn/css/', src: ['**'], dest:'mdn_css.docs' } 201 | ]); 202 | 203 | archive.finalize(); 204 | } 205 | }; 206 | 207 | 208 | module.exports = mdnCSS; 209 | -------------------------------------------------------------------------------- /server/middleware/mdnHTML.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | const cheerio = require( 'cheerio' ); 3 | const request = require( 'request' ); 4 | const fs = require( 'fs' ); 5 | const targz = require( 'tar.gz' ); 6 | const zlib = require( 'zlib' ); 7 | const path = require( 'path' ); 8 | const tar = require( 'tar' ); 9 | const archiver = require( 'archiver' ); 10 | const folderHandler = require('./folderHandler'); 11 | 12 | let mdnHTML = { 13 | /* 14 | * This function goes to kapeli.com, grabs the HTML link, 15 | * then attaches it to the req obj 16 | */ 17 | download: function ( req, res, next ) { 18 | request( 'https://kapeli.com/mdn_offline', function ( err, html ) { 19 | if ( err ) console.log( err ); 20 | let $ = cheerio.load( html.body ); 21 | 22 | //Only use the link that contains the text 'HTML.tgz' 23 | let HTMLdownloadLink = "https://kapeli.com/" + $( ".download:contains('HTML.tgz')" ) 24 | .attr( "href" ); 25 | req.HTMLdownloadLink = HTMLdownloadLink; 26 | next(); 27 | } ); 28 | }, 29 | //downloads tar file from kapeli.com 30 | getHTML: function ( req, res, next ) { 31 | //NOTE:downloading 24 MB .tar to disk 32 | try { 33 | fs.mkdirSync('./temp'); 34 | } catch (e) { 35 | console.log('./temp already exists'); 36 | } 37 | 38 | let write = fs.createWriteStream( './temp/HTML.tgz' ); 39 | 40 | /////////////////////////////////////////////////////// 41 | // using the request stream as a ReadStream 42 | // NOTE: req.downloadLink initialized in mdn.download 43 | ////////////////////////////////////////////////////// 44 | let read = request( req.HTMLdownloadLink ) 45 | .on( 'error', function ( err ) { 46 | throw err; 47 | } ) 48 | .pipe( write ); 49 | 50 | //just to log bytes written - not necessary 51 | let watcher = fs.watch( './temp/HTML.tgz' ) 52 | .on( 'change', function () { 53 | let bytes=(read.bytesWritten/1000000).toFixed(2); 54 | require('single-line-log').stdout('HTML: ',bytes +' MB'); 55 | }); 56 | //close readStream and watcher 57 | read.on( 'finish', function () { 58 | read.close( function(){ 59 | watcher.close(); 60 | next(); 61 | }); 62 | } ); 63 | }, 64 | extract: function ( req, res, next ) { 65 | console.log( 'extracting...' 
); 66 | let inflate = zlib.Unzip(); 67 | let extractor = tar.Extract( { 68 | path: './docs/mdn/html/documents/' 69 | } ) 70 | .on( 'error', function ( err ) { 71 | console.log(err); 72 | } ) 73 | .on( 'end', function () { 74 | console.log( 'extracted' ); 75 | next(); 76 | } ); 77 | let extracting = fs.createReadStream( './temp/HTML.tgz' ) 78 | .on( 'error', function ( err ) { 79 | console.log(err); 80 | } ) 81 | .pipe( inflate ) 82 | .pipe( extractor ); 83 | extracting.on( 'finish', function () { 84 | // next(); 85 | } ); 86 | }, 87 | getElements: function ( req, res, next ) { 88 | let base = 'HTML/developer.mozilla.org/en-US/docs/Web/HTML/Element', 89 | attrObj = {}, 90 | elemObj = {}; 91 | 92 | fs.readdir( './docs/mdn/html/documents/' + base, function ( err, files ) { 93 | if ( err ) console.log( err ); 94 | files = files.filter( elem => { 95 | return elem.includes( '.html' ) && !elem.includes( '.dashtoc' ); 96 | } ); 97 | for ( let file of files ) { 98 | let nameOfElem = file.replace( '.html', "" ), 99 | attrLinks = [], 100 | attrIds; 101 | 102 | let $ = cheerio.load( fs.readFileSync( `./docs/mdn/html/documents/${base}/${file}` ) ); 103 | 104 | $( "a[name*='attr-']" ).each( (i , el) => { 105 | if($(el).attr('name')){ 106 | attrIds = $( el ).attr('name').replace(/attr-/g, ""); 107 | $(el).attr('id', attrIds); 108 | console.log($(el).attr('id')); 109 | attrObj[`${nameOfElem}.${attrIds}`] = `${base}/${file}#${attrIds}`; 110 | } 111 | }); 112 | var html = $.html(); 113 | fs.writeFileSync( `./docs/mdn/html/documents/${base}/${file}`, html); 114 | // console.log(attrObj); 115 | elemObj[ nameOfElem ] = base + '/' + file; //base has no trailing slash 116 | } 117 | 118 | req.elemObj = elemObj; 119 | req.attrObj = attrObj; 120 | next(); 121 | } ); 122 | }, 123 | sqlFile: function ( req, res, next ) { 124 | let i = 0; 125 | // let db = new SQL.Database(); 126 | // db.run( "CREATE TABLE docsearch (ID int, NAME char, TYPE char, LINK char);" ); 127 | let jsonIndex = {"sourceName": req.scrapeProps.sourceName, 128 | "versionNo": req.scrapeProps.versionNo, "result": []}; 129 | for ( let elemName in req.elemObj ) { 130 | jsonIndex.result.push({"NAME": elemName, "TYPE": "element", "LINK": req.elemObj[elemName]}); 131 | // ':ID': i++, 132 | // ':NAME': elemName, 133 | // ':TYPE': "element", 134 | // ':LINK': req.elemObj[ elemName ] 135 | } 136 | for ( let attrName in req.attrObj ) { 137 | jsonIndex.result.push({"NAME": attrName, "TYPE": "attribute", "LINK": req.attrObj[attrName]}); 138 | // ':ID': i++, 139 | // ':NAME': attrName, 140 | // ':TYPE': "attribute", 141 | // ':LINK': req.attrObj[attrName] 142 | } 143 | // let data = db.export(); 144 | jsonIndex = JSON.stringify(jsonIndex); 145 | fs.writeFileSync( "./docs/mdn/html/index.json", jsonIndex ); 146 | //Null out jsonIndex 147 | jsonIndex = null; 148 | next(); 149 | }, 150 | 151 | zip: function ( req, res, next ) { 152 | let output = fs.createWriteStream( './zips/mdn/mdn_html'+req.scrapeProps.versionNo+'.zip'); 153 | let archive = archiver('zip'); 154 | req.scrapeProps.filePath = './zips/mdn/mdn_html'+req.scrapeProps.versionNo+'.zip'; 155 | 156 | output.on('close', function() { 157 | fs.unlink('./temp/HTML.tgz', (err) => { 158 | if(err) console.log(err); 159 | req.elemObj = null; 160 | req.attrObj = null; 161 | console.log(archive.pointer() + ' total bytes'); 162 | folderHandler.deleteFolderRecursive(req.scrapeProps.baseDir); 163 | console.log('archiver has been finalized and the output file descriptor has closed.'); 164 | next(); 165 | }); 166 | }); 167 | 168 | archive.on('error', 
function(err) { 169 | throw err; 170 | }); 171 | 172 | archive.pipe(output); 173 | 174 | archive.bulk([ 175 | { expand: true, cwd: 'docs/mdn/html', src: ['**'], dest:'mdn_html.docs' } 176 | ]); 177 | 178 | archive.finalize(); 179 | } 180 | }; 181 | 182 | 183 | module.exports = mdnHTML; 184 | 
-------------------------------------------------------------------------------- /server/middleware/mdnJS.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | const cheerio = require( 'cheerio' ); 3 | const request = require( 'request' ); 4 | const fs = require( 'fs' ); 5 | const zlib = require( 'zlib' ); 6 | const path = require( 'path' ); 7 | const tar = require( 'tar' ); 8 | const archiver = require( 'archiver' ); 9 | const folderHandler = require('./folderHandler'); 10 | 11 | let mdnJS = { 12 | 13 | /* 14 | * This function goes to kapeli.com, grabs the JavaScript link, 15 | * then attaches it to the req obj 16 | */ 17 | 18 | download: function ( req, res, next ) { 19 | request( 'https://kapeli.com/mdn_offline', function ( err, html ) { 20 | if ( err ) console.log( err ); 21 | let $ = cheerio.load( html.body ); 22 | var d = new Date(); 23 | console.log("requesting ", d.getMinutes(), ":", d.getSeconds()); 24 | //Only use the link that contains the text 'JavaScript.tgz' 25 | let downloadLink = "https://kapeli.com/" + $( ".download:contains('JavaScript.tgz')" ) 26 | .attr( "href" ); 27 | req.downloadLink = downloadLink; 28 | // req.downloadLink = 'http://localhost:8080/js2'; //local test stub - leave commented out so the real link is used 29 | next(); 30 | } ); 31 | }, 32 | //downloads tar file from kapeli.com 33 | getJavascript: function ( req, res, next ) { 34 | //downloading 116 MB .tar to disk 35 | 36 | //Check if js file exists 37 | 38 | let write = fs.createWriteStream( './temp/JavaScript.tgz' ); 39 | var d = new Date(); 40 | console.log("Downloading ", d.getMinutes(), ":", d.getSeconds()); 41 | /////////////////////////////////////////////////////// 42 | // using the request stream as a ReadStream 43 | // NOTE: req.downloadLink initialized in mdn.download 44 | ////////////////////////////////////////////////////// 45 | let read = request( req.downloadLink ) 46 | .on( 'error', function ( err ) { 47 | throw err; 48 | } ) 49 | .pipe( write ); 50 | 51 | //just to log bytes written - not necessary 52 | // let watcher = fs.watch( './temp/JavaScript.tgz' ) 53 | // .on( 'change', function () { 54 | // let bytes=(read.bytesWritten/1000000).toFixed(2); 55 | // // console.log( bytes +' MB'); 56 | // require('single-line-log').stdout(bytes +' MB') 57 | // } ); 58 | //close readStream and watcher 59 | read.on( 'finish', function () { 60 | read.close( function(){ 61 | d = new Date(); console.log("done ", d.getMinutes(), ":", d.getSeconds()); //refresh d so we log the finish time, not the start time 62 | // watcher.close(); 63 | // res.send("DONE") 64 | next(); 65 | }); 66 | } ); 67 | }, 68 | extract: function ( req, res, next ) { 69 | console.log( 'extracting...'
); 70 | var d = new Date(); 71 | console.log("extracting ", d.getMinutes(), ":", d.getSeconds()); 72 | let inflate = zlib.Unzip(); 73 | let extractor = tar.Extract( { 74 | path: './docs/mdn/javascript/documents' 75 | } ) 76 | .on( 'error', function ( err ) { 77 | throw err; 78 | } ) 79 | .on( 'end', function () { 80 | console.log( 'extracted' ); 81 | next(); 82 | } ); 83 | let extracting = fs.createReadStream( './temp/JavaScript.tgz' ) 84 | .on( 'error', function ( err ) { 85 | throw err; 86 | } ) 87 | .pipe( inflate ) 88 | .pipe( extractor ); 89 | extracting.on( 'finish', function () { 90 | // next(); 91 | } ); 92 | }, 93 | createClassObj: function ( req, res, next ) { 94 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/API/'; 95 | let classObj = {}; 96 | var d = new Date(); 97 | console.log(d.getMinutes(), d.getSeconds()); 98 | fs.readdir( './docs/mdn/javascript/documents/' + base, function ( err, files ) { 99 | if ( err ) console.log( err ); 100 | // console.log(files); 101 | files = files.filter( elem => { 102 | return elem.includes( '.html' ); 103 | } ); 104 | for ( let k of files ) { 105 | classObj[ k.replace( '.html', "" ) ] = base + k; 106 | } 107 | req.classObj = classObj; 108 | next(); 109 | } ); 110 | }, 111 | createMethodsObj: function ( req, res, next ) { 112 | function getDirectories( srcpath ) { 113 | return fs.readdirSync( srcpath ) 114 | .filter( function ( file ) { 115 | return fs.statSync( path.join( srcpath, file ) ) 116 | .isDirectory(); 117 | } ); 118 | } 119 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/API'; 120 | let methodObj = {}; 121 | 122 | let directories = getDirectories( './docs/mdn/javascript/documents/' + base ); let pending = directories.length; //count the async readdir calls below 123 | 124 | directories.forEach( elem => { 125 | fs.readdir( `docs/mdn/javascript/documents/${base}/${elem}`, function ( err, files ) { 126 | // console.log(files, err) 127 | files.forEach( fileElem => { 128 | let key = `${elem}.${fileElem}`; 129 | methodObj[ key.replace( ".html", "" ) ] = `${base}/${elem}/${fileElem}`; 130 | } ); 131 | req.methodObj = methodObj; if ( --pending === 0 ) next(); //readdir is async, so only continue once the last directory is done 132 | } ); 133 | } ); 134 | //next() now fires above, inside the final readdir callback - calling it here was a race 135 | }, 136 | createEventObj: function ( req, res, next ) { 137 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/Events/'; 138 | let eventsObj = {}; 139 | 140 | fs.readdir( './docs/mdn/javascript/documents/' + base, function ( err, files ) { 141 | if ( err ) console.log( err ); 142 | files = files.filter( elem => { 143 | return elem.includes( '.html' ); 144 | } ); 145 | for ( let k of files ) { 146 | eventsObj[ k.replace( '.html', "" ) ] = base + k; 147 | } 148 | req.eventsObj = eventsObj; 149 | next(); 150 | } ); 151 | }, 152 | createKWObj: function ( req, res, next ) { 153 | let base1 = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/'; 154 | let base2 = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/'; 155 | let KWObj = {}; let pending = 2; //both readdir calls below are async 156 | fs.readdir( './docs/mdn/javascript/documents/' + base1, function ( err, files ) { 157 | if ( err ) console.log( err ); 158 | files = files.filter( elem => { 159 | return elem.includes( '.html' ); 160 | } ); 161 | for ( let k of files ) { 162 | KWObj[ k.replace( '.html', "" ) ] = base1 + k; 163 | } 164 | if ( --pending === 0 ) { req.KWObj = KWObj; next(); } } ); 165 | fs.readdir( './docs/mdn/javascript/documents/' + base2, function ( err, files ) { 166 | if ( err ) console.log( err ); 167 | files = files.filter( elem => { 168 | return elem.includes( '.html' ); 169 | } ); 170 | for ( let k of files ) { 171 | KWObj[ k.replace( '.html', "" ) ] = base2 + k; 172 | } 173 | if ( --pending === 0 ) { req.KWObj = KWObj; //whichever readdir finishes last moves the chain along 174 | next(); } 175 | } ); 176 | }, 177 | createFuncObj: function ( req, res, next ) { 178 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/'; 179 | let funcObj = {}; 180 | 181 | fs.readdir( './docs/mdn/javascript/documents/' + base, function ( err, files ) { 182 | if ( err ) console.log( err ); 183 | files = files.filter( elem => { 184 | return elem.includes( '.html' ); 185 | } ); 186 | for ( let k of files ) { 187 | funcObj[ k.replace( '.html', "" ) ] = base + k; 188 | } 189 | req.funcObj = funcObj; 190 | next(); 191 | } ); 192 | }, 193 | sqlFile: function ( req, res, next ) { 194 | var d = new Date(); 195 | let i = 0; 196 | let objects = { 197 | function: req.funcObj, 198 | key_word: req.KWObj, 199 | events: req.eventsObj, 200 | methods: req.methodObj, 201 | class: req.classObj 202 | }; 203 | console.log(d.getMinutes(), d.getSeconds()); 204 | 205 | let jsonIndex = {"sourceName": req.scrapeProps.sourceName, 206 | "versionNo": req.scrapeProps.versionNo, "result": []}; 207 | for ( let k in objects ) { 208 | // console.log( k ); 209 | for ( let j in objects[ k ] ) { 210 | jsonIndex.result.push({"NAME": j, "TYPE": k, "LINK": objects[k][j]}); 211 | } 212 | } 213 | jsonIndex = JSON.stringify(jsonIndex); 214 | fs.writeFileSync( "docs/mdn/javascript/index.json", jsonIndex ); 215 | //Null out jsonIndex 216 | jsonIndex = null; 217 | next(); 218 | }, 219 | zip: function ( req, res, next ) { 220 | console.log('zipping'); 221 | let output = fs.createWriteStream( './zips/mdn/mdn_javascript'+req.scrapeProps.versionNo+'.zip'); 222 | //Add to req 223 | req.scrapeProps.filePath = './zips/mdn/mdn_javascript'+req.scrapeProps.versionNo+'.zip'; 224 | let archive = archiver('zip'); 225 | var d = new Date(); 226 | console.log(d.getMinutes(), d.getSeconds()); 227 | 228 | output.on('close', function() { 229 | fs.unlink('./temp/JavaScript.tgz', (err) => { 230 | if(err) console.log(err); 231 | d = new Date(); 232 | req.funcObj = null; 233 | req.KWObj = null; 234 | req.eventsObj = null; 235 | req.methodObj = null; 236 | req.classObj = null; 237 | console.log(d.getMinutes(), d.getSeconds()); 238 | console.log(archive.pointer() + ' total bytes'); 239 | console.log('archiver has been finalized and the output file descriptor has closed.'); 240 | folderHandler.deleteFolderRecursive(req.scrapeProps.baseDir); 241 | next(); 242 | 243 | } ) 244 | }); 245 | 246 | archive.on('error', function(err) { 247 | throw err; 248 | }); 249 | 250 | archive.pipe(output); 251 | 252 | archive.bulk([ 253 | { expand: true, cwd: 'docs/mdn/javascript', src: ['**'], dest:'mdn_javascript.docs' } 254 | ]); 255 | 256 | archive.finalize(); 257 | } 258 | }; 259 | 260 | 261 | module.exports = mdnJS; 262 | 
-------------------------------------------------------------------------------- /server/middleware/nodeparser_working.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'); 2 | var sql = require('sql.js'); 3 | var cheerio = require('cheerio'); 4 | 5 | module.exports = function parser(file, db, i) { 6 | // var db = new sql.Database(); 7 | //initialize sql query 8 | //move outside of function?
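// NOTE: name and link get interpolated straight into the INSERT strings below, so a
// heading containing a single quote would break the statement. SQLite escapes quotes
// by doubling them, so a minimal guard (sketch only - sqlEscape is not defined anywhere
// in this repo) could be:
//   function sqlEscape(s) { return s.replace(/'/g, "''"); }
// and the templates would then interpolate sqlEscape(name) instead of name.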
9 | var sqlstr = ""; 10 | // var sqlstr = "CREATE TABLE docsearch (ID int, NAME char, TYPE char, LINK char);"; 11 | // console.log(sqlstr.length) 12 | var filename = file.slice(file.lastIndexOf('/')+1); 13 | var data = fs.readFileSync(file, 'utf-8'); 14 | var $ = cheerio.load(data); 15 | var methods = []; 16 | //Keep track of the index independently for the sake of the sql database 17 | //Go through all h3 and h2 elements to get methods, props and events 18 | //Pass in a size so you don't check the previous h2 for a class and instead insert the module 19 | function firstPass(ind, el, size){ 20 | var name = $(el).parent().parent().text(); 21 | //Add href of link to filename 22 | var link = $(el).attr('href'); 23 | //Match methods (they look like X.string(blah)# ) 24 | if(name.match(/\w+\(\w*\)\#$/g)){ 25 | name = name.replace(/\(.*\)\#/g, ""); 26 | //Handle Class Methods 27 | if(name.match(/^Class\sMethod:\s/)){ 28 | name = name.replace(/^Class\sMethod:\s/, ""); 29 | } 30 | sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'method', '${filename.concat(link)}');`; 31 | //Push into methods for determining if it's an addon page or not 32 | i++; 33 | methods.push($(el).attr('href')); 34 | } 35 | //Properties are similar to method notation but lack the () 36 | else if(name.match(/\.\w+(?!\()#/g) || name.match(/.+\[.*\]#/g)){ 37 | //sometimes classes have a '.' in them too; we will grab classes later 38 | if(!name.match(/Class/)){ 39 | name = name.slice(0,-1); 40 | sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'property', '${filename.concat(link)}');`; 41 | i++; 42 | } 43 | } 44 | //Find events; they start with Event: 45 | else if(name.match(/^Event:/g)){ 46 | //get rid of Event: and # and ''s 47 | name = name.replace(/^Event:\s/g, "").replace(/\'|#/g, ""); 48 | if(size === 'h3'){ 49 | //Find the previous h2 - prevUntil goes up to but not including it, then do one more prev, filtered to just h2 50 | var classname = $(el).parent().parent().prevUntil('h2').prev('h2').text(); 51 | classname = classname.replace(/Class:\s/g, "").slice(0,-1); 52 | } 53 | else if(size === 'h2'){ 54 | // console.log()) 55 | var classname = filename.slice(0,filename.indexOf('.')); 56 | } 57 | name = classname.concat("."+name); 58 | //Concatenate the classname and event name and 59 | //get rid of # in h2 className 60 | sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'event', '${filename.concat(link)}');`; 61 | i++; 62 | 63 | } 64 | //Keep track of that index 65 | } 66 | $('h3 a').each((ind,el)=>{ 67 | firstPass(ind,el, 'h3'); 68 | }); 69 | $('h2 a').each((ind, el) =>{ 70 | firstPass(ind, el, 'h2'); 71 | }); 72 | //Check if anything has been put into the sql string, if not, it's not a module.
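// (Why 65? sqlstr starts out empty and even the shortest INSERT statement appended
// above is roughly 65 characters long, so a length of 65+ means at least one entry
// was found. The newer parser.js expresses the same check more directly as if(i >= 1).)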
73 | if(sqlstr.length >= 65){ 74 | //Get Module name and put in database 75 | var name = $('#apicontent > h1').text().replace(/#/g, ""); 76 | var link = $('#apicontent > h1 a').attr('href'); 77 | sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'module', '${filename.concat(link)}');`; 78 | i++; 79 | 80 | //Time to grab classes and other stragglers 81 | $('h2 a').each((ind, el) => { 82 | var name = $(el).parent().parent().text(); 83 | //Add href of link to filename 84 | var link = $(el).attr('href'); 85 | if(name.match(/^Class\:\s/g)){ 86 | //replace the class and get rid of the # 87 | name = name.replace(/^Class\:\s/g, "").replace(/\'/g,"").slice(0, -1); 88 | sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'class', '${filename.concat(link)}');`; 89 | i++; 90 | 91 | } 92 | //Bad semantic html, check for properties that are in h2 93 | // else if(name.match(/\.\w+(?!\()#/g) || name.match(/.+\[.*\]#/g)){ 94 | // // name = name.replace(/#$/g, ""); 95 | // name = name.replace(/\'/g,"").slice(0,-1); 96 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'property', '${filename.concat(link)}');`; 97 | // i++; 98 | // 99 | // } 100 | // Otherwise they are probably sections / chapters. to be safe, check against matches for 101 | // events props classes and methods 102 | else if(!name.match(/Class|Event|\(.*\)|\.\w+(?!\()/)){ 103 | name = name.replace(/\'/g, "").slice(0,-1); 104 | sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'chapter', '${filename.concat(link)}');`; 105 | i++; 106 | 107 | } 108 | }) 109 | } 110 | 111 | // fs.writeFileSync('docs/'+filename+".js", sqlstr) 112 | //Insert into sql database 113 | db.run(sqlstr); 114 | return ({"DB": db, "index": i}) 115 | // var data = db.export(); 116 | // var buff = new Buffer(data); 117 | // // fs.writeFileSync('docs/'+filename+'.sqlite', buff); 118 | // fs.writeFileSync('docs/files.sqlite', buff); 119 | 120 | }; 121 | -------------------------------------------------------------------------------- /server/middleware/parseEntryPoint.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'); 2 | var parser = require('./parser'); 3 | // var sql = require('sql.js') 4 | 5 | 6 | var parseEntry = { 7 | allFiles: function(req, resolve, reject){ 8 | // var db = new sql.Database(); 9 | //initialize sql query 10 | //move outside of function? 
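// For reference, the index.json written at the end of this function comes out shaped
// like this (values illustrative; field names taken from the pushes in parser.js):
//   { "sourceName": "NodeJS", "versionNo": "...",
//     "result": [ { "NAME": "...", "TYPE": "method", "LINK": "..." }, ... ] }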
11 | var i = 0; 12 | var jsonFile = {"sourceName": req.scrapeProps.sourceName, 13 | "versionNo": req.scrapeProps.versionNo, "result": []}; 14 | //create an object to store the index and the database 15 | // var storage = {"DB": db, "index": i}; 16 | // var sqlstr = "CREATE TABLE docsearch (ID int, NAME char, TYPE char, LINK char);"; 17 | // db.run(sqlstr) 18 | fs.readdir(req.scrapeProps.downloadDir, (err, file) => { 19 | if(err) console.log(err); 20 | var list = file; 21 | // console.log(storage.DB); 22 | list.forEach((name) => { 23 | // Add directory name to file name for FS 24 | name = req.scrapeProps.downloadDir.concat(name); 25 | if(req.scrapeProps.scrapeDir.slice(0,-1) === 'node'){ 26 | //For node, don't parse all.html, it will break the sql 27 | if(name.match(/\.html$/) && !name.match(/all\.html/)){ 28 | jsonFile = parser.node(name, jsonFile); 29 | } 30 | } 31 | //Express stuff here 32 | else if(req.scrapeProps.scrapeDir.slice(0,-1) === 'express'){ 33 | if(name.match(/\.html$/)){ 34 | jsonFile = parser.express(name, jsonFile); 35 | } 36 | } 37 | }); 38 | //Export the database so we can write it to file 39 | // var data = db.export(); 40 | //Create a buffer for writing to 41 | // var buff = new Buffer(data); 42 | jsonFile = JSON.stringify(jsonFile); 43 | fs.writeFileSync(req.scrapeProps.baseDir+'/index.json', jsonFile); 44 | //Null out jsonFile 45 | jsonFile = null; 46 | //Be sure to resolve the promise when readdir is done 47 | resolve("Resolved"); 48 | }) 49 | } 50 | } 51 | module.exports = parseEntry; 52 | 
-------------------------------------------------------------------------------- /server/middleware/parser.js: -------------------------------------------------------------------------------- 1 | var fs = require('fs'); 2 | var cheerio = require('cheerio'); 3 | 4 | var parser = { 5 | node: function(file, jsonFile){ 6 | // sqlstr = ""; 7 | var i = 0; 8 | var filename = file.slice(file.lastIndexOf('/')+1); 9 | var data = fs.readFileSync(file, 'utf-8'); 10 | var $ = cheerio.load(data); 11 | var methods = []; 12 | //Keep track of the index independently for the sake of the sql database 13 | //Go through all h3 and h2 elements to get methods, props and events 14 | //Pass in a size so you don't check the previous h2 for a class and instead insert the module 15 | function firstPass(ind, el, size){ 16 | var name = $(el).parent().parent().text(); 17 | //Add href of link to filename 18 | var link = $(el).attr('href'); 19 | //Match methods (they look like X.string(blah)# ) 20 | if(name.match(/\w+\(.*\)\#$/g)){ 21 | name = name.replace(/\(.*\)\#/g, ""); 22 | //Handle Class Methods 23 | if(name.match(/^Class\sMethod:\s/)){ 24 | name = name.replace(/^Class\sMethod:\s/, ""); 25 | } 26 | jsonFile.result.push({"NAME": name, "TYPE": "method", "LINK":filename.concat(link)}); 27 | //Push into methods for determining if it's an addon page or not 28 | i++; 29 | methods.push($(el).attr('href')); 30 | } 31 | //Properties are similar to method notation but lack the () 32 | else if(name.match(/\.\w+(?!\()#/g) || name.match(/.+\[.*\]#/g)){ 33 | //sometimes classes have a '.' in them too; we will grab classes later 34 | if(!name.match(/Class/)){ 35 | name = name.slice(0,-1); 36 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'property', '${filename.concat(link)}');`; 37 | jsonFile.result.push({"NAME": name, "TYPE": "property", "LINK":filename.concat(link)}); 38 | 39 | i++; 40 | } 41 | } 42 | //Find events; they start with Event: 43 | else if(name.match(/^Event:/g)){ 44 | //get rid of Event: and # and ''s 45 | name = name.replace(/^Event:\s/g, "").replace(/\'|#/g, ""); 46 | var classname; 47 | if(size === 'h3'){ 48 | //Find the previous h2 - prevUntil goes up to but not including it, then do one more prev, filtered to just h2 49 | classname = $(el).parent().parent().prevUntil('h2').prev('h2').text(); 50 | classname = classname.replace(/Class:\s/g, "").slice(0,-1); 51 | } 52 | else if(size === 'h2'){ 53 | classname = filename.slice(0,filename.indexOf('.')); 54 | } 55 | name = classname.concat("."+name); 56 | //Concatenate the classname and event name and 57 | //get rid of # in h2 className 58 | // sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'event', '${filename.concat(link)}');`; 59 | jsonFile.result.push({"NAME": name, "TYPE": "event", "LINK":filename.concat(link)}); 60 | i++; 61 | 62 | } 63 | //Keep track of that index 64 | } 65 | $('h3 a').each((ind,el)=>{ 66 | firstPass(ind,el, 'h3'); 67 | }); 68 | $('h2 a').each((ind, el) =>{ 69 | firstPass(ind, el, 'h2'); 70 | }); 71 | //Check if anything has been put into the index, if not, it's not a module. 72 | if(i >= 1){ 73 | //Get Module name and put in database 74 | var name = $('#apicontent > h1').text().replace(/#/g, ""); 75 | var link = $('#apicontent > h1 a').attr('href'); 76 | // sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'module', '${filename.concat(link)}');`; 77 | jsonFile.result.push({"NAME": name, "TYPE": "module", "LINK":filename.concat(link)}); 78 | 79 | // i++; 80 | 81 | //Time to grab classes and other stragglers 82 | $('h2 a').each((ind, el) => { 83 | var name = $(el).parent().parent().text(); 84 | //Add href of link to filename 85 | var link = $(el).attr('href'); 86 | if(name.match(/^Class\:\s/g)){ 87 | //replace the class and get rid of the # 88 | name = name.replace(/^Class\:\s/g, "").replace(/\'/g,"").slice(0, -1); 89 | // sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'class', '${filename.concat(link)}');`; 90 | jsonFile.result.push({"NAME": name, "TYPE": "class", "LINK":filename.concat(link)}); 91 | 92 | // i++; 93 | 94 | } 95 | //Bad semantic html - check for properties that are in h2 96 | // Otherwise they are probably sections / chapters. To be safe, check against matches for 97 | // events, props, classes and methods 98 | else if(!name.match(/Class|Event|\(.*\)|\.\w+(?!\()/)){ 99 | name = name.replace(/\'/g, "").slice(0,-1); 100 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'chapter', '${filename.concat(link)}');`; 101 | jsonFile.result.push({"NAME": name, "TYPE": "chapter", "LINK":filename.concat(link)}); 102 | // i++; 103 | 104 | } 105 | }); 106 | } 107 | if(!jsonFile.sections) jsonFile.sections = ["chapter", "class", "event", "method", "module", "property"]; 108 | //Insert into sql database 109 | // db.run(sqlstr); 110 | return (jsonFile); 111 | }, 112 | express: function(file, jsonFile){ 113 | var filename = file.slice(file.lastIndexOf('/')+1); 114 | var data = fs.readFileSync(file, 'utf-8'); 115 | // var sqlstr = ""; 116 | var $ = cheerio.load(data); 117 | 118 | var type = ''; 119 | //Only api.html has the diff classes etc 120 | if(filename === "api.html"){ 121 | //All methods/props/events and names of those are in H3s --- created a nightmare since they aren't nested 122 | //All the methods/props/events at least are inside sections located 'underneath' the names 123 | //Unfortunately cheerio freaks out if an ID has the character "." in it. 124 | $('h3').each((ind, ele) => { 125 | var truthy = ($(ele).text() === "Methods" || $(ele).text() === "Properties" || $(ele).text() === "Events"); 126 | var name = $(ele).attr('id'); 127 | var link = ("#").concat(name); 128 | //If the H3 matches one of these, set the type of the entry to that 129 | if(truthy){ 130 | type = $(ele).text().toLowerCase(); 131 | } 132 | //Otherwise add to the index 133 | else{ 134 | jsonFile.result.push({"NAME": name, "TYPE": type, "LINK":filename.concat(link)}); 135 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', '${type}', '${filename.concat(link)}');`; 136 | } 137 | // i++; 138 | }); 139 | //Module / Class names are all in H2 140 | $('h2').each((ind, ele) => { 141 | // console.log(); 142 | var name = $(ele).text(); 143 | var link = ("#").concat($(ele).prev('p').children().first().attr('id')); 144 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'class', '${filename.concat(link)}');`; 145 | jsonFile.result.push({"NAME": name, "TYPE": "class", "LINK":filename.concat(link)}); 146 | // i++ 147 | }); 148 | } 149 | // For all the chapters/guides, just grab the first H1 as the title, and put the link as the file name 150 | else{ 151 | var name = $('h1').first().text(); 152 | type = 'chapter'; 153 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', '${type}', '${filename}');` 154 | jsonFile.result.push({"NAME": name, "TYPE": "chapter", "LINK":filename}); 155 | // i++; 156 | } 157 | // db.run(sqlstr) 158 | return (jsonFile); 159 | } 160 | }; 161 | 162 | module.exports = parser; 163 | 
-------------------------------------------------------------------------------- /server/middleware/requestProps.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | //Constants to be changed or added later with inputs to program 3 | /* Structure of the output directory will be e.g. node.docs/docs/ 4 | * with the index file in node.docs/ 5 | * and the temporary directory will be documentation/ 6 | * so docs/ + scrapeDir + /documents will be downloadDir 7 | * baseDir will be docs/ + scrapeDir - maybe rename scrapeDir?
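 * For the node pipeline that works out to: baseDir = docs/node/,
 * downloadDir = docs/node/documents/, and the zip extracts into node.docs/
 * (scrapeDir minus its trailing slash, plus '.docs' - see scrapeParseWrite.getFiles)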
8 | */ 9 | 10 | let requestProps = { 11 | node: function(req, res, next){ 12 | //Just in case garbage collection 13 | req.scrapeProps = { 14 | urlsToScrape: ['http://nodejs.org/api/'], 15 | sourceName: 'NodeJS', 16 | cssDir: 'assets', 17 | jsDir: 'assets', 18 | scrapeDir: 'node/', 19 | //FIX THIS LATER TO ADD IN ANYTHING, AND BE PASSED IN AS AN OBJECT 20 | //WHY CANT I USE THIS. HERE? 21 | baseDir: 'docs/node/', 22 | downloadDir: 'docs/node/documents/', 23 | RECURSIVE: true, 24 | versionNo: "", 25 | }; 26 | next(); 27 | }, 28 | express: function(req, res, next){ 29 | //Just in case garbage collection 30 | req.scrapeProps = { 31 | urlsToScrape: [ 32 | {url: 'http://expressjs.com/en/4x/api.html', filename: 'api.html'}, 33 | {url: 'http://expressjs.com/en/starter/installing.html', filename: 'installing.html'}, 34 | {url: 'http://expressjs.com/en/starter/hello-world.html', filename: 'hello-world.html'}, 35 | {url: 'http://expressjs.com/en/starter/generator.html', filename: 'generator.html'}, 36 | {url: 'http://expressjs.com/en/starter/static-files.html', filename: 'static-files.html'}, 37 | {url: 'http://expressjs.com/en/starter/faq.html', filename: 'faq.html'}, 38 | {url: 'http://expressjs.com/en/guide/routing.html', filename: 'routing.html'}, 39 | {url: 'http://expressjs.com/en/guide/writing-middleware.html', filename: 'writing-middleware.html'}, 40 | {url: 'http://expressjs.com/en/guide/using-middleware.html', filename: 'using-middleware.html'}, 41 | {url: 'http://expressjs.com/en/guide/using-template-engines.html', filename: 'using-template-engines.html'}, 42 | {url: 'http://expressjs.com/en/guide/error-handling.html', filename: 'error-handling.html'}, 43 | {url: 'http://expressjs.com/en/guide/debugging.html', filename: 'debugging.html'}, 44 | {url: 'http://expressjs.com/en/guide/database-integration.html', filename: 'database-integration.html'}, 45 | {url: 'http://expressjs.com/en/guide/migrating-4.html', filename: 'migrating-4.html'}, 46 | {url: 'http://expressjs.com/en/advanced/developing-template-engines.html', filename: 'developing-template-engines.html'}, 47 | {url: 'http://expressjs.com/en/advanced/best-practice-performance.html', filename: 'best-practice-performance.html'}, 48 | {url: 'http://expressjs.com/en/advanced/best-practice-security.html', filename: 'best-practice-security.html'} 49 | ], 50 | sourceName: 'Express API', 51 | cssDir: 'css', 52 | jsDir: 'js', 53 | scrapeDir: 'express/', 54 | //FIX THIS LATER TO ADD IN ANYTHING, AND BE PASSED IN AS AN OBJECT 55 | //WHY CANT I USE THIS. HERE? 
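//(Answer to the question above: inside an object literal, `this` is not bound to the
// object while it is being built, so one property can't read a sibling property.
// A small factory is one workaround - hypothetical sketch, not wired in anywhere:
//   function makeProps(dir){ return { scrapeDir: dir, baseDir: 'docs/'+dir, downloadDir: 'docs/'+dir+'documents/' }; } )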
56 | baseDir: 'docs/express/', 57 | downloadDir: 'docs/express/documents/', 58 | RECURSIVE: false, 59 | versionNo: "", 60 | }; 61 | next(); 62 | }, 63 | js: function(req, res, next){ 64 | //Just in case garbage collection 65 | req.scrapeProps = { 66 | // URL_TO_SCRAPE: , 67 | sourceName:"MDN Javascript", 68 | // CSS_DIR: , 69 | // JS_DIR: , 70 | scrapeDir: 'mdn/javascript/', 71 | baseDir: 'docs/mdn/javascript/', 72 | downloadDir: 'docs/mdn/javascript/JavaScript/documents' 73 | }; 74 | next(); 75 | }, 76 | html: function(req, res, next){ 77 | //Just in case garbage collection 78 | req.scrapeProps = { 79 | // URL_TO_SCRAPE: , 80 | sourceName:"MDN HTML", 81 | // CSS_DIR: , 82 | // JS_DIR: , 83 | scrapeDir: 'mdn/html/', 84 | baseDir: 'docs/mdn/html/', 85 | downloadDir: 'docs/mdn/html/HTML/documents' 86 | }; 87 | next(); 88 | }, 89 | css: function(req, res, next){ 90 | //Just in case garbage collection 91 | req.scrapeProps = { 92 | // URL_TO_SCRAPE: , 93 | sourceName:"MDN CSS", 94 | // CSS_DIR: , 95 | // JS_DIR: , 96 | scrapeDir: 'mdn/css/', 97 | baseDir: 'docs/mdn/css/', 98 | downloadDir: 'docs/mdn/css/CSS/documents' 99 | }; 100 | next(); 101 | }, 102 | 103 | }; 104 | 105 | module.exports = requestProps; 106 | 
-------------------------------------------------------------------------------- /server/middleware/rewrite.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'); 2 | 3 | //Object of site-specific functions to strip parts out of the scraped html 4 | var rewrite = { 5 | //Removes the ToC, sidebar and header from nodejs.org documentation 6 | node: function(req, res, next, html){ 7 | var $ = cheerio.load(html); 8 | $('#column2').remove(); 9 | $('#toc').remove(); 10 | $('header').remove(); 11 | html = $.html(); 12 | //Return full html to be written as file instead of html and cheerio data 13 | return html; 14 | }, 15 | express: function(req, res, next, html){ 16 | var $ = cheerio.load(html); 17 | $('header').remove(); 18 | $('footer').remove(); 19 | $('#menu').remove(); 20 | // $('header').remove(); 21 | html = $.html(); 22 | //Return full html to be written as file instead of html and cheerio data 23 | return html; 24 | } 25 | 26 | }; 27 | 28 | 29 | module.exports = rewrite; 30 | 
-------------------------------------------------------------------------------- /server/middleware/scrapeParseWrite.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | var scraper = require('website-scraper'); 3 | var fs = require('fs'); 4 | var cheerio = require('cheerio'); 5 | var archiver = require('archiver'); 6 | 7 | var rewrite = require('./rewrite'); 8 | var folderHandler = require('./folderHandler'); 9 | var parseEntry = require('./parseEntryPoint'); 10 | 11 | var scrapeParseWrite = { 12 | 13 | createZip: function(req, res, next){ 14 | //Initialize Archiver 15 | //Specify type of archive - zip or tar 16 | req.archive = archiver('zip'); 17 | var zipFolder = 'zips/'+req.scrapeProps.scrapeDir; 18 | //check to see if folder exists or create folder to store zip if it doesn't exist 19 | folderHandler.checkOrCreateFolder(zipFolder); 20 | //Create output file stream from scrapeDir 21 | req.output = fs.createWriteStream(zipFolder+req.scrapeProps.scrapeDir.slice(0,-1)+req.scrapeProps.versionNo+'.zip'); 22 | this.scrape(req, res, next); 23 | }, 24 | 25 | scrape: function(req, res, next){ 26 | //Check to see if the folder still exists from a previous run, and if so, delete it 27 | folderHandler.checkToDelete(req.scrapeProps.baseDir); 28 | 29 |
/* 30 | * Initialize scraper and provide URL, directory to store files, subdirectories 31 | * for files, recurse 1 level deep, and then edit files 32 | */ 33 | scraper.scrape({ 34 | urls: req.scrapeProps.urlsToScrape, 35 | directory: req.scrapeProps.downloadDir, 36 | subdirectories: [ 37 | {directory: 'img', extensions: ['.jpg', '.png', '.svg']}, 38 | {directory: req.scrapeProps.jsDir, extensions: ['.js']}, 39 | {directory: req.scrapeProps.cssDir, extensions: ['.css']} 40 | ], 41 | recursive: req.scrapeProps.RECURSIVE, 42 | maxDepth: 1 43 | }).then((data)=>{ 44 | this.getFiles(req, res, next); 45 | }).catch(console.log); 46 | 47 | //Event listener for end of zipping function - delete folder 48 | req.output.on('close', ()=>{ 49 | console.log(req.archive.pointer() + ' total bytes'); 50 | console.log('archiver has been finalized and the output file descriptor has closed.'); 51 | folderHandler.deleteFolderRecursive(req.scrapeProps.baseDir); 52 | req.scrapeProps.filePath = req.output.path; 53 | console.log(req.output.path); 54 | // res.versionNo = versionNo; 55 | next(); 56 | }); 57 | // Event listener for archive errors 58 | req.archive.on('error', function(err){ 59 | throw err; 60 | }); 61 | }, 62 | 63 | 64 | //get list of files and change the hrefs for css and js files to exclude the beginning / if they have it 65 | getFiles: function(req, res, next) { 66 | let list; 67 | 68 | //Alias `this` because the object will be out of context inside archive.bulk 69 | let that = this; 70 | //Get list of files in directory 71 | fs.readdir(req.scrapeProps.downloadDir, (err, file) => { 72 | list = file; 73 | list.forEach((name) => { 74 | //Add directory name to file name for FS 75 | name = req.scrapeProps.downloadDir.concat(name); 76 | //only edit html files 77 | if(name.match(/\.html$/)){ 78 | //pass file names off to be read and rewritten 79 | this.editFile(req, res, next, name); 80 | } 81 | }); 82 | 83 | //Since readdir is async, and is also called by parseEntry, we need to promisify it, and 84 | //send the resolve over 85 | var p1 = new Promise((resolve, reject)=>{ 86 | parseEntry.allFiles(req, resolve, reject); 87 | }); 88 | 89 | p1.then(function(val){ 90 | //Time to zip the file 91 | //Pipe zip to the output file 92 | req.archive.pipe(req.output); 93 | //specify what to zip up (in this case the directory itself) and append them to the zip 94 | //Make the directory the zip file extracts to be based on the scrapeDir 95 | //Use that, since this is bound to the archive module in here 96 | req.archive.bulk([ 97 | { expand: true, cwd: req.scrapeProps.baseDir, src: ['**'], dest: req.scrapeProps.scrapeDir.slice(0,-1)+'.docs'} 98 | ]); 99 | //Finalize archive and prevent further appends 100 | req.archive.finalize(); 101 | }).catch((val)=>{ 102 | console.log("Promise rejected: ", val); 103 | }); 104 | 105 | }); 106 | }, 107 | 108 | editFile: function(req, res, next, file) { 109 | fs.readFile(file, 'utf-8', (err, data) => { 110 | //Remove the leading slash on src and href attributes of js and css file locations 111 | // console.log("ok", data); 112 | var newData = data.replace(/href=\"\/(?!\/)/gi, 'href="'). 113 | replace(/src=\"\/(?!\/)/gi, 'src="'); 114 | //Made the rewriter universal for whatever we are scraping 115 | //Will need to implement checks to make sure we have methods for those sites 116 | 117 | var writeMethod = req.scrapeProps.scrapeDir.slice(0, -1); 118 | //Call the site-specific function to remove extraneous markup - the handler is looked up dynamically
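// A lighter-weight guard than try/catch would be to check the handler exists first
// (sketch): if (typeof rewrite[writeMethod] === 'function') { newData = rewrite[writeMethod](req, res, next, newData); }
// else { /* report the unsupported site */ }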
119 | //Try and catch in case we don't have the required methods 120 | try{ 121 | newData = rewrite[writeMethod](req, res, next, newData); 122 | } 123 | catch(err){ 124 | console.error("WHOA, WE DON'T HAVE A REWRITE FUNCTION FOR THIS SITE"); 125 | // res.send(`Sorry, there seems to be a problem with our parsing engine, please contact us`) 126 | return res.end(); 127 | } 128 | //Rewrite file 129 | fs.writeFileSync(file, newData, 'utf-8'); 130 | }); 131 | }, 132 | 133 | 134 | } 135 | module.exports = scrapeParseWrite; 136 | 
-------------------------------------------------------------------------------- /server/middleware/versionCheck.js: -------------------------------------------------------------------------------- 1 | var request = require('request'); 2 | var cheerio = require('cheerio'); 3 | 4 | var versionCheck = { 5 | node: function(req, res, next){ 6 | //Grab the front page of the node docs and check the version number 7 | request.get('https://nodejs.org/api/index.html', (err, resp, body) =>{ 8 | var $ = cheerio.load(body); 9 | var versionString = $('header h1').text(); 10 | //Match returns an array - the first element is the match 11 | versionString = versionString.match(/\sv.*\s/)[0].trim().slice(1); 12 | req.scrapeProps.versionNo = versionString; 13 | next(); 14 | }); 15 | }, 16 | express: function(req, res, next){ 17 | request.get('http://expressjs.com/en/4x/api.html', (err, resp, body) =>{ 18 | var $ = cheerio.load(body); 19 | //Grab the first anchor after #application-menu - the most current version 20 | var versionString = $('#application-menu a').attr('href'); 21 | //Match returns an array - the first element is the match; slice off the trailing / 22 | versionString = versionString.match(/[0-9]+.+\//)[0].slice(0,-1); 23 | req.scrapeProps.versionNo = versionString; 24 | next(); 25 | }); 26 | }, 27 | js: function(req, res, next){ 28 | request.get('https://kapeli.com/mdn_offline', (err, resp, body) => { 29 | var $ = cheerio.load(body); 30 | //version string here is going to be the update date 31 | var jsLink = $( ".download:contains('JavaScript.tgz')" ); 32 | var versionString = $(jsLink).parent().next('td').text(); 33 | req.scrapeProps.versionNo = versionString; 34 | next(); 35 | }); 36 | }, 37 | css: function(req, res, next){ 38 | request.get('https://kapeli.com/mdn_offline', (err, resp, body) => { 39 | var $ = cheerio.load(body); 40 | //version string here is going to be the update date 41 | var jsLink = $( ".download:contains('CSS.tgz')" ); 42 | var versionString = $(jsLink).parent().next('td').text(); 43 | req.scrapeProps.versionNo = versionString; 44 | next(); 45 | }); 46 | }, 47 | html: function(req, res, next){ 48 | request.get('https://kapeli.com/mdn_offline', (err, resp, body) => { 49 | var $ = cheerio.load(body); 50 | //version string here is going to be the update date 51 | var jsLink = $( ".download:contains('HTML.tgz')" ); 52 | var versionString = $(jsLink).parent().next('td').text(); 53 | req.scrapeProps.versionNo = versionString; 54 | next(); 55 | }); 56 | } 57 | }; 58 | module.exports = versionCheck; 59 | 
-------------------------------------------------------------------------------- /server/server.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const express = require( 'express' ); 4 | const bodyParser = require( 'body-parser' ); 5 | const path = require( 'path' ); 6 | const mongoose = require( 'mongoose' ); 7 | const dbController = require( './controllers/dbController' ); 8 | const mdnJS = require( './middleware/mdnJS' ); 9 | const mdnHTML = require( './middleware/mdnHTML' ); 10 | const mdnCSS = require( './middleware/mdnCSS' ); 11 | //Scraping middleware 12 | const scrapeParseWrite = require('./middleware/scrapeParseWrite'); 13 | const parseEntry = require('./middleware/parseEntryPoint'); 14 | //Middleware to add proper request properties for each site to scrape
15 | const requestProps = require( './middleware/requestProps' ); 16 | //Add middleware to check version of various sites 17 | const version = require( './middleware/versionCheck' ); 18 | const fs = require( 'fs' ); 19 | mongoose.connect( 'mongodb://Doc:tor@ds059215.mongolab.com:59215/doc-tor' ); 20 | const db = mongoose.connection; 21 | const app = express(); 22 | 23 | 24 | require( 'dns' ) 25 | .lookup( require( 'os' ) 26 | .hostname(), 27 | function ( err, add, fam ) { 28 | console.log( 'addr: ' + add ); 29 | } ); 30 | // log output 31 | // app.use(require('morgan') 32 | // ('STATUS=:status IP=:remote-addr REQ=":method :url" TIME=:response-time :res[content-length]')); 33 | 34 | db.on( 'error', console.error.bind( console, 'connection error:' ) ); 35 | db.once( 'open', function () { 36 | console.log( "your db is open" ); 37 | } ); 38 | 39 | app.use( bodyParser.urlencoded( { 40 | extended: true 41 | } ) ); 42 | app.use( express.static( path.join( __dirname, './../public' ) ) ); 43 | ///////////////////////////////////////////////// 44 | //// Handle requests to our main page(site) 45 | ///////////////////////////////////////////////// 46 | app.get( '/', function ( req, res ) { 47 | console.log( "Our website homepage!" ); 48 | res.sendFile( path.join( __dirname, '/../public/index.html' ) ); 49 | } ); 50 | 51 | 52 | /***** API *****/ 53 | /* 54 | TODO: optimize download and extraction 55 | TODO: make create functions DRY with helper function 56 | NOTE: mdn.download only provides a link for the request module, 57 | mdn.getJavascript actually downloads the .tgz 58 | */ 59 | // app.get( '/js', mdnJS.download, mdnJS.getJavascript, mdnJS.extract, mdnJS.createClassObj, mdnJS.createMethodsObj, mdnJS.createEventObj, mdnJS.createKWObj, mdnJS.createFuncObj, mdnJS.sqlFile, mdnJS.zip, function ( req, res ) { 60 | // res.sendFile(path.resolve('./mdn_javascript.zip')); 61 | // console.log('\n finished'); 62 | // }); 63 | 64 | 65 | app.get( '/mdn_html', requestProps.html, dbController.latestVer, function ( req, res ) { 66 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 67 | req.scrapeProps = null; 68 | // console.log('\n finished'); 69 | }); 70 | app.get( '/mdn_css', requestProps.css, dbController.latestVer, function ( req, res ) { 71 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 72 | req.scrapeProps = null; 73 | // console.log('\n finished'); 74 | }); 75 | app.get('/mdn_javascript', requestProps.js, dbController.latestVer, function(req, res){ 76 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 77 | req.scrapeProps = null; 78 | // console.log("sending full html back to client"); 79 | }); 80 | /////////////////////////////////////////////////////////////////////////////// 81 | /// BIND SCRAPEPARSEWRITE.CREATEZIP TO ITSELF SO IT BINDS TO THE CORRECT CONTEXT 82 | /////////////////////////////////////////////////////////////////////////////// 83 | app.get('/node', requestProps.node, dbController.latestVer, function(req,res){ 84 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 85 | req.scrapeProps = null; 86 | // console.log("sending full html back to client"); 87 | }); 88 | app.get('/express', requestProps.express, dbController.latestVer, function(req,res){ 89 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 90 | req.scrapeProps = null; 91 | // console.log("sending full html back to client"); 92 | }); 93 | ////////////////////////////////////////////////// 94 | // Test crash reporting route 95 | ////////////////////////////////////////////////// 96 | // app.post( '/error', function ( req, res ) { 97 | // console.log( "this func is running" ); 98 | // fs.writeFile( 'crashReport.txt', req.body, function () { 99 | // console.log( 'crash report\'s a go' ); 100 | // } ); 101 | // } ); 102 | ////////////////////////////////////////////////// 103 | // delete zip/or section from server update DB 104 | ////////////////////////////////////////////////// 105 | app.delete( '/node', function ( req, res ) {} ); 106 | ////////////////////////////////////////////////// 107 | // handle changes to node update DB 108 | ////////////////////////////////////////////////// 109 | app.put( '/node', function ( req, res ) {}); 110 | 111 | 112 | 113 | 114 | 115 | /////////////////////////////////////////////// 116 | // Handle requests for data 117 | // (option for multiple sites) 118 | /////////////////////////////////////////////// 119 | // app.get('/html', function(req,res){ 120 | // res.sendFile(path.join(__dirname, '/../index.html')); 121 | // console.log("send full html back to client"); 122 | // }); 123 | 124 | app.listen( 8080, function () { 125 | console.log( "Server is listening on port 8080" ); 126 | } ); 127 | 
-------------------------------------------------------------------------------- /server/updater.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const express = require( 'express' ); 4 | const bodyParser = require( 'body-parser' ); 5 | const path = require( 'path' ); 6 | const mongoose = require( 'mongoose' ); 7 | const dbController = require( './controllers/dbController' ); 8 | const mdnJS = require( './middleware/mdnJS' ); 9 | const mdnHTML = require( './middleware/mdnHTML' ); 10 | const mdnCSS = require( './middleware/mdnCSS' ); 11 | //Scraping middleware 12 | const scrapeParseWrite = require('./middleware/scrapeParseWrite'); 13 | const parseEntry = require('./middleware/parseEntryPoint'); 14 | //Middleware to add proper request properties for each site to scrape 15 | const requestProps = require( './middleware/requestProps' ); 16 | //Add middleware to check version of various sites 17 | const version = require( './middleware/versionCheck' ); 18 | const fs = require( 'fs' ); 19 | mongoose.connect( 'mongodb://Doc:tor@ds059215.mongolab.com:59215/doc-tor' ); 20 | const db = mongoose.connection; 21 | const app = express(); 22 | 23 | const updates = {"MDN_HTML":[requestProps.html, version.html, mdnHTML.download, mdnHTML.getHTML, 24 | mdnHTML.extract, mdnHTML.getElements, mdnHTML.sqlFile, mdnHTML.zip, dbController.addToDB], 25 | 26 | "MDN_CSS": [requestProps.css, version.css, mdnCSS.download, mdnCSS.getCSS, 27 | mdnCSS.extract, mdnCSS.getObjs, mdnCSS.getMoz, 28 | mdnCSS.sqlFile, mdnCSS.zip, dbController.addToDB], 29 | 30 | "MDN_Javascript": [requestProps.js, version.js, dbController.needUpdate, mdnJS.download, mdnJS.getJavascript, 31 | mdnJS.extract, mdnJS.createClassObj, mdnJS.createMethodsObj, mdnJS.createEventObj, 32 | mdnJS.createKWObj, mdnJS.createFuncObj, mdnJS.sqlFile, mdnJS.zip, dbController.addToDB], 33 | 34 | "NodeJS": [requestProps.node, version.node, scrapeParseWrite.createZip.bind(scrapeParseWrite), dbController.addToDB], 35 | 36 | "Express_API":[requestProps.express, version.express, scrapeParseWrite.createZip.bind(scrapeParseWrite), dbController.addToDB] 37 | }; 38 | 39 | require( 'dns' ) 40 | .lookup( require( 'os' ) 41 | .hostname(), 42 | function ( err, add, fam ) { 43 | console.log( 'addr: ' + add ); 44 | } ); 45 | // log output 46 | // app.use(require('morgan') 47 | //
('STATUS=:status IP=:remote-addr REQ=":method :url" TIME=:response-time :res[content-length]')); 48 | 49 | db.on( 'error', console.error.bind( console, 'connection error:' ) ); 50 | db.once( 'open', function () { 51 | console.log( "your db is open" ); 52 | } ); 53 | 54 | app.use( bodyParser.urlencoded( { 55 | extended: true 56 | } ) ); 57 | app.use( express.static( path.join( __dirname, './../public' ) ) ); 58 | ///////////////////////////////////////////////// 59 | //// Handle requests to our main page(site) 60 | ///////////////////////////////////////////////// 61 | app.get( '/', function ( req, res ) { 62 | console.log( "Our website homepage!" ); 63 | res.sendFile( path.join( __dirname, '/../public/index.html' ) ); 64 | } ); 65 | 66 | 67 | /***** API *****/ 68 | /* 69 | TODO: optimize download and extraction 70 | TODO: make create functions DRY with helper function 71 | NOTE: mdn.download only provides a link for the request module, 72 | mdn.getJavascript actually downloads the .tgz 73 | */ 74 | // app.get( '/js', mdnJS.download, mdnJS.getJavascript, mdnJS.extract, mdnJS.createClassObj, mdnJS.createMethodsObj, mdnJS.createEventObj, mdnJS.createKWObj, mdnJS.createFuncObj, mdnJS.sqlFile, mdnJS.zip, function ( req, res ) { 75 | // res.sendFile(path.resolve('./mdn_javascript.zip')); 76 | // console.log('\n finished'); 77 | // }); 78 | app.get('/uphtml', updates.MDN_HTML, function(req, res, next){ 79 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 80 | }); 81 | 82 | app.get('/updateVersions', updates.MDN_CSS, updates.MDN_HTML, updates.MDN_Javascript, updates.NodeJS, updates.Express_API, 83 | function(req, res){ 84 | req.scrapeProps = null; 85 | res.end(); 86 | }); 87 | app.get( '/mdn_html', requestProps.html, dbController.latestVer, function ( req, res ) { 88 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 89 | req.scrapeProps = null; 90 | // console.log('\n finished'); 91 | }); 92 | app.get( '/mdn_css', requestProps.css, dbController.latestVer, function ( req, res ) { 93 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 94 | req.scrapeProps = null; 95 | // console.log('\n finished'); 96 | }); 97 | app.get('/mdn_javascript', requestProps.js, dbController.latestVer, function(req, res){ 98 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 99 | req.scrapeProps = null; 100 | // console.log("sending full html back to client"); 101 | }); 102 | /////////////////////////////////////////////////////////////////////////////// 103 | /// BIND SCRAPEPARSEWRITE.CREATEZIP TO ITSELF SO IT BINDS TO THE CORRECT CONTEXT 104 | /////////////////////////////////////////////////////////////////////////////// 105 | app.get('/node', requestProps.node, dbController.latestVer, function(req,res){ 106 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 107 | req.scrapeProps = null; 108 | // console.log("sending full html back to client"); 109 | }); 110 | app.get('/express', requestProps.express, dbController.latestVer, function(req,res){ 111 | res.sendFile(path.resolve(req.scrapeProps.filePath)); 112 | req.scrapeProps = null; 113 | // console.log("sending full html back to client"); 114 | }); 115 | ////////////////////////////////////////////////// 116 | // Test crash reporting route 117 | ////////////////////////////////////////////////// 118 | // app.post( '/error', function ( req, res ) { 119 | // console.log( "this func is running" ); 120 | // fs.writeFile( 'crashReport.txt', req.body, function () { 121 | // console.log( 'crash report\'s a go' ); 122 | // } ); 123 | // } ); 124 | 
////////////////////////////////////////////////// 125 | // delete zip/or section from server update DB 126 | ////////////////////////////////////////////////// 127 | app.delete( '/node', function ( req, res ) {} ); 128 | ////////////////////////////////////////////////// 129 | // handle changes to node update DB 130 | ////////////////////////////////////////////////// 131 | app.put( '/node', function ( req, res ) {}); 132 | 133 | 134 | 135 | 136 | 137 | /////////////////////////////////////////////// 138 | // Handle requests for data 139 | // (option for multiple sites) 140 | /////////////////////////////////////////////// 141 | // app.get('/html', function(req,res){ 142 | // res.sendFile(path.join(__dirname, '/../index.html')); 143 | // console.log("send full html back to client"); 144 | // }); 145 | 146 | app.listen( 8085, function () { 147 | console.log( "Updater is listening on port 8085" ); 148 | } ); 149 | --------------------------------------------------------------------------------
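A note on the route wiring in updater.js above: Express flattens an array passed as a route handler, which is what lets a whole pipeline such as updates.MDN_HTML be dropped straight into app.get. A minimal, self-contained sketch of the same pattern (route and step names here are hypothetical, not part of this repo):

    const express = require('express');
    const app = express();

    // each step does its work, attaches the result to req, and hands off with next()
    const fetchStep = (req, res, next) => { req.version = '1.0.0'; next(); };
    const buildStep = (req, res, next) => { req.payload = 'built v' + req.version; next(); };

    // an array of middleware is flattened by Express, just like updates.MDN_HTML
    app.get('/demo', [fetchStep, buildStep], (req, res) => res.send(req.payload));

    app.listen(3000, () => console.log('demo listening on port 3000'));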