├── .gitignore
├── README.md
├── node_modules
│   └── website-scraper
│       ├── .npmignore
│       ├── LICENSE
│       ├── README.md
│       ├── app.js
│       ├── index.js
│       ├── lib
│       │   ├── config
│       │   │   ├── defaults.js
│       │   │   ├── recursive-sources.js
│       │   │   ├── resource-types-by-tag.js
│       │   │   └── resource-types.js
│       │   ├── file-handlers
│       │   │   ├── css.js
│       │   │   └── html.js
│       │   ├── request.js
│       │   ├── resource.js
│       │   ├── scraper.js
│       │   └── utils.js
│       └── package.json
├── package.json
├── public
│   └── index.html
└── server
    ├── controllers
    │   ├── dbController.js
    │   ├── mdnJS.js
    │   └── updateModel.js
    ├── middleware
    │   ├── folderHandler.js
    │   ├── mdnCSS.js
    │   ├── mdnHTML.js
    │   ├── mdnJS.js
    │   ├── nodeparser_working.js
    │   ├── parseEntryPoint.js
    │   ├── parser.js
    │   ├── requestProps.js
    │   ├── rewrite.js
    │   ├── scrapeParseWrite.js
    │   └── versionCheck.js
    ├── server.js
    └── updater.js
/.gitignore:
--------------------------------------------------------------------------------
1 | # npm dependencies
2 | node_modules/*
3 |
4 | # debug logs
5 | npm-debug.log
6 | JavaScript.tgz
7 |
8 | #docs
9 | docs
10 | test
11 | ignore_test_files
12 | website-scraper
13 | zips
14 | temp
15 |
16 | #include edited version of website_scraper
17 | !node_modules/website-scraper
18 | node_modules/website-scraper/node_modules
19 |
20 | # packaged application (end-user)
21 | Doc-tor-darwin-x64
22 |
23 | # front-end build
24 | build
25 |
26 | # misc
27 | scraper.bak.js
28 | server/middleware/nodeparser_working.js
29 | ..bfg-report
31 | mdnFiles/
32 | doc/
33 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Doc-Server
2 |
--------------------------------------------------------------------------------
/node_modules/website-scraper/.npmignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .gitignore
3 | .travis.yml
4 | coverage
5 | test
6 |
--------------------------------------------------------------------------------
/node_modules/website-scraper/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014 Sophia Nepochataya
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/node_modules/website-scraper/README.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 | Download website to a local directory (including all css, images, js, etc.)
3 |
13 | You can try it in [demo app](https://scraper.nepochataya.pp.ua/) ([source](https://github.com/s0ph1e/web-scraper))
14 |
15 | ## Installation
16 | ```
17 | npm install website-scraper
18 | ```
19 |
20 | ## Usage
21 | ```javascript
22 | var scraper = require('website-scraper');
23 | var options = {
24 | urls: ['http://nodejs.org/'],
25 | directory: '/path/to/save/',
26 | };
27 |
28 | // with callback
29 | scraper.scrape(options, function (error, result) {
30 | /* some code here */
31 | });
32 |
33 | // or with promise
34 | scraper.scrape(options).then(function (result) {
35 | /* some code here */
36 | });
37 | ```
38 |
39 | ## API
40 | ### scrape(options, callback)
41 | Makes requests to `urls` and saves all files found with `sources` to `directory`.
42 |
43 | **options** - object containing the following options:
44 |
45 | - `urls:` array of urls to load and filenames for them *(required, see example below)*
46 | - `directory:` path to save loaded files *(required)*
47 | - `defaultFilename:` filename for index page *(optional, default: 'index.html')*
48 | - `sources:` array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see default value in `lib/config/defaults.js`)*
49 | - `subdirectories:` array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
50 | - `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
51 | - `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
52 | - `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*
53 |
54 |
55 | **callback** - callback function *(optional)*, called with the following parameters:
56 |
57 | - `error:` if error - `Error` object, if success - `null`
58 | - `result:` if error - `null`, if success - array of objects containing:
59 | - `url:` url of loaded page
60 | - `filename:` filename where page was saved (relative to `directory`)
61 |
62 |
63 | ## Examples
64 | #### Example 1
65 | Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
66 | Imagine we want to load:
67 | - [Home page](http://nodejs.org/) to `index.html`
68 | - [About page](http://nodejs.org/about/) to `about.html`
69 | - [Blog](http://blog.nodejs.org/) to `blog.html`
70 |
71 | and separate files into directories:
72 |
73 | - `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
74 | - `js` for .js (full path `/path/to/save/js`)
75 | - `css` for .css (full path `/path/to/save/css`)
76 |
77 | ```javascript
78 | var scraper = require('website-scraper');
79 | scraper.scrape({
80 | urls: [
81 | 'http://nodejs.org/', // Will be saved with default filename 'index.html'
82 | {url: 'http://nodejs.org/about', filename: 'about.html'},
83 | {url: 'http://blog.nodejs.org/', filename: 'blog.html'}
84 | ],
85 | directory: '/path/to/save',
86 | subdirectories: [
87 | {directory: 'img', extensions: ['.jpg', '.png', '.svg']},
88 | {directory: 'js', extensions: ['.js']},
89 | {directory: 'css', extensions: ['.css']}
90 | ],
91 | sources: [
92 | {selector: 'img', attr: 'src'},
93 | {selector: 'link[rel="stylesheet"]', attr: 'href'},
94 | {selector: 'script', attr: 'src'}
95 | ],
96 | request: {
97 | headers: {
98 | 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'
99 | }
100 | }
101 | }).then(function (result) {
102 | console.log(result);
103 | }).catch(function(err){
104 | console.log(err);
105 | });
106 | ```
107 |
108 | #### Example 2. Recursive downloading
109 | ```javascript
110 | // Links from example.com will be followed
111 | // Links found on those pages will be ignored, because their depth of 2 exceeds maxDepth
112 | var scraper = require('website-scraper');
113 | scraper.scrape({
114 | urls: ['http://example.com/'],
115 | directory: '/path/to/save',
116 | recursive: true,
117 | maxDepth: 1
118 | }).then(console.log).catch(console.log);
119 | ```
120 |
--------------------------------------------------------------------------------
/node_modules/website-scraper/app.js:
--------------------------------------------------------------------------------
1 | var scraper = require('./index');
2 | scraper.scrape({
3 | urls: [
4 | 'http://nodejs.org/', // Will be saved with default filename 'index.html'
5 | {url: 'http://nodejs.org/about', filename: 'about.html'},
6 | {url: 'http://blog.nodejs.org/', filename: 'blog.html'}
7 | ],
8 | directory: './path/to/save',
9 | subdirectories: [
10 | {directory: 'img', extensions: ['.jpg', '.png', '.svg']},
11 | {directory: 'js', extensions: ['.js']},
12 | {directory: 'css', extensions: ['.css']}
13 | ],
14 | sources: [
15 | {selector: 'img', attr: 'src'},
16 | {selector: 'link[rel="stylesheet"]', attr: 'href'},
17 | {selector: 'script', attr: 'src'}
18 | ],
19 | request: {
20 | headers: {
21 | 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'
22 | }
23 | }
24 | }).then(function (result) {
25 | console.log(result);
26 | }).catch(function(err){
27 | console.log(err);
28 | });
--------------------------------------------------------------------------------
/node_modules/website-scraper/index.js:
--------------------------------------------------------------------------------
1 | var Scraper = require('./lib/scraper.js');
2 |
3 | module.exports.scrape = function (options, callback) {
4 | return new Scraper(options).scrape(callback);
5 | };
--------------------------------------------------------------------------------
/node_modules/website-scraper/lib/config/defaults.js:
--------------------------------------------------------------------------------
1 | var config = {
2 | defaultFilename: 'index.html',
3 | sources: [
4 | {
5 | selector: 'img',
6 | attr: 'src'
7 | },
8 | {
9 | selector: 'input',
10 | attr: 'src'
11 | },
12 | {
13 | selector: 'object',
14 | attr: 'data'
15 | },
16 | {
17 | selector: 'embed',
18 | attr: 'src'
19 | },
20 | {
21 | selector: 'param[name="movie"]',
22 | attr: 'value'
23 | },
24 | {
25 | selector: 'script',
26 | attr: 'src'
27 | },
28 | {
29 | selector: 'link[rel="stylesheet"]',
30 | attr: 'href'
31 | },
32 | {
33 | selector: 'link[rel*="icon"]',
34 | attr: 'href'
35 | }
36 | ],
37 | subdirectories: [
38 | {
39 | directory: 'images',
40 | extensions: ['.png', '.jpg', '.jpeg', '.gif']
41 | },
42 | {
43 | directory: 'js',
44 | extensions: ['.js']
45 | },
46 | {
47 | directory: 'css',
48 | extensions: ['.css']
49 | },
50 | {
51 | directory: 'fonts',
52 | extensions: ['.ttf', '.woff', '.eot', '.svg']
53 | }
54 | ]
55 | };
56 |
57 | module.exports = config;
58 |
--------------------------------------------------------------------------------
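One behavior of these defaults worth noting: lib/scraper.js merges user options with `_.extend({}, defaults, options)`, which is a shallow merge, so a custom `sources` or `subdirectories` array replaces the corresponding default list wholesale rather than extending it. A minimal sketch:

```javascript
var _ = require('underscore');
var defaults = require('website-scraper/lib/config/defaults');

// Shallow merge: a user-supplied `sources` array replaces the whole
// default list above; it is not appended to it.
var options = _.extend({}, defaults, { sources: [{ selector: 'img', attr: 'src' }] });
console.log(options.sources.length); // 1, not 8
```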
/node_modules/website-scraper/lib/config/recursive-sources.js:
--------------------------------------------------------------------------------
1 | module.exports = [
2 | { selector: 'a', attr: 'href' }
3 | ];
--------------------------------------------------------------------------------
/node_modules/website-scraper/lib/config/resource-types-by-tag.js:
--------------------------------------------------------------------------------
1 | var types = require('./resource-types');
2 |
3 | var typesByHtmlTag = {};
4 |
5 | typesByHtmlTag[types.css] = [
6 | { tagName: 'link', attributeName: 'href' }
7 | ];
8 | typesByHtmlTag[types.html] = [
9 | { tagName: 'a', attributeName: 'href' },
10 | { tagName: 'iframe', attributeName: 'src' }
11 | ];
12 |
13 | module.exports = typesByHtmlTag;
14 |
--------------------------------------------------------------------------------
/node_modules/website-scraper/lib/config/resource-types.js:
--------------------------------------------------------------------------------
1 | var types = {
2 | css: 'css',
3 | html: 'html',
4 | other: 'other'
5 | };
6 |
7 | module.exports = types;
8 |
--------------------------------------------------------------------------------
/node_modules/website-scraper/lib/file-handlers/css.js:
--------------------------------------------------------------------------------
1 | var _ = require('underscore');
2 | var Promise = require('bluebird');
3 | var getCssUrls = require('css-url-parser');
4 | var utils = require('../utils');
5 |
6 | function loadCss (context, resource) {
7 | var url = resource.getUrl();
8 | var filename = resource.getFilename();
9 | var text = resource.getText();
10 | var cssUrls = getCssUrls(text);
11 |
12 | var promises = _.map(cssUrls, function loadResourceFromCssUrl (cssUrl) {
13 | var resourceUrl = utils.getUrl(url, cssUrl);
14 | var cssResource = resource.createChild(resourceUrl);
15 |
16 | return context.loadResource(cssResource).then(function handleLoadedSource (loadedResource) {
17 | var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());
18 | text = text.replace(cssUrl, relativePath);
19 | return Promise.resolve();
20 | });
21 | });
22 |
23 | return utils.waitAllFulfilled(promises).then(function () {
24 | resource.setText(text);
25 | return resource;
26 | });
27 | }
28 |
29 | module.exports = loadCss;
30 |
--------------------------------------------------------------------------------
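The core of loadCss is the rewrite step: every URL found by css-url-parser is replaced with a path relative to the stylesheet's own saved location. A standalone sketch of that step (the helper name and sample values are mine, not from the module):

```javascript
var path = require('path');

// Mirrors the replace step in loadCss: swap an absolute resource URL
// for a path relative to where the stylesheet itself was saved.
function rewriteCssUrl(cssText, cssFilename, resourceUrl, savedFilename) {
  var relativePath = path.relative(path.dirname(cssFilename), savedFilename)
    .replace(/\\/g, '/'); // same unix-path normalization as lib/utils.js
  return cssText.replace(resourceUrl, relativePath);
}

var css = 'body { background: url(http://example.com/bg.png); }';
console.log(rewriteCssUrl(css, 'css/style.css', 'http://example.com/bg.png', 'images/bg.png'));
// => 'body { background: url(../images/bg.png); }'
```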
/node_modules/website-scraper/lib/file-handlers/html.js:
--------------------------------------------------------------------------------
1 | var cheerio = require('cheerio');
2 | var Promise = require('bluebird');
3 | var utils = require('../utils');
4 |
5 | function loadHtml (context, resource) {
6 | var sources = context.getHtmlSources();
7 | var handleResources = loadResources.bind(null, context, resource);
8 |
9 | var p = beforeHandle(resource);
10 |
11 | sources.forEach(function (src) {
12 | p = p.then(function loadSource () {
13 | return handleResources(src);
14 | });
15 | });
16 | return p;
17 | }
18 |
19 | function beforeHandle (resource) {
20 | var text = resource.getText();
21 | var $ = cheerio.load(text);
22 |
23 | // Handle tag
24 | $('base').each(function () {
25 | var el = $(this);
26 | var href = el.attr('href');
27 | if (href) {
28 | var newUrl = utils.getUrl(resource.getUrl(), href);
29 | resource.setUrl(newUrl);
30 | el.remove();
31 | }
32 | });
33 |
34 | text = $.html();
35 | resource.setText(text);
36 |
37 | return Promise.resolve(resource);
38 | }
39 |
40 | function loadResources (context, resource, source) {
41 | var url = resource.getUrl();
42 | var text = resource.getText();
43 | var filename = resource.getFilename();
44 | var $ = cheerio.load(text);
45 |
46 | var promises = $(source.selector).map(function loadForSelector () {
47 | var el = $(this);
48 | var attr = el.attr(source.attr);
49 |
50 | if (attr) {
51 | var resourceUrl = utils.getUrl(url, attr);
52 | var htmlResource = resource.createChild(resourceUrl);
53 | htmlResource.setHtmlData({ tagName: el[0].name, attributeName: source.attr });
54 |
55 | return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {
56 | var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());
57 | el.attr(source.attr, relativePath);
58 | return Promise.resolve();
59 | });
60 | }
61 | return Promise.reject();
62 | });
63 |
64 | return utils.waitAllFulfilled(promises).then(function () {
65 | text = $.html();
66 | resource.setText(text);
67 | return resource;
68 | });
69 | }
70 |
71 | module.exports = loadHtml;
72 |
--------------------------------------------------------------------------------
/node_modules/website-scraper/lib/request.js:
--------------------------------------------------------------------------------
1 | var _ = require('underscore');
2 | var Promise = require('bluebird');
3 | var request = require('request');
4 | var get = Promise.promisify(request.get);
5 |
6 | var defaultOptions = {
7 | method: 'GET',
8 | encoding: 'binary',
9 | strictSSL: false,
10 | jar: true
11 | };
12 |
13 | function getDefaultOptions() {
14 | return defaultOptions;
15 | }
16 |
17 | function getCustomOptions(options) {
18 | return _.extend({}, defaultOptions, options);
19 | }
20 |
21 | function makeRequest(options, url) {
22 | var requestOptions = getCustomOptions(options);
23 | requestOptions.url = url;
24 |
25 | return get(requestOptions).then(function handleResponse(data) {
26 | return {
27 | url: data.request.href,
28 | body: data.body
29 | };
30 | });
31 | }
32 |
33 | module.exports.makeRequest = makeRequest;
34 | module.exports.getDefaultOptions = getDefaultOptions;
35 | module.exports.getCustomOptions = getCustomOptions;
36 |
--------------------------------------------------------------------------------
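A quick illustration of the merge above: per-scrape `request` options are laid over the defaults with a shallow `_.extend`, so each default survives unless explicitly overridden (the header value here is an example):

```javascript
var request = require('website-scraper/lib/request');

// getCustomOptions keeps the defaults and overlays the user's options
var merged = request.getCustomOptions({ headers: { 'User-Agent': 'my-bot' } });
console.log(merged);
// => { method: 'GET', encoding: 'binary', strictSSL: false,
//      jar: true, headers: { 'User-Agent': 'my-bot' } }
```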
/node_modules/website-scraper/lib/resource.js:
--------------------------------------------------------------------------------
1 | var _ = require('underscore');
2 | var path = require('path');
3 | var types = require('./config/resource-types');
4 | var typesByHtmlData = require('./config/resource-types-by-tag');
5 |
6 | function getTypeByHtmlData (htmlData) {
7 | var type = _.findKey(typesByHtmlData, function containsHtmlData (rules) {
8 | return _.findWhere(rules, htmlData);
9 | });
10 | return type || types.other;
11 | }
12 |
13 | function Resource (url, filename) {
14 | this.url = url;
15 | this.filename = filename;
16 | }
17 |
18 | Resource.prototype.createChild = function createChild (url, filename) {
19 | var child = new Resource(url, filename);
20 |
21 | var currentDepth = this.getDepth();
22 |
23 | child.setParent(this);
24 | child.setDepth(++currentDepth);
25 |
26 | return child;
27 | };
28 |
29 | Resource.prototype.getUrl = function getUrl () {
30 | return this.url;
31 | };
32 |
33 | Resource.prototype.setUrl = function setUrl (url) {
34 | this.url = url;
35 | };
36 |
37 | Resource.prototype.getFilename = function getFilename () {
38 | return this.filename;
39 | };
40 |
41 | Resource.prototype.setFilename = function setFilename (filename) {
42 | this.filename = filename;
43 | };
44 |
45 | Resource.prototype.getText = function getText () {
46 | return this.text;
47 | };
48 |
49 | Resource.prototype.setText = function setText (text) {
50 | this.text = text;
51 | };
52 |
53 | Resource.prototype.setParent = function setParent (parent) {
54 | this.parent = parent;
55 | };
56 |
57 | Resource.prototype.getDepth = function getDepth () {
58 | return this.depth || 0;
59 | };
60 |
61 | Resource.prototype.setDepth = function setDepth (depth) {
62 | this.depth = depth;
63 | };
64 |
65 | /**
66 | *
67 | * @param {Object} data - html element data
68 | * @param {string} data.tagName - tag name which contain resource
69 | * @param {string} data.attributeName - attribute name with value of resource's url
70 | */
71 | Resource.prototype.setHtmlData = function setHtmlData (data) {
72 | this.htmlData = data;
73 | };
74 |
75 | Resource.prototype.getType = function getType () {
76 | var ext = path.extname(this.filename);
77 | var parentType = this.parent && this.parent.getType();
78 | var hasHtmlData = !!this.htmlData;
79 |
80 | switch (true) {
81 | case ext == '.html' || ext == '.htm':
82 | return types.html;
83 | case ext == '.css':
84 | case !ext && parentType == types.css:
85 | return types.css;
86 | case !ext && parentType == types.html && hasHtmlData:
87 | return getTypeByHtmlData(this.htmlData);
88 | default:
89 | return types.other;
90 | }
91 | };
92 |
93 | module.exports = Resource;
94 |
--------------------------------------------------------------------------------
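A short sketch of how `getType` and depth tracking behave, using the tag rules from resource-types-by-tag.js (URLs and filenames are placeholders):

```javascript
var Resource = require('website-scraper/lib/resource');

var page = new Resource('http://example.com/', 'index.html');
console.log(page.getType()); // 'html', decided by the .html extension

// An extensionless child discovered in an <a href> falls back to its html data
var child = page.createChild('http://example.com/docs');
child.setFilename('docs');
child.setHtmlData({ tagName: 'a', attributeName: 'href' });
console.log(child.getType());  // 'html', via resource-types-by-tag.js
console.log(child.getDepth()); // 1, i.e. parent depth + 1
```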
/node_modules/website-scraper/lib/scraper.js:
--------------------------------------------------------------------------------
1 | var Promise = require('bluebird');
2 |
3 | var fs = require('fs-extra');
4 | var existsAsync = Promise.promisify(fs.stat);
5 | var outputFileAsync = Promise.promisify(fs.outputFile);
6 | var ensureDirAsync = Promise.promisify(fs.ensureDir);
7 |
8 | var path = require('path');
9 | var _ = require('underscore');
10 |
11 | var defaults = require('./config/defaults');
12 | var types = require('./config/resource-types');
13 | var recursiveSources = require('./config/recursive-sources');
14 | var utils = require('./utils.js');
15 | var request = require('./request');
16 | var Resource = require('./resource');
17 | var compareUrls = require('compare-urls');
18 |
19 | var loadHtml = require('./file-handlers/html');
20 | var loadCss = require('./file-handlers/css');
21 | function loadHtmlAndCss (context, po) {
22 | return loadHtml(context, po).then(function (loaded) {
23 | return loadCss(context, loaded);
24 | });
25 | }
26 |
27 | function Scraper (options) {
28 | this.originalResources = [];
29 | this.loadedResources = [];
30 |
31 | this.options = _.extend({}, defaults, options);
32 | this.options.directory = path.resolve(process.cwd(), this.options.directory || '');
33 | }
34 |
35 | Scraper.prototype.getLoadedResource = function getLoadedResource (resource) {
36 | return _.find(this.loadedResources, function(lr) {
37 | return compareUrls(resource.getUrl(), lr.getUrl());
38 | });
39 | };
40 |
41 | Scraper.prototype.addLoadedResource = function addLoadedResource (resource) {
42 | this.loadedResources.push(resource);
43 | };
44 |
45 | Scraper.prototype.getOccupiedFilenames = function getOccupiedFilenames () {
46 | var subdirectories = _.map(this.options.subdirectories, function (dir) { return dir.directory; });
47 | var loadedFiles = _.map(this.loadedResources, function(r) { return r.getFilename(); });
48 | return subdirectories.concat(loadedFiles);
49 | };
50 |
51 | Scraper.prototype.getHtmlSources = function getHtmlSources () {
52 | return this.options.sources;
53 | };
54 |
55 | Scraper.prototype.generateFilename = function generateFilename (resource) {
56 | var self = this;
57 |
58 | var occupiedFilenames = self.getOccupiedFilenames();
59 |
60 | var preferredFilename = resource.getFilename(); // which was set in options
61 | var urlFilename = utils.getFilenameFromUrl(resource.getUrl()); // try to get filename from url
62 | var filename = preferredFilename || urlFilename || self.options.defaultFilename;
63 |
64 | var ext = path.extname(filename);
65 | var dir = self.getDirectoryByExtension(ext);
66 | var currentFilename = path.join(dir, filename);
67 | var basename = path.basename(filename, ext);
68 | var index = 1;
69 |
70 | while (_.contains(occupiedFilenames, currentFilename)) {
71 | currentFilename = path.join(dir, basename + '_' + index + ext);
72 | index++;
73 | }
74 | return currentFilename;
75 | };
76 |
77 | Scraper.prototype.getDirectoryByExtension = function getDirectoryByExtension (ext) {
78 | return _.chain(this.options.subdirectories)
79 | .filter(function (dir) { return _.contains(dir.extensions, ext); })
80 | .map(function (dir) { return dir.directory; })
81 | .first()
82 | .value() || '';
83 | };
84 |
85 | Scraper.prototype.getResourceHandler = function getHandler (resource) {
86 | var self = this;
87 | var type = resource.getType();
88 | var depth = resource.getDepth();
89 | var depthGreaterThanMax = self.options.maxDepth && depth >= self.options.maxDepth;
90 |
91 | switch (true) {
92 | case depthGreaterThanMax: return _.noop;
93 | case type == types.css: return loadCss;
94 | case type == types.html: return loadHtmlAndCss;
95 | default: return _.noop;
96 | }
97 | };
98 |
99 | Scraper.prototype.loadResource = function loadResource (resource) {
100 | var self = this;
101 |
102 | var loaded = self.getLoadedResource(resource); // try to find already loaded
103 |
104 | var url = resource.getUrl();
105 | var filename;
106 | var handleFile;
107 |
108 | if (!loaded) {
109 | filename = self.generateFilename(resource);
110 | resource.setFilename(filename);
111 |
112 | self.addLoadedResource(resource);
113 |
114 | // Request -> processing -> save to fs
115 | return self.makeRequest(url).then(function requestCompleted(data) {
116 | resource.setUrl(data.url); // Url may be changed in redirects
117 | resource.setText(data.body);
118 | handleFile = self.getResourceHandler(resource);
119 | return handleFile(self, resource);
120 | }).then(function fileHandled() {
121 | var filename = path.join(self.options.directory, resource.getFilename());
122 | var text = resource.getText();
123 | return outputFileAsync(filename, text, { encoding: 'binary' });
124 | }).then(function fileSaved() {
125 | return Promise.resolve(resource);
126 | });
127 | }
128 | return Promise.resolve(loaded);
129 | };
130 |
131 | Scraper.prototype.validate = function validate () {
132 | var dir = this.options.directory;
133 | return existsAsync(dir).then(function handleDirectoryExist () {
134 | return Promise.reject(new Error('Path ' + dir + ' exists'));
135 | }, function handleDirectoryNotExist () {
136 | return Promise.resolve();
137 | });
138 | };
139 |
140 | Scraper.prototype.prepare = function prepare () {
141 | var self = this;
142 |
143 | // Create makeRequest function with custom request params
144 | self.makeRequest = request.makeRequest.bind(null, self.options.request);
145 |
146 | // Create array of Resource for downloading
147 | self.options.urls = _.isArray(self.options.urls) ? self.options.urls : [self.options.urls];
148 | self.originalResources = _.map(self.options.urls, function createResource(obj) {
149 | var url = _.isObject(obj) && _.has(obj, 'url') ? obj.url : obj;
150 | var filename = _.isObject(obj) && _.has(obj, 'filename') ? obj.filename : self.options.defaultFilename;
151 | return new Resource(url, filename);
152 | });
153 |
154 | if (self.options.recursive) {
155 | self.options.sources = _.union(self.options.sources, recursiveSources);
156 | }
157 |
158 | return ensureDirAsync(self.options.directory);
159 | };
160 |
161 | Scraper.prototype.load = function load () {
162 | var self = this;
163 | return Promise.map(self.originalResources, function loadPage (po) {
164 | return self.loadResource(po).then(function pageLoaded (loaded) {
165 | return Promise.resolve({
166 | url: loaded.getUrl(),
167 | filename: loaded.getFilename()
168 | });
169 | });
170 | });
171 | };
172 |
173 | Scraper.prototype.errorCleanup = function errorCleanup (error) {
174 | if (!_.isEmpty(this.loadedResources)) {
175 | fs.removeSync(this.options.directory);
176 | }
177 | throw error;
178 | };
179 |
180 | Scraper.prototype.scrape = function scrape(callback) {
181 | var self = this;
182 | return Promise.bind(self)
183 | .then(self.validate)
184 | .then(self.prepare)
185 | .then(self.load)
186 | .catch(self.errorCleanup)
187 | .asCallback(callback);
188 | };
189 |
190 | module.exports = Scraper;
191 |
--------------------------------------------------------------------------------
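The `scrape` method chains four stages: `validate` rejects if `directory` already exists, `prepare` binds the request options and seeds the Resource list, `load` downloads everything, and `errorCleanup` removes the partially written directory on failure. A minimal driver sketch using the public API (URL and directory are placeholders):

```javascript
var scraper = require('website-scraper');

scraper.scrape({
  urls: ['http://example.com/'],
  directory: './example-copy' // must not exist yet, or validate() rejects
}, function (error, result) {
  if (error) return console.error(error); // partial output already removed by errorCleanup
  console.log(result); // e.g. [{ url: 'http://example.com/', filename: 'index.html' }]
});
```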
/node_modules/website-scraper/lib/utils.js:
--------------------------------------------------------------------------------
1 | var url = require('url');
2 | var path = require('path');
3 | var Promise = require('bluebird');
4 |
5 | function isUrl(path) {
6 | var urlRegexp = /^((http[s]?:)?\/\/)/;
7 | return urlRegexp.test(path);
8 | }
9 |
10 | function getUrl(currentUrl, path) {
11 | var pathObj = url.parse(path);
12 | if (isUrl(path) && !pathObj.protocol) {
13 | pathObj.protocol = 'http';
14 | path = url.format(pathObj);
15 | }
16 | return url.resolve(currentUrl, path);
17 | }
18 |
19 | function getUnixPath(filepath) {
20 | return filepath.replace(/\\/g, '/');
21 | }
22 |
23 | function getRelativePath(path1, path2) {
24 | var dirname = path.dirname(path1);
25 | var relativePath = path.relative(dirname, path2);
26 | return getUnixPath(relativePath);
27 | }
28 |
29 | function getFilenameFromUrl (u) {
30 | return path.basename(url.parse(u).pathname);
31 | }
32 |
33 | function waitAllFulfilled(promises) {
34 | return Promise.all(promises.map(function(promise) {
35 | return promise.reflect();
36 | }));
37 | }
38 |
39 | module.exports = {
40 | isUrl: isUrl,
41 | getUrl: getUrl,
42 | getUnixPath: getUnixPath,
43 | getRelativePath: getRelativePath,
44 | getFilenameFromUrl: getFilenameFromUrl,
45 | waitAllFulfilled: waitAllFulfilled
46 | };
47 |
--------------------------------------------------------------------------------
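Expected outputs for the helpers above, for reference (inputs are examples):

```javascript
var utils = require('website-scraper/lib/utils');

utils.isUrl('//cdn.example.com/app.js');                       // true
utils.getUrl('http://example.com/a/', '//cdn.example.com/x.js');
// => 'http://cdn.example.com/x.js' (protocol-relative URLs are given 'http')
utils.getRelativePath('css/style.css', 'images/bg.png');       // '../images/bg.png'
utils.getFilenameFromUrl('http://example.com/js/app.js?v=2');  // 'app.js'
```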
/node_modules/website-scraper/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "_args": [
3 | [
4 | "website-scraper",
5 | "/Users/daniellevin/Projects/Doc-Server"
6 | ],
7 | [
8 | "website-scraper",
9 | "/Users/daniellevin/Projects/Doc-tor"
10 | ]
11 | ],
12 | "_from": "website-scraper@latest",
13 | "_id": "website-scraper@0.3.1",
14 | "_inCache": true,
15 | "_installable": true,
16 | "_location": "/website-scraper",
17 | "_nodeVersion": "0.10.25",
18 | "_npmUser": {
19 | "email": "sophia.nepochataya@gmail.com",
20 | "name": "s0ph1e"
21 | },
22 | "_npmVersion": "2.12.1",
23 | "_phantomChildren": {
24 | "cheerio-select": "0.0.3",
25 | "domelementtype": "1.3.0"
26 | },
27 | "_requested": {
28 | "name": null,
29 | "raw": "website-scraper",
30 | "rawSpec": "website-scraper",
31 | "scope": null,
32 | "spec": "/Users/daniellevin/Projects/Doc-Server/website-scraper",
33 | "type": "directory"
34 | },
35 | "_requiredBy": [
36 | "#USER",
37 | "/"
38 | ],
39 | "_resolved": "https://registry.npmjs.org/website-scraper/-/website-scraper-0.3.1.tgz",
40 | "_shasum": "fcad9a05e2155655e2226334bd9d6fe5ef1c8276",
41 | "_shrinkwrap": null,
42 | "_spec": "website-scraper",
43 | "_where": "/Users/daniellevin/Projects/Doc-Server",
44 | "author": {
45 | "name": "s0ph1e"
46 | },
47 | "bugs": {
48 | "url": "https://github.com/s0ph1e/node-website-scraper/issues"
49 | },
50 | "dependencies": {
51 | "bluebird": "^3.0.1",
52 | "cheerio": "0.11.0",
53 | "compare-urls": "^1.0.0",
54 | "css-url-parser": "^0.1.0",
55 | "fs-extra": "^0.26.0",
56 | "request": "^2.42.0",
57 | "underscore": "^1.7.0"
58 | },
59 | "description": "Download website to a local directory (including all css, images, js, etc.)",
60 | "devDependencies": {
61 | "codeclimate-test-reporter": "^0.1.0",
62 | "istanbul": "^0.4.0",
63 | "mocha": "^2.2.5",
64 | "nock": "^2.9.1",
65 | "proxyquire": "^1.7.3",
66 | "should": "^7.0.2",
67 | "sinon": "^1.15.4",
68 | "sinon-as-promised": "^4.0.0"
69 | },
70 | "directories": {},
71 | "dist": {
72 | "shasum": "fcad9a05e2155655e2226334bd9d6fe5ef1c8276",
73 | "tarball": "http://registry.npmjs.org/website-scraper/-/website-scraper-0.3.1.tgz"
74 | },
75 | "gitHead": "f1983f79ced795563ee964868e0048b9fbf431b0",
76 | "homepage": "https://github.com/s0ph1e/node-website-scraper",
77 | "keywords": [
78 | "css",
79 | "download",
80 | "html",
81 | "image",
82 | "js",
83 | "page",
84 | "scrape",
85 | "scraper",
86 | "site",
87 | "url",
88 | "web"
89 | ],
90 | "license": "MIT",
91 | "main": "index.js",
92 | "maintainers": [
93 | {
94 | "name": "s0ph1e",
95 | "email": "sophia.nepochataya@gmail.com"
96 | }
97 | ],
98 | "name": "website-scraper",
99 | "optionalDependencies": {},
100 | "readme": "ERROR: No README data found!",
101 | "repository": {
102 | "type": "git",
103 | "url": "git://github.com/s0ph1e/node-website-scraper.git"
104 | },
105 | "scripts": {
106 | "test": "istanbul cover ./node_modules/mocha/bin/_mocha --dir ./coverage --report lcov -- -R spec --recursive ./test"
107 | },
108 | "version": "0.3.1"
109 | }
110 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "doc-server",
3 | "version": "1.0.0",
4 | "description": "JS doc repository server and scraper",
5 | "main": "server/server.js",
6 | "scripts": {
7 | "test": "echo \"Error: no test specified\" && exit 1",
8 | "start": "nodemon --ignore docs/ --ignore temp/ --ignore zips/ server/server.js"
9 | },
10 | "repository": {
11 | "type": "git",
12 | "url": "git+https://github.com/DocWave/Doc-Server.git"
13 | },
14 | "author": "Sporks, Dan Levin, Cruz Welborn, Lea Fox",
15 | "license": "MIT",
16 | "dependencies": {
17 | "archiver": "^0.21.0",
18 | "bluebird": "^3.3.1",
19 | "body-parser": "^1.14.2",
20 | "cheerio": "^0.20.0",
21 | "express": "^4.13.4",
22 | "mongoose": "^4.4.2",
23 | "nightmare": "^2.1.6",
24 | "path": "^0.12.7",
25 | "phantom": "^0.9.0",
26 | "request": "^2.69.0",
27 | "tar": "^2.2.1",
28 | "tar.gz": "^1.0.3",
29 | "vo": "^1.0.3",
30 | "website-scraper": "file:website-scraper"
31 | },
32 | "devDependencies": {
33 | "morgan": "^1.7.0",
34 | "single-line-log": "^1.0.1"
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/public/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 |   <meta charset="utf-8">
5 | </head>
6 | <body>
7 |
8 |   Our site homepage
9 |
10 | </body>
11 | </html>
--------------------------------------------------------------------------------
/server/controllers/dbController.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | const Update = require('./updateModel');
3 | const path = require('path');
4 | const fs = require('fs');
5 |
6 | module.exports = {
7 | latestVer: function(req, res, next){
8 | let query = Update.where({sourceName: req.scrapeProps.sourceName});
9 | console.log(req.scrapeProps.sourceName);
10 | query.findOne({},{},{ sort: { 'createdAt' : -1 } }, function(err, foundVer){
11 | if(err) console.log(err);
12 |       try{
13 |         //statSync throws if the file is missing from disk
14 |         fs.statSync(path.resolve(foundVer.filePath));
15 |         //We already have this version locally, so send it
16 |         req.scrapeProps = foundVer.filePath;
17 |         return res.sendFile(path.resolve(foundVer.filePath));
18 |       }
19 |       //We didn't find the file in the directory, so proceed as usual
20 |       catch(e){
21 |         return next();
22 |       }
26 | });
27 | },
28 | needUpdate : function(req, res, next){
29 | if(!req.needUpdate)
30 | req.needUpdate = {};
31 | let query = Update.where({versionNo: req.scrapeProps.versionNo,
32 | sourceName: req.scrapeProps.sourceName});
33 | query.findOne( function (err, foundUpdate){
34 | //takes in an err from findOne and the returned Doc
35 | if(err) console.log(err);
36 | // console.log("finding");
37 | if(!foundUpdate){
38 |         //no update found, continue down the middleware chain
39 | console.log("\n\n\t\tNew version, updating\n\n");
40 | next();
41 | }
42 |
43 | else if ( foundUpdate ){ // if the Doc exists update
44 | //Also check if we have the file right now, just in case it got deleted
45 | try{
46 | console.log("found ");
47 |
48 | let fileStats = fs.statSync(path.resolve(foundUpdate.filePath));
49 | //If we find that we have the same version, send the version we already have
50 | //break out of the middleware!
51 | // console.log("\n\n\t\tFile Found, sending local copy\n\n");
52 | // return res.sendFile(path.resolve(foundUpdate.filePath));
53 | next();
54 | }
55 | //We didn't find the file in the directory, so proceed as usual
56 | catch(e){
57 | console.log("File not found....");
58 | console.log(foundUpdate.filePath);
59 | req.needUpdate[req.scrapeProps.sourceName.replace(/\s/g, "_")] = true;
60 |
61 | next();
62 | }
63 |
64 | }
65 | });
66 | },
67 | addToDB : function(req, res, next){
68 | //assigns a new Update document to the variable update
69 | let update = new Update ({sourceName : req.scrapeProps.sourceName,
70 | versionNo : req.scrapeProps.versionNo,
71 | filePath : req.scrapeProps.filePath,
72 | retrieved : Date.now(),
73 | createdAt : Date.now()});
74 |
75 | //store our query in a variable
76 | //fileName = the name of documentation
77 | let query = Update.where({versionNo: req.scrapeProps.versionNo,
78 | sourceName: req.scrapeProps.sourceName});
79 | // console.log(res.fileName, res.versionNo, res.filePath);
80 | //Checks database to see if doc already exists
81 | // runs callback found(err,foundUpdate)
82 |
85 | query.findOne( function (err, foundUpdate){
86 | //takes in an err from findOne and the returned Doc
87 | if(err)console.log(err);
88 |
89 | if(!foundUpdate){
90 | update.save( function(err, update){
91 | if(err) {
92 | console.error(err);
93 | }
94 | else {
95 | console.log (`${req.scrapeProps.sourceName} - versionNo:${req.scrapeProps.versionNo} has been added to the database.`);
96 | next();
97 | }
98 | });
99 | }
100 |
101 | if ( foundUpdate ){ // if the Doc exists update
102 | //currently only updating the Date - can handle version numbers at a later date
103 | query.findOneAndUpdate( {retrieved: Date.now()}, function(err, newInfo){
104 | if (err) console.log(err);
105 | else{
106 | console.log("NewInfo ", newInfo);
107 | next();
108 | }
109 | });
110 | }
111 | });
112 | }
113 | };
114 |
--------------------------------------------------------------------------------
/server/controllers/mdnJS.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | const cheerio = require( 'cheerio' );
3 | const request = require( 'request' );
4 | const fs = require( 'fs' );
5 | const targz = require( 'tar.gz' );
6 | const zlib = require( 'zlib' );
7 | const path = require( 'path' );
8 | const tar = require( 'tar' );
9 | const SQL = require( 'sql.js' );
10 | const archiver = require( 'archiver' );
11 |
12 | let mdn = {
13 |
14 | /*
15 | * This function goes to kapeli.com, grabs the Javascript link,
16 | * then attaches it to the req obj
17 | */
18 |
19 | download: function ( req, res, next ) {
20 | request( 'https://kapeli.com/mdn_offline', function ( err, html ) {
21 | if ( err ) console.log( err );
22 | let $ = cheerio.load( html.body );
23 |
24 |       //Only use the link that contains the text 'JavaScript.tgz'
25 | let downloadLink = "https://kapeli.com/" + $( ".download:contains('JavaScript.tgz')" )
26 | .attr( "href" );
27 | req.JSdownloadLink = downloadLink;
28 | next();
29 | } );
30 | },
31 | //downloads tar file from kapeli.com
32 | getJavascript: function ( req, res, next ) {
33 | //downloading 116 MB .tar to disk
34 |
35 | //Check if js file exists
36 |
37 | let write = fs.createWriteStream( './JavaScript.tgz' );
38 |
39 | ///////////////////////////////////////////////////////
40 | // using the request stream as a ReadStream
41 | // NOTE: req.downloadLink initialized in mdn.download
42 | //////////////////////////////////////////////////////
43 | let read = request( req.JSdownloadLink )
44 | .on( 'error', function ( err ) {
45 | throw err;
46 | } )
47 | .pipe( write );
48 |
49 | //just to log bytes written - not necessary
50 | let watcher = fs.watch( './JavaScript.tgz' )
51 | .on( 'change', function () {
52 | let bytes=(read.bytesWritten/1000000).toFixed(2);
53 | require('single-line-log').stdout('JS: ',bytes +' MB');
54 | });
55 | //close readStream and watcher
56 | read.on( 'finish', function () {
57 | read.close( function(){
58 | watcher.close();
59 | next();
60 | });
61 | } );
62 | },
63 | extract: function ( req, res, next ) {
64 | console.log( 'extracting...' );
65 | let inflate = zlib.Unzip();
66 | let extractor = tar.Extract( {
67 | path: './docs'
68 | } )
69 | .on( 'error', function ( err ) {
70 | throw err;
71 | } )
72 |       .on( 'end', function () {
73 |         console.log( 'extracted' );
74 |         next(); //continue only after tar has written every entry
75 |       } );
75 | let extracting = fs.createReadStream( './JavaScript.tgz' )
76 | .on( 'error', function ( err ) {
77 | throw err;
78 | } )
79 | .pipe( inflate )
80 | .pipe( extractor );
81 |     extracting.on( 'finish', function () {
82 |       // next(); //moved to the extractor's 'end' handler above
83 |     } );
84 | },
85 | createClassObj: function ( req, res, next ) {
86 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/API/';
87 | let classObj = {};
88 |
89 | fs.readdir( './docs/' + base, function ( err, files ) {
90 | if ( err ) console.log( err );
91 | files = files.filter( elem => {
92 | return elem.includes( '.html' );
93 | } );
94 | for ( let k of files ) {
95 | classObj[ k.replace( '.html', "" ) ] = base + k;
96 | }
97 | req.classObj = classObj;
98 | next();
99 | } );
100 | },
101 | createMethodsObj: function ( req, res, next ) {
102 | function getDirectories( srcpath ) {
103 | return fs.readdirSync( srcpath )
104 | .filter( function ( file ) {
105 | return fs.statSync( path.join( srcpath, file ) )
106 | .isDirectory();
107 | } );
108 | }
109 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/API/';
110 | let methodObj = {};
111 |
112 |     let directories = getDirectories( './docs/' + base );
113 |     let pending = directories.length; //readdir callbacks still outstanding
114 |     directories.forEach( elem => {
115 |       fs.readdir( `docs/${base}/${elem}`, function ( err, files ) {
116 |         if ( err ) console.log( err );
117 |         files.forEach( fileElem => {
118 |           let key = `${elem}.${fileElem}`;
119 |           methodObj[ key.replace( ".html", "" ) ] = `${base}/${elem}/${fileElem}`;
120 |         } );
121 |         //only call next() once every directory has been read
122 |         if ( --pending === 0 ) {
123 |           req.methodObj = methodObj;
124 |           next();
125 |         }
126 |       } );
127 |     } );
123 | },
124 | createEventObj: function ( req, res, next ) {
125 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/Events/';
126 | let eventsObj = {};
127 |
128 | fs.readdir( './docs/' + base, function ( err, files ) {
129 | if ( err ) console.log( err );
130 | files = files.filter( elem => {
131 | return elem.includes( '.html' );
132 | } );
133 | for ( let k of files ) {
134 | eventsObj[ k.replace( '.html', "" ) ] = base + k;
135 | }
136 | req.eventsObj = eventsObj;
137 | next();
138 | } );
139 | },
140 | createKWObj: function ( req, res, next ) {
141 | let base1 = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/';
142 | let base2 = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/';
143 |     let KWObj = {};
144 |     let pending = 2; //wait for both readdir callbacks before calling next()
145 |     fs.readdir( './docs/' + base1, function ( err, files ) {
146 |       if ( err ) console.log( err );
147 |       files = files.filter( elem => {
148 |         return elem.includes( '.html' );
149 |       } );
150 |       for ( let k of files ) {
151 |         KWObj[ k.replace( '.html', "" ) ] = base1 + k;
152 |       }
153 |       if ( --pending === 0 ) { req.KWObj = KWObj; next(); }
154 |     } );
155 |     fs.readdir( './docs/' + base2, function ( err, files ) {
156 |       if ( err ) console.log( err );
157 |       files = files.filter( elem => {
158 |         return elem.includes( '.html' );
159 |       } );
160 |       for ( let k of files ) {
161 |         KWObj[ k.replace( '.html', "" ) ] = base2 + k;
162 |       }
163 |       if ( --pending === 0 ) { req.KWObj = KWObj; next(); }
164 |     } );
164 | },
165 | createFuncObj: function ( req, res, next ) {
166 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/';
167 | let funcObj = {};
168 |
169 | fs.readdir( './docs/' + base, function ( err, files ) {
170 | if ( err ) console.log( err );
171 | files = files.filter( elem => {
172 | return elem.includes( '.html' );
173 | } );
174 | for ( let k of files ) {
175 | funcObj[ k.replace( '.html', "" ) ] = base + k;
176 | }
177 | req.funcObj = funcObj;
178 | next();
179 | } );
180 | },
181 | sqlFile: function ( req, res, next ) {
182 | let i = 0;
183 | let objects = {
184 | function: req.funcObj,
185 | key_word: req.KWObj,
186 | events: req.eventsObj,
187 | methods: req.methodObj,
188 | class: req.classObj
189 | };
190 |
191 | let db = new SQL.Database();
192 | db.run( "CREATE TABLE docsearch (ID int, NAME char, TYPE char, LINK char);" );
193 |
194 | for ( let k in objects ) {
195 | console.log( k );
196 | for ( let j in objects[ k ] ) {
197 | db.run( "INSERT INTO docsearch VALUES (:ID, :NAME, :TYPE, :LINK)", {
198 | ':ID': i++,
199 | ':NAME': j,
200 | ':TYPE': k,
201 | ':LINK': objects[ k ][ j ]
202 | } );
203 | }
204 | }
205 | let data = db.export();
206 |     let buffer = Buffer.from( data );
207 |
208 | fs.writeFileSync( "docs/mdn_javascript.sqlite", buffer );
209 |
210 | next();
211 | },
212 | zip: function ( req, res, next ) {
213 | console.log('zipping');
214 | let output = fs.createWriteStream( 'zips/mdn/javascript/mdn_javascript.zip');
215 | let archive = archiver('zip');
216 |
217 | output.on('close', function() {
218 | fs.unlink('./JavaScript.tgz', (err) => {
219 | if(err) console.log(err);
220 | console.log(archive.pointer() + ' total bytes');
221 |         console.log('archiver has been finalized and the output file descriptor has closed.');
222 |         next(); //continue only once the zip is fully written
223 |       } );
223 | });
224 |
225 | archive.on('error', function(err) {
226 | throw err;
227 | });
228 |
229 | archive.pipe(output);
230 |
231 | archive.bulk([
232 | { expand: true, cwd: 'docs/', src: ['**'], dest:'mdn_javascript.docs' }
233 | ]);
234 |
235 | archive.finalize();
237 | }
238 | };
239 |
240 |
241 | module.exports = mdn;
242 |
--------------------------------------------------------------------------------
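These controller functions are built to run as an Express middleware chain: each step attaches its output to `req` (`req.JSdownloadLink`, `req.classObj`, `req.methodObj`, ...) and calls `next()`. A hypothetical wiring sketch; the route path and exact ordering here are assumptions, not taken from this repo's server.js:

```javascript
const express = require('express');
const mdn = require('./server/controllers/mdnJS');

const app = express();
// Hypothetical route: each middleware reads what the previous one put on req
app.get('/update/mdn-js',
  mdn.download, mdn.getJavascript, mdn.extract,
  mdn.createClassObj, mdn.createMethodsObj, mdn.createEventObj,
  mdn.createKWObj, mdn.createFuncObj, mdn.sqlFile, mdn.zip,
  (req, res) => res.send('MDN JavaScript docs updated'));

app.listen(3000);
```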
/server/controllers/updateModel.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | var mongoose = require('mongoose');
3 | var Schema = mongoose.Schema;
4 |
5 | var updateSchema = new Schema({
6 | sourceName: String,
7 | versionNo: String,
8 | filePath: String,
9 | retrieved: { type: Date, default: Date.now },
10 | createdAt: {type: Date, default: Date.now}
11 | });
12 |
13 | module.exports = mongoose.model('Update', updateSchema);
14 |
--------------------------------------------------------------------------------
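For reference, the lookup that dbController.latestVer performs against this model can be written directly as follows (the source name is an example, and this assumes `mongoose.connect(...)` has already been called):

```javascript
const Update = require('./server/controllers/updateModel');

// Newest Update document for a given source, by creation time
Update.findOne({ sourceName: 'MDN CSS' })
  .sort({ createdAt: -1 })
  .exec((err, doc) => {
    if (err) return console.error(err);
    console.log(doc && doc.filePath); // path of the most recently created zip
  });
```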
/server/middleware/folderHandler.js:
--------------------------------------------------------------------------------
1 | var fs = require('fs');
2 |
3 | var folderHandler = {
4 | checkOrCreateFolder: function(path){
5 | if(this.checkFolders(path)){
6 | console.log("Folder exists for zip file, continue");
7 | }
8 | else{
9 | fs.mkdir(path, err => {
10 |         if(err){ console.error(err); }
11 |         console.log("Zip folder does not exist, creating");
12 | })
13 | }
14 | },
15 | checkToDelete: function(path){
16 |     // if the directory exists, delete it
17 |     if(this.checkFolders(path)){
18 |       //We need to delete the directory
19 |       console.log("Temp folder exists, deleting");
20 | this.deleteFolderRecursive(path);
21 | }
22 | else{
23 | console.log("Temp folder does not exist, continuing");
24 | }
25 | },
26 | //Generic function to check if folder exists
27 | checkFolders: function(path){
28 |     // Use try: if the dir does not exist, statSync will throw an error
29 |     try{
30 |       var stats = fs.statSync(path);
32 | if(stats.isDirectory()){
33 | return true;
34 | }
35 | }
36 | catch(err){
37 | if(err){
38 | // console.log(err, 'Folder does not exist');
39 | return false
40 | }
41 | }
42 | },
43 |   //Recursively delete folders (synchronous; could be made async later)
44 | deleteFolderRecursive: function(path) {
45 | var that = this;
46 | if( fs.existsSync(path) ) {
47 | fs.readdirSync(path).forEach(function(file,index){
48 | var curPath = path + "/" + file;
49 | if(fs.lstatSync(curPath).isDirectory()) { // recurse
50 | that.deleteFolderRecursive(curPath);
51 | } else { // delete file
52 | fs.unlinkSync(curPath);
53 | }
54 | });
55 | fs.rmdirSync(path);
56 | }
57 | }
58 | }
59 |
60 | module.exports = folderHandler;
61 |
--------------------------------------------------------------------------------
/server/middleware/mdnCSS.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | const cheerio = require( 'cheerio' );
3 | const request = require( 'request' );
4 | const fs = require( 'fs' );
5 | const targz = require( 'tar.gz' );
6 | const zlib = require( 'zlib' );
7 | const path = require( 'path' );
8 | const tar = require( 'tar' );
9 | const archiver = require( 'archiver' );
10 | const folderHandler = require('./folderHandler');
11 |
12 |
13 | let mdnCSS = {
14 | /*
15 |    * This function goes to kapeli.com, grabs the CSS link,
16 | * then attaches it to the req obj
17 | */
18 | download: function ( req, res, next ) {
19 | request( 'https://kapeli.com/mdn_offline', function ( err, html ) {
20 | if ( err ) console.log( err );
21 | let $ = cheerio.load( html.body );
22 |
23 |       //Only use the link that contains the text 'CSS.tgz'
24 | let CSSdownloadLink = "https://kapeli.com/" + $( ".download:contains('CSS.tgz')" )
25 | .attr( "href" );
26 | req.CSSdownloadLink = CSSdownloadLink;
27 | next();
28 | } );
29 | },
30 | //downloads tar file from kapeli.com
31 | getCSS: function ( req, res, next ) {
32 | //NOTE:downloading 22 MB .tar to disk
33 |
34 | let write = fs.createWriteStream( './temp/CSS.tgz' );
35 |
36 | ///////////////////////////////////////////////////////
37 | // using the request stream as a ReadStream
38 | // NOTE: req.CSSdownloadLink initialized in mdn.download
39 | //////////////////////////////////////////////////////
40 | let read = request( req.CSSdownloadLink )
41 | .on( 'error', function ( err ) {
42 | throw err;
43 | } )
44 | .pipe( write );
45 |
46 | //just to log bytes written - not necessary
47 | let watcher = fs.watch( './temp/CSS.tgz' )
48 | .on( 'change', function () {
49 | let bytes=(read.bytesWritten/1000000).toFixed(2);
50 | require('single-line-log').stdout('CSS: ',bytes +' MB');
51 | });
52 | //close readStream and watcher
53 | read.on( 'finish', function () {
54 | read.close( function(){
55 | watcher.close();
56 | next();
57 | });
58 | } );
59 | },
60 | extract: function ( req, res, next ) {
61 | console.log( 'extracting...' );
62 | let inflate = zlib.Unzip();
63 | let extractor = tar.Extract( {
64 | path: './docs/mdn/css/documents'
65 | } )
66 | .on( 'error', function ( err ) {
67 | throw err;
68 | } )
69 | .on( 'end', function () {
70 | console.log( 'extracted' );
71 | next();
72 | } );
73 | let extracting = fs.createReadStream( './temp/CSS.tgz' )
74 | .on( 'error', function ( err ) {
75 | throw err;
76 | } )
77 | .pipe( inflate )
78 | .pipe( extractor );
79 | extracting.on( 'finish', function () {
80 | // next();
81 | } );
82 | },
83 | getObjs: function(req, res, next){
84 | let base = 'CSS/developer.mozilla.org/en-US/docs/Web/CSS/';
85 | let $ = cheerio.load(fs.readFileSync('./docs/mdn/css/documents/CSS/developer.mozilla.org/en-US/docs/Web/CSS/Reference.html'));
86 | let classObj = {};
87 | let elemObj = {};
88 | let funcObj = {};
89 | let typesObj = {};
90 | let propObj = {};
91 | let guideObj = {};
92 | $('div .index a').each((i, el) => {
93 | let text = $(el).text();
94 | let link = $(el).attr('href');
95 | let classReg = new RegExp (/^:[^:].+/g );
96 | let elemReg = new RegExp (/^::/g );
97 | let funcReg = new RegExp (/^@|\(\)$/g );
98 |       //NOTE: several lines here were garbled in this copy; the if/else chain and the
99 |       //guide selector below are a reconstruction, and 'div .index a' is an assumption
100 |       let typeReg = new RegExp (/^</g ); //CSS data types such as <color> start with '<'
101 |       if(classReg.test(text)){
102 |         classObj[text] = base + link;
103 |       }
104 |       else if(elemReg.test(text)){
105 |         elemObj[text] = base + link;
106 |       }
107 |       else if(funcReg.test(text)){
108 |         funcObj[text] = base + link;
109 |       }
110 |       else if(typeReg.test(text)){
111 |         typesObj[text] = base + link;
112 |       }
113 |       else{
114 |         propObj[text] = base + link;
115 |       }
116 |     });
117 |     $('div .index a').each((i, el) => {
115 | guideObj[$(el).text()] = base + $(el).attr('href');
116 | });
117 | req.classObj = classObj;
118 | req.elemObj = elemObj;
119 | req.funcObj = funcObj;
120 | req.typesObj = typesObj;
121 | req.propObj = propObj;
122 | req.guideObj = guideObj;
123 | next();
124 | },
125 | getMoz : function(req, res, next){
126 | let base = 'CSS/developer.mozilla.org/en-US/docs/Web/CSS/';
127 | let $ = cheerio.load(fs.readFileSync('./docs/mdn/css/documents/CSS/developer.mozilla.org/en-US/docs/Web/CSS/Mozilla_Extensions.html'));
128 |
129 | $('div .index a').each((i, el) => {
130 | let text = $(el).text();
131 | let link = $(el).attr('href');
132 | let classReg = new RegExp (/^:[^:].+/g );
133 | let elemReg = new RegExp (/^::/g );
134 | if(classReg.test(text)){
135 | req.classObj[text] = base + link;
136 | }
137 | if(elemReg.test(text)){
138 | req.elemObj[text] = base + link;
139 | }
140 | });
141 | next();
142 | },
143 | sqlFile: function ( req, res, next ) {
144 | let i = 0;
145 | let jsonIndex = {"sourceName": req.scrapeProps.sourceName,
146 | "versionNo": req.scrapeProps.versionNo, "result": []};
147 | let objects = {
148 | Classes:req.classObj ,
149 | Elements:req.elemObj,
150 | Functions:req.funcObj ,
151 | Types:req.typesObj ,
152 | Properties:req.propObj,
153 | Guides:req.guideObj
154 | };
155 | req.classObj = null;
156 | req.elemObj = null;
157 | req.funcObj = null;
158 | req.typesObj = null;
159 | req.propObj = null;
160 | req.guideObj = null;
161 | for ( let k in objects ) {
162 | // console.log( k );
163 | for ( let j in objects[ k ] ) {
164 | jsonIndex.result.push({"NAME": j, "TYPE": k, "LINK": objects[k][j]});
165 | }
166 | }
167 | jsonIndex = JSON.stringify(jsonIndex);
168 | fs.writeFileSync( "docs/mdn/css/index.json", jsonIndex );
169 | //Null out jsonIndex
170 | jsonIndex = null;
171 | next();
172 | },
173 | zip: function ( req, res, next ) {
174 | let output = fs.createWriteStream( 'zips/mdn/mdn_css'+req.scrapeProps.versionNo+'.zip');
175 | let archive = archiver('zip');
176 | req.scrapeProps.filePath = './zips/mdn/mdn_css'+req.scrapeProps.versionNo+'.zip';
177 | output.on('close', function() {
178 | fs.unlink('./temp/CSS.tgz', (err) => {
179 |       if(err) console.log(err);
180 |       //Null out jsonIndex and the req objects
180 | req.classObj = null;
181 | req.elemObj = null;
182 | req.funcObj = null;
183 | req.typesObj = null;
184 | req.propObj = null;
185 | req.guideObj = null;
186 | console.log(archive.pointer() + ' total bytes');
187 | folderHandler.deleteFolderRecursive(req.scrapeProps.baseDir);
188 | console.log('archiver has been finalized and the output file descriptor has closed.');
189 | next();
190 | });
191 | });
192 |
193 | archive.on('error', function(err) {
194 | throw err;
195 | });
196 |
197 | archive.pipe(output);
198 |
199 | archive.bulk([
200 | { expand: true, cwd: 'docs/mdn/css/', src: ['**'], dest:'mdn_css.docs' }
201 | ]);
202 |
203 | archive.finalize();
204 | }
205 | };
206 |
207 |
208 | module.exports = mdnCSS;
209 |
--------------------------------------------------------------------------------
/server/middleware/mdnHTML.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 | const cheerio = require( 'cheerio' );
3 | const request = require( 'request' );
4 | const fs = require( 'fs' );
5 | const targz = require( 'tar.gz' );
6 | const zlib = require( 'zlib' );
7 | const path = require( 'path' );
8 | const tar = require( 'tar' );
9 | const archiver = require( 'archiver' );
10 | const folderHandler = require('./folderHandler');
11 |
12 | let mdnHTML = {
13 | /*
14 | * This function goes to kapeli.com, grabs the HTML link,
15 | * then attaches it to the req obj
16 | */
17 | download: function ( req, res, next ) {
18 | request( 'https://kapeli.com/mdn_offline', function ( err, html ) {
19 | if ( err ) console.log( err );
20 | let $ = cheerio.load( html.body );
21 |
22 | //Only use the link that contains the text 'HTML.tgz'
23 | let HTMLdownloadLink = "https://kapeli.com/" + $( ".download:contains('HTML.tgz')" )
24 | .attr( "href" );
25 | req.HTMLdownloadLink = HTMLdownloadLink;
26 | next();
27 | } );
28 | },
29 | //downloads tar file from kapeli.com
30 | getHTML: function ( req, res, next ) {
31 | //NOTE:downloading 24 MB .tar to disk
32 | try {
33 | fs.mkdirSync('./temp');
34 | } catch (e) {
35 | console.log('./temp already exists');
36 | }
37 |
38 | let write = fs.createWriteStream( './temp/HTML.tgz' );
39 |
40 | ///////////////////////////////////////////////////////
41 | // using the request stream as a ReadStream
42 | // NOTE: req.downloadLink initialized in mdn.download
43 | //////////////////////////////////////////////////////
44 | let read = request( req.HTMLdownloadLink )
45 | .on( 'error', function ( err ) {
46 | throw err;
47 | } )
48 | .pipe( write );
49 |
50 | //just to log bytes written - not necessary
51 | let watcher = fs.watch( './temp/HTML.tgz' )
52 | .on( 'change', function () {
53 | let bytes=(read.bytesWritten/1000000).toFixed(2);
54 | require('single-line-log').stdout('HTML: ',bytes +' MB');
55 | });
56 | //close readStream and watcher
57 | read.on( 'finish', function () {
58 | read.close( function(){
59 | watcher.close();
60 | next();
61 | });
62 | } );
63 | },
64 | extract: function ( req, res, next ) {
65 | console.log( 'extracting...' );
66 | let inflate = zlib.Unzip();
67 | let extractor = tar.Extract( {
68 | path: './docs/mdn/html/documents/'
69 | } )
70 | .on( 'error', function ( err ) {
71 | console.log(err);
72 | } )
73 | .on( 'end', function () {
74 | console.log( 'extracted' );
75 | next();
76 | } );
77 | let extracting = fs.createReadStream( './temp/HTML.tgz' )
78 | .on( 'error', function ( err ) {
79 | console.log(err);
80 | } )
81 | .pipe( inflate )
82 | .pipe( extractor );
83 | extracting.on( 'finish', function () {
84 | // next();
85 | } );
86 | },
87 | getElements: function ( req, res, next ) {
88 | let base = 'HTML/developer.mozilla.org/en-US/docs/Web/HTML/Element',
89 | attrObj = {},
90 | elemObj = {};
91 |
92 | fs.readdir( './docs/mdn/html/documents/' + base, function ( err, files ) {
93 | if ( err ) console.log( err );
94 | files = files.filter( elem => {
95 | return elem.includes( '.html' ) && !elem.includes( '.dashtoc' );
96 | } );
97 | for ( let file of files ) {
98 | let nameOfElem = file.replace( '.html', "" ),
99 | attrLinks = [],
100 | attrIds;
101 |
102 | let $ = cheerio.load( fs.readFileSync( `./docs/mdn/html/documents/${base}/${file}` ) );
103 |
104 | $( "a[name*='attr-']" ).each( (i , el) => {
105 | if($(el).attr('name')){
106 | attrIds = $( el ).attr('name').replace(/attr-/g, "");
107 | $(el).attr('id', attrIds);
108 | console.log($(el).attr('id'));
109 | attrObj[`${nameOfElem}.${attrIds}`] = `${base}/${file}#${attrIds}`;
110 | }
111 | });
112 | var html = $.html();
113 | fs.writeFileSync( `./docs/mdn/html/documents/${base}/${file}`, html );
114 | // console.log(attrObj);
115 | elemObj[ nameOfElem ] = `${base}/${file}`; //keep the / separator, matching the attrObj links above
116 | }
117 |
118 | req.elemObj = elemObj;
119 | req.attrObj = attrObj;
120 | next();
121 | } );
122 | },
123 | sqlFile: function ( req, res, next ) {
124 | let i = 0;
125 | // let db = new SQL.Database();
126 | // db.run( "CREATE TABLE docsearch (ID int, NAME char, TYPE char, LINK char);" );
127 | let jsonIndex = {"sourceName": req.scrapeProps.sourceName,
128 | "versionNo": req.scrapeProps.versionNo, "result": []};
129 | for ( let elemName in req.elemObj ) {
130 | jsonIndex.result.push({"NAME": elemName, "TYPE": "element", "LINK": req.elemObj[elemName]});
131 | // ':ID': i++,
132 | // ':NAME': elemName,
133 | // ':TYPE': "element",
134 | // ':LINK': req.elemObj[ elemName ]
135 | }
136 | for ( let attrName in req.attrObj ) {
137 | jsonIndex.result.push({"NAME": attrName, "TYPE": "attribute", "LINK": req.attrObj[attrName]});
138 | // ':ID': i++,
139 | // ':NAME': attrName,
140 | // ':TYPE': "attribute",
141 | // ':LINK': req.attrObj[attrName]
142 | }
143 | // let data = db.export();
144 | jsonIndex = JSON.stringify(jsonIndex);
145 | fs.writeFileSync( "./docs/mdn/html/index.json", jsonIndex );
146 | //Null out jsonIndex
147 | jsonIndex = null;
148 | next();
149 | },
150 |
151 | zip: function ( req, res, next ) {
152 | let output = fs.createWriteStream( './zips/mdn/mdn_html'+req.scrapeProps.versionNo+'.zip');
153 | let archive = archiver('zip');
154 | req.scrapeProps.filePath = './zips/mdn/mdn_html'+req.scrapeProps.versionNo+'.zip';
155 |
156 | output.on('close', function() {
157 | fs.unlink('./temp/HTML.tgz', (err) => {
158 | if(err) console.log(err);
159 | req.elemObj = null;
160 | req.attrObj = null;
161 | console.log(archive.pointer() + ' total bytes');
162 | folderHandler.deleteFolderRecursive(req.scrapeProps.baseDir);
163 | console.log('archiver has been finalized and the output file descriptor has closed.');
164 | next();
165 | });
166 | });
167 |
168 | archive.on('error', function(err) {
169 | throw err;
170 | });
171 |
172 | archive.pipe(output);
173 |
174 | archive.bulk([
175 | { expand: true, cwd: 'docs/mdn/html', src: ['**'], dest:'mdn_html.docs' }
176 | ]);
177 |
178 | archive.finalize();
179 | }
180 | };
181 |
182 |
183 | module.exports = mdnHTML;
184 |
--------------------------------------------------------------------------------
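
A note on the download pattern above: getHTML pipes the request stream into a
WriteStream and then polls the file with fs.watch purely to report progress.
fs.watch is platform-dependent, so here is a minimal sketch of the same
request -> WriteStream pipe that counts bytes off the response's 'data' events
instead (downloadWithProgress, url, dest and done are illustrative names, not
part of this repo):

    var request = require('request');
    var fs = require('fs');

    function downloadWithProgress(url, dest, done) {
        var written = 0;
        var write = fs.createWriteStream(dest);
        request(url)
            .on('data', function (chunk) {
                //count bytes off the response itself instead of watching the file
                written += chunk.length;
                process.stdout.write('\r' + (written / 1000000).toFixed(2) + ' MB');
            })
            .on('error', done)
            .pipe(write);
        write.on('finish', function () {
            done(null, written);
        });
    }
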
/server/middleware/mdnJS.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 | const cheerio = require( 'cheerio' );
3 | const request = require( 'request' );
4 | const fs = require( 'fs' );
5 | const zlib = require( 'zlib' );
6 | const path = require( 'path' );
7 | const tar = require( 'tar' );
8 | const archiver = require( 'archiver' );
9 | const folderHandler = require('./folderHandler');
10 |
11 | let mdnJS = {
12 |
13 | /*
14 | * This function goes to kapeli.com, grabs the Javascript link,
15 | * then attaches it to the req obj
16 | */
17 |
18 | download: function ( req, res, next ) {
19 | request( 'https://kapeli.com/mdn_offline', function ( err, html ) {
20 | if ( err ) return next( err ); //bail out before cheerio.load sees an undefined body
21 | let $ = cheerio.load( html.body );
22 | var d = new Date();
23 | console.log("requesting ", d.getMinutes(), ":", d.getSeconds());
24 | //Only use the link that contains the text 'Javascript.tgz'
25 | let downloadLink = "https://kapeli.com/" + $( ".download:contains('JavaScript.tgz')" )
26 | .attr( "href" );
27 | //use the scraped kapeli.com link
28 | req.downloadLink = downloadLink;
29 | next();
30 | } );
31 | },
32 | //downloads tar file from kapeli.com
33 | getJavascript: function ( req, res, next ) {
34 | //downloading a ~116 MB .tgz to disk
35 |
36 | //make sure ./temp exists before streaming into it (mirrors mdnHTML.getHTML)
37 | try { fs.mkdirSync('./temp'); } catch (e) { /* already exists */ }
38 | let write = fs.createWriteStream( './temp/JavaScript.tgz' );
39 | var d = new Date();
40 | console.log("Downloading ", d.getMinutes(), ":", d.getSeconds());
41 | ///////////////////////////////////////////////////////
42 | // using the request stream as a ReadStream
44 | // NOTE: req.downloadLink initialized in mdnJS.download
44 | //////////////////////////////////////////////////////
45 | let read = request( req.downloadLink )
46 | .on( 'error', function ( err ) {
47 | throw err;
48 | } )
49 | .pipe( write );
50 |
51 | //just to log bytes written - not necessary
52 | // let watcher = fs.watch( './temp/JavaScript.tgz' )
53 | // .on( 'change', function () {
54 | // let bytes=(read.bytesWritten/1000000).toFixed(2);
55 | // // console.log( bytes +' MB');
56 | // require('single-line-log').stdout(bytes +' MB')
57 | // } );
58 | //close the write stream (pipe() returns its destination)
59 | read.on( 'finish', function () {
60 | read.close( function(){
61 | console.log("done ", d.getMinutes(), ":", d.getSeconds());
62 | // watcher.close();
63 | // res.send("DONE")
64 | next();
65 | });
66 | } );
67 | },
68 | extract: function ( req, res, next ) {
69 | console.log( 'extracting...' );
70 | var d = new Date();
71 | console.log("extracting ", d.getMinutes(), ":", d.getSeconds());
72 | let inflate = zlib.createUnzip();
73 | let extractor = tar.Extract( {
74 | path: './docs/mdn/javascript/documents'
75 | } )
76 | .on( 'error', function ( err ) {
77 | throw err;
78 | } )
79 | .on( 'end', function () {
80 | console.log( 'extracted' );
81 | next();
82 | } );
83 | let extracting = fs.createReadStream( './temp/JavaScript.tgz' )
84 | .on( 'error', function ( err ) {
85 | throw err;
86 | } )
87 | .pipe( inflate )
88 | .pipe( extractor );
89 | extracting.on( 'finish', function () {
90 | // next();
91 | } );
92 | },
93 | createClassObj: function ( req, res, next ) {
94 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/API/';
95 | let classObj = {};
96 | var d = new Date();
97 | console.log(d.getMinutes(), d.getSeconds());
98 | fs.readdir( './docs/mdn/javascript/documents/' + base, function ( err, files ) {
99 | if ( err ) console.log( err );
100 | // console.log(files);
101 | files = files.filter( elem => {
102 | return elem.includes( '.html' );
103 | } );
104 | for ( let k of files ) {
105 | classObj[ k.replace( '.html', "" ) ] = base + k;
106 | }
107 | req.classObj = classObj;
108 | next();
109 | } );
110 | },
111 | createMethodsObj: function ( req, res, next ) {
112 | function getDirectories( srcpath ) {
113 | return fs.readdirSync( srcpath )
114 | .filter( function ( file ) {
115 | return fs.statSync( path.join( srcpath, file ) )
116 | .isDirectory();
117 | } );
118 | }
119 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/API';
120 | let methodObj = {};
121 |
122 | let directories = getDirectories( './docs/mdn/javascript/documents/' + base );
123 |
124 | directories.forEach( elem => {
125 | //readdirSync keeps methodObj complete before next() fires; an async
126 | //readdir here would let next() run before any callback had populated it
127 | let files = fs.readdirSync( `docs/mdn/javascript/documents/${base}/${elem}` );
128 | files.forEach( fileElem => {
129 | let key = `${elem}.${fileElem}`;
130 | methodObj[ key.replace( ".html", "" ) ] = `${base}/${elem}/${fileElem}`;
131 | } );
132 | } );
133 | req.methodObj = methodObj;
134 | next();
135 | },
136 | createEventObj: function ( req, res, next ) {
137 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/Events/';
138 | let eventsObj = {};
139 |
140 | fs.readdir( './docs/mdn/javascript/documents/' + base, function ( err, files ) {
141 | if ( err ) console.log( err );
142 | files = files.filter( elem => {
143 | return elem.includes( '.html' );
144 | } );
145 | for ( let k of files ) {
146 | eventsObj[ k.replace( '.html', "" ) ] = base + k;
147 | }
148 | req.eventsObj = eventsObj;
149 | next();
150 | } );
151 | },
152 | createKWObj: function ( req, res, next ) {
153 | let base1 = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/';
154 | let base2 = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/';
155 | let KWObj = {};
156 | fs.readdir( './docs/mdn/javascript/documents/' + base1, function ( err, files ) {
157 | if ( err ) console.log( err );
158 | files = files.filter( elem => {
159 | return elem.includes( '.html' );
160 | } );
161 | for ( let k of files ) {
162 | KWObj[ k.replace( '.html', "" ) ] = base1 + k;
163 | }
164 | //nest the second readdir so both bases are read before next() fires;
165 | //run in parallel, the two readdirs raced and next() could come too early
166 | fs.readdir( './docs/mdn/javascript/documents/' + base2, function ( err, files ) {
167 | if ( err ) console.log( err );
168 | files = files.filter( elem => elem.includes( '.html' ) );
169 | for ( let k of files ) {
170 | KWObj[ k.replace( '.html', "" ) ] = base2 + k;
171 | }
172 | req.KWObj = KWObj;
173 | next();
174 | } );
175 | } );
176 | },
177 | createFuncObj: function ( req, res, next ) {
178 | let base = 'JavaScript/developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/';
179 | let funcObj = {};
180 |
181 | fs.readdir( './docs/mdn/javascript/documents/' + base, function ( err, files ) {
182 | if ( err ) console.log( err );
183 | files = files.filter( elem => {
184 | return elem.includes( '.html' );
185 | } );
186 | for ( let k of files ) {
187 | funcObj[ k.replace( '.html', "" ) ] = base + k;
188 | }
189 | req.funcObj = funcObj;
190 | next();
191 | } );
192 | },
193 | sqlFile: function ( req, res, next ) {
194 | var d = new Date();
195 | let i = 0;
196 | let objects = {
197 | function: req.funcObj,
198 | key_word: req.KWObj,
199 | events: req.eventsObj,
200 | methods: req.methodObj,
201 | class: req.classObj
202 | };
203 | console.log(d.getMinutes(), d.getSeconds());
204 |
205 | let jsonIndex = {"sourceName": req.scrapeProps.sourceName,
206 | "versionNo": req.scrapeProps.versionNo, "result": []};
207 | for ( let k in objects ) {
208 | // console.log( k );
209 | for ( let j in objects[ k ] ) {
210 | jsonIndex.result.push({"NAME": j, "TYPE": k, "LINK": objects[k][j]});
211 | }
212 | }
213 | jsonIndex = JSON.stringify(jsonIndex);
214 | fs.writeFileSync( "docs/mdn/javascript/index.json", jsonIndex );
215 | //Null out jsonIndex
216 | jsonIndex = null;
217 | next();
218 | },
219 | zip: function ( req, res, next ) {
220 | console.log('zipping');
221 | let output = fs.createWriteStream( './zips/mdn/mdn_javascript'+req.scrapeProps.versionNo+'.zip');
222 | //Add to req
223 | req.scrapeProps.filePath = './zips/mdn/mdn_javascript'+req.scrapeProps.versionNo+'.zip';
224 | let archive = archiver('zip');
225 | var d = new Date();
226 | console.log(d.getMinutes(), d.getSeconds());
227 |
228 | output.on('close', function() {
229 | fs.unlink('./temp/JavaScript.tgz', (err) => {
230 | if(err) console.log(err);
231 | d = new Date();
232 | req.funcObj = null;
233 | req.KWObj = null;
234 | req.eventsObj = null;
235 | req.methodObj = null;
236 | req.classObj = null;
237 | console.log(d.getMinutes(), d.getSeconds());
238 | console.log(archive.pointer() + ' total bytes');
239 | console.log('archiver has been finalized and the output file descriptor has closed.');
240 | folderHandler.deleteFolderRecursive(req.scrapeProps.baseDir);
241 | next();
242 |
243 | } )
244 | });
245 |
246 | archive.on('error', function(err) {
247 | throw err;
248 | });
249 |
250 | archive.pipe(output);
251 |
252 | archive.bulk([
253 | { expand: true, cwd: 'docs/mdn/javascript', src: ['**'], dest:'mdn_javascript.docs' }
254 | ]);
255 |
256 | archive.finalize();
257 | }
258 | };
259 |
260 |
261 | module.exports = mdnJS;
262 |
--------------------------------------------------------------------------------
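
createMethodsObj has to visit one subdirectory per API class before next() can
fire; the synchronous readdir used above keeps the ordering correct but blocks
the event loop. A non-blocking alternative is to count outstanding callbacks
and only fire the continuation when the last readdir returns - a sketch under
that assumption (collectMethods and its arguments are illustrative names, not
part of this repo):

    var fs = require('fs');

    function collectMethods(root, directories, done) {
        var methodObj = {};
        var pending = directories.length;
        if (pending === 0) return done(null, methodObj);
        directories.forEach(function (dir) {
            fs.readdir(root + '/' + dir, function (err, files) {
                if (!err) {
                    files.forEach(function (file) {
                        methodObj[(dir + '.' + file).replace('.html', '')] = dir + '/' + file;
                    });
                }
                //done fires only after every readdir callback has landed
                if (--pending === 0) done(null, methodObj);
            });
        });
    }
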
/server/middleware/nodeparser_working.js:
--------------------------------------------------------------------------------
1 | var fs = require('fs');
2 | var sql = require('sql.js');
3 | var cheerio = require('cheerio');
4 |
5 | module.exports = function parser(file, db, i) {
6 | // var db = new sql.Database();
7 | //initialize sql query
8 | //move outside of function?
9 | var sqlstr = "";
10 | // var sqlstr = "CREATE TABLE docsearch (ID int, NAME char, TYPE char, LINK char);";
11 | // console.log(sqlstr.length)
12 | var filename = file.slice(file.lastIndexOf('/')+1);
13 | var data = fs.readFileSync(file, 'utf-8');
14 | var $ = cheerio.load(data);
15 | var methods = [];
16 | //Keep track of the index independently for the sake of the sql database
17 | //Go through all h3 and h2 headings to get methods, props and events
18 | //Pass in a size so we don't check the previous h2 for class and instead insert the module
19 | function firstPass(ind, el, size){
20 | var name = $(el).parent().parent().text();
21 | //Add href of link to filename
22 | var link = $(el).attr('href');
23 | //Match Methods (they have X.string(blah) )
24 | if(name.match(/\w+\(\w*\)\#$/g)){
25 | name = name.replace(/\(.*\)\#/g, "");
26 | //Handle Class Methods
27 | if(name.match(/^Class\sMethod:\s/)){
28 | name = name.replace(/^Class\sMethod:\s/, "")
29 | }
30 | sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'method', '${filename.concat(link)}');`;
31 | //Push into methods for determining if its an addon page or not
32 | i++;
33 | methods.push($(el).attr('href'));
34 | }
35 | //Properties are similar to method notation but lack the ()
36 | else if(name.match(/\.\w+(?!\()#/g) || name.match(/.+\[.*\]#/g)){
37 | //sometimes classes have a . in them too we will grab classes later
38 | if(!name.match(/Class/)){
39 | name = name.slice(0,-1);
40 | sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'property', '${filename.concat(link)}');`;
41 | i++;
42 | }
43 | }
44 | //Find events they start with Event:
45 | else if(name.match(/^Event:/g)){
46 | //get rid of Event: and # and ''s
47 | name = name.replace(/^Event:\s/g, "").replace(/\'|#/g, "");
48 | if(size === 'h3'){
49 | //Find the previous h2: prevUntil goes up to but not including it, so do one more prev filtered to h2
50 | var classname = $(el).parent().parent().prevUntil('h2').prev('h2').text();
51 | classname = classname.replace(/Class:\s/g, "").slice(0,-1);
52 | }
53 | else if(size === 'h2'){
54 | // console.log())
55 | var classname = filename.slice(0,filename.indexOf('.'));
56 | }
57 | name = classname.concat("."+name);
58 | //Concatenate the classname and event name and
59 | //get rid of # in h2 className
60 | sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'event', '${filename.concat(link)}');`;
61 | i++;
62 |
63 | }
64 | //Keep track of that Index
65 | }
66 | $('h3 a').each((ind,el)=>{
67 | firstPass(ind,el, 'h3')
68 | });
69 | $('h2 a').each((ind, el) =>{
70 | firstPass(ind, el, 'h2')
71 | })
72 | //Check if anything has been put into the sql string, if not, it's not a module.
73 | if(sqlstr.length >= 65){
74 | //Get Module name and put in database
75 | var name = $('#apicontent > h1').text().replace(/#/g, "");
76 | var link = $('#apicontent > h1 a').attr('href');
77 | sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'module', '${filename.concat(link)}');`;
78 | i++;
79 |
80 | //Time to grab classes and other stragglers
81 | $('h2 a').each((ind, el) => {
82 | var name = $(el).parent().parent().text();
83 | //Add href of link to filename
84 | var link = $(el).attr('href');
85 | if(name.match(/^Class\:\s/g)){
86 | //replace the class and get rid of the #
87 | name = name.replace(/^Class\:\s/g, "").replace(/\'/g,"").slice(0, -1);
88 | sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'class', '${filename.concat(link)}');`;
89 | i++;
90 |
91 | }
92 | //Bad semantic html, check for properties that are in h2
93 | // else if(name.match(/\.\w+(?!\()#/g) || name.match(/.+\[.*\]#/g)){
94 | // // name = name.replace(/#$/g, "");
95 | // name = name.replace(/\'/g,"").slice(0,-1);
96 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'property', '${filename.concat(link)}');`;
97 | // i++;
98 | //
99 | // }
100 | // Otherwise they are probably sections / chapters. to be safe, check against matches for
101 | // events props classes and methods
102 | else if(!name.match(/Class|Event|\(.*\)|\.\w+(?!\()/)){
103 | name = name.replace(/\'/g, "").slice(0,-1);
104 | sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'chapter', '${filename.concat(link)}');`;
105 | i++;
106 |
107 | }
108 | })
109 | }
110 |
111 | // fs.writeFileSync('docs/'+filename+".js", sqlstr)
112 | //Insert into sql database
113 | db.run(sqlstr);
114 | return ({"DB": db, "index": i})
115 | // var data = db.export();
116 | // var buff = new Buffer(data);
117 | // // fs.writeFileSync('docs/'+filename+'.sqlite', buff);
118 | // fs.writeFileSync('docs/files.sqlite', buff);
119 |
120 | };
121 |
--------------------------------------------------------------------------------
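
This retired parser builds one long INSERT string by hand, which is why names
have to be scrubbed of quotes before concatenation. For comparison, a small
sketch using sql.js prepared statements, which bind values safely and make the
quote-stripping unnecessary (the row values are made up for illustration):

    var SQL = require('sql.js');

    var db = new SQL.Database();
    db.run('CREATE TABLE docsearch (ID int, NAME char, TYPE char, LINK char);');

    //bound parameters escape quotes for us - no manual .replace(/\'/g, "")
    var stmt = db.prepare('INSERT INTO docsearch VALUES (?, ?, ?, ?)');
    stmt.run([0, "Class: it's fine", 'class', 'example.html#example']);
    stmt.free();
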
/server/middleware/parseEntryPoint.js:
--------------------------------------------------------------------------------
1 | var fs = require('fs');
2 | var parser = require('./parser');
3 | // var sql = require('sql.js')
4 |
5 |
6 | var parseEntry = {
7 | allFiles: function(req, resolve, reject){
8 | // var db = new sql.Database();
9 | //initialize sql query
10 | //move outside of function?
11 | var i = 0;
12 | var jsonFile = {"sourceName": req.scrapeProps.sourceName,
13 | "versionNo": req.scrapeProps.versionNo, "result": []};
14 | //create an object to store the index and the database
15 | // var storage = {"DB": db, "index": i};
16 | // var sqlstr = "CREATE TABLE docsearch (ID int, NAME char, TYPE char, LINK char);";
17 | // db.run(sqlstr)
18 | fs.readdir(req.scrapeProps.downloadDir, (err, file) => {
19 | if (err) return reject(err);
20 | var list = file;
21 | // console.log(storage.DB);
22 | list.forEach((name) => {
23 | // Add directory name to file name for FS
24 | name = req.scrapeProps.downloadDir.concat(name);
25 | if(req.scrapeProps.scrapeDir.slice(0,-1) === 'node'){
26 | //For node, don't parse all.html, it will break the sql
27 | if(name.match(/\.html$/) && !name.match(/all\.html/)){
28 | jsonFile = parser.node(name, jsonFile);
29 | }
30 | }
31 | //Express stuff here
32 | else if(req.scrapeProps.scrapeDir.slice(0,-1) === 'express'){
33 | if(name.match(/\.html$/)){
34 | jsonFile = parser.express(name, jsonFile);
35 | }
36 | }
37 | });
38 | //Export the database so we can write it to file
39 | // var data = db.export();
40 | //Create a buffer for writing to
41 | // var buff = new Buffer(data);
42 | jsonFile = JSON.stringify(jsonFile);
43 | fs.writeFileSync(req.scrapeProps.baseDir+'/index.json', jsonFile);
44 | //Null out jsonFile
45 | jsonFile = null;
46 | //Be sure to resolve the promise when readdir is done
47 | resolve("Resolved");
48 | })
49 | }
50 | }
51 | module.exports = parseEntry;
52 |
--------------------------------------------------------------------------------
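
For reference, the index.json written here has the shape below (the values are
illustrative, not real scraper output):

    {
      "sourceName": "NodeJS",
      "versionNo": "6.2.0",
      "result": [
        { "NAME": "fs.readFile", "TYPE": "method", "LINK": "fs.html#fs_fs_readfile" },
        { "NAME": "fs.ReadStream", "TYPE": "class", "LINK": "fs.html#fs_class_fs_readstream" }
      ],
      "sections": ["chapter", "class", "event", "method", "module", "property"]
    }
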
/server/middleware/parser.js:
--------------------------------------------------------------------------------
1 | var fs = require('fs');
2 | var cheerio = require('cheerio');
3 |
4 | var parser = {
5 | node: function(file, jsonFile){
6 | // sqlstr = "";
7 | var i = 0;
8 | var filename = file.slice(file.lastIndexOf('/')+1);
9 | var data = fs.readFileSync(file, 'utf-8');
10 | var $ = cheerio.load(data);
11 | var methods = [];
12 | //Keep a running count of entries (i) so empty pages can be told apart from modules
13 | //Go through all h3 and h2 headings to get methods, props and events
14 | //Pass in a size so we don't check the previous h2 for class and instead insert the module
15 | function firstPass(ind, el, size){
16 | var name = $(el).parent().parent().text();
17 | //Add href of link to filename
18 | var link = $(el).attr('href');
19 | //Match Methods (they have X.string(blah) )
20 | if(name.match(/\w+\(.*\)\#$/g)){
21 | name = name.replace(/\(.*\)\#/g, "");
22 | //Handle Class Methods
23 | if(name.match(/^Class\sMethod:\s/)){
24 | name = name.replace(/^Class\sMethod:\s/, "");
25 | }
26 | jsonFile.result.push({"NAME": name, "TYPE": "method", "LINK":filename.concat(link)});
27 | //Push into methods to help determine whether it's an addon page
28 | i++;
29 | methods.push($(el).attr('href'));
30 | }
31 | //Properties are similar to method notation but lack the ()
32 | else if(name.match(/\.\w+(?!\()#/g) || name.match(/.+\[.*\]#/g)){
33 | //sometimes classes have a . in them too; we grab classes later
34 | if(!name.match(/Class/)){
35 | name = name.slice(0,-1);
36 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'property', '${filename.concat(link)}');`;
37 | jsonFile.result.push({"NAME": name, "TYPE": "property", "LINK":filename.concat(link)});
38 |
39 | i++;
40 | }
41 | }
42 | //Find events they start with Event:
43 | else if(name.match(/^Event:/g)){
44 | //get rid of Event: and # and ''s
45 | name = name.replace(/^Event:\s/g, "").replace(/\'|#/g, "");
46 | var classname;
47 | if(size === 'h3'){
48 | //Find the previous h2: prevUntil goes up to but not including it, so do one more prev filtered to h2
49 | classname = $(el).parent().parent().prevUntil('h2').prev('h2').text();
50 | classname = classname.replace(/Class:\s/g, "").slice(0,-1);
51 | }
52 | else if(size === 'h2'){
53 | classname = filename.slice(0,filename.indexOf('.'));
54 | }
55 | name = classname.concat("."+name);
56 | //Concatenate the classname and event name and
57 | //get rid of # in h2 className
58 | // sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'event', '${filename.concat(link)}');`;
59 | jsonFile.result.push({"NAME": name, "TYPE": "event", "LINK":filename.concat(link)});
60 | i++;
61 |
62 | }
63 | //Keep track of that Index
64 | }
65 | $('h3 a').each((ind,el)=>{
66 | firstPass(ind,el, 'h3');
67 | });
68 | $('h2 a').each((ind, el) =>{
69 | firstPass(ind, el, 'h2');
70 | });
71 | //Check whether anything was added (i counts entries); if not, this page isn't a module.
72 | if(i >= 1){
73 | //Get Module name and put in database
74 | var name = $('#apicontent > h1').text().replace(/#/g, "");
75 | var link = $('#apicontent > h1 a').attr('href');
76 | // sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'module', '${filename.concat(link)}');`;
77 | jsonFile.result.push({"NAME": name, "TYPE": "module", "LINK":filename.concat(link)});
78 |
79 | // i++;
80 |
81 | //Time to grab classes and other stragglers
82 | $('h2 a').each((ind, el) => {
83 | var name = $(el).parent().parent().text();
84 | //Add href of link to filename
85 | var link = $(el).attr('href');
86 | if(name.match(/^Class\:\s/g)){
87 | //replace the class and get rid of the #
88 | name = name.replace(/^Class\:\s/g, "").replace(/\'/g,"").slice(0, -1);
89 | // sqlstr += `INSERT INTO docsearch VALUES(${i}, '${name}', 'class', '${filename.concat(link)}');`;
90 | jsonFile.result.push({"NAME": name, "TYPE": "class", "LINK":filename.concat(link)});
91 |
92 | // i++;
93 |
94 | }
95 | //Bad semantic HTML: some properties sit in h2s
96 | //Otherwise they are probably sections/chapters; to be safe, check against matches for
97 | //events, props, classes and methods
98 | else if(!name.match(/Class|Event|\(.*\)|\.\w+(?!\()/)){
99 | name = name.replace(/\'/g, "").slice(0,-1);
100 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'chapter', '${filename.concat(link)}');`;
101 | jsonFile.result.push({"NAME": name, "TYPE": "chapter", "LINK":filename.concat(link)});
102 | // i++;
103 |
104 | }
105 | });
106 | }
107 | if(!jsonFile.sections) jsonFile.sections = ["chapter", "class", "event", "method", "module", "property"];
108 | //Insert into sql database
109 | // db.run(sqlstr);
110 | return (jsonFile);
111 | },
112 | express: function(file, jsonFile){
113 | var filename = file.slice(file.lastIndexOf('/')+1);
114 | var data = fs.readFileSync(file, 'utf-8');
115 | // var sqlstr = "";
116 | var $ = cheerio.load(data);
117 |
118 | var type = '';
119 | //Only api.html has the diff classes etc
120 | if(filename === "api.html"){
121 | //All methods/props/events and their names are in H3s --- a nightmare since they aren't nested
122 | //All the methods/props/events at least are inside sections located 'underneath' the names
123 | //Unfortunately cheerio freaks out if an ID has the character "." in it.
124 | $('h3').each((ind, ele) => {
125 | var truthy = ($(ele).text() === "Methods" || $(ele).text() === "Properties" || $(ele).text() === "Events");
126 | var name = $(ele).attr('id');
127 | var link = ("#").concat(name);
128 | //If the H3 matches one of these, set the type of the entry to that
129 | if(truthy){
130 | type = $(ele).text().toLowerCase();
131 | }
132 | //Otherwise add to the sql string
133 | else{
134 | jsonFile.result.push({"NAME": name, "TYPE": type, "LINK":filename.concat(link)});
135 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', '${type}', '${filename.concat(link)}');`;
136 | }
137 | // i++;
138 | });
139 | //Module / Class names are all in H2
140 | $('h2').each((ind, ele) => {
141 | // console.log();
142 | var name = $(ele).text();
143 | var link = ("#").concat($(ele).prev('p').children().first().attr('id'));
144 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', 'class', '${filename.concat(link)}');`;
145 | jsonFile.result.push({"NAME": name, "TYPE": "class", "LINK":filename.concat(link)});
146 | // i++
147 | });
148 | }
149 | // For all the chapters/guides, just grab the first H1 as the title, and put the link as the file name
150 | else{
151 | var name = $('h1').first().text();
152 | type = 'chapter';
153 | // sqlstr += `INSERT INTO docsearch VALUES (${i}, '${name}', '${type}', '${filename}');`
154 | jsonFile.result.push({"NAME": name, "TYPE": "chapter", "LINK":filename});
155 | // i++;
156 | }
157 | // db.run(sqlstr)
158 | return (jsonFile);
159 | }
160 | };
161 |
162 | module.exports = parser;
163 |
--------------------------------------------------------------------------------
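
The classification in parser.node hinges on a few regexes run against the
heading text (which still carries the trailing '#' from the anchor link). Some
illustrative inputs and where each one lands:

    //illustrative heading texts, not taken from a real scrape
    "fs.readFile(file[, options], callback)#"  // /\w+\(.*\)\#$/ -> method
    "Class Method: Buffer.from(array)#"        // method, 'Class Method: ' stripped
    "request.aborted#"                         // /\.\w+(?!\()#/ -> property
    "Event: 'close'#"                          // /^Event:/ -> event, classname prefixed
    "Class: fs.ReadStream#"                    // skipped on the first pass, caught as a class in the h2 pass
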
/server/middleware/requestProps.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | //Constants to be changed or added later with inputs to program
3 | /* Directory structure is e.g. docs/node/documents/
4 |  * with the index.json file in docs/node/
5 |  * and the zip built from the whole base directory,
6 |  * so docs/ + scrapeDir + documents/ is the downloadDir
7 |  * and docs/ + scrapeDir is the baseDir (maybe rename scrapeDir?)
8 |  */
9 |
10 | let requestProps = {
11 | node: function(req, res, next){
12 | //Just in case garbage collection
13 | req.scrapeProps = {
14 | urlsToScrape: ['http://nodejs.org/api/'],
15 | sourceName: 'NodeJS',
16 | cssDir: 'assets',
17 | jsDir: 'assets',
18 | scrapeDir: 'node/',
19 | //FIX THIS LATER TO ADD IN ANYTHING, AND BE PASSED IN AS AN OBJECT
20 | //(can't use `this` here: in an object literal it refers to the enclosing scope, not the object being built)
21 | baseDir: 'docs/node/',
22 | downloadDir: 'docs/node/documents/',
23 | RECURSIVE: true,
24 | versionNo: "",
25 | };
26 | next();
27 | },
28 | express: function(req, res, next){
29 | //Just in case garbage collection
30 | req.scrapeProps = {
31 | urlsToScrape: [
32 | {url: 'http://expressjs.com/en/4x/api.html', filename: 'api.html'},
33 | {url: 'http://expressjs.com/en/starter/installing.html', filename: 'installing.html'},
34 | {url: 'http://expressjs.com/en/starter/hello-world.html', filename: 'hello-world.html'},
35 | {url: 'http://expressjs.com/en/starter/generator.html', filename: 'generator.html'},
36 | {url: 'http://expressjs.com/en/starter/static-files.html', filename: 'static-files.html'},
37 | {url: 'http://expressjs.com/en/starter/faq.html', filename: 'faq.html'},
38 | {url: 'http://expressjs.com/en/guide/routing.html', filename: 'routing.html'},
39 | {url: 'http://expressjs.com/en/guide/writing-middleware.html', filename: 'writing-middleware.html'},
40 | {url: 'http://expressjs.com/en/guide/using-middleware.html', filename: 'using-middleware.html'},
41 | {url: 'http://expressjs.com/en/guide/using-template-engines.html', filename: 'using-template-engines.html'},
42 | {url: 'http://expressjs.com/en/guide/error-handling.html', filename: 'error-handling.html'},
43 | {url: 'http://expressjs.com/en/guide/debugging.html', filename: 'debugging.html'},
44 | {url: 'http://expressjs.com/en/guide/database-integration.html', filename: 'database-integration.html'},
45 | {url: 'http://expressjs.com/en/guide/migrating-4.html', filename: 'migrating-4.html'},
46 | {url: 'http://expressjs.com/en/advanced/developing-template-engines.html', filename: 'developing-template-engines.html'},
47 | {url: 'http://expressjs.com/en/advanced/best-practice-performance.html', filename: 'best-practice-performance.html'},
48 | {url: 'http://expressjs.com/en/advanced/best-practice-security.html', filename: 'best-practice-security.html'}
49 | ],
50 | sourceName: 'Express API',
51 | cssDir: 'css',
52 | jsDir: 'js',
53 | scrapeDir: 'express/',
54 | //FIX THIS LATER TO ADD IN ANYTHING, AND BE PASSED IN AS AN OBJECT
55 | //(can't use `this` here: in an object literal it refers to the enclosing scope, not the object being built)
56 | baseDir: 'docs/express/',
57 | downloadDir: 'docs/express/documents/',
58 | RECURSIVE: false,
59 | versionNo: "",
60 | };
61 | next();
62 | },
63 | js: function(req, res, next){
64 | //Just in case garbage collection
65 | req.scrapeProps = {
66 | // URL_TO_SCRAPE: ,
67 | sourceName:"MDN Javascript",
68 | // CSS_DIR: ,
69 | // JS_DIR: ,
70 | scrapeDir: 'mdn/javascript/',
71 | baseDir: 'docs/mdn/javascript/',
72 | downloadDir: 'docs/mdn/javascript/JavaScript/documents'
73 | };
74 | next();
75 | },
76 | html: function(req, res, next){
77 | //Just in case garbage collection
78 | req.scrapeProps = {
79 | // URL_TO_SCRAPE: ,
80 | sourceName:"MDN HTML",
81 | // CSS_DIR: ,
82 | // JS_DIR: ,
83 | scrapeDir: 'mdn/html/',
84 | baseDir: 'docs/mdn/html/',
85 | downloadDir: 'docs/mdn/html/HTML/documents'
86 | };
87 | next();
88 | },
89 | css: function(req, res, next){
90 | //Just in case garbage collection
91 | req.scrapeProps = {
92 | // URL_TO_SCRAPE: ,
93 | sourceName:"MDN CSS",
94 | // CSS_DIR: ,
95 | // JS_DIR: ,
96 | scrapeDir: 'mdn/css/',
97 | baseDir: 'docs/mdn/css/',
98 | downloadDir: 'docs/mdn/css/CSS/documents'
99 | };
100 | next();
101 | },
102 |
103 | };
104 |
105 | module.exports = requestProps;
106 |
--------------------------------------------------------------------------------
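
On the "WHY CANT I USE THIS. HERE?" note above: inside an object literal,
`this` refers to the enclosing scope, not the object under construction, so
baseDir and downloadDir cannot be derived from scrapeDir in place. One way
around it is a small factory; makeScrapeProps below is a hypothetical helper,
not part of this repo:

    //derive the dependent paths once, after the literal is built
    function makeScrapeProps(opts) {
        opts.baseDir = 'docs/' + opts.scrapeDir;
        opts.downloadDir = opts.baseDir + 'documents/';
        return opts;
    }

    var nodeProps = makeScrapeProps({
        scrapeDir: 'node/',
        sourceName: 'NodeJS',
        RECURSIVE: true,
        versionNo: ''
    });
    //nodeProps.downloadDir === 'docs/node/documents/'
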
/server/middleware/rewrite.js:
--------------------------------------------------------------------------------
1 | var cheerio = require('cheerio');
2 |
3 | //Object of per-site functions to strip unwanted parts of the html
4 | var rewrite = {
5 | //nodejs.org documentation: remove the ToC, sidebar and header
6 | node: function(req, res, next, html){
7 | var $ = cheerio.load(html);
8 | $('#column2').remove();
9 | $('#toc').remove();
10 | $('header').remove();
11 | html = $.html();
12 | //Return full html to be written as file instead of html and cheerio data
13 | return html;
14 | },
15 | express: function(req, res, next, html){
16 | var $ = cheerio.load(html);
17 | $('header').remove();
18 | $('footer').remove();
19 | $('#menu').remove();
20 | // $('header').remove();
21 | html = $.html();
22 | //Return full html to be written as file instead of html and cheerio data
23 | return html;
24 | }
25 |
26 | }
27 |
28 |
29 | module.exports = rewrite;
30 |
--------------------------------------------------------------------------------
/server/middleware/scrapeParseWrite.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 | var scraper = require('website-scraper');
3 | var fs = require('fs');
4 | var cheerio = require('cheerio');
5 | var archiver = require('archiver');
6 |
7 | var rewrite = require('./rewrite')
8 | var folderHandler = require('./folderHandler');
9 | var parseEntry = require('./parseEntryPoint');
10 |
11 | var scrapeParseWrite = {
12 |
13 | createZip: function(req, res, next){
14 | //Initialize Archiver
15 | //Specify type of archive - zip or tar
16 | req.archive = archiver('zip');
17 | var zipFolder = 'zips/'+req.scrapeProps.scrapeDir;
18 | //check to see if folder exists or create folder to store zip if it doesn't exist
19 | folderHandler.checkOrCreateFolder(zipFolder);
20 | //Create output file stream from scrapeDir
21 | req.output = fs.createWriteStream(zipFolder+req.scrapeProps.scrapeDir.slice(0,-1)+req.scrapeProps.versionNo+'.zip');
22 | this.scrape(req, res, next);
23 | },
24 |
25 | scrape: function(req, res, next){
26 | //Check to see if folder was deleted or not, and if so, delete it
27 | folderHandler.checkToDelete(req.scrapeProps.baseDir);
28 |
29 | /*
30 | * Initialize scraper and provide URL, directory to store files, subdirectories
31 | * FOR files, recurse 1 level deep, and then edit files
32 | */
33 | scraper.scrape({
34 | urls: req.scrapeProps.urlsToScrape,
35 | directory: req.scrapeProps.downloadDir,
36 | subdirectories: [
37 | {directory: 'img', extensions: ['.jpg', '.png', '.svg']},
38 | {directory: req.scrapeProps.jsDir, extensions: ['.js']},
39 | {directory: req.scrapeProps.cssDir, extensions: ['.css']}
40 | ],
41 | recursive: req.scrapeProps.RECURSIVE,
42 | maxDepth: 1
43 | }).then((data)=>{
44 | this.getFiles(req, res, next);
45 | }).catch(console.log);
46 |
47 | //Event listener for end of zipping function - delete folder
48 | req.output.on('close', ()=>{
49 | console.log(req.archive.pointer() + ' total bytes');
50 | console.log('archiver has been finalized and the output file descriptor has closed.');
51 | folderHandler.deleteFolderRecursive(req.scrapeProps.baseDir);
52 | req.scrapeProps.filePath = req.output.path;
53 | console.log(req.output.path);
54 | // res.versionNo = versionNo;
55 | next();
56 | });
57 | // Event listener for archive errors
58 | req.archive.on('error', function(err){
59 | throw err;
60 | });
61 | },
62 |
63 |
64 | //get the list of files and strip the leading / from css and js hrefs if present
65 | getFiles: function(req, res, next) {
66 | let list;
67 |
68 | //keep a handle on this module for callbacks that lose `this` context
69 | let that = this; //(currently unused - archive.bulk below uses req.archive directly)
70 | //Get list of files in directory
71 | fs.readdir(req.scrapeProps.downloadDir, (err, file) => {
72 | list = file;
73 | list.forEach((name) => {
74 | //Add directory name to file name for FS
75 | name = req.scrapeProps.downloadDir.concat(name);
76 | //only edit html files
77 | if(name.match(/\.html$/)){
78 | //pass file names off to be read and rewritten
79 | this.editFile(req, res, next, name);
80 | }
81 | });
82 |
83 | //parseEntry.allFiles does its own async readdir, so wrap it in a Promise
84 | //and hand the resolve/reject over
85 | var p1 = new Promise((resolve, reject)=>{
86 | parseEntry.allFiles(req, resolve, reject);
87 | });
88 |
89 | p1.then(function(val){
90 | //Time to zip the file
91 | //Pipe zip to the output file
92 | req.archive.pipe(req.output);
93 | //specify what to zip up (the scraped base directory) and append it to the archive
94 | //name the directory the zip file extracts to after the scrapeDir
95 | //req.archive is used directly here, so the `that` binding above isn't needed
96 | req.archive.bulk([
97 | { expand: true, cwd: req.scrapeProps.baseDir, src: ['**'], dest: req.scrapeProps.scrapeDir.slice(0,-1)+'.docs'}
98 | ]);
99 | //Finalize archive and prevent further appends
100 | req.archive.finalize();
101 | }).catch((val)=>{
102 | console.log("Promise rejected: ", val)
103 | })
104 |
105 | });
106 | },
107 |
108 | editFile: function(req, res, next, file) {
109 | fs.readFile(file, 'utf-8', (err, data) => {
110 | if (err) return console.error(err);
111 | //Remove the leading slash on src and href of js and css file locations
112 | var newData = data.replace(/href=\"\/(?!\/)/gi, 'href="').
113 | replace(/src=\"\/(?!\/)/gi, 'src="');
114 | //The rewriter is universal for whatever we are scraping
115 | //Will need to implement checks to make sure we have methods for those sites
116 |
117 | var writeMethod = req.scrapeProps.scrapeDir.slice(0, -1);
118 | //Call the site-specific cleanup function - dispatched dynamically on scrapeDir
119 | //Try and catch in case we don't have the required methods
120 | try{
121 | newData = rewrite[writeMethod](req, res, next, newData);
122 | }
123 | catch(err){
124 | console.error("WHOA WE DONT HAVE A FUNCTION FOR THIS")
125 | // res.send(`Sorry, there seems to be a problem with our parsing engine, please contact us`)
126 | return res.end()
127 | }
128 | //Rewrite file
129 | fs.writeFileSync(file, newData, 'utf-8')
130 | });
131 | },
132 |
133 |
134 | }
135 | module.exports = scrapeParseWrite;
136 |
--------------------------------------------------------------------------------
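
editFile above relies on the dynamic call throwing a TypeError when no rewrite
method exists for a site. The same guard can be made explicit with a typeof
check; a sketch (applyRewrite is an illustrative wrapper, not part of this repo):

    var rewrite = require('./rewrite');

    function applyRewrite(req, res, next, html) {
        var writeMethod = req.scrapeProps.scrapeDir.slice(0, -1);
        if (typeof rewrite[writeMethod] !== 'function') {
            console.error('no rewrite function for ' + writeMethod);
            return html; //leave the page untouched rather than crash
        }
        return rewrite[writeMethod](req, res, next, html);
    }
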
/server/middleware/versionCheck.js:
--------------------------------------------------------------------------------
1 | var request = require('request');
2 | var cheerio = require('cheerio');
3 |
4 | var versionCheck = {
5 | node: function(req, res, next){
6 | //Grab front page of node and check the version number;
7 | request.get('https://nodejs.org/api/index.html', (err, resp, body) =>{
8 | if (err) return next(err); var $ = cheerio.load(body);
9 | var versionString = $('header h1').text();
10 | //Match returns an array, first element is the match!!
11 | versionString = versionString.match(/\sv.*\s/)[0].trim().slice(1);
12 | req.scrapeProps.versionNo = versionString;
13 | next();
14 | });
15 | },
16 | express: function(req, res, next){
17 | request.get('http://expressjs.com/en/4x/api.html', (err, resp, body) =>{
18 | if (err) return next(err); var $ = cheerio.load(body);
19 | //Grab first anchor after #application-menu, most current ver
20 | var versionString = $('#application-menu a').attr('href');
21 | //Match returns an array, first element is the match!! slice off trailing /
22 | versionString = versionString.match(/[0-9]+.+\//)[0].slice(0,-1);
23 | req.scrapeProps.versionNo = versionString;
24 | next();
25 | });
26 | },
27 | js: function(req, res, next){
28 | request.get('https://kapeli.com/mdn_offline', (err, resp, body) => {
29 | if (err) return next(err); var $ = cheerio.load(body);
30 | //version string here is going to be the update date
31 | var jsLink = $( ".download:contains('JavaScript.tgz')" );
32 | var versionString = $(jsLink).parent().next('td').text();
33 | req.scrapeProps.versionNo = versionString;
34 | next();
35 | });
36 | },
37 | css: function(req, res, next){
38 | request.get('https://kapeli.com/mdn_offline', (err, resp, body) => {
39 | if (err) return next(err); var $ = cheerio.load(body);
40 | //version string here is going to be the update date
41 | var jsLink = $( ".download:contains('CSS.tgz')" );
42 | var versionString = $(jsLink).parent().next('td').text();
43 | req.scrapeProps.versionNo = versionString;
44 | next();
45 | });
46 | },
47 | html: function(req, res, next){
48 | request.get('https://kapeli.com/mdn_offline', (err, resp, body) => {
49 | if (err) return next(err); var $ = cheerio.load(body);
50 | //version string here is going to be the update date
51 | var jsLink = $( ".download:contains('HTML.tgz')" );
52 | var versionString = $(jsLink).parent().next('td').text();
53 | req.scrapeProps.versionNo = versionString;
54 | next();
55 | });
56 | }
57 | };
58 | module.exports = versionCheck;
59 |
--------------------------------------------------------------------------------
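
The three kapeli.com checkers differ only in the .tgz filename, so they could
collapse into one factory (this echoes the "make create functions DRY" TODO in
server.js). A sketch, where mdnVersion is a hypothetical name:

    var request = require('request');
    var cheerio = require('cheerio');

    function mdnVersion(tgzName) {
        return function (req, res, next) {
            request.get('https://kapeli.com/mdn_offline', function (err, resp, body) {
                if (err) return next(err);
                var $ = cheerio.load(body);
                //the "version" is the update date in the cell after the link
                var link = $(".download:contains('" + tgzName + "')");
                req.scrapeProps.versionNo = $(link).parent().next('td').text();
                next();
            });
        };
    }
    //e.g. js: mdnVersion('JavaScript.tgz'), css: mdnVersion('CSS.tgz'), html: mdnVersion('HTML.tgz')
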
/server/server.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const express = require( 'express' );
4 | const bodyParser = require( 'body-parser' );
5 | const path = require( 'path' );
6 | const mongoose = require( 'mongoose' );
7 | const dbController = require( './controllers/dbController' );
8 | const mdnJS = require( './middleware/mdnJS' );
9 | const mdnHTML = require( './middleware/mdnHTML' );
10 | const mdnCSS = require( './middleware/mdnCSS' );
11 | //Scraping middleware
12 | const scrapeParseWrite = require('./middleware/scrapeParseWrite');
13 | const parseEntry = require('./middleware/parseEntryPoint');
14 | //Middleware to add proper request properties for each site to scrape
15 | const requestProps = require( './middleware/requestProps' );
16 | //Add middleware to check version of various sites
17 | const version = require( './middleware/versionCheck' );
18 | const fs = require( 'fs' );
19 | mongoose.connect( 'mongodb://Doc:tor@ds059215.mongolab.com:59215/doc-tor' );
20 | const db = mongoose.connection;
21 | const app = express();
22 |
23 |
24 | require( 'dns' )
25 | .lookup( require( 'os' )
26 | .hostname(),
27 | function ( err, add, fam ) {
28 | console.log( 'addr: ' + add );
29 | } );
30 | // log output
31 | // app.use(require('morgan')
32 | // ('STATUS=:status IP=:remote-addr REQ=":method :url" TIME=:response-time :res[content-length]'));
33 |
34 | db.on( 'error', console.error.bind( console, 'connection error:' ) );
35 | db.once( 'open', function () {
36 | console.log( "your db is open" );
37 | } );
38 |
39 | app.use( bodyParser.urlencoded( {
40 | extended: true
41 | } ) );
42 | app.use( express.static( path.join( __dirname, './../public' ) ) );
43 | /////////////////////////////////////////////////
44 | //// Handle requests to our main page(site)
45 | /////////////////////////////////////////////////
46 | app.get( '/', function ( req, res ) {
47 | console.log( "Our website homepage!" );
48 | res.sendFile( path.join( __dirname, '/../public/index.html' ) );
49 | } );
50 |
51 |
52 | /***** API *****/
53 | /*
54 | TODO: optimize download and extraction
55 | TODO: make create functions DRY with helper function
56 | NOTE: mdn.download only provides a link for request module,
57 | mdn.getJavascript actually downloads the .tgz
58 | */
59 | // app.get( '/js', mdnJS.download, mdnJS.getJavascript, mdnJS.extract, mdnJS.createClassObj, mdnJS.createMethodsObj, mdnJS.createEventObj, mdnJS.createKWObj, mdnJS.createFuncObj, mdnJS.sqlFile, mdnJS.zip, function ( req, res ) {
60 | // res.sendFile(path.resolve('./mdn_javascript.zip'));
61 | // console.log('\n finished');
62 | // });
63 |
64 |
65 | app.get( '/mdn_html', requestProps.html, dbController.latestVer, function ( req, res ) {
66 | res.sendFile(path.resolve(req.scrapeProps.filePath));
67 | req.scrapeProps = null;
68 | // console.log('\n finished');
69 | });
70 | app.get( '/mdn_css', requestProps.css, dbController.latestVer, function ( req, res ) {
71 | res.sendFile(path.resolve(req.scrapeProps.filePath));
72 | req.scrapeProps = null;
73 | // console.log('\n finished');
74 | });
75 | app.get('/mdn_javascript', requestProps.js, dbController.latestVer, function(req, res){
76 | res.sendFile(path.resolve(req.scrapeProps.filePath));
77 | req.scrapeProps = null;
78 | // console.log("sending full html back to client");
79 | });
80 | ///////////////////////////////////////////////////////////////////////////////
81 | /// BIND SCRAPEPARSEWRITE.CREATEZIP TO ITSELF SO IT BINDS TO THE CORRECT CONTEXT
82 | ///////////////////////////////////////////////////////////////////////////////
83 | app.get('/node', requestProps.node, dbController.latestVer, function(req,res){
84 | res.sendFile(path.resolve(req.scrapeProps.filePath));
85 | req.scrapeProps = null;
86 | // console.log("sending full html back to client");
87 | });
88 | app.get('/express', requestProps.express, dbController.latestVer, function(req,res){
89 | res.sendFile(path.resolve(req.scrapeProps.filePath));
90 | req.scrapeProps = null;
91 | // console.log("sending full html back to client");
92 | });
93 | //////////////////////////////////////////////////
94 | // Test crash reporting route
95 | //////////////////////////////////////////////////
96 | // app.post( '/error', function ( req, res ) {
97 | // console.log( "this func is running" );
98 | // fs.writeFile( 'crashReport.txt', req.body, function () {
99 | // console.log( 'crash report\'s a go' );
100 | // } );
101 | // } );
102 | //////////////////////////////////////////////////
103 | // delete zip/or section from server update DB
104 | //////////////////////////////////////////////////
105 | app.delete( '/node', function ( req, res ) {} );
106 | //////////////////////////////////////////////////
107 | // handle changes to node update DB
108 | //////////////////////////////////////////////////
109 | app.put( '/node', function ( req, res ) {});
110 |
111 |
112 |
113 |
114 |
115 | ///////////////////////////////////////////////
116 | // Handle requests for data
117 | // (option for multiple sites)
118 | ///////////////////////////////////////////////
119 | // app.get('/html', function(req,res){
120 | // res.sendFile(path.join(__dirname, '/../index.html'));
121 | // console.log("send full html back to client");
122 | // });
123 |
124 | app.listen( 8080, function () {
125 | console.log( "Server is listening on port 8080" );
126 | } );
127 |
--------------------------------------------------------------------------------
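
Several middlewares in this chain `throw` inside request/fs callbacks; Express
only routes errors that are passed to next(err), and an exception thrown in an
async callback crashes the process instead. A sketch of a terminal error
handler this server could register after its routes (the response wording is
illustrative):

    //only reached when middlewares call next(err) instead of throwing
    app.use(function (err, req, res, next) {
        console.error('scrape/update failed:', err);
        res.status(500).send('Something broke while building the docs');
    });
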
/server/updater.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const express = require( 'express' );
4 | const bodyParser = require( 'body-parser' );
5 | const path = require( 'path' );
6 | const mongoose = require( 'mongoose' );
7 | const dbController = require( './controllers/dbController' );
8 | const mdnJS = require( './middleware/mdnJS' );
9 | const mdnHTML = require( './middleware/mdnHTML' );
10 | const mdnCSS = require( './middleware/mdnCSS' );
11 | //Scraping middleware
12 | const scrapeParseWrite = require('./middleware/scrapeParseWrite');
13 | const parseEntry = require('./middleware/parseEntryPoint');
14 | //Middleware to add proper request properties for each site to scrape
15 | const requestProps = require( './middleware/requestProps' );
16 | //Add middleware to check version of various sites
17 | const version = require( './middleware/versionCheck' );
18 | const fs = require( 'fs' );
19 | mongoose.connect( 'mongodb://Doc:tor@ds059215.mongolab.com:59215/doc-tor' );
20 | const db = mongoose.connection;
21 | const app = express();
22 |
23 | const updates = {"MDN_HTML":[requestProps.html, version.html, mdnHTML.download, mdnHTML.getHTML,
24 | mdnHTML.extract, mdnHTML.getElements, mdnHTML.sqlFile, mdnHTML.zip, dbController.addToDB],
25 |
26 | "MDN_CSS": [requestProps.css, version.css, mdnCSS.download, mdnCSS.getCSS,
27 | mdnCSS.extract, mdnCSS.getObjs, mdnCSS.getMoz,
28 | mdnCSS.sqlFile, mdnCSS.zip, dbController.addToDB],
29 |
30 | "MDN_Javascript": [requestProps.js, version.js, dbController.needUpdate, mdnJS.download, mdnJS.getJavascript,
31 | mdnJS.extract, mdnJS.createClassObj, mdnJS.createMethodsObj, mdnJS.createEventObj,
32 | mdnJS.createKWObj, mdnJS.createFuncObj, mdnJS.sqlFile, mdnJS.zip, dbController.addToDB],
33 |
34 | "NodeJS": [requestProps.node, version.node, scrapeParseWrite.createZip.bind(scrapeParseWrite), dbController.addToDB],
35 |
36 | "Express_API":[requestProps.express, version.express, scrapeParseWrite.createZip.bind(scrapeParseWrite), dbController.addToDB]
37 | };
38 |
39 | require( 'dns' )
40 | .lookup( require( 'os' )
41 | .hostname(),
42 | function ( err, add, fam ) {
43 | console.log( 'addr: ' + add );
44 | } );
45 | // log output
46 | // app.use(require('morgan')
47 | // ('STATUS=:status IP=:remote-addr REQ=":method :url" TIME=:response-time :res[content-length]'));
48 |
49 | db.on( 'error', console.error.bind( console, 'connection error:' ) );
50 | db.once( 'open', function () {
51 | console.log( "your db is open" );
52 | } );
53 |
54 | app.use( bodyParser.urlencoded( {
55 | extended: true
56 | } ) );
57 | app.use( express.static( path.join( __dirname, './../public' ) ) );
58 | /////////////////////////////////////////////////
59 | //// Handle requests to our main page(site)
60 | /////////////////////////////////////////////////
61 | app.get( '/', function ( req, res ) {
62 | console.log( "Our website homepage!" );
63 | res.sendFile( path.join( __dirname, '/../public/index.html' ) );
64 | } );
65 |
66 |
67 | /***** API *****/
68 | /*
69 | TODO: optimize download and extraction
70 | TODO: make create functions DRY with helper function
71 | NOTE: mdn.download only provides a link for request module,
72 | mdn.getJavascript actually downloads the .tgz
73 | */
74 | // app.get( '/js', mdnJS.download, mdnJS.getJavascript, mdnJS.extract, mdnJS.createClassObj, mdnJS.createMethodsObj, mdnJS.createEventObj, mdnJS.createKWObj, mdnJS.createFuncObj, mdnJS.sqlFile, mdnJS.zip, function ( req, res ) {
75 | // res.sendFile(path.resolve('./mdn_javascript.zip'));
76 | // console.log('\n finished');
77 | // });
78 | app.get('/uphtml', updates.MDN_HTML, function(req, res, next){
79 | res.sendFile(path.resolve(req.scrapeProps.filePath))
80 | })
81 |
82 | app.get('/updateVersions', updates.MDN_CSS, updates.MDN_HTML, updates.MDN_Javascript, updates.NodeJS, updates.Express_API,
83 | function(req, res){
84 | req.scrapeProps = null;
85 | res.end();
86 | });
87 | app.get( '/mdn_html', requestProps.html, dbController.latestVer, function ( req, res ) {
88 | res.sendFile(path.resolve(req.scrapeProps.filePath));
89 | req.scrapeProps = null;
90 | // console.log('\n finished');
91 | });
92 | app.get( '/mdn_css', requestProps.css, dbController.latestVer, function ( req, res ) {
93 | res.sendFile(path.resolve(req.scrapeProps.filePath));
94 | req.scrapeProps = null;
95 | // console.log('\n finished');
96 | });
97 | app.get('/mdn_javascript', requestProps.js, dbController.latestVer, function(req, res){
98 | res.sendFile(path.resolve(req.scrapeProps.filePath));
99 | req.scrapeProps = null;
100 | // console.log("sending full html back to client");
101 | });
102 | ///////////////////////////////////////////////////////////////////////////////
103 | /// BIND SCRAPEPARSEWRITE.CREATEZIP TO ITSELF SO IT BINDS TO THE CORRECT CONTEXT
104 | ///////////////////////////////////////////////////////////////////////////////
105 | app.get('/node', requestProps.node, dbController.latestVer, function(req,res){
106 | res.sendFile(path.resolve(req.scrapeProps.filePath));
107 | req.scrapeProps = null;
108 | // console.log("sending full html back to client");
109 | });
110 | app.get('/express', requestProps.express, dbController.latestVer, function(req,res){
111 | res.sendFile(path.resolve(req.scrapeProps.filePath));
112 | req.scrapeProps = null;
113 | // console.log("sending full html back to client");
114 | });
115 | //////////////////////////////////////////////////
116 | // Test crash reporting route
117 | //////////////////////////////////////////////////
118 | // app.post( '/error', function ( req, res ) {
119 | // console.log( "this func is running" );
120 | // fs.writeFile( 'crashReport.txt', req.body, function () {
121 | // console.log( 'crash report\'s a go' );
122 | // } );
123 | // } );
124 | //////////////////////////////////////////////////
125 | // delete zip/or section from server update DB
126 | //////////////////////////////////////////////////
127 | app.delete( '/node', function ( req, res ) {} );
128 | //////////////////////////////////////////////////
129 | // handle changes to node update DB
130 | //////////////////////////////////////////////////
131 | app.put( '/node', function ( req, res ) {});
132 |
133 |
134 |
135 |
136 |
137 | ///////////////////////////////////////////////
138 | // Handle requests for data
139 | // (option for multiple sites)
140 | ///////////////////////////////////////////////
141 | // app.get('/html', function(req,res){
142 | // res.sendFile(path.join(__dirname, '/../index.html'));
143 | // console.log("send full html back to client");
144 | // });
145 |
146 | app.listen( 8085, function () {
147 | console.log( "Updater is listening on port 8085" );
148 | } );
149 |
--------------------------------------------------------------------------------
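
The `updates` object works because Express flattens arrays of handlers passed
to app.get, running each pipeline left to right as every step calls next(). A
minimal standalone illustration (route and step names are made up):

    var express = require('express');
    var app = express();

    var pipeline = [
        function (req, res, next) { req.steps = ['props']; next(); },
        function (req, res, next) { req.steps.push('version'); next(); },
        function (req, res, next) { req.steps.push('zip'); next(); }
    ];

    app.get('/demo', pipeline, function (req, res) {
        res.send(req.steps.join(' -> ')); //"props -> version -> zip"
    });

    app.listen(3000);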