├── .github ├── no-response.yml └── stale.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── index.js ├── package.json ├── src ├── get-phantom-html.js └── script.js └── test ├── mock └── index.html └── phantom-plugin.test.js /.github/no-response.yml: -------------------------------------------------------------------------------- 1 | # Configuration for probot-no-response - https://github.com/probot/no-response 2 | 3 | # Number of days of inactivity before an Issue is closed for lack of response 4 | daysUntilClose: 14 5 | # Label requiring a response 6 | responseRequiredLabel: wait-response 7 | # Comment to post when closing an Issue for lack of response. Set to `false` to disable 8 | closeComment: > 9 | This issue has been automatically closed because there has been no response from the original author. With only the 10 | information that is currently in the issue, we don't have enough information 11 | to take action. Please reach out if you have or find the answers we need so 12 | that we can investigate further. 13 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - bug 8 | - maybe-later 9 | - security 10 | # Label to use when marking an issue as stale 11 | staleLabel: wontfix 12 | # Comment to post when marking an issue as stale. Set to `false` to disable 13 | markComment: > 14 | This issue has been automatically marked as stale because it has not had 15 | recent activity. It will be closed if no further activity occurs. Thank you 16 | for your contributions. 17 | # Comment to post when closing a stale issue. Set to `false` to disable 18 | closeComment: false 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | node_modules 4 | npm-debug.log 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | sudo: false 3 | node_js: 4 | - '8' 5 | - '9' 6 | - '10' 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Sophia Antipenko 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## ⚠️ This plugin is deprecated and no longer maintained. Please consider using [website-scraper-puppeteer](https://github.com/website-scraper/website-scraper-puppeteer) instead. 2 | 3 | [![Version](https://img.shields.io/npm/v/website-scraper-phantom.svg?style=flat)](https://www.npmjs.org/package/website-scraper-phantom) 4 | [![Downloads](https://img.shields.io/npm/dm/website-scraper-phantom.svg?style=flat)](https://www.npmjs.org/package/website-scraper-phantom) 5 | [![Build Status](https://travis-ci.com/website-scraper/node-website-scraper-phantom.svg?branch=master)](https://travis-ci.com/github/website-scraper/node-website-scraper-phantom) 6 | 7 | # website-scraper-phantom 8 | Plugin for [website-scraper](https://github.com/website-scraper/node-website-scraper) which returns html for dynamic websites using PhantomJS. 9 | 10 | This module is an Open Source Software maintained by one developer in free time. If you want to thank the author of this module you can use [GitHub Sponsors](https://github.com/sponsors/s0ph1e) or [Patreon](https://www.patreon.com/s0ph1e). 11 | 12 | ## Requirements 13 | * nodejs version >= 8 14 | * website-scraper version >= 4 15 | 16 | if you need plugin for website-scraper version < 4, you can find it [here](https://github.com/website-scraper/node-website-scraper-phantom/blob/0.1/README.md) (version 0.1.0) 17 | 18 | ## Installation 19 | ```sh 20 | npm install website-scraper website-scraper-phantom 21 | ``` 22 | 23 | ## Usage 24 | ```javascript 25 | const scrape = require('website-scraper'); 26 | const PhantomPlugin = require('website-scraper-phantom'); 27 | 28 | scrape({ 29 | urls: ['https://www.instagram.com/gopro/'], 30 | directory: '/path/to/save', 31 | plugins: [ new PhantomPlugin() ] 32 | }); 33 | ``` 34 | 35 | ## How it works 36 | It starts PhantomJS which just opens page and waits when page is loaded. 37 | It is far from ideal because probably you need to wait until some resource is loaded or click some button or log in. Currently this module doesn't support such functionality. 38 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const Promise = require('bluebird'); 4 | const getPhantomHtml = require('./src/get-phantom-html.js'); 5 | 6 | /** 7 | * Makes phantom request if response contains html, returns original response body otherwise 8 | * @param {Object} response - response object from `request` module 9 | * @return {Promise} - resolved with body if success, rejected if error 10 | */ 11 | function handleResponse (response) { 12 | const contentType = response.headers['content-type']; 13 | const isHtml = contentType && contentType.split(';')[0] === 'text/html'; 14 | if (isHtml) { 15 | return getPhantomHtml(response.request.href); 16 | } else { 17 | return Promise.resolve(response.body); 18 | } 19 | } 20 | 21 | class AfterResponsePhantomPlugin { 22 | apply(registerAction) { 23 | registerAction('afterResponse', ({response}) => handleResponse(response)); 24 | } 25 | } 26 | 27 | module.exports = AfterResponsePhantomPlugin; 28 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "website-scraper-phantom", 3 | "version": "1.0.0", 4 | "description": "Plugin for website-scraper which receives html for dynamic websites using PhantomJS", 5 | "readmeFilename": "README.md", 6 | "main": "index.js", 7 | "keywords": [ 8 | "website-scraper", 9 | "phantomjs", 10 | "html" 11 | ], 12 | "dependencies": { 13 | "phantomjs-prebuilt": "^2.1.14", 14 | "system": "^1.2.0", 15 | "webpage": "^0.3.0", 16 | "bluebird": "^3.4.7" 17 | }, 18 | "peerDependencies": { 19 | "website-scraper": "^4.0.0" 20 | }, 21 | "devDependencies": { 22 | "chai": "^4.2.0", 23 | "finalhandler": "^1.1.1", 24 | "fs-extra": "^7.0.1", 25 | "mocha": "^5.2.0", 26 | "serve-static": "^1.13.2", 27 | "website-scraper": "^4.0.0" 28 | }, 29 | "scripts": { 30 | "test": "mocha --timeout 10000 --exit" 31 | }, 32 | "repository": { 33 | "type": "git", 34 | "url": "git+https://github.com/website-scraper/node-website-scraper-phantom.git" 35 | }, 36 | "author": "Sophia Antipenko ", 37 | "license": "MIT", 38 | "bugs": { 39 | "url": "https://github.com/website-scraper/node-website-scraper-phantom/issues" 40 | }, 41 | "homepage": "https://github.com/website-scraper/node-website-scraper-phantom" 42 | } 43 | -------------------------------------------------------------------------------- /src/get-phantom-html.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const path = require('path'); 4 | const phantomjs = require('phantomjs-prebuilt'); 5 | const Promise = require('bluebird'); 6 | 7 | const scriptPath = path.join(__dirname, 'script.js'); 8 | 9 | module.exports = (url) => { 10 | return new Promise((resolve, reject) => { 11 | const program = phantomjs.exec(scriptPath, url); 12 | let stdout = ''; 13 | let stderr = ''; 14 | 15 | program.stdout.on('data', (data) => { 16 | stdout += data; 17 | }); 18 | 19 | program.stderr.on('data', (data) => { 20 | stderr += data; 21 | }); 22 | 23 | program.on('exit', (code) => { 24 | if (code === 0) { 25 | // convert utf-8 -> binary string because website-scraper needs binary 26 | resolve(Buffer.from(stdout).toString('binary')); 27 | } else { 28 | reject(new Error(`Phantomjs finished with exit code ${code}. ${stderr}`)); 29 | } 30 | }); 31 | }); 32 | }; 33 | -------------------------------------------------------------------------------- /src/script.js: -------------------------------------------------------------------------------- 1 | var page = require('webpage').create(); 2 | var system = require('system'); 3 | 4 | if (system.args.length < 2) { 5 | system.stderr.write('Url is missed'); 6 | phantom.exit(1); 7 | } 8 | 9 | var url = system.args[1]; 10 | 11 | function done() { 12 | system.stdout.write(page.content); 13 | phantom.exit(0); 14 | } 15 | 16 | page.open(url, function (status) { 17 | if (status !== 'success') { 18 | system.stderr.write('Can\'t open page'); 19 | phantom.exit(1); 20 | } 21 | setTimeout(done, 1000); 22 | }); 23 | -------------------------------------------------------------------------------- /test/mock/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Test 6 | 7 | 8 | 9 |
10 | 11 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /test/phantom-plugin.test.js: -------------------------------------------------------------------------------- 1 | const { expect } = require('chai'); 2 | const http = require('http') 3 | const finalhandler = require('finalhandler'); 4 | const serveStatic = require('serve-static'); 5 | const fs = require('fs-extra'); 6 | const scrape = require('website-scraper'); 7 | const PhantomPlugin = require('../index'); 8 | 9 | const directory = __dirname + '/tmp'; 10 | 11 | describe('Phantom plugin test', () => { 12 | before('serve website', () => serveWebsite(4567)); 13 | 14 | after('delete dir', () => fs.removeSync(directory)); 15 | 16 | it('should render dymanic website', async () => { 17 | const result = await scrape({ 18 | urls: ['http://localhost:4567'], 19 | directory: directory, 20 | plugins: [ new PhantomPlugin() ] 21 | }); 22 | 23 | expect(result.length).eql(1); 24 | 25 | const content = fs.readFileSync(`${directory}/${result[0].filename}`).toString(); 26 | expect(content).to.contain('
Hello world from JS!
'); 27 | }); 28 | }); 29 | 30 | function serveWebsite(port = 3000) { 31 | const serve = serveStatic(__dirname + '/mock', {'index': ['index.html']}); 32 | const server = http.createServer(function onRequest (req, res) { 33 | serve(req, res, finalhandler(req, res)) 34 | }); 35 | server.listen(port) 36 | } 37 | --------------------------------------------------------------------------------