├── README.md ├── downVideo.js ├── get-by-user.js ├── package.json └── yarn-error.log /README.md: -------------------------------------------------------------------------------- 1 | # node-spider 2 | nodejs+puppeteer 3 | nodejs爬取西瓜视频(今日头条视频) 4 | 使用思路:输入一个头条号的名称,程序会去搜索对应用户,获取他的下面的视频,然后解析下载。 5 | 6 | 启动步骤: 7 | 1、 cnpm i 安装所有依赖 8 | 2、 node get-by-user.js 9 | 3、  随意打开一个浏览器 输入:http://localhost:3003/?keyword=头条号名称 10 | 11 | ![S0mFy.png](https://s1.ax2x.com/2018/05/02/S0mFy.png) 12 | 13 | 14 | -------------------------------------------------------------------------------- /downVideo.js: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: jaxQin 3 | * @Date: 2018-03-21 21:32:13 4 | * @Last Modified by: jaxQin 5 | * @Last Modified time: 2018-04-14 13:35:21 6 | */ 7 | 8 | 'use strict'; 9 | const devices = require('puppeteer/DeviceDescriptors'); 10 | const iPhone = devices['iPhone 6']; 11 | const fs = require('fs'); 12 | (async () => { 13 | const getVideo = async (browser, url) => { 14 | let arr = url.split('/'); 15 | let arr2 = arr.reverse(); 16 | url = `https://m.365yg.com/i${arr2[1]}/` 17 | let page = await browser.newPage(); 18 | try { 19 | await page.emulate(iPhone); 20 | await page.goto(url) 21 | await page.reload() 22 | } catch (e) { 23 | console.log(e) 24 | } 25 | // 判断是否是西瓜视频 26 | const pageTitle = await page.evaluate(() => { 27 | return document.title; 28 | }); 29 | console.log(pageTitle) 30 | if (pageTitle !== '西瓜视频') { 31 | await page.close() 32 | return null 33 | } 34 | 35 | 36 | // 获取视频名称 37 | let title; 38 | try { 39 | await page.waitForSelector('.info-title>h1'); 40 | title = await page.$eval('.info-title>h1', item => { 41 | return item.innerText 42 | }); 43 | if (!title) { 44 | await page.waitForSelector('.info-title>h1'); 45 | title = await page.$eval('.info-title>h1', item => { 46 | return item.innerText 47 | }); 48 | } 49 | if (!title) { 50 | await page.waitForSelector('.info-title>h1'); 51 | title = await page.$eval('meta[name=description]', item => { 52 | return item.content 53 | }); 54 | } 55 | // 检查已经下载的视频中是否包含 56 | let files = fs.readdirSync('./video'); 57 | files = files.join(','); 58 | if (files.includes(title)) return null 59 | // 获取视频真实地址abs-title 60 | await page.waitForSelector('#vjs_video_3_html5_api'); 61 | const videoSrc = await page.$eval('#vjs_video_3_html5_api', item => { 62 | return item.src 63 | }); 64 | console.log(`${title}--${videoSrc}`) 65 | await page.close(); 66 | title = title.replace(/“/g, '') 67 | title = title.replace(/”/g, '') 68 | title = title.replace(/"/g, '') 69 | return { 70 | videoSrc, 71 | title, 72 | } 73 | } catch (e) { 74 | console.log(e) 75 | return null 76 | } 77 | 78 | } 79 | module.exports = getVideo 80 | })() 81 | -------------------------------------------------------------------------------- /get-by-user.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); //可以像jquer一样操作界面 2 | const charset = require('superagent-charset'); //解决乱码问题: 3 | const async = require('async'); //异步抓取 4 | const express = require('express'); 5 | const eventproxy = require('eventproxy'); //流程控制 6 | const ep = eventproxy(); 7 | const app = express(); 8 | const path = require('path') 9 | const fs = require('fs') 10 | const util = require('util'); 11 | const puppeteer = require('puppeteer'); 12 | const request = require('superagent'); 13 | 14 | const getVideo = require('./downVideo') //引入下载视频方法 15 | function sleep(ms) { 16 | return new Promise(resolve => setTimeout(resolve, ms)) 17 | } 18 | 19 | charset(request); 20 | 21 | let superagentPromisePlugin = require('superagent-promise-plugin'); 22 | superagentPromisePlugin.Promise = require('es6-promise'); 23 | 24 | let url = 'https://www.toutiao.com/search_content/?offset=0&format=json&autoload=true&count=20&cur_tab=4&from=media'; 25 | let name = null; 26 | 27 | (async () => { 28 | let browser; 29 | const searchUser = async keyword => { 30 | let arr = await request.get(url).query({ keyword, }).use(superagentPromisePlugin); 31 | arr = arr.text; 32 | let data = JSON.parse(arr); 33 | arr = data.data; 34 | for (item of arr) { 35 | if (item.media_id && item.name === keyword) return item; 36 | } 37 | }; 38 | const getSearchUrl = async url => { 39 | let page; 40 | try { 41 | page = await browser.newPage(); 42 | } catch (e) { 43 | console.log(`${item.title}浏览器创建失败`); 44 | // page = await browser.newPage(); 45 | } 46 | if (!page) { 47 | return null 48 | } 49 | await page.goto(url); 50 | await page.reload(); 51 | await (500) 52 | await page.reload(); 53 | await (500) 54 | const getUrl = async () => { 55 | return new Promise((resolve, reject) => { 56 | page.on('response', res => { 57 | // console.log(res.url) 58 | if (res.url.indexOf('user/article/') > -1) { 59 | page.close() 60 | resolve(res.url) 61 | } 62 | }) 63 | }) 64 | } 65 | let a = await getUrl(); 66 | page.close() 67 | return a 68 | } 69 | const getSearchArr = async url => { 70 | console.log('搜索用户的视频') 71 | url = url.replace('count=20', 'count=200') 72 | let page = await browser.newPage(); 73 | await page.goto(url) 74 | // 获取视频名称 75 | let data; 76 | await page.waitForSelector('pre'); 77 | data = await page.$eval('pre', item => { 78 | return item.innerText 79 | }); 80 | data = JSON.parse(data) 81 | 82 | arr = data.data; 83 | let newArr = [] 84 | // console.log(arr[0]) 85 | for (let video of arr) { 86 | let playNum = video.detail_play_effective_count 87 | if (playNum > 20000) { 88 | newArr.push(video) 89 | } 90 | } 91 | console.log(`可以进行爬取的视频有${newArr.length}条`) 92 | return newArr; 93 | } 94 | 95 | app.get('/', async (req, res, next) => { 96 | browser = await puppeteer.launch({ 97 | headless: false, 98 | executablePath: 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe', 99 | 100 | }); 101 | const userName = req.query.keyword; 102 | name = userName 103 | console.log(`name--${name}`) 104 | let user = await searchUser(userName); 105 | let url = user.source_url; 106 | console.log(url) 107 | // 获取到可以进行搜索的url 108 | let surl = await getSearchUrl(url); 109 | surl = surl.replace('page_type=1', 'page_type=0') 110 | let arr = await getSearchArr(surl); 111 | res.send({ url: arr }) 112 | 113 | let myArr = [] 114 | for (let item of arr) { 115 | let video = await getVideo(browser, item.display_url); 116 | if (!video) continue 117 | video = Object.assign(item, video) 118 | myArr.push(video) 119 | let filePath 120 | try { 121 | filePath = fs.createWriteStream(`./video/${video.title}.mp4`) 122 | } catch (e) { 123 | console.log('创建目录出错!!!!'); 124 | continue 125 | } 126 | request.get(video.videoSrc).pipe(filePath) 127 | } 128 | await sleep(10000) 129 | console.log('----下载完成------') 130 | browser.close() 131 | }); 132 | 133 | 134 | 135 | app.listen(3003, function (req, res) { 136 | console.log('app is running at port 3003'); 137 | }); 138 | 139 | })() 140 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-spider", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | }, 7 | "dependencies": { 8 | "body-parser": "~1.13.1", 9 | "cookie-parser": "~1.3.5", 10 | "debug": "~2.2.0", 11 | "express": "~4.13.0", 12 | "morgan": "~1.6.1", 13 | "node-schedule": "^1.2.0", 14 | "puppeteer": "^0.13.0", 15 | "request": "^2.78.0", 16 | "request-promise": "^4.2.2", 17 | "request-promise-native": "^1.0.5", 18 | "serve-favicon": "~2.3.0", 19 | "superagent": "^2.3.0", 20 | "superagent-charset": "^1.1.1", 21 | "url": "^0.11.0" 22 | }, 23 | "devDependencies": { 24 | "request": "^2.83.0", 25 | "superagent-promise-plugin": "^3.2.0" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /yarn-error.log: -------------------------------------------------------------------------------- 1 | Arguments: 2 | D:\nodejs\node.exe C:\Users\Administrator\AppData\Roaming\npm\node_modules\yarn\bin\yarn.js 3 | 4 | PATH: 5 | C:\Users\Administrator\bin;C:\Program Files\Git\mingw64\bin;C:\Program Files\Git\usr\local\bin;C:\Program Files\Git\usr\bin;C:\Program Files\Git\usr\bin;C:\Program Files\Git\mingw64\bin;C:\Program Files\Git\usr\bin;C:\Users\Administrator\bin;C:\windows\system32;C:\windows;C:\windows\System32\Wbem;C:\windows\System32\WindowsPowerShell\v1.0;C:\Program Files\Git\cmd;D:\Program Files\TortoiseGit\bin;D:\nodejs;D:\svn\bin;C:\Users\Administrator\AppData\Local\Microsoft\WindowsApps;C:\Users\Administrator\AppData\Roaming\npm;D:\Microsoft VS Code\bin;C:\Program Files\Git\usr\bin\vendor_perl;C:\Program Files\Git\usr\bin\core_perl 6 | 7 | Yarn version: 8 | 1.5.1 9 | 10 | Node version: 11 | 8.2.1 12 | 13 | Platform: 14 | win32 x64 15 | 16 | npm manifest: 17 | { 18 | "name": "node-spider", 19 | "version": "0.0.0", 20 | "private": true, 21 | "scripts": { 22 | }, 23 | "dependencies": { 24 | "body-parser": "~1.13.1", 25 | "cookie-parser": "~1.3.5", 26 | "debug": "~2.2.0", 27 | "express": "~4.13.0", 28 | "morgan": "~1.6.1", 29 | "node-schedule": "^1.2.0", 30 | "puppeteer": "^0.13.0", 31 | "request": "^2.78.0", 32 | "request-promise": "^4.2.2", 33 | "request-promise-native": "^1.0.5", 34 | "serve-favicon": "~2.3.0", 35 | "superagent": "^2.3.0", 36 | "superagent-charset": "^1.1.1", 37 | "url": "^0.11.0" 38 | }, 39 | "devDependencies": { 40 | "request": "^2.83.0", 41 | "superagent-promise-plugin": "^3.2.0" 42 | } 43 | } 44 | 45 | yarn manifest: 46 | No manifest 47 | 48 | Lockfile: 49 | No lockfile 50 | 51 | Trace: 52 | Error: connect ETIMEDOUT 104.16.63.173:443 53 | at Object.exports._errnoException (util.js:1024:11) 54 | at exports._exceptionWithHostPort (util.js:1047:20) 55 | at TCPConnectWrap.afterConnect [as oncomplete] (net.js:1150:14) 56 | --------------------------------------------------------------------------------