├── .gitignore ├── chromedriver.exe ├── README.md ├── db ├── index.js └── lagou.js ├── package.json ├── duanzi.js ├── duanzi_eazy.js └── lagou.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | debug.* 3 | pics 4 | .vscode -------------------------------------------------------------------------------- /chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianchengLee/crawler-demo/master/chromedriver.exe -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # crawler-demo 2 | 爬虫demo 3 | 4 | ## 技术栈 5 | 6 | node.js + selenuim + mysql 7 | 8 | selenuim访问目标网站, 爬取动态网页数据 9 | 10 | 使用MySQL存储数据 11 | -------------------------------------------------------------------------------- /db/index.js: -------------------------------------------------------------------------------- 1 | const mysql = require('mysql'); 2 | const connection = mysql.createConnection({ 3 | host : 'localhost', 4 | user : 'root', 5 | password : 'root', 6 | database : 'crawler' 7 | }); 8 | 9 | module.exports = connection -------------------------------------------------------------------------------- /db/lagou.js: -------------------------------------------------------------------------------- 1 | const conn = require('./index.js') 2 | 3 | module.exports = { 4 | addData(data) { 5 | conn.query('select count(*) as count from lagou where companyId = ? limit 0, 1', data.companyId, (err, res) => { 6 | if (err) return; 7 | if (res[0].count == 0) { 8 | conn.query('insert into lagou set ?', data) 9 | } 10 | }) 11 | } 12 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "crawler-demo", 3 | "version": "1.0.0", 4 | "main": "index.js", 5 | "scripts": { 6 | "test": "echo \"Error: no test specified\" && exit 1" 7 | }, 8 | "keywords": [], 9 | "author": "", 10 | "license": "ISC", 11 | "description": "", 12 | "dependencies": { 13 | "cheerio": "^1.0.0-rc.2", 14 | "download": "^7.1.0", 15 | "mysql": "^2.16.0", 16 | "selenium-webdriver": "^4.0.0-alpha.1" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /duanzi.js: -------------------------------------------------------------------------------- 1 | const { Builder, By, Key, until } = require('selenium-webdriver') 2 | const { Options } = require('selenium-webdriver/chrome') 3 | const download = require('download') 4 | 5 | let options = new Options().addArguments('--headless') 6 | 7 | let currentPageNum = 1 8 | let maxPage = 10 9 | let url = 'https://www.haha.mx/pic/new/' 10 | let driver = new Builder().forBrowser('chrome').setChromeOptions(options).build() 11 | 12 | start() 13 | 14 | async function start() { 15 | await getData() 16 | } 17 | 18 | async function getData() { 19 | let imgUrls = [] 20 | driver.get(url + currentPageNum) 21 | let els = await driver.findElements(By.css('.joke-main-img-wrapper .joke-main-img')) 22 | for (let i = 0; i < els.length; i++) { 23 | const el = els[i]; 24 | let imgUrl = await el.getAttribute('data-original') 25 | imgUrl = 'https:' + imgUrl 26 | imgUrls.push(imgUrl) 27 | } 28 | downloadImg(imgUrls) 29 | currentPageNum++ 30 | if (currentPageNum <= maxPage) { 31 | await getData() 32 | } 33 | } 34 | 35 | function downloadImg(arr) { 36 | Promise.all(arr.map(x => download(x, 'pics'))).then(() => { 37 | console.log('files downloaded!'); 38 | }); 39 | } -------------------------------------------------------------------------------- /duanzi_eazy.js: -------------------------------------------------------------------------------- 1 | const https = require('https') 2 | const cheerio = require('cheerio') 3 | const download = require('download') 4 | 5 | let currentPage = 1 6 | 7 | const options = { 8 | hostname: 'www.haha.mx', 9 | port: 443, 10 | path: '/good/day/' + currentPage, 11 | method: 'GET' 12 | }; 13 | 14 | 15 | let imgUrls = [] 16 | 17 | const req = https.request(options, (res) => { 18 | // console.log('statusCode:', res.statusCode); 19 | // console.log('headers:', res.headers); 20 | 21 | let chunkArr = [] 22 | res.on('data', (chunk) => { 23 | // process.stdout.write(chunk); 24 | chunkArr.push(chunk) 25 | }); 26 | 27 | res.on('end', () => { 28 | let result = Buffer.concat(chunkArr).toString() 29 | // console.log(result) 30 | let $ = cheerio.load(result) 31 | $('.joke-list-item-main .joke-main-content .joke-main-img').each((index, item) => { 32 | // console.log(index) 33 | // console.log($(item).attr('src')) 34 | console.log($(item).data('original')) 35 | imgUrls.push('https:' + $(item).data('original')) 36 | }) 37 | 38 | Promise.all(imgUrls.map(x => download(x, 'pics'))).then(() => { 39 | console.log('files downloaded!'); 40 | }); 41 | }) 42 | }); 43 | 44 | 45 | req.end(); -------------------------------------------------------------------------------- /lagou.js: -------------------------------------------------------------------------------- 1 | const { Builder, By, Key, until } = require('selenium-webdriver'); 2 | const lagou = require('./db/lagou.js') 3 | // const { Options } = require('selenium-webdriver/chrome'); 4 | 5 | // const options = new Options() 6 | // options.addArguments('Cookie=user_trace_token=20181130095945-889e634a-a79b-4b61-9ced-996eca44b107; X_HTTP_TOKEN=7470c50044327b9a2af2946eaad67653; _ga=GA1.2.2111156102.1543543186; _gid=GA1.2.1593040181.1543543186; LGUID=20181130095946-9c90e147-f443-11e8-87e4-525400f775ce; sajssdk_2015_cross_new_user=1; JSESSIONID=ABAAABAAAGGABCB5E0E82B87052ECD8CED0421F1D36020D; index_location_city=%E5%85%A8%E5%9B%BD; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543543186,1543545866; LGSID=20181130104426-da2fc57f-f449-11e8-87ea-525400f775ce; PRE_UTM=; PRE_HOST=www.cnblogs.com; PRE_SITE=https%3A%2F%2Fwww.cnblogs.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E5%2589%258D%25E7%25AB%25AF%25E5%25BC%2580%25E5%258F%2591%3Fkd%3D%25E5%2589%258D%25E7%25AB%25AF%25E5%25BC%2580%25E5%258F%2591%26spc%3D1%26pl%3D%26gj%3D%26xl%3D%26yx%3D%26gx%3D%26st%3D%26labelWords%3Dlabel%26lc%3D%26workAddress%3D%26city%3D%25E5%2585%25A8%25E5%259B%25BD%26requestId%3D%26pn%3D1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221676257e1bd8cc-060451fc44d124-9393265-2359296-1676257e1be898%22%2C%22%24device_id%22%3A%221676257e1bd8cc-060451fc44d124-9393265-2359296-1676257e1be898%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; ab_test_random_num=0; _putrc=30FD5A7177A00E45123F89F2B170EADC; login=true; unick=%E5%A4%A9%E6%88%90; hasDeliver=0; gate_login_token=3e9da07186150513b28b29e8e74f485b86439e1fd26fc4939d32ed2660e8421a; _gat=1; SEARCH_ID=334cf2a080f44f2fb42841f473719162; LGRID=20181130110855-45ea2d22-f44d-11e8-87ee-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543547335; TG-TRACK-CODE=search_code'); 7 | // options.addArguments('user-agent="Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20') 8 | 9 | // .setChromeOptions(options) 10 | 11 | let currentPageNum = 1; 12 | let maxPageNum = 10; 13 | const url = 'https://www.lagou.com/jobs/list_%E5%89%8D%E7%AB%AF?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=' 14 | let driver = new Builder().forBrowser('chrome').build(); 15 | 16 | start() 17 | 18 | async function start() { 19 | await driver.get(url); 20 | getData() 21 | } 22 | 23 | async function getData() { 24 | console.log(`正在获取第${currentPageNum}页的数据`) 25 | while (true) { 26 | let flag = true 27 | try { 28 | let els = await driver.findElements(By.className('con_list_item')) 29 | let results = [] 30 | for (let i = 0; i < els.length; i++) { 31 | let el = els[i] 32 | let companyId = await el.getAttribute('data-companyid') 33 | let job = await el.findElement(By.tagName('h3')).getText() 34 | let area = await el.findElement(By.tagName('em')).getText() 35 | let money = await el.findElement(By.className('money')).getText() 36 | let link = await el.findElement(By.className('position_link')).getAttribute('href') 37 | let need = await el.findElement(By.css('.p_bot .li_b_l')).getText() 38 | let companyLink = await el.findElement(By.css('.company_name>a')).getAttribute('href') 39 | let companyName = await el.findElement(By.css('.company_name>a')).getText() 40 | let companyIcon = await el.findElement(By.css('.com_logo img')).getAttribute('src') 41 | let industry = await el.findElement(By.className('industry')).getText() 42 | let tags = await el.findElement(By.css('.list_item_bot .li_b_l')).getText() 43 | let welfare = await el.findElement(By.css('.list_item_bot .li_b_r')).getText() 44 | need = need.replace(/\d+k-\d+k/, '') 45 | // console.log(id, job, area, money, link, need, companyLink, industry, tags, welfare) 46 | results.push({ 47 | companyId, 48 | job, 49 | area, 50 | link, 51 | money, 52 | companyName, 53 | companyLink, 54 | companyIcon, 55 | need, 56 | industry, 57 | tags, 58 | welfare, 59 | }) 60 | } 61 | 62 | console.log(results) 63 | 64 | results.forEach(data => { 65 | lagou.addData(data) 66 | }) 67 | 68 | currentPageNum++ 69 | if (currentPageNum <= maxPageNum) { 70 | await driver.findElement(By.className('pager_next')).click() 71 | // console.log(result, driver) 72 | await getData(driver) 73 | // driver.executeScript(`window.open('${url}')`) 74 | } 75 | } catch (e) { 76 | if (e) { 77 | flag = false 78 | } 79 | } finally { 80 | if (flag) { 81 | break 82 | } 83 | } 84 | 85 | } 86 | } 87 | 88 | // const {URL} = require('url') 89 | 90 | // const options = new URL('https://www.lagou.com/jobs/list_%E5%89%8D%E7%AB%AF?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='); 91 | 92 | // const options = { 93 | // hostname: 'www.lagou.com', 94 | // port: 443, 95 | // path: '/jobs/list_%E5%89%8D%E7%AB%AF?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=', 96 | // method: 'GET', 97 | // headers: { 98 | // 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 99 | // 'Accept-Encoding': 'utf-8', //这里设置返回的编码方式 设置其他的会是乱码 100 | // 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 101 | // 'Connection': 'keep-alive', 102 | // 'Cookie': 'user_trace_token=20181130095945-889e634a-a79b-4b61-9ced-996eca44b107; X_HTTP_TOKEN=7470c50044327b9a2af2946eaad67653; _ga=GA1.2.2111156102.1543543186; _gid=GA1.2.1593040181.1543543186; LGUID=20181130095946-9c90e147-f443-11e8-87e4-525400f775ce; sajssdk_2015_cross_new_user=1; JSESSIONID=ABAAABAAAGGABCB5E0E82B87052ECD8CED0421F1D36020D; index_location_city=%E5%85%A8%E5%9B%BD; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543543186,1543545866; LGSID=20181130104426-da2fc57f-f449-11e8-87ea-525400f775ce; PRE_UTM=; PRE_HOST=www.cnblogs.com; PRE_SITE=https%3A%2F%2Fwww.cnblogs.com%2F; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_%25E5%2589%258D%25E7%25AB%25AF%25E5%25BC%2580%25E5%258F%2591%3Fkd%3D%25E5%2589%258D%25E7%25AB%25AF%25E5%25BC%2580%25E5%258F%2591%26spc%3D1%26pl%3D%26gj%3D%26xl%3D%26yx%3D%26gx%3D%26st%3D%26labelWords%3Dlabel%26lc%3D%26workAddress%3D%26city%3D%25E5%2585%25A8%25E5%259B%25BD%26requestId%3D%26pn%3D1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221676257e1bd8cc-060451fc44d124-9393265-2359296-1676257e1be898%22%2C%22%24device_id%22%3A%221676257e1bd8cc-060451fc44d124-9393265-2359296-1676257e1be898%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; ab_test_random_num=0; sm_auth_id=v6e99z7u633siv3o; _putrc=30FD5A7177A00E45123F89F2B170EADC; login=true; unick=%E5%A4%A9%E6%88%90; hasDeliver=0; gate_login_token=3e9da07186150513b28b29e8e74f485b86439e1fd26fc4939d32ed2660e8421a; _gat=1; TG-TRACK-CODE=index_search; LGRID=20181130110854-4519882e-f44d-11e8-8ca7-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1543547334; SEARCH_ID=5c8084bf98d1470387b14b181d411bb0', 103 | // 'DNT': '1', 104 | // 'Host': 'www.lagou.com', 105 | // 'Referer': 'https://www.lagou.com/jobs/list_%E5%89%8D%E7%AB%AF?px=default&city=%E5%85%A8%E5%9B%BD', 106 | // 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36' 107 | // } 108 | // }; 109 | 110 | // const req = https.request(options, res => { 111 | // console.log('statusCode:', res.statusCode) 112 | // console.log('headers:', res.headers) 113 | // res.on('data', (d) => { 114 | // // process.stdout.write(d); 115 | // let sourceHtmlStr = d.toString() 116 | // // console.log(sourceHtmlStr) 117 | // const $ = cheerio.load(sourceHtmlStr) 118 | // console.log($.html()) 119 | // }) 120 | // }) 121 | 122 | // req.on('error', (e) => { 123 | // console.error(e); 124 | // }); 125 | // req.end(); --------------------------------------------------------------------------------