├── .gitignore ├── .babelrc ├── helpers ├── common.js └── puppeteer.js ├── handlers ├── saver.js └── listItemsHandler.js ├── package.json └── index.js /.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | node_modules 3 | -------------------------------------------------------------------------------- /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["@babel/preset-env"] 3 | } 4 | 5 | -------------------------------------------------------------------------------- /helpers/common.js: -------------------------------------------------------------------------------- 1 | export function arrayFromLength(number) { 2 | return Array.from(new Array(number).keys()).map(k => k+1) 3 | } 4 | 5 | export function formatPrice(priceStr) { 6 | const priceArr = priceStr.split(' – ') 7 | const [low, high] = priceArr.map(price => parseInt(price.replace(/[^0-9]/g, ''))) 8 | 9 | return { 10 | low, 11 | high 12 | } 13 | } 14 | 15 | export function formatPeriod(periodStr) { 16 | const periodArr = periodStr.split(' – ') 17 | const [start, end] = periodArr.map(year => parseInt(year)) 18 | 19 | return { 20 | start, 21 | end 22 | } 23 | } -------------------------------------------------------------------------------- /handlers/saver.js: -------------------------------------------------------------------------------- 1 | import path from 'path'; 2 | import fs from 'fs'; 3 | import chalk from 'chalk'; 4 | 5 | export default async function saveData(data) { 6 | const { code } = data; 7 | const fileName = `${code}.json`; 8 | const savePath = path.join(__dirname, '..', 'data', fileName); 9 | 10 | return new Promise((resolve, reject) => { 11 | fs.writeFile(savePath, JSON.stringify(data, null, 4), err => { 12 | if (err) { 13 | return reject(err); 14 | } 15 | 16 | console.log(chalk.blue('File was saved successfully: ') + chalk.blue.bold(fileName) + '\n'); 17 | 18 | resolve(); 19 | }); 20 | }); 21 | } 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scrapper", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "npm run clean && babel-node index.js", 8 | "clean": "rm -rf data && mkdir data " 9 | }, 10 | "keywords": [], 11 | "author": "", 12 | "license": "ISC", 13 | "dependencies": { 14 | "@babel/cli": "^7.4.4", 15 | "@babel/core": "^7.4.5", 16 | "@babel/node": "^7.4.5", 17 | "@babel/preset-env": "^7.4.5", 18 | "async": "^3.1.0", 19 | "chalk": "^2.4.2", 20 | "cherio": "^1.0.0-rc.2", 21 | "puppeteer": "^1.18.1", 22 | "transliteration": "^2.1.4" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /helpers/puppeteer.js: -------------------------------------------------------------------------------- 1 | import puppeteer from 'puppeteer'; 2 | 3 | export const LAUNCH_PUPPETEER_OPTS = { 4 | args: [ 5 | '--no-sandbox', 6 | '--disable-setuid-sandbox', 7 | '--disable-dev-shm-usage', 8 | '--disable-accelerated-2d-canvas', 9 | '--disable-gpu', 10 | '--window-size=1920x1080' 11 | ] 12 | }; 13 | 14 | export const PAGE_PUPPETEER_OPTS = { 15 | networkIdle2Timeout: 5000, 16 | waitUntil: 'networkidle2', 17 | timeout: 3000000 18 | }; 19 | 20 | export class PuppeteerHandler { 21 | constructor() { 22 | this.browser = null; 23 | } 24 | async initBrowser() { 25 | this.browser = await puppeteer.launch(LAUNCH_PUPPETEER_OPTS); 26 | } 27 | closeBrowser() { 28 | this.browser.close(); 29 | } 30 | async getPageContent(url) { 31 | if (!this.browser) { 32 | await this.initBrowser(); 33 | } 34 | 35 | try { 36 | const page = await this.browser.newPage(); 37 | await page.goto(url, PAGE_PUPPETEER_OPTS); 38 | const content = await page.content(); 39 | return content; 40 | } catch (err) { 41 | throw err; 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /handlers/listItemsHandler.js: -------------------------------------------------------------------------------- 1 | import cherio from 'cherio'; 2 | import chalk from 'chalk'; 3 | 4 | import saveData from './saver'; 5 | import { formatPrice, formatPeriod } from '../helpers/common'; 6 | import { taskQueue, p } from '../index'; 7 | 8 | const task = async initialData => { 9 | try { 10 | console.log(chalk.green(`Getting data from: `) + chalk.green.bold(initialData.url)); 11 | const detailContent = await p.getPageContent(initialData.url); 12 | const $ = cherio.load(detailContent); 13 | 14 | let period = $('.catalog-generation-summary__desc_period') 15 | .clone() 16 | .children() 17 | .remove() 18 | .end() 19 | .text(); 20 | 21 | const priceNewStr = $( 22 | '.catalog-generation-summary__info .catalog-generation-summary__desc:nth-of-type(2)' 23 | ).text(); 24 | 25 | const priceWithMileageStr = $( 26 | '.catalog-generation-summary__info .catalog-generation-summary__desc:nth-of-type(3)' 27 | ).text(); 28 | 29 | let priceNew = priceNewStr ? formatPrice(priceNewStr) : null; 30 | let priceWithMileage = priceWithMileageStr ? formatPrice(priceWithMileageStr) : null; 31 | period = formatPeriod(period); 32 | 33 | if (!priceWithMileage && priceNew) { 34 | priceWithMileage = priceNew; 35 | priceNew = null; 36 | } 37 | 38 | await saveData({ 39 | ...initialData, 40 | priceNew, 41 | priceWithMileage, 42 | period 43 | }); 44 | } catch (err) { 45 | throw err; 46 | } 47 | }; 48 | export default function listItemsHandler(data) { 49 | data.forEach(initialData => { 50 | taskQueue.push( 51 | () => task(initialData), 52 | err => { 53 | if (err) { 54 | console.log(err); 55 | throw new Error('Error getting data from url[ ' + initialData.url + ' ]'); 56 | } 57 | console.log(chalk.green.bold(`Success getting data from: \n${initialData.url}\n`)); 58 | } 59 | ); 60 | }); 61 | } 62 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | import cherio from 'cherio'; 2 | import chalk from 'chalk'; 3 | import { slugify } from 'transliteration'; 4 | 5 | import listItemsHandler from './handlers/listItemsHandler'; 6 | import { arrayFromLength } from './helpers/common'; 7 | import { PuppeteerHandler } from './helpers/puppeteer'; 8 | import queue from 'async/queue'; 9 | 10 | const SITE = 'https://auto.ru/catalog/cars/all/?page_num='; 11 | const pages = 4; 12 | const concurrency = 10; 13 | const startTime = new Date(); 14 | 15 | export const p = new PuppeteerHandler(); 16 | export const taskQueue = queue(async (task, done) => { 17 | try { 18 | await task(); 19 | console.log(chalk.bold.magenta('Task completed, tasks left: ' + taskQueue.length() + '\n')); 20 | done(); 21 | } catch (err) { 22 | throw err; 23 | } 24 | }, concurrency); 25 | 26 | taskQueue.drain(function() { 27 | const endTime = new Date(); 28 | console.log(chalk.green.bold(`🎉 All items completed [${(endTime - startTime) / 1000}s]\n`)); 29 | p.closeBrowser(); 30 | process.exit(); 31 | }); 32 | 33 | (function main() { 34 | arrayFromLength(pages).forEach(page => { 35 | taskQueue.push( 36 | () => listPageHandle(`${SITE}${page}`), 37 | err => { 38 | if (err) { 39 | console.log(err); 40 | throw new Error('🚫 Error getting data from page#' + page); 41 | } 42 | console.log(chalk.green.bold(`Completed getting data from page#${page}\n`)); 43 | } 44 | ); 45 | }); 46 | })(); 47 | 48 | async function listPageHandle(url) { 49 | try { 50 | const pageContent = await p.getPageContent(url); 51 | const $ = cherio.load(pageContent); 52 | const carsItems = []; 53 | 54 | $('.mosaic__title').each((i, header) => { 55 | const url = $(header).attr('href'); 56 | const title = $(header).text(); 57 | 58 | carsItems.push({ 59 | title, 60 | url, 61 | code: slugify(title) 62 | }); 63 | }); 64 | listItemsHandler(carsItems); 65 | } catch (err) { 66 | console.log(chalk.red('An error has occured \n')); 67 | console.log(err); 68 | } 69 | } 70 | --------------------------------------------------------------------------------