├── scripts ├── meliBdDao.js ├── meliLibs.js ├── meliGetData.js ├── meliParsers.js └── meliEntryPoint.js ├── .gitignore ├── README.md ├── example.png ├── .vscode └── settings.json ├── .env ├── .sequelizerc ├── database ├── config │ └── config.js └── models │ └── index.js ├── package.json └── index.js /scripts/meliBdDao.js: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules 2 | .env 3 | .env 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | "# mercadolibre-scrapper-nodejs" 2 | -------------------------------------------------------------------------------- /example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/damiansire/mercadolibre-scrapper-nodejs/HEAD/example.png -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "spellright.language": ["es"], 3 | "spellright.documentTypes": ["markdown", "latex", "plaintext"] 4 | } 5 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | PGUSER=xcztwcqefnroqp 2 | PGHOST=ec2-52-87-123-108.compute-1.amazonaws.com 3 | PGPASSWORD=ff98efaf9c355f3dba58e8b2c9ded96a1610698469b8443eb0de57b27a084ecd 4 | PGDATABASE=dcf80f6icsc0q5 5 | PGPORT=5432 -------------------------------------------------------------------------------- /.sequelizerc: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | 3 | module.exports = { 4 | config: path.resolve("./database/config", "config.js"), 5 | "models-path": path.resolve("./database/models"), 6 | "seeders-path": path.resolve("./database/seeders"), 7 | "migrations-path": path.resolve("./database/migrations"), 8 | }; 9 | -------------------------------------------------------------------------------- /database/config/config.js: -------------------------------------------------------------------------------- 1 | require("dotenv").config(); 2 | 3 | module.exports = { 4 | development: { 5 | url: process.env.DEV_DATABASE_URL, 6 | dialect: "postgres", 7 | }, 8 | test: { 9 | url: process.env.TEST_DATABASE_URL, 10 | dialect: "postgres", 11 | }, 12 | production: { 13 | url: process.env.DATABASE_URL, 14 | dialect: "postgres", 15 | }, 16 | }; 17 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scrapper-mercadolibre-node-js", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js", 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "keywords": [], 11 | "author": "Damian Sire ", 12 | "license": "ISC", 13 | "dependencies": { 14 | "dotenv": "^10.0.0", 15 | "pg": "^8.7.1", 16 | "pg-hstore": "^2.3.4", 17 | "puppeteer": "^11.0.0", 18 | "sequelize": "^6.9.0", 19 | "sequelize-cli": "^6.3.0" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const ParserHandler = require("./scripts/meliEntryPoint"); 2 | 3 | async function initApp() { 4 | console.log("Iniciando"); 5 | const parserHandler = new ParserHandler(); 6 | await parserHandler.setup(); 7 | console.info("Obteniendo los links para enviar a parsear"); 8 | const barrios = [ 9 | "centro", 10 | "ciudad-vieja", 11 | "cordon", 12 | "parque-rodo", 13 | "pocitos-nuevo", 14 | "tres-cruces", 15 | ]; 16 | for (let index = 0; index < barrios.length; index++) { 17 | await parserHandler.sendToParserFromBarrio(barrios[index]); 18 | } 19 | //await parserHandler.sendToParserForToday(); 20 | console.info("Comenzando a parsear la informacion de las casas"); 21 | await parserHandler.startPendingParser(); 22 | process.exit(0); 23 | } 24 | 25 | initApp(); 26 | 27 | /* 28 | __PRELOADED_STATE__.initialState.components.gallery.picture_config 29 | */ 30 | -------------------------------------------------------------------------------- /database/models/index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const fs = require('fs'); 4 | const path = require('path'); 5 | const Sequelize = require('sequelize'); 6 | const basename = path.basename(__filename); 7 | const env = process.env.NODE_ENV || 'development'; 8 | const config = require(__dirname + '/../config/config.js')[env]; 9 | const db = {}; 10 | 11 | let sequelize; 12 | if (config.use_env_variable) { 13 | sequelize = new Sequelize(process.env[config.use_env_variable], config); 14 | } else { 15 | sequelize = new Sequelize(config.database, config.username, config.password, config); 16 | } 17 | 18 | fs 19 | .readdirSync(__dirname) 20 | .filter(file => { 21 | return (file.indexOf('.') !== 0) && (file !== basename) && (file.slice(-3) === '.js'); 22 | }) 23 | .forEach(file => { 24 | const model = require(path.join(__dirname, file))(sequelize, Sequelize.DataTypes); 25 | db[model.name] = model; 26 | }); 27 | 28 | Object.keys(db).forEach(modelName => { 29 | if (db[modelName].associate) { 30 | db[modelName].associate(db); 31 | } 32 | }); 33 | 34 | db.sequelize = sequelize; 35 | db.Sequelize = Sequelize; 36 | 37 | module.exports = db; 38 | -------------------------------------------------------------------------------- /scripts/meliLibs.js: -------------------------------------------------------------------------------- 1 | function generatePageUrl( 2 | pageNumber, 3 | barrio = "pocitos", 4 | departamento = "montevideo" 5 | ) { 6 | let fromText = ""; 7 | if (pageNumber > 1) { 8 | let fromNumber = 1 + 48 * (pageNumber - 1); 9 | fromText = `_Desde_${fromNumber}`; 10 | } 11 | 12 | const pageUrl = `https://listado.mercadolibre.com.uy/inmuebles/apartamentos/alquiler/1-dormitorio/${departamento}/${barrio}/${fromText}_PriceRange_10000UYU-22000UYU_NoIndex_True`; 13 | 14 | return pageUrl; 15 | } 16 | 17 | function generateForTodayPageUrl(pageNumber, departamento = "montevideo") { 18 | let fromText = ""; 19 | if (pageNumber >= 1) { 20 | let fromNumber = 1 + 48 * (pageNumber - 1); 21 | fromText = `_Desde_${fromNumber}`; 22 | } 23 | 24 | const pageUrl = `https://listado.mercadolibre.com.uy/inmuebles/apartamentos/alquiler/${departamento}/${fromText}_PublishedToday_YES_NoIndex_True`; 25 | 26 | return pageUrl; 27 | } 28 | 29 | function attributeTextToDataBaseName(text) { 30 | const nameInDataBase = { 31 | "Superficie total": "superficietotal", 32 | "Área privada": "superficie", 33 | Ambientes: "ambientes", 34 | Dormitorios: "dormitorios", 35 | Baños: "baños", 36 | Cocheras: "cocheras", 37 | "Número de piso de la unidad": "numerodepisodellaunidad", 38 | Antigüedad: "antiguedad", 39 | "Tipo de departamento": "tipo", 40 | "Gastos comunes": "gastoscomunes", 41 | Disposición: "disposicion", 42 | Orientación: "orientacion", 43 | "Admite mascotas": "admitemascotas", 44 | "Apartamentos por piso": "apartamentosporpiso", 45 | "Cantidad de pisos": "cantidaddepisos", 46 | Bodegas: "bodegas", 47 | }; 48 | if (nameInDataBase[text]) { 49 | return nameInDataBase[text]; 50 | } else { 51 | console.log(`La columna ${text} falta en la db`); 52 | } 53 | } 54 | 55 | function getViviendaIdFromUrl(url) { 56 | const idData = url.split("/")[3].split("-"); 57 | return `${idData[0]}${idData[1]}`; 58 | } 59 | 60 | module.exports = { 61 | generatePageUrl, 62 | attributeTextToDataBaseName, 63 | generateForTodayPageUrl, 64 | getViviendaIdFromUrl, 65 | }; 66 | -------------------------------------------------------------------------------- /scripts/meliGetData.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require("puppeteer"); 2 | const { generatePageUrl, generateForTodayPageUrl } = require("./meliLibs"); 3 | const { 4 | SearchPageParser, 5 | HousePreviewParser, 6 | HousePageParser, 7 | } = require("./meliParsers"); 8 | 9 | class MeliData { 10 | async initBrowser() { 11 | this.browser = await puppeteer.launch(); 12 | this.page = await this.browser.newPage(); 13 | await this.page.setViewport({ 14 | width: 1920, 15 | height: 1080, 16 | deviceScaleFactor: 1, 17 | }); 18 | } 19 | 20 | async getPageAmountForBarrio(barrioName) { 21 | const pageUrl = generatePageUrl(1); 22 | //Voy a la url con los apartamento 23 | await this.page.goto(pageUrl); 24 | //Aca selecciono un apartamento 25 | 26 | const pageAmount = await SearchPageParser.getPageAmount(this.page); 27 | 28 | return Number(pageAmount); 29 | } 30 | 31 | async getPageAmountForToday() { 32 | const pageUrl = generateForTodayPageUrl(1); 33 | //Voy a la url con los apartamento 34 | await this.page.goto(pageUrl); 35 | //Aca selecciono un apartamento 36 | 37 | const pageAmount = await SearchPageParser.getPageAmount(this.page); 38 | 39 | return Number(pageAmount); 40 | } 41 | 42 | async getApartamentsLinks(pageNumber, barrio) { 43 | const pageUrl = generatePageUrl(pageNumber, barrio); 44 | //Voy a la url con los apartamento 45 | await this.page.goto(pageUrl); 46 | //Aca selecciono un apartamento 47 | const housesElement = await this.page.$$(".ui-search-layout__item"); 48 | 49 | //Todo: Cambiar esto por promise.all 50 | let housesData = []; 51 | for (let houseElement of housesElement) { 52 | const houseData = await HousePreviewParser.getLink(houseElement); 53 | const link = houseData.link.split("#")[0]; 54 | housesData.push(link); 55 | } 56 | 57 | return housesData; 58 | } 59 | 60 | async getApartamentsLinksForToday(pageNumber) { 61 | const pageUrl = generateForTodayPageUrl(pageNumber); 62 | //Voy a la url con los apartamento 63 | await this.page.goto(pageUrl); 64 | //Aca selecciono un apartamento 65 | const housesElement = await this.page.$$(".ui-search-layout__item"); 66 | 67 | //Todo: Cambiar esto por promise.all 68 | let housesData = []; 69 | for (let houseElement of housesElement) { 70 | const houseData = await HousePreviewParser.getLink(houseElement); 71 | const link = houseData.link.split("#")[0]; 72 | housesData.push(link); 73 | } 74 | 75 | return housesData; 76 | } 77 | 78 | async getHouseDataFromUrl(url) { 79 | //Voy a la url con los apartamento 80 | await this.page.goto(url); 81 | const result = await HousePageParser.parserHousePage(this.page); 82 | return result; 83 | } 84 | 85 | async getImageDataFromUrl(url) { 86 | await this.page.goto(url); 87 | const imagesLinks = await HousePageParser.parserAllImg(this.page); 88 | return imagesLinks; 89 | } 90 | } 91 | 92 | module.exports = MeliData; 93 | -------------------------------------------------------------------------------- /scripts/meliParsers.js: -------------------------------------------------------------------------------- 1 | const { 2 | attributeTextToDataBaseName, 3 | getViviendaIdFromUrl, 4 | } = require("./meliLibs"); 5 | 6 | class SearchPageParser { 7 | static async getPageAmount(page) { 8 | const pageNumberText = await page.$eval( 9 | ".andes-pagination__page-count", 10 | (data) => data.innerText 11 | ); 12 | 13 | const pageAmount = pageNumberText.split(" ")[1]; 14 | 15 | return pageAmount; 16 | } 17 | } 18 | 19 | class HousePreviewParser { 20 | static async getLink(houseElement) { 21 | const linkSelector = ".ui-search-link"; 22 | 23 | let houseData = {}; 24 | 25 | houseData["link"] = await houseElement.$eval( 26 | linkSelector, 27 | (data) => data.href 28 | ); 29 | 30 | return houseData; 31 | } 32 | } 33 | 34 | class HousePageParser { 35 | static async parserHousePage(page) { 36 | const textAttributeToScrap = { 37 | location: ".ui-vip-location .ui-pdp-media__title", 38 | }; 39 | 40 | const dataPreloaded = await page.evaluate(() => { 41 | return __PRELOADED_STATE__.initialState.components; 42 | }); 43 | 44 | let houseData = {}; 45 | 46 | houseData.price = dataPreloaded.price.price.value; 47 | houseData.priceCurrency = dataPreloaded.price.price.currency_symbol; 48 | houseData.title = dataPreloaded.header.title; 49 | 50 | //Obtengo los label 51 | const labelText = await page.$$eval( 52 | ".andes-table__body tr .ui-pdp-specs__table__column-title", 53 | (data) => data.map((anchor) => anchor.innerText) 54 | ); 55 | 56 | //Obtengo los values 57 | const valueText = await page.$$eval( 58 | ".andes-table__body tr span.andes-table__column--value", 59 | (data) => data.map((anchor) => anchor.innerText) 60 | ); 61 | 62 | for (let index = 0; index < labelText.length; index++) { 63 | const dataBaseLabelName = attributeTextToDataBaseName(labelText[index]); 64 | houseData[dataBaseLabelName] = valueText[index]; 65 | } 66 | 67 | houseData.link = await page.url(); 68 | houseData.id = getViviendaIdFromUrl(houseData.link); 69 | houseData.calle = dataPreloaded.location.map_info?.item_address; 70 | 71 | try { 72 | const locationData = 73 | dataPreloaded.location.map_info?.item_location?.split(",") || [ 74 | null, 75 | null, 76 | ]; 77 | houseData.barrio = locationData[0]; 78 | houseData.ciudad = locationData[1]; 79 | } catch (err) { 80 | const locationData = 81 | dataPreloaded.location.map_info.item_location.split(","); 82 | console.error( 83 | "Hay un problema con la locacion de", 84 | houseData.id, 85 | " ", 86 | locationData 87 | ); 88 | } 89 | 90 | //Parseo gastos comunes 91 | try { 92 | const gastosComunesPartes = houseData.gastoscomunes.split(" "); 93 | houseData.gastoscomunes = gastosComunesPartes[0]; 94 | houseData.gastoscomunescurrency = gastosComunesPartes[1]; 95 | } catch (error) { 96 | console.error( 97 | "Hay problemas con los gastos comunes de ", 98 | houseData.id, 99 | " ", 100 | houseData.gastosComunes 101 | ); 102 | throw new Error("Problema con los gastos comunes"); 103 | } 104 | 105 | return houseData; 106 | } 107 | 108 | static async parserAllImg(page) { 109 | const { picture_config, pictures } = await page.evaluate(() => { 110 | return __PRELOADED_STATE__.initialState.components.gallery; 111 | }); 112 | 113 | return pictures.map((img) => 114 | picture_config.template.replace("{id}", img.id) 115 | ); 116 | } 117 | } 118 | 119 | //La preview es lo que se ve en la pagina de busqueda 120 | 121 | module.exports = { 122 | SearchPageParser, 123 | HousePreviewParser, 124 | HousePageParser, 125 | }; 126 | -------------------------------------------------------------------------------- /scripts/meliEntryPoint.js: -------------------------------------------------------------------------------- 1 | const { getViviendaIdFromUrl } = require("./meliLibs"); 2 | require("dotenv").config(); 3 | const { Pool } = require("pg"); 4 | const MeliData = require("./meliGetData"); 5 | 6 | class ParserHandler { 7 | async setup() { 8 | this.meliData = new MeliData(); 9 | await this.meliData.initBrowser(); 10 | } 11 | 12 | async sendToParserFromBarrio(barrio) { 13 | const pageAmount = await this.meliData.getPageAmountForBarrio(barrio); 14 | for (let actualPage = 1; actualPage <= pageAmount; actualPage++) { 15 | const apartamentLink = await this.meliData.getApartamentsLinks( 16 | actualPage, 17 | barrio 18 | ); 19 | try { 20 | await saveApartamentsLinks(apartamentLink); 21 | } catch (err) { 22 | await deletePendingParser(apartamentLink); 23 | if (!(err.constraint == "pendingsaves_pk")) { 24 | throw new Error( 25 | `Problemas con el apartamento: ${apartamentLink} : ${err.name} : ${err.message}` 26 | ); 27 | } else { 28 | console.log("Ya estaba el apartamento, se ignora"); 29 | } 30 | } 31 | } 32 | } 33 | 34 | async sendToParserForToday() { 35 | const pageAmount = await this.meliData.getPageAmountForToday(); 36 | console.info(`Se van a parsear ${pageAmount} paginas`); 37 | for (let actualPage = pageAmount; actualPage <= pageAmount; actualPage++) { 38 | console.log("Parseando la pagina 1 de ", barrio); 39 | try { 40 | console.info(`Obteniendo apartamentos para la pagina ${actualPage}`); 41 | const apartamentLink = await this.meliData.getApartamentsLinksForToday( 42 | actualPage 43 | ); 44 | await saveApartamentsLinks(apartamentLink); 45 | } catch (err) { 46 | if (!(err.constraint == "pendingsaves_pk")) { 47 | throw new Error( 48 | `Problemas con el apartamento: ${apartamentData.link} : ${err.name} : ${err.message}` 49 | ); 50 | } 51 | throw new Error(err); 52 | } 53 | } 54 | } 55 | 56 | //Empieza a parsear las casas pendientes 57 | async startPendingParser() { 58 | let apartamentList = await getPagesToParser(1); 59 | while (apartamentList.length) { 60 | for (const apartament of apartamentList) { 61 | try { 62 | await this.parserApartamentData(apartament); 63 | await this.parserImage(apartament); 64 | await deletePendingParser(apartament.link); 65 | console.info( 66 | `\n \n El apartamento ${apartament.link} fue parseado correctamente \n \n` 67 | ); 68 | } catch (err) { 69 | console.log(err.message); 70 | } 71 | } 72 | apartamentList = await getPagesToParser(1); 73 | } 74 | } 75 | 76 | async parserApartamentData(apartament) { 77 | try { 78 | console.info(`Parseando apartamento ${apartament.id}`); 79 | if (apartament.id == 9321) { 80 | debugger; 81 | } 82 | const result = await this.meliData.getHouseDataFromUrl(apartament.link); 83 | console.info(`Guardando apartamento ${apartament.id}`); 84 | await saveApartamentData(result); 85 | } catch (error) { 86 | console.log("Error."); 87 | if ( 88 | error?.constraint == "viviendas_pkey" || 89 | error == "Problema con los gastos comunes" 90 | ) { 91 | await deletePendingParser(apartamentData.link); 92 | } else { 93 | console.error(`${error.name} : ${error.message}`); 94 | await logError(apartament.link, error.message); 95 | await deletePendingParser(apartament.link); 96 | throw new Error(`Problemas con el apartamento: ${apartament.link}`); 97 | } 98 | } 99 | } 100 | 101 | async parserImage(apartament) { 102 | console.info(`Parseando imagenes de ${apartament.link}`); 103 | const imagesLinks = await this.meliData.getImageDataFromUrl( 104 | apartament.link 105 | ); 106 | console.info(`Guardando imagenes de ${apartament.link}`); 107 | const viviendaId = getViviendaIdFromUrl(apartament.link); 108 | await saveImagesLink(imagesLinks, viviendaId); 109 | } 110 | } 111 | 112 | //Base de datos 113 | 114 | const config = { 115 | user: process.env.PGUSER, 116 | host: process.env.PGHOST, 117 | database: process.env.PGDATABASE, 118 | password: process.env.PGPASSWORD, 119 | port: process.env.PGPORT, 120 | ssl: { 121 | rejectUnauthorized: false, 122 | }, 123 | }; 124 | 125 | const pool = new Pool(config); 126 | 127 | async function saveApartamentsLinks(links) { 128 | const client = await pool.connect(); 129 | try { 130 | for (let link of links) { 131 | const text = `INSERT INTO public.pendingparser(link) VALUES($1) RETURNING *`; 132 | const values = [link]; 133 | const res = await client.query(text, values); 134 | } 135 | } catch (err) { 136 | throw err; 137 | } finally { 138 | client.release(); 139 | } 140 | } 141 | 142 | async function saveApartamentData(apartamentData) { 143 | let amount = Object.keys(apartamentData).length; 144 | let valuesKeys = ""; 145 | for (let index = 1; index <= amount; index++) { 146 | if (index == 1) { 147 | valuesKeys = "$1"; 148 | } else { 149 | valuesKeys += `,$${index}`; 150 | } 151 | } 152 | const fields = Object.keys(apartamentData).join(","); 153 | const text = `INSERT INTO public.viviendas(${fields}) VALUES(${valuesKeys}) RETURNING *`; 154 | const values = Object.values(apartamentData); 155 | console.log("Conectando al pool"); 156 | let client; 157 | try { 158 | client = await pool.connect(); 159 | } catch (err) { 160 | console.log(err); 161 | } 162 | console.log("Coneccion al pool ok"); 163 | try { 164 | //Si da un error el insert, va al catch 165 | console.log("Haciendo query de guardado."); 166 | const res = await client.query(text, values); 167 | } catch (err) { 168 | console.log("Error. Capa Query a bd."); 169 | throw new Error(err); 170 | } finally { 171 | client.release(); 172 | } 173 | } 174 | 175 | async function saveImagesLink(imagesLink, viviendaId) { 176 | for (let imgLink of imagesLink) { 177 | await saveImg(imgLink, viviendaId); 178 | } 179 | } 180 | 181 | async function saveImg(imgLink, viviendaId) { 182 | //Cuidado la inyeccion sql 183 | const text = `INSERT INTO public.imagenes(viviendaid,imageurl) VALUES($1,$2) RETURNING *`; 184 | const values = [viviendaId, imgLink]; 185 | const client = await pool.connect(); 186 | try { 187 | const res = await client.query(text, values); 188 | } catch (err) { 189 | console.error(err.message); 190 | throw new Error( 191 | `Problemas la imagen ${imgLink} del apartamento ${viviendaId}` 192 | ); 193 | } finally { 194 | client.release(); 195 | } 196 | } 197 | 198 | async function getPagesToParser(numberOfLinks) { 199 | const client = await pool.connect(); 200 | let res; 201 | try { 202 | const getElementQuery = `SELECT * FROM public.pendingparser limit ${numberOfLinks}`; 203 | res = await client.query(getElementQuery); 204 | } catch (err) { 205 | console.log(err.stack); 206 | } finally { 207 | client.release(); 208 | } 209 | return res.rows; 210 | } 211 | 212 | async function deletePendingParser(apartamentLink) { 213 | //Cuidado la inyeccion sql 214 | const text = `DELETE FROM public.pendingparser WHERE link = $1 RETURNING *`; 215 | const values = [apartamentLink]; 216 | const client = await pool.connect(); 217 | try { 218 | const res = await client.query(text, values); 219 | } catch (err) { 220 | console.error(err.message); 221 | throw new Error( 222 | `Problemas al eliminar de la tabla pendingParser el apartamento ${apartamentLink}` 223 | ); 224 | } finally { 225 | client.release(); 226 | } 227 | } 228 | 229 | async function logError(link, error) { 230 | const text = `INSERT INTO public.error (link, error) VALUES($1, $2) RETURNING *`; 231 | const values = [link, error]; 232 | const client = await pool.connect(); 233 | try { 234 | const res = await client.query(text, values); 235 | } catch (err) { 236 | console.error(err.stack); 237 | throw new Error("No se ha podido insertar el error"); 238 | } finally { 239 | client.release(); 240 | } 241 | } 242 | 243 | module.exports = ParserHandler; 244 | --------------------------------------------------------------------------------