/*
 * Repository: acquizition-puppeteer-scraper
 *
 * Layout:
 *   ├── README.md      ("# acquizition-puppeteer-scraper")
 *   ├── package.json
 *   └── index.js       (this file)
 *
 * package.json:
 *   {
 *     "name": "cron-acquizition",
 *     "version": "1.0.0",
 *     "description": "",
 *     "main": "index.js",
 *     "scripts": {
 *       "start": "nodemon index.js"
 *     },
 *     "author": "",
 *     "license": "ISC",
 *     "dependencies": {
 *       "csv": "^6.2.0",
 *       "firebase-admin": "^11.0.1",
 *       "puppeteer": "^16.2.0"
 *     }
 *   }
 *
 * NOTE(review): this file requires "csv-parse" while package.json declares
 * "csv", and the start script uses nodemon which is not a declared
 * dependency — confirm both resolve in the deployment environment.
 *
 * What this script does, in order:
 *   1. Loads ./data/canadacities.csv into memory.
 *   2. Logs in to acquizition.biz with Puppeteer and scrapes every listing of
 *      every configured category.
 *   3. Stores listings whose business name is not yet known in Firestore and
 *      rebuilds the derived meta / histogram collections.
 */

const fs = require('fs');
const puppeteer = require("puppeteer");
const { parse } = require("csv-parse");

const { initializeApp, applicationDefault } = require('firebase-admin/app');
const { getFirestore } = require('firebase-admin/firestore');

initializeApp({
  credential: applicationDefault(),
  databaseURL: 'https://openfair-app-default-rtdb.firebaseio.com',
});
const db = getFirestore();

const LISTINGID_DIGIT_COUNT = 7;
const HISTOGRAM_STEP_COUNT = 50;
// Batched writes have been capped at 500 operations per commit; deleting in
// chunks of this size keeps every commit under that cap.
const FIRESTORE_BATCH_LIMIT = 500;

const META_COLLECTION_NAME = 'meta_v3';
const META_DOCUMENT_NAME = 'meta';
const OPENFAIR_COLLECTION_NAME = 'openfair_prod_v3';

const METAPROVINCE_COLLECTION_NAME = 'meta_province_v3';
const METACITY_COLLECTION_NAME = 'meta_city_v3';
const METAINDUSTRY_COLLECTION_NAME = 'meta_industry_v3';
const HISTOGRAM_PRICE_COLLECTION_NAME = 'hist_price_v3';
const HISTOGRAM_REVENUE_COLLECTION_NAME = 'hist_revenue_v3';
const HISTOGRAM_PROFIT_COLLECTION_NAME = 'hist_profit_v3';
const HISTOGRAM_BS_COLLECTION_NAME = 'hist_bs_v3';
const HISTOGRAM_META_COLLECTION_NAME = 'hist_meta_v3';

const LOGIN_URL = 'https://www.acquizition.biz/prod/trx?r=UH&g=e';

// SECURITY(review): these credentials are committed in source. Supply
// ACQUIZITION_EMAIL / ACQUIZITION_PASSWORD through the environment and rotate
// the committed password; the literals stay only as a backward-compatible
// fallback so existing deployments keep working.
const LOGIN_EMAIL = process.env.ACQUIZITION_EMAIL ?? 'mohammad.o@openfairmarket.com';
const LOGIN_PASSWORD = process.env.ACQUIZITION_PASSWORD ?? 'Mohammad@500';

// Temporary placeholder images used when no listing of a province/city has a
// photo of its own (marked "temp code" in the original script).
const FALLBACK_IMAGE_URLS = [
  'https://www.acquizition.biz/prod/photos/A139627-4.jpg',
  'https://www.acquizition.biz/prod/photos/A140129-4.jpg',
  'https://www.acquizition.biz/prod/photos/A139885-6.jpg',
  'https://www.acquizition.biz/prod/photos/A139811-5.jpg',
  'https://www.acquizition.biz/prod/photos/A140048-3.jpg',
  'https://www.acquizition.biz/prod/photos/A140049-4.jpg',
  'https://www.acquizition.biz/prod/photos/A140214-3.jpg',
  'https://www.acquizition.biz/prod/photos/A132070-3.jpg',
  'https://www.acquizition.biz/prod/photos/A132068-3.jpg',
  'https://www.acquizition.biz/prod/photos/A140194-4.jpg',
  'https://www.acquizition.biz/prod/photos/A140165-6.jpg',
  'https://www.acquizition.biz/prod/photos/A140177-6.jpg',
  'https://www.acquizition.biz/prod/photos/A140161-6.jpg',
  'https://www.acquizition.biz/prod/photos/A140121-5.jpg',
  'https://www.acquizition.biz/prod/photos/A140160-6.jpg',
  'https://www.acquizition.biz/prod/photos/A138937-5.jpg',
  'https://www.acquizition.biz/prod/photos/A138937-3.jpg',
  'https://www.acquizition.biz/prod/photos/A140106-5.jpg',
  'https://www.acquizition.biz/prod/photos/A140007-4.jpg',
  'https://www.acquizition.biz/prod/photos/A138187-5.jpg',
];

/**
 * Builds the first-page list URL of a business sector. The category URLs of
 * the original script differed only in the three-digit sector code, which
 * appears as `SC<code>` and `CT<code>00`.
 *
 * @param {string} sector - Three-digit sector code, e.g. '105'.
 * @returns {string} List URL ending in the page-number prefix `PN000`.
 */
const categoryUrl = (sector) =>
  `https://www.acquizition.biz/prod/trx?r=BS&b=CNCAN-PR...-RG....-SC${sector}-CT${sector}00-ST1&a=&c=LGc-SO.-PS020-PN000`;

// [category name, sector code, number of list pages to scrape]
const CATEGORIES = [
  ['Agriculture', '105', 2],
  ['Automobile', '110', 3],
  ['Commercial real-estate', '163', 6],
  ['Communications', '125', 1],
  ['Construction', '130', 3],
  ['Education', '135', 1],
  ['Finance and insurance', '150', 1],
  ['Health and social services', '180', 2],
  ['High technology', '155', 1],
  ['Insolvency', '164', 0],
  ['Internet', '165', 1],
  ['Accommodations and restaurants', '160', 12],
  ['Maintenance and cleaning', '140', 2],
  ['Manufacturing/Transformation', '145', 5],
  ['Personal and residential services', '190', 3],
  ['Professional and technical services', '185', 2],
  ['Renting', '170', 1],
  ['Retail business', '115', 10],
  ['Shows and recreation', '195', 1],
  ['Tourism', '196', 1],
  ['Transportation and storage', '198', 1],
  ['Wholesale business', '120', 2],
].map(([name, sector, pageCount]) => ({
  'name': name,
  'url': categoryUrl(sector),
  'pageCount': pageCount,
}));

const canadaCities = [];
const canadaProvinces = [];

/**
 * Tells whether a province with this id and name was already collected.
 *
 * @param {string} province_id
 * @param {string} province_name
 * @returns {boolean}
 */
const hasProvince = (province_id, province_name) =>
  canadaProvinces.some(
    (province) =>
      province['province_id'] === province_id && province['province_name'] === province_name,
  );

// Resolves once the CSV parser has ended or reported an error. Scraping
// awaits this so listings are never matched against a half-loaded city table.
// A parse error is logged and scraping continues with whatever rows were
// read, as before.
const citiesLoaded = new Promise((resolve) => {
  fs.createReadStream("./data/canadacities.csv")
    .pipe(parse({ delimiter: ",", from_line: 2 }))
    .on("data", function (row) {
      canadaCities.push({
        'city': row[0],
        'city_ascii': row[1],
        'province_id': row[2],
        'province_name': row[3],
      });

      if (!hasProvince(row[2], row[3])) {
        canadaProvinces.push({
          'province_id': row[2],
          'province_name': row[3],
        });
      }
    })
    .on("end", function () {
      console.log('Finished to read the csv file');
      resolve();
    })
    .on("error", function (error) {
      console.log(error.message);
      resolve();
    });
});

/**
 * Left-pads a number with zeros.
 *
 * @param {number|string} num - Value to pad.
 * @param {number} size - Minimum length of the result.
 * @returns {string}
 */
const pad = (num, size) => num.toString().padStart(size, '0');

/**
 * Appends the page number to a category URL. The URL ends in `PN000`; from
 * page 10 on, one of those zeros is dropped so the two-digit page number
 * still fits (`PN0009`, `PN0010`).
 *
 * @param {string} baseUrl - Category URL ending in `PN000`.
 * @param {number} pageNumber - 1-based page number.
 * @returns {string}
 */
const buildListUrl = (baseUrl, pageNumber) => {
  const prefix = pageNumber >= 10 ? baseUrl.substring(0, baseUrl.length - 1) : baseUrl;
  return `${prefix}${pageNumber}`;
};

/**
 * Extracts the listing id and position from a `javascript:goListing(...)` href.
 *
 * @param {string} href
 * @returns {{listingId: string, listingPos: string} | null} `null` when the
 *   href is not a goListing call with exactly three arguments.
 */
const parseListingHref = (href) => {
  if (!href.startsWith('javascript:goListing(')) return null;

  const params = href.substring(href.indexOf('(') + 1, href.indexOf(')'));
  const tokens = params.split(',');
  if (tokens.length !== 3) return null;

  return {
    listingId: tokens[0].replaceAll('\'', ''),
    listingPos: tokens[1].replaceAll('\'', ''),
  };
};

/**
 * Reads one listing detail page. This function is serialized by Puppeteer and
 * executed inside the browser page, so it must not reference anything from
 * the Node.js scope; everything it needs is passed in or defined inside.
 *
 * @param {Array<object>} canadaCities - Rows loaded from canadacities.csv.
 * @returns {object} The listing record that is stored in Firestore.
 */
const extractListing = (canadaCities) => {
  const capitalizeFirstLetter = (text) => text.charAt(0).toUpperCase() + text.slice(1);

  // Splits "<province> - <city> Area" (first line only) and resolves the
  // province against the CSV rows by id or by name, case-insensitively.
  const getCityAndProvince = (strRegion) => {
    let city = '';
    let province_id = '';
    let province_name = '';

    if (strRegion.length === 0) {
      return { 'city': city, 'province_id': province_id, 'province_name': province_name };
    }

    const firstLine = strRegion.split('\n')[0];
    const dashIndex = firstLine.indexOf('-');
    let strProvince;
    if (dashIndex === -1) {
      strProvince = firstLine.trim();
    } else {
      strProvince = firstLine.substring(0, dashIndex).trim();
      city = firstLine.substring(dashIndex + 1).trim();
      const areaIndex = city.indexOf(' Area');
      if (areaIndex !== -1) city = city.substring(0, areaIndex).trim();
    }
    strProvince = strProvince.toLowerCase();

    const knownCity = canadaCities.find(
      (row) =>
        row['province_id'].toLowerCase() === strProvince ||
        row['province_name'].toLowerCase() === strProvince,
    );
    if (knownCity) {
      province_id = knownCity['province_id'];
      province_name = knownCity['province_name'];
    }

    if (city.includes('entire province')) city = '';

    return {
      'city': city.length === 0 ? '' : capitalizeFirstLetter(city),
      'province_id': province_id,
      'province_name': province_name,
    };
  };

  // Strips whitespace and '$', cuts at the first letter or parenthesis, and
  // converts what is left; an empty remainder counts as 0.
  const numberFromMoneyString = (strMoney) => {
    let cleaned = strMoney.replaceAll(/\s+/g, '').replaceAll('$', '');
    const stop = cleaned.match(/[()a-zA-Z]/);
    if (stop) cleaned = cleaned.substring(0, stop.index);
    return cleaned === '' ? 0 : Number(cleaned);
  };

  // Sums the leading numbers of the ' - ' separated parts ("3 xxx - 2 yyy" -> 5).
  const numberFromEmployeeString = (strEmployee) => {
    let total = 0;
    for (const part of strEmployee.split(' - ')) {
      const token = part.trim();
      const spaceIndex = token.indexOf(' ');
      if (spaceIndex === -1) continue;
      const count = Number(token.substring(0, spaceIndex).trim());
      if (!Number.isNaN(count)) total += count;
    }
    return total;
  };

  // Text of the element next to the first label containing `needle`, or
  // undefined when the page has no such label.
  const valueBeside = (labels, needle) => {
    const label = labels.find((el) => el.textContent.includes(needle));
    return label ? label.parentNode.children[1].innerText : undefined;
  };

  let listingId = '';
  let businessName = document.querySelector(".ls-titre").innerText.trim();
  const titleLines = businessName.split('\n');
  if (titleLines.length > 1) {
    // Second title line has the form "<listing id> - <business name>".
    const dashIndex = titleLines[1].indexOf('-');
    listingId = titleLines[1].substring(0, dashIndex).trim();
    businessName = titleLines[1].substring(dashIndex + 1).trim();
  }

  const ldLabels = Array.from(document.querySelectorAll('.ld-label'));

  const cityAndProvince = getCityAndProvince((valueBeside(ldLabels, 'Region') ?? '').trim());
  const country = (valueBeside(ldLabels, 'Country') ?? '').trim();

  const yearText = (valueBeside(ldLabels, 'Year founded') ?? '').trim();
  const foundedYear = yearText === '' ? 0 : Number(yearText);

  const photos = Array.from(document.querySelectorAll('.ld-photo img'));
  let images = [];
  if (photos.length) {
    images = photos.map((el) => el.src);
  } else {
    const imageLabel = ldLabels.find((el) => el.textContent.includes('img src='));
    if (imageLabel) images.push(imageLabel.children[0].src);
  }

  let industry = [];
  const sectorText = valueBeside(ldLabels, 'Business sector');
  if (sectorText !== undefined) {
    const names = sectorText
      .split('\n')
      .map((line) => line.substring(0, line.indexOf(' - ')).trim().replaceAll('/', ' & '));
    // Lines without a ' - ' separator produce an empty name; drop those,
    // because the name is later used as a Firestore document id.
    industry = [...new Set(names)].filter((name) => name !== '');
  }

  const longDescription = document.querySelector(".ld-desc").innerText.trim();
  const shortDescription =
    longDescription.length > 200 ? longDescription.substring(0, 200) + '...' : longDescription;

  const numberOfEmployees = numberFromEmployeeString(
    (valueBeside(ldLabels, 'Number of employees') ?? '').trim(),
  );
  const price = numberFromMoneyString((valueBeside(ldLabels, 'Selling price') ?? '').trim());
  const profit = numberFromMoneyString((valueBeside(ldLabels, 'Profit') ?? '').trim());
  const revenue = numberFromMoneyString((valueBeside(ldLabels, 'revenue') ?? '').trim());

  // Kept as text unless it is a dollar amount.
  let sales = (valueBeside(ldLabels, 'sale') ?? '').trim();
  if (sales.startsWith('$')) sales = numberFromMoneyString(sales);

  const prLabels = Array.from(document.querySelectorAll('.pr-labelr'));
  const sellerEmail = (valueBeside(prLabels, 'E-mail') ?? '').trim();
  const firstName = (valueBeside(prLabels, 'First name') ?? '').trim();
  const lastName = (valueBeside(prLabels, 'Last name') ?? '').trim();
  const sellerName = lastName.length !== 0 ? firstName + ' ' + lastName : firstName;
  const sellerPhone = (valueBeside(prLabels, 'Phone number') ?? '').trim();
  const sellerPhone2 = (valueBeside(prLabels, 'Other phone') ?? '').trim();

  return {
    'business_name': businessName,
    'city': cityAndProvince['city'],
    'province_id': cityAndProvince['province_id'],
    'province_name': cityAndProvince['province_name'],
    'country': country,
    "currency": "CAD",
    'date_scraped': (new Date()).toUTCString(),
    'f_business_name': businessName,
    'f_l_description': longDescription,
    'f_s_description': shortDescription,
    'foundedYear': foundedYear,
    'images': images,
    'img': images,
    'industry': industry,
    'l_description': longDescription,
    'listing_created_date': (new Date()).toUTCString(),
    'listing_id': listingId,
    'listing_url': document.URL,
    'numOfViews': 0,
    'numberOfEmployees': numberOfEmployees,
    'of_listing_id': '',
    'price': price,
    'profit': profit,
    'revenue': revenue,
    's_description': shortDescription,
    'sales': sales,
    'seller_email': sellerEmail,
    'seller_name': sellerName,
    'seller_phone_number': sellerPhone,
    'seller_phone_number_2': sellerPhone2,
    "source": "acquizition.biz",
    "source_url": "https://www.acquizition.biz/",
  };
};

/**
 * Logs in and scrapes every listing of every configured category.
 * The browser is closed whether scraping succeeds or throws.
 *
 * @returns {Promise<Array<object>>} One record per scraped listing.
 */
const scrapeListings = async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    page.setDefaultNavigationTimeout(0);

    await page.goto(LOGIN_URL, { waitUntil: 'load' });

    console.log('111');

    // Login
    await page.type("[name=UP_MAIL]", LOGIN_EMAIL);
    await page.type("[name=US_PSWD]", LOGIN_PASSWORD);
    await page.select('[name=US_REGN]', 'CON');
    // Start waiting before clicking so a fast navigation cannot be missed.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'load' }),
      page.click('[name=button1]'),
    ]);
    // Login end

    console.log('222');

    const result = [];

    for (const category of CATEGORIES) {
      console.log('scraping - ', category['name']);

      for (let pageNumber = 1; pageNumber <= category['pageCount']; pageNumber++) {
        const listUrl = buildListUrl(category['url'], pageNumber);
        await page.goto(listUrl, { waitUntil: 'load' });

        const aTagHrefs = await page.evaluate(() => {
          const elements = document.querySelectorAll(".ls-name a, .ls-name2 a");
          return Array.from(elements).map((e) => e.href);
        });

        for (const href of aTagHrefs) {
          const listing = parseListingHref(href);
          if (listing === null) continue;

          const url = `${listUrl}-IDS${listing.listingPos}0000${listing.listingId}`;
          await page.goto(url, { waitUntil: 'load' });

          result.push(await page.evaluate(extractListing, canadaCities));
          console.log('Done - ', result.length);
        }
      }
    }

    return result;
  } finally {
    await browser.close();
  }
};

/**
 * Entry point: waits for the city table, scrapes all listings and saves the
 * new ones. Errors are logged, not rethrown.
 *
 * @returns {Promise<void>}
 */
const scraping = async () => {
  try {
    await citiesLoaded;

    const result = await scrapeListings();

    console.log(`Saving ${result.length} results to ${OPENFAIR_COLLECTION_NAME}...`);
    const docs = (await db.collection(OPENFAIR_COLLECTION_NAME).get()).docs;
    const collectionData = docs.map((d) => d.data());
    await buildCollections(result, collectionData);

    console.log('Finished');
  } catch (error) {
    console.log(error);
  }
};

/**
 * Finds a stored listing by business name.
 *
 * @param {string} strBusinessName
 * @param {Array<object>} collectionData - Stored listing records.
 * @returns {number} Index of the first match, or -1.
 */
const findIndexForBusinessName = (strBusinessName, collectionData) =>
  collectionData.findIndex((data) => data['business_name'] === strBusinessName);

/**
 * Stores every scraped listing whose business name is not stored yet, giving
 * each a sequential `OF…` id, then rebuilds the meta document and the derived
 * collections if anything was added.
 *
 * @param {Array<object>} result - Freshly scraped listings.
 * @param {Array<object>} collectionData - Listings already in Firestore; new
 *   listings are appended to this array.
 * @returns {Promise<void>}
 */
const buildCollections = async (result, collectionData) => {
  const metaReference = db.collection(META_COLLECTION_NAME).doc(META_DOCUMENT_NAME);
  const metaData = (await metaReference.get()).data();
  let lastNumId = metaData ? metaData['last_num_id'] : 0;
  let listingId;

  const collection = db.collection(OPENFAIR_COLLECTION_NAME);

  let bDirty = false;
  for (const [i, data] of result.entries()) {
    if (findIndexForBusinessName(data['business_name'], collectionData) !== -1) continue;

    bDirty = true;
    lastNumId++;
    listingId = `OF${pad(lastNumId, LISTINGID_DIGIT_COUNT)}`;
    data['of_listing_id'] = listingId;
    await collection.doc(listingId).set(data);
    // Also makes later duplicates inside the same scrape get skipped.
    collectionData.push(data);

    console.log('Saved - ', i + 1);
  }

  if (!bDirty) return;

  await deleteCollection(META_COLLECTION_NAME);
  await metaReference.set({
    count: lastNumId,
    last_id: listingId,
    last_num_id: lastNumId,
  });

  await buildMetaProvince(collectionData);
  await buildMetaCity(collectionData);
  await buildMetaIndustry(collectionData);
  await buildFilter(collectionData);
};

/**
 * Deletes every document of a collection, committing the deletes in chunks
 * so a large collection does not exceed the size limit of one batch.
 *
 * @param {string} collectionName
 * @returns {Promise<void>}
 */
const deleteCollection = async (collectionName) => {
  const docs = (await db.collection(collectionName).get()).docs;

  for (let start = 0; start < docs.length; start += FIRESTORE_BATCH_LIMIT) {
    const batch = db.batch();
    for (const doc of docs.slice(start, start + FIRESTORE_BATCH_LIMIT)) {
      batch.delete(doc.ref);
    }
    await batch.commit();
  }
};

/**
 * Rebuilds one "meta" collection: one document per distinct name, holding how
 * many listings carry that name and the first available listing image.
 *
 * @param {string} collectionName - Collection to replace.
 * @param {Array<object>} collectionData - All stored listings.
 * @param {(data: object) => Array<string>} namesOf - Names a listing counts
 *   towards; empty or missing names are ignored.
 * @param {boolean} useFallbackImage - Give entries without any image a random
 *   placeholder from FALLBACK_IMAGE_URLS.
 * @returns {Promise<void>}
 */
const buildMetaCollection = async (collectionName, collectionData, namesOf, useFallbackImage) => {
  // Map keeps first-seen order and makes the per-name lookup O(1).
  const entries = new Map();

  for (const data of collectionData) {
    const image = data['images']?.[0] ?? '';

    for (const name of namesOf(data)) {
      if (!name) continue;

      const entry = entries.get(name);
      if (entry === undefined) {
        entries.set(name, { 'count': 1, 'image': image, 'name': name });
      } else {
        entry['count'] += 1;
        if (entry['image'] === '') entry['image'] = image;
      }
    }
  }

  const result = [...entries.values()];

  if (useFallbackImage) {
    for (const entry of result) {
      if (entry['image'] === '') {
        const imageIndex = Math.floor(Math.random() * FALLBACK_IMAGE_URLS.length);
        entry['image'] = FALLBACK_IMAGE_URLS[imageIndex];
      }
    }
  }

  await deleteCollection(collectionName);
  const collection = db.collection(collectionName);
  for (const entry of result) {
    await collection.doc(entry['name']).set(entry);
  }

  console.log(`Built ${collectionName} successfully`);
};

/**
 * Rebuilds the per-province listing counts.
 *
 * @param {Array<object>} collectionData - All stored listings.
 * @returns {Promise<void>}
 */
const buildMetaProvince = (collectionData) =>
  buildMetaCollection(
    METAPROVINCE_COLLECTION_NAME,
    collectionData,
    (data) => [data['province_name']],
    true,
  );

/**
 * Rebuilds the per-city listing counts.
 *
 * @param {Array<object>} collectionData - All stored listings.
 * @returns {Promise<void>}
 */
const buildMetaCity = (collectionData) =>
  buildMetaCollection(METACITY_COLLECTION_NAME, collectionData, (data) => [data['city']], true);

/**
 * Rebuilds the per-industry listing counts. A listing counts once for each of
 * its industries; industries without an image keep an empty image.
 *
 * @param {Array<object>} collectionData - All stored listings.
 * @returns {Promise<void>}
 */
const buildMetaIndustry = (collectionData) =>
  buildMetaCollection(
    METAINDUSTRY_COLLECTION_NAME,
    collectionData,
    (data) => data['industry'] ?? [],
    false,
  );

/**
 * Rebuilds the filter statistics: for price, revenue, profit and number of
 * employees it stores min/max/sum/count/avg/step in the histogram meta
 * collection and a histogram of up to HISTOGRAM_STEP_COUNT + 1 buckets in the
 * matching histogram collection.
 *
 * @param {Array<object>} collectionData - All stored listings.
 * @returns {Promise<void>}
 */
const buildFilter = async (collectionData) => {
  const keys = ['price', 'revenue', 'profit', 'numberOfEmployees'];
  // min and max start at 0, so every range always includes 0.
  const metaValues = ['price', 'revenue', 'profit', 'business_size'].map((key) => ({
    'key': key,
    'min': 0,
    'max': 0,
    'sum': 0,
  }));
  const collectionNames = [
    HISTOGRAM_PRICE_COLLECTION_NAME,
    HISTOGRAM_REVENUE_COLLECTION_NAME,
    HISTOGRAM_PROFIT_COLLECTION_NAME,
    HISTOGRAM_BS_COLLECTION_NAME,
  ];

  const length = collectionData.length;

  for (const data of collectionData) {
    for (const [k, field] of keys.entries()) {
      const value = data[field];
      const meta = metaValues[k];

      meta['sum'] += value;
      if (meta['min'] > value) meta['min'] = value;
      if (meta['max'] < value) meta['max'] = value;
    }
  }

  const histValues = [];
  for (const meta of metaValues) {
    meta['count'] = length;
    meta['avg'] = length === 0 ? 0 : Math.floor(meta['sum'] / length);
    // At least 1: with max == min the computed step would be 0 and the
    // bucket loop below would never terminate.
    meta['step'] = Math.max(1, Math.ceil((meta['max'] - meta['min']) / HISTOGRAM_STEP_COUNT));

    const buckets = [];
    for (let lower = meta['min']; lower <= meta['max']; lower += meta['step']) {
      buckets.push({
        'min': lower,
        'max': lower + meta['step'],
        'count': 0,
      });
    }
    histValues.push(buckets);
  }

  for (const data of collectionData) {
    for (const [k, field] of keys.entries()) {
      const value = data[field];
      // Buckets are half-open: [min, max).
      const bucket = histValues[k].find((b) => value >= b['min'] && value < b['max']);
      if (bucket) bucket['count'] += 1;
    }
  }

  await deleteCollection(HISTOGRAM_META_COLLECTION_NAME);
  const metaCollection = db.collection(HISTOGRAM_META_COLLECTION_NAME);
  for (const meta of metaValues) {
    await metaCollection.doc(meta['key']).set(meta);
  }

  for (const [k, collectionName] of collectionNames.entries()) {
    await deleteCollection(collectionName);
    const collection = db.collection(collectionName);
    for (const bucket of histValues[k]) {
      await collection.doc(bucket['min'].toString()).set(bucket);
    }
  }

  console.log(`Built filter collections successfully`);
};

scraping();