├── .gitignore ├── lib └── db.js └── scraper.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .vscode 3 | scraper.code-workspace 4 | .env 5 | partners.json -------------------------------------------------------------------------------- /lib/db.js: -------------------------------------------------------------------------------- 1 | const firebase = require('firebase/app'); 2 | require('firebase/firestore'); 3 | 4 | module.exports = function loadFirebase() { 5 | try { 6 | // Initialize Firebase 7 | const config = { 8 | apiKey: process.env.FIREBASE_API_KEY, 9 | authDomain: process.env.FIREBASE_AUTH_DOMAIN, 10 | databaseURL: process.env.FIREBASE_DATA_URL, 11 | projectId: process.env.FIREBASE_PROJECT_ID, 12 | storageBucket: process.env.FIREBASE_STORAGE_BUCKET, 13 | messagingSenderId: process.env.FIREBASE_MESSAGING_SENDER_ID 14 | } 15 | firebase.initializeApp(config) 16 | firebase.firestore().settings({timestampsInSnapshots: true}) 17 | } catch (error) { 18 | if (!/already exists/.test(error.message)) { 19 | console.log(`Firebase didn't initialize correctly: ${error.message}`) 20 | } 21 | } 22 | return firebase 23 | } -------------------------------------------------------------------------------- /scraper.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require("puppeteer"); 2 | const fs = require('fs'); 3 | const loadFirebase = require('./lib/db.js'); 4 | 5 | (async () => { 6 | 7 | // Extract partners on the page, recursively check the next page in the URL pattern 8 | const extractPartners = async url => { 9 | // Scrape the data we want 10 | const page = await browser.newPage(); 11 | await page.goto(url); 12 | const partnersOnPage = await page.evaluate(() => 13 | Array.from(document.querySelectorAll("div.compact")).map(compact => ({ 14 | title: compact.querySelector("h3.title").innerText.trim(), 15 | logo: compact.querySelector(".logo img").src 16 | })) 17 | ); 18 | await page.close(); 19 | 20 | // Recursively scrape the next page 21 | if (partnersOnPage.length < 1) { 22 | // Terminate if no partners exist 23 | return partnersOnPage 24 | } else { 25 | // Go fetch the next page ?page=X+1 26 | const nextPageNumber = parseInt(url.match(/page=(\d+)$/)[1], 10) + 1; 27 | const nextUrl = `https://marketingplatform.google.com/about/partners/find-a-partner?page=${nextPageNumber}`; 28 | 29 | return partnersOnPage.concat(await extractPartners(nextUrl)) 30 | } 31 | }; 32 | 33 | let partners; 34 | // If there's a local JSON, don't fetch anything 35 | const rawdata = fs.readFileSync('partners.json'); 36 | const browser = await puppeteer.launch(); 37 | 38 | if (rawdata) { 39 | partners = JSON.parse(rawdata) 40 | } else { 41 | const firstUrl = 42 | "https://marketingplatform.google.com/about/partners/find-a-partner?page=1"; 43 | partners = await extractPartners(firstUrl); 44 | } 45 | 46 | 47 | 48 | 49 | // Todo: Update database with partners 50 | console.log(partners); 51 | 52 | // loadFirebase().firestore().collection('agencies') 53 | // .limit(10) 54 | // .get() 55 | // .then(snapshot => { 56 | // let data = [] 57 | // snapshot.forEach((doc) => { 58 | // data.push({ 59 | // id: doc.id, 60 | // ...doc.data() 61 | // }) 62 | // }) 63 | // return { agencies: data } 64 | // }) 65 | 66 | 67 | // // Save to JSON file 68 | // const data = JSON.stringify(partners); 69 | // fs.writeFileSync('partners.json', data); 70 | 71 | 72 | await browser.close(); 73 | })(); 74 | --------------------------------------------------------------------------------