// main.js
//
// How to use this scraper!
//
// Step 1: Make sure to have Node.js installed on your computer
// Step 2: Install Puppeteer by running the following command in your terminal: npm install puppeteer
//         (`fs` ships with Node.js and does not need to be installed)
// Step 3: Copy and paste this entire script into a new file called main.js
// Step 4: Update the baseUrl, csvUrl, email, and password variables in run() with your own information
// Step 5: Run the script by running the following command in your terminal: node main.js
// Step 6: Sit back and relax as the scraper does the hard work for you!
'use strict';

const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Resolve after `ms` milliseconds.
 *
 * Used instead of `page.waitForTimeout`, which is deprecated and absent from
 * recent Puppeteer releases; a plain timer works on every version.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Extract the first integer from the "Total ..." link text.
 *
 * Thousands separators are accepted, so "Total (1,234)" yields 1234 rather
 * than 1.
 *
 * @param {string|null} totalText - Text of the "Total" link, or null if it was not found.
 * @returns {number|null} The parsed count, or null when no number is present.
 */
function parseTotalItems(totalText) {
  if (!totalText) {
    return null;
  }
  const match = totalText.match(/\d[\d,]*/);
  if (!match) {
    return null;
  }
  return parseInt(match[0].replace(/,/g, ''), 10);
}

/**
 * Turn every `<tbody>` on the results page into one record of CSV-quoted fields.
 *
 * This function is passed to `page.$$eval`, so it runs inside the browser
 * page: it must be fully self-contained and must not reference anything from
 * the Node.js scope.
 *
 * @param {Element[]} tbodies - The `<tbody>` elements matched on the page.
 * @returns {Object[]} One plain object per tbody; every field is already CSV-quoted.
 */
function extractPeople(tbodies) {
  // CSV quoting: wrap in double quotes and double any embedded quote.
  const quote = (str) => `"${str.replace(/"/g, '""')}"`;
  const cellText = (cell) => (cell ? cell.innerText.trim() : '');

  return tbodies.map((tbody) => {
    const tr = tbody.querySelector('tr');
    const cell = (selector) => (tr ? tr.querySelector(selector) : null);

    // A missing name cell yields an empty name instead of throwing on null.
    const tdName = cell('td');
    const name = cellText(tdName).replace('------', '').trim();
    const nameParts = name.split(' ');
    const firstName = nameParts.shift();
    const lastName = nameParts.join(' ');

    const tdCompanyName = cell('td:nth-child(3)');

    // Keep digits only, then format the first ten-digit run as (XXX) XXX-XXXX.
    const phone = cellText(cell('td:nth-child(7)'))
      .replace(/\D/g, '')
      .replace(/(\d{3})(\d{3})(\d{4})/, '($1) $2-$3');

    let facebookUrl = '';
    let twitterUrl = '';
    let companyLinkedinUrl = '';
    let companyUrl = '';

    if (tdCompanyName) {
      tdCompanyName.querySelectorAll('a[href]').forEach((link) => {
        const href = link.href.trim();
        // Single else-if chain so each link lands in exactly one column.
        if (href.includes('facebook.com')) {
          facebookUrl = quote(href);
        } else if (href.includes('twitter.com')) {
          twitterUrl = quote(href);
        } else if (href.includes('linkedin.com/company')) {
          companyLinkedinUrl = quote(href);
        } else if (link.querySelector('.apollo-icon-link')) {
          companyUrl = quote(href);
        }
      });
    }

    const firstLink = tbody.querySelector('a[href]');
    const firstHref = firstLink ? firstLink.href : '';
    const personLinkedin = tdName ? tdName.querySelector('a[href*="linkedin.com/in"]') : null;
    const linkedinUrl = personLinkedin ? personLinkedin.href : '';

    return {
      firstName: quote(firstName),
      lastName: quote(lastName),
      fullName: quote(name),
      jobTitle: quote(cellText(cell('td:nth-child(2)'))),
      companyName: quote(cellText(tdCompanyName)),
      location: quote(cellText(cell('td:nth-child(5) .zp_Y6y8d'))),
      employeeCount: quote(cellText(cell('td:nth-child(6)'))),
      phone: quote(phone),
      industry: quote(cellText(cell('td:nth-child(8)'))),
      firstHref: quote(firstHref),
      linkedinUrl: quote(linkedinUrl),
      facebookUrl,
      twitterUrl,
      companyLinkedinUrl,
      companyUrl,
      keywords: quote(cellText(cell('td:nth-child(9)'))),
    };
  });
}

/**
 * Visit a person's detail page and collect the email addresses shown in the
 * general information card.
 *
 * The result is stored on `person.emails`; on any failure (bad URL, timeout,
 * missing card) it is set to `['']` so the CSV row still gets an empty email
 * column. Errors are logged, never thrown.
 *
 * @param {Object} person - Record produced by `extractPeople`; mutated in place.
 * @param {Object} newPage - Puppeteer page dedicated to this person.
 * @returns {Promise<void>}
 */
async function processPerson(person, newPage) {
  // Records carry CSV-quoted values; strip the quotes for logging/navigation.
  const personName = person.fullName.replace(/"/g, '');
  const cleanedUrl = person.firstHref.replace(/"/g, '');
  console.log(`Processing person: ${personName}`);
  console.log(`Navigating to cleaned URL: ${cleanedUrl}`);

  try {
    await newPage.goto(cleanedUrl, { waitUntil: 'networkidle0' });
    console.log(`Page navigated to ${cleanedUrl}`);

    await newPage.waitForSelector('#general_information_card', { timeout: 10000 });
    console.log(`Found #general_information_card`);

    const cardTexts = await newPage.$$eval('#general_information_card', (elements) =>
      elements.map((element) => element.innerText)
    );
    const emailRegex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
    const emails = cardTexts.flatMap((text) => text.match(emailRegex) || []);

    person.emails = emails.length > 0 ? emails : [''];
  } catch (error) {
    console.error(`Error processing ${personName} at ${cleanedUrl}: ${error}`);
    person.emails = [''];
  }
}

/**
 * Build the full CSV document (header plus one row per person).
 *
 * The number of "Email N" columns equals the largest email count found on any
 * person; shorter rows are padded with empty cells.
 *
 * @param {Object[]} allData - Records whose `emails` arrays have been filled in.
 * @returns {string} CSV text.
 */
function buildCsv(allData) {
  const maxEmails = allData.reduce((max, p) => Math.max(max, p.emails.length), 0);
  const emailHeaders = Array.from({ length: maxEmails }, (_, i) => `Email ${i + 1}`).join(',');
  const csvHeader = `First Name,Last Name,Full Name,Job Title,Company Name,Location,Employee Count,Phone,Industry,URL,LinkedIn URL,Facebook URL,Twitter URL,Company LinkedIn URL,Company URL,Keywords,${emailHeaders}\n`;

  const csvRows = allData.map((person) => {
    const paddedEmails = [...person.emails, ...Array(maxEmails - person.emails.length).fill('')];
    return `${person.firstName},${person.lastName},${person.fullName},${person.jobTitle},${person.companyName},${person.location},${person.employeeCount},${person.phone},${person.industry},${person.firstHref},${person.linkedinUrl},${person.facebookUrl},${person.twitterUrl},${person.companyLinkedinUrl},${person.companyUrl},${person.keywords},${paddedEmails.join(',')}`;
  }).join('\n');

  return csvHeader + csvRows;
}

/**
 * Log in to Apollo, walk every page of the saved search at `baseUrl`, visit
 * each person's detail page to collect emails, and write everything to a CSV
 * file.
 *
 * The browser is always closed, even when a step throws.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const baseUrl = 'YourApolloURLHere';
  const csvUrl = 'NameOfCSV.csv';
  const email = 'YourEmailHere';
  const password = 'YourPasswordHere';

  // Start the Puppeteer browser

  console.time("ScriptRunTime");
  const browser = await puppeteer.launch({
    headless: false, // Set to true to run headless
    defaultViewport: null
  });

  try {
    const page = await browser.newPage();
    await page.goto('https://app.apollo.io/#/login');
    await page.waitForSelector('input[name="email"]', { visible: true });
    await page.waitForSelector('input[name="password"]', { visible: true });
    await page.type('input[name="email"]', email);
    await page.type('input[name="password"]', password);
    await page.click('button[type="submit"]');
    await delay(5000);
    await page.goto(baseUrl);
    await delay(5000);

    const totalText = await page.evaluate(() => {
      const targetElement = Array.from(document.querySelectorAll('a')).find(e => e.textContent.trim().startsWith('Total'));
      return targetElement ? targetElement.textContent.trim() : null;
    });

    const parsedTotal = parseTotalItems(totalText);
    const totalItems = parsedTotal === null ? 0 : parsedTotal;
    if (parsedTotal !== null) {
      console.log(`Total items: ${totalItems}`);
    }

    if (totalItems > 0) {
      const itemsPerPage = 25;
      const totalPages = Math.ceil(totalItems / itemsPerPage);
      console.log(`Total pages: ${totalPages}`);

      const allData = [];
      for (let i = 1; i <= totalPages; i++) {
        const pageUrl = `${baseUrl}&page=${i}`;
        console.log(`Scraping page: ${pageUrl}`);
        await page.goto(pageUrl);
        await page.waitForSelector('tbody', { visible: true });

        const data = await page.$$eval('tbody', extractPeople);
        allData.push(...data);
      }

      // Visit detail pages a few at a time to limit the number of open tabs.
      const batchSize = 5;
      for (let i = 0; i < allData.length; i += batchSize) {
        const batch = allData.slice(i, i + batchSize);
        const lastIndex = i + batch.length - 1;
        console.log(`Processing batch from index ${i} to ${lastIndex}`);

        await Promise.all(batch.map(async (person) => {
          const newPage = await browser.newPage();
          try {
            return await processPerson(person, newPage);
          } catch (error) {
            console.error(`Error processing ${person.fullName.replace(/"/g, '')}: ${error}`);
          } finally {
            await newPage.close();
          }
        }));
        console.log(`Completed batch from index ${i} to ${lastIndex}`);
      }

      fs.writeFileSync(csvUrl, buildCsv(allData));
    } else {
      console.log('Element not found');
    }
  } finally {
    await browser.close();
    console.timeEnd("ScriptRunTime");
  }
}

run().catch(console.error);