├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── logo.png ├── package.json └── src ├── company ├── company.js └── companyScraperTemplate.js ├── logger.js ├── login.js ├── openPage.js ├── package.js ├── profile ├── cleanProfileData.js ├── contactInfo.js ├── profile.js ├── profileScraperTemplate.js ├── scrapAccomplishmentPanel.js ├── scrollToPageBottom.js └── seeMoreButtons.js ├── scrapSection.js ├── scrapedin.js └── scrapedin.test.js /.gitignore: -------------------------------------------------------------------------------- 1 | test.js 2 | cookies.json 3 | 4 | # Logs 5 | logs 6 | *.log 7 | npm-debug.log* 8 | yarn-debug.log* 9 | yarn-error.log* 10 | 11 | # Runtime data 12 | pids 13 | *.pid 14 | *.seed 15 | *.pid.lock 16 | 17 | # Directory for instrumented libs generated by jscoverage/JSCover 18 | lib-cov 19 | 20 | # Coverage directory used by tools like istanbul 21 | coverage 22 | 23 | # nyc test coverage 24 | .nyc_output 25 | 26 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 27 | .grunt 28 | 29 | # Bower dependency directory (https://bower.io/) 30 | bower_components 31 | 32 | # node-waf configuration 33 | .lock-wscript 34 | 35 | # Compiled binary addons (https://nodejs.org/api/addons.html) 36 | build/Release 37 | 38 | # Dependency directories 39 | node_modules/ 40 | jspm_packages/ 41 | 42 | # TypeScript v1 declaration files 43 | typings/ 44 | 45 | # Optional npm cache directory 46 | .npm 47 | 48 | # Optional eslint cache 49 | .eslintcache 50 | 51 | # Optional REPL history 52 | .node_repl_history 53 | 54 | # Output of 'npm pack' 55 | *.tgz 56 | 57 | # Yarn Integrity file 58 | .yarn-integrity 59 | 60 | # dotenv environment variables file 61 | .env 62 | 63 | # next.js build output 64 | .next 65 | 66 | # Package lock was removed from the repo as of b0dad2 67 | package-lock.json 68 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "7.6" 4 | script: 5 | - npm test 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](https://github.com/linkedtales/scrapedin/raw/master/logo.png) 2 | [![Build Status](https://travis-ci.org/leonardiwagner/scrapedin.svg?branch=master)](https://travis-ci.org/leonardiwagner/scrapedin) 3 | [![NPM version](https://img.shields.io/npm/v/scrapedin.svg)](https://www.npmjs.com/package/scrapedin) 4 | ---- 5 | Scraper for LinkedIn full profile data. Unlike others scrapers, it's working in 2020 with their new website. 
6 | 7 | `npm i scrapedin` 8 | 9 | ### Usage Example: 10 | 11 | ```javascript 12 | const scrapedin = require('scrapedin') 13 | 14 | const profileScraper = await scrapedin({ email: 'login@mail.com', password: 'pass' }) 15 | const profile = await profileScraper('https://www.linkedin.com/in/some-profile/') 16 | ``` 17 | 18 | - If you are looking for a crawler to automatically extract multiple profiles see [scrapedin-crawler](https://github.com/linkedtales/scrapedin-linkedin-crawler) 19 | 20 | ### Start Guide: 21 | 22 | - [Basic Tutorial](https://github.com/linkedtales/scrapedin/wiki/Basic-Tutorial) 23 | - [Using Cookies to Login](https://github.com/linkedtales/scrapedin/wiki/Using-Cookies-To-Login) 24 | - [Tips](https://github.com/linkedtales/scrapedin/wiki/Tips) 25 | - [Documentation](https://github.com/linkedtales/scrapedin/wiki/Documentation) 26 | 27 | 28 | ### Contribution 29 | 30 | Feel free to contribute. Just open an issue to discuss something before creating a PR. 31 | 32 | ### License 33 | 34 | [Apache 2.0][apache-license] 35 | 36 | [apache-license]:./LICENSE 37 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linkedtales/scrapedin/e0612c49d018d2331b9f76ec4b74df8dd73b5695/logo.png -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scrapedin", 3 | "version": "1.0.21", 4 | "description": "linkedin scraper for 2020 website", 5 | "keywords": [ 6 | "linkedin", 7 | "scraper", 8 | "crawler" 9 | ], 10 | "main": "src/scrapedin.js", 11 | "scripts": { 12 | "test": "mocha src/**.test.js", 13 | "lint": "standard --fix 'src/*.js'", 14 | "coverage": "nyc npm test", 15 | "coverage:report": "nyc --reporter=lcov npm test" 16 | }, 17 | "repository": { 18 | "type": "git", 19 | 
"url": "https://github.com/linkedtales/scrapedin" 20 | }, 21 | "bugs": { 22 | "url": "https://github.com/linkedtales/scrapedin/issues" 23 | }, 24 | "author": "Wagner Leonardi ", 25 | "license": "Apache-2.0", 26 | "dependencies": { 27 | "puppeteer": "1.13.0", 28 | "winston": "3.1.0" 29 | }, 30 | "devDependencies": { 31 | "chai": "4.2.0", 32 | "coveralls": "3.0.2", 33 | "faker": "4.1.0", 34 | "mocha": "5.2.0", 35 | "nyc": "^13.1.0", 36 | "sinon": "7.2.2", 37 | "standard": "12.0.1" 38 | }, 39 | "engines": { 40 | "node": ">= 7.6.0" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/company/company.js: -------------------------------------------------------------------------------- 1 | const openPage = require('../openPage') 2 | const scrapSection = require('../scrapSection') 3 | const template = require('./companyScraperTemplate') 4 | 5 | const logger = require('../logger')(__filename) 6 | 7 | module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, puppeteerAuthenticate = undefined) => { 8 | logger.info(`starting scraping url: ${url}`); 9 | 10 | let company = {}; 11 | 12 | let page; 13 | if(url.includes('legacySchoolId=')){ 14 | page = await openPage({ browser, cookies, url, puppeteerAuthenticate }); 15 | 16 | const aboutSelector = 'a[href$="/about/"]'; 17 | 18 | company.url = page.url(); 19 | 20 | await page.$eval(aboutSelector, async about => await about.click()); 21 | await page.waitForNavigation(); 22 | } else{ 23 | company.url = url; 24 | url = url + '/about'; 25 | page = await openPage({ browser, cookies, url, puppeteerAuthenticate }); 26 | } 27 | company.about = (await scrapSection(page, template.about))[0]; 28 | company.profile = (await scrapSection(page, template.profile))[0]; 29 | 30 | await page.close(); 31 | logger.info(`finished scraping url: ${url}`); 32 | 33 | return company 34 | 35 | } 36 | -------------------------------------------------------------------------------- 
/src/company/companyScraperTemplate.js: -------------------------------------------------------------------------------- 1 | const template = { 2 | profile: { 3 | selector: '.org-top-card', 4 | fields: { 5 | name: `h1`, 6 | headline: `p`, 7 | imageurl: { 8 | selector: `img.org-top-card-primary-content__logo`, 9 | attribute: 'src' 10 | } 11 | } 12 | }, 13 | about: { 14 | selector: '.org-grid__core-rail--no-margin-left', 15 | fields: { 16 | overview: 'p', 17 | types:{ 18 | selector: 'dl dt', 19 | isMultipleFields: true 20 | }, 21 | values:{ 22 | selector: 'dl dd:not(.org-page-details__employees-on-linkedin-count)', 23 | isMultipleFields: true 24 | } 25 | } 26 | } 27 | } 28 | 29 | 30 | module.exports = template 31 | -------------------------------------------------------------------------------- /src/logger.js: -------------------------------------------------------------------------------- 1 | const path = require('path') 2 | const pkg = require('./package') 3 | const winston = require('winston') 4 | const logger = winston.createLogger({ 5 | format: winston.format.combine( 6 | winston.format.splat(), 7 | winston.format.simple(), 8 | winston.format.timestamp(), 9 | winston.format.colorize(), 10 | winston.format.printf(info => `${pkg.name}: ${info.timestamp} ${info.level}: ${info.message}`) 11 | ), 12 | transports: [new winston.transports.Console()] 13 | }) 14 | 15 | const loggerWrapper = (absoluteFilePath) => { 16 | const file = path.relative(__dirname, absoluteFilePath) 17 | // Because this file is in the source code root folder, the above will make all paths relative to it: just the info needed for the log. 18 | 19 | return { 20 | info: (message) => logger.info(`[${file}] ${message}`), 21 | warn: (message) => logger.warn(`[${file}] ${message}`), 22 | error: (message, error) => logger.error(`[${file}] ${message}${error && error.stack ? 
error.stack : (error || '')}`), 23 | stopLogging: () => { 24 | logger.silent = true 25 | } 26 | } 27 | } 28 | 29 | module.exports = loggerWrapper 30 | -------------------------------------------------------------------------------- /src/login.js: -------------------------------------------------------------------------------- 1 | const openPage = require('./openPage') 2 | const logger = require('./logger')(__filename) 3 | const pkg = require('./package') 4 | 5 | module.exports = async (browser, email, password) => { 6 | const url = 'https://www.linkedin.com/login' 7 | const page = await openPage({ browser, url }) 8 | logger.info(`logging at: ${url}`) 9 | 10 | await page.goto(url) 11 | await page.waitFor('#username') 12 | 13 | await page.$('#username') 14 | .then((emailElement) => emailElement.type(email)) 15 | await page.$('#password') 16 | .then((passwordElement) => passwordElement.type(password)) 17 | 18 | await page.$x("//button[contains(text(), 'Sign in')]") 19 | .then((button) => button[0].click()) 20 | 21 | return page.waitFor('input[role=combobox]', { 22 | timeout: 15000 23 | }) 24 | .then(async () => { 25 | logger.info('logged feed page selector found') 26 | await page.close() 27 | }) 28 | .catch(async () => { 29 | logger.warn('successful login element was not found') 30 | const emailError = await page.evaluate(() => { 31 | const e = document.querySelector('div[error-for=username]') 32 | if (!e) { return false } 33 | const style = window.getComputedStyle(e) 34 | return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0' 35 | }) 36 | 37 | const passwordError = await page.evaluate(() => { 38 | const e = document.querySelector('div[error-for=password]') 39 | if (!e) { return false } 40 | const style = window.getComputedStyle(e) 41 | return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0' 42 | }) 43 | 44 | const manualChallengeRequested = await page.evaluate(() => { 45 | const e 
= document.querySelector('.flow-challenge-content') 46 | if (!e) { return false } 47 | const style = window.getComputedStyle(e) 48 | return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0' 49 | }) 50 | 51 | if (emailError) { 52 | logger.info('wrong username element found') 53 | return Promise.reject(new Error(`linkedin: invalid username: ${email}`)) 54 | } 55 | 56 | if (passwordError) { 57 | logger.info('wrong password element found') 58 | return Promise.reject(new Error('linkedin: invalid password')) 59 | } 60 | 61 | if (page.$(manualChallengeRequested)) { 62 | logger.warn('manual check was required') 63 | return Promise.reject(new Error(`linkedin: manual check was required, verify if your login is properly working manually or report this issue: ${pkg.name} ${pkg.version} ${pkg.bugs.url}`)) 64 | } 65 | 66 | logger.error('could not find any element to retrieve a proper error') 67 | return Promise.reject(new Error(`${pkg.name} ${pkg.version} login is not working, please report: ${pkg.bugs.url}`)) 68 | }) 69 | } 70 | -------------------------------------------------------------------------------- /src/openPage.js: -------------------------------------------------------------------------------- 1 | const agents = [ 2 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36' 3 | // "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 4 | // "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 5 | // "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:34.0) Gecko/20100101 Firefox/34.0", 6 | // "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 7 | // "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0", 8 | // "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 9 | // "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0", 10 | // "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" 11 | ] 12 | 13 | module.exports = ({ browser, cookies, url, puppeteerAuthenticate }) => new Promise( async (resolve, reject) => { 14 | const page = await browser.newPage() 15 | page.on('error', err => {reject(err)}) 16 | 17 | if (cookies) { 18 | await page.setCookie(...cookies) 19 | } 20 | await page.setUserAgent(agents[Math.floor(Math.random() * agents.length)]) 21 | await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8' }) 22 | await page.setViewport({ 23 | width: 1920, 24 | height: 1080 25 | }) 26 | 27 | if (puppeteerAuthenticate) { 28 | await page.authenticate(puppeteerAuthenticate) 29 | } 30 | 31 | await page.goto(url) 32 | 33 | resolve(page) 34 | }) 35 | -------------------------------------------------------------------------------- /src/package.js: -------------------------------------------------------------------------------- 1 | const pkg = require('../package.json') 2 | 3 | // Only specific keys are needed, not the whole file. 4 | 5 | module.exports = { 6 | bugs: { 7 | url: pkg.bugs.url 8 | }, 9 | name: pkg.name, 10 | version: pkg.version 11 | } 12 | -------------------------------------------------------------------------------- /src/profile/cleanProfileData.js: -------------------------------------------------------------------------------- 1 | const logger = require('../logger')(__filename) 2 | const pkg = require('../package') 3 | 4 | module.exports = (profile) => { 5 | if(!profile.profile.name) { 6 | const messageError = `LinkedIn website changed and ${pkg.name} ${pkg.version} can't read basic data. 
Please report this issue at ${pkg.bugs.url}` 7 | logger.error(messageError, '') 8 | throw new Error(messageError) 9 | } 10 | 11 | profile.profile.summary = profile.about.text 12 | 13 | profile.positions.forEach((position) => { 14 | if(position.title){ 15 | position.title = position.title.replace('Company Name\n', '') 16 | } 17 | if(position.description) { 18 | position.description = position.description.replace('See more', ''); 19 | position.description = position.description.replace('see more', ''); 20 | position.description = position.description.replace('See less', ''); 21 | } 22 | if(position.roles) { 23 | position.roles.forEach((role) => { 24 | if(role.title) { 25 | role.title = role.title.replace('Title\n', '') 26 | } 27 | if(role.description) { 28 | role.description = role.description.replace('See more', '') 29 | role.description = role.description.replace('see more', '') 30 | } 31 | }) 32 | } 33 | }) 34 | 35 | if(profile.recommendations.receivedCount) { 36 | profile.recommendations.receivedCount = profile.recommendations.receivedCount.replace(/[^\d]/g, '') 37 | } 38 | 39 | if(profile.recommendations.givenCount) { 40 | profile.recommendations.givenCount = profile.recommendations.givenCount.replace(/[^\d]/g, '') 41 | } 42 | 43 | if(profile.recommendations.received) { 44 | profile.recommendations.received.forEach((recommendation) => { 45 | if(recommendation.summary){ 46 | recommendation.summary = recommendation.summary.replace('See more', '') 47 | recommendation.summary = recommendation.summary.replace('See less', '') 48 | } 49 | }) 50 | } 51 | 52 | if(profile.recommendations.given) { 53 | profile.recommendations.given.forEach((recommendation) => { 54 | if(recommendation.summary){ 55 | recommendation.summary = recommendation.summary.replace('See more', '') 56 | recommendation.summary = recommendation.summary.replace('See less', '') 57 | } 58 | }) 59 | } 60 | 61 | if(profile.courses){ 62 | profile.courses = profile.courses.map(({ name, year }) => { 63 | const 
coursesObj = {} 64 | if(name) { 65 | coursesObj.name = name.replace('Course name\n', '') 66 | } 67 | if(year) { 68 | coursesObj.year = year.replace('Course number\n', '') 69 | } 70 | return coursesObj 71 | } 72 | ); 73 | } 74 | 75 | if(profile.languages){ 76 | profile.languages = profile.languages.map(({ name, proficiency }) => ({ 77 | name: name ? name.replace('Language name\n', '') : undefined, 78 | proficiency, 79 | })); 80 | } 81 | 82 | if(profile.projects){ 83 | profile.projects = profile.projects.map( 84 | ({ name, date, description, link }) => ({ 85 | name: name ? name.replace('Project name\n', '') : undefined, 86 | date, 87 | description: description ? description.replace('Project description\n', '') : undefined, 88 | link, 89 | }), 90 | ); 91 | } 92 | 93 | return profile 94 | } 95 | -------------------------------------------------------------------------------- /src/profile/contactInfo.js: -------------------------------------------------------------------------------- 1 | const logger = require('../logger')(__filename) 2 | const scrapSection = require('../scrapSection') 3 | 4 | const SEE_MORE_SELECTOR = 'a[data-control-name=contact_see_more]' 5 | const CLOSE_MODAL_SELECTOR = '.artdeco-modal__dismiss'; 6 | 7 | const template = { 8 | selector: '.pv-contact-info__contact-type', 9 | fields: { 10 | type: 'header', 11 | values: { 12 | selector: '.pv-contact-info__ci-container', 13 | isMultipleFields: true 14 | }, 15 | links: { 16 | selector: 'a', 17 | attribute: 'href', 18 | isMultipleFields: true 19 | } 20 | } 21 | } 22 | const getContactInfo = async(page) => { 23 | await page.waitFor(SEE_MORE_SELECTOR, { timeout: 2000 }) 24 | .catch(() => { 25 | logger.warn('contact-info', 'selector not found') 26 | return {} 27 | }) 28 | 29 | const element = await page.$(SEE_MORE_SELECTOR) 30 | if(element){ 31 | await element.click() 32 | const contactInfoIndicatorSelector = '#pv-contact-info' 33 | await page.waitFor(contactInfoIndicatorSelector, { timeout: 5000 }) 34 | 
.catch(() => { 35 | logger.warn('contact info was not found') 36 | }) 37 | 38 | const contactInfo = await scrapSection(page, template) 39 | const closeButton = await page.$(CLOSE_MODAL_SELECTOR) 40 | if(closeButton) 41 | await closeButton.click() 42 | 43 | return contactInfo 44 | } 45 | 46 | } 47 | 48 | module.exports = getContactInfo 49 | -------------------------------------------------------------------------------- /src/profile/profile.js: -------------------------------------------------------------------------------- 1 | const openPage = require('../openPage') 2 | const scrapSection = require('../scrapSection') 3 | const scrapAccomplishmentPanel = require('./scrapAccomplishmentPanel') 4 | const scrollToPageBottom = require('./scrollToPageBottom') 5 | const seeMoreButtons = require('./seeMoreButtons') 6 | const contactInfo = require('./contactInfo') 7 | const template = require('./profileScraperTemplate') 8 | const cleanProfileData = require('./cleanProfileData') 9 | 10 | const logger = require('../logger')(__filename) 11 | 12 | module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, hasToGetContactInfo = false, puppeteerAuthenticate = undefined) => { 13 | logger.info(`starting scraping url: ${url}`) 14 | 15 | const page = await openPage({ browser, cookies, url, puppeteerAuthenticate }) 16 | const profilePageIndicatorSelector = '.pv-profile-section' 17 | await page.waitFor(profilePageIndicatorSelector, { timeout: 5000 }) 18 | .catch(() => { 19 | //why doesn't throw error instead of continuing scraping? 
20 | //because it can be just a false negative meaning LinkedIn only changed that selector but everything else is fine :) 21 | logger.warn('profile selector was not found') 22 | }) 23 | 24 | logger.info('scrolling page to the bottom') 25 | await scrollToPageBottom(page) 26 | 27 | if(waitTimeToScrapMs) { 28 | logger.info(`applying 1st delay`) 29 | await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)}) 30 | } 31 | 32 | await seeMoreButtons.clickAll(page) 33 | 34 | if(waitTimeToScrapMs) { 35 | logger.info(`applying 2nd (and last) delay`) 36 | await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)}) 37 | } 38 | 39 | const [profile] = await scrapSection(page, template.profile) 40 | const [about] = await scrapSection(page, template.about) 41 | const positions = await scrapSection(page, template.positions) 42 | const educations = await scrapSection(page, template.educations) 43 | const [recommendationsCount] = await scrapSection(page, template.recommendationsCount) 44 | const recommendationsReceived = await scrapSection(page, template.recommendationsReceived) 45 | const recommendationsGiven = await scrapSection(page, template.recommendationsGiven) 46 | const skills = await scrapSection(page, template.skills) 47 | const accomplishments = await scrapSection(page, template.accomplishments) 48 | const courses = await scrapAccomplishmentPanel(page, 'courses') 49 | const languages = await scrapAccomplishmentPanel(page, 'languages') 50 | const projects = await scrapAccomplishmentPanel(page, 'projects') 51 | const volunteerExperience = await scrapSection(page, template.volunteerExperience) 52 | const peopleAlsoViewed = await scrapSection(page, template.peopleAlsoViewed) 53 | const contact = hasToGetContactInfo ? 
await contactInfo(page) : [] 54 | 55 | await page.close() 56 | logger.info(`finished scraping url: ${url}`) 57 | 58 | const rawProfile = { 59 | profile, 60 | about, 61 | positions, 62 | educations, 63 | skills, 64 | recommendations: { 65 | givenCount: recommendationsCount ? recommendationsCount.given : "0", 66 | receivedCount: recommendationsCount ? recommendationsCount.received : "0", 67 | given: recommendationsReceived, 68 | received: recommendationsGiven 69 | }, 70 | accomplishments, 71 | courses, 72 | languages, 73 | projects, 74 | peopleAlsoViewed, 75 | volunteerExperience, 76 | contact 77 | } 78 | 79 | const cleanedProfile = cleanProfileData(rawProfile) 80 | return cleanedProfile 81 | } 82 | -------------------------------------------------------------------------------- /src/profile/profileScraperTemplate.js: -------------------------------------------------------------------------------- 1 | const profileSelector = '.core-rail > *:first-child section >' 2 | 3 | const template = { 4 | profile: { 5 | selector: '.pv-top-card', 6 | fields: { 7 | name: `.text-heading-xlarge`, 8 | headline: `.text-body-medium`, 9 | location: `.pb2 .text-body-small`, 10 | connections: `li.text-body-small`, 11 | imageurl: { 12 | selector: `img.pv-top-card__photo`, 13 | attribute: 'src' 14 | } 15 | } 16 | }, 17 | about: { 18 | selector: '.pv-about-section', 19 | fields: { 20 | text: 'p' 21 | } 22 | }, 23 | positions: { 24 | selector: '#experience-section li:not(.pv-entity__position-group-role-item)', 25 | fields: { 26 | title: 'h3', 27 | link: { 28 | selector: 'a', 29 | attribute: 'href', 30 | }, 31 | url: { 32 | selector: 'a', 33 | attribute: 'href' 34 | }, 35 | companyName: '.pv-entity__secondary-title', 36 | location: '.pv-entity__location span:last-child', 37 | description: '.pv-entity__description', 38 | date1: '.pv-entity__date-range span:last-child', 39 | date2: '.pv-entity__bullet-item-v2', 40 | roles: { 41 | selector: 'li', 42 | hasChildrenFields: true, 43 | fields: { 44 | 
title: 'h3',
          description: '.pv-entity__description',
          date1: '.pv-entity__date-range span:last-child',
          date2: '.pv-entity__bullet-item-v2',
          location: '.pv-entity__location span:last-child'
        }
      }
    }
  },
  // Education entries under #education-section.
  educations: {
    selector: '#education-section li',
    fields: {
      title: 'h3',
      degree: 'span[class=pv-entity__comma-item]',
      url: {
        selector: 'a',
        attribute: 'href'
      },
      fieldOfStudy: 'p.pv-entity__fos span:nth-child(2)',
      date1: '.pv-entity__dates time:nth-child(1)',
      date2: '.pv-entity__dates time:nth-child(2)',
      description: '.pv-entity__description'
    }
  },
  skills: {
    selector: '.pv-skill-category-entity__skill-wrapper',
    fields: {
      title: '.pv-skill-category-entity__name-text',
      count: '.pv-skill-category-entity__endorsement-count'
    }
  },
  // The two tab headers carry the received/given totals.
  recommendationsCount: {
    selector: '.recommendations-inlining',
    fields: {
      received: '.artdeco-tab:nth-child(1)',
      given: '.artdeco-tab:nth-child(2)'
    }
  },
  recommendationsReceived: {
    selector: '.recommendations-inlining',
    fields: {
      user: {
        selector: '.pv-recommendation-entity__member',
        attribute: 'href'
      },
      text: 'blockquote.pv-recommendation-entity__text',
      profileImage: {
        selector: 'a img',
        attribute: 'src'
      },
      name: {
        selector: 'a h3'
      },
      userDescription: {
        selector: '.pv-recommendation-entity__headline'
      }
    }
  },
  recommendationsGiven: {
    selector: '.artdeco-tabpanel li.pv-recommendation-entity',
    fields: {
      user: {
        selector: '.pv-recommendation-entity__member',
        attribute: 'href'
      },
      text: 'blockquote.pv-recommendation-entity__text',
      profileImage: {
        selector: 'a img',
        attribute: 'src'
      },
      name: {
        selector: 'a h3'
      },
      userDescription: {
        selector: '.pv-recommendation-entity__headline'
      }
    }
  },
  accomplishments: {
    selector: '.pv-accomplishments-section > div',
    fields: {
      count: 'h3 span:last-child',
      title: '.pv-accomplishments-block__title',
      items: {
        // isMultipleFields: scrape every matching element into an array.
        selector: 'li',
        isMultipleFields: true
      }
    }
  },
  peopleAlsoViewed: {
    selector: 'li.pv-browsemap-section__member-container',
    fields: {
      user: {
        selector: 'a',
        attribute: 'href'
      },
      text: 'p',
      profileImage: {
        selector: 'a img',
        attribute: 'src'
      },
      name: {
        selector: '.name'
      }
    }
  },
  volunteerExperience: {
    selector: 'section.volunteering-section li',
    fields: {
      title: 'h3',
      experience: 'span[class=pv-entity__secondary-title]',
      location: '.pv-entity__location span:nth-child(2)',
      description: '.pv-volunteer-causes',
      date1: '.pv-entity__date-range span:nth-child(2)',
      date2: '.pv-entity__bullet-item'
    }
  },
  // The three templates below are used by scrapAccomplishmentPanel after the
  // matching accomplishments panel has been expanded.
  courses: {
    selector: '.pv-accomplishments-section',
    fields: {
      name: '.pv-accomplishment-entity__title',
      year: '.pv-accomplishment-entity__course-number'
    }
  },
  languages: {
    selector: '.pv-accomplishments-section',
    fields: {
      name: '.pv-accomplishment-entity__title',
      proficiency: '.pv-accomplishment-entity__proficiency',
    }
  },
  projects: {
    selector: '.pv-accomplishments-section',
    fields: {
      name: '.pv-accomplishment-entity__title',
      date: '.pv-accomplishment-entity__date',
      description: '.pv-accomplishment-entity__description',
      link: {
        selector: '.pv-accomplishment-entity__external-source',
        attribute: 'href'
      }
    }
  }
}


module.exports = template

// --- src/profile/scrapAccomplishmentPanel.js ---
-------------------------------------------------------------------------------- 1 | const scrapSection = require('../scrapSection'); 2 | const template = require('./profileScraperTemplate'); 3 | 4 | const scrapAccomplishmentPanel = async (page, section) => { 5 | const queryString = `.pv-accomplishments-block.${section} button` 6 | 7 | const openingButton = await page.$(queryString); 8 | 9 | if (openingButton) { 10 | await page.evaluate((q) => { 11 | document.querySelector(q).click(); 12 | }, queryString); 13 | 14 | return scrapSection(page, template[section]); 15 | } 16 | }; 17 | 18 | module.exports = scrapAccomplishmentPanel; -------------------------------------------------------------------------------- /src/profile/scrollToPageBottom.js: -------------------------------------------------------------------------------- 1 | const logger = require('../logger')(__filename) 2 | 3 | module.exports = async (page) => { 4 | const MAX_TIMES_TO_SCROLL = 25 5 | const TIMEOUT_BETWEEN_SCROLLS = 500 6 | const PAGE_BOTTOM_SELECTOR_STRING = '#expanded-footer' 7 | 8 | for (let i = 0; i < MAX_TIMES_TO_SCROLL; i++) { 9 | await page.evaluate(() => window.scrollBy(0, window.innerHeight)) 10 | 11 | const hasReachedEnd = await page.waitForSelector(PAGE_BOTTOM_SELECTOR_STRING, { 12 | visible: true, 13 | timeout: TIMEOUT_BETWEEN_SCROLLS 14 | }).catch(() => { 15 | logger.info(`scrolling to page bottom (${i + 1})`) 16 | }) 17 | 18 | if (hasReachedEnd) { 19 | return 20 | } 21 | } 22 | 23 | logger.warn('page bottom not found') 24 | } 25 | -------------------------------------------------------------------------------- /src/profile/seeMoreButtons.js: -------------------------------------------------------------------------------- 1 | const logger = require('../logger')(__filename) 2 | const seeMoreButtons = [ 3 | { 4 | id: 'SHOW_MORE_ABOUT', 5 | selector: '#line-clamp-show-more-button' 6 | },{ 7 | id: 'SHOW_MORE_EXPERIENCES', 8 | selector: '#experience-section 
.pv-profile-section__see-more-inline' 9 | },{ 10 | id: 'SEE_MORE_EXPERIENCES', 11 | selector: '#experience-section .inline-show-more-text__button' 12 | },{ 13 | id: 'SHOW_MORE_CERTIFICATIONS', 14 | selector: '#certifications-section .pv-profile-section__see-more-inline' 15 | },{ 16 | id: 'SHOW_MORE_SKILLS', 17 | selector: '.pv-skills-section__additional-skills' 18 | },{ 19 | id: 'SEE_MORE_RECOMMENDATIONS', 20 | selector: '.recommendations-inlining #line-clamp-show-more-button' 21 | } 22 | ] 23 | 24 | 25 | const clickAll = async(page) => { 26 | for(let i = 0; i < seeMoreButtons.length; i++){ 27 | const button = seeMoreButtons[i] 28 | const elems = await page.$$(button.selector) 29 | 30 | for(let j = 0; j < elems.length; j++){ 31 | const elem = elems[j] 32 | if (elem) { 33 | await elem.click() 34 | .catch((e) => logger.warn(`couldn't click on ${button.selector}, it's probably invisible`)) 35 | } 36 | } 37 | } 38 | 39 | return 40 | } 41 | 42 | module.exports = { clickAll } 43 | -------------------------------------------------------------------------------- /src/scrapSection.js: -------------------------------------------------------------------------------- 1 | const scrapSelectorFields = (selector, section) => async (scrapedObjectPromise, fieldKey) => { 2 | const scrapedObject = await scrapedObjectPromise 3 | const field = section.fields[fieldKey] 4 | 5 | // currently field can be a selector string, or an object containing a selector field 6 | const fieldSelectorString = await field.selector 7 | ? field.selector 8 | : field 9 | 10 | const isFieldPresent = await selector.$(fieldSelectorString) 11 | 12 | if (!isFieldPresent) { return scrapedObject } 13 | 14 | if (field.isMultipleFields) { 15 | if (field.attribute === 'href') { 16 | scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.href ? 
elem.href.trim() : elem.innerHTML.trim()))
    } else if(field.attribute === 'src'){
      scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.src ? elem.src.trim() : elem.innerHTML.trim()))
    }else{
      scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.innerText.trim()))
    }
  } else if (field.hasChildrenFields) {
    // Nested template: scrape each child element with its own fields.
    const fieldChildrenSelectors = await selector.$$(field.selector)

    scrapedObject[fieldKey] = await Promise.all(
      fieldChildrenSelectors.map((s) => scrapSelector(s, field))
    )
  } else if (field.attribute && field.attribute === 'href') {
    scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.href ? elem.href.trim() : '')
  } else if (field.attribute && field.attribute === 'src') {
    scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.src ? elem.src.trim() : '')
  } else {
    scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.innerText ? elem.innerText.trim() : '')
  }

  return scrapedObject
}
// Folds every field of a section template into one scraped object, chaining
// through a promise accumulator so fields are scraped sequentially.
const scrapSelector = (selector, section) =>
  Object.keys(section.fields)
    .reduce(scrapSelectorFields(selector, section), Promise.resolve({}))

// Scrapes all DOM nodes matching section.selector; resolves to one object per node.
module.exports = async (page, section) => {
  const sectionSelectors = await page.$$(section.selector)

  const scrapedPromises = sectionSelectors
    .map((selector) => scrapSelector(selector, section))

  return Promise.all(scrapedPromises)
}

// --- src/scrapedin.js ---
const puppeteer = require('puppeteer')
const login = require('./login')
const profile = require('./profile/profile')
const company = require('./company/company')
const logger = require('./logger')(__filename)

/**
 * Entry point: launches (or connects to) a browser, optionally logs in, and
 * returns a scraper function bound to that browser session.
 */
module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
  if (!hasToLog) {
    logger.stopLogging()
  }
  logger.info('initializing')

  let browser;
  if(endpoint){
    // Reuse an already-running browser over its websocket endpoint.
    browser = await puppeteer.connect({
      browserWSEndpoint: endpoint,
    });
  }else{
    const args = Object.assign({ headless: isHeadless, args: ['--no-sandbox'] }, puppeteerArgs)
    browser = await puppeteer.launch(args)
  }

  if (cookies) {
    logger.info('using cookies, login will be bypassed')
  } else if (email && password) {
    logger.info('email and password was provided, we\'re going to login...')

    try {
      await login(browser, email, password, logger)
    } catch (e) {
      // Only close a browser we launched ourselves; an externally provided
      // endpoint's browser is not ours to close.
      if(!endpoint){
        await browser.close()
      }
      throw e
    }
  } else {
    logger.warn('email/password and cookies wasn\'t provided, only public data will be collected')
  }

  return (url,
waitMs) => url.includes('/school/') || url.includes('/company/') ? company(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate) :profile(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate)
  // School/company URLs are routed to the company scraper; everything else
  // is treated as a profile URL.
}

// --- src/scrapedin.test.js ---
const faker = require('faker')
const { expect } = require('chai')
const profile = require('./profile/profile')
const logger = require('./logger')(__filename)
const { mock, match } = require('sinon')
const profileScraperTemplate = require('./profile/profileScraperTemplate')
const url = faker.internet.url()
const fakeEvalResult = faker.lorem.words(1)

// Make the linter happy.
var mocha = require('mocha')
var it = mocha.it

logger.stopLogging()

// NOTE(review): the expected keys below (aboutAlternative, profileLegacy, …)
// don't match the rawProfile keys built in profile.js — presumably
// cleanProfileData produces them; verify against that module.
it('should get complete profile', async () => {
  const browserMock = prepareBrowserMock()
  const result = await profile(browserMock, [], url, 0)
  const expectedResult = {
    aboutAlternative: {
      text: fakeEvalResult
    },
    aboutLegacy: {
      text: fakeEvalResult
    },
    accomplishments: [
      {
        count: fakeEvalResult,
        items: [fakeEvalResult],
        title: fakeEvalResult
      }
    ],
    contact: {},
    courses: [
      {
        name: fakeEvalResult,
        year: fakeEvalResult
      }
    ],
    educations: [
      {
        date1: fakeEvalResult,
        date2: fakeEvalResult,
        degree: fakeEvalResult,
        fieldOfStudy: fakeEvalResult,
        url: fakeEvalResult,
        title: fakeEvalResult
      }
    ],
    languages: [
      {
        name: fakeEvalResult,
        proficiency: fakeEvalResult
      }
    ],
    peopleAlsoViewed: [
      {
        text: fakeEvalResult,
        user: fakeEvalResult
      }
    ],
    positions: [
      {
        companyName: fakeEvalResult,
        date1: fakeEvalResult,
        date2:
fakeEvalResult,
        description: fakeEvalResult,
        link: fakeEvalResult,
        location: fakeEvalResult,
        roles: [
          {
            date1: fakeEvalResult,
            date2: fakeEvalResult,
            description: fakeEvalResult,
            location: fakeEvalResult,
            title: fakeEvalResult
          }
        ],
        title: fakeEvalResult,
        url: fakeEvalResult
      }
    ],
    profile: {
      connections: fakeEvalResult,
      headline: fakeEvalResult,
      location: fakeEvalResult,
      name: fakeEvalResult,
      summary: fakeEvalResult
    },
    profileAlternative: {
      connections: fakeEvalResult,
      headline: fakeEvalResult,
      imageurl: fakeEvalResult,
      location: fakeEvalResult,
      name: fakeEvalResult
    },
    profileLegacy: {
      connections: fakeEvalResult,
      headline: fakeEvalResult,
      location: fakeEvalResult,
      name: fakeEvalResult,
      summary: fakeEvalResult
    },
    projects: [
      {
        date: fakeEvalResult,
        description: fakeEvalResult,
        link: fakeEvalResult,
        name: fakeEvalResult
      }
    ],
    recommendations: {
      given: [
        {
          text: fakeEvalResult,
          user: fakeEvalResult
        }
      ],
      givenCount: '',
      received: [
        {
          text: fakeEvalResult,
          user: fakeEvalResult
        }
      ],
      receivedCount: ''
    },
    skills: [
      {
        count: fakeEvalResult,
        title: fakeEvalResult
      }
    ],
    volunteerExperience: [
      {
        date1: fakeEvalResult,
        date2: fakeEvalResult,
        description: fakeEvalResult,
        experience: fakeEvalResult,
        location: fakeEvalResult,
        title: fakeEvalResult
      }
    ]
  }

  expect(result).to.deep.equals(expectedResult)
})

// Same flow with the "incomplete" mock: $eval yields '' and the
// positions.title lookup returns undefined, so most fields come back empty.
it('should get an incomplete profile', async () => {
  const browser = prepareBrowserMock(true)

  const result = await profile(browser, [], url, 0)
  const expectedResult = {
    aboutAlternative: {
      text: ''
    },
    aboutLegacy: {
      text: ''
    },
    accomplishments: [
      {
        count: '',
        items: [fakeEvalResult],
        title: ''
      }
    ],
    contact: {},
    courses: [{}],
    educations: [
      {
        date1: '',
        date2: '',
        degree: '',
        fieldOfStudy: '',
        url: ''
      }
    ],
    languages: [
      {
        name: undefined,
        proficiency: ''
      }
    ],
    peopleAlsoViewed: [
      {
        text: '',
        user: ''
      }
    ],
    positions: [
      {
        companyName: '',
        date1: '',
        date2: '',
        description: '',
        link: '',
        location: '',
        roles: [
          {
            date1: '',
            date2: '',
            description: '',
            location: '',
            title: ''
          }
        ],
        url: ''
      }
    ],
    profile: {
      connections: '',
      headline: '',
      location: '',
      name: ''
    },
    profileAlternative: {
      connections: '',
      headline: '',
      imageurl: '',
      location: '',
      name: ''
    },
    profileLegacy: {
      connections: '',
      headline: '',
      location: '',
      name: ''
    },
    projects: [
      {
        date: '',
        description: undefined,
        link: '',
        name: undefined
      }
    ],
    recommendations: {
      given: [
        {
          text: '',
          user: ''
        }
      ],
      givenCount: '',
      received: [
        {
          text: '',
          user: ''
        }
      ],
      receivedCount: ''
    },
    skills: [
      {
        count: '',
        title: ''
      }
    ],
    volunteerExperience: [
      {
        date1: '',
        date2: '',
        description: '',
        experience: '',
        location: ''
      }
    ]
  }

  expect(result).to.deep.equals(expectedResult)
})

// Builds a sinon-mocked puppeteer Browser whose pages answer every call the
// scraper makes. Page.$/$$ return fresh Page instances so nested selector
// lookups keep working.
const prepareBrowserMock = (isIncompleteProfile) => {
  const Page = function () {
    this.goto = mock().once().withExactArgs(url).resolves()
    this.setUserAgent = mock().once().resolves()
    this.setExtraHTTPHeaders = mock().once().resolves()
    this.setViewport = mock().once().resolves()
    this.waitFor = mock().once().resolves()

    this.evaluate = mock()
      .twice()
      .withExactArgs(match.func)
      .atLeast(1)
      .resolves()
    // First waitForSelector call rejects (footer not yet visible), second
    // resolves — this exercises the scrollToPageBottom retry loop.
    this.waitForSelector = mock()
      .withExactArgs(match.string, match.object)
      .twice()
      .onCall(0)
      .rejects()
      .onCall(1)
      .resolves(true)

    this.setCookie = mock().once().withExactArgs().resolves()

    this.click = mock().atLeast(1).withExactArgs().resolves()
    this.$$eval = mock()
      .withExactArgs(match.string, match.func)
      .atLeast(1)
      .callsArgWith(1, [{ innerText: fakeEvalResult }])
      .resolves([fakeEvalResult])

    this.$eval = mock()
      .withExactArgs(match.string, match.func)
      .atLeast(1)
      .callsArgWith(
        1,
        isIncompleteProfile
          ? undefined
          : {
              innerText: fakeEvalResult,
              src: fakeEvalResult,
              href: fakeEvalResult
            }
      )
      .resolves(isIncompleteProfile ? '' : fakeEvalResult)

    this.close = mock().once().resolves()
  }

  Page.prototype.$ = () => new Page()

  if (isIncompleteProfile) {
    // I couldn't do that with sinon :(
    Page.prototype.$ = (arg) =>
      arg === profileScraperTemplate.positions.fields.title
        ? undefined
        : Promise.resolve(new Page())
  }

  Page.prototype.$$ = () => [new Page()]

  const browser = {
    newPage: mock().once().withExactArgs().resolves(new Page())
  }

  return browser
}