├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── logo.png ├── package.json └── src ├── company ├── company.js └── companyScraperTemplate.js ├── logger.js ├── login.js ├── openPage.js ├── package.js ├── profile ├── cleanProfileData.js ├── contactInfo.js ├── profile.js ├── profileScraperTemplate.js ├── scrapAccomplishmentPanel.js ├── scrollToPageBottom.js └── seeMoreButtons.js ├── scrapSection.js ├── scrapedin.js └── scrapedin.test.js /.gitignore: -------------------------------------------------------------------------------- 1 | test.js 2 | cookies.json 3 | 4 | # Logs 5 | logs 6 | *.log 7 | npm-debug.log* 8 | yarn-debug.log* 9 | yarn-error.log* 10 | 11 | # Runtime data 12 | pids 13 | *.pid 14 | *.seed 15 | *.pid.lock 16 | 17 | # Directory for instrumented libs generated by jscoverage/JSCover 18 | lib-cov 19 | 20 | # Coverage directory used by tools like istanbul 21 | coverage 22 | 23 | # nyc test coverage 24 | .nyc_output 25 | 26 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 27 | .grunt 28 | 29 | # Bower dependency directory (https://bower.io/) 30 | bower_components 31 | 32 | # node-waf configuration 33 | .lock-wscript 34 | 35 | # Compiled binary addons (https://nodejs.org/api/addons.html) 36 | build/Release 37 | 38 | # Dependency directories 39 | node_modules/ 40 | jspm_packages/ 41 | 42 | # TypeScript v1 declaration files 43 | typings/ 44 | 45 | # Optional npm cache directory 46 | .npm 47 | 48 | # Optional eslint cache 49 | .eslintcache 50 | 51 | # Optional REPL history 52 | .node_repl_history 53 | 54 | # Output of 'npm pack' 55 | *.tgz 56 | 57 | # Yarn Integrity file 58 | .yarn-integrity 59 | 60 | # dotenv environment variables file 61 | .env 62 | 63 | # next.js build output 64 | .next 65 | 66 | # Package lock was removed from the repo as of b0dad2 67 | package-lock.json 68 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "7.6" 4 | script: 5 | - npm test 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](https://github.com/linkedtales/scrapedin/raw/master/logo.png) 2 | [![Build Status](https://travis-ci.org/leonardiwagner/scrapedin.svg?branch=master)](https://travis-ci.org/leonardiwagner/scrapedin) 3 | [![NPM version](https://img.shields.io/npm/v/scrapedin.svg)](https://www.npmjs.com/package/scrapedin) 4 | ---- 5 | Scraper for LinkedIn full profile data. Unlike others scrapers, it's working in 2020 with their new website. 
6 | 7 | `npm i scrapedin` 8 | 9 | ### Usage Example: 10 | 11 | ```javascript 12 | const scrapedin = require('scrapedin') 13 | 14 | const profileScraper = await scrapedin({ email: 'login@mail.com', password: 'pass' }) 15 | const profile = await profileScraper('https://www.linkedin.com/in/some-profile/') 16 | ``` 17 | 18 | - If you are looking for a crawler to automatically extract multiple profiles see [scrapedin-crawler](https://github.com/linkedtales/scrapedin-linkedin-crawler) 19 | 20 | ### Start Guide: 21 | 22 | - [Basic Tutorial](https://github.com/linkedtales/scrapedin/wiki/Basic-Tutorial) 23 | - [Using Cookies to Login](https://github.com/linkedtales/scrapedin/wiki/Using-Cookies-To-Login) 24 | - [Tips](https://github.com/linkedtales/scrapedin/wiki/Tips) 25 | - [Documentation](https://github.com/linkedtales/scrapedin/wiki/Documentation) 26 | 27 | 28 | ### Contribution 29 | 30 | Feel free to contribute. Just open an issue to discuss something before creating a PR. 31 | 32 | ### License 33 | 34 | [Apache 2.0][apache-license] 35 | 36 | [apache-license]:./LICENSE 37 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linkedtales/scrapedin/e0612c49d018d2331b9f76ec4b74df8dd73b5695/logo.png -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scrapedin", 3 | "version": "1.0.21", 4 | "description": "linkedin scraper for 2020 website", 5 | "keywords": [ 6 | "linkedin", 7 | "scraper", 8 | "crawler" 9 | ], 10 | "main": "src/scrapedin.js", 11 | "scripts": { 12 | "test": "mocha src/**.test.js", 13 | "lint": "standard --fix 'src/*.js'", 14 | "coverage": "nyc npm test", 15 | "coverage:report": "nyc --reporter=lcov npm test" 16 | }, 17 | "repository": { 18 | "type": "git", 19 | 
"url": "https://github.com/linkedtales/scrapedin" 20 | }, 21 | "bugs": { 22 | "url": "https://github.com/linkedtales/scrapedin/issues" 23 | }, 24 | "author": "Wagner Leonardi ", 25 | "license": "Apache-2.0", 26 | "dependencies": { 27 | "puppeteer": "1.13.0", 28 | "winston": "3.1.0" 29 | }, 30 | "devDependencies": { 31 | "chai": "4.2.0", 32 | "coveralls": "3.0.2", 33 | "faker": "4.1.0", 34 | "mocha": "5.2.0", 35 | "nyc": "^13.1.0", 36 | "sinon": "7.2.2", 37 | "standard": "12.0.1" 38 | }, 39 | "engines": { 40 | "node": ">= 7.6.0" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/company/company.js: -------------------------------------------------------------------------------- 1 | const openPage = require('../openPage') 2 | const scrapSection = require('../scrapSection') 3 | const template = require('./companyScraperTemplate') 4 | 5 | const logger = require('../logger')(__filename) 6 | 7 | module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, puppeteerAuthenticate = undefined) => { 8 | logger.info(`starting scraping url: ${url}`); 9 | 10 | let company = {}; 11 | 12 | let page; 13 | if(url.includes('legacySchoolId=')){ 14 | page = await openPage({ browser, cookies, url, puppeteerAuthenticate }); 15 | 16 | const aboutSelector = 'a[href$="/about/"]'; 17 | 18 | company.url = page.url(); 19 | 20 | await page.$eval(aboutSelector, async about => await about.click()); 21 | await page.waitForNavigation(); 22 | } else{ 23 | company.url = url; 24 | url = url + '/about'; 25 | page = await openPage({ browser, cookies, url, puppeteerAuthenticate }); 26 | } 27 | company.about = (await scrapSection(page, template.about))[0]; 28 | company.profile = (await scrapSection(page, template.profile))[0]; 29 | 30 | await page.close(); 31 | logger.info(`finished scraping url: ${url}`); 32 | 33 | return company 34 | 35 | } 36 | -------------------------------------------------------------------------------- 
/src/company/companyScraperTemplate.js: -------------------------------------------------------------------------------- 1 | const template = { 2 | profile: { 3 | selector: '.org-top-card', 4 | fields: { 5 | name: `h1`, 6 | headline: `p`, 7 | imageurl: { 8 | selector: `img.org-top-card-primary-content__logo`, 9 | attribute: 'src' 10 | } 11 | } 12 | }, 13 | about: { 14 | selector: '.org-grid__core-rail--no-margin-left', 15 | fields: { 16 | overview: 'p', 17 | types:{ 18 | selector: 'dl dt', 19 | isMultipleFields: true 20 | }, 21 | values:{ 22 | selector: 'dl dd:not(.org-page-details__employees-on-linkedin-count)', 23 | isMultipleFields: true 24 | } 25 | } 26 | } 27 | } 28 | 29 | 30 | module.exports = template 31 | -------------------------------------------------------------------------------- /src/logger.js: -------------------------------------------------------------------------------- 1 | const path = require('path') 2 | const pkg = require('./package') 3 | const winston = require('winston') 4 | const logger = winston.createLogger({ 5 | format: winston.format.combine( 6 | winston.format.splat(), 7 | winston.format.simple(), 8 | winston.format.timestamp(), 9 | winston.format.colorize(), 10 | winston.format.printf(info => `${pkg.name}: ${info.timestamp} ${info.level}: ${info.message}`) 11 | ), 12 | transports: [new winston.transports.Console()] 13 | }) 14 | 15 | const loggerWrapper = (absoluteFilePath) => { 16 | const file = path.relative(__dirname, absoluteFilePath) 17 | // Because this file is in the source code root folder, the above will make all paths relative to it: just the info needed for the log. 18 | 19 | return { 20 | info: (message) => logger.info(`[${file}] ${message}`), 21 | warn: (message) => logger.warn(`[${file}] ${message}`), 22 | error: (message, error) => logger.error(`[${file}] ${message}${error && error.stack ? 
error.stack : (error || '')}`), 23 | stopLogging: () => { 24 | logger.silent = true 25 | } 26 | } 27 | } 28 | 29 | module.exports = loggerWrapper 30 | -------------------------------------------------------------------------------- /src/login.js: -------------------------------------------------------------------------------- 1 | const openPage = require('./openPage') 2 | const logger = require('./logger')(__filename) 3 | const pkg = require('./package') 4 | 5 | module.exports = async (browser, email, password) => { 6 | const url = 'https://www.linkedin.com/login' 7 | const page = await openPage({ browser, url }) 8 | logger.info(`logging at: ${url}`) 9 | 10 | await page.goto(url) 11 | await page.waitFor('#username') 12 | 13 | await page.$('#username') 14 | .then((emailElement) => emailElement.type(email)) 15 | await page.$('#password') 16 | .then((passwordElement) => passwordElement.type(password)) 17 | 18 | await page.$x("//button[contains(text(), 'Sign in')]") 19 | .then((button) => button[0].click()) 20 | 21 | return page.waitFor('input[role=combobox]', { 22 | timeout: 15000 23 | }) 24 | .then(async () => { 25 | logger.info('logged feed page selector found') 26 | await page.close() 27 | }) 28 | .catch(async () => { 29 | logger.warn('successful login element was not found') 30 | const emailError = await page.evaluate(() => { 31 | const e = document.querySelector('div[error-for=username]') 32 | if (!e) { return false } 33 | const style = window.getComputedStyle(e) 34 | return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0' 35 | }) 36 | 37 | const passwordError = await page.evaluate(() => { 38 | const e = document.querySelector('div[error-for=password]') 39 | if (!e) { return false } 40 | const style = window.getComputedStyle(e) 41 | return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0' 42 | }) 43 | 44 | const manualChallengeRequested = await page.evaluate(() => { 45 | const e 
= document.querySelector('.flow-challenge-content') 46 | if (!e) { return false } 47 | const style = window.getComputedStyle(e) 48 | return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0' 49 | }) 50 | 51 | if (emailError) { 52 | logger.info('wrong username element found') 53 | return Promise.reject(new Error(`linkedin: invalid username: ${email}`)) 54 | } 55 | 56 | if (passwordError) { 57 | logger.info('wrong password element found') 58 | return Promise.reject(new Error('linkedin: invalid password')) 59 | } 60 | 61 | if (page.$(manualChallengeRequested)) { 62 | logger.warn('manual check was required') 63 | return Promise.reject(new Error(`linkedin: manual check was required, verify if your login is properly working manually or report this issue: ${pkg.name} ${pkg.version} ${pkg.bugs.url}`)) 64 | } 65 | 66 | logger.error('could not find any element to retrieve a proper error') 67 | return Promise.reject(new Error(`${pkg.name} ${pkg.version} login is not working, please report: ${pkg.bugs.url}`)) 68 | }) 69 | } 70 | -------------------------------------------------------------------------------- /src/openPage.js: -------------------------------------------------------------------------------- 1 | const agents = [ 2 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36' 3 | // "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 4 | // "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 5 | // "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:34.0) Gecko/20100101 Firefox/34.0", 6 | // "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 7 | // "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0", 8 | // "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 9 | // "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0", 10 | // "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" 11 | ] 12 | 13 | module.exports = ({ browser, cookies, url, puppeteerAuthenticate }) => new Promise( async (resolve, reject) => { 14 | const page = await browser.newPage() 15 | page.on('error', err => {reject(err)}) 16 | 17 | if (cookies) { 18 | await page.setCookie(...cookies) 19 | } 20 | await page.setUserAgent(agents[Math.floor(Math.random() * agents.length)]) 21 | await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8' }) 22 | await page.setViewport({ 23 | width: 1920, 24 | height: 1080 25 | }) 26 | 27 | if (puppeteerAuthenticate) { 28 | await page.authenticate(puppeteerAuthenticate) 29 | } 30 | 31 | await page.goto(url) 32 | 33 | resolve(page) 34 | }) 35 | -------------------------------------------------------------------------------- /src/package.js: -------------------------------------------------------------------------------- 1 | const pkg = require('../package.json') 2 | 3 | // Only specific keys are needed, not the whole file. 4 | 5 | module.exports = { 6 | bugs: { 7 | url: pkg.bugs.url 8 | }, 9 | name: pkg.name, 10 | version: pkg.version 11 | } 12 | -------------------------------------------------------------------------------- /src/profile/cleanProfileData.js: -------------------------------------------------------------------------------- 1 | const logger = require('../logger')(__filename) 2 | const pkg = require('../package') 3 | 4 | module.exports = (profile) => { 5 | if(!profile.profile.name) { 6 | const messageError = `LinkedIn website changed and ${pkg.name} ${pkg.version} can't read basic data. 
Please report this issue at ${pkg.bugs.url}` 7 | logger.error(messageError, '') 8 | throw new Error(messageError) 9 | } 10 | 11 | profile.profile.summary = profile.about.text 12 | 13 | profile.positions.forEach((position) => { 14 | if(position.title){ 15 | position.title = position.title.replace('Company Name\n', '') 16 | } 17 | if(position.description) { 18 | position.description = position.description.replace('See more', ''); 19 | position.description = position.description.replace('see more', ''); 20 | position.description = position.description.replace('See less', ''); 21 | } 22 | if(position.roles) { 23 | position.roles.forEach((role) => { 24 | if(role.title) { 25 | role.title = role.title.replace('Title\n', '') 26 | } 27 | if(role.description) { 28 | role.description = role.description.replace('See more', '') 29 | role.description = role.description.replace('see more', '') 30 | } 31 | }) 32 | } 33 | }) 34 | 35 | if(profile.recommendations.receivedCount) { 36 | profile.recommendations.receivedCount = profile.recommendations.receivedCount.replace(/[^\d]/g, '') 37 | } 38 | 39 | if(profile.recommendations.givenCount) { 40 | profile.recommendations.givenCount = profile.recommendations.givenCount.replace(/[^\d]/g, '') 41 | } 42 | 43 | if(profile.recommendations.received) { 44 | profile.recommendations.received.forEach((recommendation) => { 45 | if(recommendation.summary){ 46 | recommendation.summary = recommendation.summary.replace('See more', '') 47 | recommendation.summary = recommendation.summary.replace('See less', '') 48 | } 49 | }) 50 | } 51 | 52 | if(profile.recommendations.given) { 53 | profile.recommendations.given.forEach((recommendation) => { 54 | if(recommendation.summary){ 55 | recommendation.summary = recommendation.summary.replace('See more', '') 56 | recommendation.summary = recommendation.summary.replace('See less', '') 57 | } 58 | }) 59 | } 60 | 61 | if(profile.courses){ 62 | profile.courses = profile.courses.map(({ name, year }) => { 63 | const 
coursesObj = {} 64 | if(name) { 65 | coursesObj.name = name.replace('Course name\n', '') 66 | } 67 | if(year) { 68 | coursesObj.year = year.replace('Course number\n', '') 69 | } 70 | return coursesObj 71 | } 72 | ); 73 | } 74 | 75 | if(profile.languages){ 76 | profile.languages = profile.languages.map(({ name, proficiency }) => ({ 77 | name: name ? name.replace('Language name\n', '') : undefined, 78 | proficiency, 79 | })); 80 | } 81 | 82 | if(profile.projects){ 83 | profile.projects = profile.projects.map( 84 | ({ name, date, description, link }) => ({ 85 | name: name ? name.replace('Project name\n', '') : undefined, 86 | date, 87 | description: description ? description.replace('Project description\n', '') : undefined, 88 | link, 89 | }), 90 | ); 91 | } 92 | 93 | return profile 94 | } 95 | -------------------------------------------------------------------------------- /src/profile/contactInfo.js: -------------------------------------------------------------------------------- 1 | const logger = require('../logger')(__filename) 2 | const scrapSection = require('../scrapSection') 3 | 4 | const SEE_MORE_SELECTOR = 'a[data-control-name=contact_see_more]' 5 | const CLOSE_MODAL_SELECTOR = '.artdeco-modal__dismiss'; 6 | 7 | const template = { 8 | selector: '.pv-contact-info__contact-type', 9 | fields: { 10 | type: 'header', 11 | values: { 12 | selector: '.pv-contact-info__ci-container', 13 | isMultipleFields: true 14 | }, 15 | links: { 16 | selector: 'a', 17 | attribute: 'href', 18 | isMultipleFields: true 19 | } 20 | } 21 | } 22 | const getContactInfo = async(page) => { 23 | await page.waitFor(SEE_MORE_SELECTOR, { timeout: 2000 }) 24 | .catch(() => { 25 | logger.warn('contact-info', 'selector not found') 26 | return {} 27 | }) 28 | 29 | const element = await page.$(SEE_MORE_SELECTOR) 30 | if(element){ 31 | await element.click() 32 | const contactInfoIndicatorSelector = '#pv-contact-info' 33 | await page.waitFor(contactInfoIndicatorSelector, { timeout: 5000 }) 34 | 
.catch(() => { 35 | logger.warn('contact info was not found') 36 | }) 37 | 38 | const contactInfo = await scrapSection(page, template) 39 | const closeButton = await page.$(CLOSE_MODAL_SELECTOR) 40 | if(closeButton) 41 | await closeButton.click() 42 | 43 | return contactInfo 44 | } 45 | 46 | } 47 | 48 | module.exports = getContactInfo 49 | -------------------------------------------------------------------------------- /src/profile/profile.js: -------------------------------------------------------------------------------- 1 | const openPage = require('../openPage') 2 | const scrapSection = require('../scrapSection') 3 | const scrapAccomplishmentPanel = require('./scrapAccomplishmentPanel') 4 | const scrollToPageBottom = require('./scrollToPageBottom') 5 | const seeMoreButtons = require('./seeMoreButtons') 6 | const contactInfo = require('./contactInfo') 7 | const template = require('./profileScraperTemplate') 8 | const cleanProfileData = require('./cleanProfileData') 9 | 10 | const logger = require('../logger')(__filename) 11 | 12 | module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, hasToGetContactInfo = false, puppeteerAuthenticate = undefined) => { 13 | logger.info(`starting scraping url: ${url}`) 14 | 15 | const page = await openPage({ browser, cookies, url, puppeteerAuthenticate }) 16 | const profilePageIndicatorSelector = '.pv-profile-section' 17 | await page.waitFor(profilePageIndicatorSelector, { timeout: 5000 }) 18 | .catch(() => { 19 | //why doesn't throw error instead of continuing scraping? 
20 | //because it can be just a false negative meaning LinkedIn only changed that selector but everything else is fine :) 21 | logger.warn('profile selector was not found') 22 | }) 23 | 24 | logger.info('scrolling page to the bottom') 25 | await scrollToPageBottom(page) 26 | 27 | if(waitTimeToScrapMs) { 28 | logger.info(`applying 1st delay`) 29 | await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)}) 30 | } 31 | 32 | await seeMoreButtons.clickAll(page) 33 | 34 | if(waitTimeToScrapMs) { 35 | logger.info(`applying 2nd (and last) delay`) 36 | await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)}) 37 | } 38 | 39 | const [profile] = await scrapSection(page, template.profile) 40 | const [about] = await scrapSection(page, template.about) 41 | const positions = await scrapSection(page, template.positions) 42 | const educations = await scrapSection(page, template.educations) 43 | const [recommendationsCount] = await scrapSection(page, template.recommendationsCount) 44 | const recommendationsReceived = await scrapSection(page, template.recommendationsReceived) 45 | const recommendationsGiven = await scrapSection(page, template.recommendationsGiven) 46 | const skills = await scrapSection(page, template.skills) 47 | const accomplishments = await scrapSection(page, template.accomplishments) 48 | const courses = await scrapAccomplishmentPanel(page, 'courses') 49 | const languages = await scrapAccomplishmentPanel(page, 'languages') 50 | const projects = await scrapAccomplishmentPanel(page, 'projects') 51 | const volunteerExperience = await scrapSection(page, template.volunteerExperience) 52 | const peopleAlsoViewed = await scrapSection(page, template.peopleAlsoViewed) 53 | const contact = hasToGetContactInfo ? 
await contactInfo(page) : [] 54 | 55 | await page.close() 56 | logger.info(`finished scraping url: ${url}`) 57 | 58 | const rawProfile = { 59 | profile, 60 | about, 61 | positions, 62 | educations, 63 | skills, 64 | recommendations: { 65 | givenCount: recommendationsCount ? recommendationsCount.given : "0", 66 | receivedCount: recommendationsCount ? recommendationsCount.received : "0", 67 | given: recommendationsReceived, 68 | received: recommendationsGiven 69 | }, 70 | accomplishments, 71 | courses, 72 | languages, 73 | projects, 74 | peopleAlsoViewed, 75 | volunteerExperience, 76 | contact 77 | } 78 | 79 | const cleanedProfile = cleanProfileData(rawProfile) 80 | return cleanedProfile 81 | } 82 | -------------------------------------------------------------------------------- /src/profile/profileScraperTemplate.js: -------------------------------------------------------------------------------- 1 | const profileSelector = '.core-rail > *:first-child section >' 2 | 3 | const template = { 4 | profile: { 5 | selector: '.pv-top-card', 6 | fields: { 7 | name: `.text-heading-xlarge`, 8 | headline: `.text-body-medium`, 9 | location: `.pb2 .text-body-small`, 10 | connections: `li.text-body-small`, 11 | imageurl: { 12 | selector: `img.pv-top-card__photo`, 13 | attribute: 'src' 14 | } 15 | } 16 | }, 17 | about: { 18 | selector: '.pv-about-section', 19 | fields: { 20 | text: 'p' 21 | } 22 | }, 23 | positions: { 24 | selector: '#experience-section li:not(.pv-entity__position-group-role-item)', 25 | fields: { 26 | title: 'h3', 27 | link: { 28 | selector: 'a', 29 | attribute: 'href', 30 | }, 31 | url: { 32 | selector: 'a', 33 | attribute: 'href' 34 | }, 35 | companyName: '.pv-entity__secondary-title', 36 | location: '.pv-entity__location span:last-child', 37 | description: '.pv-entity__description', 38 | date1: '.pv-entity__date-range span:last-child', 39 | date2: '.pv-entity__bullet-item-v2', 40 | roles: { 41 | selector: 'li', 42 | hasChildrenFields: true, 43 | fields: { 44 | 
title: 'h3',
          description: '.pv-entity__description',
          date1: '.pv-entity__date-range span:last-child',
          date2: '.pv-entity__bullet-item-v2',
          location: '.pv-entity__location span:last-child'
        }
      }
    }
  },
  // Education entries under #education-section.
  educations: {
    selector: '#education-section li',
    fields: {
      title: 'h3',
      degree: 'span[class=pv-entity__comma-item]',
      url: {
        selector: 'a',
        attribute: 'href'
      },
      fieldOfStudy: 'p.pv-entity__fos span:nth-child(2)',
      date1: '.pv-entity__dates time:nth-child(1)',
      date2: '.pv-entity__dates time:nth-child(2)',
      description: '.pv-entity__description'
    }
  },
  skills: {
    selector: '.pv-skill-category-entity__skill-wrapper',
    fields: {
      title: '.pv-skill-category-entity__name-text',
      count: '.pv-skill-category-entity__endorsement-count'
    }
  },
  // The two tab headers carry the received/given totals.
  recommendationsCount: {
    selector: '.recommendations-inlining',
    fields: {
      received: '.artdeco-tab:nth-child(1)',
      given: '.artdeco-tab:nth-child(2)'
    }
  },
  recommendationsReceived: {
    selector: '.recommendations-inlining',
    fields: {
      user: {
        selector: '.pv-recommendation-entity__member',
        attribute: 'href'
      },
      text: 'blockquote.pv-recommendation-entity__text',
      profileImage: {
        selector: 'a img',
        attribute: 'src'
      },
      name: {
        selector: 'a h3'
      },
      userDescription: {
        selector: '.pv-recommendation-entity__headline'
      }
    }
  },
  recommendationsGiven: {
    selector: '.artdeco-tabpanel li.pv-recommendation-entity',
    fields: {
      user: {
        selector: '.pv-recommendation-entity__member',
        attribute: 'href'
      },
      text: 'blockquote.pv-recommendation-entity__text',
      profileImage: {
        selector: 'a img',
        attribute: 'src'
      },
      name: {
        selector: 'a h3'
      },
      userDescription: {
        selector: '.pv-recommendation-entity__headline'
      }
    }
  },
  accomplishments: {
    selector: '.pv-accomplishments-section > div',
    fields: {
      count: 'h3 span:last-child',
      title: '.pv-accomplishments-block__title',
      items: {
        // isMultipleFields: scrape every matching element into an array.
        selector: 'li',
        isMultipleFields: true
      }
    }
  },
  peopleAlsoViewed: {
    selector: 'li.pv-browsemap-section__member-container',
    fields: {
      user: {
        selector: 'a',
        attribute: 'href'
      },
      text: 'p',
      profileImage: {
        selector: 'a img',
        attribute: 'src'
      },
      name: {
        selector: '.name'
      }
    }
  },
  volunteerExperience: {
    selector: 'section.volunteering-section li',
    fields: {
      title: 'h3',
      experience: 'span[class=pv-entity__secondary-title]',
      location: '.pv-entity__location span:nth-child(2)',
      description: '.pv-volunteer-causes',
      date1: '.pv-entity__date-range span:nth-child(2)',
      date2: '.pv-entity__bullet-item'
    }
  },
  // The three templates below are used by scrapAccomplishmentPanel after the
  // matching accomplishments panel has been expanded.
  courses: {
    selector: '.pv-accomplishments-section',
    fields: {
      name: '.pv-accomplishment-entity__title',
      year: '.pv-accomplishment-entity__course-number'
    }
  },
  languages: {
    selector: '.pv-accomplishments-section',
    fields: {
      name: '.pv-accomplishment-entity__title',
      proficiency: '.pv-accomplishment-entity__proficiency',
    }
  },
  projects: {
    selector: '.pv-accomplishments-section',
    fields: {
      name: '.pv-accomplishment-entity__title',
      date: '.pv-accomplishment-entity__date',
      description: '.pv-accomplishment-entity__description',
      link: {
        selector: '.pv-accomplishment-entity__external-source',
        attribute: 'href'
      }
    }
  }
}


module.exports = template

// --- src/profile/scrapAccomplishmentPanel.js ---
-------------------------------------------------------------------------------- 1 | const scrapSection = require('../scrapSection'); 2 | const template = require('./profileScraperTemplate'); 3 | 4 | const scrapAccomplishmentPanel = async (page, section) => { 5 | const queryString = `.pv-accomplishments-block.${section} button` 6 | 7 | const openingButton = await page.$(queryString); 8 | 9 | if (openingButton) { 10 | await page.evaluate((q) => { 11 | document.querySelector(q).click(); 12 | }, queryString); 13 | 14 | return scrapSection(page, template[section]); 15 | } 16 | }; 17 | 18 | module.exports = scrapAccomplishmentPanel; -------------------------------------------------------------------------------- /src/profile/scrollToPageBottom.js: -------------------------------------------------------------------------------- 1 | const logger = require('../logger')(__filename) 2 | 3 | module.exports = async (page) => { 4 | const MAX_TIMES_TO_SCROLL = 25 5 | const TIMEOUT_BETWEEN_SCROLLS = 500 6 | const PAGE_BOTTOM_SELECTOR_STRING = '#expanded-footer' 7 | 8 | for (let i = 0; i < MAX_TIMES_TO_SCROLL; i++) { 9 | await page.evaluate(() => window.scrollBy(0, window.innerHeight)) 10 | 11 | const hasReachedEnd = await page.waitForSelector(PAGE_BOTTOM_SELECTOR_STRING, { 12 | visible: true, 13 | timeout: TIMEOUT_BETWEEN_SCROLLS 14 | }).catch(() => { 15 | logger.info(`scrolling to page bottom (${i + 1})`) 16 | }) 17 | 18 | if (hasReachedEnd) { 19 | return 20 | } 21 | } 22 | 23 | logger.warn('page bottom not found') 24 | } 25 | -------------------------------------------------------------------------------- /src/profile/seeMoreButtons.js: -------------------------------------------------------------------------------- 1 | const logger = require('../logger')(__filename) 2 | const seeMoreButtons = [ 3 | { 4 | id: 'SHOW_MORE_ABOUT', 5 | selector: '#line-clamp-show-more-button' 6 | },{ 7 | id: 'SHOW_MORE_EXPERIENCES', 8 | selector: '#experience-section 
.pv-profile-section__see-more-inline' 9 | },{ 10 | id: 'SEE_MORE_EXPERIENCES', 11 | selector: '#experience-section .inline-show-more-text__button' 12 | },{ 13 | id: 'SHOW_MORE_CERTIFICATIONS', 14 | selector: '#certifications-section .pv-profile-section__see-more-inline' 15 | },{ 16 | id: 'SHOW_MORE_SKILLS', 17 | selector: '.pv-skills-section__additional-skills' 18 | },{ 19 | id: 'SEE_MORE_RECOMMENDATIONS', 20 | selector: '.recommendations-inlining #line-clamp-show-more-button' 21 | } 22 | ] 23 | 24 | 25 | const clickAll = async(page) => { 26 | for(let i = 0; i < seeMoreButtons.length; i++){ 27 | const button = seeMoreButtons[i] 28 | const elems = await page.$$(button.selector) 29 | 30 | for(let j = 0; j < elems.length; j++){ 31 | const elem = elems[j] 32 | if (elem) { 33 | await elem.click() 34 | .catch((e) => logger.warn(`couldn't click on ${button.selector}, it's probably invisible`)) 35 | } 36 | } 37 | } 38 | 39 | return 40 | } 41 | 42 | module.exports = { clickAll } 43 | -------------------------------------------------------------------------------- /src/scrapSection.js: -------------------------------------------------------------------------------- 1 | const scrapSelectorFields = (selector, section) => async (scrapedObjectPromise, fieldKey) => { 2 | const scrapedObject = await scrapedObjectPromise 3 | const field = section.fields[fieldKey] 4 | 5 | // currently field can be a selector string, or an object containing a selector field 6 | const fieldSelectorString = await field.selector 7 | ? field.selector 8 | : field 9 | 10 | const isFieldPresent = await selector.$(fieldSelectorString) 11 | 12 | if (!isFieldPresent) { return scrapedObject } 13 | 14 | if (field.isMultipleFields) { 15 | if (field.attribute === 'href') { 16 | scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.href ? 
elem.href.trim() : elem.innerHTML.trim()))
    } else if(field.attribute === 'src'){
      scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.src ? elem.src.trim() : elem.innerHTML.trim()))
    }else{
      scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.innerText.trim()))
    }
  } else if (field.hasChildrenFields) {
    // Nested template: scrape each child element with its own fields.
    const fieldChildrenSelectors = await selector.$$(field.selector)

    scrapedObject[fieldKey] = await Promise.all(
      fieldChildrenSelectors.map((s) => scrapSelector(s, field))
    )
  } else if (field.attribute && field.attribute === 'href') {
    scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.href ? elem.href.trim() : '')
  } else if (field.attribute && field.attribute === 'src') {
    scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.src ? elem.src.trim() : '')
  } else {
    scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.innerText ? elem.innerText.trim() : '')
  }

  return scrapedObject
}
// Folds every field of a section template into one scraped object, chaining
// through a promise accumulator so fields are scraped sequentially.
const scrapSelector = (selector, section) =>
  Object.keys(section.fields)
    .reduce(scrapSelectorFields(selector, section), Promise.resolve({}))

// Scrapes all DOM nodes matching section.selector; resolves to one object per node.
module.exports = async (page, section) => {
  const sectionSelectors = await page.$$(section.selector)

  const scrapedPromises = sectionSelectors
    .map((selector) => scrapSelector(selector, section))

  return Promise.all(scrapedPromises)
}

// --- src/scrapedin.js ---
const puppeteer = require('puppeteer')
const login = require('./login')
const profile = require('./profile/profile')
const company = require('./company/company')
const logger = require('./logger')(__filename)

/**
 * Entry point: launches (or connects to) a browser, optionally logs in, and
 * returns a scraper function bound to that browser session.
 */
module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
  if (!hasToLog) {
    logger.stopLogging()
  }
  logger.info('initializing')

  let browser;
  if(endpoint){
    // Reuse an already-running browser over its websocket endpoint.
    browser = await puppeteer.connect({
      browserWSEndpoint: endpoint,
    });
  }else{
    const args = Object.assign({ headless: isHeadless, args: ['--no-sandbox'] }, puppeteerArgs)
    browser = await puppeteer.launch(args)
  }

  if (cookies) {
    logger.info('using cookies, login will be bypassed')
  } else if (email && password) {
    logger.info('email and password was provided, we\'re going to login...')

    try {
      await login(browser, email, password, logger)
    } catch (e) {
      // Only close a browser we launched ourselves; an externally provided
      // endpoint's browser is not ours to close.
      if(!endpoint){
        await browser.close()
      }
      throw e
    }
  } else {
    logger.warn('email/password and cookies wasn\'t provided, only public data will be collected')
  }

  return (url,
waitMs) => url.includes('/school/') || url.includes('/company/') ? company(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate) :profile(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate)
  // School/company URLs are routed to the company scraper; everything else
  // is treated as a profile URL.
}

// --- src/scrapedin.test.js ---
const faker = require('faker')
const { expect } = require('chai')
const profile = require('./profile/profile')
const logger = require('./logger')(__filename)
const { mock, match } = require('sinon')
const profileScraperTemplate = require('./profile/profileScraperTemplate')
const url = faker.internet.url()
const fakeEvalResult = faker.lorem.words(1)

// Make the linter happy.
var mocha = require('mocha')
var it = mocha.it

logger.stopLogging()

// NOTE(review): the expected keys below (aboutAlternative, profileLegacy, …)
// don't match the rawProfile keys built in profile.js — presumably
// cleanProfileData produces them; verify against that module.
it('should get complete profile', async () => {
  const browserMock = prepareBrowserMock()
  const result = await profile(browserMock, [], url, 0)
  const expectedResult = {
    aboutAlternative: {
      text: fakeEvalResult
    },
    aboutLegacy: {
      text: fakeEvalResult
    },
    accomplishments: [
      {
        count: fakeEvalResult,
        items: [fakeEvalResult],
        title: fakeEvalResult
      }
    ],
    contact: {},
    courses: [
      {
        name: fakeEvalResult,
        year: fakeEvalResult
      }
    ],
    educations: [
      {
        date1: fakeEvalResult,
        date2: fakeEvalResult,
        degree: fakeEvalResult,
        fieldOfStudy: fakeEvalResult,
        url: fakeEvalResult,
        title: fakeEvalResult
      }
    ],
    languages: [
      {
        name: fakeEvalResult,
        proficiency: fakeEvalResult
      }
    ],
    peopleAlsoViewed: [
      {
        text: fakeEvalResult,
        user: fakeEvalResult
      }
    ],
    positions: [
      {
        companyName: fakeEvalResult,
        date1: fakeEvalResult,
        date2:
fakeEvalResult,
        description: fakeEvalResult,
        link: fakeEvalResult,
        location: fakeEvalResult,
        roles: [
          {
            date1: fakeEvalResult,
            date2: fakeEvalResult,
            description: fakeEvalResult,
            location: fakeEvalResult,
            title: fakeEvalResult
          }
        ],
        title: fakeEvalResult,
        url: fakeEvalResult
      }
    ],
    profile: {
      connections: fakeEvalResult,
      headline: fakeEvalResult,
      location: fakeEvalResult,
      name: fakeEvalResult,
      summary: fakeEvalResult
    },
    profileAlternative: {
      connections: fakeEvalResult,
      headline: fakeEvalResult,
      imageurl: fakeEvalResult,
      location: fakeEvalResult,
      name: fakeEvalResult
    },
    profileLegacy: {
      connections: fakeEvalResult,
      headline: fakeEvalResult,
      location: fakeEvalResult,
      name: fakeEvalResult,
      summary: fakeEvalResult
    },
    projects: [
      {
        date: fakeEvalResult,
        description: fakeEvalResult,
        link: fakeEvalResult,
        name: fakeEvalResult
      }
    ],
    recommendations: {
      given: [
        {
          text: fakeEvalResult,
          user: fakeEvalResult
        }
      ],
      givenCount: '',
      received: [
        {
          text: fakeEvalResult,
          user: fakeEvalResult
        }
      ],
      receivedCount: ''
    },
    skills: [
      {
        count: fakeEvalResult,
        title: fakeEvalResult
      }
    ],
    volunteerExperience: [
      {
        date1: fakeEvalResult,
        date2: fakeEvalResult,
        description: fakeEvalResult,
        experience: fakeEvalResult,
        location: fakeEvalResult,
        title: fakeEvalResult
      }
    ]
  }

  expect(result).to.deep.equals(expectedResult)
})

// Same flow with the "incomplete" mock: $eval yields '' and the
// positions.title lookup returns undefined, so most fields come back empty.
it('should get an incomplete profile', async () => {
  const browser = prepareBrowserMock(true)

  const result = await profile(browser, [], url, 0)
  const expectedResult = {
    aboutAlternative: {
      text: ''
    },
    aboutLegacy: {
      text: ''
    },
    accomplishments: [
      {
        count: '',
        items: [fakeEvalResult],
        title: ''
      }
    ],
    contact: {},
    courses: [{}],
    educations: [
      {
        date1: '',
        date2: '',
        degree: '',
        fieldOfStudy: '',
        url: ''
      }
    ],
    languages: [
      {
        name: undefined,
        proficiency: ''
      }
    ],
    peopleAlsoViewed: [
      {
        text: '',
        user: ''
      }
    ],
    positions: [
      {
        companyName: '',
        date1: '',
        date2: '',
        description: '',
        link: '',
        location: '',
        roles: [
          {
            date1: '',
            date2: '',
            description: '',
            location: '',
            title: ''
          }
        ],
        url: ''
      }
    ],
    profile: {
      connections: '',
      headline: '',
      location: '',
      name: ''
    },
    profileAlternative: {
      connections: '',
      headline: '',
      imageurl: '',
      location: '',
      name: ''
    },
    profileLegacy: {
      connections: '',
      headline: '',
      location: '',
      name: ''
    },
    projects: [
      {
        date: '',
        description: undefined,
        link: '',
        name: undefined
      }
    ],
    recommendations: {
      given: [
        {
          text: '',
          user: ''
        }
      ],
      givenCount: '',
      received: [
        {
          text: '',
          user: ''
        }
      ],
      receivedCount: ''
    },
    skills: [
      {
        count: '',
        title: ''
      }
    ],
    volunteerExperience: [
      {
        date1: '',
        date2: '',
        description: '',
        experience: '',
        location: ''
      }
    ]
  }

  expect(result).to.deep.equals(expectedResult)
})

// Builds a sinon-mocked puppeteer Browser whose pages answer every call the
// scraper makes. Page.$/$$ return fresh Page instances so nested selector
// lookups keep working.
const prepareBrowserMock = (isIncompleteProfile) => {
  const Page = function () {
    this.goto = mock().once().withExactArgs(url).resolves()
    this.setUserAgent = mock().once().resolves()
    this.setExtraHTTPHeaders = mock().once().resolves()
    this.setViewport = mock().once().resolves()
    this.waitFor = mock().once().resolves()

    this.evaluate = mock()
      .twice()
      .withExactArgs(match.func)
      .atLeast(1)
      .resolves()
    // First waitForSelector call rejects (footer not yet visible), second
    // resolves — this exercises the scrollToPageBottom retry loop.
    this.waitForSelector = mock()
      .withExactArgs(match.string, match.object)
      .twice()
      .onCall(0)
      .rejects()
      .onCall(1)
      .resolves(true)

    this.setCookie = mock().once().withExactArgs().resolves()

    this.click = mock().atLeast(1).withExactArgs().resolves()
    this.$$eval = mock()
      .withExactArgs(match.string, match.func)
      .atLeast(1)
      .callsArgWith(1, [{ innerText: fakeEvalResult }])
      .resolves([fakeEvalResult])

    this.$eval = mock()
      .withExactArgs(match.string, match.func)
      .atLeast(1)
      .callsArgWith(
        1,
        isIncompleteProfile
          ? undefined
          : {
              innerText: fakeEvalResult,
              src: fakeEvalResult,
              href: fakeEvalResult
            }
      )
      .resolves(isIncompleteProfile ? '' : fakeEvalResult)

    this.close = mock().once().resolves()
  }

  Page.prototype.$ = () => new Page()

  if (isIncompleteProfile) {
    // I couldn't do that with sinon :(
    Page.prototype.$ = (arg) =>
      arg === profileScraperTemplate.positions.fields.title
        ? undefined
        : Promise.resolve(new Page())
  }

  Page.prototype.$$ = () => [new Page()]

  const browser = {
    newPage: mock().once().withExactArgs().resolves(new Page())
  }

  return browser
}