├── screenshots
├── screenshot 1.png
├── screenshot 2.png
└── screenshot 3.png
├── assets
├── resizing_script.js
├── styles.css
└── s.css
├── LICENSE
├── README.md
├── Profile Scraper.ipynb
└── final_app.py
/screenshots/screenshot 1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chawla201/LinkedIn-Connections-Analyzer/HEAD/screenshots/screenshot 1.png
--------------------------------------------------------------------------------
/screenshots/screenshot 2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chawla201/LinkedIn-Connections-Analyzer/HEAD/screenshots/screenshot 2.png
--------------------------------------------------------------------------------
/screenshots/screenshot 3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chawla201/LinkedIn-Connections-Analyzer/HEAD/screenshots/screenshot 3.png
--------------------------------------------------------------------------------
/assets/resizing_script.js:
--------------------------------------------------------------------------------
1 | if (!window.dash_clientside) {
2 | window.dash_clientside = {};
3 | }
4 | window.dash_clientside.clientside = {
5 | resize: function(value) {
6 | console.log("resizing..."); // for testing
7 | setTimeout(function() {
8 | window.dispatchEvent(new Event("resize"));
9 | console.log("fired resize");
10 | }, 500);
11 | return null;
12 | }
13 | };
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Parth Chawla
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/assets/styles.css:
--------------------------------------------------------------------------------
1 | .js-plotly-plot .plotly .modebar {
2 | padding-top: 5%;
3 | margin-right: 3.5%;
4 | }
5 |
6 | body {
7 | background-color: #f2f2f2;
8 | margin: 5%;
9 | }
10 |
11 | .two.columns {
12 | width: 16.25%;
13 | }
14 |
15 | .column,
16 | .columns {
17 | margin-left: 0.5%;
18 | }
19 |
20 | .pretty_container {
21 | border-radius: 5px;
22 | background-color: #f9f9f9;
23 | margin: 10px;
24 | padding: 15px;
25 | position: relative;
26 | box-shadow: 2px 2px 2px lightgrey;
27 | }
28 |
29 | .bare_container {
30 | margin: 0 0 0 0;
31 | padding: 0 0 0 0;
32 | }
33 |
34 | .dcc_control {
35 | margin: 0;
36 | padding: 5px;
37 | width: calc(100% - 40px); /* calc() requires spaces around "-"; without them the rule is invalid and dropped */
38 | }
39 |
40 | .control_label {
41 | margin: 0;
42 | padding: 10px;
43 | padding-bottom: 0px;
44 | margin-bottom: 0px;
45 | width: calc(100% - 40px); /* calc() requires spaces around "-"; without them the rule is invalid and dropped */
46 | }
47 |
48 | .rc-slider {
49 | margin-left: 0px;
50 | padding-left: 0px;
51 | }
52 |
53 | .flex-display {
54 | display: flex;
55 | }
56 |
57 | .container-display {
58 | display: flex;
59 | }
60 |
61 | #individual_graph,
62 | #aggregate_graph {
63 | width: calc(100% - 30px);
64 | position: absolute;
65 | }
66 |
67 | #count_graph {
68 | position: absolute;
69 | height: calc(100% - 30px);
70 | width: calc(100% - 30px);
71 | }
72 |
73 | #countGraphContainer {
74 | flex: 5;
75 | position: relative;
76 | }
77 |
78 | #header {
79 | align-items: center;
80 | }
81 |
82 | #learn-more-button {
83 | text-align: center;
84 | height: 100%;
85 | padding: 0 20px;
86 | text-transform: none;
87 | font-size: 15px;
88 | float: right;
89 | margin-right: 10px;
90 | margin-top: 30px;
91 | }
92 | #title {
93 | text-align: center;
94 | }
95 |
96 | .mini_container {
97 | border-radius: 5px;
98 | background-color: white;
99 | margin: 10px;
100 | padding: 15px;
101 | position: relative;
102 | box-shadow: 2px 2px 2px lightgrey;
103 | }
104 |
105 | .contain {
106 | border-radius: 5px;
107 | background-color: white;
108 | padding: 15px;
109 | box-shadow: 2px 2px 2px lightgrey;
110 | }
111 |
112 | #right-column {
113 | display: flex;
114 | flex-direction: column;
115 | }
116 |
117 | #wells {
118 | flex: 1;
119 | }
120 |
121 | #gas {
122 | flex: 1;
123 | }
124 |
125 | #aggregate_data {
126 | align-items: center;
127 | }
128 |
129 | #oil {
130 | flex: 1;
131 | }
132 |
133 | #water {
134 | flex: 1;
135 | }
136 |
137 | #tripleContainer {
138 | display: flex;
139 | flex: 3;
140 | }
141 |
142 | #mainContainer {
143 | display: flex;
144 | flex-direction: column;
145 | }
146 |
147 | #pie_graph > div > div > svg:nth-child(3) > g.infolayer > g.legend {
148 | pointer-events: all;
149 | transform: translate(30px, 349px);
150 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LinkedIn Connections Analyzer
2 | ### Web Scraping | Data Analysis | Web Development
3 | Designed a web scraping script in Python using Selenium and Beautiful Soup libraries to extract
4 | information of all the LinkedIn connections of the user, transformed the collected data and
5 | performed basic data analysis on the synthesized data. Then developed a web application dashboard
6 | using dash framework to present the findings of the analysis.
7 | As can be observed above, the project is divided into 3 parts:
8 |
9 | ## tl;dr
10 | - Designed a web scraping script in Python to scrape LinkedIn
11 | connections
12 | - Cleaned the data and performed exploratory data analysis
13 | - Presented the findings as an interactive web application dashboard
14 | built using Dash framework
15 |
16 | ## Technologies Used:
17 |
18 | * Python
19 | * Pandas
20 | * Numpy
21 | * Selenium
22 | * Beautiful Soup
23 | * Matplotlib
24 | * Seaborn
25 | * Plotly
26 | * Dash Framework
27 |
28 | ## Web Scraping
29 | Used the Selenium and Beautiful Soup libraries to perform web scraping to extract information from LinkedIn users' profiles. Used 3 methods: login, connections_scraper and profile_scraper. These were divided into 3 dataframes: connections_data, education and experience.
30 |
31 | connections_data: Extracted Name, Title, Location, Profile, Number of connections, Number of Projects, Number of Languages known and Top Skills for the connections_data.
32 |
33 | education: Extracted Institute, Degree and Year range for education.
34 |
35 | experience: Extracted Profile, Position, Company, Duration for the experience dataframe.
36 |
37 | ## Data Pre-processing/ Transformation
38 | The collected data was in a raw form and had to be cleaned and transformed for it to be analysed and gained insights from. There are 3 dataframes namely: connections_data, experience and education.
39 |
40 | For the connections_data dataframe, cleaned the Location column to just display the City name without the words like 'Area', divided Number of Connections into 6 categories of range such as 0-100, 100-200,... to 500+, Number of Languages, Number of Projects and created a dictionary for the Top 3 featured Skills of each of the connections and then finally counting the number of people for each skill.
41 |
42 | For the education dataframe, on the basis of the institute and degree name classified the field of study into 3 categories (for the time being, for simplicity): Science, Management and Arts, found out the status of education on the basis of the year range provided on the profile for a particular education level. Also found out the highest level of education for the connections based on the words 'Bachelor's', 'Master's' etc given in the education field on the profile.
43 |
44 | For the experience dataframe, divided the position column into 3 categories: full time, interns, student representatives or volunteers, made 6 categories under the duration column starting with <6 months to 20+ years.
45 |
46 | ## Visualization of the transformed data on Dash Framework using Plotly Express
47 | Dash is the most downloaded, trusted framework for building ML & data science web apps. Full stack apps that would typically require a front-end, backend, and dev ops team can now be built and deployed in hours by data scientists with Dash. With Dash Open Source, Dash apps run on your local laptop or workstation, but cannot be easily accessed by others in your organization. To read more and understand Dash, visit https://plotly.com/dash/
48 |
49 | Plotly's Python graphing library makes interactive, publication-quality graphs. The plotly.express module (usually imported as px) contains functions that can create entire figures at once, and is referred to as Plotly Express or PX. Plotly Express is a built-in part of the plotly library, and is the recommended starting point for creating most common figures. To know more about plotly, visit https://plotly.com/python/
50 |
51 | Since this is the first time we have used Dash, the dashboard looks fairly simple (consisting of interactive bar charts and pie charts with tiles and tree maps), yet very informative. We plan to incorporate more changes with respect to intricacies in the level or field of study/work later.
52 |
53 | Note: It's important to have the assets folder in the same folder you implement your application in, since it's necessary for styling purposes.
54 |
55 |
56 | ## Screenshots:
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/assets/s.css:
--------------------------------------------------------------------------------
1 | /* Table of contents
2 | ––––––––––––––––––––––––––––––––––––––––––––––––––
3 | - Plotly.js
4 | - Grid
5 | - Base Styles
6 | - Typography
7 | - Links
8 | - Buttons
9 | - Forms
10 | - Lists
11 | - Code
12 | - Tables
13 | - Spacing
14 | - Utilities
15 | - Clearing
16 | - Media Queries
17 | */
18 |
19 | /* Plotly.js
20 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
21 | /* plotly.js's modebar's z-index is 1001 by default
22 | * https://github.com/plotly/plotly.js/blob/7e4d8ab164258f6bd48be56589dacd9bdd7fded2/src/css/_modebar.scss#L5
23 | * In case a dropdown is above the graph, the dropdown's options
24 | * will be rendered below the modebar
25 | * Increase the select option's z-index
26 | */
27 |
28 | /* This was actually not quite right -
29 | dropdowns were overlapping each other (edited October 26)
30 |
31 | .Select {
32 | z-index: 1002;
33 | }*/
34 |
35 |
36 | /* Grid
37 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
38 | .container {
39 | position: relative;
40 | width: 100%;
41 | max-width: 960px;
42 | margin: 0 auto;
43 | padding: 0 20px;
44 | box-sizing: border-box; }
45 | .column,
46 | .columns {
47 | width: 100%;
48 | float: left;
49 | box-sizing: border-box; }
50 |
51 | /* For devices larger than 400px */
52 | @media (min-width: 400px) {
53 | .container {
54 | width: 85%;
55 | padding: 0; }
56 | }
57 |
58 | /* For devices larger than 550px */
59 | @media (min-width: 550px) {
60 | .container {
61 | width: 80%; }
62 | .column,
63 | .columns {
64 | margin-left: 4%; }
65 | .column:first-child,
66 | .columns:first-child {
67 | margin-left: 0; }
68 |
69 | .one.column,
70 | .one.columns { width: 4.66666666667%; }
71 | .two.columns { width: 13.3333333333%; }
72 | .three.columns { width: 22%; }
73 | .four.columns { width: 30.6666666667%; }
74 | .five.columns { width: 39.3333333333%; }
75 | .six.columns { width: 48%; }
76 | .seven.columns { width: 56.6666666667%; }
77 | .eight.columns { width: 65.3333333333%; }
78 | .nine.columns { width: 74.0%; }
79 | .ten.columns { width: 82.6666666667%; }
80 | .eleven.columns { width: 91.3333333333%; }
81 | .twelve.columns { width: 100%; margin-left: 0; }
82 |
83 | .one-third.column { width: 30.6666666667%; }
84 | .two-thirds.column { width: 65.3333333333%; }
85 |
86 | .one-half.column { width: 48%; }
87 |
88 | /* Offsets */
89 | .offset-by-one.column,
90 | .offset-by-one.columns { margin-left: 8.66666666667%; }
91 | .offset-by-two.column,
92 | .offset-by-two.columns { margin-left: 17.3333333333%; }
93 | .offset-by-three.column,
94 | .offset-by-three.columns { margin-left: 26%; }
95 | .offset-by-four.column,
96 | .offset-by-four.columns { margin-left: 34.6666666667%; }
97 | .offset-by-five.column,
98 | .offset-by-five.columns { margin-left: 43.3333333333%; }
99 | .offset-by-six.column,
100 | .offset-by-six.columns { margin-left: 52%; }
101 | .offset-by-seven.column,
102 | .offset-by-seven.columns { margin-left: 60.6666666667%; }
103 | .offset-by-eight.column,
104 | .offset-by-eight.columns { margin-left: 69.3333333333%; }
105 | .offset-by-nine.column,
106 | .offset-by-nine.columns { margin-left: 78.0%; }
107 | .offset-by-ten.column,
108 | .offset-by-ten.columns { margin-left: 86.6666666667%; }
109 | .offset-by-eleven.column,
110 | .offset-by-eleven.columns { margin-left: 95.3333333333%; }
111 |
112 | .offset-by-one-third.column,
113 | .offset-by-one-third.columns { margin-left: 34.6666666667%; }
114 | .offset-by-two-thirds.column,
115 | .offset-by-two-thirds.columns { margin-left: 69.3333333333%; }
116 |
117 | .offset-by-one-half.column,
118 | .offset-by-one-half.columns { margin-left: 52%; }
119 |
120 | }
121 |
122 |
123 | /* Base Styles
124 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
125 | /* NOTE
126 | html is set to 62.5% so that all the REM measurements throughout Skeleton
127 | are based on 10px sizing. So basically 1.5rem = 15px :) */
128 | html {
129 | font-size: 62.5%; }
130 | body {
131 | font-size: 1.5em; /* currently ems cause chrome bug misinterpreting rems on body element */
132 | line-height: 1.6;
133 | font-weight: 400;
134 | font-family: "Open Sans", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif;
135 | color: rgb(50, 50, 50); }
136 |
137 |
138 | /* Typography
139 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
140 | h1, h2, h3, h4, h5, h6 {
141 | margin-top: 0;
142 | margin-bottom: 0;
143 | font-weight: 300; }
144 | h1 { font-size: 4.5rem; line-height: 1.2; letter-spacing: -.1rem; margin-bottom: 2rem; }
145 | h2 { font-size: 3.6rem; line-height: 1.25; letter-spacing: -.1rem; margin-bottom: 1.8rem; margin-top: 1.8rem;}
146 | h3 { font-size: 3.0rem; line-height: 1.3; letter-spacing: -.1rem; margin-bottom: 1.5rem; margin-top: 1.5rem;}
147 | h4 { font-size: 2.6rem; line-height: 1.35; letter-spacing: -.08rem; margin-bottom: 1.2rem; margin-top: 1.2rem;}
148 | h5 { font-size: 2.2rem; line-height: 1.5; letter-spacing: -.05rem; margin-bottom: 0.6rem; margin-top: 0.6rem;}
149 | h6 { font-size: 2.0rem; line-height: 1.6; letter-spacing: 0; margin-bottom: 0.75rem; margin-top: 0.75rem;}
150 |
151 | p {
152 | margin-top: 0; }
153 |
154 |
155 | /* Blockquotes
156 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
157 | blockquote {
158 | border-left: 4px lightgrey solid;
159 | padding-left: 1rem;
160 | margin-top: 2rem;
161 | margin-bottom: 2rem;
162 | margin-left: 0rem;
163 | }
164 |
165 |
166 | /* Links
167 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
168 | a {
169 | color: #1EAEDB;
170 | text-decoration: underline;
171 | cursor: pointer;}
172 | a:hover {
173 | color: #0FA0CE; }
174 |
175 |
176 | /* Buttons
177 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
178 | .button,
179 | button,
180 | input[type="submit"],
181 | input[type="reset"],
182 | input[type="button"] {
183 | display: inline-block;
184 | height: 38px;
185 | padding: 0 30px;
186 | color: #555;
187 | text-align: center;
188 | font-size: 11px;
189 | font-weight: 600;
190 | line-height: 38px;
191 | letter-spacing: .1rem;
192 | text-transform: uppercase;
193 | text-decoration: none;
194 | white-space: nowrap;
195 | background-color: transparent;
196 | border-radius: 4px;
197 | border: 1px solid #bbb;
198 | cursor: pointer;
199 | box-sizing: border-box; }
200 | .button:hover,
201 | button:hover,
202 | input[type="submit"]:hover,
203 | input[type="reset"]:hover,
204 | input[type="button"]:hover,
205 | .button:focus,
206 | button:focus,
207 | input[type="submit"]:focus,
208 | input[type="reset"]:focus,
209 | input[type="button"]:focus {
210 | color: #333;
211 | border-color: #888;
212 | outline: 0; }
213 | .button.button-primary,
214 | button.button-primary,
215 | input[type="submit"].button-primary,
216 | input[type="reset"].button-primary,
217 | input[type="button"].button-primary {
218 | color: #FFF;
219 | background-color: #33C3F0;
220 | border-color: #33C3F0; }
221 | .button.button-primary:hover,
222 | button.button-primary:hover,
223 | input[type="submit"].button-primary:hover,
224 | input[type="reset"].button-primary:hover,
225 | input[type="button"].button-primary:hover,
226 | .button.button-primary:focus,
227 | button.button-primary:focus,
228 | input[type="submit"].button-primary:focus,
229 | input[type="reset"].button-primary:focus,
230 | input[type="button"].button-primary:focus {
231 | color: #FFF;
232 | background-color: #1EAEDB;
233 | border-color: #1EAEDB; }
234 |
235 |
236 | /* Forms
237 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
238 | input[type="email"],
239 | input[type="number"],
240 | input[type="search"],
241 | input[type="text"],
242 | input[type="tel"],
243 | input[type="url"],
244 | input[type="password"],
245 | textarea,
246 | select {
247 | height: 38px;
248 | padding: 6px 10px; /* The 6px vertically centers text on FF, ignored by Webkit */
249 | background-color: #fff;
250 | border: 1px solid #D1D1D1;
251 | border-radius: 4px;
252 | box-shadow: none;
253 | box-sizing: border-box;
254 | font-family: inherit;
255 | font-size: inherit; /*https://stackoverflow.com/questions/6080413/why-doesnt-input-inherit-the-font-from-body*/}
256 | /* Removes awkward default styles on some inputs for iOS */
257 | input[type="email"],
258 | input[type="number"],
259 | input[type="search"],
260 | input[type="text"],
261 | input[type="tel"],
262 | input[type="url"],
263 | input[type="password"],
264 | textarea {
265 | -webkit-appearance: none;
266 | -moz-appearance: none;
267 | appearance: none; }
268 | textarea {
269 | min-height: 65px;
270 | padding-top: 6px;
271 | padding-bottom: 6px; }
272 | input[type="email"]:focus,
273 | input[type="number"]:focus,
274 | input[type="search"]:focus,
275 | input[type="text"]:focus,
276 | input[type="tel"]:focus,
277 | input[type="url"]:focus,
278 | input[type="password"]:focus,
279 | textarea:focus,
280 | select:focus {
281 | border: 1px solid #33C3F0;
282 | outline: 0; }
283 | label,
284 | legend {
285 | display: block;
286 | margin-bottom: 0px; }
287 | fieldset {
288 | padding: 0;
289 | border-width: 0; }
290 | input[type="checkbox"],
291 | input[type="radio"] {
292 | display: inline; }
293 | label > .label-body {
294 | display: inline-block;
295 | margin-left: .5rem;
296 | font-weight: normal; }
297 |
298 |
299 | /* Lists
300 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
301 | ul {
302 | list-style: circle inside; }
303 | ol {
304 | list-style: decimal inside; }
305 | ol, ul {
306 | padding-left: 0;
307 | margin-top: 0; }
308 | ul ul,
309 | ul ol,
310 | ol ol,
311 | ol ul {
312 | margin: 1.5rem 0 1.5rem 3rem;
313 | font-size: 90%; }
314 | li {
315 | margin-bottom: 1rem; }
316 |
317 |
318 | /* Tables
319 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
320 | table {
321 | border-collapse: collapse;
322 | }
323 | th:not(.CalendarDay),
324 | td:not(.CalendarDay) {
325 | padding: 12px 15px;
326 | text-align: left;
327 | border-bottom: 1px solid #E1E1E1; }
328 | th:first-child:not(.CalendarDay),
329 | td:first-child:not(.CalendarDay) {
330 | padding-left: 0; }
331 | th:last-child:not(.CalendarDay),
332 | td:last-child:not(.CalendarDay) {
333 | padding-right: 0; }
334 |
335 |
336 | /* Spacing
337 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
338 | button,
339 | .button {
340 | margin-bottom: 0rem; }
341 | input,
342 | textarea,
343 | select,
344 | fieldset {
345 | margin-bottom: 0rem; }
346 | pre,
347 | dl,
348 | figure,
349 | table,
350 | form {
351 | margin-bottom: 0rem; }
352 | p,
353 | ul,
354 | ol {
355 | margin-bottom: 0.75rem; }
356 |
357 | /* Utilities
358 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
359 | .u-full-width {
360 | width: 100%;
361 | box-sizing: border-box; }
362 | .u-max-full-width {
363 | max-width: 100%;
364 | box-sizing: border-box; }
365 | .u-pull-right {
366 | float: right; }
367 | .u-pull-left {
368 | float: left; }
369 |
370 |
371 | /* Misc
372 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
373 | hr {
374 | margin-top: 3rem;
375 | margin-bottom: 3.5rem;
376 | border-width: 0;
377 | border-top: 1px solid #E1E1E1; }
378 |
379 |
380 | /* Clearing
381 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
382 |
383 | /* Self Clearing Goodness */
384 | .container:after,
385 | .row:after,
386 | .u-cf {
387 | content: "";
388 | display: table;
389 | clear: both; }
390 |
391 |
392 | /* Media Queries
393 | –––––––––––––––––––––––––––––––––––––––––––––––––– */
394 | /*
395 | Note: The best way to structure the use of media queries is to create the queries
396 | near the relevant code. For example, if you wanted to change the styles for buttons
397 | on small devices, paste the mobile query code up in the buttons section and style it
398 | there.
399 | */
400 |
401 |
402 | /* Larger than mobile */
403 | @media (min-width: 400px) {}
404 |
405 | /* Larger than phablet (also point when grid becomes active) */
406 | @media (min-width: 550px) {}
407 |
408 | /* Larger than tablet */
409 | @media (min-width: 750px) {}
410 |
411 | /* Larger than desktop */
412 | @media (min-width: 1000px) {}
413 |
414 | /* Larger than Desktop HD */
415 | @media (min-width: 1200px) {}
--------------------------------------------------------------------------------
/Profile Scraper.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from selenium import webdriver\n",
10 | "from selenium.webdriver.common.keys import Keys\n",
11 | "from selenium.webdriver.common.action_chains import ActionChains\n",
12 | "import pandas as pd\n",
13 | "import time\n",
14 | "from bs4 import BeautifulSoup"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "def login(browser):\n",
24 | " username = input('Enter Username: ')\n",
25 | " password = input('Enter Password: ')\n",
26 | " browser.get('https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')\n",
27 | " time.sleep(3)\n",
28 | " browser.find_element_by_name('session_key').send_keys(username + Keys.RETURN)\n",
29 | " browser.find_element_by_name('session_password').send_keys(password + Keys.RETURN)\n",
30 | " time.sleep(3)"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "def connections_scraper(browser):\n",
40 | " connections_page = \"https://www.linkedin.com/search/results/people/?facetNetwork=%5B%22F%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH\"\n",
41 | " browser.get(connections_page)\n",
42 | " soup = BeautifulSoup(browser.page_source, 'html.parser')\n",
43 | " conn_num = soup.find_all('h3', class_='search-results__total')\n",
44 | " num = int(conn_num[0].text.strip().split()[0])\n",
45 | " time.sleep(3)\n",
46 | " i = 2\n",
47 | " x = 1\n",
48 | " names = []\n",
49 | " titles = []\n",
50 | " locations = []\n",
51 | " profiles = []\n",
52 | " print('\\nScraping your connections...\\n')\n",
53 | " while True:\n",
54 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
55 | " time.sleep(.75)\n",
56 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
57 | " time.sleep(.75)\n",
58 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
59 | " time.sleep(.75)\n",
60 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
61 | " time.sleep(.75)\n",
62 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
63 | " soup = BeautifulSoup(browser.page_source, 'html.parser')\n",
64 | " name_tag = soup.find_all('span', class_='name actor-name')\n",
65 | " title_tag = soup.find_all('p', class_='subline-level-1')\n",
66 | " location_tag = soup.find_all('p', class_= 'subline-level-2')\n",
67 | " profile_tag = soup.find_all('a', class_= 'search-result__result-link')\n",
68 | " names += list(map(lambda x: x.text, name_tag))\n",
69 | " titles += list(map(lambda x: x.text.replace('\\n','').strip(), title_tag))\n",
70 | " locations += list(map(lambda x: x.text.replace('\\n','').strip(), location_tag))\n",
71 | " profiles += list(map(lambda x: 'https://linkedin.com' + x['href'], profile_tag))[::2]\n",
72 | " if len(names)>=num:\n",
73 | " break\n",
74 | " y = x\n",
75 | " x = len(names)\n",
76 | " if x==y:\n",
77 | " break\n",
78 | " browser.get('https://www.linkedin.com/search/results/people/?facetNetwork=%5B%22F%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH&page='+str(i))\n",
79 | " i+=1\n",
80 | " time.sleep(3)\n",
81 | " df = pd.DataFrame({'Name':names, 'Title':titles, 'Location':locations, 'Profile':profiles})\n",
82 | " return df"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "browser = webdriver.Chrome(executable_path=\"E:/RnD/linkedin_scraper/chromedriver\", options= webdriver.ChromeOptions())\n",
92 | "login(browser)\n",
93 | "connections = connections_scraper(browser)\n",
94 | "browser.quit()\n",
95 | "connections.to_csv('connections.csv')"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "def profile_scraper(df, browser):\n",
105 | " num_projects = []\n",
106 | " num_languages = []\n",
107 | " top_skills = []\n",
108 | " num_connections = []\n",
109 | " positions = []\n",
110 | " company = []\n",
111 | " duration = []\n",
112 | " institutes = []\n",
113 | " courses = []\n",
114 | " year_range = []\n",
115 | " ex_profiles = []\n",
116 | " ed_profiles = []\n",
117 | " for profile in df['Profile']:\n",
118 | " try:\n",
119 | " browser.get(profile)\n",
120 | " time.sleep(2)\n",
121 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
122 | " time.sleep(.75)\n",
123 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
124 | " time.sleep(.75)\n",
125 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
126 | " time.sleep(.75)\n",
127 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
128 | " time.sleep(.75)\n",
129 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
130 | " time.sleep(.75)\n",
131 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
132 | " time.sleep(.75)\n",
133 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
134 | " time.sleep(.75)\n",
135 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n",
136 | " time.sleep(.75)\n",
137 | " soup = BeautifulSoup(browser.page_source, 'lxml')\n",
138 | "\n",
139 | " conn_tag = soup.find_all('span', class_='t-16 t-bold')\n",
140 | " if conn_tag[0].text.strip().split()[0].isdigit() or conn_tag[0].text.strip().split()[0] == '500+':\n",
141 | " num_connections.append(conn_tag[0].text.strip().split()[0])\n",
142 | " elif len(soup.find_all('span', class_='t-16 t-black t-normal'))>0:\n",
143 | " num_connections.append(soup.find_all('span', class_='t-16 t-black t-normal')[0].text.strip().split()[0])\n",
144 | " else:\n",
145 | " num_connections.append(None)\n",
146 | "\n",
147 | " accom_tag = soup.find_all('h3', class_='pv-accomplishments-block__count t-32 t-black t-normal pr3')\n",
148 | " np = 0\n",
149 | " nl = 2\n",
150 | " for at in accom_tag:\n",
151 | " if at.text.strip().split('\\n')[0].split()[-1] == 'projects' or at.text.strip().split('\\n')[0].split()[-1] == 'project':\n",
152 | " np = int(at.text.strip().split('\\n')[1])\n",
153 | " if at.text.strip().split('\\n')[0].split()[-1] == 'languages' or at.text.strip().split('\\n')[0].split()[-1] == 'language':\n",
154 | " nl = int(at.text.strip().split('\\n')[1])\n",
155 | " num_projects.append(np)\n",
156 | " num_languages.append(nl)\n",
157 | "\n",
158 | " skills_tag = soup.find_all('span', class_='pv-skill-category-entity__name-text')\n",
159 | " ts = []\n",
160 | " for st in skills_tag:\n",
161 | " ts.append(st.text.strip())\n",
162 | " top_skills.append(ts)\n",
163 | "\n",
164 | " position_tag = soup.find_all('h3', class_='t-16 t-black t-bold')\n",
165 | " ex_pos = list(map(lambda x: x.text.strip(), position_tag))\n",
166 | " company_tag = soup.find_all('p', class_='pv-entity__secondary-title t-14 t-black t-normal')\n",
167 | " ex_comp = list(map(lambda x: x.text.strip().split('\\n')[0], company_tag))\n",
168 | " ex_duration_tag = soup.find_all('span', class_='pv-entity__bullet-item-v2')\n",
169 | " durr = []\n",
170 | " for dur in ex_duration_tag:\n",
171 | " d_list = dur.text.strip().split()\n",
172 | " if d_list[0].isdigit():\n",
173 | " if len(d_list)==2:\n",
174 | " if d_list[1] == 'mo' or d_list[1]=='mos':\n",
175 | " durr.append(int(d_list[0]))\n",
176 | " if d_list[1] == 'yr' or d_list[1]=='yrs':\n",
177 | " durr.append(int(d_list[0])*12)\n",
178 | " if len(d_list)==4:\n",
179 | " durr.append((int(d_list[0])*12)+int(d_list[2]))\n",
180 | " else:\n",
181 | " durr.append(None)\n",
182 | " x = min(len(ex_comp), len(ex_pos), len(durr))\n",
183 | " ex_comp = ex_comp[:x]\n",
184 | " ex_pos = ex_pos[:x]\n",
185 | " durr = durr[:x]\n",
186 | " ex_profiles += [profile]*x\n",
187 | " positions += ex_pos\n",
188 | " company += ex_comp\n",
189 | " duration += durr\n",
190 | "\n",
191 | " institute_tag = soup.find_all('h3', class_='pv-entity__school-name t-16 t-black t-bold')\n",
192 | " inst = list(map(lambda x: x.text.strip(), institute_tag))\n",
193 | " course_tag = soup.find_all('p', class_='pv-entity__secondary-title pv-entity__degree-name t-14 t-black t-normal')\n",
194 | " course_t = list(map(lambda x: x.text.strip().split('\\n')[1], course_tag))\n",
195 | " ed_date_tag = soup.find_all('p', class_='pv-entity__dates t-14 t-black--light t-normal')\n",
196 | " ed_dates = list(map(lambda x: x.text.strip().split('\\n')[-1], ed_date_tag))\n",
197 | " y = min(len(inst), len(course_t), len(ed_dates))\n",
198 | " inst = inst[:y]\n",
199 | " course_t = course_t[:y]\n",
200 | " ed_dates = ed_dates[:y]\n",
201 | " ed_profiles += [profile]*y\n",
202 | " institutes += inst\n",
203 | " courses += course_t\n",
204 | " year_range += ed_dates\n",
205 | " except:\n",
206 | " continue\n",
207 | " df['Number of connections'] = num_connections\n",
208 | " df['Number of Projects'] = num_projects\n",
209 | " df['Number of Languages known'] = num_languages\n",
210 | " df['Top Skills'] = top_skills\n",
211 | " exp_df = pd.DataFrame({'Profile':ex_profiles, 'Position':positions, 'Company':company, 'Duration':duration})\n",
212 | " ed_df = pd.DataFrame({'Profile':ed_profiles, 'Institute':institutes, 'Degree':courses, 'Year range':year_range})\n",
213 | " \n",
214 | " return df, exp_df, ed_df"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {
221 | "scrolled": false
222 | },
223 | "outputs": [],
224 | "source": [
225 | "browser = webdriver.Chrome(executable_path=\"E:/RnD/linkedin_scraper/chromedriver\", options= webdriver.ChromeOptions())\n",
226 | "login(browser)\n",
227 | "data = pd.read_csv('connections.csv')\n",
228 | "df, exp, ed = profile_scraper(data, browser)\n",
229 | "browser.quit()\n",
230 | "df.to_csv('connections_data.csv', index=False)\n",
231 | "exp.to_csv('experience.csv', index=False)\n",
232 | "ed.to_csv('education.csv', index=False)"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "browser = webdriver.Chrome(executable_path=\"E:/RnD/linkedin_scraper/chromedriver\", options= webdriver.ChromeOptions())\n",
242 | "login(browser)\n",
243 | "connections = connections_scraper(browser)\n",
244 | "conn, exp, ed = profile_scraper(connections, browser)\n",
245 | "browser.quit()"
246 | ]
247 | }
248 | ],
249 | "metadata": {
250 | "kernelspec": {
251 | "display_name": "Python 3",
252 | "language": "python",
253 | "name": "python3"
254 | },
255 | "language_info": {
256 | "codemirror_mode": {
257 | "name": "ipython",
258 | "version": 3
259 | },
260 | "file_extension": ".py",
261 | "mimetype": "text/x-python",
262 | "name": "python",
263 | "nbconvert_exporter": "python",
264 | "pygments_lexer": "ipython3",
265 | "version": "3.7.7"
266 | }
267 | },
268 | "nbformat": 4,
269 | "nbformat_minor": 2
270 | }
271 |
--------------------------------------------------------------------------------
/final_app.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.keys import Keys
3 | from selenium.webdriver.common.action_chains import ActionChains
4 | import pandas as pd
5 | import time
6 | from bs4 import BeautifulSoup
7 | import datetime
8 | import plotly.express as px
9 | import dash
10 | import dash_core_components as dcc
11 | import dash_html_components as html
12 | import plotly.graph_objs as go
13 | import webbrowser
14 |
def login(browser):
    """Prompt for LinkedIn credentials and sign the Selenium session in.

    Parameters
    ----------
    browser : selenium WebDriver
        An open driver; it is navigated to the LinkedIn login page and the
        form is submitted.  Fixed sleeps are used instead of explicit waits.
    """
    import getpass  # stdlib, used only here

    username = input('Enter Username: ')
    # Read the password without echoing it to the terminal (the original
    # used input(), which displays the password in clear text).
    password = getpass.getpass('Enter Password: ')
    browser.get('https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')
    time.sleep(3)
    browser.find_element_by_name('session_key').send_keys(username + Keys.RETURN)
    browser.find_element_by_name('session_password').send_keys(password + Keys.RETURN)
    time.sleep(3)
23 |
def connections_scraper(browser):
    """Scrape name/title/location/profile-URL for every 1st-degree connection.

    Walks the logged-in ``browser`` through the paginated "connections"
    search results, scrolling each page so lazily-loaded result cards
    render, and accumulates one entry per connection.

    Returns a DataFrame with columns Name, Title, Location, Profile.

    NOTE(review): relies on LinkedIn's CSS class names as they were when
    this was written; these break whenever LinkedIn redesigns the page —
    verify the selectors before reuse.
    """
    connections_page = "https://www.linkedin.com/search/results/people/?facetNetwork=%5B%22F%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH"
    browser.get(connections_page)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    # Total connection count, e.g. "123 results ..." -> 123; used as the
    # stop target for the loop below.
    conn_num = soup.find_all('h3', class_='search-results__total')
    num = int(conn_num[0].text.strip().split()[0])
    time.sleep(3)
    i = 2   # next results page to request (page 1 is already loaded)
    x = 1   # names collected after the previous iteration (progress check)
    names = []
    titles = []
    locations = []
    profiles = []
    print('Scraping your connections...\n')
    while True:
        # Page down several times so the infinite-scroll cards finish loading.
        browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
        time.sleep(.75)
        browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
        time.sleep(.75)
        browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
        time.sleep(.75)
        browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
        time.sleep(.75)
        browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        name_tag = soup.find_all('span', class_='name actor-name')
        title_tag = soup.find_all('p', class_='subline-level-1')
        location_tag = soup.find_all('p', class_= 'subline-level-2')
        profile_tag = soup.find_all('a', class_= 'search-result__result-link')
        names += list(map(lambda x: x.text, name_tag))
        titles += list(map(lambda x: x.text.replace('\n','').strip(), title_tag))
        locations += list(map(lambda x: x.text.replace('\n','').strip(), location_tag))
        # Each result card appears to carry two identical anchor tags, so
        # every second link is kept — TODO confirm against the live markup.
        profiles += list(map(lambda x: 'https://linkedin.com' + x['href'], profile_tag))[::2]
        if len(names)>=num:
            break
        # Stop if a whole iteration produced no new names (stuck page).
        y = x
        x = len(names)
        if x==y:
            break
        browser.get('https://www.linkedin.com/search/results/people/?facetNetwork=%5B%22F%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH&page='+str(i))
        i+=1
        time.sleep(3)
    df = pd.DataFrame({'Name':names, 'Title':titles, 'Location':locations, 'Profile':profiles})
    return df
68 |
def profile_scraper(df, browser):
    """Visit each URL in ``df['Profile']`` and scrape per-profile details.

    Adds four columns to ``df`` in place (Number of connections, Number of
    Projects, Number of Languages known, Top Skills) and returns three
    frames: ``(df, experience_df, education_df)``.

    NOTE(review): the bare ``except: continue`` skips a profile on ANY
    error WITHOUT appending to the per-profile lists, so a single failure
    leaves ``num_connections`` etc. shorter than ``df`` and the column
    assignments at the end will raise — confirm all profiles scrape
    cleanly before relying on partial runs.
    """
    num_projects = []
    num_languages = []
    top_skills = []
    num_connections = []
    positions = []       # one entry per experience row, across all profiles
    company = []
    duration = []        # job duration, converted to months
    institutes = []
    courses = []
    year_range = []
    ex_profiles = []     # profile URL repeated once per experience row
    ed_profiles = []     # profile URL repeated once per education row
    # NOTE(review): "form" is a typo for "from" (runtime string, left as-is).
    print('Extracting information form individual profiles. Please wait...\n')
    for profile in df['Profile']:
        try:
            browser.get(profile)
            time.sleep(2)
            # Scroll down repeatedly so lazily-loaded sections render.
            browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            time.sleep(.75)
            browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            time.sleep(.75)
            browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            time.sleep(.75)
            browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            time.sleep(.75)
            browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            time.sleep(.75)
            browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            time.sleep(.75)
            browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            time.sleep(.75)
            browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            time.sleep(.75)
            soup = BeautifulSoup(browser.page_source, 'lxml')

            # Connection count: first bold span when it looks numeric or
            # '500+'; otherwise an alternative class; else None.
            conn_tag = soup.find_all('span', class_='t-16 t-bold')
            if conn_tag[0].text.strip().split()[0].isdigit() or conn_tag[0].text.strip().split()[0] == '500+':
                num_connections.append(conn_tag[0].text.strip().split()[0])
            elif len(soup.find_all('span', class_='t-16 t-black t-normal'))>0:
                num_connections.append(soup.find_all('span', class_='t-16 t-black t-normal')[0].text.strip().split()[0])
            else:
                num_connections.append(None)

            # Accomplishment counters: projects and languages.  nl defaults
            # to 2 when no language block is present — TODO confirm intent.
            accom_tag = soup.find_all('h3', class_='pv-accomplishments-block__count t-32 t-black t-normal pr3')
            np = 0
            nl = 2
            for at in accom_tag:
                if at.text.strip().split('\n')[0].split()[-1] == 'projects' or at.text.strip().split('\n')[0].split()[-1] == 'project':
                    np = int(at.text.strip().split('\n')[1])
                if at.text.strip().split('\n')[0].split()[-1] == 'languages' or at.text.strip().split('\n')[0].split()[-1] == 'language':
                    nl = int(at.text.strip().split('\n')[1])
            num_projects.append(np)
            num_languages.append(nl)

            # Skill names listed in the skills section.
            skills_tag = soup.find_all('span', class_='pv-skill-category-entity__name-text')
            ts = []
            for st in skills_tag:
                ts.append(st.text.strip())
            top_skills.append(ts)

            # Experience section: position, company, and duration converted
            # to months ("N mos" -> N, "N yrs" -> N*12, "N yrs M mos" -> N*12+M).
            position_tag = soup.find_all('h3', class_='t-16 t-black t-bold')
            ex_pos = list(map(lambda x: x.text.strip(), position_tag))
            company_tag = soup.find_all('p', class_='pv-entity__secondary-title t-14 t-black t-normal')
            ex_comp = list(map(lambda x: x.text.strip().split('\n')[0], company_tag))
            ex_duration_tag = soup.find_all('span', class_='pv-entity__bullet-item-v2')
            durr = []
            for dur in ex_duration_tag:
                d_list = dur.text.strip().split()
                if d_list[0].isdigit():
                    if len(d_list)==2:
                        if d_list[1] == 'mo' or d_list[1]=='mos':
                            durr.append(int(d_list[0]))
                        if d_list[1] == 'yr' or d_list[1]=='yrs':
                            durr.append(int(d_list[0])*12)
                    if len(d_list)==4:
                        durr.append((int(d_list[0])*12)+int(d_list[2]))
                else:
                    durr.append(None)
            # Truncate to the shortest list so the three columns line up.
            x = min(len(ex_comp), len(ex_pos), len(durr))
            ex_comp = ex_comp[:x]
            ex_pos = ex_pos[:x]
            durr = durr[:x]
            ex_profiles += [profile]*x
            positions += ex_pos
            company += ex_comp
            duration += durr

            # Education section: institute, degree, year range — again
            # truncated to equal lengths before accumulating.
            institute_tag = soup.find_all('h3', class_='pv-entity__school-name t-16 t-black t-bold')
            inst = list(map(lambda x: x.text.strip(), institute_tag))
            course_tag = soup.find_all('p', class_='pv-entity__secondary-title pv-entity__degree-name t-14 t-black t-normal')
            course_t = list(map(lambda x: x.text.strip().split('\n')[1], course_tag))
            ed_date_tag = soup.find_all('p', class_='pv-entity__dates t-14 t-black--light t-normal')
            ed_dates = list(map(lambda x: x.text.strip().split('\n')[-1], ed_date_tag))
            y = min(len(inst), len(course_t), len(ed_dates))
            inst = inst[:y]
            course_t = course_t[:y]
            ed_dates = ed_dates[:y]
            ed_profiles += [profile]*y
            institutes += inst
            courses += course_t
            year_range += ed_dates
        except:
            continue
    df['Number of connections'] = num_connections
    df['Number of Projects'] = num_projects
    df['Number of Languages known'] = num_languages
    df['Top Skills'] = top_skills
    exp_df = pd.DataFrame({'Profile':ex_profiles, 'Position':positions, 'Company':company, 'Duration':duration})
    ed_df = pd.DataFrame({'Profile':ed_profiles, 'Institute':institutes, 'Degree':courses, 'Year range':year_range})

    return df, exp_df, ed_df
181 |
# Load cached scrape results if present; otherwise log in, scrape the
# connection list and every profile, cache the three CSVs, and re-read
# them so both paths yield CSV-normalised dtypes.
try:
    conn = pd.read_csv('connections_data.csv')
    exp = pd.read_csv('experience.csv')
    ed = pd.read_csv('education.csv')
except:
    # NOTE(review): bare except also hides CSV parse errors, not just
    # missing files — any read failure triggers a full re-scrape.
    # NOTE(review): chromedriver path is machine-specific; confirm before running.
    browser = webdriver.Chrome(executable_path="E:/RnD/linkedin_scraper/chromedriver", options= webdriver.ChromeOptions())
    login(browser)
    print('\nPlease Wait. This may take a while...\n')
    time.sleep(3)
    connections = connections_scraper(browser)
    time.sleep(3)
    conn_data, exp_data, ed_data = profile_scraper(connections, browser)
    conn_data.to_csv('connections_data.csv', index=False)
    exp_data.to_csv('experience.csv', index=False)
    ed_data.to_csv('education.csv', index=False)
    browser.quit()
    conn = pd.read_csv('connections_data.csv')
    exp = pd.read_csv('experience.csv')
    ed = pd.read_csv('education.csv')
201 |
# Generic "Student" rows say nothing about work experience — drop them.
exp = exp[exp['Position']!='Student']
#conn = conn.drop_duplicates().reset_index(drop=True)
#ed = ed.drop_duplicates().reset_index(drop=True)
#exp = exp.drop_duplicates().reset_index(drop=True)


# Split "YYYY - YYYY" ranges into start_year / passing_year columns.
# Single-year ranges fall back to start + 4 (assumes a 4-year course —
# TODO confirm this default).
ed['start_year'] = list(map(lambda x: int(x.split()[0]), ed['Year range']))
end = []
for ran in ed['Year range']:
    try:
        end.append(int(ran.split()[2]))
    except:
        end.append(int(ran.split()[0])+4)
ed['passing_year'] = end
# Build one row per profile describing the highest (first-listed) education.
# LinkedIn lists the most recent education first, so row 0 is taken as the
# "highest" one — same assumption as the original code.
profiles = ed['Profile'].unique()
current_year = datetime.datetime.now().year
hins = []    # institute of highest education
hdeg = []    # degree name
hstart = []  # start year
hpass = []   # (expected) passing year
status = []  # 'Completed' vs 'Ongoing'
level = []   # Bachelor's / Master's / Phd / Diploma / Other
field = []   # Science / MNGMT / Arts
for prof in profiles:
    # Slice once per profile instead of re-filtering for every column.
    first = ed[ed['Profile'] == prof].iloc[0]
    hins.append(first['Institute'])
    hdeg.append(first['Degree'])
    hstart.append(first['start_year'])
    hpass.append(first['passing_year'])
    status.append('Completed' if first['passing_year'] <= current_year else 'Ongoing')
    deg = first['Degree'].lower()
    ins = first['Institute'].lower()
    # Classify the education level from keywords in the degree name.
    if 'bachelor' in deg or deg[0] == 'b':
        level.append('Bachelor\'s')
    elif ('master' in deg or 'post graduate' in deg
            or deg[0] == 'm' or 'pgdm' in deg):
        level.append('Master\'s')
    # BUG FIX: the original searched the lower-cased degree for the
    # capitalised literal 'Doctor of Philosophy', which could never match.
    elif 'phd' in deg or 'doctor of philosophy' in deg:
        level.append('Phd')
    elif 'nanodegree' in deg or 'certificate program' in deg:
        level.append('Diploma')
    else:
        level.append('Other')
    # Classify the broad field of study from degree and institute names.
    if ('engineer' in deg or 'technology' in deg or 'science' in deg
            or 'computer' in deg or 'technology' in ins or 'science' in ins
            or 'medical' in ins or 'engineering' in ins):
        field.append('Science')
    elif ('management' in deg or 'mba' in deg or 'business' in deg
            or 'finance' in deg or 'accountancy' in deg or 'business' in ins):
        field.append('MNGMT')
    else:
        field.append('Arts')
# NOTE: the original dict literal listed 'Level' twice; the duplicate is removed.
highest_ed = pd.DataFrame({'Profile': profiles,
                           'Institute': hins,
                           'Degree': hdeg,
                           'Field of study': field,
                           'Level': level,
                           'start_year': hstart,
                           'passing_year': hpass,
                           'Status': status})
278 |
# Bucket each connection's raw "Number of connections" value into a display
# range, and collect the non-empty "Top Skills" strings for later parsing.
conn_cat = []
skills = []
for ri, row in conn.iterrows():
    if row['Top Skills'] != '[]':
        skills.append(row['Top Skills'])
    n = row['Number of connections']
    if n == '500+':
        conn_cat.append('500+')
    elif int(n) > 400 and int(n) <= 500:
        conn_cat.append('400-500')
    elif int(n) > 300 and int(n) <= 400:
        conn_cat.append('300-400')
    elif int(n) > 200 and int(n) <= 300:
        conn_cat.append('200-300')
    elif int(n) > 100 and int(n) <= 200:
        conn_cat.append('100-200')
    elif int(n) <= 100:
        conn_cat.append('0-100')
    else:
        # BUG FIX: numeric counts above 500 matched no branch in the
        # original, leaving conn_cat shorter than conn and making the
        # column assignment below raise.
        conn_cat.append('500+')
conn["Number of connections"] = conn_cat
297 |
# Extract up to the first three quoted skill names from each "Top Skills"
# string (presumably the Python list repr that survived the CSV round-trip,
# so skill names sit at odd indices after splitting on single quotes —
# verify against the cached CSV format).
sk = []
i = 0
# NOTE(review): ``i`` above is never used.
for skill in skills:
    try:
        x=skill.split('\'')
        sk.append(x[1])
        try:
            sk.append(x[3])
        except:
            continue
        try:
            sk.append(x[5])
        except:
            continue
    except:
        # No quotes at all: fall back to extending character-by-character
        # (this iterates the string itself — kept as-is from the original).
        sk += skill
# Tally how often each skill appears across connections.
skills_dict = {}
for skill in sk:
    skills_dict[skill] = skills_dict.get(skill, 0) + 1
# Merge entries like "Python (Programming Language)" into the bare first
# word; the try/except preserves the original best-effort behavior when
# the bare name is absent (KeyError -> leave the long key untouched).
lang = [k for k in skills_dict if k.lower().find('programming language') != -1]
for l in lang:
    try:
        skills_dict[l.split()[0]] += skills_dict[l]
        del skills_dict[l]
    except:
        continue
# Order by count, descending.  sorted() is stable, so ties keep insertion
# order — identical output to the original O(n^2) selection loop, but in
# O(n log n).
skills_dict = dict(sorted(skills_dict.items(), key=lambda kv: kv[1], reverse=True))
337 |
# Normalise raw location strings: keep the city part (before the first
# comma) and strip a trailing "Area" suffix.
def _city(raw):
    head = raw.split(',')[0]
    parts = head.split()
    if parts[-1] == 'Area':
        return " ".join(parts[:-1])
    return head

locs = [_city(loc) for loc in conn['Location'].values]
# Unique cities, first-seen order preserved.
locations = list(dict.fromkeys(locs))
# Map every connection back to the first known city contained in its
# location string (same matching order as the de-duplicated list above).
lc = []
for loc in conn['Location'].values:
    head = loc.split(',')[0]
    for city in locations:
        if head.find(city) != -1:
            lc.append(city)
            break
conn['Location'] = lc
357 |
def _bucket_months(months):
    """Map a duration in months to its display bucket (same cut-offs as before)."""
    months = int(months)
    if months < 6:
        return '< 6 Months'
    if months < 12:
        return '6 Months to 1 Year'
    if months <= 60:
        return '1-5 Years'
    if months <= 120:
        return '6-10 Years'
    if months <= 240:
        return '11-20 Years'
    return '20+ Years'

# Label every experience row with a duration bucket; unparsable values
# (e.g. NaN) default to the smallest bucket, as in the original.
dur = []
for _, row in exp.iterrows():
    try:
        dur.append(_bucket_months(row["Duration"]))
    except:
        dur.append('< 6 Months')
exp["Experience"] = dur
376 |
# Keyword lists used to classify a position title (checked in this order;
# intern wins over volunteer, anything else is full-time).
INTERN_KEYWORDS = ('intern', 'internship', 'trainee')
SRV_KEYWORDS = ('campus', 'student', 'teaching assistant', 'ambassador',
                'college', 'member', 'core committee member', 'volunteer',
                'hustler', 'scholar', 'contributor', 'fest', 'event',
                'representative')

cat = []
for pos in list(exp['Position']):
    title = pos.lower()
    if any(k in title for k in INTERN_KEYWORDS):
        cat.append('Intern')
    elif any(k in title for k in SRV_KEYWORDS):
        cat.append('Student Representative/ Volunteer')
    else:
        cat.append('Full Time')
exp["Category"] = cat
401 |
def _value_counts_df(series, name):
    """Two-column frame <name>/Count from a Series' value counts (desc order).

    Uses rename_axis/reset_index(name=...), which yields the same columns
    on both pre- and post-2.0 pandas — the original
    ``reset_index().rename(columns={'index': ...})`` idiom breaks on
    pandas >= 2.0, where reset_index already names the columns.
    """
    return series.value_counts().rename_axis(name).reset_index(name='Count')

category_count = _value_counts_df(exp['Category'], 'Category')
intern_company_count = _value_counts_df(exp[exp['Category']=='Intern']['Company'], 'Company')
ft_company_count = _value_counts_df(exp[exp['Category']=='Full Time']['Company'], 'Company')
srv_company_count = _value_counts_df(exp[exp['Category']=='Student Representative/ Volunteer']['Company'], 'Company')

skills_count = pd.DataFrame({'Skill': list(skills_dict.keys()),
                             'Count': list(skills_dict.values())})

location_count = _value_counts_df(conn['Location'], 'Location')
exp_dur_count = _value_counts_df(exp['Experience'], 'Duration')
conn_num_count = _value_counts_df(conn['Number of connections'], 'Number of connections')
lang_count = _value_counts_df(conn['Number of Languages known'], 'Number of Languages known')

# Tag each per-company frame with its category and the category's total.
# value_counts sorts descending, so the iloc positions assume
# Full Time >= Intern >= Student Representative counts (as the original did).
intern_company_count['Category'] = 'Intern'
intern_company_count['Category count'] = category_count['Count'].iloc[1]
ft_company_count['Category'] = 'Full Time'
ft_company_count['Category count'] = category_count['Count'].iloc[0]
srv_company_count['Category'] = 'Student Representative/ Volunteer'
srv_company_count['Category count'] = category_count['Count'].iloc[2]

status_count = _value_counts_df(highest_ed['Status'], 'category name')
level_count = _value_counts_df(highest_ed['Level'], 'category name')
fos_count = _value_counts_df(highest_ed['Field of study'], 'category name')

def _add_pct(frame):
    """Add a 'Count %' column: each count as a '%.2f' percentage string."""
    total = frame['Count'].sum()
    frame['Count %'] = ['%.2f' % (c * 100 / total) for c in frame['Count']]

for _frame in (category_count, intern_company_count, ft_company_count,
               srv_company_count, skills_count, location_count,
               exp_dur_count, conn_num_count, lang_count, status_count,
               level_count, fos_count):
    _add_pct(_frame)

# Keep the top 10% of companies in each category for the treemap.
tt_int = int(len(intern_company_count)*0.1)
tt_ft = int(len(ft_company_count)*0.1)
tt_srv = int(len(srv_company_count)*0.1)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way.
company_count = pd.concat([intern_company_count[:tt_int],
                           ft_company_count[:tt_ft],
                           srv_company_count[:tt_srv]])
445 |
app = dash.Dash(__name__)

# Static dashboard: a header plus four rows of pre-computed Plotly figures.
# There are no callbacks — every figure is built once from the frames above.
app.layout = html.Div([
    # Header: title + introductory paragraph.
    html.Div([
        html.Div([
            html.Div([
                html.H1(children='LinkedIn Connections Analyzer with Visualization',style={'textAlign': 'center'}),
                html.Div([
                    html.H6(children=
                        'LinkedIn has always been one of the most useful social media networks for us. The reason is simple: it is business-oriented and (almost) clutter-free.The best thing is that people are connected for a reason there: when sending a connection request, you can/need to specify how you are connected to a person. This way your network contains a wealth of business-oriented information: you can find out how you are related to any company or person you need to get in touch with. Alternatively, you can be introduced to anyone outside your immediate network using your first-level connections.Tapping into your Linkedin connections can be a great way to discover new career opportunities for personal and professional growth.',
                        style={'textAlign': 'center', 'margin-bottom':'25px'})
                ])
            ])
        ], className='title twelve columns')
    ], id='header', className='title', style={'margin-bottom':'25px', 'margin-top':'0px'}),
    # Row 1: location treemap + top-skills bar chart.
    html.Div([
        html.Div([
            dcc.Graph(id='location_graph',
                figure=px.treemap(location_count,path=['Location'], values='Count %', title='Demographic Distribution of Connections', color='Count', color_continuous_scale=['#AED6F1', '#3498DB']).update_layout(title_x=0.5, coloraxis_showscale=False)
            )], className='contain six columns'
        ),
        html.Div([
            dcc.Graph(id='skills_graph',
                figure=px.bar(skills_count[:10][::-1], x= 'Count %', y='Skill', title='Most Popular Skills amongst your Connections', orientation='h', color='Count', color_continuous_scale=['#AED6F1', '#3498DB']).update_layout(title_x=0.5, coloraxis_showscale=False)
            )], className='contain six columns'
        )
    ], className='row', id='row1', style={'margin-bottom':'10px'}),
    # Row 2: network-strength bar, languages pie, and three summary tiles.
    html.Div([
        html.Div([
            dcc.Graph(id='conn_graph',
                figure=px.bar(conn_num_count[::-1], y='Number of connections', x = 'Count %', range_x=[0, 100], title='Network Strength of your Connections', orientation='h',color='Count', color_continuous_scale=['#AED6F1', '#3498DB']).update_layout(title_x=0.5, coloraxis_showscale=False)
            )], className='contain five columns'
        ),
        html.Div([
            dcc.Graph(id='lang_graph',
                figure=px.pie(lang_count, names='Number of Languages known', values='Count %', title='Number of Languages known',color ='Number of Languages known', color_discrete_sequence=px.colors.sequential.Blues[-9:-3][::-1]).update_layout(title_x=0.5)
            )], className='contain five columns'
        ),
        html.Div([
            html.Div([
                html.H6('%.2f'%exp[exp['Category']=='Intern']['Duration'].mean()),
                html.P('Average Duration of Internships (in Months)')
            ], className='mini_container', style={'margin-bottom':'20px', 'margin-top':'20px'}),
            html.Div([
                html.H6('%.2f'%((len(exp[exp['Category']=='Intern'])/len(exp))*100)+'%'),
                html.P('Connections who have worked as interns')
            ], className='mini_container', style={'margin-bottom':'20px'}),
            html.Div([
                html.H6('%.2f'%((len(exp[exp['Category']=='Full Time'])/len(exp))*100)+'%'),
                html.P('Connections who have worked full time')
            ], className='mini_container')
        ], className='two columns')
    ], className='row', id='row2', style={'margin-bottom':'10px'}),
    # Row 3: per-category company treemap + job-duration bar chart.
    html.Div([
        html.Div([
            dcc.Graph(id='cat_comp_tree',
                figure=px.treemap(company_count,path=['Category', 'Company'], values='Count %', title='Popular Companies in Each Category', color='Category', color_discrete_map={'Full Time':'#3498DB', 'Intern':'#85C1E9', 'Student Representative/ Volunteer':'#AED6F1'}).update_layout(title_x=0.5))
        ], className='contain six columns'),
        html.Div([
            dcc.Graph(id='duration_plot',
                figure= px.bar(exp_dur_count[::-1], y='Duration', x='Count %', title='Duration of the Job', orientation='h',color='Count', color_continuous_scale=['#AED6F1', '#3498DB'], range_x=[0, 100]).update_layout(title_x=0.5,coloraxis_showscale=False)
            )], className='contain six columns')
    ], className='row', id='row3', style={'margin-bottom':'10px'}),
    # Row 4: three education pies (field of study, level, completion status).
    html.Div([
        html.Div([
            dcc.Graph(id='fos_plot',
                figure=px.pie(fos_count, names='category name', values='Count %', title='Field of Study',color ='category name',color_discrete_sequence=px.colors.sequential.Blues[-9:-3][::-1]).update_layout(title_x=0.5))
        ], className='contain four columns', style={'margin-right':'12px', 'margin-left':'10px'}),
        html.Div([
            dcc.Graph(id='level_plot',
                figure=px.pie(level_count, names='category name', values='Count %', title='Level of Highest Education',color='category name',color_discrete_sequence=px.colors.sequential.Blues[-9:-3][::-1]).update_layout(title_x=0.5))
        ], className='contain four columns', style={'margin-right':'12px'}),
        html.Div([
            dcc.Graph(id='status_plot',
                figure=px.pie(status_count, names='category name', values='Count %', title='Status of Education',color='category name',color_discrete_sequence=px.colors.sequential.Blues[-9:-3][::-1]).update_layout(title_x=0.5))
        ], className='contain four columns')
    ], className='row', id='row4', style={'margin-bottom':'10px'})
])
524 |
# Open the dashboard in a browser (the server below then blocks).  The
# original hard-coded a Windows Chrome path; webbrowser.get raises
# webbrowser.Error when that controller is unavailable, so fall back to
# the system default browser on other machines.
try:
    webbrowser.get('C:/Program Files (x86)/Google/Chrome/Application/chrome.exe %s').open('http://127.0.0.1:8050/')
except webbrowser.Error:
    webbrowser.open('http://127.0.0.1:8050/')

if __name__ == '__main__':
    app.run_server(debug=True)
529 |
530 |
--------------------------------------------------------------------------------