├── screenshots ├── screenshot 1.png ├── screenshot 2.png └── screenshot 3.png ├── assets ├── resizing_script.js ├── styles.css └── s.css ├── LICENSE ├── README.md ├── Profile Scraper.ipynb └── final_app.py /screenshots/screenshot 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chawla201/LinkedIn-Connections-Analyzer/HEAD/screenshots/screenshot 1.png -------------------------------------------------------------------------------- /screenshots/screenshot 2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chawla201/LinkedIn-Connections-Analyzer/HEAD/screenshots/screenshot 2.png -------------------------------------------------------------------------------- /screenshots/screenshot 3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chawla201/LinkedIn-Connections-Analyzer/HEAD/screenshots/screenshot 3.png -------------------------------------------------------------------------------- /assets/resizing_script.js: -------------------------------------------------------------------------------- 1 | if (!window.dash_clientside) { 2 | window.dash_clientside = {}; 3 | } 4 | window.dash_clientside.clientside = { 5 | resize: function(value) { 6 | console.log("resizing..."); // for testing 7 | setTimeout(function() { 8 | window.dispatchEvent(new Event("resize")); 9 | console.log("fired resize"); 10 | }, 500); 11 | return null; 12 | } 13 | }; -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Parth Chawla 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /assets/styles.css: -------------------------------------------------------------------------------- 1 | .js-plotly-plot .plotly .modebar { 2 | padding-top: 5%; 3 | margin-right: 3.5%; 4 | } 5 | 6 | body { 7 | background-color: #f2f2f2; 8 | margin: 5%; 9 | } 10 | 11 | .two.columns { 12 | width: 16.25%; 13 | } 14 | 15 | .column, 16 | .columns { 17 | margin-left: 0.5%; 18 | } 19 | 20 | .pretty_container { 21 | border-radius: 5px; 22 | background-color: #f9f9f9; 23 | margin: 10px; 24 | padding: 15px; 25 | position: relative; 26 | box-shadow: 2px 2px 2px lightgrey; 27 | } 28 | 29 | .bare_container { 30 | margin: 0 0 0 0; 31 | padding: 0 0 0 0; 32 | } 33 | 34 | .dcc_control { 35 | margin: 0; 36 | padding: 5px; 37 | width: calc(100%-40px); 38 | } 39 | 40 | .control_label { 41 | margin: 0; 42 | padding: 10px; 43 | padding-bottom: 0px; 44 | margin-bottom: 0px; 45 | width: calc(100%-40px); 46 | } 47 | 48 | .rc-slider { 49 | margin-left: 0px; 50 | padding-left: 0px; 51 | } 52 | 53 | .flex-display { 54 | display: flex; 55 | } 56 | 57 | .container-display { 58 | display: flex; 59 | } 60 | 61 | #individual_graph, 62 | #aggregate_graph { 63 | width: calc(100% - 30px); 64 | position: absolute; 65 | } 66 | 67 | #count_graph { 68 | position: absolute; 69 | height: calc(100% - 30px); 70 | width: calc(100% - 30px); 71 | } 72 | 73 | #countGraphContainer { 74 | flex: 5; 75 | position: relative; 76 | } 77 | 78 | #header { 79 | align-items: center; 80 | } 81 | 82 | #learn-more-button { 83 | text-align: center; 84 | height: 100%; 85 | padding: 0 20px; 86 | text-transform: none; 87 | font-size: 15px; 88 | float: right; 89 | margin-right: 10px; 90 | margin-top: 30px; 91 | } 92 | #title { 93 | text-align: center; 94 | } 95 | 96 | .mini_container { 97 | border-radius: 5px; 98 | background-color: white; 99 | margin: 10px; 100 | padding: 15px; 101 | position: relative; 102 | box-shadow: 2px 2px 2px lightgrey; 103 | } 104 | 105 | .contain { 106 | border-radius: 5px; 107 | background-color: white; 108 | padding: 15px; 109 | box-shadow: 2px 2px 2px lightgrey; 110 | } 111 | 112 | #right-column { 113 | display: flex; 114 | flex-direction: column; 115 | } 116 | 117 | #wells { 118 | flex: 1; 119 | } 120 | 121 | #gas { 122 | flex: 1; 123 | } 124 | 125 | #aggregate_data { 126 | align-items: center; 127 | } 128 | 129 | #oil { 130 | flex: 1; 131 | } 132 | 133 | #water { 134 | flex: 1; 135 | } 136 | 137 | #tripleContainer { 138 | display: flex; 139 | flex: 3; 140 | } 141 | 142 | #mainContainer { 143 | display: flex; 144 | flex-direction: column; 145 | } 146 | 147 | #pie_graph > div > div > svg:nth-child(3) > g.infolayer > g.legend { 148 | pointer-events: all; 149 | transform: translate(30px, 349px); 150 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LinkedIn Connections Analyzer 2 | ### Web Scraping | Data Analysis | Web Development 3 | Designed a web scraping script in Python using Selenium and Beautiful Soup libraries to extract 4 | information of all the LinkedIn connections of the user, transformed the collected data and 5 | performed basic data analysis on the synthesized data. Then developed a web application dashboard 6 | using dash framework to present the findings of the analysis. 7 | As can be observed above, the project is divided into 3 parts: 8 | 9 | ## tl;dr 10 | - Designed a web scraping script in Python to scrape LinkedIn 11 | connections 12 | - Cleaned the data and performed exploratory data analysis 13 | - Presented the findings as an interactive web application dashboard 14 | built using Dash framework 15 | 16 | ## Tecnologies Used: 17 | 18 | * Python 19 | * Pandas 20 | * Numpy 21 | * Selenium 22 | * Beautiful Soup 23 | * Matplotlib 24 | * Seaborn 25 | * Plotly 26 | * Dash Framework 27 | 28 | ## Web Scraping 29 | Used the Selenium and Beautiful Soup libraries to perform web scraping to extract information from LinkedIn users' profiles. Used 3 methods: login, connections_scraper and profile_scraper. These were divided into 3 dataframes: connections_data, education and experience. 30 | 31 | connections_data: Extracted Name, Title, Location, Profile, Number of connections, Number of Projects, Number of Languages known and Top Skills for the connections_data. 32 | 33 | education: Extracted Institute, Degree and Year range for education. 34 | 35 | experience: Extracted Profile, Position, Company, Duration for the experience dataframe. 36 | 37 | ## Data Pre-processing/ Transformation 38 | The collected data was in a raw form and had to be cleaned and transformed for it to be analysed and gained insights from. There are 3 dataframes namely: connections_data, experience and education. 39 | 40 | For the connections_data dataframe, cleaned the Location column to just display the City name without the words like 'Area', divided Number of Connections into 6 categories of range such as 0-100, 100-200,... to 500+, Number of Languages, Number of Projects and created a dictionary for the Top 3 featured Skills of each of the connections and then finally counting the number of people for each skill. 41 | 42 | For the education dataframe, on the basis of the institute and degree name classified the field of study into 3 categories (for the time being, for simplicity): Science, Management and Arts, found out the status of education on the basis of the year range provided on the profile for a particular education level. Also found out the the highest level of education for the connections based on the words 'Bachelor's', 'Master's' etc given in the education field on the profile. 43 | 44 | For the experience dataframe, divided the position column into 3 categories: full time, interns, student representatives or volunteers, made 6 categories under the duration column starting with <6 months to 20+ years. 45 | 46 | ## Visulization of the transformed data on Dash Framework using Plotly Express 47 | Dash is the most downloaded, trusted framework for building ML & data science web apps. Full stack apps that would typically require a front-end, backend, and dev ops team can now be built and deployed in hours by data scientists with Dash. With Dash Open Source, Dash apps run on your local laptop or workstation, but cannot be easily accessed by others in your organization. To read more and understand Dash, visit https://plotly.com/dash/ 48 | 49 | Plotly's Python graphing library makes interactive, publication-quality graphs. The plotly.express module (usually imported as px) contains functions that can create entire figures at once, and is referred to as Plotly Express or PX. Plotly Express is a built-in part of the plotly library, and is the recommended starting point for creating most common figures. To know more about plotly, visit https://plotly.com/python/ 50 | 51 | Since this is the first time we have used Dash, the dashboard looks fairly simple (consisting of interactive bar charts and pie charts with tiles and tree maps), yet very informative. We plan to incorporate more changes with respect to intricacies in the level or field of study/work later. 52 | 53 | Note: It's important to have the assets folder in the same folder you implement your application in, since it's necessary for the stlying purposes. 54 | 55 | 56 | ## Screenshots: 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /assets/s.css: -------------------------------------------------------------------------------- 1 | /* Table of contents 2 | –––––––––––––––––––––––––––––––––––––––––––––––––– 3 | - Plotly.js 4 | - Grid 5 | - Base Styles 6 | - Typography 7 | - Links 8 | - Buttons 9 | - Forms 10 | - Lists 11 | - Code 12 | - Tables 13 | - Spacing 14 | - Utilities 15 | - Clearing 16 | - Media Queries 17 | */ 18 | 19 | /* PLotly.js 20 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 21 | /* plotly.js's modebar's z-index is 1001 by default 22 | * https://github.com/plotly/plotly.js/blob/7e4d8ab164258f6bd48be56589dacd9bdd7fded2/src/css/_modebar.scss#L5 23 | * In case a dropdown is above the graph, the dropdown's options 24 | * will be rendered below the modebar 25 | * Increase the select option's z-index 26 | */ 27 | 28 | /* This was actually not quite right - 29 | dropdowns were overlapping each other (edited October 26) 30 | 31 | .Select { 32 | z-index: 1002; 33 | }*/ 34 | 35 | 36 | /* Grid 37 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 38 | .container { 39 | position: relative; 40 | width: 100%; 41 | max-width: 960px; 42 | margin: 0 auto; 43 | padding: 0 20px; 44 | box-sizing: border-box; } 45 | .column, 46 | .columns { 47 | width: 100%; 48 | float: left; 49 | box-sizing: border-box; } 50 | 51 | /* For devices larger than 400px */ 52 | @media (min-width: 400px) { 53 | .container { 54 | width: 85%; 55 | padding: 0; } 56 | } 57 | 58 | /* For devices larger than 550px */ 59 | @media (min-width: 550px) { 60 | .container { 61 | width: 80%; } 62 | .column, 63 | .columns { 64 | margin-left: 4%; } 65 | .column:first-child, 66 | .columns:first-child { 67 | margin-left: 0; } 68 | 69 | .one.column, 70 | .one.columns { width: 4.66666666667%; } 71 | .two.columns { width: 13.3333333333%; } 72 | .three.columns { width: 22%; } 73 | .four.columns { width: 30.6666666667%; } 74 | .five.columns { width: 39.3333333333%; } 75 | .six.columns { width: 48%; } 76 | .seven.columns { width: 56.6666666667%; } 77 | .eight.columns { width: 65.3333333333%; } 78 | .nine.columns { width: 74.0%; } 79 | .ten.columns { width: 82.6666666667%; } 80 | .eleven.columns { width: 91.3333333333%; } 81 | .twelve.columns { width: 100%; margin-left: 0; } 82 | 83 | .one-third.column { width: 30.6666666667%; } 84 | .two-thirds.column { width: 65.3333333333%; } 85 | 86 | .one-half.column { width: 48%; } 87 | 88 | /* Offsets */ 89 | .offset-by-one.column, 90 | .offset-by-one.columns { margin-left: 8.66666666667%; } 91 | .offset-by-two.column, 92 | .offset-by-two.columns { margin-left: 17.3333333333%; } 93 | .offset-by-three.column, 94 | .offset-by-three.columns { margin-left: 26%; } 95 | .offset-by-four.column, 96 | .offset-by-four.columns { margin-left: 34.6666666667%; } 97 | .offset-by-five.column, 98 | .offset-by-five.columns { margin-left: 43.3333333333%; } 99 | .offset-by-six.column, 100 | .offset-by-six.columns { margin-left: 52%; } 101 | .offset-by-seven.column, 102 | .offset-by-seven.columns { margin-left: 60.6666666667%; } 103 | .offset-by-eight.column, 104 | .offset-by-eight.columns { margin-left: 69.3333333333%; } 105 | .offset-by-nine.column, 106 | .offset-by-nine.columns { margin-left: 78.0%; } 107 | .offset-by-ten.column, 108 | .offset-by-ten.columns { margin-left: 86.6666666667%; } 109 | .offset-by-eleven.column, 110 | .offset-by-eleven.columns { margin-left: 95.3333333333%; } 111 | 112 | .offset-by-one-third.column, 113 | .offset-by-one-third.columns { margin-left: 34.6666666667%; } 114 | .offset-by-two-thirds.column, 115 | .offset-by-two-thirds.columns { margin-left: 69.3333333333%; } 116 | 117 | .offset-by-one-half.column, 118 | .offset-by-one-half.columns { margin-left: 52%; } 119 | 120 | } 121 | 122 | 123 | /* Base Styles 124 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 125 | /* NOTE 126 | html is set to 62.5% so that all the REM measurements throughout Skeleton 127 | are based on 10px sizing. So basically 1.5rem = 15px :) */ 128 | html { 129 | font-size: 62.5%; } 130 | body { 131 | font-size: 1.5em; /* currently ems cause chrome bug misinterpreting rems on body element */ 132 | line-height: 1.6; 133 | font-weight: 400; 134 | font-family: "Open Sans", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 135 | color: rgb(50, 50, 50); } 136 | 137 | 138 | /* Typography 139 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 140 | h1, h2, h3, h4, h5, h6 { 141 | margin-top: 0; 142 | margin-bottom: 0; 143 | font-weight: 300; } 144 | h1 { font-size: 4.5rem; line-height: 1.2; letter-spacing: -.1rem; margin-bottom: 2rem; } 145 | h2 { font-size: 3.6rem; line-height: 1.25; letter-spacing: -.1rem; margin-bottom: 1.8rem; margin-top: 1.8rem;} 146 | h3 { font-size: 3.0rem; line-height: 1.3; letter-spacing: -.1rem; margin-bottom: 1.5rem; margin-top: 1.5rem;} 147 | h4 { font-size: 2.6rem; line-height: 1.35; letter-spacing: -.08rem; margin-bottom: 1.2rem; margin-top: 1.2rem;} 148 | h5 { font-size: 2.2rem; line-height: 1.5; letter-spacing: -.05rem; margin-bottom: 0.6rem; margin-top: 0.6rem;} 149 | h6 { font-size: 2.0rem; line-height: 1.6; letter-spacing: 0; margin-bottom: 0.75rem; margin-top: 0.75rem;} 150 | 151 | p { 152 | margin-top: 0; } 153 | 154 | 155 | /* Blockquotes 156 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 157 | blockquote { 158 | border-left: 4px lightgrey solid; 159 | padding-left: 1rem; 160 | margin-top: 2rem; 161 | margin-bottom: 2rem; 162 | margin-left: 0rem; 163 | } 164 | 165 | 166 | /* Links 167 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 168 | a { 169 | color: #1EAEDB; 170 | text-decoration: underline; 171 | cursor: pointer;} 172 | a:hover { 173 | color: #0FA0CE; } 174 | 175 | 176 | /* Buttons 177 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 178 | .button, 179 | button, 180 | input[type="submit"], 181 | input[type="reset"], 182 | input[type="button"] { 183 | display: inline-block; 184 | height: 38px; 185 | padding: 0 30px; 186 | color: #555; 187 | text-align: center; 188 | font-size: 11px; 189 | font-weight: 600; 190 | line-height: 38px; 191 | letter-spacing: .1rem; 192 | text-transform: uppercase; 193 | text-decoration: none; 194 | white-space: nowrap; 195 | background-color: transparent; 196 | border-radius: 4px; 197 | border: 1px solid #bbb; 198 | cursor: pointer; 199 | box-sizing: border-box; } 200 | .button:hover, 201 | button:hover, 202 | input[type="submit"]:hover, 203 | input[type="reset"]:hover, 204 | input[type="button"]:hover, 205 | .button:focus, 206 | button:focus, 207 | input[type="submit"]:focus, 208 | input[type="reset"]:focus, 209 | input[type="button"]:focus { 210 | color: #333; 211 | border-color: #888; 212 | outline: 0; } 213 | .button.button-primary, 214 | button.button-primary, 215 | input[type="submit"].button-primary, 216 | input[type="reset"].button-primary, 217 | input[type="button"].button-primary { 218 | color: #FFF; 219 | background-color: #33C3F0; 220 | border-color: #33C3F0; } 221 | .button.button-primary:hover, 222 | button.button-primary:hover, 223 | input[type="submit"].button-primary:hover, 224 | input[type="reset"].button-primary:hover, 225 | input[type="button"].button-primary:hover, 226 | .button.button-primary:focus, 227 | button.button-primary:focus, 228 | input[type="submit"].button-primary:focus, 229 | input[type="reset"].button-primary:focus, 230 | input[type="button"].button-primary:focus { 231 | color: #FFF; 232 | background-color: #1EAEDB; 233 | border-color: #1EAEDB; } 234 | 235 | 236 | /* Forms 237 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 238 | input[type="email"], 239 | input[type="number"], 240 | input[type="search"], 241 | input[type="text"], 242 | input[type="tel"], 243 | input[type="url"], 244 | input[type="password"], 245 | textarea, 246 | select { 247 | height: 38px; 248 | padding: 6px 10px; /* The 6px vertically centers text on FF, ignored by Webkit */ 249 | background-color: #fff; 250 | border: 1px solid #D1D1D1; 251 | border-radius: 4px; 252 | box-shadow: none; 253 | box-sizing: border-box; 254 | font-family: inherit; 255 | font-size: inherit; /*https://stackoverflow.com/questions/6080413/why-doesnt-input-inherit-the-font-from-body*/} 256 | /* Removes awkward default styles on some inputs for iOS */ 257 | input[type="email"], 258 | input[type="number"], 259 | input[type="search"], 260 | input[type="text"], 261 | input[type="tel"], 262 | input[type="url"], 263 | input[type="password"], 264 | textarea { 265 | -webkit-appearance: none; 266 | -moz-appearance: none; 267 | appearance: none; } 268 | textarea { 269 | min-height: 65px; 270 | padding-top: 6px; 271 | padding-bottom: 6px; } 272 | input[type="email"]:focus, 273 | input[type="number"]:focus, 274 | input[type="search"]:focus, 275 | input[type="text"]:focus, 276 | input[type="tel"]:focus, 277 | input[type="url"]:focus, 278 | input[type="password"]:focus, 279 | textarea:focus, 280 | select:focus { 281 | border: 1px solid #33C3F0; 282 | outline: 0; } 283 | label, 284 | legend { 285 | display: block; 286 | margin-bottom: 0px; } 287 | fieldset { 288 | padding: 0; 289 | border-width: 0; } 290 | input[type="checkbox"], 291 | input[type="radio"] { 292 | display: inline; } 293 | label > .label-body { 294 | display: inline-block; 295 | margin-left: .5rem; 296 | font-weight: normal; } 297 | 298 | 299 | /* Lists 300 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 301 | ul { 302 | list-style: circle inside; } 303 | ol { 304 | list-style: decimal inside; } 305 | ol, ul { 306 | padding-left: 0; 307 | margin-top: 0; } 308 | ul ul, 309 | ul ol, 310 | ol ol, 311 | ol ul { 312 | margin: 1.5rem 0 1.5rem 3rem; 313 | font-size: 90%; } 314 | li { 315 | margin-bottom: 1rem; } 316 | 317 | 318 | /* Tables 319 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 320 | table { 321 | border-collapse: collapse; 322 | } 323 | th:not(.CalendarDay), 324 | td:not(.CalendarDay) { 325 | padding: 12px 15px; 326 | text-align: left; 327 | border-bottom: 1px solid #E1E1E1; } 328 | th:first-child:not(.CalendarDay), 329 | td:first-child:not(.CalendarDay) { 330 | padding-left: 0; } 331 | th:last-child:not(.CalendarDay), 332 | td:last-child:not(.CalendarDay) { 333 | padding-right: 0; } 334 | 335 | 336 | /* Spacing 337 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 338 | button, 339 | .button { 340 | margin-bottom: 0rem; } 341 | input, 342 | textarea, 343 | select, 344 | fieldset { 345 | margin-bottom: 0rem; } 346 | pre, 347 | dl, 348 | figure, 349 | table, 350 | form { 351 | margin-bottom: 0rem; } 352 | p, 353 | ul, 354 | ol { 355 | margin-bottom: 0.75rem; } 356 | 357 | /* Utilities 358 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 359 | .u-full-width { 360 | width: 100%; 361 | box-sizing: border-box; } 362 | .u-max-full-width { 363 | max-width: 100%; 364 | box-sizing: border-box; } 365 | .u-pull-right { 366 | float: right; } 367 | .u-pull-left { 368 | float: left; } 369 | 370 | 371 | /* Misc 372 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 373 | hr { 374 | margin-top: 3rem; 375 | margin-bottom: 3.5rem; 376 | border-width: 0; 377 | border-top: 1px solid #E1E1E1; } 378 | 379 | 380 | /* Clearing 381 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 382 | 383 | /* Self Clearing Goodness */ 384 | .container:after, 385 | .row:after, 386 | .u-cf { 387 | content: ""; 388 | display: table; 389 | clear: both; } 390 | 391 | 392 | /* Media Queries 393 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 394 | /* 395 | Note: The best way to structure the use of media queries is to create the queries 396 | near the relevant code. For example, if you wanted to change the styles for buttons 397 | on small devices, paste the mobile query code up in the buttons section and style it 398 | there. 399 | */ 400 | 401 | 402 | /* Larger than mobile */ 403 | @media (min-width: 400px) {} 404 | 405 | /* Larger than phablet (also point when grid becomes active) */ 406 | @media (min-width: 550px) {} 407 | 408 | /* Larger than tablet */ 409 | @media (min-width: 750px) {} 410 | 411 | /* Larger than desktop */ 412 | @media (min-width: 1000px) {} 413 | 414 | /* Larger than Desktop HD */ 415 | @media (min-width: 1200px) {} -------------------------------------------------------------------------------- /Profile Scraper.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from selenium import webdriver\n", 10 | "from selenium.webdriver.common.keys import Keys\n", 11 | "from selenium.webdriver.common.action_chains import ActionChains\n", 12 | "import pandas as pd\n", 13 | "import time\n", 14 | "from bs4 import BeautifulSoup" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "def login(browser):\n", 24 | " username = input('Enter Username: ')\n", 25 | " password = input('Enter Password: ')\n", 26 | " browser.get('https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')\n", 27 | " time.sleep(3)\n", 28 | " browser.find_element_by_name('session_key').send_keys(username + Keys.RETURN)\n", 29 | " browser.find_element_by_name('session_password').send_keys(password + Keys.RETURN)\n", 30 | " time.sleep(3)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "def connections_scraper(browser):\n", 40 | " connections_page = \"https://www.linkedin.com/search/results/people/?facetNetwork=%5B%22F%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH\"\n", 41 | " browser.get(connections_page)\n", 42 | " soup = BeautifulSoup(browser.page_source, 'html.parser')\n", 43 | " conn_num = soup.find_all('h3', class_='search-results__total')\n", 44 | " num = int(conn_num[0].text.strip().split()[0])\n", 45 | " time.sleep(3)\n", 46 | " i = 2\n", 47 | " x = 1\n", 48 | " names = []\n", 49 | " titles = []\n", 50 | " locations = []\n", 51 | " profiles = []\n", 52 | " print('\\nScraping your connections...\\n')\n", 53 | " while True:\n", 54 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 55 | " time.sleep(.75)\n", 56 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 57 | " time.sleep(.75)\n", 58 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 59 | " time.sleep(.75)\n", 60 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 61 | " time.sleep(.75)\n", 62 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 63 | " soup = BeautifulSoup(browser.page_source, 'html.parser')\n", 64 | " name_tag = soup.find_all('span', class_='name actor-name')\n", 65 | " title_tag = soup.find_all('p', class_='subline-level-1')\n", 66 | " location_tag = soup.find_all('p', class_= 'subline-level-2')\n", 67 | " profile_tag = soup.find_all('a', class_= 'search-result__result-link')\n", 68 | " names += list(map(lambda x: x.text, name_tag))\n", 69 | " titles += list(map(lambda x: x.text.replace('\\n','').strip(), title_tag))\n", 70 | " locations += list(map(lambda x: x.text.replace('\\n','').strip(), location_tag))\n", 71 | " profiles += list(map(lambda x: 'https://linkedin.com' + x['href'], profile_tag))[::2]\n", 72 | " if len(names)>=num:\n", 73 | " break\n", 74 | " y = x\n", 75 | " x = len(names)\n", 76 | " if x==y:\n", 77 | " break\n", 78 | " browser.get('https://www.linkedin.com/search/results/people/?facetNetwork=%5B%22F%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH&page='+str(i))\n", 79 | " i+=1\n", 80 | " time.sleep(3)\n", 81 | " df = pd.DataFrame({'Name':names, 'Title':titles, 'Location':locations, 'Profile':profiles})\n", 82 | " return df" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "browser = webdriver.Chrome(executable_path=\"E:/RnD/linkedin_scraper/chromedriver\", options= webdriver.ChromeOptions())\n", 92 | "login(browser)\n", 93 | "connections = connections_scraper(browser)\n", 94 | "browser.quit()\n", 95 | "connections.to_csv('connections.csv')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "def profile_scraper(df, browser):\n", 105 | " num_projects = []\n", 106 | " num_languages = []\n", 107 | " top_skills = []\n", 108 | " num_connections = []\n", 109 | " positions = []\n", 110 | " company = []\n", 111 | " duration = []\n", 112 | " institutes = []\n", 113 | " courses = []\n", 114 | " year_range = []\n", 115 | " ex_profiles = []\n", 116 | " ed_profiles = []\n", 117 | " for profile in df['Profile']:\n", 118 | " try:\n", 119 | " browser.get(profile)\n", 120 | " time.sleep(2)\n", 121 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 122 | " time.sleep(.75)\n", 123 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 124 | " time.sleep(.75)\n", 125 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 126 | " time.sleep(.75)\n", 127 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 128 | " time.sleep(.75)\n", 129 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 130 | " time.sleep(.75)\n", 131 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 132 | " time.sleep(.75)\n", 133 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 134 | " time.sleep(.75)\n", 135 | " browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)\n", 136 | " time.sleep(.75)\n", 137 | " soup = BeautifulSoup(browser.page_source, 'lxml')\n", 138 | "\n", 139 | " conn_tag = soup.find_all('span', class_='t-16 t-bold')\n", 140 | " if conn_tag[0].text.strip().split()[0].isdigit() or conn_tag[0].text.strip().split()[0] == '500+':\n", 141 | " num_connections.append(conn_tag[0].text.strip().split()[0])\n", 142 | " elif len(soup.find_all('span', class_='t-16 t-black t-normal'))>0:\n", 143 | " num_connections.append(soup.find_all('span', class_='t-16 t-black t-normal')[0].text.strip().split()[0])\n", 144 | " else:\n", 145 | " num_connections.append(None)\n", 146 | "\n", 147 | " accom_tag = soup.find_all('h3', class_='pv-accomplishments-block__count t-32 t-black t-normal pr3')\n", 148 | " np = 0\n", 149 | " nl = 2\n", 150 | " for at in accom_tag:\n", 151 | " if at.text.strip().split('\\n')[0].split()[-1] == 'projects' or at.text.strip().split('\\n')[0].split()[-1] == 'project':\n", 152 | " np = int(at.text.strip().split('\\n')[1])\n", 153 | " if at.text.strip().split('\\n')[0].split()[-1] == 'languages' or at.text.strip().split('\\n')[0].split()[-1] == 'language':\n", 154 | " nl = int(at.text.strip().split('\\n')[1])\n", 155 | " num_projects.append(np)\n", 156 | " num_languages.append(nl)\n", 157 | "\n", 158 | " skills_tag = soup.find_all('span', class_='pv-skill-category-entity__name-text')\n", 159 | " ts = []\n", 160 | " for st in skills_tag:\n", 161 | " ts.append(st.text.strip())\n", 162 | " top_skills.append(ts)\n", 163 | "\n", 164 | " position_tag = soup.find_all('h3', class_='t-16 t-black t-bold')\n", 165 | " ex_pos = list(map(lambda x: x.text.strip(), position_tag))\n", 166 | " company_tag = soup.find_all('p', class_='pv-entity__secondary-title t-14 t-black t-normal')\n", 167 | " ex_comp = list(map(lambda x: x.text.strip().split('\\n')[0], company_tag))\n", 168 | " ex_duration_tag = soup.find_all('span', class_='pv-entity__bullet-item-v2')\n", 169 | " durr = []\n", 170 | " for dur in ex_duration_tag:\n", 171 | " d_list = dur.text.strip().split()\n", 172 | " if d_list[0].isdigit():\n", 173 | " if len(d_list)==2:\n", 174 | " if d_list[1] == 'mo' or d_list[1]=='mos':\n", 175 | " durr.append(int(d_list[0]))\n", 176 | " if d_list[1] == 'yr' or d_list[1]=='yrs':\n", 177 | " durr.append(int(d_list[0])*12)\n", 178 | " if len(d_list)==4:\n", 179 | " durr.append((int(d_list[0])*12)+int(d_list[2]))\n", 180 | " else:\n", 181 | " durr.append(None)\n", 182 | " x = min(len(ex_comp), len(ex_pos), len(durr))\n", 183 | " ex_comp = ex_comp[:x]\n", 184 | " ex_pos = ex_pos[:x]\n", 185 | " durr = durr[:x]\n", 186 | " ex_profiles += [profile]*x\n", 187 | " positions += ex_pos\n", 188 | " company += ex_comp\n", 189 | " duration += durr\n", 190 | "\n", 191 | " institute_tag = soup.find_all('h3', class_='pv-entity__school-name t-16 t-black t-bold')\n", 192 | " inst = list(map(lambda x: x.text.strip(), institute_tag))\n", 193 | " course_tag = soup.find_all('p', class_='pv-entity__secondary-title pv-entity__degree-name t-14 t-black t-normal')\n", 194 | " course_t = list(map(lambda x: x.text.strip().split('\\n')[1], course_tag))\n", 195 | " ed_date_tag = soup.find_all('p', class_='pv-entity__dates t-14 t-black--light t-normal')\n", 196 | " ed_dates = list(map(lambda x: x.text.strip().split('\\n')[-1], ed_date_tag))\n", 197 | " y = min(len(inst), len(course_t), len(ed_dates))\n", 198 | " inst = inst[:y]\n", 199 | " course_t = course_t[:y]\n", 200 | " ed_dates = ed_dates[:y]\n", 201 | " ed_profiles += [profile]*y\n", 202 | " institutes += inst\n", 203 | " courses += course_t\n", 204 | " year_range += ed_dates\n", 205 | " except:\n", 206 | " continue\n", 207 | " df['Number of connections'] = num_connections\n", 208 | " df['Number of Projects'] = num_projects\n", 209 | " df['Number of Languages known'] = num_languages\n", 210 | " df['Top Skills'] = top_skills\n", 211 | " exp_df = pd.DataFrame({'Profile':ex_profiles, 'Position':positions, 'Company':company, 'Duration':duration})\n", 212 | " ed_df = pd.DataFrame({'Profile':ed_profiles, 'Institute':institutes, 'Degree':courses, 'Year range':year_range})\n", 213 | " \n", 214 | " return df, exp_df, ed_df" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "scrolled": false 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "browser = webdriver.Chrome(executable_path=\"E:/RnD/linkedin_scraper/chromedriver\", options= webdriver.ChromeOptions())\n", 226 | "login(browser)\n", 227 | "data = pd.read_csv('connections.csv')\n", 228 | "df, exp, ed = profile_scraper(data, browser)\n", 229 | "browser.quit()\n", 230 | "df.to_csv('connections_data.csv', index=False)\n", 231 | "exp.to_csv('experience.csv', index=False)\n", 232 | "ed.to_csv('education.csv', index=False)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "browser = webdriver.Chrome(executable_path=\"E:/RnD/linkedin_scraper/chromedriver\", options= webdriver.ChromeOptions())\n", 242 | "login(browser)\n", 243 | "connections = connections_scraper(browser)\n", 244 | "conn, exp, ed = profile_scraper(connections, browser)\n", 245 | "browser.quit()" 246 | ] 247 | } 248 | ], 249 | "metadata": { 250 | "kernelspec": { 251 | "display_name": "Python 3", 252 | "language": "python", 253 | "name": "python3" 254 | }, 255 | "language_info": { 256 | "codemirror_mode": { 257 | "name": "ipython", 258 | "version": 3 259 | }, 260 | "file_extension": ".py", 261 | "mimetype": "text/x-python", 262 | "name": "python", 263 | "nbconvert_exporter": "python", 264 | "pygments_lexer": "ipython3", 265 | "version": "3.7.7" 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 2 270 | } 271 | -------------------------------------------------------------------------------- /final_app.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.keys import Keys 3 | from selenium.webdriver.common.action_chains import ActionChains 4 | import pandas as pd 5 | import time 6 | from bs4 import BeautifulSoup 7 | import datetime 8 | import plotly.express as px 9 | import dash 10 | import dash_core_components as dcc 11 | import dash_html_components as html 12 | import plotly.graph_objs as go 13 | import webbrowser 14 | 15 | def login(browser): 16 | username = input('Enter Username: ') 17 | password = input('Enter Password: ') 18 | browser.get('https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin') 19 | time.sleep(3) 20 | browser.find_element_by_name('session_key').send_keys(username + Keys.RETURN) 21 | browser.find_element_by_name('session_password').send_keys(password + Keys.RETURN) 22 | time.sleep(3) 23 | 24 | def connections_scraper(browser): 25 | connections_page = "https://www.linkedin.com/search/results/people/?facetNetwork=%5B%22F%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH" 26 | browser.get(connections_page) 27 | soup = BeautifulSoup(browser.page_source, 'html.parser') 28 | conn_num = soup.find_all('h3', class_='search-results__total') 29 | num = int(conn_num[0].text.strip().split()[0]) 30 | time.sleep(3) 31 | i = 2 32 | x = 1 33 | names = [] 34 | titles = [] 35 | locations = [] 36 | profiles = [] 37 | print('Scraping your connections...\n') 38 | while True: 39 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 40 | time.sleep(.75) 41 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 42 | time.sleep(.75) 43 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 44 | time.sleep(.75) 45 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 46 | time.sleep(.75) 47 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 48 | soup = BeautifulSoup(browser.page_source, 'html.parser') 49 | name_tag = soup.find_all('span', class_='name actor-name') 50 | title_tag = soup.find_all('p', class_='subline-level-1') 51 | location_tag = soup.find_all('p', class_= 'subline-level-2') 52 | profile_tag = soup.find_all('a', class_= 'search-result__result-link') 53 | names += list(map(lambda x: x.text, name_tag)) 54 | titles += list(map(lambda x: x.text.replace('\n','').strip(), title_tag)) 55 | locations += list(map(lambda x: x.text.replace('\n','').strip(), location_tag)) 56 | profiles += list(map(lambda x: 'https://linkedin.com' + x['href'], profile_tag))[::2] 57 | if len(names)>=num: 58 | break 59 | y = x 60 | x = len(names) 61 | if x==y: 62 | break 63 | browser.get('https://www.linkedin.com/search/results/people/?facetNetwork=%5B%22F%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH&page='+str(i)) 64 | i+=1 65 | time.sleep(3) 66 | df = pd.DataFrame({'Name':names, 'Title':titles, 'Location':locations, 'Profile':profiles}) 67 | return df 68 | 69 | def profile_scraper(df, browser): 70 | num_projects = [] 71 | num_languages = [] 72 | top_skills = [] 73 | num_connections = [] 74 | positions = [] 75 | company = [] 76 | duration = [] 77 | institutes = [] 78 | courses = [] 79 | year_range = [] 80 | ex_profiles = [] 81 | ed_profiles = [] 82 | print('Extracting information form individual profiles. Please wait...\n') 83 | for profile in df['Profile']: 84 | try: 85 | browser.get(profile) 86 | time.sleep(2) 87 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 88 | time.sleep(.75) 89 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 90 | time.sleep(.75) 91 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 92 | time.sleep(.75) 93 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 94 | time.sleep(.75) 95 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 96 | time.sleep(.75) 97 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 98 | time.sleep(.75) 99 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 100 | time.sleep(.75) 101 | browser.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN) 102 | time.sleep(.75) 103 | soup = BeautifulSoup(browser.page_source, 'lxml') 104 | 105 | conn_tag = soup.find_all('span', class_='t-16 t-bold') 106 | if conn_tag[0].text.strip().split()[0].isdigit() or conn_tag[0].text.strip().split()[0] == '500+': 107 | num_connections.append(conn_tag[0].text.strip().split()[0]) 108 | elif len(soup.find_all('span', class_='t-16 t-black t-normal'))>0: 109 | num_connections.append(soup.find_all('span', class_='t-16 t-black t-normal')[0].text.strip().split()[0]) 110 | else: 111 | num_connections.append(None) 112 | 113 | accom_tag = soup.find_all('h3', class_='pv-accomplishments-block__count t-32 t-black t-normal pr3') 114 | np = 0 115 | nl = 2 116 | for at in accom_tag: 117 | if at.text.strip().split('\n')[0].split()[-1] == 'projects' or at.text.strip().split('\n')[0].split()[-1] == 'project': 118 | np = int(at.text.strip().split('\n')[1]) 119 | if at.text.strip().split('\n')[0].split()[-1] == 'languages' or at.text.strip().split('\n')[0].split()[-1] == 'language': 120 | nl = int(at.text.strip().split('\n')[1]) 121 | num_projects.append(np) 122 | num_languages.append(nl) 123 | 124 | skills_tag = soup.find_all('span', class_='pv-skill-category-entity__name-text') 125 | ts = [] 126 | for st in skills_tag: 127 | ts.append(st.text.strip()) 128 | top_skills.append(ts) 129 | 130 | position_tag = soup.find_all('h3', class_='t-16 t-black t-bold') 131 | ex_pos = list(map(lambda x: x.text.strip(), position_tag)) 132 | company_tag = soup.find_all('p', class_='pv-entity__secondary-title t-14 t-black t-normal') 133 | ex_comp = list(map(lambda x: x.text.strip().split('\n')[0], company_tag)) 134 | ex_duration_tag = soup.find_all('span', class_='pv-entity__bullet-item-v2') 135 | durr = [] 136 | for dur in ex_duration_tag: 137 | d_list = dur.text.strip().split() 138 | if d_list[0].isdigit(): 139 | if len(d_list)==2: 140 | if d_list[1] == 'mo' or d_list[1]=='mos': 141 | durr.append(int(d_list[0])) 142 | if d_list[1] == 'yr' or d_list[1]=='yrs': 143 | durr.append(int(d_list[0])*12) 144 | if len(d_list)==4: 145 | durr.append((int(d_list[0])*12)+int(d_list[2])) 146 | else: 147 | durr.append(None) 148 | x = min(len(ex_comp), len(ex_pos), len(durr)) 149 | ex_comp = ex_comp[:x] 150 | ex_pos = ex_pos[:x] 151 | durr = durr[:x] 152 | ex_profiles += [profile]*x 153 | positions += ex_pos 154 | company += ex_comp 155 | duration += durr 156 | 157 | institute_tag = soup.find_all('h3', class_='pv-entity__school-name t-16 t-black t-bold') 158 | inst = list(map(lambda x: x.text.strip(), institute_tag)) 159 | course_tag = soup.find_all('p', class_='pv-entity__secondary-title pv-entity__degree-name t-14 t-black t-normal') 160 | course_t = list(map(lambda x: x.text.strip().split('\n')[1], course_tag)) 161 | ed_date_tag = soup.find_all('p', class_='pv-entity__dates t-14 t-black--light t-normal') 162 | ed_dates = list(map(lambda x: x.text.strip().split('\n')[-1], ed_date_tag)) 163 | y = min(len(inst), len(course_t), len(ed_dates)) 164 | inst = inst[:y] 165 | course_t = course_t[:y] 166 | ed_dates = ed_dates[:y] 167 | ed_profiles += [profile]*y 168 | institutes += inst 169 | courses += course_t 170 | year_range += ed_dates 171 | except: 172 | continue 173 | df['Number of connections'] = num_connections 174 | df['Number of Projects'] = num_projects 175 | df['Number of Languages known'] = num_languages 176 | df['Top Skills'] = top_skills 177 | exp_df = pd.DataFrame({'Profile':ex_profiles, 'Position':positions, 'Company':company, 'Duration':duration}) 178 | ed_df = pd.DataFrame({'Profile':ed_profiles, 'Institute':institutes, 'Degree':courses, 'Year range':year_range}) 179 | 180 | return df, exp_df, ed_df 181 | 182 | try: 183 | conn = pd.read_csv('connections_data.csv') 184 | exp = pd.read_csv('experience.csv') 185 | ed = pd.read_csv('education.csv') 186 | except: 187 | browser = webdriver.Chrome(executable_path="E:/RnD/linkedin_scraper/chromedriver", options= webdriver.ChromeOptions()) 188 | login(browser) 189 | print('\nPlease Wait. This may take a while...\n') 190 | time.sleep(3) 191 | connections = connections_scraper(browser) 192 | time.sleep(3) 193 | conn_data, exp_data, ed_data = profile_scraper(connections, browser) 194 | conn_data.to_csv('connections_data.csv', index=False) 195 | exp_data.to_csv('experience.csv', index=False) 196 | ed_data.to_csv('education.csv', index=False) 197 | browser.quit() 198 | conn = pd.read_csv('connections_data.csv') 199 | exp = pd.read_csv('experience.csv') 200 | ed = pd.read_csv('education.csv') 201 | 202 | exp = exp[exp['Position']!='Student'] 203 | #conn = conn.drop_duplicates().reset_index(drop=True) 204 | #ed = ed.drop_duplicates().reset_index(drop=True) 205 | #exp = exp.drop_duplicates().reset_index(drop=True) 206 | 207 | 208 | ed['start_year'] = list(map(lambda x: int(x.split()[0]), ed['Year range'])) 209 | end = [] 210 | for ran in ed['Year range']: 211 | try: 212 | end.append(int(ran.split()[2])) 213 | except: 214 | end.append(int(ran.split()[0])+4) 215 | ed['passing_year'] = end 216 | 217 | profiles = ed['Profile'].unique() 218 | current_year = datetime.datetime.now().year 219 | hins = [] 220 | hdeg = [] 221 | hstart = [] 222 | hpass = [] 223 | status = [] 224 | level = [] 225 | field = [] 226 | for prof in profiles: 227 | hins.append(ed[ed['Profile']==prof]['Institute'].iloc[0]) 228 | hdeg.append(ed[ed['Profile']==prof]['Degree'].iloc[0]) 229 | hstart.append(ed[ed['Profile']==prof]['start_year'].iloc[0]) 230 | hpass.append(ed[ed['Profile']==prof]['passing_year'].iloc[0]) 231 | if ed[ed['Profile']==prof]['passing_year'].iloc[0]<=current_year: 232 | status.append('Completed') 233 | else: 234 | status.append('Ongoing') 235 | deg = ed[ed['Profile']==prof]['Degree'].iloc[0] 236 | if (deg.lower().find('bachelor')!=-1 or deg.lower()[0]=='b'): 237 | level.append('Bachelor\'s') 238 | elif (deg.lower().find('master')!=-1 239 | or deg.lower().find('post graduate')!=-1 240 | or deg.lower()[0]=='m' 241 | or deg.lower().find('pgdm')!=-1): 242 | level.append('Master\'s') 243 | elif (deg.lower().find('phd')!=-1 or deg.lower().find('Doctor of Philosophy')!=-1): 244 | level.append('Phd') 245 | elif (deg.lower().find('nanodegree')!=-1 or deg.lower().find('certificate program')!=-1): 246 | level.append('Diploma') 247 | else: 248 | level.append('Other') 249 | ins = ed[ed['Profile']==prof]['Institute'].iloc[0] 250 | if (deg.lower().find('engineer')!=-1 251 | or deg.lower().find('technology')!=-1 252 | or deg.lower().find('science')!=-1 253 | or deg.lower().find('computer')!=-1 254 | or ins.lower().find('technology')!=-1 255 | or ins.lower().find('science')!=-1 256 | or ins.lower().find('medical')!=-1 257 | or ins.lower().find('engineering')!=-1): 258 | field.append('Science') 259 | elif(deg.lower().find('management')!=-1 260 | or deg.lower().find('mba')!=-1 261 | or deg.lower().find('business')!=-1 262 | or deg.lower().find('finance')!=-1 263 | or deg.lower().find('accountancy')!=-1 264 | or ins.lower().find('business')!=-1): 265 | field.append('MNGMT') 266 | else: 267 | field.append('Arts') 268 | dic = {'Profile':profiles, 269 | 'Institute':hins, 270 | 'Degree':hdeg, 271 | 'Field of study':field, 272 | 'Level':level, 273 | 'start_year':hstart, 274 | 'passing_year':hpass, 275 | 'Level':level, 276 | 'Status':status} 277 | highest_ed = pd.DataFrame(dic) 278 | 279 | conn_cat = [] 280 | skills = [] 281 | for ri, row in conn.iterrows(): 282 | if row['Top Skills']!='[]': 283 | skills.append(row['Top Skills']) 284 | if row['Number of connections'] == '500+': 285 | conn_cat.append('500+') 286 | elif (int(row['Number of connections'])>400 and int(row['Number of connections'])<=500): 287 | conn_cat.append('400-500') 288 | elif (int(row['Number of connections'])>300 and int(row['Number of connections'])<=400): 289 | conn_cat.append('300-400') 290 | elif (int(row['Number of connections'])>200 and int(row['Number of connections'])<=300): 291 | conn_cat.append('200-300') 292 | elif (int(row['Number of connections'])>100 and int(row['Number of connections'])<=200): 293 | conn_cat.append('100-200') 294 | elif (int(row['Number of connections'])<=100): 295 | conn_cat.append('0-100') 296 | conn["Number of connections"] = conn_cat 297 | 298 | sk = [] 299 | i = 0 300 | for skill in skills: 301 | try: 302 | x=skill.split('\'') 303 | sk.append(x[1]) 304 | try: 305 | sk.append(x[3]) 306 | except: 307 | continue 308 | try: 309 | sk.append(x[5]) 310 | except: 311 | continue 312 | except: 313 | sk += skill 314 | skills_dict = {} 315 | for skill in sk: 316 | if skill in skills_dict: 317 | skills_dict[skill] += 1 318 | else: 319 | skills_dict[skill] = 1 320 | lang = [] 321 | for k,v in skills_dict.items(): 322 | if k.lower().find('programming language')!=-1: 323 | lang.append(k) 324 | for l in lang: 325 | try: 326 | skills_dict[l.split()[0]] += skills_dict[l] 327 | del skills_dict[l] 328 | except: 329 | continue 330 | x=sorted(skills_dict.values(), reverse=True) 331 | dic = {} 332 | for skill_cnt in x: 333 | for k,v in skills_dict.items(): 334 | if ((v==skill_cnt) and (k not in dic)): 335 | dic[k]=skill_cnt 336 | skills_dict = dic 337 | 338 | locs = [] 339 | for loc in list(conn['Location'].values): 340 | x = loc.split(',')[0] 341 | if x.split()[-1] == 'Area': 342 | locs.append(" ".join(x.split()[:-1])) 343 | else: 344 | locs.append(x) 345 | locations = [] 346 | for x in locs: 347 | if x not in locations: 348 | locations.append(x) 349 | lc = [] 350 | for loc in list(conn['Location'].values): 351 | for l in locations: 352 | x = loc.split(',')[0] 353 | if x.find(l)!=-1: 354 | lc.append(l) 355 | break 356 | conn['Location'] = lc 357 | 358 | dur = [] 359 | for index, d in exp.iterrows(): 360 | try: 361 | if int(d["Duration"]) < 6: 362 | dur.append('< 6 Months') 363 | elif int(d["Duration"]) >= 6 and int(d["Duration"]) < 12: 364 | dur.append('6 Months to 1 Year') 365 | elif int(d["Duration"]) >= 12 and int(d["Duration"]) <= 60: 366 | dur.append('1-5 Years') 367 | elif int(d["Duration"]) > 60 and int(d["Duration"]) <= 120: 368 | dur.append('6-10 Years') 369 | elif int(d["Duration"]) > 120 and int(d["Duration"]) <= 240: 370 | dur.append('11-20 Years') 371 | else: 372 | dur.append('20+ Years') 373 | except: 374 | dur.append('< 6 Months') 375 | exp["Experience"] = dur 376 | 377 | cat = [] 378 | for pos in list(exp['Position']): 379 | if (pos.lower().find('intern')!=-1 380 | or pos.lower().find('internship')!=-1 381 | or pos.lower().find('trainee')!=-1): 382 | cat.append('Intern') 383 | elif (pos.lower().find('campus')!=-1 384 | or pos.lower().find('student')!=-1 385 | or pos.lower().find('teaching assistant')!=-1 386 | or pos.lower().find('ambassador')!=-1 387 | or pos.lower().find('college')!=-1 388 | or pos.lower().find('member')!=-1 389 | or pos.lower().find('core committee member')!=-1 390 | or pos.lower().find('volunteer')!=-1 391 | or pos.lower().find('hustler')!=-1 392 | or pos.lower().find('scholar')!=-1 393 | or pos.lower().find('contributor')!=-1 394 | or pos.lower().find('fest')!=-1 395 | or pos.lower().find('event')!=-1 396 | or pos.lower().find('representative')!=-1): 397 | cat.append('Student Representative/ Volunteer') 398 | else: 399 | cat.append('Full Time') 400 | exp["Category"] = cat 401 | 402 | category_count = pd.DataFrame(exp['Category'].value_counts()).reset_index().rename(columns={'index':'Category', 'Category':'Count'}) 403 | intern_company_count = pd.DataFrame(exp[exp['Category']=='Intern']['Company'].value_counts()).reset_index().rename(columns={'index':'Company', 'Company':'Count'}) 404 | ft_company_count = pd.DataFrame(exp[exp['Category']=='Full Time']['Company'].value_counts()).reset_index().rename(columns={'index':'Company', 'Company':'Count'}) 405 | srv_company_count = pd.DataFrame(exp[exp['Category']=='Student Representative/ Volunteer']['Company'].value_counts()).reset_index().rename(columns={'index':'Company', 'Company':'Count'}) 406 | 407 | skills_count = pd.DataFrame(columns=['Skill', 'Count']) 408 | skills_count['Skill'] = list(skills_dict.keys()) 409 | skills_count['Count'] = list(skills_dict.values()) 410 | 411 | location_count = pd.DataFrame(conn['Location'].value_counts()).reset_index().rename(columns={'index':'Location', 'Location':'Count'}) 412 | exp_dur_count = pd.DataFrame(exp['Experience'].value_counts()).reset_index().rename(columns={'index':'Duration', 'Experience':'Count'}) 413 | conn_num_count = pd.DataFrame(conn['Number of connections'].value_counts()).reset_index().rename(columns={'index':'Number of connections', 'Number of connections':'Count'}) 414 | lang_count = pd.DataFrame(conn['Number of Languages known'].value_counts()).reset_index().rename(columns={'index':'Number of Languages known', 'Number of Languages known':'Count'}) 415 | 416 | intern_company_count['Category'] = ['Intern']*intern_company_count.shape[0] 417 | intern_company_count['Category count'] = [category_count['Count'].iloc[1]]*intern_company_count.shape[0] 418 | ft_company_count['Category'] = ['Full Time']*ft_company_count.shape[0] 419 | ft_company_count['Category count'] = [category_count['Count'].iloc[0]]*ft_company_count.shape[0] 420 | srv_company_count['Category'] = ['Student Representative/ Volunteer']*srv_company_count.shape[0] 421 | srv_company_count['Category count'] = [category_count['Count'].iloc[2]]*srv_company_count.shape[0] 422 | 423 | status_count = pd.DataFrame(highest_ed['Status'].value_counts().reset_index().rename(columns={'index':'category name', 'Status':'Count'})) 424 | level_count = pd.DataFrame(highest_ed['Level'].value_counts().reset_index().rename(columns={'index':'category name', 'Level':'Count'})) 425 | fos_count = pd.DataFrame(highest_ed['Field of study'].value_counts().reset_index().rename(columns={'index':'category name', 'Field of study':'Count'})) 426 | 427 | category_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(category_count['Count']))), category_count['Count'])) 428 | intern_company_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(intern_company_count['Count']))), intern_company_count['Count'])) 429 | ft_company_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(ft_company_count['Count']))), ft_company_count['Count'])) 430 | srv_company_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(srv_company_count['Count']))), srv_company_count['Count'])) 431 | skills_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(skills_count['Count']))), skills_count['Count'])) 432 | location_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(location_count['Count']))), location_count['Count'])) 433 | exp_dur_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(exp_dur_count['Count']))), exp_dur_count['Count'])) 434 | conn_num_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(conn_num_count['Count']))), conn_num_count['Count'])) 435 | lang_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(lang_count['Count']))), lang_count['Count'])) 436 | status_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(status_count['Count']))), status_count['Count'])) 437 | level_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(level_count['Count']))), level_count['Count'])) 438 | fos_count['Count %'] = list(map(lambda x: '%.2f'%(x*100/sum(list(fos_count['Count']))), fos_count['Count'])) 439 | 440 | tt_int = int(len(intern_company_count)*0.1) 441 | tt_ft = int(len(ft_company_count)*0.1) 442 | tt_srv = int(len(srv_company_count)*0.1) 443 | 444 | company_count = intern_company_count[:tt_int].append(ft_company_count[:tt_ft]).append(srv_company_count[:tt_srv]) 445 | 446 | app = dash.Dash(__name__) 447 | 448 | app.layout = html.Div([ 449 | html.Div([ 450 | html.Div([ 451 | html.Div([ 452 | html.H1(children='LinkedIn Connections Analyzer with Visualization',style={'textAlign': 'center'}), 453 | html.Div([ 454 | html.H6(children= 455 | 'LinkedIn has always been one of the most useful social media networks for us. The reason is simple: it is business-oriented and (almost) clutter-free.The best thing is that people are connected for a reason there: when sending a connection request, you can/need to specify how you are connected to a person. This way your network contains a wealth of business-oriented information: you can find out how you are related to any company or person you need to get in touch with. Alternatively, you can be introduced to anyone outside your immediate network using your first-level connections.Tapping into your Linkedin connections can be a great way to discover new career opportunities for personal and professional growth.', 456 | style={'textAlign': 'center', 'margin-bottom':'25px'}) 457 | ]) 458 | ]) 459 | ], className='title twelve columns') 460 | ], id='header', className='title', style={'margin-bottom':'25px', 'margin-top':'0px'}), 461 | html.Div([ 462 | html.Div([ 463 | dcc.Graph(id='location_graph', 464 | figure=px.treemap(location_count,path=['Location'], values='Count %', title='Demographic Distribution of Connections', color='Count', color_continuous_scale=['#AED6F1', '#3498DB']).update_layout(title_x=0.5, coloraxis_showscale=False) 465 | )], className='contain six columns' 466 | ), 467 | html.Div([ 468 | dcc.Graph(id='skills_graph', 469 | figure=px.bar(skills_count[:10][::-1], x= 'Count %', y='Skill', title='Most Popular Skills amongst your Connections', orientation='h', color='Count', color_continuous_scale=['#AED6F1', '#3498DB']).update_layout(title_x=0.5, coloraxis_showscale=False) 470 | )], className='contain six columns' 471 | ) 472 | ], className='row', id='row1', style={'margin-bottom':'10px'}), 473 | html.Div([ 474 | html.Div([ 475 | dcc.Graph(id='conn_graph', 476 | figure=px.bar(conn_num_count[::-1], y='Number of connections', x = 'Count %', range_x=[0, 100], title='Network Strength of your Connections', orientation='h',color='Count', color_continuous_scale=['#AED6F1', '#3498DB']).update_layout(title_x=0.5, coloraxis_showscale=False) 477 | )], className='contain five columns' 478 | ), 479 | html.Div([ 480 | dcc.Graph(id='lang_graph', 481 | figure=px.pie(lang_count, names='Number of Languages known', values='Count %', title='Number of Languages known',color ='Number of Languages known', color_discrete_sequence=px.colors.sequential.Blues[-9:-3][::-1]).update_layout(title_x=0.5) 482 | )], className='contain five columns' 483 | ), 484 | html.Div([ 485 | html.Div([ 486 | html.H6('%.2f'%exp[exp['Category']=='Intern']['Duration'].mean()), 487 | html.P('Average Duration of Internships (in Months)') 488 | ], className='mini_container', style={'margin-bottom':'20px', 'margin-top':'20px'}), 489 | html.Div([ 490 | html.H6('%.2f'%((len(exp[exp['Category']=='Intern'])/len(exp))*100)+'%'), 491 | html.P('Connections who have worked as interns') 492 | ], className='mini_container', style={'margin-bottom':'20px'}), 493 | html.Div([ 494 | html.H6('%.2f'%((len(exp[exp['Category']=='Full Time'])/len(exp))*100)+'%'), 495 | html.P('Connections who have worked full time') 496 | ], className='mini_container') 497 | ], className='two columns') 498 | ], className='row', id='row2', style={'margin-bottom':'10px'}), 499 | html.Div([ 500 | html.Div([ 501 | dcc.Graph(id='cat_comp_tree', 502 | figure=px.treemap(company_count,path=['Category', 'Company'], values='Count %', title='Popular Companies in Each Category', color='Category', color_discrete_map={'Full Time':'#3498DB', 'Intern':'#85C1E9', 'Student Representative/ Volunteer':'#AED6F1'}).update_layout(title_x=0.5)) 503 | ], className='contain six columns'), 504 | html.Div([ 505 | dcc.Graph(id='duration_plot', 506 | figure= px.bar(exp_dur_count[::-1], y='Duration', x='Count %', title='Duration of the Job', orientation='h',color='Count', color_continuous_scale=['#AED6F1', '#3498DB'], range_x=[0, 100]).update_layout(title_x=0.5,coloraxis_showscale=False) 507 | )], className='contain six columns') 508 | ], className='row', id='row3', style={'margin-bottom':'10px'}), 509 | html.Div([ 510 | html.Div([ 511 | dcc.Graph(id='fos_plot', 512 | figure=px.pie(fos_count, names='category name', values='Count %', title='Field of Study',color ='category name',color_discrete_sequence=px.colors.sequential.Blues[-9:-3][::-1]).update_layout(title_x=0.5)) 513 | ], className='contain four columns', style={'margin-right':'12px', 'margin-left':'10px'}), 514 | html.Div([ 515 | dcc.Graph(id='level_plot', 516 | figure=px.pie(level_count, names='category name', values='Count %', title='Level of Highest Education',color='category name',color_discrete_sequence=px.colors.sequential.Blues[-9:-3][::-1]).update_layout(title_x=0.5)) 517 | ], className='contain four columns', style={'margin-right':'12px'}), 518 | html.Div([ 519 | dcc.Graph(id='status_plot', 520 | figure=px.pie(status_count, names='category name', values='Count %', title='Status of Education',color='category name',color_discrete_sequence=px.colors.sequential.Blues[-9:-3][::-1]).update_layout(title_x=0.5)) 521 | ], className='contain four columns') 522 | ], className='row', id='row4', style={'margin-bottom':'10px'}) 523 | ]) 524 | 525 | webbrowser.get('C:/Program Files (x86)/Google/Chrome/Application/chrome.exe %s').open('http://127.0.0.1:8050/') 526 | 527 | if __name__ == '__main__': 528 | app.run_server(debug=True) 529 | 530 | --------------------------------------------------------------------------------