├── data
│   └── placeholder
├── licookies.png
├── requirements.txt
├── config.json
├── src
│   ├── core.py
│   ├── login.py
│   ├── profile_extractor.py
│   └── profile_sourcer.py
├── run.py
├── LICENSE
├── .gitignore
└── README.md

/data/placeholder:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/licookies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hakimkhalafi/linkedin-scraper/HEAD/licookies.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.6.3
certifi==2018.11.29
chardet==3.0.4
idna==2.8
numpy==1.22.0
pandas==0.23.4
python-dateutil==2.7.5
pytz==2018.7
requests==2.21.0
six==1.12.0
urllib3>=1.24.2
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
{
  "username": "user_email@example.com",
  "password": "password",
  "proxylist": [],
  "timeout": 10,
  "search_results": {
    "pages_to_scrape": 2,
    "results_per_page": 40
  },
  "profile_extractor": {
    "amount_profiles": 10
  },
  "cookie": {
    "li_at": "AexampleexampleexampleO_gexampleexampleexampleexamplb_TexampleexampleM_VexampleexB_wexampleexampleexamm_7exampleexampleexa1-fexampleexampleP_Hex9-qe9-1l",
    "Csrf-Token": "ajax:1234567890123456789",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
  }
}
--------------------------------------------------------------------------------
/src/core.py:
--------------------------------------------------------------------------------
import os
import sys
import json
import argparse
import traceback
import subprocess

with open(os.getcwd() + '/config.json') as config_file:
    config = json.load(config_file)


def arguments_setup(option):
    """ Setup Argument Parameters """
    parser = argparse.ArgumentParser()
    parser.add_argument(option, '--keywords')
    return parser.parse_args()


def authenticate():
    try:
        # Run login.py with the same interpreter (it requires Python 3) and
        # capture the li_at session cookie it prints on stdout
        subproc = subprocess.Popen([sys.executable, 'login.py'],
                                   stdout=subprocess.PIPE,
                                   cwd='src')

        sess = subproc.communicate()[0].decode('utf-8').replace("\n", "")

        if len(sess) == 0:
            sys.exit("[Error] Unable to log in to LinkedIn.com")

        session_cookies = dict(li_at=sess)

    except Exception:
        print(traceback.format_exc())
        sys.exit("[Fatal] Could not authenticate to LinkedIn.")

    return session_cookies
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import os
from src.profile_sourcer import scrape_search_results
from src.profile_extractor import scrape_profile_texts
from src.core import config, arguments_setup, authenticate

if __name__ == '__main__':  # For command line usage
    args = arguments_setup('-s')  # Search string

    search_string = args.keywords

    cookies = authenticate()

    cookies['JSESSIONID'] = config['cookie']['Csrf-Token']
    cookies['li_at'] = config['cookie']['li_at']

    headers = {'Csrf-Token': config['cookie']['Csrf-Token'],
               'User-Agent': config['cookie']['User-Agent']}

    print("-------- Scrape search results stage")

    search_results_filename = scrape_search_results(search_string, cookies,
                                                    headers)

    search_file_folder = './data/search_results/'
    full_file_path = os.path.join(search_file_folder, search_results_filename)

    print("-------- Scrape profile contents stage")
    scrape_profile_texts(full_file_path, cookies, headers, save_dicts=False)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Hakim Khan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
.static_storage/
.media/
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# PyCharm
.idea/
.idea

# data directories
data/profile_data
data/profile_pages
data/search_results
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# linkedin-scraper

This tool allows you to scrape LinkedIn profiles based on search queries. You can use the end result for NLP/text analysis.

This repo is a heavily modified version of [dchrastil's ScrapedIn](https://github.com/dchrastil/ScrapedIn) with the addition of in-profile scraping.

### Prerequisites

You will need Python 3+.

First clone this repo, then navigate to the folder and install the requirements:

```
git clone https://github.com/hakimkhalafi/linkedin-scraper.git
cd linkedin-scraper
pip install -r requirements.txt
```

### Configuring

There are four config settings you must change before the tool will run successfully.

In config.json, change the following values to match your LinkedIn sign-in details:

```
"username": "user_email@example.com",
"password": "password",
```

Additionally, you will need to extract a couple of cookie values in order to log in successfully:

```
"li_at": "Aexampleexampleexamplee........",
"Csrf-Token": "ajax:1234567890123456789",
```

You can do this in Chrome by logging in to LinkedIn -> right-click the page -> "Inspect" -> "Application" tab -> "Cookies".

![Getting cookie config settings](licookies.png)

Then double-click the relevant values marked in red. "JSESSIONID" goes into "Csrf-Token" and "li_at" into "li_at".

### Running

Once you're set up and configured, you can run the tool via

```
python run.py -s "search query"
```

where the search query can be any job title, such as "Data Scientist".

The end result will be a CSV containing the following information (see the example at the end of this README for loading it):

| Column name | Content |
| --- | --- |
| person_id | Identifier for the profile |
| fs_profile | Main profile information |
| fs_position | All information for all job positions listed |
| fs_education | All information for all attained education |
| fs_language | Any languages the person speaks |
| fs_skill | Any skills the person has provided |
| fs_project | Any projects the person has completed |
| fs_honor | Any activities and honors the person has |
| fs_publication | Any publications the person has published |
| fs_course | Courses the person has completed |

Enjoy!

### Disclaimer
This educational tool probably violates LinkedIn's terms of service. Use at your own risk.
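
### Working with the output

The scraped profiles end up under `data/profile_data/<timestamp>.csv`. Below is a minimal sketch of loading that file with pandas for text analysis; the filename shown is only a placeholder, use whatever timestamp your run produced.

```
import pandas as pd

# Example path only: the actual filename is the Unix timestamp of your run
df = pd.read_csv('data/profile_data/1545301230.csv', encoding='utf-8')

# Empty profile fields are read as NaN; use empty strings for text work
df = df.fillna('')

# Example: combine the free-text columns into one document per person
text_cols = [c for c in df.columns if c != 'person_id']
df['full_text'] = df[text_cols].apply(lambda row: ' '.join(row.astype(str)),
                                      axis=1)

print(df[['person_id', 'full_text']].head())
```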
--------------------------------------------------------------------------------
/src/login.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
import os
import sys
import json
import urllib.parse
from bs4 import BeautifulSoup
import urllib.request as urllib2
import http.cookiejar as cookielib

with open('../config.json') as config_file:
    config = json.load(config_file)


def linkedin():
    global opener
    cookie_filename = "cookies.txt"

    # Simulate browser with cookies enabled
    cj = cookielib.MozillaCookieJar(cookie_filename)
    if os.access(cookie_filename, os.F_OK):
        cj.load()

    # Load Proxy settings
    if len(config['proxylist']) > 0:
        proxy_handler = urllib2.ProxyHandler(
            {'https': config['proxylist'][0]})

        opener = urllib2.build_opener(
            proxy_handler,
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(cj)
        )
    else:
        opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(cj)
        )

    user_agent = config['cookie']['User-Agent']

    opener.addheaders = [('User-Agent', user_agent)]

    # Get CSRF Token
    html = load_page("https://www.linkedin.com/")
    soup = BeautifulSoup(html, "html.parser")
    csrf = soup.find(id="loginCsrfParam-login")['value']

    # Authenticate (form data must be URL-encoded bytes in Python 3)
    login_data = urllib.parse.urlencode({
        'session_key': config['username'],
        'session_password': config['password'],
        'loginCsrfParam': csrf,
    }).encode('utf-8')

    html = load_page("https://www.linkedin.com/uas/login-submit", login_data)
    soup = BeautifulSoup(html, "html.parser")

    try:
        # Print the li_at session cookie so core.authenticate() can capture it
        print(cj._cookies['.www.linkedin.com']['/']['li_at'].value)

    except Exception:
        print("error")

    cj.save()
    os.remove(cookie_filename)


def load_page(url, data=None):
    try:
        response = opener.open(url)
    except Exception:
        print("\n[Fatal] Your IP may have been temporarily blocked",
              file=sys.stderr)

    try:
        if data is not None:
            response = opener.open(url, data)
        else:
            response = opener.open(url)

        return response.read().decode('utf-8')

    except Exception:
        # If the URL doesn't load for any reason (404, network problems, ...),
        # report the failure on stderr and bail out, so core.authenticate()
        # does not mistake an error message on stdout for the li_at cookie
        print("[Fatal] Could not load " + url, file=sys.stderr)
        sys.exit(1)


linkedin()
--------------------------------------------------------------------------------
/src/profile_extractor.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import os
import json
import requests
import traceback
import pandas as pd
from src.core import config
from bs4 import BeautifulSoup


def scrape_profile_texts(source_csv, cookies, headers, save_dicts=False):

    source_filename = os.path.basename(source_csv)

    amount_profiles = config['profile_extractor']['amount_profiles']

    print("Config set to scrape first " + str(amount_profiles)
          + " search results")

    in_df = pd.read_csv(source_csv, encoding='utf-8', nrows=amount_profiles)

    url_list = in_df['url'].tolist()

    fields_to_scrape = scraping_dict().keys()

    empty_dict = dict.fromkeys(fields_to_scrape, None)

    out_df = pd.DataFrame(columns=['person_id'] + list(fields_to_scrape))

    for profile_url in url_list:
        data_dict = {}
        try:
            r = requests.get(profile_url, cookies=cookies, headers=headers)
        except Exception:
            print(traceback.format_exc())
            exit()

        person_id = profile_url.rsplit('/', 1)[-1]

        outdir = './data/profile_pages'
        if not os.path.exists(outdir):
            os.mkdir(outdir)

        profile_filename = person_id

        html_file = os.path.join(outdir, profile_filename + '.html')
        json_file = os.path.join(outdir, profile_filename + '.json')

        with open(html_file, 'w') as the_file:
            the_file.write(r.text)

        soup = BeautifulSoup(open(html_file), "html.parser")

        # Locate part of the page that contains the data-json we want to scrape
        found = soup.find(
            lambda tag: tag.name == "code" and "*profile" in tag.text)

        extract = found.contents[0].strip()

        # Print the content of data to see all scraping possibilities
        # or optionally use below json file
        data = json.loads(extract)

        if save_dicts:  # Option to save the raw loaded dictionary to json file
            with open(json_file, 'w') as fp:
                json.dump(data, fp, indent=4)

        # This is where the scraping action happens
        for entity in data['included']:
            col_name = entity['entityUrn'].rsplit(':')[2]
            fill_data(data_dict, col_name, entity)

        # This creates {item: None}'s when a field is left empty,
        # which is required by the .loc operation below.
        # Order of summation matters!
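        # (data_dict entries come last, so scraped values override the None
        #  placeholders from empty_dict)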
        filled_dict = dict(list(empty_dict.items()) + list(data_dict.items()))

        filled_dict['person_id'] = person_id

        # Insert filled_dict as a row into the output dataframe
        out_df.loc[len(out_df)] = filled_dict

    # Remove newlines and other whitespace
    out_df.replace(r'\s', ' ', regex=True, inplace=True)

    datadir = './data/profile_data'
    if not os.path.exists(datadir):
        os.mkdir(datadir)

    csv_outfile = os.path.join(datadir, source_filename)

    out_df.to_csv(csv_outfile, index=False)

    print("Managed to fully scrape "
          + str(len(out_df)) +
          " profiles into " +
          str(csv_outfile))


def scraping_dict():
    # Below dict is organized as follows:
    # part_to_scrape: list_of_subfields_to_get
    items = {
        "fs_course": ["name"],

        "fs_education":
            ['schoolName', 'description', 'degreeName', 'activities', 'grade',
             'fieldOfStudy', 'projects', 'entityLocale', 'recommendations'],

        "fs_honor": ['title', 'description', 'issuer'],

        "fs_language": ['name'],

        "fs_position":
            ['companyName', 'description', 'title', {"company": "industries"},
             'courses', 'locationName', 'projects', 'entityLocale',
             'organizations', 'region', 'recommendations', 'honors',
             'promotion'],

        "fs_profile": ["headline", "summary", "industryName", "locationName"],

        "fs_project": ['title', 'occupation', 'description'],

        "fs_publication": ['name', 'publisher', 'description'],

        "fs_skill": ["name"]
    }

    # Extendable. See "..json.loads(extract)" for more scraping possibilities
    return items


def fill_data(data_dict, col_name, entity):
    if col_name in scraping_dict():
        subfields = scraping_dict()[col_name]
    else:
        return  # Don't scrape if not in scraping dictionary

    text = []

    for subfield in subfields:

        if isinstance(subfield, str):
            if subfield in entity:
                text.append(str(entity[subfield]))

        elif isinstance(subfield, dict):  # Nested value!
            for key, value in subfield.items():
                if key in entity and value in entity[key]:
                    text.append(str(entity[key][value]))

    # This concatenates all subfield texts into a single string in "col_name"
    # and also merges all entities of the same type.
    # For ex. job1 job2 job3 will all be incl. in fs_position as one long string
    if col_name in data_dict:
        data_dict[col_name] += " " + " ".join(text)
    else:
        data_dict[col_name] = " ".join(text)
--------------------------------------------------------------------------------
/src/profile_sourcer.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
import os
import json
import time
import requests
import traceback
import pandas as pd
from src.core import config


def scrape_search_results(search_string, cookies, headers):
    # Fetch the initial page to get results/page counts

    # search_url / page_url not in config: not expected to be changed by user
    search_url = "https://www.linkedin.com/voyager/api/search/cluster?" \
                 "count=%i&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear" \
                 "%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=0"

    page_url = "https://www.linkedin.com/voyager/api/search/cluster?" \
\ 20 | "count=%i&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear" \ 21 | "%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=%i" 22 | 23 | url = search_url % (config['search_results']['results_per_page'], 24 | search_string) 25 | 26 | try: 27 | r = requests.get(url, cookies=cookies, headers=headers) 28 | except Exception: 29 | print(traceback.format_exc()) 30 | exit() 31 | 32 | content = json.loads(r.text) 33 | data_total = content['paging']['total'] 34 | 35 | # Calculate pages of final results at X results/page 36 | pages = data_total / config['search_results']['results_per_page'] 37 | if data_total % config['search_results']['results_per_page'] == 0: 38 | # Including 0, subtract a page if no leftover results on last page 39 | pages = pages - 1 40 | if pages == 0: 41 | pages = 1 42 | 43 | print("Found %i results for search query \"%s\"" % 44 | (data_total, search_string)) 45 | 46 | if data_total > 1000: 47 | pages = config['search_results']['pages_to_scrape'] 48 | # FYI: LinkedIn only allows 1000 results 49 | 50 | print("Fetching first %i pages" % pages) 51 | 52 | search_results = pd.DataFrame() 53 | 54 | for p in range(pages): 55 | # Request results for each page using the start offset 56 | 57 | url = page_url % (config['search_results'] 58 | ['results_per_page'], 59 | search_string, 60 | p*config['search_results'] 61 | ['results_per_page']) 62 | 63 | r = requests.get(url, cookies=cookies, headers=headers) 64 | 65 | content = r.text.encode('UTF-8') 66 | content = json.loads(content.decode("utf-8")) 67 | 68 | print("Fetching page %i (contains %i results)" % 69 | (p+1, len(content['elements'][0]['elements']))) 70 | 71 | profiles_skipped = False 72 | for c in content['elements'][0]['elements']: 73 | try: 74 | # Using these lookup strings to shorten query lines below 75 | lookup = 'com.linkedin.voyager.search.SearchProfile' 76 | h = 'hitInfo' 77 | m = 'miniProfile' 78 | 79 | # Doesn't work anymore 80 | # pic_url = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" 81 | # pic_query = "com.linkedin.voyager.common.MediaProcessorImage" 82 | 83 | if not c[h][lookup]['headless']: 84 | try: 85 | data_industry = c[h][lookup]['industry'] 86 | except Exception: 87 | data_industry = "" 88 | 89 | data_firstname = c[h][lookup][m]['firstName'] 90 | 91 | data_lastname = c[h][lookup][m]['lastName'] 92 | 93 | data_url = "https://www.linkedin.com/in/%s" % \ 94 | c[h][lookup][m]['publicIdentifier'] 95 | 96 | data_occupation = c[h][lookup][m]['occupation'] 97 | 98 | data_location = c[h][lookup]['location'] 99 | 100 | ''' 101 | # This section doesn't work 102 | try: 103 | extract_id = c[h][lookup][m]['picture'][pic_query]['id'] 104 | data_picture = pic_url % extract_id 105 | 106 | except Exception: 107 | # No pic found for (data_firstn, data_lastn, d_occ) 108 | data_picture = "" 109 | ''' 110 | 111 | data_dict = { 112 | "name": data_firstname + " " + data_lastname, 113 | "occupation": data_occupation, 114 | "location": data_location, 115 | "industry": data_industry, 116 | "url": data_url 117 | # "pic": data_picture # Doesn't work 118 | } 119 | 120 | search_results = search_results.append([data_dict]) 121 | 122 | else: 123 | print("[Notice] Headless profile found. Skipping") 124 | except Exception: 125 | profiles_skipped = True 126 | print("Skipped profile.. ", end='') 127 | continue 128 | if profiles_skipped: # Just for prettyness of printing.. 
            print()

    timestamp = str(int(time.time()))

    filename = timestamp + '.csv'

    outdir = './data/search_results'
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    full_file_path = os.path.join(outdir, filename)

    amount_results = len(search_results)

    if amount_results > 0:
        print("Stored total of " + str(amount_results)
              + " search results in file "
              + str(full_file_path))

        search_results.to_csv(full_file_path,
                              index=False,
                              columns=["name", "occupation",
                                       "location", "industry", "url"])
    else:
        print("Zero valid search results! Increase amount to scrape in config")
        exit(0)

    return filename
--------------------------------------------------------------------------------