├── data
│   └── placeholder
├── licookies.png
├── requirements.txt
├── config.json
├── src
│   ├── core.py
│   ├── login.py
│   ├── profile_extractor.py
│   └── profile_sourcer.py
├── run.py
├── LICENSE
├── .gitignore
└── README.md

/data/placeholder:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/licookies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hakimkhalafi/linkedin-scraper/HEAD/licookies.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.6.3
certifi==2018.11.29
chardet==3.0.4
idna==2.8
numpy==1.22.0
pandas==0.23.4
python-dateutil==2.7.5
pytz==2018.7
requests==2.21.0
six==1.12.0
urllib3>=1.24.2
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
{
  "username": "user_email@example.com",
  "password": "password",
  "proxylist": [],
  "timeout": 10,
  "search_results": {
    "pages_to_scrape": 2,
    "results_per_page": 40
  },
  "profile_extractor": {
    "amount_profiles": 10
  },
  "cookie": {
    "li_at": "AexampleexampleexampleO_gexampleexampleexampleexamplb_TexampleexampleM_VexampleexB_wexampleexampleexamm_7exampleexampleexa1-fexampleexampleP_Hex9-qe9-1l",
    "Csrf-Token": "ajax:1234567890123456789",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
  }
}
--------------------------------------------------------------------------------
/src/core.py:
--------------------------------------------------------------------------------
import os
import sys
import json
import argparse
import traceback
import subprocess

with open(os.getcwd() + '/config.json') as config_file:
    config = json.load(config_file)


def arguments_setup(option):
    """ Setup Argument Parameters """
    parser = argparse.ArgumentParser()
    parser.add_argument(option, '--keywords')
    return parser.parse_args()


def authenticate():
    try:
        # Run login.py with the same interpreter (it requires Python 3) and
        # capture the li_at session cookie it prints on stdout
        subproc = subprocess.Popen([sys.executable, 'login.py'],
                                   stdout=subprocess.PIPE,
                                   cwd='src')

        sess = subproc.communicate()[0].decode('utf-8').replace("\n", "")

        if len(sess) == 0:
            sys.exit("[Error] Unable to log in to LinkedIn.com")

        session_cookies = dict(li_at=sess)

    except Exception:
        print(traceback.format_exc())
        sys.exit("[Fatal] Could not authenticate to LinkedIn.")

    return session_cookies
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import os
from src.profile_sourcer import scrape_search_results
from src.profile_extractor import scrape_profile_texts
from src.core import config, arguments_setup, authenticate

if __name__ == '__main__':  # For command line usage
    args = arguments_setup('-s')  # Search string

    search_string = args.keywords

    cookies = authenticate()

    cookies['JSESSIONID'] = config['cookie']['Csrf-Token']
    cookies['li_at'] = config['cookie']['li_at']

    headers = {'Csrf-Token': config['cookie']['Csrf-Token'],
               'User-Agent': config['cookie']['User-Agent']}

    print("-------- Scrape search results stage")

    search_results_filename = scrape_search_results(search_string, cookies,
                                                    headers)

    search_file_folder = './data/search_results/'
    full_file_path = os.path.join(search_file_folder, search_results_filename)

    print("-------- Scrape profile contents stage")
    scrape_profile_texts(full_file_path, cookies, headers, save_dicts=False)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Hakim Khan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
.static_storage/
.media/
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# PyCharm
.idea/
.idea

# data directories
data/profile_data
data/profile_pages
data/search_results
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# linkedin-scraper

This tool allows you to scrape LinkedIn profiles based on search queries. You can use the end result for NLP/text analysis.

This repo is a heavily modified version of [dchrastil's ScrapedIn](https://github.com/dchrastil/ScrapedIn) with the addition of in-profile scraping.

### Prerequisites

You will need Python 3+.

First clone this repo, then navigate to the folder and install the requirements:

```
git clone https://github.com/hakimkhalafi/linkedin-scraper.git
cd linkedin-scraper
pip install -r requirements.txt
```

### Configuring

There are four config settings you must change before the tool will run successfully.

In config.json, change the following values to match your LinkedIn sign-in details:

```
"username": "user_email@example.com",
"password": "password",
```

Additionally, you will need to extract a couple of cookie values in order to log in successfully:

```
"li_at": "Aexampleexampleexamplee........",
"Csrf-Token": "ajax:1234567890123456789",
```

You can do this in Chrome by logging in to LinkedIn -> right-click the page -> "Inspect" -> "Application" tab -> "Cookies".

![Getting cookie config settings](licookies.png)

Then double-click the relevant values marked in red. "JSESSIONID" goes into "Csrf-Token" and "li_at" into "li_at".

### Running

Once you're set up and configured, you can run the tool via

```
python run.py -s "search query"
```

where the search query can be any job title, such as "Data Scientist".

The end result will be a CSV containing the following information (see the example at the end of this README for loading it):

| Column name | Content |
| --- | --- |
| person_id | Identifier for the profile |
| fs_profile | Main profile information |
| fs_position | All information for all job positions listed |
| fs_education | All information for all attained education |
| fs_language | Any languages the person speaks |
| fs_skill | Any skills the person has provided |
| fs_project | Any projects the person has completed |
| fs_honor | Any activities and honors the person has |
| fs_publication | Any publications the person has published |
| fs_course | Courses the person has completed |

Enjoy!

### Disclaimer
This educational tool probably violates LinkedIn's terms of service. Use at your own risk.
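
### Working with the output

The scraped profiles end up under `data/profile_data/<timestamp>.csv`. Below is a minimal sketch of loading that file with pandas for text analysis; the filename shown is only a placeholder, use whatever timestamp your run produced.

```
import pandas as pd

# Example path only: the actual filename is the Unix timestamp of your run
df = pd.read_csv('data/profile_data/1545301230.csv', encoding='utf-8')

# Empty profile fields are read as NaN; use empty strings for text work
df = df.fillna('')

# Example: combine the free-text columns into one document per person
text_cols = [c for c in df.columns if c != 'person_id']
df['full_text'] = df[text_cols].apply(lambda row: ' '.join(row.astype(str)),
                                      axis=1)

print(df[['person_id', 'full_text']].head())
```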
--------------------------------------------------------------------------------
/src/login.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
import os
import sys
import json
import urllib.parse
from bs4 import BeautifulSoup
import urllib.request as urllib2
import http.cookiejar as cookielib

with open('../config.json') as config_file:
    config = json.load(config_file)


def linkedin():
    global opener
    cookie_filename = "cookies.txt"

    # Simulate browser with cookies enabled
    cj = cookielib.MozillaCookieJar(cookie_filename)
    if os.access(cookie_filename, os.F_OK):
        cj.load()

    # Load Proxy settings
    if len(config['proxylist']) > 0:
        proxy_handler = urllib2.ProxyHandler(
            {'https': config['proxylist'][0]})

        opener = urllib2.build_opener(
            proxy_handler,
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(cj)
        )
    else:
        opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(cj)
        )

    user_agent = config['cookie']['User-Agent']

    opener.addheaders = [('User-Agent', user_agent)]

    # Get CSRF Token
    html = load_page("https://www.linkedin.com/")
    soup = BeautifulSoup(html, "html.parser")
    csrf = soup.find(id="loginCsrfParam-login")['value']

    # Authenticate (form data must be URL-encoded bytes in Python 3)
    login_data = urllib.parse.urlencode({
        'session_key': config['username'],
        'session_password': config['password'],
        'loginCsrfParam': csrf,
    }).encode('utf-8')

    html = load_page("https://www.linkedin.com/uas/login-submit", login_data)
    soup = BeautifulSoup(html, "html.parser")

    try:
        # Print the li_at session cookie so core.authenticate() can capture it
        print(cj._cookies['.www.linkedin.com']['/']['li_at'].value)

    except Exception:
        print("error")

    cj.save()
    os.remove(cookie_filename)


def load_page(url, data=None):
    try:
        response = opener.open(url)
    except Exception:
        print("\n[Fatal] Your IP may have been temporarily blocked",
              file=sys.stderr)

    try:
        if data is not None:
            response = opener.open(url, data)
        else:
            response = opener.open(url)

        return response.read().decode('utf-8')

    except Exception:
        # If the URL doesn't load for any reason (404, network problems, ...),
        # report the failure on stderr and bail out, so core.authenticate()
        # does not mistake an error message on stdout for the li_at cookie
        print("[Fatal] Could not load " + url, file=sys.stderr)
        sys.exit(1)


linkedin()
--------------------------------------------------------------------------------
/src/profile_extractor.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import os
import json
import requests
import traceback
import pandas as pd
from src.core import config
from bs4 import BeautifulSoup


def scrape_profile_texts(source_csv, cookies, headers, save_dicts=False):

    source_filename = os.path.basename(source_csv)

    amount_profiles = config['profile_extractor']['amount_profiles']

    print("Config set to scrape first " + str(amount_profiles)
          + " search results")

    in_df = pd.read_csv(source_csv, encoding='utf-8', nrows=amount_profiles)

    url_list = in_df['url'].tolist()

    fields_to_scrape = scraping_dict().keys()

    empty_dict = dict.fromkeys(fields_to_scrape, None)

    out_df = pd.DataFrame(columns=['person_id'] + list(fields_to_scrape))

    for profile_url in url_list:
        data_dict = {}
        try:
            r = requests.get(profile_url, cookies=cookies, headers=headers)
        except Exception:
            print(traceback.format_exc())
            exit()

        person_id = profile_url.rsplit('/', 1)[-1]

        outdir = './data/profile_pages'
        if not os.path.exists(outdir):
            os.mkdir(outdir)

        profile_filename = person_id

        html_file = os.path.join(outdir, profile_filename + '.html')
        json_file = os.path.join(outdir, profile_filename + '.json')

        with open(html_file, 'w') as the_file:
            the_file.write(r.text)

        soup = BeautifulSoup(open(html_file), "html.parser")

        # Locate part of the page that contains the data-json we want to scrape
        found = soup.find(
            lambda tag: tag.name == "code" and "*profile" in tag.text)

        extract = found.contents[0].strip()

        # Print the content of data to see all scraping possibilities
        # or optionally use below json file
        data = json.loads(extract)

        if save_dicts:  # Option to save the raw loaded dictionary to json file
            with open(json_file, 'w') as fp:
                json.dump(data, fp, indent=4)

        # This is where the scraping action happens
        for entity in data['included']:
            col_name = entity['entityUrn'].rsplit(':')[2]
            fill_data(data_dict, col_name, entity)

        # This creates {item: None}'s when a field is left empty,
        # which is required by the .loc operation below.
        # Order of summation matters!
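        # (data_dict entries come last, so scraped values override the None
        #  placeholders from empty_dict)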
        filled_dict = dict(list(empty_dict.items()) + list(data_dict.items()))

        filled_dict['person_id'] = person_id

        # Insert filled_dict as a row into the output dataframe
        out_df.loc[len(out_df)] = filled_dict

    # Remove newlines and other whitespace
    out_df.replace(r'\s', ' ', regex=True, inplace=True)

    datadir = './data/profile_data'
    if not os.path.exists(datadir):
        os.mkdir(datadir)

    csv_outfile = os.path.join(datadir, source_filename)

    out_df.to_csv(csv_outfile, index=False)

    print("Managed to fully scrape "
          + str(len(out_df)) +
          " profiles into " +
          str(csv_outfile))


def scraping_dict():
    # Below dict is organized as follows:
    # part_to_scrape: list_of_subfields_to_get
    items = {
        "fs_course": ["name"],

        "fs_education":
            ['schoolName', 'description', 'degreeName', 'activities', 'grade',
             'fieldOfStudy', 'projects', 'entityLocale', 'recommendations'],

        "fs_honor": ['title', 'description', 'issuer'],

        "fs_language": ['name'],

        "fs_position":
            ['companyName', 'description', 'title', {"company": "industries"},
             'courses', 'locationName', 'projects', 'entityLocale',
             'organizations', 'region', 'recommendations', 'honors',
             'promotion'],

        "fs_profile": ["headline", "summary", "industryName", "locationName"],

        "fs_project": ['title', 'occupation', 'description'],

        "fs_publication": ['name', 'publisher', 'description'],

        "fs_skill": ["name"]
    }

    # Extendable. See "..json.loads(extract)" for more scraping possibilities
    return items


def fill_data(data_dict, col_name, entity):
    if col_name in scraping_dict():
        subfields = scraping_dict()[col_name]
    else:
        return  # Don't scrape if not in scraping dictionary

    text = []

    for subfield in subfields:

        if isinstance(subfield, str):
            if subfield in entity:
                text.append(str(entity[subfield]))

        elif isinstance(subfield, dict):  # Nested value!
            for key, value in subfield.items():
                if key in entity and value in entity[key]:
                    text.append(str(entity[key][value]))

    # This concatenates all subfield texts into a single string in "col_name"
    # and also merges all entities of the same type.
    # For ex. job1 job2 job3 will all be incl. in fs_position as one long string
    if col_name in data_dict:
        data_dict[col_name] += " " + " ".join(text)
    else:
        data_dict[col_name] = " ".join(text)
--------------------------------------------------------------------------------
/src/profile_sourcer.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
import os
import json
import time
import requests
import traceback
import pandas as pd
from src.core import config


def scrape_search_results(search_string, cookies, headers):
    # Fetch the initial page to get results/page counts

    # search_url / page_url not in config: not expected to be changed by user
    search_url = "https://www.linkedin.com/voyager/api/search/cluster?" \
                 "count=%i&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear" \
                 "%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=0"

    page_url = "https://www.linkedin.com/voyager/api/search/cluster?" \
\ 20 | "count=%i&guides=List(v-%%3EPEOPLE,facetGeoRegion-%%3Ear" \ 21 | "%%3A0)&keywords=%s&origin=FACETED_SEARCH&q=guided&start=%i" 22 | 23 | url = search_url % (config['search_results']['results_per_page'], 24 | search_string) 25 | 26 | try: 27 | r = requests.get(url, cookies=cookies, headers=headers) 28 | except Exception: 29 | print(traceback.format_exc()) 30 | exit() 31 | 32 | content = json.loads(r.text) 33 | data_total = content['paging']['total'] 34 | 35 | # Calculate pages of final results at X results/page 36 | pages = data_total / config['search_results']['results_per_page'] 37 | if data_total % config['search_results']['results_per_page'] == 0: 38 | # Including 0, subtract a page if no leftover results on last page 39 | pages = pages - 1 40 | if pages == 0: 41 | pages = 1 42 | 43 | print("Found %i results for search query \"%s\"" % 44 | (data_total, search_string)) 45 | 46 | if data_total > 1000: 47 | pages = config['search_results']['pages_to_scrape'] 48 | # FYI: LinkedIn only allows 1000 results 49 | 50 | print("Fetching first %i pages" % pages) 51 | 52 | search_results = pd.DataFrame() 53 | 54 | for p in range(pages): 55 | # Request results for each page using the start offset 56 | 57 | url = page_url % (config['search_results'] 58 | ['results_per_page'], 59 | search_string, 60 | p*config['search_results'] 61 | ['results_per_page']) 62 | 63 | r = requests.get(url, cookies=cookies, headers=headers) 64 | 65 | content = r.text.encode('UTF-8') 66 | content = json.loads(content.decode("utf-8")) 67 | 68 | print("Fetching page %i (contains %i results)" % 69 | (p+1, len(content['elements'][0]['elements']))) 70 | 71 | profiles_skipped = False 72 | for c in content['elements'][0]['elements']: 73 | try: 74 | # Using these lookup strings to shorten query lines below 75 | lookup = 'com.linkedin.voyager.search.SearchProfile' 76 | h = 'hitInfo' 77 | m = 'miniProfile' 78 | 79 | # Doesn't work anymore 80 | # pic_url = "https://media.licdn.com/mpr/mpr/shrinknp_400_400%s" 81 | # pic_query = "com.linkedin.voyager.common.MediaProcessorImage" 82 | 83 | if not c[h][lookup]['headless']: 84 | try: 85 | data_industry = c[h][lookup]['industry'] 86 | except Exception: 87 | data_industry = "" 88 | 89 | data_firstname = c[h][lookup][m]['firstName'] 90 | 91 | data_lastname = c[h][lookup][m]['lastName'] 92 | 93 | data_url = "https://www.linkedin.com/in/%s" % \ 94 | c[h][lookup][m]['publicIdentifier'] 95 | 96 | data_occupation = c[h][lookup][m]['occupation'] 97 | 98 | data_location = c[h][lookup]['location'] 99 | 100 | ''' 101 | # This section doesn't work 102 | try: 103 | extract_id = c[h][lookup][m]['picture'][pic_query]['id'] 104 | data_picture = pic_url % extract_id 105 | 106 | except Exception: 107 | # No pic found for (data_firstn, data_lastn, d_occ) 108 | data_picture = "" 109 | ''' 110 | 111 | data_dict = { 112 | "name": data_firstname + " " + data_lastname, 113 | "occupation": data_occupation, 114 | "location": data_location, 115 | "industry": data_industry, 116 | "url": data_url 117 | # "pic": data_picture # Doesn't work 118 | } 119 | 120 | search_results = search_results.append([data_dict]) 121 | 122 | else: 123 | print("[Notice] Headless profile found. Skipping") 124 | except Exception: 125 | profiles_skipped = True 126 | print("Skipped profile.. ", end='') 127 | continue 128 | if profiles_skipped: # Just for prettyness of printing.. 
            print()

    timestamp = str(int(time.time()))

    filename = timestamp + '.csv'

    outdir = './data/search_results'
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    full_file_path = os.path.join(outdir, filename)

    amount_results = len(search_results)

    if amount_results > 0:
        print("Stored total of " + str(amount_results)
              + " search results in file "
              + str(full_file_path))

        search_results.to_csv(full_file_path,
                              index=False,
                              columns=["name", "occupation",
                                       "location", "industry", "url"])
    else:
        print("Zero valid search results! Increase amount to scrape in config")
        exit(0)

    return filename
--------------------------------------------------------------------------------