├── .DS_Store
├── .gitattributes
├── .gitignore
├── .vscode
│   └── settings.json
├── README.md
├── browser.py
├── extracted
│   └── .keep
├── extractor.py
├── main.py
├── println.py
├── process.txt
├── requirements.txt
└── static
    └── .DS_Store

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/creativeJoe007/google-leads-scraper/24ca42b5e30870126a21ddab36e3fe986cc47009/.DS_Store
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Folders
extracted/*
env/
static/

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
{
    "cortex-debug.armToolchainPath": "/usr/local/bin",
    "cortex-debug.openocdPath": "/usr/local/bin/openocd"
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Google-Leads-Scraper
A simple web scraper that makes it easy for marketers to extract emails and phone numbers (leads) from Google search results.

# Process
The process is simple: enter your search query, a file name to store the data, and start/stop values for the range of result pages to scrape. The scraper loads your query in a headless browser, so you won't see a window while it does its scraping.
It scrapes phone numbers and email addresses, takes a full-page screenshot of every website it visits, and keeps navigating Google's search pages until it reaches the stop page you entered.

# Get Started:
To start, you need to activate the virtual environment.
## For Linux/Mac:

    source env/bin/activate

## For Windows (not supported):

    .\env\Scripts\activate

##

1. `python3 --version` (3.7 and above is supported)
2. `pip3 install -r requirements.txt`
3. `python3 main.py "my query" --start=1 --stop=5 --file="a_file_name_to_save_data"`

# Note:
A screenshot of every website the scraper visits is taken and stored in the STATIC folder.
Check the EXTRACTED folder to view the content extracted by this tool (usually in CSV format).
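For reference, rows are written with `|` as the delimiter and the columns below; this sample row is made up and only illustrates the format:

    title|url|description|site_description|screen_shot|contact_email|contact_number
    Acme Plumbing|https://acmeplumbing.example|Acme Plumbing - 24/7 emergency service|Emergency plumbers serving Austin|static/AcmePlumbing.png|['info@acmeplumbing.example']|['+15125550123']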

***Please share and support this library with your code contributions.***
--------------------------------------------------------------------------------
/browser.py:
--------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------
# Author: creativeJoe007
# Website: https://creativejoe007.com
#----------------------------------------------------------------------------------------------------
# A Google bot that allows anyone to search for businesses using a keyword.
# We extract the website title, description, email (if any), mobile number (if any) and web link.
# An ideal bot for marketers looking to find leads/prospects.
#----------------------------------------------------------------------------------------------------
from selenium import webdriver
from webdriver_manager.core.utils import ChromeType
from webdriver_manager.firefox import GeckoDriverManager
from selenium.common.exceptions import WebDriverException


def determine_browser(preferred_browser="chrome", binary_path=""):
    #----------------------------------------------------------------------------------------------
    # Try each supported browser type until we find one that exists
    # on the host system
    #----------------------------------------------------------------------------------------------
    supported_browser = ["chrome", "chromium"]

    try:
        if preferred_browser not in supported_browser:
            return f"This browser is not supported by this library, only supported browsers are {supported_browser}"
        else:
            if preferred_browser == "chrome" or preferred_browser == "chromium":
                return start_chrome(preferred_browser, binary_path)
    except WebDriverException as e:
        return f"Browser error: {str(e)}"
    except OSError as e:
        return f"OS Error: {str(e)}"

def start_chrome(_preferred_type, binary_path):
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium.webdriver.chrome.options import Options
    from webdriver_manager.core.utils import ChromeType

    options = Options()
    options.add_argument('--headless')
    options.add_argument('start-maximized')
    options.add_argument("enable-automation")
    options.add_argument("--disable-extensions")
    options.add_argument("--window-size=1920,8000")
    options.add_argument("enable-features=NetworkServiceInProcess")
    options.add_argument("disable-features=NetworkService")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-browser-side-navigation")
    options.add_argument("--force-device-scale-factor=1")

    # If a binary path was passed, point Chrome at it
    if binary_path: options.binary_location = binary_path

    if _preferred_type == "chrome":
        return webdriver.Chrome(ChromeDriverManager().install(), options=options)
    else:
        return webdriver.Chrome(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install(), options=options)

def start_firefox(binary_path):
    from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
    from selenium.webdriver.firefox.options import Options

    options = Options()
    options.headless = True  # Options.set_headless() has been removed from recent Selenium releases

    binary = FirefoxBinary(binary_path)
    return webdriver.Firefox(firefox_binary=binary, options=options)
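
# Illustrative usage (added sketch, not part of the original module): a minimal
# smoke test for determine_browser(). Note that the function returns an error
# *string* instead of raising, so callers should check the type of the result.
if __name__ == "__main__":
    result = determine_browser("chrome")
    if isinstance(result, str):
        print(result)  # an error message came back instead of a WebDriver
    else:
        result.get("https://example.com")
        print(result.title)  # expected to print "Example Domain"
        result.quit()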
--------------------------------------------------------------------------------
/extracted/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/creativeJoe007/google-leads-scraper/24ca42b5e30870126a21ddab36e3fe986cc47009/extracted/.keep
--------------------------------------------------------------------------------
/extractor.py:
--------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------
# Author: creativeJoe007
# Website: https://creativejoe007.com
#----------------------------------------------------------------------------------------------------
# A Google bot that allows anyone to search for businesses using a keyword.
# We extract the website title, description, email (if any), mobile number (if any) and web link.
# An ideal bot for marketers looking to find leads/prospects.
#----------------------------------------------------------------------------------------------------
import re
import time
import csv
from pathlib import Path
from selenium.common.exceptions import NoSuchElementException,\
    TimeoutException,\
    WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from println import println

extracted_data_sample = {
    'title': '',
    'url': '',
    'description': '',
    'site_description': '',
    'screen_shot': '',
    'contact_email': '',
    'contact_number': ''
}
class Extractor():
    #------------------------------------------------------------------------
    # This is where we extract all the data we need while scraping.
    # We take our screenshots here, get titles and find the social media pages
    # of the leads we extract.
    #------------------------------------------------------------------------
    def __init__(self, driver, query: str, start_page: int, stop_page: int, file_name: str):
        self._driver = driver
        self._file_name = file_name
        self._page = start_page  # start counting from zero due to Google's seek algorithm
        self._stop_page = stop_page  # start counting from zero due to Google's seek algorithm
        self._site_content = dict(extracted_data_sample)  # copy the template so it is never mutated

        # We loop through the entire data set, wrapped under pagination
        self.paginate_page(query)

    def paginate_page(self, query):
        #------------------------------------------------------------------------
        # We fetch the requested range of Google's result pages,
        # ten results at a time
        #------------------------------------------------------------------------
        start_url = f"https://www.google.com/search?q={query}&sourceid=chrome&ie=UTF-8"
        seek_number = 0
        seek_url_query = f"&start={seek_number}"

        # while self._page <= 9:
        while self._page <= self._stop_page:
            self._driver.get(start_url + f"&start={(self._page * 10)}")
            print(f"{start_url}&start={(self._page * 10)}")
            try:
                self.extract_page_content()
            # TimeoutException subclasses WebDriverException, so it has to be caught first
            except TimeoutException as e:
                println(f"Timeout error: {str(e)}", "warn")
            except WebDriverException as e:
                println(f"Selenium error: {str(e)}", "warn")

            self._page += 1

        self._data_extract = []

        println("Congratulations, scraping complete", "normal")

    def words_in_string(self, word_list, a_string):
        return set(word_list).intersection(a_string.lower().split())

    def extract_page_content(self):
        #------------------------------------------------------------------------
        # We are going to get all major links in a page and check that they do
        # not contain words such as "english", "translate" or "translation".
        # Any item that passes this check is considered for scraping.
        #------------------------------------------------------------------------
        # Keep these lowercase so words_in_string() can match them
        dictionary_words = ["english", "translate", "translation", "dictionary", "thesaurus", "translations", "definition"]
        response = self._driver.find_elements_by_css_selector("div.MjjYud")

        # Now we look through all search results
        for result in response:
            self._site_content = dict(extracted_data_sample)  # fresh copy per result so fields don't leak between rows

            google_result = result.find_element_by_css_selector("div.tF2Cxc")

            self._site_content['title'] = google_result.find_elements_by_tag_name("div")[0]\
                .find_element_by_css_selector("h3.LC20lb").text
                # .find_element_by_css_selector("h3.LC20lb").find_element_by_tag_name("span").text

            self._site_content['description'] = google_result.find_element_by_css_selector("div.Z26q7c")\
                .find_element_by_tag_name("div").find_element_by_tag_name("span").text
                # .find_element_by_tag_name("span.aCOpRe").text

            self._site_content['url'] = google_result.find_elements_by_tag_name("div")[0]\
                .find_element_by_tag_name("a").get_attribute("href")

            if(not self.words_in_string(dictionary_words, self._site_content['title']) and \
               not self.words_in_string(dictionary_words, self._site_content['description'])):
                #------------------------------------------------------------------------
                # This website is not a dictionary, now we can start
                # scanning through to extract just the data we need
                #------------------------------------------------------------------------
                if "youtube" in self._site_content['url']:
                    continue
                elif "facebook" in self._site_content['url']:
                    #------------------------------------------------------------------------
                    # First we split by "/".
                    # We check if the last segment is empty, in case the URL ended with "/".
                    # If it is empty we use the second to last segment.
                    # If it is not empty, we check if it contains "?", meaning a query string;
                    # if it does, we still use the second to last segment.
                    #------------------------------------------------------------------------
                    split_page_url_list = self._site_content['url'].split("/")
                    page_name = ""

                    if split_page_url_list[len(split_page_url_list) - 1] == "":
                        page_name = split_page_url_list[len(split_page_url_list) - 2]
                    else:
                        if "?" in split_page_url_list[len(split_page_url_list) - 1]:
                            page_name = split_page_url_list[len(split_page_url_list) - 2]
                        else:
                            page_name = split_page_url_list[len(split_page_url_list) - 1]

                    self._site_content['url'] = f"https://web.facebook.com/pg/{page_name}/about/"

                try:
                    self.extract_info_from_link()
                    println(f"Finished scraping, {self._site_content['url']}", "normal")
                except NoSuchElementException as e:
                    # We had cases where the body element was empty, meaning the website didn't exist.
                    # Since a new window was launched before that error occurred,
                    # we have to close the window and switch back to the search results.
                    self._driver.close()
                    self._driver.switch_to.window(self._driver.window_handles[len(self._driver.window_handles) - 1])
                    println(f"This website ({self._site_content['url']}) has an issue and could not be parsed", "warn")

    def extract_info_from_link(self):
        #------------------------------------------------------------------------
        # We will access all the different websites, and
        # extract every email address and phone number
        # found in them
        #------------------------------------------------------------------------

        # Load up a new tab to handle this
        self._driver.execute_script("window.open('')")
        self._driver.switch_to.window(self._driver.window_handles[len(self._driver.window_handles) - 1])

        self._driver.get(self._site_content['url'])
        println("-----------------------------------------------------------------------------------------", "bold")
        println(f"Currently scraping, {self._site_content['url']}", "bold")
        time.sleep(5)

        html_source = self._driver.find_element_by_tag_name('body').get_attribute('innerHTML')
        extracted_numbers = ""
        extracted_emails = ""

        #------------------------------------------------------------------------
        # Now we use regex to match all occurrences of emails or phone numbers
        # in the page source
        #------------------------------------------------------------------------
        try:
            self._site_content['site_description'] = self._driver.find_element_by_xpath("//meta[@name='description']")\
                .get_attribute("content")
        except NoSuchElementException as e:
            println(f"Oops, we couldn't find a meta description for this website ({self._site_content['url']})", "warn")

        screen_shot_name = 'static/' + re.sub(r"[\-\(\)\+ .*]", "", self._site_content["title"]) + '.png'

        found_numbers = self.scan_for_numbers(html_source)
        found_emails = self.scan_for_emails(html_source)

        # Extract true mobile numbers
        verified_numbers = self.extract_mobile_number(found_numbers)
        # Extract true email addresses
        verified_emails = self.extract_real_email_address(found_emails)

        self._site_content['contact_number'] = verified_numbers
        self._site_content['contact_email'] = verified_emails

        if len(verified_numbers) or len(verified_emails):
            # Increase the size of the page for our screenshot
            # self._driver.set_window_size(1920, 8000)
            self.screengrab(screen_shot_name)
            self._site_content['screen_shot'] = screen_shot_name

            # We are done with processing, now let's add to our CSV
            # Save extracted files
            self.write_to_file(self._site_content)

        # Close the new tab first
        self._driver.close()
        self._driver.switch_to.window(self._driver.window_handles[len(self._driver.window_handles) - 1])

    def scan_for_numbers(self, source: str) -> list:
        found_numbers: list = []
        phone_regex = [
            "^(((\+44\s?\d{4}|\(?0\d{4}\)?)\s?\d{3}\s?\d{3})|((\+44\s?\d{3}|\(?0\d{3}\)?)\s?\d{3}\s?\d{4})|((\+44\s?\d{2}|\(?0\d{2}\)?)\s?\d{4}\s?\d{4}))(\s?\#(\d{4}|\d{3}))?$",
            "\+[\(]?[0-9][0-9 .\-\(\)]{8,}[0-9]",  # Priority 1
            "((tel|p|t|phone|call|dial|ring)[: -]?[\+\(]?[0-9][0-9 .\-\(\)]{8,}[0-9])",  # Priority 2
            # "[\+\(]?[0-9][0-9 .\-\(\)]{8,}[0-9]"  # Priority 3
        ]

        for regex in phone_regex:
            is_found = re.findall(regex, source, re.IGNORECASE)
            if len(is_found) > 0:
                if type(is_found[0]) is tuple:
                    #------------------------------------------------------------------------
                    # Patterns with capturing groups make re.findall() return tuples
                    # instead of strings, so as a workaround we keep only the
                    # first (outermost) group of the first match
                    #------------------------------------------------------------------------
                    found_numbers = [is_found[0][0]]
                else: found_numbers = is_found
                break

        return found_numbers

    def scan_for_emails(self, source: str) -> list:
        extracted_email_addresses: list = []
        email_regex = "[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*"
        emails_found = re.findall(email_regex, source, re.IGNORECASE)

        return emails_found

    def screengrab(self, file_name: str):
        try:
            # Close every modal, should any arise
            ActionChains(self._driver).send_keys(Keys.ESCAPE).perform()

            self._driver.find_element_by_tag_name('body').screenshot(file_name)

        except NoSuchElementException as e:
            println(f"We experienced an issue while trying to screenshot this website ({self._site_content['url']})", "warn")


    def extract_mobile_number(self, found_numbers: list) -> list:
        number_list: list = []
        final_phone_regex = "[\+\(]?[0-9][0-9 .\-\(\)]{8,}[0-9]"
        strip_regex = r"[\-\(\) .]"
        for number in found_numbers:
            number = re.search(final_phone_regex, number, re.IGNORECASE)
            if number:
                number = re.sub(strip_regex, "", number.group(0))
                total_count = len(number)
                if total_count > 10 and total_count < 15:
                    if(number not in number_list): number_list.append(number)

        return number_list

    def extract_real_email_address(self, found_emails: list) -> list:
        # Sometimes image file names take the form of an email address
        email_list: list = []
        check_against_strings = (".png", ".jpg", ".jpeg", ".mv", ".mp3", ".mp4", ".gif", ".webp")
        for email in found_emails:
            if email.endswith(check_against_strings) is False:
                if(email not in email_list): email_list.append(email)

        return email_list

    def write_to_file(self, data: dict):
        #------------------------------------------------------------------------
        # We check whether the file already exists before we begin. If it
        # exists, we simply append the new data, as the header for the CSV file
        # has already been created.
        # Otherwise we add the CSV header first, before adding the data to the file.
        #------------------------------------------------------------------------
        extracted_path = Path("extracted/")
        save_file_to = extracted_path / f"{self._file_name}.csv"
        file_path_object = Path(save_file_to)
        file_exist = file_path_object.is_file()
        if file_exist is False:
            Path(save_file_to).touch()

        with open(save_file_to, 'a', newline='') as file:
            writer = csv.writer(file, delimiter='|')
            # Add the header only if the file didn't exist yet
            if file_exist is False: writer.writerow(data.keys())
            # Add new data
            writer.writerow(data.values())
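
# Illustrative example (added note, not part of the original module): given page HTML such as
#   '<a href="mailto:hello@example.com">Call us on +44 20 7946 0958</a>'
# scan_for_emails() returns ['hello@example.com'], scan_for_numbers() returns
# ['+44 20 7946 0958'], and extract_mobile_number() normalises that to
# ['+442079460958'] by stripping spaces, dots, dashes and brackets.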
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------
# Author: creativeJoe007
# Website: https://creativejoe007.com
#----------------------------------------------------------------------------------------------------
# A Google bot that allows anyone to search for businesses using a keyword.
# We extract the website title, description, email (if any), mobile number (if any) and web link.
# An ideal bot for marketers looking to find leads/prospects.
#----------------------------------------------------------------------------------------------------
import argparse
from browser import determine_browser
from extractor import Extractor
from println import println

driver = None
arguments = argparse.ArgumentParser()

arguments.add_argument('query', action='store', type=str, help="This is your Google query and should be written as a string")
arguments.add_argument('--start', action='store', type=int, required=False, default=0, help="What page of Google's search\
 results would you like us to start scraping from")
arguments.add_argument('--stop', action='store', type=int, required=False, default=14, help="At what page would you want to stop\
 scraping Google's search results")
arguments.add_argument('--file', action='store', type=str, required=True, help="File name to save extracted data")
arguments.add_argument('--browser', action='store', type=str, required=False, default="chrome", help="What browser should we\
 scrape with?")
arguments.add_argument('--driver', action='store', type=str, required=False, help="Browser executable path")

args = arguments.parse_args()

def main():
    executor_url = ""
    session_id = ""
    selected_browser = args.browser
    browser_driver_path = args.driver
    query = args.query
    file_name = args.file
    start_page = args.start - 1
    stop_page = args.stop - 1

    if start_page < 0: start_page = 0  # If the user passes 0, we start from the first result page
    elif (stop_page - start_page) > 15:
        println("You cannot search more than 15 pages at a time")
        return  # stop here rather than scraping past the limit

    # Determine what browser to use for this tool
    driver = determine_browser(selected_browser, browser_driver_path)
    if type(driver) == str:
        println(driver)
    else:
        executor_url = driver.command_executor._url
        session_id = driver.session_id

        # Maximize the browser height so full-page screenshots fit
        driver.set_window_size(1920, 8000)

        println(f"Google's Query: {query}", "normal")
        extractor = Extractor(driver, query, start_page, stop_page, file_name)
        driver.close()

try:
    main()
except Exception as e:
    println(f"Oops, something's off here: {str(e)}", "fail")
--------------------------------------------------------------------------------
/println.py:
--------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------
# Author: creativeJoe007
# Website: https://creativejoe007.com
#----------------------------------------------------------------------------------------------------
# A Google bot that allows anyone to search for businesses using a keyword.
# We extract the website title, description, email (if any), mobile number (if any) and web link.
# An ideal bot for marketers looking to find leads/prospects.
#----------------------------------------------------------------------------------------------------
import os

HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'

def println(text, _type='fail'):
    # os.system('color')
    if _type == 'fail':
        print(f"\n{FAIL} {text} {ENDC}\n")
    elif _type == 'success':
        print(f"\n{OKGREEN} {text} {ENDC}\n")
    elif _type == 'normal':
        print(f"\n{OKBLUE} {text} {ENDC}\n")
    elif _type == 'bold':
        print(f"\n{BOLD} {text} {ENDC}\n")
    elif _type == 'warn':
        print(f"\n{WARNING} {text} {ENDC}\n")
--------------------------------------------------------------------------------
/process.txt:
--------------------------------------------------------------------------------
Work Flow:


* Search "chef a domicle" on Google
* Extract all URLs in the page
* Check whether they contain the word "translate", "english", "translation", or "Thesaurus"
* Skip them if they do, because a dictionary website is not what we need
* Open all links that passed the Step 3 test
* Scan through the pages for an email address and/or phone number
* If we find one, extract it, attach the site's name and URL, then save
* After a page is completed, click the next button
* And start again from Step 2
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
webdriver_manager==3.7.1
# NOTE: the code uses Selenium 3-style find_element_by_* calls (removed in Selenium 4.3),
# so an older selenium release is assumed here
selenium
argparse
--------------------------------------------------------------------------------
/static/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/creativeJoe007/google-leads-scraper/24ca42b5e30870126a21ddab36e3fe986cc47009/static/.DS_Store
--------------------------------------------------------------------------------