├── navigate ├── __init__.py ├── __pycache__ │ ├── ioCSV.cpython-311.pyc │ ├── ioCSV.cpython-312.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-312.pyc │ ├── navigateDomain.cpython-311.pyc │ └── navigateDomain.cpython-312.pyc ├── ioCSV.py └── navigateDomain.py ├── domains.csv ├── .gitignore ├── anchor.txt ├── main.py └── README.md /navigate/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /domains.csv: -------------------------------------------------------------------------------- 1 | parkstreet.com 2 | goguardian.com 3 | laneterralever.com -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | savedata.csv 2 | job description.txt 3 | domains copy.csv 4 | anchor.txt -------------------------------------------------------------------------------- /navigate/__pycache__/ioCSV.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/ioCSV.cpython-311.pyc -------------------------------------------------------------------------------- /navigate/__pycache__/ioCSV.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/ioCSV.cpython-312.pyc -------------------------------------------------------------------------------- /navigate/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /navigate/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /navigate/__pycache__/navigateDomain.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/navigateDomain.cpython-311.pyc -------------------------------------------------------------------------------- /navigate/__pycache__/navigateDomain.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/navigateDomain.cpython-312.pyc -------------------------------------------------------------------------------- /anchor.txt: -------------------------------------------------------------------------------- 1 | Demo, Call, Book, Schedule, Consultation, Consult, Appointment, Get Started, Start, Inquire, Learn, Discover, More Info, Find Out, Get a Quote, Talk, Explore, Details, Request, Connect, Get in, Contact -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from navigate.navigateDomain import NavigateDomain 2 | from navigate.ioCSV import ReadDomains 3 | 4 | domain_list = ReadDomains('domains.csv') 5 | 6 | for domain in domain_list: 7 | # print(domain) 8 | domain_url = 'https://' + domain + '/' 9 | print(domain_url) 10 | NavigateDomain(domain_url) 11 | # print(domain_url) -------------------------------------------------------------------------------- /navigate/ioCSV.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import csv 3 | 4 | def ReadDomains(file_path): 5 | domains_df = pd.read_csv(file_path, header=None) 6 | domains_list = domains_df[0].tolist() 7 | # print(domains_list) 8 | return domains_list 9 | 10 | # read_domains('domains.csv') 11 | 12 | def ExtractData(data): 13 | 14 | # Open the CSV file in append mode 15 | with open('savedata.csv', 'a', newline='') as csvfile: 16 | # Create a writer object 17 | csvwriter = csv.writer(csvfile) 18 | 19 | # Append the row 20 | csvwriter.writerow(data) 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrape Domain Form Project 2 | 3 | Welcome to the Scrape Domain Form project! This project aims to provide a simple yet powerful tool for scraping information from websites based on domain names. 4 | 5 | ## Overview 6 | 7 | The Scrape Domain Form project offers a user-friendly interface for users to input domain names and retrieve relevant data from those domains. Whether you're conducting market research, competitive analysis, or simply gathering data for your project, this tool makes the process efficient and hassle-free. 8 | 9 | ## Features 10 | 11 | - **Easy Input**: Users can input domain names effortlessly through a user-friendly form. 12 | - **Scalable**: The tool is designed to handle a large volume of domain names efficiently. 13 | - **Customizable**: Users can specify the type of information they want to scrape from the domains. 14 | - **Data Parsing**: The tool intelligently parses the scraped data for easy analysis and interpretation. 15 | - **Export Options**: Users can export the scraped data in various formats for further analysis or integration into other tools. 16 | 17 | ## How It Works 18 | 19 | 1. **Input Domain Name**: Users provide the domain name(s) they want to scrape data from. 20 | 2. **Select Data Type**: Users choose the type of data they're interested in scraping (e.g., email addresses, contact information, product listings). 21 | 3. **Scrape Data**: The tool retrieves the requested information from the provided domain(s). 22 | 4. **Parse Data**: The scraped data is parsed and presented to the user in a readable format. 23 | 5. **Export (Optional)**: Users have the option to export the data for further analysis or integration. 24 | 25 | ## Technologies Used 26 | 27 | - **Frontend**: HTML, CSS, JavaScript 28 | - **Backend**: Python, Flask 29 | - **Scraping**: Beautiful Soup, Scrapy 30 | - **Data Parsing**: Python libraries (e.g., Pandas) 31 | - **Database**: SQLite, MongoDB (optional for scalability) 32 | 33 | ## Get Started 34 | 35 | To get started with the Scrape Domain Form project, follow these steps: 36 | 37 | 1. **Clone the Repository**: Clone the project repository to your local machine. 38 | 2. **Install Dependencies**: Ensure you have all the necessary dependencies installed. Refer to the project documentation for details. 39 | 3. **Run the Application**: Launch the application locally and start scraping domain data right away! 40 | 41 | ## Contributions 42 | 43 | Contributions to the Scrape Domain Form project are welcome! Whether you want to add new features, improve existing functionality, or fix bugs, your contributions are highly appreciated. Please refer to the contribution guidelines in the project repository for more information. 44 | 45 | ## Feedback 46 | 47 | We value your feedback! If you have any suggestions, feature requests, or encounter any issues while using the Scrape Domain Form tool, please don't hesitate to reach out. Your input helps us improve the tool and provide a better experience for all users. 48 | 49 | ## License 50 | 51 | The Scrape Domain Form project is licensed under the [MIT License](https://opensource.org/licenses/MIT). Feel free to use, modify, and distribute the code as per the terms of the license. 52 | -------------------------------------------------------------------------------- /navigate/navigateDomain.py: -------------------------------------------------------------------------------- 1 | # get the domain url from main.py 2 | 3 | # navigate the domain using selenium 4 | # LeadGeneration() 5 | # navigate subpages within the domain 6 | # LeadGeneration() 7 | # 8 | 9 | # dev LeadGeneration(): 10 | # if form_exist(): 11 | # fill_form(), extract_data() 12 | # else return False 13 | 14 | # def form_exist(): 15 | # ... 16 | # check phone number 17 | # check captcha 18 | from re import sub 19 | from selenium import webdriver 20 | from selenium.webdriver.common.by import By 21 | from selenium.webdriver.chrome.service import Service 22 | from selenium.webdriver.support.ui import WebDriverWait 23 | from selenium.webdriver.support import expected_conditions as EC 24 | from selenium.webdriver.common.action_chains import ActionChains 25 | import time 26 | from urllib.parse import urlparse 27 | from datetime import datetime 28 | from .ioCSV import ExtractData 29 | 30 | anchor_words = ['Demo', 'Call', 'Book', 'Schedule', 'Consultation', 'Consult', 'Appointment', 'Get Started', 'Start', 'Inquire', 'Learn', 'Discover', 'More Info', 'Find Out', 'Get a Quote', 'Talk', 31 | 'Explore', 'Details', 'Request', 'Connect', 'Get in', 'Contact'] 32 | 33 | # anchor_words = ['Contact'] 34 | 35 | form_fields = ['firstname', 'First Name', 'Last Name', 'lastname', 'name', 'email', 'company', 'company_website', 'phone', 'contact_reason', 'message'] 36 | 37 | phone_fields = ['phone', 'your-phone', 'phone number', 'mobile phone'] 38 | 39 | def FillForm(driver, anchor_text, phone_require): 40 | data = [] 41 | 42 | # keep domain url. 43 | current_url = driver.current_url 44 | parsed_url = urlparse(current_url) 45 | data.append('domain: ' + parsed_url.netloc) 46 | # keep date filled out form 47 | today = datetime.today() 48 | data.append('date: ' + today.strftime("%Y-%m-%d")) # This formats the date as YYYY-MM-DD 49 | # keep time filled out form 50 | now = datetime.now() 51 | data.append('time: '+ now.strftime("%H:%M:%S")) 52 | 53 | data.append('anchor text: ' + anchor_text) 54 | 55 | if phone_require == 2: 56 | data.append('asterisk found') 57 | else: 58 | data.append('asterisk not found') 59 | 60 | for field in form_fields: 61 | try: 62 | form_element = driver.find_element(By.NAME, field) 63 | if form_element: 64 | form_element.send_keys('John Doe') 65 | except: 66 | pass 67 | 68 | xpath_expression = f"//input[@type='submit']" 69 | submit_button = driver.find_element(By.XPATH, xpath_expression) 70 | try: 71 | if submit_button: 72 | submit_button.click() 73 | except: 74 | pass 75 | 76 | 77 | return data 78 | 79 | def FindForm(driver): # check whether form exist in the frame. 80 | ret = 0 81 | is_phone = 0 82 | 83 | # check if form exist 84 | for field in form_fields: 85 | xpath_expression = f"//input[@name='{field}']" 86 | xpath_expression1 = f"//input[contains(@placeholder, '{field}')]" 87 | try: 88 | form_element = driver.find_element(By.XPATH, xpath_expression) 89 | if form_element: 90 | ret = 1 91 | except: 92 | pass 93 | try: 94 | form_element1 = driver.find_element(By.XPATH, xpath_expression1) 95 | if form_element1: 96 | ret = 1 97 | print(' - found with placeholder') 98 | except: 99 | pass 100 | 101 | # check phone field exist 102 | for field in phone_fields: 103 | xpath_expression = f"//input[@name='{field}']" 104 | try: 105 | phone_field = driver.find_element(By.XPATH, xpath_expression) 106 | if phone_field and phone_field.is_displayed(): 107 | # name_field.send_keys('John Doe') 108 | is_phone = 1 109 | # check for asterisk 110 | placeholder_text = phone_field.get_attribute('placeholder') 111 | if '*' in placeholder_text: 112 | print('The placeholder contains an asterisk.') 113 | is_phone = 2 114 | else: 115 | print('The placeholder does not contain an asterisk.') 116 | 117 | asterisk_expression = f"//label[.//span[contains(., '{field}')]]" 118 | try: 119 | phone_label = driver.find_element(By.XPATH, asterisk_expression) 120 | if phone_label: 121 | label_html = phone_label.get_attribute('innerHTML') 122 | # print('Full Name label found.') 123 | if '*' in label_html: 124 | is_phone = 2 125 | print('An asterisk is present within the label.') 126 | else: 127 | print("No asterisk found in label:") 128 | # Do something if no asterisk is found 129 | except: 130 | pass 131 | except: 132 | # print('failed') 133 | pass 134 | 135 | if ret == 1: 136 | print(' - field found') 137 | if is_phone == 0: 138 | print(' - no phone field.') 139 | ret = -1 140 | if is_phone == 2: 141 | print(' - phone field is required') 142 | ret = 2 143 | pass 144 | # ret &= is_phone 145 | 146 | # check if there's captcha break 147 | try: 148 | captcha_element = driver.find_element(By.CLASS_NAME, 'grecaptcha-badge') 149 | if captcha_element and captcha_element.get_attribute('data-style') == 'inline': 150 | print(' - captcha detected') 151 | ret = -2 152 | except: 153 | pass 154 | 155 | print('return the value when FindForm', ret) 156 | 157 | return ret 158 | 159 | def LeadGeneration(driver, anchor_text): # check whether form exist in the page. 160 | 161 | # chrome_options = webdriver.ChromeOptions() 162 | # driver = webdriver.Chrome(options = chrome_options) 163 | # driver.get(url) 164 | print('LeadGeneration', anchor_text) 165 | 166 | form_exist = FindForm(driver) 167 | 168 | try: 169 | if form_exist > 0: 170 | data = FillForm(driver, anchor_text, form_exist) 171 | ExtractData(data) 172 | else: 173 | print('form does not exist in the page', anchor_text, 'so looking into frames.') 174 | frames = driver.find_elements(By.TAG_NAME, 'iframe') 175 | # print('checking.......') 176 | for index, frame in enumerate(frames): 177 | # Switch to each frame by index 178 | print(' - looking for an iframe..') 179 | if not frame.is_displayed(): 180 | print("form skipped due to it's not displayed.") 181 | continue 182 | 183 | actions = ActionChains(driver) 184 | actions.move_to_element(frame).perform() 185 | 186 | src_value = frame.get_attribute('src') 187 | if "video" in src_value: 188 | print(' - src value continas "video"') 189 | continue 190 | 191 | driver.switch_to.frame(index) 192 | 193 | # Perform operations within the frame 194 | # ... 195 | form_exist = FindForm(driver) 196 | if form_exist > 0: 197 | data = FillForm(driver, anchor_text, form_exist) 198 | ExtractData(data) 199 | driver.switch_to.default_content() 200 | break 201 | 202 | # Switch back to the default content before moving to the next frame 203 | driver.switch_to.default_content() 204 | 205 | if form_exist > 0: 206 | print(' - form filled') 207 | else: 208 | print(' - this page was skipped') 209 | except Exception as e: 210 | print(e) 211 | # pass 212 | return form_exist 213 | 214 | def NavigateDomain(url): 215 | chrome_options = webdriver.ChromeOptions() 216 | chrome_options.add_argument("--start-maximized") 217 | #chrome_options.page_load_strategy = self.load_mode 218 | driver = webdriver.Chrome(options = chrome_options) 219 | # chrome_options.add_argument('--ignore-certificate-errors') 220 | # chrome_options.add_argument('--allow-running-insecure-content') 221 | 222 | try: 223 | driver.get(url) 224 | 225 | # site returns ERROR 226 | try: 227 | title_element = driver.find_element(By.XPATH, "//title") 228 | if title_element: 229 | title_text = title_element.get_attribute('textContent') 230 | if "ERROR" in title_text: 231 | data = [] 232 | current_url = driver.current_url 233 | parsed_url = urlparse(current_url) 234 | data.append('domain: ' + parsed_url.netloc) 235 | 236 | try: 237 | h1_xpath = "/html/body/h1" 238 | h1_element = driver.find_element(By.XPATH, h1_xpath) 239 | h1_text = h1_element.text 240 | data.append(' skipped because ' + h1_text) 241 | except: 242 | data.append(' skipped because Page ERROR') 243 | 244 | ExtractData(data) 245 | return 246 | except: 247 | pass 248 | 249 | # close recommandation system 250 | try: 251 | # This XPath finds any button element whose class attribute contains the word 'close' 252 | close_button = driver.find_element(By.XPATH, "//button[contains(@aria-label, 'Dismiss')]") 253 | if close_button: 254 | print('#close button found') 255 | close_button.click() 256 | print("#Close button clicked.") 257 | except Exception as e: 258 | print("#Close button not clicked.") 259 | 260 | 261 | 262 | # Accept cookies button 263 | button_expression = f"//button[contains(text(), 'Accept')]" 264 | try: 265 | accept_link = driver.find_element(By.XPATH, button_expression) 266 | if accept_link: 267 | print('@accept button clicked') 268 | accept_link.click() 269 | except: 270 | pass 271 | 272 | # Accept cookies 273 | button_expression = f"//a[contains(text(), 'Accept')]" 274 | try: 275 | accept_link = driver.find_element(By.XPATH, button_expression) 276 | if accept_link: 277 | print('@accept anchor clicked') 278 | accept_link.click() 279 | except: 280 | pass 281 | 282 | # close popup dlg. 283 | # Find the button by class name containing 'close' and click it 284 | # This XPath finds any button element whose class attribute contains the word 'close' 285 | close_buttons = driver.find_elements(By.XPATH, "//button[contains(@aria-label, 'close popup')]") 286 | if close_buttons: 287 | print('&close button found') 288 | for close_button in close_buttons: 289 | # close_buttons[1].click() 290 | try: 291 | close_button.click() 292 | except Exception as e: 293 | # print("Close button not clicked.") 294 | pass 295 | 296 | 297 | form_filled = LeadGeneration(driver, 'hompage') 298 | if not form_filled > 0: 299 | for word in anchor_words: 300 | if form_filled > 0: 301 | break 302 | print(word) 303 | # within 304 | xpath_expression = f"//a[.//span[contains(text(), '{word}')]]" 305 | try: 306 | anchor_link = driver.find_element(By.XPATH, xpath_expression) 307 | 308 | if anchor_link: 309 | anchor_link.click() 310 | form_filled = LeadGeneration(driver, word) 311 | driver.back() 312 | except: 313 | pass 314 | 315 | # without 316 | xpath_expression = f"//a[contains(text(), '{word}')]" 317 | try: 318 | anchor_links = driver.find_elements(By.XPATH, xpath_expression) 319 | 320 | for anchor_link in anchor_links: 321 | if not anchor_link.is_displayed(): 322 | continue 323 | print('**anchor: ' + word) 324 | anchor_link.click() 325 | print('**anchor ', word, ' clicked') 326 | form_filled = LeadGeneration(driver, word) 327 | driver.back() 328 | except Exception as e: 329 | print(e) 330 | # pass 331 | if not form_filled > 0: 332 | data = [] 333 | if form_filled == 0: 334 | data.append('domain: ' + url + ' skipped because no form found') 335 | elif form_filled == -2: 336 | data.append('domain: ' + url + ' skipped because CAPTCHA detected.') 337 | else: 338 | data.append('domain: ' + url + ' skipped because no phone field found') 339 | 340 | ExtractData(data) 341 | 342 | time.sleep(5) 343 | # while True: 344 | # pass 345 | except Exception as e: 346 | # print(e) 347 | pass 348 | driver.quit() 349 | 350 | 351 | 352 | # NavigateDomain('https://adstage.io/') 353 | # NavigateDomain('https://capeanalytics.com/contact/') 354 | # NavigateDomain('https://dotloop.com/') 355 | # NavigateDomain('https://homelight.com/') 356 | # NavigateDomain('https://indinero.com/') 357 | # NavigateDomain('https://jyve.com/') 358 | # NavigateDomain('https://parkstreet.com/') 359 | # NavigateDomain('https://www.goguardian.com/') 360 | # NavigateDomain('https://claylacy.com/') 361 | # NavigateDomain('https://marketshareonline.com/') 362 | # NavigateDomain('https://intrepidib.com/') 363 | # NavigateDomain('https://alpertandalpert.com/') 364 | # NavigateDomain('https://yscouts.com/') 365 | # NavigateDomain('https://laneterralever.com/') 366 | # NavigateDomain('https://valorglobal.com/') 367 | 368 | 369 | ''' 370 | 371 | parkstreet.com // success 372 | chassi.com/contact // frame, no-phone 373 | https://www.claylacy.com/contact-us/ // success 374 | https://info.marketshareonline.com/contact // capcha 375 | https://intrepidib.com/contact-us/ // your-phone 376 | https://www.alpertandalpert.com/contact-us.html // no phone 377 | https://yscouts.com/contact/ // no phone 378 | https://valorglobal.com/get-a-quote/ // captcha 379 | https://myfw.com/contact/ // captcha 380 | https://laneterralever.com/ // cookie, popup close, no phone 381 | 382 | firstname 383 | lastname 384 | company 385 | phone 386 | email 387 | ''' 388 | --------------------------------------------------------------------------------