├── navigate
    ├── __init__.py
    ├── __pycache__
    │   ├── ioCSV.cpython-311.pyc
    │   ├── ioCSV.cpython-312.pyc
    │   ├── __init__.cpython-311.pyc
    │   ├── __init__.cpython-312.pyc
    │   ├── navigateDomain.cpython-311.pyc
    │   └── navigateDomain.cpython-312.pyc
    ├── ioCSV.py
    └── navigateDomain.py
├── domains.csv
├── .gitignore
├── anchor.txt
├── main.py
└── README.md


/navigate/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/domains.csv:
--------------------------------------------------------------------------------
1 | parkstreet.com
2 | goguardian.com
3 | laneterralever.com


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | savedata.csv
2 | job description.txt
3 | domains copy.csv
4 | anchor.txt


--------------------------------------------------------------------------------
/navigate/__pycache__/ioCSV.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/ioCSV.cpython-311.pyc


--------------------------------------------------------------------------------
/navigate/__pycache__/ioCSV.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/ioCSV.cpython-312.pyc


--------------------------------------------------------------------------------
/navigate/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/__init__.cpython-311.pyc


--------------------------------------------------------------------------------
/navigate/__pycache__/__init__.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/__init__.cpython-312.pyc


--------------------------------------------------------------------------------
/navigate/__pycache__/navigateDomain.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/navigateDomain.cpython-311.pyc


--------------------------------------------------------------------------------
/navigate/__pycache__/navigateDomain.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/devflash101/scrape-domain-form/HEAD/navigate/__pycache__/navigateDomain.cpython-312.pyc


--------------------------------------------------------------------------------
/anchor.txt:
--------------------------------------------------------------------------------
1 | Demo, Call, Book, Schedule, Consultation, Consult, Appointment, Get Started, Start, Inquire, Learn, Discover, More Info, Find Out, Get a Quote, Talk, Explore, Details, Request, Connect, Get in, Contact


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | from navigate.navigateDomain import NavigateDomain
 2 | from navigate.ioCSV import ReadDomains
 3 | 
 4 | domain_list = ReadDomains('domains.csv')
 5 | 
 6 | for domain in domain_list:
 7 |     # print(domain)
 8 |     domain_url = 'https://' + domain + '/'
 9 |     print(domain_url)
10 |     NavigateDomain(domain_url)
11 |     # print(domain_url)


--------------------------------------------------------------------------------
/navigate/ioCSV.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import csv
 3 | 
 4 | def ReadDomains(file_path):
 5 |     domains_df = pd.read_csv(file_path, header=None)
 6 |     domains_list = domains_df[0].tolist()
 7 |     # print(domains_list)
 8 |     return domains_list
 9 | 
10 | # read_domains('domains.csv')
11 | 
12 | def ExtractData(data):
13 |     
14 |     # Open the CSV file in append mode
15 |     with open('savedata.csv', 'a', newline='') as csvfile:
16 |         # Create a writer object
17 |         csvwriter = csv.writer(csvfile)
18 |         
19 |         # Append the row
20 |         csvwriter.writerow(data)
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Scrape Domain Form Project
 2 | 
 3 | Welcome to the Scrape Domain Form project! This project aims to provide a simple yet powerful tool for scraping information from websites based on domain names.
 4 | 
 5 | ## Overview
 6 | 
 7 | The Scrape Domain Form project offers a user-friendly interface for users to input domain names and retrieve relevant data from those domains. Whether you're conducting market research, competitive analysis, or simply gathering data for your project, this tool makes the process efficient and hassle-free.
 8 | 
 9 | ## Features
10 | 
11 | - **Easy Input**: Users can input domain names effortlessly through a user-friendly form.
12 | - **Scalable**: The tool is designed to handle a large volume of domain names efficiently.
13 | - **Customizable**: Users can specify the type of information they want to scrape from the domains.
14 | - **Data Parsing**: The tool intelligently parses the scraped data for easy analysis and interpretation.
15 | - **Export Options**: Users can export the scraped data in various formats for further analysis or integration into other tools.
16 | 
17 | ## How It Works
18 | 
19 | 1. **Input Domain Name**: Users provide the domain name(s) they want to scrape data from.
20 | 2. **Select Data Type**: Users choose the type of data they're interested in scraping (e.g., email addresses, contact information, product listings).
21 | 3. **Scrape Data**: The tool retrieves the requested information from the provided domain(s).
22 | 4. **Parse Data**: The scraped data is parsed and presented to the user in a readable format.
23 | 5. **Export (Optional)**: Users have the option to export the data for further analysis or integration.
24 | 
25 | ## Technologies Used
26 | 
27 | - **Frontend**: HTML, CSS, JavaScript
28 | - **Backend**: Python, Flask
29 | - **Scraping**: Beautiful Soup, Scrapy
30 | - **Data Parsing**: Python libraries (e.g., Pandas)
31 | - **Database**: SQLite, MongoDB (optional for scalability)
32 | 
33 | ## Get Started
34 | 
35 | To get started with the Scrape Domain Form project, follow these steps:
36 | 
37 | 1. **Clone the Repository**: Clone the project repository to your local machine.
38 | 2. **Install Dependencies**: Ensure you have all the necessary dependencies installed. Refer to the project documentation for details.
39 | 3. **Run the Application**: Launch the application locally and start scraping domain data right away!
40 | 
41 | ## Contributions
42 | 
43 | Contributions to the Scrape Domain Form project are welcome! Whether you want to add new features, improve existing functionality, or fix bugs, your contributions are highly appreciated. Please refer to the contribution guidelines in the project repository for more information.
44 | 
45 | ## Feedback
46 | 
47 | We value your feedback! If you have any suggestions, feature requests, or encounter any issues while using the Scrape Domain Form tool, please don't hesitate to reach out. Your input helps us improve the tool and provide a better experience for all users.
48 | 
49 | ## License
50 | 
51 | The Scrape Domain Form project is licensed under the [MIT License](https://opensource.org/licenses/MIT). Feel free to use, modify, and distribute the code as per the terms of the license.
52 | 


--------------------------------------------------------------------------------
/navigate/navigateDomain.py:
--------------------------------------------------------------------------------
  1 | # get the domain url from main.py
  2 | 
  3 | # navigate the domain using selenium
  4 | #   LeadGeneration()
  5 | #   navigate subpages within the domain
  6 | #       LeadGeneration()
  7 | #               
  8 | 
  9 | # dev LeadGeneration():
 10 | #   if form_exist():
 11 | #       fill_form(), extract_data()
 12 | #   else return False
 13 | 
 14 | # def form_exist():
 15 | #   ...
 16 | #   check phone number
 17 | #   check captcha
 18 | from re import sub
 19 | from selenium import webdriver
 20 | from selenium.webdriver.common.by import By
 21 | from selenium.webdriver.chrome.service import Service
 22 | from selenium.webdriver.support.ui import WebDriverWait
 23 | from selenium.webdriver.support import expected_conditions as EC
 24 | from selenium.webdriver.common.action_chains import ActionChains
 25 | import time
 26 | from urllib.parse import urlparse
 27 | from datetime import datetime
 28 | from .ioCSV import ExtractData
 29 | 
 30 | anchor_words = ['Demo', 'Call', 'Book', 'Schedule', 'Consultation', 'Consult', 'Appointment', 'Get Started', 'Start', 'Inquire', 'Learn', 'Discover', 'More Info', 'Find Out', 'Get a Quote', 'Talk',
 31 |                 'Explore', 'Details', 'Request', 'Connect', 'Get in', 'Contact']
 32 | 
 33 | # anchor_words = ['Contact']
 34 | 
 35 | form_fields = ['firstname', 'First Name', 'Last Name', 'lastname', 'name', 'email', 'company', 'company_website', 'phone', 'contact_reason', 'message']
 36 | 
 37 | phone_fields = ['phone', 'your-phone', 'phone number', 'mobile phone']
 38 | 
 39 | def FillForm(driver, anchor_text, phone_require):
 40 |     data = []
 41 | 
 42 |     # keep domain url.
 43 |     current_url = driver.current_url
 44 |     parsed_url = urlparse(current_url)
 45 |     data.append('domain: ' + parsed_url.netloc)
 46 |     # keep date filled out form
 47 |     today = datetime.today()
 48 |     data.append('date: ' + today.strftime("%Y-%m-%d"))  # This formats the date as YYYY-MM-DD
 49 |     # keep time filled out form
 50 |     now = datetime.now()
 51 |     data.append('time: '+ now.strftime("%H:%M:%S"))
 52 | 
 53 |     data.append('anchor text: ' + anchor_text)
 54 | 
 55 |     if phone_require == 2:
 56 |         data.append('asterisk found')
 57 |     else:
 58 |         data.append('asterisk not found')
 59 | 
 60 |     for field in form_fields:
 61 |         try:
 62 |             form_element = driver.find_element(By.NAME, field)
 63 |             if form_element:
 64 |                 form_element.send_keys('John Doe')
 65 |         except:
 66 |             pass
 67 |         
 68 |     xpath_expression = f"//input[@type='submit']"
 69 |     submit_button = driver.find_element(By.XPATH, xpath_expression)
 70 |     try:
 71 |         if submit_button:
 72 |             submit_button.click()
 73 |     except:
 74 |         pass
 75 | 
 76 | 
 77 |     return data
 78 | 
 79 | def FindForm(driver): # check whether form exist in the frame.
 80 |     ret = 0
 81 |     is_phone = 0
 82 | 
 83 |     # check if form exist
 84 |     for field in form_fields:
 85 |         xpath_expression = f"//input[@name='{field}']"
 86 |         xpath_expression1 = f"//input[contains(@placeholder, '{field}')]"
 87 |         try:
 88 |             form_element = driver.find_element(By.XPATH, xpath_expression)
 89 |             if form_element:
 90 |                 ret = 1
 91 |         except:
 92 |             pass
 93 |         try:
 94 |             form_element1 = driver.find_element(By.XPATH, xpath_expression1)
 95 |             if form_element1:
 96 |                 ret = 1
 97 |                 print('  - found with placeholder')
 98 |         except:
 99 |             pass
100 | 
101 |     # check phone field exist
102 |     for field in phone_fields:
103 |         xpath_expression = f"//input[@name='{field}']"
104 |         try:
105 |             phone_field = driver.find_element(By.XPATH, xpath_expression)
106 |             if phone_field and phone_field.is_displayed():
107 |                 # name_field.send_keys('John Doe')
108 |                 is_phone = 1
109 |                 # check for asterisk
110 |                 placeholder_text = phone_field.get_attribute('placeholder')
111 |                 if '*' in placeholder_text:
112 |                     print('The placeholder contains an asterisk.')
113 |                     is_phone = 2
114 |                 else:
115 |                     print('The placeholder does not contain an asterisk.')
116 | 
117 |                 asterisk_expression = f"//label[.//span[contains(., '{field}')]]"
118 |                 try:
119 |                     phone_label = driver.find_element(By.XPATH, asterisk_expression)
120 |                     if phone_label:
121 |                         label_html = phone_label.get_attribute('innerHTML')
122 |                         # print('Full Name label found.')
123 |                         if '*' in label_html:
124 |                             is_phone = 2
125 |                             print('An asterisk is present within the label.')
126 |                         else:
127 |                             print("No asterisk found in label:")
128 |                             # Do something if no asterisk is found
129 |                 except:
130 |                     pass
131 |         except:
132 |             # print('failed')
133 |             pass
134 |     
135 |     if ret == 1:
136 |         print('   - field found')
137 |         if is_phone == 0:
138 |             print('   - no phone field.')
139 |             ret = -1
140 |         if is_phone == 2:
141 |             print('   - phone field is required')
142 |             ret = 2
143 |             pass
144 |     # ret &= is_phone
145 | 
146 |     # check if there's captcha break
147 |     try:
148 |         captcha_element = driver.find_element(By.CLASS_NAME, 'grecaptcha-badge')
149 |         if captcha_element and captcha_element.get_attribute('data-style') == 'inline':
150 |             print('   - captcha detected')
151 |             ret = -2
152 |     except:
153 |         pass
154 | 
155 |     print('return the value when FindForm', ret)
156 | 
157 |     return ret
158 | 
159 | def LeadGeneration(driver, anchor_text): # check whether form exist in the page.
160 |     
161 |     # chrome_options = webdriver.ChromeOptions()
162 |     # driver = webdriver.Chrome(options = chrome_options)
163 |     # driver.get(url)
164 |     print('LeadGeneration', anchor_text)
165 | 
166 |     form_exist = FindForm(driver)
167 | 
168 |     try:
169 |         if form_exist > 0:
170 |             data = FillForm(driver, anchor_text, form_exist)
171 |             ExtractData(data)
172 |         else:
173 |             print('form does not exist in the page', anchor_text, 'so looking into frames.')
174 |             frames = driver.find_elements(By.TAG_NAME, 'iframe')
175 |             # print('checking.......')
176 |             for index, frame in enumerate(frames):
177 |                 # Switch to each frame by index
178 |                 print(' - looking for an iframe..')
179 |                 if not frame.is_displayed():
180 |                     print("form skipped due to it's not displayed.")
181 |                     continue
182 |             
183 |                 actions = ActionChains(driver)
184 |                 actions.move_to_element(frame).perform()
185 | 
186 |                 src_value = frame.get_attribute('src')
187 |                 if "video" in src_value:
188 |                     print(' - src value continas "video"')
189 |                     continue
190 | 
191 |                 driver.switch_to.frame(index)
192 | 
193 |                 # Perform operations within the frame
194 |                 # ...
195 |                 form_exist = FindForm(driver)
196 |                 if form_exist > 0:
197 |                     data = FillForm(driver, anchor_text, form_exist)
198 |                     ExtractData(data)
199 |                     driver.switch_to.default_content()
200 |                     break
201 | 
202 |                 # Switch back to the default content before moving to the next frame
203 |                 driver.switch_to.default_content()
204 | 
205 |         if form_exist > 0:
206 |             print(' - form filled')
207 |         else:
208 |             print(' - this page was skipped')
209 |     except Exception as e:
210 |         print(e)
211 |         # pass
212 |     return form_exist
213 | 
214 | def NavigateDomain(url):
215 |     chrome_options = webdriver.ChromeOptions()
216 |     chrome_options.add_argument("--start-maximized")
217 |     #chrome_options.page_load_strategy = self.load_mode
218 |     driver = webdriver.Chrome(options = chrome_options)
219 |     # chrome_options.add_argument('--ignore-certificate-errors')
220 |     # chrome_options.add_argument('--allow-running-insecure-content')   
221 | 
222 |     try:
223 |         driver.get(url)
224 | 
225 |         # site returns ERROR
226 |         try:
227 |             title_element = driver.find_element(By.XPATH, "//title")
228 |             if title_element:
229 |                 title_text = title_element.get_attribute('textContent')
230 |                 if "ERROR" in title_text:
231 |                     data = []                
232 |                     current_url = driver.current_url
233 |                     parsed_url = urlparse(current_url)
234 |                     data.append('domain: ' + parsed_url.netloc)
235 |                     
236 |                     try:
237 |                         h1_xpath = "/html/body/h1"
238 |                         h1_element = driver.find_element(By.XPATH, h1_xpath)
239 |                         h1_text = h1_element.text
240 |                         data.append(' skipped because ' + h1_text)
241 |                     except:
242 |                         data.append(' skipped because Page ERROR')
243 |                     
244 |                     ExtractData(data)
245 |                     return
246 |         except:
247 |             pass
248 | 
249 |         # close recommandation system
250 |         try:
251 |             # This XPath finds any button element whose class attribute contains the word 'close'
252 |             close_button = driver.find_element(By.XPATH, "//button[contains(@aria-label, 'Dismiss')]")
253 |             if close_button:
254 |                 print('#close button found')
255 |                 close_button.click()
256 |                 print("#Close button clicked.")
257 |         except Exception as e:
258 |             print("#Close button not clicked.")
259 | 
260 | 
261 | 
262 |         # Accept cookies button
263 |         button_expression = f"//button[contains(text(), 'Accept')]"
264 |         try:
265 |             accept_link = driver.find_element(By.XPATH, button_expression)
266 |             if accept_link:
267 |                 print('@accept button clicked')
268 |                 accept_link.click()
269 |         except:
270 |             pass
271 | 
272 |         # Accept cookies <a>
273 |         button_expression = f"//a[contains(text(), 'Accept')]"
274 |         try:
275 |             accept_link = driver.find_element(By.XPATH, button_expression)
276 |             if accept_link:
277 |                 print('@accept anchor clicked')
278 |                 accept_link.click()
279 |         except:
280 |             pass
281 | 
282 |         # close popup dlg.
283 |         # Find the button by class name containing 'close' and click it
284 |         # This XPath finds any button element whose class attribute contains the word 'close'
285 |         close_buttons = driver.find_elements(By.XPATH, "//button[contains(@aria-label, 'close popup')]")
286 |         if close_buttons:
287 |             print('&close button found')
288 |             for close_button in close_buttons:
289 |                 # close_buttons[1].click()
290 |                 try:
291 |                     close_button.click()
292 |                 except Exception as e:
293 |                     # print("Close button not clicked.")
294 |                     pass
295 | 
296 |         
297 |         form_filled = LeadGeneration(driver, 'hompage')
298 |         if not form_filled > 0:
299 |             for word in anchor_words:
300 |                 if form_filled > 0:
301 |                     break
302 |                 print(word)
303 |                 # within <span>
304 |                 xpath_expression = f"//a[.//span[contains(text(), '{word}')]]"
305 |                 try:
306 |                     anchor_link = driver.find_element(By.XPATH, xpath_expression)
307 | 
308 |                     if anchor_link:
309 |                         anchor_link.click()
310 |                         form_filled = LeadGeneration(driver, word)
311 |                         driver.back()
312 |                 except:
313 |                     pass
314 | 
315 |                 # without <span>
316 |                 xpath_expression = f"//a[contains(text(), '{word}')]"
317 |                 try:
318 |                     anchor_links = driver.find_elements(By.XPATH, xpath_expression)
319 | 
320 |                     for anchor_link in anchor_links:
321 |                         if not anchor_link.is_displayed():
322 |                             continue
323 |                         print('**anchor: ' + word)
324 |                         anchor_link.click()
325 |                         print('**anchor ', word, ' clicked')
326 |                         form_filled = LeadGeneration(driver, word)
327 |                         driver.back()
328 |                 except Exception as e:
329 |                     print(e)
330 |                     # pass
331 |             if not form_filled > 0:
332 |                 data = []
333 |                 if form_filled == 0:
334 |                     data.append('domain: ' + url + ' skipped because no form found')
335 |                 elif form_filled == -2:
336 |                     data.append('domain: ' + url + ' skipped because CAPTCHA detected.')
337 |                 else:
338 |                     data.append('domain: ' + url + ' skipped because no phone field found')
339 | 
340 |                 ExtractData(data)
341 | 
342 |         time.sleep(5)
343 |         # while True:
344 |         #     pass
345 |     except Exception as e:
346 |         # print(e)
347 |         pass
348 |     driver.quit()
349 | 
350 | 
351 | 
352 | # NavigateDomain('https://adstage.io/')
353 | # NavigateDomain('https://capeanalytics.com/contact/')
354 | # NavigateDomain('https://dotloop.com/')
355 | # NavigateDomain('https://homelight.com/')
356 | # NavigateDomain('https://indinero.com/')
357 | # NavigateDomain('https://jyve.com/')
358 | # NavigateDomain('https://parkstreet.com/')
359 | # NavigateDomain('https://www.goguardian.com/')
360 | # NavigateDomain('https://claylacy.com/')
361 | # NavigateDomain('https://marketshareonline.com/')
362 | # NavigateDomain('https://intrepidib.com/')
363 | # NavigateDomain('https://alpertandalpert.com/')
364 | # NavigateDomain('https://yscouts.com/')
365 | # NavigateDomain('https://laneterralever.com/')
366 | # NavigateDomain('https://valorglobal.com/')
367 | 
368 | 
369 | '''
370 | 
371 | parkstreet.com                  // success
372 | chassi.com/contact              // frame, no-phone
373 | https://www.claylacy.com/contact-us/        // success
374 | https://info.marketshareonline.com/contact  // capcha
375 | https://intrepidib.com/contact-us/          // your-phone
376 | https://www.alpertandalpert.com/contact-us.html // no phone
377 | https://yscouts.com/contact/            // no phone
378 | https://valorglobal.com/get-a-quote/        // captcha
379 | https://myfw.com/contact/               // captcha
380 | https://laneterralever.com/             // cookie, popup close, no phone
381 | 
382 | firstname
383 | lastname
384 | company
385 | phone
386 | email
387 | '''
388 | 


--------------------------------------------------------------------------------