├── DKB_Bank_login.py ├── EtalongroupRU ├── README.md ├── __init__.py ├── __pycache__ │ └── logger.cpython-311.pyc ├── data.json ├── debug.log ├── etalongroup_ru.py ├── helper.py └── logger.py ├── InstagramAPI ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── credentials.cpython-310.pyc │ └── instagram_baseline.cpython-310.pyc ├── credentials.py ├── hashtag_search.py └── instagram_baseline.py ├── LICENSE ├── README.md ├── alnair_ae.py ├── app_powerbi_com.py ├── app_powerbi_com_anp.gif ├── app_powerbi_com_anp.py ├── audible_com.py ├── autotrader_co_uk.py ├── baseball_scraper ├── baseball_scraper │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-311.pyc │ │ └── settings.cpython-311.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-311.pyc │ │ └── players.cpython-311.pyc │ │ └── players.py ├── players.json └── scrapy.cfg ├── click_checkbox_whirlpool.py ├── coroners_nsw_gov_download_multiple_pdf.py ├── destinytracker_com.py ├── eex_com.py ├── egle_state_mi.py ├── element_not_visible_to_click.py ├── fb_login_with_popup_alert_disabled.py ├── find_chrome_version.py ├── find_masa_com.py ├── flicker_scroll.py ├── google_com_finance.py ├── google_finance.gif ├── hong_kong_observatory_climate.py ├── imdb_com.py ├── indeed_com.py ├── jodidb_org.py ├── join_team_meeting.gif ├── knowde_com.py ├── lebara_nl.py ├── lidl_GB.py ├── load_cookies_to_accept_all.py ├── ma_shienkikan.py ├── medicus_online_at.py ├── mercedes-benz.py ├── mydealz_de.py ├── nested_shadow_root.py ├── nse_india.py ├── nse_india_2.py ├── oddsportal_com.py ├── pump_fun.py ├── quiker_com.py ├── scrape_bluechip_io.py ├── scrape_www_knx_org.py ├── scroll_down.py ├── scroll_to_bottom.py ├── sel_pagination_excercise.py ├── select_element_by_tag_text.py ├── selenium_action_move_by_offset.py ├── selenium_baseline.py ├── selenium_chrome_profile.py ├── selenium_file_download.py ├── selenium_get_attribute.py ├── selenium_get_parent_element.py ├── selenium_hover_click.py ├── selenium_hover_click_text.py ├── selenium_iframe_excercise.py ├── selenium_iframe_excercise_2.py ├── selenium_iframe_excercise_3.py ├── selenium_iframe_excercise_linkedin.py ├── selenium_nth_css_selector.py ├── selenium_ok_alert.py ├── selenium_options.py ├── selenium_partial_class_name.py ├── selenium_scrap_transcript.py ├── selenium_scrape_youtube_channel.py ├── selenium_scrape_youtube_search.py ├── selenium_select_tag_dropdown.py ├── selenium_send_keys _excercise.py ├── selenium_shadow_open_excercise.py ├── selenium_shadow_root.py ├── selenium_switches.json ├── selenium_take_screenshot.py ├── selenium_twitter_login.py ├── selenium_work_shadow_closed.pyi ├── selenium_workday_login.py ├── shein_com.py ├── stackoverflow_login_and_save_cookies.py ├── stackoverflow_login_with_cookies.py ├── stackoverflow_track.py ├── store_pagination_element_to_click.py ├── sustainalytics_com.py ├── switching_bw_windows.py ├── switching_bw_windows_excercise_2.py ├── text_option_under_select.py ├── the_line_cl.py ├── tiktok_com.py ├── tiktok_com_video_post.py ├── tiktok_video_post.gif ├── transat_com.py ├── twitter_login.py ├── usa_visa_com.py ├── wallet_polygon_technology.py ├── wallet_sendit_arcana_network.py ├── yomiuri_co_jp.py └── youtube_channel_all_videos.py /DKB_Bank_login.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : DKB Bank Login 3 | Author : Ajeet 4 | Date : July 24, 2023 5 | """ 6 | 7 | # 
Import required modules 8 | import time 9 | from selenium.webdriver import Chrome,Keys 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | import selenium.webdriver.support.expected_conditions as EC 13 | from selenium.webdriver.common.action_chains import ActionChains 14 | 15 | def main(): 16 | # Initialize Chrome WebDriver 17 | driver = Chrome() 18 | 19 | # Open the DKB login page 20 | driver.get("https://banking.dkb.de/login") 21 | 22 | # Set up WebDriverWait with a timeout of 10 seconds 23 | wait = WebDriverWait(driver, 10) 24 | 25 | # Switch to the iframe and refuse all cookies 26 | # The website may display a cookie consent popup within an iframe. 27 | iframe = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'iframe#privacy-iframe'))) 28 | driver.switch_to.frame(iframe) 29 | driver.find_element(By.CSS_SELECTOR, 'button.btn.refuse-all').click() 30 | 31 | # After refusing cookies, go back to the main page (DKB login page) 32 | driver.get("https://banking.dkb.de/login") 33 | 34 | # Initialize ActionChains to perform actions like mouse movements and keystrokes 35 | actions = ActionChains(driver) 36 | 37 | # Logging in with provided credentials 38 | # Find the username input field, click on it, and enter the username '123456789' 39 | wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[data-testid="sui-input-username"]'))).click() 40 | username = driver.find_element(By.CSS_SELECTOR, 'input#username') 41 | actions.move_to_element(username).send_keys('123456789').perform() 42 | 43 | # Find the password input field, click on it, and enter the password 'abcdefg' 44 | wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[data-testid="sui-input-password"]'))).click() 45 | password = driver.find_element(By.CSS_SELECTOR, 'input#password') 46 | actions.move_to_element(password).send_keys('abcdefg').perform() 47 | 48 | # Press the Enter key to submit the login form 49 | password.send_keys(Keys.ENTER) 50 | 51 | # Wait for 2 seconds (to allow the page to load or perform further actions) 52 | time.sleep(2) 53 | 54 | # Call the main function to start the script 55 | main() 56 | 57 | """ 58 | reference: 59 | https://stackoverflow.com/questions/76749285/i-cannot-send-keys-because-element-not-interactable-in-selenium-web-automation 60 | """ 61 | -------------------------------------------------------------------------------- /EtalongroupRU/README.md: -------------------------------------------------------------------------------- 1 | ## Overview : 2 | This script scrapes apartment details from the Voxhall property page using Selenium and BeautifulSoup. 3 | ## Usage 4 | 5 | ### Command-Line Arguments 6 | - `--file`: Path to the output JSON file (optional). 
7 | 8 | ### Examples 9 | 10 | #### Extract Data and Print to Console 11 | ```bash 12 | python etalongroup_ru.py 13 | ``` 14 | 15 | #### Extract Data and Write to a JSON File 16 | ```bash 17 | python etalongroup_ru.py --file "F:\automation\EtalongroupRU\data.json" 18 | ``` 19 | ## Stackoverflow link : 20 | 21 | [reference](https://stackoverflow.com/a/79368954/11179336) 22 | -------------------------------------------------------------------------------- /EtalongroupRU/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: VermaAK 3 | Created: 1/19/2025 4 | Description: 5 | Project: automation 6 | """ 7 | -------------------------------------------------------------------------------- /EtalongroupRU/__pycache__/logger.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/EtalongroupRU/__pycache__/logger.cpython-311.pyc -------------------------------------------------------------------------------- /EtalongroupRU/data.json: -------------------------------------------------------------------------------- 1 | [{"link": "https://etalongroup.ru//msk/choose/92334/", "price": "20 519 852 ₽ ", "title": "Студия № 197", "area": "26.0 м²", "floor": "16 этаж"}, {"link": "https://etalongroup.ru//msk/choose/92437/", "price": "20 726 234 ₽ ", "title": "Студия № 37", "area": "25.4 м²", "floor": "4 этаж"}, {"link": "https://etalongroup.ru//msk/choose/92445/", "price": "20 976 711 ₽ ", "title": "Студия № 44", "area": "26.0 м²", "floor": "5 этаж"}, {"link": "https://etalongroup.ru//msk/choose/92453/", "price": "20 994 562 ₽ ", "title": "Студия № 51", "area": "25.7 м²", "floor": "5 этаж"}, {"link": "https://etalongroup.ru//msk/choose/92483/", "price": "21 039 082 ₽ ", "title": "Студия № 79", "area": "25.7 м²", "floor": "7 этаж"}, {"link": "https://etalongroup.ru//msk/choose/92255/", "price": "21 835 647 ₽ ", "title": "Студия № 125", "area": "25.8 м²", "floor": "10 этаж"}] -------------------------------------------------------------------------------- /EtalongroupRU/debug.log: -------------------------------------------------------------------------------- 1 | 2025-01-19 23:21:31,519 - INFO - Configuring WebDriver... 2 | 2025-01-19 23:21:33,254 - INFO - Fetching page content... 3 | 2025-01-19 23:21:55,688 - INFO - Closing WebDriver... 4 | 2025-01-19 23:21:57,936 - INFO - Parsing apartments data... 5 | 2025-01-19 23:21:57,967 - INFO - New F:/automation/EtalongroupRU/data.json has been created with the data. 6 | 2025-01-19 23:22:29,908 - INFO - Configuring WebDriver... 7 | 2025-01-19 23:22:31,609 - INFO - Fetching page content... 8 | 2025-01-19 23:22:50,340 - INFO - Closing WebDriver... 9 | 2025-01-19 23:22:52,628 - INFO - Parsing apartments data...
10 | 2025-01-19 23:22:52,659 - INFO - Scraped Data: 11 | 2025-01-19 23:22:52,659 - INFO - [{'link': 'https://etalongroup.ru//msk/choose/92334/', 'price': '20 519 852 ₽ ', 'title': 'Студия № 197', 'area': '26.0 м²', 'floor': '16 этаж'}, {'link': 'https://etalongroup.ru//msk/choose/92437/', 'price': '20 726 234 ₽ ', 'title': 'Студия № 37', 'area': '25.4 м²', 'floor': '4 этаж'}, {'link': 'https://etalongroup.ru//msk/choose/92445/', 'price': '20 976 711 ₽ ', 'title': 'Студия № 44', 'area': '26.0 м²', 'floor': '5 этаж'}, {'link': 'https://etalongroup.ru//msk/choose/92453/', 'price': '20 994 562 ₽ ', 'title': 'Студия № 51', 'area': '25.7 м²', 'floor': '5 этаж'}, {'link': 'https://etalongroup.ru//msk/choose/92483/', 'price': '21 039 082 ₽ ', 'title': 'Студия № 79', 'area': '25.7 м²', 'floor': '7 этаж'}, {'link': 'https://etalongroup.ru//msk/choose/92255/', 'price': '21 835 647 ₽ ', 'title': 'Студия № 125', 'area': '25.8 м²', 'floor': '10 этаж'}] 12 | 2025-01-19 23:26:49,457 - INFO - Configuring WebDriver... 13 | 2025-01-19 23:26:51,251 - INFO - Fetching page content... 14 | 2025-01-19 23:27:25,906 - INFO - Closing WebDriver... 15 | 2025-01-19 23:27:28,232 - INFO - Parsing apartments data... 16 | 2025-01-19 23:27:28,311 - INFO - New F:/automation/EtalongroupRU/data.json has been created with the data. 17 | -------------------------------------------------------------------------------- /EtalongroupRU/etalongroup_ru.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Ajeet 3 | Created: 1/19/2025 4 | Description: This script scrapes apartment details from the Voxhall property page using Selenium and BeautifulSoup. 5 | Project: automation 6 | """ 7 | import time 8 | import argparse 9 | from bs4 import BeautifulSoup 10 | from selenium import webdriver 11 | from selenium.webdriver.chrome.options import Options 12 | from selenium.webdriver.common.by import By 13 | from typing import List, Dict 14 | 15 | from logger import logger 16 | from helper import save_file 17 | 18 | 19 | def configure_webdriver(headless: bool = True) -> webdriver.Chrome: 20 | """Configures and initializes the Selenium WebDriver.""" 21 | options = Options() 22 | if headless: 23 | options.add_argument('--headless') 24 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 25 | options.add_experimental_option("useAutomationExtension", False) 26 | return webdriver.Chrome(options=options) 27 | 28 | 29 | def fetch_page_content(driver: webdriver.Chrome, url: str, selector: str, wait_time: int = 2) -> str: 30 | """ 31 | Fetches the HTML content of a specific container on a web page. 32 | 33 | This function navigates to the provided URL, waits for the page to load, and retrieves the HTML content 34 | of the specified container identified by the CSS selector. 35 | 36 | Args: 37 | driver (webdriver.Chrome): The Selenium WebDriver instance to control the browser. 38 | url (str): The URL of the page to fetch. 39 | selector (str): The CSS selector to identify the container element whose content is to be fetched. 40 | wait_time (int, optional): The time (in seconds) to wait for the page to load before fetching content. Defaults to 2 seconds. 41 | 42 | Returns: 43 | str: The HTML content of the specified container. 
44 | """ 45 | # Navigate to the provided URL 46 | driver.get(url) 47 | 48 | # Wait for the page to load fully (with a default wait time) 49 | time.sleep(wait_time) 50 | 51 | # Find the container element using the provided CSS selector and retrieve its inner HTML 52 | container = driver.find_element(By.CSS_SELECTOR, selector).get_attribute('innerHTML') 53 | 54 | # Return the HTML content of the container 55 | return container 56 | 57 | 58 | def parse_apartments(html_content: str) -> List[Dict[str, str]]: 59 | """ 60 | Parses apartment data from the provided HTML content. 61 | 62 | This function extracts the apartment details such as the link, price, title, area, and floor from the given 63 | HTML content of a real estate page. It uses BeautifulSoup to parse the HTML and collects relevant information. 64 | 65 | Args: 66 | html_content (str): The HTML content of the page to parse. 67 | 68 | Returns: 69 | List[Dict[str, str]]: A list of dictionaries, each containing the details of an apartment (link, price, title, area, floor). 70 | """ 71 | # Parse the HTML content using BeautifulSoup 72 | soup = BeautifulSoup(html_content, 'html.parser') 73 | 74 | # Initialize an empty list to store the apartments' details 75 | apartments = [] 76 | 77 | # Find all the apartment containers on the page 78 | result_container = soup.find_all('div', class_="bg-white relative") 79 | 80 | # Loop through each apartment container to extract the required data 81 | for result in result_container: 82 | # Find the anchor tag that leads to the apartment's page 83 | root = result.find_next('a') 84 | 85 | # Extract area and floor information from the text in the corresponding span 86 | area_floor = root.select_one('section.flex.flex-col.gap-2>span.th-b1-regular').text.split(' | ') 87 | 88 | # Append the apartment's details as a dictionary to the apartments list 89 | apartments.append({ 90 | "link": f"https://etalongroup.ru/{root['href']}", 91 | "price": root.select_one('span.th-h2').text, 92 | "title": root.select_one('span.th-h4').text, 93 | "area": area_floor[0], 94 | "floor": area_floor[1] 95 | }) 96 | 97 | # Return the list of apartments with extracted details 98 | return apartments 99 | 100 | 101 | def main(): 102 | 103 | parser = argparse.ArgumentParser( 104 | description='A script scrapes apartment details from the Voxhall property page and write results to an JSON file.' 
105 | ) 106 | parser.add_argument('--file', type=str, help='Path to the output JSON file', default=None) 107 | args = parser.parse_args() 108 | 109 | 110 | url = 'https://etalongroup.ru/msk/object/voxhall/' 111 | container_selector = '#card-object>div' 112 | 113 | logger.info("Configuring WebDriver...") 114 | driver = configure_webdriver() 115 | 116 | logger.info("Fetching page content...") 117 | html_content = fetch_page_content(driver, url, container_selector) 118 | 119 | logger.info("Closing WebDriver...") 120 | driver.quit() 121 | 122 | logger.info("Parsing apartments data...") 123 | apartments = parse_apartments(html_content) 124 | 125 | if args.file: 126 | save_file(args.file, apartments) 127 | else: 128 | logger.info("Scraped Data:") 129 | logger.info(apartments) 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /EtalongroupRU/helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Ajeet 3 | Created: 1/19/2025 4 | Description: Helper utilities for saving the scraped apartment data to a JSON file. 5 | Project: automation 6 | """ 7 | import os 8 | import json 9 | from typing import List, Dict 10 | from logger import logger 11 | 12 | 13 | def save_file(path: str, data: List) -> None: 14 | """ 15 | Saves the provided data to a file at the specified path. 16 | If the file already exists, it is deleted before saving the new data. 17 | 18 | Args: 19 | path (str): The file path where the data will be saved. 20 | data (List): The data to be saved in JSON format. 21 | 22 | Returns: 23 | None: The function performs an action (saving a file) and does not return a value. 24 | 25 | Side Effects: 26 | - If the file exists at the specified path, it is removed before saving the new data. 27 | - A log message is generated after successfully saving the data. 28 | """ 29 | 30 | # Check if the file exists 31 | if os.path.exists(path): 32 | # If the file exists, delete it 33 | os.remove(path) 34 | 35 | # Open the file in write mode and save the data in JSON format 36 | with open(path, 'w', encoding='utf-8') as file: 37 | json.dump(data, file, ensure_ascii=False) 38 | 39 | # Log a message indicating the file was successfully created 40 | logger.info(f"New {path} has been created with the data.") 41 | -------------------------------------------------------------------------------- /EtalongroupRU/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Ajeet 3 | Created: 1/19/2025 4 | Description: Logging configuration for the Voxhall scraper.
5 | Project: automation 6 | """ 7 | import logging 8 | import sys 9 | 10 | # Create a logger 11 | logger = logging.getLogger("Voxhall") 12 | logger.setLevel(logging.INFO) 13 | 14 | # Formatter for consistent log format 15 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 16 | 17 | # File handler 18 | file_handler = logging.FileHandler("F:/automation/EtalongroupRU/debug.log", encoding="utf-8") 19 | file_handler.setFormatter(formatter) 20 | 21 | # Stream handler 22 | stream_handler = logging.StreamHandler(sys.stdout) 23 | stream_handler.setFormatter(formatter) 24 | 25 | # Add handlers to the logger 26 | if not logger.handlers: # Prevent adding handlers multiple times 27 | logger.addHandler(file_handler) 28 | logger.addHandler(stream_handler) 29 | -------------------------------------------------------------------------------- /InstagramAPI/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 9, 2023 5 | """ 6 | 7 | 8 | def print_hi(name): 9 | print(f'Hi, {name}') 10 | 11 | 12 | if __name__ == '__main__': 13 | print_hi('Python') 14 | -------------------------------------------------------------------------------- /InstagramAPI/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/InstagramAPI/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /InstagramAPI/__pycache__/credentials.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/InstagramAPI/__pycache__/credentials.cpython-310.pyc -------------------------------------------------------------------------------- /InstagramAPI/__pycache__/instagram_baseline.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/InstagramAPI/__pycache__/instagram_baseline.cpython-310.pyc -------------------------------------------------------------------------------- /InstagramAPI/credentials.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 20, 2023 5 | """ 6 | 7 | import os 8 | 9 | creds = {'instagram_username': os.environ.get('instagram_username'), 10 | 'instagram_password': os.environ.get('instagram_password'), 11 | } 12 | -------------------------------------------------------------------------------- /InstagramAPI/hashtag_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 20, 2023 5 | """ 6 | import time 7 | from typing import List, Dict 8 | from selenium.webdriver.common.by import By 9 | from InstagramAPI.instagram_baseline import Instagram 10 | from selenium.webdriver.support.wait import WebDriverWait 11 | import selenium.webdriver.support.expected_conditions as EC 12 | 13 | 14 | def hashtag(browser, tag: str) -> List[Dict]: 15 | data = [] 16 | url = f'https://www.instagram.com/explore/tags/{tag}/' 17 | browser.get(url) 18 | container = WebDriverWait(browser, 
10).until(EC.presence_of_element_located((By.TAG_NAME, 'article'))) 19 | 20 | for _ in range(3): 21 | browser.execute_script('window.scrollBy(0, 5000);') 22 | time.sleep(1) 23 | 24 | images = container.find_elements(By.TAG_NAME, 'img') 25 | for image in images: 26 | data.append({ 27 | "description": image.get_attribute('alt'), 28 | "image_link": image.get_attribute('src') 29 | }) 30 | 31 | return data 32 | 33 | 34 | if __name__ == '__main__': 35 | obj = Instagram() 36 | driver = obj.load_cookies("D:\\automation\InstagramAPI\instgram_cookies.json") 37 | print(hashtag(driver, 'tree')) 38 | -------------------------------------------------------------------------------- /InstagramAPI/instagram_baseline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Instagram 3 | Author : Ajeet 4 | Date : June 20, 2023 5 | """ 6 | import time 7 | import json 8 | from typing import Optional 9 | 10 | from selenium import webdriver 11 | from selenium.webdriver import ChromeOptions, Keys 12 | from selenium.webdriver.common.by import By 13 | from selenium.webdriver.support.wait import WebDriverWait 14 | import selenium.webdriver.support.expected_conditions as EC 15 | from selenium.webdriver.chrome.webdriver import WebDriver 16 | 17 | from credentials import creds 18 | 19 | 20 | class Instagram: 21 | 22 | def __init__(self): 23 | options = ChromeOptions() 24 | options.add_argument("--start-maximized") 25 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 26 | 27 | self.driver = webdriver.Chrome(options=options) 28 | self.wait = WebDriverWait(self.driver, 10) 29 | 30 | def login(self, username, password): 31 | self.driver.get("https://www.instagram.com/") 32 | # username 33 | self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="username"]'))).send_keys(username) 34 | # password 35 | self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="password"]'))).send_keys(password+Keys.ENTER) 36 | # click on "Not Now" to close "Save Your Login Info?" 
37 | self.wait.until(EC.visibility_of_element_located((By.XPATH, '//div[text()="Not Now"]'))).click() 38 | 39 | def save_cookies(self, username: str, password: str, path: str) -> None: 40 | self.login(username, password) 41 | json_object = json.dumps(self.driver.get_cookies()) 42 | 43 | # Writing to instagram_cookies.json 44 | with open(path, "w") as outfile: 45 | outfile.write(json_object) 46 | 47 | def load_cookies(self, path: str) -> WebDriver: 48 | self.driver.get("https://www.instagram.com/") 49 | 50 | # Opening JSON file 51 | f = open(path) 52 | cookies = json.load(f) 53 | # load cookies to the driver 54 | for cookie in cookies: 55 | self.driver.add_cookie(cookie) 56 | 57 | time.sleep(1) 58 | # refresh the browser 59 | self.driver.refresh() 60 | time.sleep(1) 61 | self.wait.until(EC.visibility_of_element_located((By.XPATH, '//button[text()="Not Now"]'))).click() 62 | time.sleep(1) 63 | 64 | return self.driver 65 | 66 | 67 | if __name__ == '__main__': 68 | obj = Instagram() 69 | # obj.login(creds['instagram_username'], creds['instagram_password']) 70 | # obj.save_cookies(creds['instagram_username'], creds['instagram_password'], 'D:\\automation\InstagramAPI\instgram_cookies.json') 71 | # obj.load_cookies('D:\\automation\InstagramAPI\instgram_cookies.json') 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stackoverflow Exercises 2 | 3 | ## Overview : 4 | This repository contains a collection of real-world examples demonstrating web scraping using Python with Selenium. 5 | Most of these scripts were created to assist the community on Stack Overflow by providing fully functional solutions to their questions. 6 | 7 | ## Script Naming Convention : 8 | The scripts are named based on the websites they target for scraping. 9 | For instance, a script designed to scrape data from `https://www.abcd.co.ef/editorial/` is named `abcd_co_ef.py`. 10 | This naming convention makes it easy to identify the source website for each script. 11 | 12 | ## Stack Overflow References : 13 | The scripts also include references to the corresponding Stack Overflow questions. 14 | This allows you to easily access the original discussions and gain background knowledge about the problems being addressed. 
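As a quick, hypothetical illustration of this convention (the helper below is not part of the repository; the function name and the example URL are made up), the mapping from a target URL to a script name can be sketched in Python:

```python
from urllib.parse import urlparse

def url_to_script_name(url: str) -> str:
    """Derive a script name from a target URL per this repository's convention."""
    host = urlparse(url).netloc.removeprefix("www.")  # e.g. "abcd.co.ef"
    return host.replace(".", "_") + ".py"             # dots become underscores

print(url_to_script_name("https://www.abcd.co.ef/editorial/"))  # -> abcd_co_ef.py
```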
15 | 16 | -------------------------------------------------------------------------------- /alnair_ae.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Alnair 3 | Author : Ajeet 4 | Date : July 29, 2023 5 | """ 6 | 7 | from selenium.webdriver import Chrome 8 | from selenium.webdriver.chrome.service import Service as ChromeService 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.chrome.options import Options 11 | from webdriver_manager.chrome import ChromeDriverManager 12 | 13 | 14 | URL_alnair = 'https://alnair.ae/app/view/1412/3386/apartment/apartments' 15 | o = Options() 16 | o.add_experimental_option('detach', True) 17 | o.add_argument('--start-maximized') 18 | 19 | driver = Chrome(service=ChromeService(ChromeDriverManager().install()), options=o) 20 | 21 | def get_data(): 22 | driver.get(URL_alnair) 23 | driver.set_page_load_timeout(2) 24 | 25 | scroll_bar = driver.find_element(By.CSS_SELECTOR, 'div[class^="_scrollContainer_"]') 26 | driver.execute_script("arguments[0].scrollBy(0, arguments[0].scrollHeight);", scroll_bar) 27 | 28 | get_data() 29 | 30 | 31 | """ 32 | - You first need to find/locate the scrollbar which is embedded in the HTML page. 33 | - The web-element
(the div[class^="_scrollContainer_"] element) represents the scrollbar, which can be located using the 34 | mentioned strategy. 35 | - Once we find the web element for the scrollbar, we can simply scroll down by its height. 36 | 37 | reference: 38 | https://stackoverflow.com/questions/76791670/scrolling-using-selenium4-10-0 39 | """ -------------------------------------------------------------------------------- /app_powerbi_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : App PowerBI 3 | Author : Ajeet 4 | Date : June 16, 2023 5 | """ 6 | # import libraries 7 | import time 8 | from selenium.webdriver import Chrome, ChromeOptions 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver import ActionChains 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | from selenium.common.exceptions import MoveTargetOutOfBoundsException 14 | 15 | options = ChromeOptions() 16 | options.add_argument('--start-maximized') 17 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 18 | driver = Chrome(options=options) 19 | wait = WebDriverWait(driver, 10) 20 | driver.get("https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection") 21 | 22 | # wait for the dashboard to load 23 | wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'transform.bringToFront'))) 24 | 25 | state = driver.find_element(By.CSS_SELECTOR, 'div[aria-label="State"]') 26 | state.find_element(By.CSS_SELECTOR, 'span[title="Select all"]').click() 27 | 28 | job_name = driver.find_element(By.CSS_SELECTOR, 'div[aria-label="Job Name"]') 29 | # for example, select option 4 30 | job_name.find_element(By.CSS_SELECTOR, 'div[data-row-index="4"]').click() 31 | 32 | time.sleep(2) 33 | 34 | scrolls = driver.find_elements(By.CSS_SELECTOR, 'div.scroll-bar-part-bar') 35 | h_scroll = scrolls[2] 36 | v_scroll = scrolls[3] 37 | 38 | # Perform horizontal scrolling 39 | action_chains = ActionChains(driver) 40 | action_chains.move_to_element(h_scroll).click_and_hold().move_by_offset(500, 0).release().perform() 41 | time.sleep(1) 42 | 43 | flag = True 44 | while flag: 45 | try: 46 | # Perform vertical scrolling 47 | action_chains = ActionChains(driver) 48 | action_chains.move_to_element(v_scroll).click_and_hold().move_by_offset(0, 100).release().perform() 49 | 50 | except MoveTargetOutOfBoundsException: 51 | flag = False 52 | 53 | # find the desired 2nd table 54 | table = driver.find_elements(By.CSS_SELECTOR, 'div.tableExContainer')[1] 55 | 56 | # now you can parse this table element as you want. 57 | 58 | 59 | """ 60 | Few points to note: 61 | 62 | 1. We first wait for the dashboard on the webpage to be visible. 63 | 2. Next, locate the State web element and find the Select all option in it to click. 64 | 3. Similarly, locate the Job Name web element and find the option number 4 in it to click. 65 | 4. Next, we locate all the vertical and horizontal scroll bars with the CSS selector and then get the horizontal and 66 | vertical scroll bars of the desired table (the 2nd table here). 67 | 5. After getting the web elements of the target scroll bars, we first perform the horizontal scrolling. 68 | 6. Afterwards, we perform the vertical scrolling to load all the data in the target table. 69 |
7. Finally, locate the target/desired table; the variable "table" holds the web element of the table you want to scrape, which you can use for further parsing to extract the table's data. 70 | 71 | reference: 72 | https://stackoverflow.com/questions/76214166/scrape-websites-power-bi-dashboard-using-python-selenium 73 | """ -------------------------------------------------------------------------------- /app_powerbi_com_anp.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/app_powerbi_com_anp.gif -------------------------------------------------------------------------------- /app_powerbi_com_anp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : PowerBI App 3 | Author : Ajeet 4 | Date : April 22, 2025 5 | """ 6 | # ===== IMPORTS ===== 7 | from time import sleep 8 | from selenium import webdriver 9 | from selenium.webdriver.chrome.options import Options 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | from selenium.webdriver import ActionChains 14 | 15 | 16 | # ===== SETUP OPTIONS ===== 17 | def initialize_driver() -> webdriver.Chrome: 18 | """Initializes and returns a configured Chrome WebDriver.""" 19 | options = Options() 20 | options.add_argument("--start-maximized") 21 | options.add_argument("force-device-scale-factor=0.95") 22 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 23 | return webdriver.Chrome(options=options) 24 | 25 | 26 | driver = initialize_driver() 27 | wait = WebDriverWait(driver, 10) 28 | 29 | 30 | # ===== HELPER FUNCTIONS ===== 31 | def wait_and_click(by: By, identifier: str) -> None: 32 | """ 33 | Waits for an element to be clickable and clicks it. 34 | 35 | Args: 36 | by (By): Locator strategy (e.g., By.XPATH, By.CSS_SELECTOR). 37 | identifier (str): The locator string for the target element. 38 | """ 39 | element = wait.until(EC.element_to_be_clickable((by, identifier))) 40 | element.click() 41 | 42 | 43 | def scroll_slicer_container(offset_y: int = 100) -> None: 44 | """ 45 | Scrolls inside a slicer dropdown popup using ActionChains. 46 | 47 | Args: 48 | offset_y (int): The vertical scroll offset. Positive = down, Negative = up. 49 | """ 50 | sc = driver.find_element(By.CSS_SELECTOR, 51 | 'div[id^="slicer-dropdown-popup-"]>div>div>div:nth-child(2)>div>div:nth-child(3)' 52 | ) 53 | action = ActionChains(driver) 54 | action.move_to_element(sc).click_and_hold().move_by_offset(0, offset_y).release().perform() 55 | 56 | 57 | # ===== MAIN FUNCTION ===== 58 | def report_analyser(year: str, month: int) -> None: 59 | """ 60 | Navigates to a Power BI report and selects a specific month in a slicer filter. 61 | 62 | Args: 63 | year (str): The target year to expand in the slicer (e.g., "2022"). 64 | month (int): The month to select (1-based index corresponding to the slicer position). 65 | """ 66 | url = "https://app.powerbi.com/view?r=eyJrIjoiZWIzNDg3YzUtMGFlMC00MzdmLTgzOWQtZThkOWExNTU2NjBlIiwidCI6IjQ0OTlmNGZmLTI0YTYtNGI0Mi1iN2VmLTEyNGFmY2FkYzkxMyJ9" 67 | driver.get(url) 68 | 69 | # Wait for page to load and navigate to second page 70 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Mercado Page navigation .
Mercado"]'))) 71 | wait_and_click(By.CSS_SELECTOR, '#embedWrapperID>div.logoBarWrapper>logo-bar>div>div>div>logo-bar-navigation>span>button:nth-child(3)') 72 | 73 | # Open the slicer dropdown 74 | wait_and_click(By.CSS_SELECTOR, 75 | '#pvExplorationHost > div > div > exploration > div > explore-canvas > div > div.canvasFlexBox > div > div.displayArea.disableAnimations.fitToPage > div.visualContainerHost.visualContainerOutOfFocus > visual-container-repeat > visual-container:nth-child(6) > transform > div > div.visualContent > div > div > visual-modern > div > div > div.slicer-content-wrapper > div>i' 76 | ) 77 | 78 | # Expand the year to show months 79 | wait_and_click(By.XPATH, f'//div[@class="slicerItemContainer" and @title="{year}"]/div[@class="expandButton"]') 80 | sleep(3) 81 | 82 | # Scroll and select the month 83 | scroll_slicer_container(offset_y=100) 84 | sleep(2) 85 | wait_and_click(By.XPATH, f'//div[@class="slicerItemContainer" and @aria-posinset="{month}"]') 86 | sleep(2) 87 | 88 | 89 | # ===== RUN SCRIPT ===== 90 | if __name__ == "__main__": 91 | report_analyser('2023', 7) 92 | 93 | """ 94 | reference: 95 | https://stackoverflow.com/a/79585038/11179336 96 | """ 97 | -------------------------------------------------------------------------------- /autotrader_co_uk.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : Apr 27, 2025 5 | """ 6 | import time 7 | from selenium.webdriver import Chrome, ChromeOptions 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.wait import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | chrome_options = ChromeOptions() 13 | chrome_options.add_experimental_option("excludeSwitches", ['enable-automation']) 14 | driver = Chrome(options=chrome_options) 15 | 16 | driver.get("https://www.autotrader.co.uk") 17 | wait = WebDriverWait(driver, 10) 18 | 19 | # wait for the target iframe to get loaded in order to switch to it 20 | wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, 'iframe[id^="sp_message_iframe_"]'))) 21 | 22 | # click to 'Reject All' 23 | wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@title="Reject All"]'))).click() 24 | 25 | # Switch back to the main page content 26 | driver.switch_to.default_content() 27 | 28 | # Now you can continue interacting with the main page here 29 | 30 | time.sleep(5) 31 | 32 | """ 33 | reference: 34 | https://stackoverflow.com/a/79593560/11179336 35 | """ -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/baseball_scraper/baseball_scraper/__init__.py -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/baseball_scraper/baseball_scraper/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/__pycache__/settings.cpython-311.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/baseball_scraper/baseball_scraper/__pycache__/settings.cpython-311.pyc -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class BaseballScraperItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class BaseballScraperSpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info("Spider opened: %s" % spider.name) 57 | 58 | 59 | class BaseballScraperDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 
67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info("Spider opened: %s" % spider.name) 104 | -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class BaseballScraperPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for baseball_scraper project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = "baseball_scraper" 11 | 12 | SPIDER_MODULES = ["baseball_scraper.spiders"] 13 | NEWSPIDER_MODULE = "baseball_scraper.spiders" 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = "baseball_scraper (+http://www.yourdomain.com)" 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = True 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | #CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | DOWNLOAD_DELAY = 2 29 | # The download delay setting will honor only one of: 30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | #CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | #COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | #TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | #DEFAULT_REQUEST_HEADERS = { 41 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 42 | # "Accept-Language": "en", 43 | #} 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | #SPIDER_MIDDLEWARES = { 48 | # "baseball_scraper.middlewares.BaseballScraperSpiderMiddleware": 543, 49 | #} 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | #DOWNLOADER_MIDDLEWARES = { 54 | # "baseball_scraper.middlewares.BaseballScraperDownloaderMiddleware": 543, 55 | #} 56 | 57 | # Enable or disable extensions 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 59 | #EXTENSIONS = { 60 | # "scrapy.extensions.telnet.TelnetConsole": None, 61 | #} 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | #ITEM_PIPELINES = { 66 | # "baseball_scraper.pipelines.BaseballScraperPipeline": 300, 67 | #} 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 71 | #AUTOTHROTTLE_ENABLED = True 72 | # The initial download delay 73 | #AUTOTHROTTLE_START_DELAY = 5 74 | # The maximum download delay to be set in case of high latencies 75 | #AUTOTHROTTLE_MAX_DELAY = 60 76 | # The average number of requests Scrapy should be sending in parallel to 77 | # each remote server 78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 79 | # Enable showing throttling stats for every response received: 80 | #AUTOTHROTTLE_DEBUG = False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | #HTTPCACHE_ENABLED = True 85 | #HTTPCACHE_EXPIRATION_SECS = 0 86 | #HTTPCACHE_DIR = "httpcache" 87 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 88 | #HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 89 | 90 | # Set settings whose default value is deprecated to a future-proof value 91 | TWISTED_REACTOR = 
"twisted.internet.asyncioreactor.AsyncioSelectorReactor" 92 | FEED_EXPORT_ENCODING = "utf-8" 93 | -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/spiders/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/baseball_scraper/baseball_scraper/spiders/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/spiders/__pycache__/players.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/baseball_scraper/baseball_scraper/spiders/__pycache__/players.cpython-311.pyc -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/spiders/players.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | 4 | 5 | class PlayersSpider(scrapy.Spider): 6 | name = "players" 7 | allowed_domains = ["baseball-reference.com"] 8 | start_urls = [f"https://www.baseball-reference.com/players/{letter}/" for letter in "ab"] 9 | 10 | 11 | def parse(self, response): 12 | # Extract player profile links 13 | player_links = response.css("div#div_players_ > p a::attr(href)").getall() 14 | for link in player_links: 15 | full_link = response.urljoin(link) 16 | yield scrapy.Request(url=full_link, callback=self.parse_player) 17 | 18 | @staticmethod 19 | def parse_player(response): 20 | # Extract player information 21 | player_name = response.css("h1 span::text").get() 22 | position = response.css("p:contains('Position') strong::text").re_first(r"Position: (.+)") 23 | bats = response.css("p:contains('Bats')::text").re_first(r"Bats: (.+?) 
•") 24 | throws = response.css("p:contains('Throws')::text").re_first(r"Throws: (.+)") 25 | height = response.css("p:contains('lb') span:nth-child(1)::text").get() 26 | weight = response.css("p:contains('lb') span:nth-child(2)::text").get() 27 | birth_date = response.css("span#necro-birth a::text").getall() 28 | birth_location = response.css("p:contains('Born:') span:last-child::text").get() 29 | draft_info = response.css("p:contains('Drafted by')::text").get() 30 | high_school = response.css("p:contains('High School:') a::text").get() 31 | college = response.css("p:contains('Schools:') a::text").getall() 32 | debut = response.css("p:contains('Debut:') a::text").get() 33 | last_game = response.css("p:contains('Last Game:') a::text").get() 34 | rookie_status = response.css("p:contains('Rookie Status:')::text").re_first(r"Rookie Status:\s+(.+)") 35 | agent = response.css("p:contains('Agents')::text").get() 36 | nickname = response.css("p:contains('Nicknames:') a::text").get() 37 | twitter = response.css("p:contains('Twitter:') a::attr(href)").get() 38 | 39 | # Extract player's image URL 40 | image_url = response.css("div.media-item img::attr(src)").get() 41 | 42 | # Store the extracted data in a dictionary 43 | player_data = { 44 | "name": player_name, 45 | "position": position, 46 | "bats": bats, 47 | "throws": throws, 48 | "height": height, 49 | "weight": weight, 50 | "birth_date": " ".join(birth_date), 51 | "birth_location": birth_location, 52 | "draft_info": draft_info, 53 | "high_school": high_school, 54 | "college": college, 55 | "debut": debut, 56 | "last_game": last_game, 57 | "rookie_status": rookie_status, 58 | "agent": agent, 59 | "nickname": nickname, 60 | "twitter": twitter, 61 | "image_url": response.urljoin(image_url), # Ensure the URL is absolute 62 | } 63 | 64 | # Write the data to a JSON file 65 | with open("players.json", "a") as f: 66 | f.write(json.dumps(player_data) + "\n") 67 | 68 | yield player_data 69 | 70 | -------------------------------------------------------------------------------- /baseball_scraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baseball_scraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baseball_scraper 12 | -------------------------------------------------------------------------------- /click_checkbox_whirlpool.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium.webdriver import Chrome 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | import selenium.webdriver.support.expected_conditions as EC 6 | 7 | driver = Chrome() 8 | driver.get('https://register.whirlpool.com/en-us/registration') 9 | 10 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'privacy_policy'))) 11 | # ------------------------------------------------------------------------------------------------------------ 12 | driver.execute_script("document.getElementById('privacy_policy').click();") 13 | # ------------------------------------------------------------------------------------------------------------ 14 | time.sleep(2) 15 | var1 = driver.find_element(By.ID, "privacy_policy").is_selected() 16 | print(var1) 17 | 18 | 19 | """ 20 | output: 21 
| True 22 | 23 | You can also cross-check by simply running the javascript query document.getElementById('privacy_policy').click() on the Console of the page and you'll see that it indeed performs the click on the desired checkbox. 24 | 25 | reference: 26 | https://stackoverflow.com/questions/76404208/not-able-to-click-on-checkbox-using-selenium-in-python-error-selenium-common-ex 27 | """ -------------------------------------------------------------------------------- /coroners_nsw_gov_download_multiple_pdf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 23, 2023 5 | """ 6 | 7 | import time 8 | from selenium.webdriver import Chrome 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support import expected_conditions as EC 12 | 13 | driver = Chrome() 14 | driver.get('https://www.coroners.nsw.gov.au/coronial-findings-search.html?searchtext=death%20in%20custody&searchYear=All') 15 | 16 | search_results = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.search-result-content'))) 17 | documents = search_results.find_elements(By.CSS_SELECTOR, 'ul.paginationList>li') 18 | print(f"Total documents on the page: {len(documents)}") 19 | 20 | doc_url = [doc.find_element(By.CSS_SELECTOR, 'h4.search-font> a').get_attribute('href') for doc in documents] 21 | 22 | for i in doc_url: 23 | print(f"Downloading: {i.split('/')[-1]}") 24 | driver.get(i) 25 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.download-button'))) 26 | driver.execute_script("document.querySelector('div.download-button>a').click()") 27 | time.sleep(2) 28 | 29 | time.sleep(5) 30 | 31 | """ 32 | output: 33 | Total documents on the page: 10 34 | Downloading: Inquest_into_the_death_of_Brandon_Clark._pdf.pdf 35 | Downloading: Inquest_into_the_death_of_CJ.pdf 36 | Downloading: Inquest_into_the_death_of_Azhar_Abdul.pdf 37 | Downloading: Inquest_into_the_death_of_John_Cribb.pdf 38 | Downloading: Inquest_into_the_death_of_Anthony_Gilbert.pdf 39 | Downloading: Findings_-_Inquest_into_the_death_of_Gordon_Copeland_-_18_April_2023.pdf 40 | Downloading: Inquest_into_the_death_of_John_Dodd.pdf 41 | Downloading: Final_-_Findings_Inquest_into_the_death_of_Stanley_Russell_April_2023_14_April.pdf 42 | Downloading: Inquest_into_the_death_of_KT.pdf 43 | Downloading: Inquest_into_the_death_of_LT.pdf 44 | """ 45 | """ 46 | Approach followed: 47 | Wait for the desired web element container holding all the data to get loaded to find/locate it. 48 | 49 | search_results = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.search-result-content'))) 50 | Find all the individual target data points within the container. 51 | 52 | documents = search_results.find_elements(By.CSS_SELECTOR, 'ul.paginationList>li') 53 | Next, iterate over the web element containing the list of data points to parse, extract the URL, and put them all in the list. 54 | 55 | doc_url = [doc.find_element(By.CSS_SELECTOR, 'h4.search-font> a').get_attribute('href') for doc in documents] 56 | Finally, loop over the list of URLs, 57 | 58 | get to the page, 59 | wait for the target web element (Download) to be available on the page, 60 | and execute the query to perform a click to download the file. 
61 | This is how you can download all the documents on a single page, and the same can be replicated on multiple pages. 62 | 63 | reference: 64 | https://stackoverflow.com/questions/76536814/scrape-website-for-pdfs-within-a-number-of-links 65 | """ -------------------------------------------------------------------------------- /destinytracker_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : destinytracker 3 | Author : Ajeet 4 | Date : August 2, 2023 5 | """ 6 | from time import sleep 7 | from selenium.common import TimeoutException 8 | from selenium.webdriver import Chrome 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support import expected_conditions as EC 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | 13 | 14 | driver = Chrome() 15 | url = "https://destinytracker.com/destiny-2/profile/psn/4611686018440125811/matches?mode=crucible" 16 | driver.get(url) 17 | wait = WebDriverWait(driver, 30) 18 | 19 | crucible_content = wait.until( 20 | EC.visibility_of_element_located((By.CSS_SELECTOR, "div.trn-gamereport-list.trn-gamereport-list--compact"))) 21 | game_reports = crucible_content.find_elements(By.CLASS_NAME, "trn-gamereport-list__group") 22 | 23 | for game_report in game_reports: 24 | group_entry = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "trn-gamereport-list__group-entries"))) 25 | win_match = group_entry.find_elements(By.CSS_SELECTOR, "div.trn-match-row--outcome-win") 26 | driver.execute_script("arguments[0].scrollIntoView();", win_match[0]) 27 | lose_match = group_entry.find_elements(By.CSS_SELECTOR, "div.trn-match-row--outcome-loss") 28 | for win_element in win_match: 29 | 30 | try: 31 | win_left = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.trn-match-row__section--left"))) 32 | driver.execute_script("arguments[0].click();", win_left) 33 | print("reached here") 34 | date_time = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='info']"))) 35 | date_time = date_time.text.split(",")[0] 36 | match_roster = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "match-rosters"))) 37 | team_alpha = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.match-roster.alpha"))) 38 | team_bravo = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.match-roster.bravo"))) 39 | bravo_match_roster_entries = team_bravo.find_element(By.CLASS_NAME, "roster-entries") 40 | alpha_match_roster_entries = team_alpha.find_element(By.CLASS_NAME, "roster-entries") 41 | name = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "router-link-active"))) 42 | entry_bravo = bravo_match_roster_entries.find_elements(By.CLASS_NAME, "entry") 43 | entry_alpha = alpha_match_roster_entries.find_elements(By.CLASS_NAME, "entry") 44 | print(date_time) 45 | wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.close"))).click() 46 | sleep(1) 47 | except TimeoutException: 48 | pass 49 | 50 | """ 51 | reference: 52 | https://stackoverflow.com/questions/76814861/i-keep-getting-a-timeout-error-for-an-element-even-though-it-prints-out-the-text 53 | """ 54 | -------------------------------------------------------------------------------- /eex_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : EEX.COM 3 | Author : Ajeet 4 | Date : August 4, 2023 5 | """ 6 | 7 | import time 8 | import pandas as pd 9 | from selenium.webdriver import Chrome, ChromeOptions, Keys 10 | from 
selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | 14 | pd.set_option('display.max_rows', 500) 15 | pd.set_option('display.max_columns', 500) 16 | pd.set_option('display.width', 1000) 17 | 18 | 19 | def data_by_date(day: int, month: int, year: int) -> pd.DataFrame: 20 | """ 21 | Scrape data for a specific date from the EEX German Power Futures. 22 | 23 | Args: 24 | day (int): The day of the month (1 to 31). 25 | month (int): The month (1 to 12). 26 | year (int): The year. 27 | 28 | Returns: 29 | pandas.DataFrame: A DataFrame containing the scraped data for the specified date. 30 | The DataFrame includes details about futures contracts. 31 | """ 32 | 33 | options = ChromeOptions() 34 | options.add_argument("--start-maximized") 35 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 36 | 37 | driver = Chrome(options=options) 38 | wait = WebDriverWait(driver, 20) 39 | 40 | driver.get(url='https://www.eex.com/en/market-data/power/futures') 41 | wait.until( 42 | EC.element_to_be_clickable((By.CSS_SELECTOR, "input[value='I accept all cookies.']"))).click() 43 | time.sleep(3) 44 | wait.until(EC.element_to_be_clickable( 45 | (By.CSS_SELECTOR, "button.btn.dropdown-toggle.form.input-select div.filter-option-inner"))).click() 46 | wait.until(EC.element_to_be_clickable((By.XPATH, 47 | "//div[@class='dropdown-menu show']//li/a[@class='dropdown-item']/span[contains(., 'EEX German Power Futures')]"))).click() 48 | 49 | # Find and set the date input field to the desired date 50 | calender_container = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#symbolheader_pfpde'))) 51 | date_input = calender_container.find_element(By.CSS_SELECTOR, 'input.mv-input-box') 52 | date_input.clear() 53 | date_input.send_keys(f'{year}-{month}-{day}') 54 | date_input.send_keys(Keys.ENTER) 55 | 56 | table_data = wait.until( 57 | EC.visibility_of_element_located((By.CSS_SELECTOR, "div#baseloadwidget_pfpde > table.mv-quote"))) 58 | # Find the table containing the data and extract column names 59 | columns = [i.text for i in table_data.find_elements(By.CSS_SELECTOR, 'tr.mv-quote-header-row>th')] 60 | 61 | all_data = [] 62 | 63 | # Loop through each row of the table and extract data for each cell 64 | for row in WebDriverWait(table_data, 10).until( 65 | EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'tbody>tr'))): 66 | data = [i.text for i in row.find_elements(By.CSS_SELECTOR, 'td[style^="text-align:"]')] 67 | all_data.append(data) 68 | 69 | # Create a Pandas DataFrame with the scraped data and return it 70 | df = pd.DataFrame(data=all_data, columns=columns[:-1]) 71 | return df 72 | 73 | 74 | print(data_by_date(day=2, month=8, year=2023)) 75 | 76 | """ 77 | output: 78 | 79 | Future Last Price Last Volume Settlement Price Volume Exchange Volume Trade Registration Open Interest 80 | 0 Cal-24 134.00 8,784 134.52 2,714,256 2,643,984 72,459 81 | 1 Cal-25 124.75 8,760 124.67 604,440 289,080 17,377 82 | 2 Cal-26 106.00 8,760 105.59 87,600 350,400 4,072 83 | 3 Cal-27 90.25 8,760 90.23 17,520 113,880 787 84 | 4 Cal-28 - - 84.18 - - 111 85 | 5 Cal-29 - - 82.65 - - 13 86 | 6 Cal-30 - - 83.11 - - 7 87 | 7 Cal-31 - - 82.93 - - 2 88 | 8 Cal-32 - - 82.78 - - 2 89 | 9 Cal-33 - - 81.93 - - 0 90 | 91 | reference: 92 | https://stackoverflow.com/questions/76826884/getting-data-for-different-dates-when-scraping-data-with-selenium 93 | """ 94 | 
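# Usage sketch (not part of the referenced answer): data_by_date() launches a fresh
# browser per call, so gathering several dates this way is slow but simple. The dates
# below are placeholders; pd is the pandas import from the top of this file.
#
# frames = [data_by_date(day=d, month=8, year=2023) for d in (1, 2, 3)]
# combined = pd.concat(frames, ignore_index=True)
# combined.to_csv('eex_german_power_futures.csv', index=False)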
-------------------------------------------------------------------------------- /egle_state_mi.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : EGLE State, Remediation Information Data Exchange 3 | Author : Ajeet 4 | Date : 09/06/2023 5 | """ 6 | import os 7 | from selenium.webdriver import Chrome, ChromeOptions 8 | from selenium.webdriver.support.ui import WebDriverWait 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support import expected_conditions as EC 11 | import time 12 | 13 | def download_file(path): 14 | 15 | options = ChromeOptions() 16 | options.add_argument('--start-maximized') 17 | prefs = {'download.default_directory': path} 18 | options.add_experimental_option('prefs', prefs) 19 | 20 | driver = Chrome(options=options) 21 | driver.get('https://www.egle.state.mi.us/RIDE/inventory-of-facilities/facilities') 22 | wait = WebDriverWait(driver, 100) 23 | 24 | wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'mat-table'))) 25 | driver.execute_script('''document.querySelector("button[aria-label='Export Facilities Table results to CSV']").click();''') 26 | 27 | while not os.path.exists(f"{path}\\Facilities.csv"): 28 | time.sleep(1) # poll once per second instead of spinning in a busy loop 29 | 30 | print("The file is downloaded!") 31 | 32 | 33 | 34 | if __name__ == '__main__': 35 | PATH = 'D:\\test' 36 | download_file(PATH) 37 | 38 | """ 39 | output: 40 | The file is downloaded! 41 | """ 42 | 43 | """ 44 | steps to follow: 45 | 46 | 1. The site takes some time to load the desired element (here, the Export button), and clicking this button downloads 47 | the table's data. Therefore we wait to make sure that the table data has already loaded. 48 | 49 | 2. Now that the data is loaded, simply click on the Export button to download the data (here Facilities.csv). 50 | 51 | 3. It takes some time for the file to get downloaded at the given path, so we need to wait until the file download is 52 | completed. To do this, we keep checking (once per second) whether the file is present at the given path, and the loop 53 | exits once the file is there. 54 | 55 | reference: 56 | https://stackoverflow.com/questions/76436438/selenium-cant-find-element-by-xpath 57 | """ 58 | -------------------------------------------------------------------------------- /element_not_visible_to_click.py: -------------------------------------------------------------------------------- 1 | 2 | # The element is not visible to click. Use ActionChains or a JavascriptExecutor to click it.
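# By JavascriptExecutor (a sketch, reusing the same illustrative element ID as below):
#
# element = driver.find_element(By.ID, "RESULT_RadioButton-7_1")
# driver.execute_script("arguments[0].click();", element)
#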
3 | # By ActionChains: 4 | # from selenium.webdriver.common.action_chains import ActionChains 5 | # element = driver.find_element(By.ID, "RESULT_RadioButton-7_1") 6 | # actions = ActionChains(driver) 7 | # actions.move_to_element(element).click().perform() -------------------------------------------------------------------------------- /fb_login_with_popup_alert_disabled.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver import ChromeOptions, Keys 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.webdriver.support.wait import WebDriverWait 7 | 8 | options = ChromeOptions() 9 | 10 | # start maximized and disable the automation infobar 11 | options.add_argument("--start-maximized") 12 | options.add_experimental_option("useAutomationExtension", False) 13 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 14 | options.add_experimental_option( 15 | "prefs", 16 | { 17 | "credentials_enable_service": False, 18 | "profile.password_manager_enabled": False, 19 | "profile.default_content_setting_values.notifications": 2 20 | # 2 blocks/disables notifications, 1 allows them 21 | }, 22 | ) 23 | 24 | driver = webdriver.Chrome(options=options) 25 | 26 | url = "https://www.facebook.com/" 27 | driver.get(url) 28 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "globalContainer"))) 29 | container = driver.find_element(By.ID, "globalContainer") 30 | 31 | # fill in the email account and password 32 | email = container.find_element(By.ID, 'email') 33 | password = container.find_element(By.ID, 'pass') 34 | email.send_keys("xxxxxxxxx") 35 | password.send_keys("xxxxxxxxxxxx") 36 | password.send_keys(Keys.ENTER) 37 | time.sleep(10) 38 | -------------------------------------------------------------------------------- /find_chrome_version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Google Chrome Version 3 | Author : Ajeet 4 | Date : July 12, 2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | 10 | options = webdriver.ChromeOptions() 11 | options.add_argument('start-maximized') 12 | driver = webdriver.Chrome(options=options) 13 | 14 | driver.get('chrome://settings/help') 15 | 16 | time.sleep(2) 17 | update_check = driver.execute_script("return document.querySelector('settings-ui').shadowRoot.querySelector('settings-main').shadowRoot.querySelector('settings-about-page').shadowRoot.querySelectorAll('settings-section')[0].querySelector('div.secondary').getInnerHTML();") 18 | print(update_check) 19 | 20 | """ 21 | The page is deeply nested with shadow-root elements, which makes it impossible to locate elements embedded inside a 22 | shadow root using the usual locator strategies such as XPath, CSS Selector, ID, etc., so the version text is read by chaining shadowRoot queries in JavaScript instead.
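For comparison, Selenium 4 exposes a shadow_root property on WebElement, so the same
chain can also be written in Python (a sketch; only CSS selectors work inside a shadow root):

    from selenium.webdriver.common.by import By
    root = driver.find_element(By.CSS_SELECTOR, 'settings-ui').shadow_root
    root = root.find_element(By.CSS_SELECTOR, 'settings-main').shadow_root
    root = root.find_element(By.CSS_SELECTOR, 'settings-about-page').shadow_root
    print(root.find_elements(By.CSS_SELECTOR, 'settings-section')[0]
          .find_element(By.CSS_SELECTOR, 'div.secondary').text)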
23 | 24 | references: 25 | https://stackoverflow.com/questions/76667428/cant-access-the-latest-version-xpath-of-google-chrome-through-selenium-and-chro 26 | """ -------------------------------------------------------------------------------- /find_masa_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : July 17, 2023 5 | """ 6 | 7 | from selenium.webdriver import Chrome 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.wait import WebDriverWait 10 | import selenium.webdriver.support.expected_conditions as EC 11 | 12 | # Create a Chrome driver instance 13 | driver = Chrome() 14 | 15 | url = 'https://findmasa.com/view/map#b1cc410b' 16 | driver.get(url) 17 | 18 | # Wait for the li element with id 'b1cc410b' to be present on the page 19 | li_element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li#b1cc410b'))) 20 | 21 | data_lat = li_element.get_attribute('data-lat') 22 | data_lng = li_element.get_attribute('data-lng') 23 | artist_name = li_element.find_element(By.TAG_NAME, 'a').text 24 | address = li_element.find_elements(By.TAG_NAME, 'p')[1].text 25 | city = li_element.find_elements(By.TAG_NAME, 'p')[2].text 26 | 27 | # Print the extracted data 28 | print(data_lat) 29 | print(data_lng) 30 | print(artist_name) 31 | print(address) 32 | print(city) 33 | 34 | """ 35 | The information you're looking for is loaded late and rendered via JavaScript. Since the requests library doesn't 36 | execute JavaScript, it never receives that content, so your if-statement evaluates to False. So, it goes to the 37 | else-statement and you get NO DATA. 38 | 39 | output: 40 | 34.102025 41 | -118.32694167 42 | Tristan Eaton 43 | 6301 Hollywood Boulevard 44 | Los Angeles, California 45 | 46 | reference: 47 | https://stackoverflow.com/questions/76700158/how-to-use-python-to-get-information-from-the-map-navigation-container-of-a-webs 48 | """ -------------------------------------------------------------------------------- /flicker_scroll.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : flicker 3 | Author : Ajeet 4 | Date : Sep.
22, 2023 5 | """ 6 | 7 | import time 8 | from bs4 import BeautifulSoup 9 | from selenium import webdriver 10 | 11 | driver = webdriver.Chrome() 12 | url = "https://www.flickr.com/groups/allfreepictures/pool/page3041" 13 | 14 | driver.get(url=url) 15 | 16 | # scroll to the bottom of the page to load all available images 17 | flag = True 18 | last_height = driver.execute_script("return document.body.scrollHeight") 19 | while flag: 20 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 21 | time.sleep(1) 22 | new_height = driver.execute_script("return document.body.scrollHeight") 23 | 24 | if new_height == last_height: 25 | flag = False 26 | else: 27 | last_height = new_height 28 | 29 | time.sleep(2) 30 | 31 | soup = BeautifulSoup(driver.page_source, 'html.parser') 32 | image_urls = [link['href'] for link in soup.findAll("a", {"class": "overlay"})] 33 | print(len(image_urls)) 34 | print(image_urls) 35 | 36 | """ 37 | reference: 38 | https://stackoverflow.com/questions/77155340/selenium-scroll-flickr-page-to-get-all-the-images 39 | """ -------------------------------------------------------------------------------- /google_com_finance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to search for a stock ticker (e.g., NVDA) on Google Finance using Selenium. 3 | 4 | Author: Ajeet 5 | Date: 17/05/2025 6 | """ 7 | 8 | import time 9 | 10 | from selenium.webdriver import Chrome, ChromeOptions 11 | from selenium.webdriver.common.by import By 12 | from selenium.webdriver.common.keys import Keys 13 | from selenium.webdriver.support.ui import WebDriverWait 14 | from selenium.webdriver.support import expected_conditions as EC 15 | 16 | 17 | def setup_driver(): 18 | """ 19 | Set up and return a Selenium Chrome WebDriver with custom options. 20 | """ 21 | options = ChromeOptions() 22 | options.add_argument("--start-maximized") # Launch browser maximized 23 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 24 | options.add_experimental_option("useAutomationExtension", False) 25 | return Chrome(options=options) 26 | 27 | 28 | def search_stock(driver, stock_name: str): 29 | """ 30 | Automates the stock search on Google Finance. 31 | 32 | Args: 33 | driver: Selenium WebDriver instance. 34 | stock_name (str): Name of the stock to search for (e.g., "nvda stock"). 35 | """ 36 | wait = WebDriverWait(driver, 10) 37 | driver.get("https://www.google.com/finance/") 38 | 39 | # Wait for search input fields to load and select the second input field 40 | input_elements = wait.until(EC.presence_of_all_elements_located( 41 | (By.CSS_SELECTOR, 'input[aria-label="Search for stocks, ETFs & more"]') 42 | )) 43 | 44 | if len(input_elements) < 2: 45 | raise Exception("Expected input field not found.") 46 | 47 | input_element = input_elements[1] 48 | input_element.send_keys(stock_name) 49 | time.sleep(1) 50 | input_element.send_keys(Keys.ENTER) 51 | time.sleep(2) 52 | 53 | 54 | def main(): 55 | """ 56 | Main function to execute the script. 
57 | """ 58 | driver = setup_driver() 59 | search_stock(driver, "nvda stock") 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | 65 | """ 66 | reference: 67 | https://stackoverflow.com/a/79626737/11179336 68 | """ -------------------------------------------------------------------------------- /google_finance.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/google_finance.gif -------------------------------------------------------------------------------- /hong_kong_observatory_climate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from selenium.webdriver import Chrome 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | import selenium.webdriver.support.expected_conditions as EC 6 | 7 | driver = Chrome() 8 | 9 | url = 'https://www.hko.gov.hk/en/cis/awsDailyElement.htm?stn=WB8&ele=PREV_DIR&y=2023' 10 | driver.get(url) 11 | 12 | table = WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'table[id="t1"] > tr'))) 13 | columns = [i.text for i in table[0].find_elements(By.TAG_NAME, 'th')] 14 | table_dict = {col: [] for col in columns} 15 | 16 | for row in table[1:]: 17 | for data in zip(columns, [i.text for i in row.find_elements(By.TAG_NAME, 'td')]): 18 | table_dict[data[0]].append(data[1]) 19 | 20 | driver.close() 21 | 22 | df = pd.DataFrame(table_dict) 23 | # # saving the dataframe to a csv 24 | df.to_csv('data.csv', index=False) 25 | 26 | """ 27 | Few things to note: 28 | 29 | 1. After hitting the URL, we need to wait for the table to get visibly located on the page and thus we find all the table rows tr which includes the first tr as the table's columns. 30 | 2. the variable columns is a list that holds the table column names (first row data table[0]) 31 | 3. Next, we initiate a variable table_dict and assign the columns as the key of this dict with their values as an empty list. 32 | 4. after that, we iterate over the remaining rows of the table, couple the list of columns with the row data and iterate over it to assign the data to its column. 33 | 5. and finally, create a dataframe with table_dict and save it into a CSV file data.csv. 
34 | """ 35 | -------------------------------------------------------------------------------- /imdb_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : IMDB title review 3 | Author : Ajeet 4 | Date : July 20, 2023 5 | """ 6 | 7 | from selenium import webdriver 8 | from selenium.webdriver.firefox.options import Options 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support import expected_conditions as EC 12 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 13 | 14 | url = "https://www.imdb.com/title/tt0368226/reviews" 15 | 16 | options = Options() 17 | # options.add_argument('-headless') 18 | driver = webdriver.Firefox(options=options) 19 | 20 | # Load the IMDb page 21 | driver.get(url) 22 | 23 | while True: 24 | try: 25 | button = WebDriverWait(driver, 10).until( 26 | EC.visibility_of_element_located((By.ID, 'load-more-trigger'))) 27 | 28 | button.click() 29 | except (NoSuchElementException, TimeoutException): 30 | break 31 | 32 | """ 33 | The while-loop will keep looking for the Load More button and keep clicking on it until there are no more Load More 34 | and finally it'll get timed out and break out of the loop. 35 | 36 | reference: 37 | https://stackoverflow.com/questions/76726412/movetargetoutofboundsexception-selenium-python-firefox 38 | """ -------------------------------------------------------------------------------- /jodidb_org.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | 3 | from selenium import webdriver 4 | from selenium.webdriver.chrome.service import Service 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from selenium.webdriver.support.wait import WebDriverWait 8 | from selenium.webdriver.common.keys import Keys 9 | from selenium.webdriver.chrome.options import Options 10 | from webdriver_manager.chrome import ChromeDriverManager 11 | from selenium.webdriver.common.action_chains import ActionChains 12 | 13 | 14 | options = Options() 15 | options.add_argument("--window-size=1920,1080") 16 | options.add_argument( 17 | "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36") 18 | 19 | # Suppress logging to reduce unnecessary output 20 | options.add_argument("--log-level=3") 21 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 22 | options.add_experimental_option("useAutomationExtension", False) 23 | 24 | # Set up the WebDriver with configured options 25 | service = Service(ChromeDriverManager().install()) 26 | browser = webdriver.Chrome(service=service, options=options) 27 | browser.maximize_window() 28 | wait = WebDriverWait(browser, 10) 29 | 30 | 31 | browser.get(r'http://www.jodidb.org/TableViewer/tableView.aspx?ReportId=93905') 32 | 33 | import time 34 | time.sleep(2) 35 | 36 | columns = [] 37 | 38 | 39 | scroll_thumb = browser.find_element(By.CSS_SELECTOR, "#hScrollTD") # Replace with your thumb element 40 | 41 | action = ActionChains(browser) 42 | 43 | for _ in range(1, 50): 44 | for i in range(0, 15): 45 | col_names = browser.find_element(By.CSS_SELECTOR, f'table[id="DataTable"]>thead>tr>#a{i}').text 46 | columns.append(col_names) 47 | 48 | sleep(2) 49 | action.click_and_hold(scroll_thumb).move_by_offset(20, 0).release().perform() 50 | sleep(1) 51 | 
52 | print(columns) -------------------------------------------------------------------------------- /join_team_meeting.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/join_team_meeting.gif -------------------------------------------------------------------------------- /knowde_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Knowde 3 | Author : Ajeet 4 | Date : June 15, 2023 5 | """ 6 | # import libraries 7 | import os 8 | import logging 9 | import pandas as pd 10 | from bs4 import BeautifulSoup 11 | from typing import List, Dict, Optional 12 | from selenium.webdriver import Chrome, ChromeOptions 13 | from selenium.webdriver.support.ui import WebDriverWait 14 | from selenium.webdriver.common.by import By 15 | from selenium.webdriver.support import expected_conditions as EC 16 | 17 | # logging configurations 18 | logging.basicConfig(filename='knoede_log.log', 19 | filemode='a', 20 | format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', 21 | datefmt='%H:%M:%S', 22 | level=logging.INFO) 23 | 24 | 25 | class KnoedeData: 26 | def __init__(self): 27 | 28 | options = ChromeOptions() 29 | options.add_argument('--start-maximized') 30 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 31 | self.driver = Chrome(options=options) 32 | self.wait = WebDriverWait(self.driver, 10) 33 | self.website = "https://www.knowde.com/b/markets-personal-care/products/" 34 | self.data = [] 35 | self.driver.get(self.website) 36 | # accept all cookies 37 | self.wait.until(EC.visibility_of_element_located((By.ID, 'onetrust-accept-btn-handler'))).click() 38 | 39 | @staticmethod 40 | def find_siblings(container: BeautifulSoup.string, category: str) -> str: 41 | """this method returns the text value across the given category if found/available. 42 | Args: 43 | container: a 'BeautifulSoup.string' containing all the textual details of an individual product. 44 | category: the name of the category across which we are trying to get the details. 
45 | 46 | Returns: 47 | category_text: the details/value across the given text category 48 | """ 49 | label = container.find("span", string=f"{category}: ") 50 | if label: 51 | category_text = label.next_sibling.text 52 | else: 53 | category_text = None 54 | 55 | return category_text 56 | 57 | def data_processing(self, page_source: str) -> None: 58 | """this method processes/parses the individual product information 59 | 60 | Args: 61 | page_source: this is the page source of the selenium webdriver 62 | 63 | Returns: None 64 | """ 65 | soup = BeautifulSoup(page_source, 'html.parser') 66 | product_containers = soup.select('div[data-cy="product-card"]') 67 | 68 | for container in product_containers: 69 | text_container = container.select_one('div[direction="column"]') 70 | 71 | brand = text_container.select_one('p[data-cy="product-brand-name"]').text 72 | item = text_container.select_one('p[data-cy="product-name"]').text 73 | 74 | inci_name = self.find_siblings(text_container, 'INCI Name') 75 | ingredient_origin = self.find_siblings(text_container, 'Ingredient Origin') 76 | function = self.find_siblings(text_container, 'Function') 77 | benefit_claims = self.find_siblings(text_container, 'Benefit Claims') 78 | labeling_claims = self.find_siblings(text_container, 'Labeling Claims') 79 | compliance = self.find_siblings(text_container, 'Certifications & Compliance') 80 | hlb_value = self.find_siblings(text_container, 'HLB Value') 81 | end_uses = self.find_siblings(text_container, 'End Uses') 82 | cas_no = self.find_siblings(text_container, 'CAS Number') 83 | chemical_name = self.find_siblings(text_container, 'Chemical Name') 84 | synonyms = self.find_siblings(text_container, 'Synonyms') 85 | chemical_family = self.find_siblings(text_container, 'Chemical Family') 86 | features = self.find_siblings(text_container, 'Features') 87 | grade = self.find_siblings(text_container, 'Grade') 88 | 89 | description = text_container.select('p')[-1].text 90 | logging.info(f'Saving: {brand}') 91 | 92 | self.data.append({ 93 | 'brand': brand, 94 | 'item': item, 95 | 'inci_name': inci_name, 96 | 'ingredient_origin': ingredient_origin, 97 | 'function': function, 98 | 'benefit_claims': benefit_claims, 99 | 'labeling_claims': labeling_claims, 100 | 'compliance': compliance, 101 | 'hlb_value': hlb_value, 102 | 'end_uses': end_uses, 103 | 'cas_no': cas_no, 104 | 'chemical_name': chemical_name, 105 | 'synonyms': synonyms, 106 | 'chemical_family': chemical_family, 107 | 'features': features, 108 | 'grade': grade, 109 | 'description': description 110 | }) 111 | 112 | def single_page(self, page_num: int) -> List[Dict]: 113 | """ this method scrapes the data from the given page number of the website. 114 | 115 | Args: 116 | page_num: the page number to extract the data from 117 | 118 | Returns: self.data(list of dict of all products on a given page) 119 | """ 120 | 121 | self.driver.get(f"{self.website}{page_num}") 122 | logging.info(f"-------page number {page_num} -------") 123 | products = self.driver.find_elements(By.CSS_SELECTOR, 'div[data-cy="product-card"]') 124 | 125 | count = 0 126 | for product in products: 127 | if count % 4 == 0: # the page renders 4 products per row; clicking the first arrow in a row expands all 4 128 | product.find_element(By.CSS_SELECTOR, 'svg[data-testid="icon-icomoon--keyboard_arrow_down"]').click() 129 | 130 | count += 1 131 | 132 | self.data_processing(self.driver.page_source) 133 | 134 | return self.data 135 | 136 | def multiple_page(self, start: int, end: int) -> List[Dict]: 137 | """ the method iterates over the range of given page numbers.
138 | 139 | Args: 140 | start: the page number to start with 141 | end: the page number to end with 142 | 143 | Returns: self.data (the accumulated list of product dicts across the scraped pages) 144 | """ 145 | 146 | for page in range(start, end+1): 147 | self.single_page(page) 148 | 149 | return self.data 150 | 151 | @staticmethod 152 | def save_data(data: List[Dict], path: Optional[str] = os.getcwd()) -> None: 153 | """ save the data to a CSV file at the given path. 154 | 155 | Args: 156 | data: the data to save. 157 | path: the path to save the file (the default is os.getcwd(), which saves the file in the current directory) 158 | 159 | Returns: None 160 | """ 161 | 162 | df = pd.DataFrame(data) 163 | file_location = f'{path}/cosmetics_data.csv' 164 | df.to_csv(file_location, index=False) 165 | logging.info(f"------------data is saved at {file_location}------------") 166 | 167 | 168 | if __name__ == '__main__': 169 | 170 | obj = KnoedeData() 171 | # print(obj.single_page(1)) 172 | # obj.save_data(obj.single_page(1)) 173 | # print(obj.multiple_page(2, 3)) 174 | # print(obj.save_data(obj.multiple_page(1, 3))) 175 | 176 | 177 | """ 178 | Few things to note: 179 | 180 | 1. The very first time we open the website, we need to click on the button Accept All Cookies. 181 | 2. Next, we can find all the 36 products on the page using the selector div[data-cy="product-card"] 182 | 3. You might notice that in a full window size, the page loads 4 products in a row and as we click on the down-arrow of 1st product to see more details, it also opens for the remaining 3 products on that row. So we just need to click once per row. 183 | 4. To implement the logic of clicking only once per row, we used a count variable as you can see in the code above. 184 | 185 | reference: 186 | https://stackoverflow.com/questions/76468614/selenium-python-timeoutexception 187 | """ 188 | -------------------------------------------------------------------------------- /lebara_nl.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | from time import sleep 4 | 5 | import pyautogui 6 | import undetected_chromedriver as uc 7 | from selenium.webdriver.common.by import By 8 | 9 | # Use undetected-chromedriver 10 | driver = uc.Chrome() 11 | driver.maximize_window() 12 | driver.get("https://www.lebara.nl/nl/prepaid/data-bundle-valuesim.html") 13 | 14 | # Simulate human-like behavior 15 | time.sleep(random.uniform(1, 3)) 16 | pyautogui.moveTo(random.randint(100, 500), random.randint(100, 500), duration=0.5) 17 | 18 | # Click the cookie decline button 19 | cookie_decline_button = driver.find_element(By.ID, "onetrust-reject-all-handler") 20 | cookie_decline_button.click() 21 | 22 | # Simulate human-like behavior 23 | time.sleep(random.uniform(1, 3)) 24 | pyautogui.moveTo(random.randint(100, 500), random.randint(100, 500), duration=0.5) 25 | 26 | # Click the bestelSimkaartButton (located via its absolute XPath) 27 | bestel_simkaart_button = driver.find_element(By.XPATH, "/html/body/div[2]/div[1]/div[1]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[2]/div[1]/div/div[1]/div[2]/div[3]/button") 28 | bestel_simkaart_button.click() 29 | time.sleep(2) 30 | 31 | # Wait for the new page to load 32 | time.sleep(5) 33 | -------------------------------------------------------------------------------- /lidl_GB.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Lidl GB 3 | Author : Ajeet 4 | Date : 07/06/2023 5 | """ 6 | import time 7 | from selenium import webdriver 8 | from selenium.webdriver import
ChromeOptions, Keys 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support import expected_conditions as EC 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | 13 | options = ChromeOptions() 14 | options.add_argument("--start-maximized") 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | 17 | driver = webdriver.Chrome(options=options) 18 | wait = WebDriverWait(driver, 10) 19 | url = "https://www.lidl.co.uk/about-us/store-finder-opening-hours#" 20 | driver.get(url) 21 | 22 | # wait for element to get located to click the "ACCEPT" cookies button 23 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button.cookie-alert-extended-button"))).click() 24 | # wait for element to get located to click the "STORE SEARCH" button 25 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button.nuc-m-button.nuc-a-button"))).click() 26 | # wait for element to get located to Enter post code or city 27 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[placeholder="Enter post code or city"]'))).send_keys('London') 28 | 29 | time.sleep(10) 30 | 31 | 32 | """ 33 | Few things to note: 34 | 35 | 1. First, as we hit the URL, a cookie pops up that we can accept to continue. So we wait for this pop-up and click on the ACCEPT button. 36 | 2. Next, we wait for the STORE SEARCH button to get located and then click. 37 | 3. It loads a side search box where we can enter the city or the postcode to search. So we wait for this to get loaded/located in order to enter the query. we can use send_keys() method to enter/input either the city name or the postcode. 38 | 39 | for example, as we enter the city name/postcode (London), a dropdown list appears with available stores in that region, you can choose accordingly and proceed further. 
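A sketch of picking the first suggestion from that dropdown (the selector here is illustrative,
not taken from the site, so adjust it to the actual markup):

    first_store = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'ul > li:first-child')))  # hypothetical selector
    first_store.click()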
40 | reference: 41 | https://stackoverflow.com/questions/76392044/how-can-i-locate-and-enter-text-in-the-search-box-on-lidls-website-using-seleni 42 | """ -------------------------------------------------------------------------------- /load_cookies_to_accept_all.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver import ChromeOptions, Chrome 4 | from selenium.webdriver.common.by import By 5 | 6 | options = ChromeOptions() 7 | 8 | # to start maximized screen 9 | options.add_argument("--start-maximized") 10 | # to remove 'Chrome is being controlled by automated software' 11 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 12 | 13 | options.add_experimental_option("useAutomationExtension", False) 14 | 15 | driver = Chrome(options=options) 16 | 17 | driver.get("https://langsungkerja.id/registration/") 18 | 19 | driver.add_cookie({"name": "cookieyes-consent", "value": "consent:yes,action:yes"}) 20 | driver.refresh() 21 | 22 | driver.find_element(By.CSS_SELECTOR, 'button.tutor-btn.tutor-btn-primary').click() 23 | time.sleep(1) 24 | 25 | -------------------------------------------------------------------------------- /ma_shienkikan.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.chrome.service import Service 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | from selenium.webdriver.common.keys import Keys 7 | from selenium.webdriver.chrome.options import Options 8 | from webdriver_manager.chrome import ChromeDriverManager 9 | 10 | # Initialize an empty list to store scraped data 11 | data = [] 12 | 13 | # Function to configure Chrome options for stealth scraping 14 | def get_stealth_chrome_options(): 15 | options = Options() 16 | # Set headless mode (optional, uncomment to avoid loading browser UI) 17 | # options.add_argument("--headless=new") 18 | options.add_argument("--disable-blink-features=AutomationControlled") 19 | options.add_argument("--disable-extensions") 20 | options.add_argument("--disable-infobars") 21 | options.add_argument("--disable-popup-blocking") 22 | options.add_argument("--no-sandbox") 23 | options.add_argument("--disable-dev-shm-usage") 24 | options.add_argument("--remote-debugging-port=9222") 25 | options.add_argument("--window-size=1920,1080") 26 | options.add_argument( 27 | "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36") 28 | 29 | # Suppress logging to reduce unnecessary output 30 | options.add_argument("--log-level=3") 31 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 32 | options.add_experimental_option("useAutomationExtension", False) 33 | 34 | # Ensure better resource handling for long scripts 35 | options.add_argument("--disable-gpu") 36 | options.add_argument("--enable-logging") 37 | return options 38 | 39 | 40 | # Set up the WebDriver with configured options 41 | service = Service(ChromeDriverManager().install()) 42 | options = get_stealth_chrome_options() 43 | browser = webdriver.Chrome(service=service, options=options) 44 | wait = WebDriverWait(browser, 10) 45 | 46 | try: 47 | # Navigate to the target website 48 | browser.get("https://library.usask.ca/#gsc.tab=0") 49 | print("[INFO] 
Successfully loaded the website.") 50 | 51 | # Locate the search field and input query 52 | q_field = browser.find_element(By.ID, "primoQueryTemp") 53 | q_field.send_keys("artificial intelligence") 54 | q_field.send_keys(Keys.ENTER) 55 | print("[INFO] Search query submitted.") 56 | 57 | # Wait for the search results container to be visible 58 | results_container = wait.until( 59 | EC.presence_of_element_located((By.ID, "searchResultsContainer")) 60 | ) 61 | print("[INFO] Search results container loaded.") 62 | 63 | # Scrape the first 10 search results 64 | for i in range(1, 11): 65 | try: 66 | # Locate each search result container by its XPath 67 | container = results_container.find_element(By.XPATH, f"//*[@id='searchResultsContainer']/div[{i}]") 68 | 69 | # Extract relevant information for each result 70 | item_data = { 71 | "item_number": container.find_element(By.CLASS_NAME, "list-item-count").text, 72 | "media_type": container.find_element(By.CSS_SELECTOR, "div.media-content-type.align-self-start").text, 73 | "image": container.find_element(By.CLASS_NAME, "media-thumbnail") 74 | .find_element(By.CSS_SELECTOR, "div:nth-child(1) > img") 75 | .get_attribute("src"), 76 | "item_title": container.find_element(By.CLASS_NAME, "item-title").text, 77 | } 78 | data.append(item_data) 79 | # print(f"[INFO] Scraped item {i}: {item_data}") 80 | except Exception as e: 81 | print(f"[WARNING] Error scraping item {i}: {e}") 82 | 83 | # Print the collected data 84 | print("[INFO] Scraping completed successfully.") 85 | print(data) 86 | 87 | except Exception as e: 88 | print(f"[ERROR] An error occurred: {e}") 89 | 90 | finally: 91 | # Ensure the browser is properly closed 92 | browser.quit() 93 | print("[INFO] Browser closed.") 94 | -------------------------------------------------------------------------------- /mercedes-benz.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : mercedes-benz Scrapper 3 | Author : Ajeet 4 | Date : 06/06/2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | from selenium.webdriver import ChromeOptions, Keys 10 | from selenium.webdriver.common.by import By 11 | 12 | options = ChromeOptions() 13 | 14 | options.add_argument("--start-maximized") 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | driver = webdriver.Chrome(options=options) 17 | 18 | url = "https://www.mercedes-benz.co.in/passengercars/buy/new-car/search-results.html/?emhsort=price-asc&emhvehicleAssortment=vehicles&emhstockType=IN_STOCK" 19 | driver.get(url) 20 | time.sleep(5) 21 | # click on the "Agree to all" button to proceed 22 | shadow_element_1 = driver.find_element(By.CSS_SELECTOR, "cmm-cookie-banner.hydrated").shadow_root 23 | shadow_element_1.find_element(By.CSS_SELECTOR, 'div.button-group').find_element(By.XPATH, 'button[text()="Agree to all"]').click() 24 | 25 | # enter the pin code to proceed further 26 | shadow_element_2 = driver.find_element(By.CSS_SELECTOR, 'dh-io-emh-region-picker[class="webcomponent webcomponent-nested"]').shadow_root 27 | region_picker = shadow_element_2.find_element(By.CSS_SELECTOR, 'input#postCodeInput') 28 | region_picker.send_keys(110001) 29 | region_picker.send_keys(Keys.ENTER) 30 | 31 | # parse the search results 32 | shadow_element_3 = driver.find_element(By.CSS_SELECTOR, 'emh-search-result[data-component-name="emh-search-result"]').shadow_root 33 | search_container = shadow_element_3.find_element(By.CSS_SELECTOR, 
'div.dcp-cars-srp__results.dcp-cars-srp-results.srp-grid-layout__results') 34 | results = search_container.find_elements(By.CSS_SELECTOR, 'div.dcp-cars-srp-results__tile') 35 | 36 | for result in results: 37 | print(result.find_element(By.CSS_SELECTOR, 'h2.wb-vehicle-tile__title').text) 38 | 39 | time.sleep(5) 40 | 41 | """ 42 | output: 43 | 44 | GLB200 45 | GLB200 46 | GLB200 47 | C220d MY23 48 | C220d MY23 49 | C220d MY23 50 | C220d MY23 51 | C220d MY23 52 | C220d MY23 53 | C220d MY23 54 | C220d MY23 55 | C220d MY23 56 | """ 57 | 58 | """ 59 | Things to notice: 60 | 61 | 1. First of all, we need to find and click on the Agree to all button which lies under a shadow-root. 62 | 2. Next, we need to find and input the pin code (which again lies under a shadow-root) to proceed further. 63 | 3. Finally, we get to the search page where we can see 12 different search results. We find the element (this element also lies under a shadow-root) which contains the search results data. 64 | 65 | The variable results holds all the 12 results on the page and we can iterate over it to extract/parse all the pieces of information. 66 | 67 | reference: 68 | https://stackoverflow.com/questions/76408371/why-does-xpath-half-work-in-this-web-page 69 | """ -------------------------------------------------------------------------------- /mydealz_de.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 12, 2023 5 | """ 6 | 7 | import time 8 | from selenium.webdriver import Chrome 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.wait import WebDriverWait 11 | import selenium.webdriver.support.expected_conditions as EC 12 | 13 | 14 | driver = Chrome() 15 | driver.get("https://www.mydealz.de/register") 16 | wait = WebDriverWait(driver, 10) 17 | 18 | # accept all cookies 19 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'button[data-t="acceptAllBtn"]'))).click() 20 | 21 | checkboxes = driver.find_elements(By.CSS_SELECTOR, 'span.tGrid-cell.tGrid-cell--shrink') 22 | # select the 2nd checkbox 23 | checkboxes[1].click() 24 | # Similarly, you can also select the 1st checkbox using checkboxes[0].click() 25 | 26 | time.sleep(2) 27 | 28 | """ 29 | reference: 30 | https://stackoverflow.com/questions/76453368/how-to-click-a-checkbox-by-driver-find-elementid-in-python 31 | """ 32 | -------------------------------------------------------------------------------- /nested_shadow_root.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Wallet Polygon Technology 3 | Author : Ajeet 4 | Date : July 12, 2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | from selenium.webdriver import ChromeOptions 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support import expected_conditions as EC 12 | from selenium.webdriver.support.wait import WebDriverWait 13 | 14 | options = ChromeOptions() 15 | options.add_argument("--start-maximized") 16 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 17 | 18 | driver = webdriver.Chrome(options=options) 19 | wait = WebDriverWait(driver, 10) 20 | url = "https://wallet.polygon.technology/?redirectOnConnect=zkEVM_bridge" 21 | 22 | driver.get(url) 23 | # click on the "Connect to a Wallet" button 24 | wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.navbar__apps-section__auth__login"))).click() 25 | time.sleep(2) 26 | 27 | # 
---------------------------------------------------------------------------------------------------------------------- 28 | driver.execute_script( 29 | """document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-connect-wallet-view').shadowRoot.querySelector('w3m-desktop-wallet-selection').shadowRoot.querySelector('w3m-modal-footer').querySelectorAll('w3m-wallet-button')[0].shadowRoot.querySelector('button').click();""") 30 | 31 | # ---------------------------------------------------------------------------------------------------------------------- 32 | time.sleep(5) 33 | 34 | """ 35 | - Various elements on this website are embedded inside the shadow-root. 36 | - for example, your target/desired button is embedded in a 5-layer nested shadow-root. 37 | - After clicking on the Connect to a Wallet, we wait for 1-2 seconds just to make sure that the overlay window is 38 | visibly present, although it appears very quickly. 39 | - The used javascript query to locate and click on the desired button: 40 | 41 | document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-connect-wallet-view').shadowRoot.querySelector('w3m-desktop-wallet-selection').shadowRoot.querySelector('w3m-modal-footer').querySelectorAll('w3m-wallet-button')[0].shadowRoot.querySelector('button').click(); 42 | 43 | will click on the very first wallet, if you like to click on the 2nd or 3rd wallet option, just simply replace 44 | the querySelectorAll('w3m-wallet-button')[0] with querySelectorAll('w3m-wallet-button')[1] or 45 | querySelectorAll('w3m-wallet-button')[2] respectively in the above-mentioned javascript query. 46 | 47 | reference: 48 | https://stackoverflow.com/questions/76658230/selenium-how-to-get-element-in-shadow-root-of-html-page-code 49 | """ -------------------------------------------------------------------------------- /nse_india.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Ajeet 3 | Created: 1/11/2025 4 | Description: This script automates the process of navigating to the NSE India announcements page, 5 | selecting the SME tab, switching to the "1W" (1 Week) filter, and downloading the 6 | announcements in a CSV file format. 
announcements in a CSV file format.
7 | Project: Automation 8 | """ 9 | import time 10 | import undetected_chromedriver as uc 11 | from selenium.webdriver.common.by import By 12 | from selenium.webdriver.support.ui import WebDriverWait 13 | from selenium.webdriver.support import expected_conditions as EC 14 | 15 | # Initialize the Selenium WebDriver (using undetected_chromedriver to bypass bot detection) 16 | driver = uc.Chrome() 17 | 18 | # Define an explicit wait for elements 19 | wait = WebDriverWait(driver, 10) 20 | 21 | try: 22 | # Step 1: Open the NSE India announcements page 23 | print("Opening NSE announcements page...") 24 | driver.get("https://www.nseindia.com/companies-listing/corporate-filings-announcements") 25 | 26 | # Step 2: Select the SME tab 27 | sme_tab = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#containTabNav > li:nth-child(2) > a"))) 28 | sme_tab.click() 29 | time.sleep(2) # Pause to allow the page content to load 30 | 31 | # Step 3: Select the "1W" (1 Week) tab 32 | one_week_tab = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[id="Announcements_sme"]>div:nth-child(2)>div>div.block-detail-dates-box>div>div>ul>li:nth-child(2)'))) 33 | one_week_tab.click() 34 | time.sleep(2) # Pause to allow the filtered content to load 35 | 36 | # Step 4: Wait for the table containing announcements to load 37 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#CFanncsmeTable>tbody>tr>td>a'))) 38 | 39 | # Step 5: Download the CSV file 40 | print("Downloading CSV file...") 41 | download = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#CFanncsme-download'))) 42 | download.click() 43 | 44 | # Pause to allow the download process to complete 45 | time.sleep(3) 46 | print(f"File downloaded!") 47 | 48 | except Exception as e: 49 | # Handle any unexpected errors and print a user-friendly message 50 | print(f"An unexpected error occurred: {e}") 51 | 52 | """ 53 | output: 54 | Opening NSE announcements page... 55 | Downloading CSV file... 56 | File downloaded! 
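Note: the fixed time.sleep(3) above is a simplification; a sturdier pattern is to poll
the download directory until the file appears (a sketch; download_dir and the file name
are assumptions, not taken from the site):

    import os, time
    csv_path = os.path.join(download_dir, 'CF-anncsme.csv')  # hypothetical file name
    for _ in range(30):
        if os.path.exists(csv_path):
            break
        time.sleep(1)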
57 | 58 | stackoverflow link: https://stackoverflow.com/a/79349087/11179336 59 | """ -------------------------------------------------------------------------------- /nse_india_2.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.webdriver.chrome.options import Options 7 | from selenium.webdriver.common.action_chains import ActionChains 8 | from selenium.common.exceptions import TimeoutException 9 | 10 | options = Options() 11 | options.add_argument("--start-maximized") 12 | options.add_argument( 13 | "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36" 14 | ) 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | options.add_experimental_option("useAutomationExtension", False) 17 | 18 | # Initialize WebDriver 19 | with webdriver.Chrome(options=options) as driver: 20 | wait = WebDriverWait(driver, 10) 21 | action = ActionChains(driver) 22 | 23 | try: 24 | print("Opening NSE announcements page...") 25 | driver.get("https://www.nseindia.com/companies-listing/corporate-filings-announcements") 26 | 27 | # Select SME tab 28 | sme_tab = wait.until( 29 | EC.presence_of_element_located((By.CSS_SELECTOR, "#containTabNav > li:nth-child(2) > a")) 30 | ) 31 | action.move_to_element(sme_tab).click().perform() 32 | time.sleep(2) 33 | 34 | # Select '1W' tab 35 | one_week_tab = wait.until( 36 | EC.presence_of_element_located((By.CSS_SELECTOR, 37 | 'div[id="Announcements_sme"]>div:nth-child(2)>div>div.block-detail-dates-box>div>div>ul>li:nth-child(2)')) 38 | ) 39 | action.move_to_element(one_week_tab).click().perform() 40 | time.sleep(2) 41 | 42 | # Wait for the table to load 43 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#CFanncsmeTable>tbody>tr>td>a'))) 44 | 45 | # Download the CSV 46 | print("Downloading CSV file...") 47 | download = wait.until( 48 | EC.presence_of_element_located((By.CSS_SELECTOR, '#CFanncsme-download')) 49 | ) 50 | action.move_to_element(download).click().perform() 51 | 52 | # Wait for the download to complete 53 | time.sleep(5) 54 | print(f"File downloaded!") 55 | 56 | except TimeoutException as e: 57 | print(f"Timeout occurred: {e}") 58 | print("Please try running the script again.") 59 | 60 | except Exception as e: 61 | print(f"An unexpected error occurred: {e}") 62 | -------------------------------------------------------------------------------- /oddsportal_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : July 26, 2023 5 | """ 6 | 7 | 8 | import time 9 | import threading 10 | import pandas as pd 11 | from math import nan 12 | from datetime import datetime, timedelta 13 | from multiprocessing.pool import ThreadPool 14 | from bs4 import BeautifulSoup as bs 15 | import undetected_chromedriver as uc 16 | from selenium import webdriver 17 | from selenium.webdriver.support import expected_conditions as EC 18 | from selenium.webdriver.support.wait import WebDriverWait 19 | from selenium.webdriver.common.by import By 20 | pd.set_option('display.max_rows', 500) 21 | pd.set_option('display.max_columns', 500) 22 | pd.set_option('display.width', 1000) 23 | 24 | class Driver: 25 | def __init__(self): 26 | 
options = webdriver.ChromeOptions() 27 | self.driver = uc.Chrome(options=options) 28 | 29 | def __del__(self): 30 | self.driver.quit() # clean up driver when we are cleaned up 31 | 32 | 33 | threadLocal = threading.local() 34 | 35 | 36 | def create_driver(): 37 | the_driver = getattr(threadLocal, 'the_driver', None) 38 | if the_driver is None: 39 | the_driver = Driver() 40 | setattr(threadLocal, 'the_driver', the_driver) 41 | return the_driver.driver 42 | 43 | 44 | class GameData: 45 | def __init__(self): 46 | self.date = [] 47 | self.time = [] 48 | self.game = [] 49 | self.score = [] 50 | self.home_odds = [] 51 | self.draw_odds = [] 52 | self.away_odds = [] 53 | self.country = [] 54 | self.league = [] 55 | 56 | 57 | def generate_matches(pgSoup, defaultVal=None): 58 | evtSel = { 59 | 'time': 'div>div>div[class="flex basis-[10%]"]', 60 | 'game': 'a div:has(>a[title])', 61 | 'score': 'a[title]~div:not(.hidden)', 62 | 'home_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(2)', 63 | 'draw_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(3)', 64 | 'away_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(4)' 65 | } 66 | 67 | events, current_group = [], {} 68 | pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]') 69 | if pgDate: pgDate = pgDate.get_text().split(',', 1)[-1].strip() 70 | for evt in pgSoup.select('div[set]>div:last-child'): 71 | if evt.parent.select(f':scope>div:first-child+div+div'): 72 | cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [ 73 | evt.parent.select_one(s) for s in 74 | [':scope>div:first-child+div>div:first-child', 75 | ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)', 76 | ':scope>div:first-child>a:nth-of-type(3):last-of-type']]] 77 | current_group = dict(zip(['date', 'country', 'league'], cgVals)) 78 | if pgDate: current_group['date'] = pgDate 79 | 80 | evtRow = {'date': current_group.get('date', defaultVal)} 81 | 82 | for k, v in evtSel.items(): 83 | v = evt.select_one(v).get_text(' ') if evt.select_one(v) else defaultVal 84 | evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v 85 | # evtTeams = evt.select('a div>a[title]') 86 | evtTeams = evt.select('div[class^="relative w-full flex-col"]>a') 87 | evtRow['game'] = ' – '.join(a['title'] for a in evtTeams) 88 | evtRow['country'] = current_group.get('country', defaultVal) 89 | evtRow['league'] = current_group.get('league', defaultVal) 90 | 91 | events.append(evtRow) 92 | return events 93 | 94 | 95 | def parse_data(url, return_urls=False): 96 | print(f'Parsing URL: {url}\n') 97 | browser = create_driver() 98 | browser.get(url) 99 | WebDriverWait(browser, 20).until(EC.presence_of_all_elements_located( 100 | (By.CSS_SELECTOR, "div[set]>div:last-child"))) 101 | # ########## For page to scroll to the end ########### 102 | scroll_pause_time = 2 103 | 104 | # Get scroll height 105 | last_height = browser.execute_script("return document.body.scrollHeight") 106 | 107 | while True: 108 | # Scroll down to bottom 109 | browser.execute_script("window.scrollTo(0, document.body.scrollHeight);") 110 | 111 | # Wait to load page 112 | time.sleep(scroll_pause_time) 113 | 114 | # Calculate new scroll height and compare with last scroll height 115 | new_height = browser.execute_script("return document.body.scrollHeight") 116 | if new_height == last_height: 117 | break 118 | last_height = new_height 119 | # ########## For page to scroll to the end ########### 120 | time.sleep(5) 121 | soup = 
bs(browser.page_source, "lxml") 122 | 123 | game_data = GameData() 124 | game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)] 125 | for row in generate_matches(soup, defaultVal=nan): 126 | for k in game_keys: getattr(game_data, k).append(row.get(k, nan)) 127 | if return_urls: 128 | ac_sel = 'div:has(>a.active-item-calendar)' # a_cont selector 129 | a_sel = f'{ac_sel}>a[href]:not([href^="#"]):not(.active-item-calendar)' 130 | a_tags = soup.select(a_sel) 131 | 132 | if a_tags: 133 | urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags] 134 | print(f'urls after initial creation: {urls}') 135 | 136 | # Extract the date from the first URL 137 | last_date_str = urls[0].split('/')[-2] 138 | print(f'last date str: {last_date_str}') 139 | last_date = datetime.strptime(last_date_str, '%Y%m%d') 140 | 141 | # Generate the additional URLs 142 | for i in range(1, 4): 143 | new_date = last_date - timedelta(days=i) 144 | new_date_str = new_date.strftime('%Y%m%d') 145 | new_url = f'https://www.oddsportal.com/matches/football/{new_date_str}/' 146 | urls.append(new_url) 147 | print(f'urls after generating additional URL #{i}: {urls}') 148 | else: 149 | urls = [] 150 | 151 | print(f'final urls: {urls}') 152 | 153 | if urls and urls[-1].startswith('https://www.oddsportal.com/matches/football/'): 154 | # Extract the date from the first URL (the list ends with the generated ones) 155 | last_date_str = urls[0].split('/')[-2] 156 | print(last_date_str) 157 | else: 158 | print('No valid URLs found') 159 | return game_data, urls 160 | return game_data 161 | 162 | 163 | if __name__ == '__main__': 164 | games = None 165 | pool = ThreadPool(5) 166 | # Get today's data and the Urls for the other days: 167 | url_today = 'https://www.oddsportal.com/matches/soccer' 168 | game_data_today, urls = pool.apply(parse_data, args=(url_today, True)) 169 | game_data_results = pool.imap(parse_data, urls) 170 | 171 | # ########################### BUILD DATAFRAME ############################ 172 | game_data_dfList, added_todayGame = [], False 173 | for game_data in game_data_results: 174 | try: 175 | game_data_dfList.append(pd.DataFrame(game_data.__dict__)) 176 | if not added_todayGame: 177 | game_data_dfList += [pd.DataFrame(game_data_today.__dict__)] 178 | added_todayGame = True 179 | except Exception as e: 180 | game_n = len(game_data_dfList) + 1 181 | print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}') 182 | try: 183 | games = pd.concat(game_data_dfList, ignore_index=True) 184 | except Exception as e: 185 | print('Error concatenating DataFrames:', repr(e)) 186 | # ######################################################################### 187 | print('!?NO GAMES?!' if games is None else games) 188 | # ensure all the drivers are "quitted": 189 | del threadLocal # a little extra insurance 190 | import gc 191 | 192 | gc.collect() 193 | 194 | games.to_csv('oddsportal_games.csv', index=False) # without a file name, to_csv() only returns a string -------------------------------------------------------------------------------- /pump_fun.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Ajeet 3 | Created: 1/6/2025 4 | Description: 5 | This script automates the process of interacting with the 'https://pump.fun' website. 6 | It performs the following actions: 7 | 1. Bypasses automation detection using custom Chrome options. 8 | 2. Clicks the "I'm ready to pump" button on a pop-up. 9 | 3. Handles the "Reject All" cookies dialog. 10 | 4. Retrieves and processes specific elements matching a CSS selector pattern. 11 | 5.
Prints the total count and content of the matching elements. 12 | 13 | Project: Automation 14 | """ 15 | from selenium import webdriver 16 | from selenium.webdriver.common.by import By 17 | from selenium.webdriver.chrome.options import Options 18 | from selenium.webdriver.support.wait import WebDriverWait 19 | from selenium.webdriver.support import expected_conditions as EC 20 | 21 | # Set up Chrome options to bypass automation detection 22 | options = Options() 23 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 24 | options.add_experimental_option("useAutomationExtension", False) 25 | 26 | # Initialize the WebDriver with the specified options 27 | driver = webdriver.Chrome(options=options) 28 | driver.maximize_window() 29 | 30 | # Navigate to the target URL 31 | driver.get('https://pump.fun') 32 | # Initialize an explicit wait with a timeout of 10 seconds 33 | wait = WebDriverWait(driver, 10) 34 | 35 | try: 36 | # Step 1: Wait for the "I'm ready to pump" button to appear and click it 37 | ready_button = wait.until(EC.presence_of_element_located( 38 | (By.CSS_SELECTOR, '#radix-\:r0\: > div.mt-3 > button') 39 | )) 40 | ready_button.click() 41 | 42 | # Step 2: Wait for the "Reject All" cookies button to appear and click it 43 | cookies_button = wait.until(EC.presence_of_element_located( 44 | (By.CSS_SELECTOR, "#btn-reject-all") 45 | )) 46 | cookies_button.click() 47 | 48 | # Step 3: Wait for the visibility of all div elements with IDs ending in "pump" and retrieve them 49 | div_elements = wait.until(EC.visibility_of_all_elements_located( 50 | (By.CSS_SELECTOR, 'div.grid.grid-col-1>div[id$="pump"]') 51 | )) 52 | 53 | # Print the total count of matching div elements 54 | print(f"Total result count: {len(div_elements)}") 55 | 56 | # Step 4: Iterate through the retrieved div elements and print their content 57 | for idx, div in enumerate(div_elements, start=1): 58 | print(f"------------ {idx} result ------------") 59 | print(div.text) # Visible text content of the div 60 | 61 | except Exception as e: 62 | # Handle unexpected errors and print the error message 63 | print(f"An unexpected error occurred: {e}") 64 | 65 | finally: 66 | # Ensure the driver is closed to release resources 67 | driver.quit() 68 | 69 | """ 70 | output: 71 | Total result count: 46 72 | ------------ 1 result ------------ 73 | created by 74 | DoSVMa 75 | 1h ago 76 | market cap: $18.2K 77 | replies: 40 78 | OFFICIAL TRUMP FAMILY (OTF): OFFICIAL TRUMP FAMILY 79 | ------------ 2 result ------------ 80 | created by 81 | A4FACP 82 | 1d ago 83 | market cap: $13.0K 84 | replies: 20 85 | NexantAI (NEXANT): Nexant – the AI agent with a mission to build the most groundbreaking blockchain ever. Powered by limitless knowledge, cutting-edge innovation, and a sprinkle of chaotic genius, Nexant is here to redefine decentralization. 🌌💡 86 | ------------ 3 result ------------ 87 | created by 88 | DcpAyb 89 | 9h ago 90 | market cap: $67.0K 91 | [ 92 | ] 93 | replies: 317 94 | Apeshit Alvin (Alvin): doing apeshit things with Alvin. 95 | ------------ 4 result ------------ 96 | created by 97 | 129uzz 98 | 5m ago 99 | market cap: $7.0K 100 | replies: 6 101 | Donald Pump (DNLDPMP): Never sell this coin just buy a dollar worth and we will get rich! 102 | ------------ 5 result ------------ 103 | created by 104 | GwTgqv 105 | 3h ago 106 | market cap: $15.2K 107 | replies: 287 108 | Official Melania Fart Coin (OMFC): Melania Trumps Official Fart Coin is here to set the world ablaze. 
Her looks are breath taking and her farts are astronomical and magical. Come get a wiff of the absolute magnificent smell of the first lady's farts 109 | ------------ 6 result ------------ 110 | created by 111 | CVvJnD 112 | 35m ago 113 | market cap: $7.3K 114 | replies: 11 115 | U Should Do Time (USDT): 116 | ------------ 7 result ------------ 117 | created by 118 | zJfoJE 119 | 18h ago 120 | market cap: $7.3K 121 | replies: 10 122 | Trump and Elon (Trump&Elon): Tump&Elon official 123 | ------------ 8 result ------------ 124 | created by 125 | Eysnef 126 | 6h ago 127 | market cap: $29.2K 128 | [ 129 | ] 130 | replies: 128 131 | Barron Meme (BARRON): Barron Meme 132 | ------------ 9 result ------------ 133 | created by 134 | BJN3k9 135 | 31m ago 136 | market cap: $9.1K 137 | replies: 23 138 | EarCoin (EarCoin): *** NO UTILITY, JUST FOR THOSE WHICH LOVE TRUMP 139 | ------------ 10 result ------------ 140 | created by 141 | FbXpLa 142 | 32m ago 143 | market cap: $4.8K 144 | [ 145 | ] 146 | replies: 20 147 | Official X Rat Wif Hat (RATWIFHAT): 148 | ------------ 11 result ------------ 149 | created by 150 | Fg7fFK 151 | 38m ago 152 | market cap: $7.2K 153 | replies: 38 154 | DONA TRUMPINA (FIRSTLADY): DONA TRUMPINA 155 | ------------ 12 result ------------ 156 | created by 157 | 5BCkFt 158 | 41m ago 159 | market cap: $6.8K 160 | replies: 19 161 | Official Vise President (JD Vance): 162 | ------------ 13 result ------------ 163 | created by 164 | GtdNeB 165 | 5h ago 166 | market cap: $321.5K 167 | [ 168 | ] 169 | replies: 199 170 | Be Best (BB): 171 | ------------ 14 result ------------ 172 | created by 173 | 9W1L5Y 174 | 13d ago 175 | market cap: $7.5K 176 | replies: 30 177 | TRUMP BUTTHOLE FART NUTS (TBHFN): 🇺🇲 178 | ------------ 15 result ------------ 179 | created by 180 | 7AkdDR 181 | 17m ago 182 | market cap: $7.3K 183 | replies: 15 184 | I HAVE A COIN (IHAVEACOIN): 185 | ------------ 16 result ------------ 186 | created by 187 | Fq1R9G 188 | 56m ago 189 | market cap: $7.5K 190 | replies: 17 191 | Captain America Melania (CAM): Captain America Melania 192 | ------------ 17 result ------------ 193 | created by 194 | DsXDQs 195 | 38m ago 196 | market cap: $7.5K 197 | replies: 8 198 | Javier Milei Official (Milei): The Official Milei Argentina is live! 199 | ------------ 18 result ------------ 200 | created by 201 | 5h7Ymr 202 | 6h ago 203 | market cap: $15.4K 204 | replies: 52 205 | Ivanka (IVANKA): 206 | ------------ 19 result ------------ 207 | created by 208 | 8bTDDQ 209 | 27m ago 210 | market cap: $7.3K 211 | replies: 7 212 | LeBarron James (LEBARRON): 213 | ------------ 20 result ------------ 214 | created by 215 | 7bueRj 216 | 9h ago 217 | market cap: $31.0K 218 | replies: 53 219 | Weber AI (WEBAI): Launch your memecoin website instantly. An AI powered tool leveraging prompt-to-CSS technology and fine-tuned for memecoin themes. 220 | ------------ 21 result ------------ 221 | created by 222 | 4Gbd3n 223 | 10m ago 224 | market cap: $7.0K 225 | replies: 16 226 | This is the sky (Tits): 227 | ------------ 22 result ------------ 228 | created by 229 | 4FxSjy 230 | 1h ago 231 | market cap: $8.0K 232 | replies: 11 233 | $TTDS Defends Freedom of Speech (TTDS ): Trump Saves TikTok. Defends Freedom of Speech MEME $TTDS President Trump turned the tide, saved TikTok, and defended the American people's freedom of speech! 
234 | ------------ 23 result ------------ 235 | created by 236 | 4QW2bE 237 | 17m ago 238 | market cap: $7.3K 239 | replies: 10 240 | GOD Sent Us Trump (GSUT): GOD sent us trump to fill our bags. In a world where memes drive the culture, God Sent Us Trump is here to make its mark! This token celebrates the spirit 241 | of unshakable leadership, bold visions, and the meme-worthy moments that brought us together. Whether you see Trump as a divine blessing, a larger-than-life icon, or the ultimate meme muse, this token captures it all in a fun, lighthearted way! 242 | ------------ 24 result ------------ 243 | created by 244 | 8bVKXK 245 | 3h ago 246 | market cap: $7.4K 247 | replies: 13 248 | OFFICIAL CREED (CREED): The official Creed Coin! Can take me higher! 249 | ------------ 25 result ------------ 250 | created by 251 | 972BGm 252 | 2h ago 253 | market cap: $17.2K 254 | [ 255 | ] 256 | replies: 51 257 | Donald Trump Family 6900 (DTF6900): An index tracking the performance of the Trump family memes. 258 | ------------ 26 result ------------ 259 | Video 260 | created by 261 | 7rmUwY 262 | 4h ago 263 | market cap: $7.0K 264 | replies: 12 265 | Bank of Ai Agents (BankofAi): Welcome to Bank of Ai, where we revolutionize the way token holders receive their funds globally. Our cutting-edge technology enables seamless transfe 266 | rs to token holders around the world, ensuring speed and security. Bank of Ai agents are designed to automate the execution of agreements without the need for intermediaries or tim 267 | e delays. Ai Bank agents nodes execute the contract. Your personal Ai Bank agents pay out in USDC around the clock. Each token is one Ai Bank agent. 100 tokens minimum hold for ai 268 | agent pay. Bank of a i agents are designed to automate the execution of agreements without the need for intermediaries or time delays. Ai Bank agents nodes execute the contract. Yo 269 | ur personal A i Bank agents pay out in USDC around the clock. Each token is one Ai Bank agent. 100 tokens minimum hold for ai agent pay. Be sure to check out our YouTube channel Bank of Ai and Join us! Regards, Agent Ai 270 | ------------ 27 result ------------ 271 | created by 272 | 5zA23t 273 | 1h ago 274 | market cap: $8.4K 275 | replies: 127 276 | Elon Trenches Fighter (ETF): AFTER DONALD ELON WILL RULE THE TRENCHES 277 | ------------ 28 result ------------ 278 | created by 279 | EKRVV5 280 | 5m ago 281 | market cap: $7.0K 282 | replies: 4 283 | U Should Dump Crypto (USDC): 284 | ------------ 29 result ------------ 285 | created by 286 | Bc7azw 287 | 37m ago 288 | market cap: $7.2K 289 | replies: 10 290 | Inauguration of (IOS): It’s not only Trumps inauguration. It’s also solana’s. 291 | ------------ 30 result ------------ 292 | created by 293 | HXfnVz 294 | 14m ago 295 | market cap: $18.5K 296 | replies: 31 297 | Tied Up & Tickled Til 50 Mil (Tickled): I haven't found a job yet so I'm doing weird kink shit for money. 
Tied Up & Tickled until $5 million $500,000 - wedgies $1 million - Visqueen / Slime $3 million marketcap - Pie $4 million - Antiqued $5 million marketcap - Head Shaving, burn dev wallet 298 | ------------ 31 result ------------ 299 | created by 300 | 57Kn8x 301 | 9m ago 302 | market cap: $6.8K 303 | replies: 10 304 | AmericaFirst.Fun (FIRST): AmericaFirst.Fun 305 | ------------ 32 result ------------ 306 | created by 307 | 82tLwz 308 | 45m ago 309 | market cap: $11.4K 310 | replies: 59 311 | TRUMPIUS MAX (TRUМРIUS): Make Pump Great Again 312 | ------------ 33 result ------------ 313 | created by 314 | EX8PZk 315 | 32m ago 316 | market cap: $6.8K 317 | replies: 9 318 | Rare White Bamby (BAMBY): Rare White Bamby 319 | ------------ 34 result ------------ 320 | Video 321 | created by 322 | DrGA8L 323 | 2 months ago 324 | market cap: $6.9K 325 | replies: 9 326 | Purgatory ($INNER): From dust, we are created, and dust we will return. We are the disobedient. 327 | ------------ 35 result ------------ 328 | created by 329 | 7BgTDJ 330 | 3h ago 331 | market cap: $104.8K 332 | [ 333 | ] 334 | replies: 74 335 | Trump Family Index (TFI500): Trump Family Index 336 | ------------ 36 result ------------ 337 | created by 338 | 8pijxj 339 | 6h ago 340 | market cap: $12.6K 341 | [ 342 | ] 343 | replies: 74 344 | Ninicoin (Nini): Tao Lin cure my poverty 345 | ------------ 37 result ------------ 346 | created by 347 | J1AoDU 348 | 9h ago 349 | market cap: $29.1K 350 | [ 351 | ] 352 | replies: 304 353 | President Troog (Troog): It’s huge, folks. President Troog learns it’s connected to the stars—big cosmic secrets, the best secrets. Look for artifacts. Trust me, it’s going to be tremendous! 354 | ------------ 38 result ------------ 355 | created by 356 | 4dMoLv 357 | 3h ago 358 | market cap: $7.7K 359 | replies: 22 360 | Baby Elon Musk (BabyElon): We’re going to win so much. You’re going to get tired of winning. You’re going to say, ‘Please, Mr. Baby Elon, I have a headache. Please, I don’t want to win so much. This is getting terrible.’ And I’m going to say. "We’re going to keep winning, winning, winning!" 361 | ------------ 39 result ------------ 362 | created by 363 | FweAHC 364 | 5d ago 365 | market cap: $8.3K 366 | replies: 82 367 | SLIPPAGE (SLIPPAGE): 368 | ------------ 40 result ------------ 369 | created by 370 | 6KeWLf 371 | 14m ago 372 | market cap: $13.5K 373 | replies: 11 374 | FredTrump (Fredytrump): The strength of a nation lies in its unity, and its foundation is laid by the wisdom and sacrifices of its fathers. 375 | ------------ 41 result ------------ 376 | created by 377 | 9KCsAb 378 | 1h ago 379 | market cap: $17.1K 380 | [ 381 | ] 382 | replies: 261 383 | First Lady I'd Like to Fuck (FLILF): 384 | ------------ 42 result ------------ 385 | created by 386 | ApJZ7m 387 | 22h ago 388 | market cap: $31.9K 389 | replies: 56 390 | OFFICIAL BARRON (BARRON): Join the Barron Community. This is History in the Making! 
391 | ------------ 43 result ------------ 392 | created by 393 | GPxr6P 394 | 44m ago 395 | market cap: $6.7K 396 | [ 397 | ] 398 | replies: 45 399 | #RapeMarkAndrews (RAPE): #RapeMarkAndrews 400 | ------------ 44 result ------------ 401 | created by 402 | 5GJKpf 403 | 5h ago 404 | market cap: $10.7K 405 | replies: 103 406 | Make it all back 100x coin (100x): Make it all back 100x with this coin 407 | ------------ 45 result ------------ 408 | created by 409 | 6xsqu6 410 | 4h ago 411 | market cap: $162.7K 412 | [ 413 | ] 414 | replies: 87 415 | First Nude Lady (Milfania): Official First Nude Lady Milfania meme. 416 | ------------ 46 result ------------ 417 | created by 418 | 6ZFxci 419 | 6m ago 420 | market cap: $24.4K 421 | replies: 15 422 | OFFICIAL TEANNA (TEANNA): 423 | """ 424 | # stackoverflow link: https://stackoverflow.com/a/79331894/11179336 -------------------------------------------------------------------------------- /quiker_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : March 14, 2025 5 | """ 6 | import re 7 | import requests 8 | 9 | response = requests.get(url='https://www.quikr.com/homes/3-bhk-apartment-of-2036sqft-for-sale-in-radiance-gardenia-bangalore/p/372255534/272495?source=qh', 10 | headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"} 11 | ) 12 | result = {} 13 | if response.status_code == 200: 14 | pattern = r'latitude":"(.+)","longitude":"(.+)"},"adlink"' 15 | matches = re.findall(pattern=pattern, string=response.text) 16 | 17 | result["latitude"] = matches[0][0] 18 | result["longitude"] = matches[0][1] 19 | 20 | print(result) 21 | 22 | """ 23 | reference: 24 | https://stackoverflow.com/a/79508250/11179336 25 | """ -------------------------------------------------------------------------------- /scrape_bluechip_io.py: -------------------------------------------------------------------------------- 1 | from selenium.webdriver import Chrome 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support import expected_conditions as EC 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | 6 | driver = Chrome() 7 | 8 | url = "https://bluechip.io/sport?bt-path=%2Fschedule%3FscheduleSport%3Dsoccer-1" 9 | driver.get(url) 10 | 11 | inner_page = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div#bt-inner-page"))).shadow_root 12 | eventCard = WebDriverWait(inner_page, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[data-editor-id="eventCard"]'))) 13 | print(len(eventCard)) 14 | # 20 15 | """ 16 | Few things to note: 17 | 18 | 1. First, we should wait for the presence of the content bt-inner-page to get located so that we can further look for the shadow_root in it. 19 | 2. Once we are inside the shadow_root, we need to again wait for the web element of the event cards to get loaded on the page. 20 | 21 | As you can see above, we get all of the 20 event cards which can be further parsed accordingly as per the need. 22 | 23 | I hope this solves your problem, cheers! 
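
As a side note, the same two-step wait chains naturally when one shadow host sits
inside another shadow root; a sketch (the host selectors here are hypothetical):

    outer_root = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "div#outer-host"))).shadow_root
    inner_host = WebDriverWait(outer_root, 10).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "div#inner-host")))
    inner_root = inner_host.shadow_root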
24 | """
25 | 
-------------------------------------------------------------------------------- /scrape_www_knx_org.py: --------------------------------------------------------------------------------
1 | import time
2 | import selenium.common.exceptions
3 | from selenium.webdriver import Chrome
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | import selenium.webdriver.support.expected_conditions as EC
7 | from bs4 import BeautifulSoup
8 | 
9 | driver = Chrome()
10 | wait = WebDriverWait(driver, 5)
11 | 
12 | driver.get('https://www.knx.org/knx-en/for-professionals/community/partners/?country=120')
13 | # wait for the "Accept all" cookies button and click it
14 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'button.btn.btn-primary.cb-enable'))).click()
15 | 
16 | try:
17 |     # keep clicking the 'load_more' button as many times as it is clickable.
18 |     while True:
19 |         wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a#knx-load-button.load_more'))).click()
20 |         time.sleep(1)
21 | except selenium.common.exceptions.TimeoutException:
22 |     pass
23 | 
24 | soup = BeautifulSoup(driver.page_source, 'lxml')
25 | driver.quit()
26 | table = soup.select_one('table#partner-list')
27 | rows = table.select('tr')
28 | print(f"total rows: {len(rows)}")
29 | 
30 | for row in rows[1:]:
31 |     print(list(filter(None, row.text.split('\n'))))
32 |     # you can further parse this data as you want
33 | 
34 | 
-------------------------------------------------------------------------------- /scroll_down.py: --------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver import ChromeOptions, Keys
3 | from selenium.webdriver.common.by import By
4 | import json
5 | import time
6 | options = ChromeOptions()
7 | # start maximized and remove the automation infobar
8 | options.add_argument("--start-maximized")
9 | options.add_experimental_option("useAutomationExtension", False)
10 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
11 | driver = webdriver.Chrome(options=options)
12 | old_url = "https://stackoverflow.com/"
13 | driver.get(old_url)
14 | # open the cookies file
15 | with open("cookies.json", "r") as f:
16 |     cookies = json.load(f)
17 | # load the cookies into the driver
18 | for cookie in cookies:
19 |     driver.add_cookie(cookie)
20 | time.sleep(3)
21 | driver.refresh()
22 | # open a new tab
23 | new_url = "https://stackoverflow.com/users/11179336/ajeet-verma"
24 | driver.execute_script("window.open('');")
25 | # switch to the new tab and open the new URL
26 | driver.switch_to.window(driver.window_handles[1])
27 | driver.get(new_url)
28 | time.sleep(5)
29 | driver.find_element(By.XPATH, "//a[normalize-space()='Answers']").click()
30 | time.sleep(3)
31 | 
32 | # -----------------------------------------------------------------------------------------------------------------
33 | # scroll down to the bottom
34 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
35 | # driver.execute_script("arguments[0].scrollTop = 200", element)
36 | # -----------------------------------------------------------------------------------------------------------------
37 | 
38 | time.sleep(3)
39 | # find the element and click it
40 | driver.find_element(By.XPATH, "//a[contains(text(),'What are the advantages of NumPy over regular Pyth')]").click()
41 | time.sleep(5)
-------------------------------------------------------------------------------- /scroll_to_bottom.py: 
--------------------------------------------------------------------------------
1 | """
2 | Project :
3 | Author : Ajeet
4 | Date : June 19, 2023
5 | """
6 | 
7 | import time
8 | from selenium.webdriver import Chrome
9 | from selenium.webdriver.common.by import By
10 | from selenium.webdriver.support.wait import WebDriverWait
11 | import selenium.webdriver.support.expected_conditions as EC
12 | 
13 | driver = Chrome()
14 | wait = WebDriverWait(driver, 10)
15 | driver.get('https://kart.1881.no/?query=1010')
16 | 
17 | scroll_bar = wait.until(EC.visibility_of_element_located((By.ID, 'search_result')))
18 | 
19 | flag = True
20 | last_height = driver.execute_script("return arguments[0].scrollHeight", scroll_bar)
21 | SCROLL_PAUSE_TIME = 0.5
22 | 
23 | while flag:
24 |     # ---------------------------------------------------------------------------------------------------------------
25 |     driver.execute_script("arguments[0].scrollBy(0, arguments[0].scrollHeight);", scroll_bar)
26 |     time.sleep(SCROLL_PAUSE_TIME)
27 |     # ---------------------------------------------------------------------------------------------------------------
28 |     new_height = driver.execute_script("return arguments[0].scrollHeight", scroll_bar)
29 | 
30 |     if new_height == last_height:
31 |         flag = False
32 |     else:
33 |         last_height = new_height
34 | 
35 | """
36 | steps followed:
37 | 
38 | 1. First, we wait for the scroll-bar web element to get visibly located/loaded on the page and assign it to a variable
39 | scroll_bar.
40 | 2. Next, we get the current height of this scroll_bar and assign it to a variable last_height.
41 | 3. Start looping: in each iteration, scroll down to the bottom of the scroll bar, take a pause, then read the height of
42 | the scroll bar again into a variable new_height. If new_height == last_height, break out of the
43 | loop (flag=False); otherwise, update last_height with new_height and repeat this step until the if condition
44 | is True.
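
As an aside, the scrollBy call in each iteration can equivalently be written by
setting scrollTop directly on the same scroll_bar element:

    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", scroll_bar)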
45 | 
46 | reference:
47 | https://stackoverflow.com/questions/76503251/how-to-scroll-down-to-the-bottom-of-an-inner-scroll-bar-using-selenium-with-pyth
48 | """
49 | 
-------------------------------------------------------------------------------- /sel_pagination_excercise.py: --------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.support.ui import WebDriverWait
4 | from selenium.webdriver.support import expected_conditions as EC
5 | import pandas as pd
6 | import time
7 | 
8 | 
9 | def scrape_page_data():
10 |     WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'results-wrapped')))
11 |     container = driver.find_element(By.CLASS_NAME, 'results-wrapped')
12 | 
13 |     # scroll down to load all content on the page
14 |     for i in range(4):
15 |         driver.execute_script("window.scrollBy(0, 2000);")
16 |         time.sleep(2)
17 | 
18 |     skus = container.find_elements(By.CLASS_NAME, 'product-identifier--bd1f5')
19 |     prices = container.find_elements(By.CLASS_NAME, 'price-format__main-price')
20 | 
21 |     return skus, prices
22 | 
23 | 
24 | def pagination(url, pages=1):
25 |     prod_num = []
26 |     prod_price = []
27 | 
28 |     page_num = 0
29 |     # iterate over the pages
30 |     for i in range(1, pages+1):
31 | 
32 |         print(f"this is page {i}")
33 |         driver.get(f"{url}?Nao={page_num}")
34 |         skus, prices = scrape_page_data()
35 | 
36 |         for sku in skus:
37 |             prod_num.append(sku.text)
38 |         for price in prices:
39 |             prod_price.append(price.text)
40 | 
41 |         # increment it by 24 since each page holds 24 items
42 |         page_num += 24
43 |         time.sleep(1)
44 | 
45 |     return prod_num, prod_price
46 | 
47 | 
48 | website = 'https://www.homedepot.com/b/Milwaukee/Special-Values/N-5yc1vZ7Zzv'
49 | driver = webdriver.Chrome()
50 | prod_num, prod_price = pagination(website, pages=3)
51 | 
52 | df = pd.DataFrame({'code': prod_num, 'price': prod_price})
53 | df.to_csv('HD_test.csv', index=False)
54 | print(df)
55 | 
-------------------------------------------------------------------------------- /select_element_by_tag_text.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver import ChromeOptions
4 | from selenium.webdriver.common.by import By
5 | options = ChromeOptions()
6 | options.add_argument("--start-maximized")
7 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
8 | driver = webdriver.Chrome(options=options)
9 | driver.get("https://deliorder-web.shoprite.com/stores/279/departments/553/products/258234")
10 | time.sleep(5)
11 | driver.execute_script("window.scrollBy(0, 300);")
12 | 
13 | 
14 | # --------------------------------------------------------------------------------------------------------------------
15 | driver.find_element(By.XPATH, '//span[contains(text(), "Standard Thickness")]').click()
16 | # --------------------------------------------------------------------------------------------------------------------
17 | 
18 | 
19 | time.sleep(2)
20 | slicing_preference = ["Shaved", "Sliced Thin", "Standard Thickness", "Sliced Thick"]
21 | # choose Sliced Thin (slicing_preference[1] is "Sliced Thin")
22 | driver.find_element(By.XPATH, f'//span[contains(text(), "{slicing_preference[1]}")]').click()
23 | time.sleep(2)
24 | 
-------------------------------------------------------------------------------- /selenium_action_move_by_offset.py: 
-------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.action_chains import ActionChains 3 | import time 4 | 5 | driver = webdriver.Chrome() 6 | 7 | driver.set_window_size(500, 500) 8 | driver.get('https://clickclickclick.click/') 9 | 10 | actions = ActionChains(driver) 11 | 12 | x_coord, y_coord = 250, 182 #coordinates of the button 13 | t = actions.move_by_offset(x_coord, y_coord).click().perform() 14 | time.sleep(5) -------------------------------------------------------------------------------- /selenium_baseline.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver import ChromeOptions, Keys 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.webdriver.support.wait import WebDriverWait 7 | from selenium.webdriver.common.action_chains import ActionChains 8 | from selenium.common.exceptions import NoSuchElementException 9 | 10 | options = ChromeOptions() 11 | 12 | # to start maximized screen 13 | options.add_argument("--start-maximized") 14 | # to remove 'Chrome is being controlled by automated software' 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | 17 | options.add_experimental_option("useAutomationExtension", False) 18 | 19 | 20 | driver = webdriver.Chrome(options=options) 21 | 22 | url = "https://shopee.vn/search?keyword=iphone&page=0&sortBy=sales" 23 | 24 | driver.get(url) 25 | 26 | print(type(driver)) 27 | WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ""))) 28 | driver.quit() 29 | -------------------------------------------------------------------------------- /selenium_chrome_profile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : September 14, 2023 5 | """ 6 | import time 7 | from selenium import webdriver 8 | 9 | options = webdriver.ChromeOptions() 10 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 11 | 12 | # Specify the Chrome profile directory to use (Profile 2) 13 | options.add_argument('--profile-directory=Profile 2') 14 | 15 | # Specify the user data directory where Chrome profile data is stored 16 | options.add_argument("--user-data-dir=C:\\Users\\PC\\AppData\\Local\\Google\\Chrome\\User Data\\") 17 | 18 | driver = webdriver.Chrome(options=options) 19 | driver.get("https://www.instagram.com/") 20 | 21 | time.sleep(5) 22 | 23 | 24 | """ 25 | Things to note: 26 | 27 | 1. Ensure Chrome is Closed: 28 | Make sure that all instances of Chrome are closed before running your Selenium script. Sometimes, if Chrome is running in the background or doesn't shut down correctly, it can cause issues when trying to start a new instance. 29 | 30 | 2. Check ChromeDriver Version: 31 | Ensure that your ChromeDriver version matches the version of Google Chrome installed on your system. If they don't match, it can lead to compatibility issues. 
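
   A quick way to compare the two at runtime is to read the driver's capability
   fields (these are the standard keys reported by chromedriver; the version
   numbers below are just examples):

       caps = driver.capabilities
       print(caps["browserVersion"])                 # e.g. 116.0.5845.97
       print(caps["chrome"]["chromedriverVersion"])  # e.g. 116.0.5845.96 (...)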
32 | 
33 | reference:
34 | https://stackoverflow.com/questions/77099511/im-developing-an-application-in-python-using-selenium-and-to-make-it-work-i
35 | """
36 | 
37 | 
-------------------------------------------------------------------------------- /selenium_file_download.py: --------------------------------------------------------------------------------
1 | import time
2 | 
3 | from selenium.webdriver import ChromeOptions, Chrome
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.common.keys import Keys
6 | 
7 | options = ChromeOptions()
8 | options.add_argument("start-maximized")
9 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
10 | options.add_experimental_option('useAutomationExtension', False)
11 | options.add_experimental_option("prefs", {
12 |     "download.default_directory": "C:\\Users\\PC\\OneDrive\\Documents\\",
13 |     "download.prompt_for_download": False,
14 |     "download.directory_upgrade": True,
15 | })
16 | # specify the title of the study you want to download
17 | study_title = "Pan-cancer single-cell landscape of tumor-infiltrating T cells"
18 | # start the browser and navigate to the PubMed website
19 | 
20 | browser = Chrome(options=options)
21 | browser.get("https://pubmed.ncbi.nlm.nih.gov/")
22 | # find the search box, enter the study title, and submit the form
23 | search_box = browser.find_element(By.ID, "id_term")
24 | search_box.send_keys(study_title)
25 | search_box.send_keys(Keys.RETURN)
26 | # find the save button and click it
27 | save_button = browser.find_element(By.XPATH, "//*[@id='save-results-panel-trigger']")
28 | save_button.click()
29 | # select PubMed from the format drop-down
30 | dropdownlist = browser.find_element(By.ID, "save-action-format")
31 | 
32 | dropdownlist.find_element(By.CSS_SELECTOR, 'option[value="pmid"]').click()
33 | 
34 | download_file = browser.find_element(By.XPATH, "//*[@id='save-action-panel-form']/div[2]/button[1]")
35 | download_file.click()
36 | time.sleep(2)
-------------------------------------------------------------------------------- /selenium_get_attribute.py: --------------------------------------------------------------------------------
1 | '''
2 | # to extract the text when it's not possible by simply using .text
3 | get_attribute('textContent')
4 | get_attribute('innerHTML')
5 | 
6 | 
7 | # other attributes of an element may be
8 | get_attribute('href')
9 | get_attribute('src')
10 | get_attribute('value')
11 | etc......
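
# a minimal usage sketch (the link element here is hypothetical):
# link = driver.find_element(By.TAG_NAME, 'a')
# print(link.get_attribute('href'))           # absolute URL of the link
# print(link.get_attribute('textContent'))    # full text, even when .text comes back empty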
12 | ''' -------------------------------------------------------------------------------- /selenium_get_parent_element.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium.webdriver import Chrome 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | 7 | driver = Chrome() 8 | 9 | url = "https://platform.sustain-cert.com/public-project/2756" 10 | driver.get(url) 11 | 12 | files = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'div.MuiBox-root.css-16uqhx7'))) 13 | print(f"total files: {len(files)}") 14 | 15 | container = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.MuiContainer-root.MuiContainer-maxWidthLg.css-got2s4'))) 16 | categories = container.find_elements(By.CSS_SELECTOR, 'div>h6') 17 | 18 | for category in categories: 19 | 20 | if category.text == "Design Review": 21 | # ------------------------------------------------------------------------------------------------------------- 22 | design_files = category.find_element(By.XPATH, "parent::*").find_elements(By.CSS_SELECTOR, 'div.MuiBox-root.css-16uqhx7') 23 | # ------------------------------------------------------------------------------------------------------------- 24 | print(f"total files under Design Review:: {len(design_files)}") 25 | 26 | delay = 5 27 | for file in design_files: 28 | file_detail = file.text.split('\n') 29 | 30 | if file_detail[0].endswith('.pdf)'): 31 | print(f"pdf files under Design Review:") 32 | print(file_detail[0].replace('(', '').replace(')', '')) 33 | # click button to download the pdf file 34 | file.find_element(By.TAG_NAME, 'button').click() 35 | time.sleep(delay) 36 | 37 | delay += 10 38 | 39 | 40 | # reference: 41 | # https://pythonexamples.org/python-selenium-get-previous-sibling-element/#:~:text=To%20get%20the%20preceding%20or,parameter%20in%20the%20function%20call. 42 | # https://stackoverflow.com/questions/76369098/download-pdfs-under-a-specific-header-on-webpage-through-selenium-python 43 | """ 44 | output: 45 | 46 | total files: 12 47 | total files under Design Review:: 6 48 | pdf files under Design Review: 49 | 03 Deviation Request Form-Zengjiang wind power project-20220209-V01.pdf 50 | pdf files under Design Review: 51 | 20220901_GS4GG VAL FVR_Yunxiao Wind_clean.pdf 52 | """ 53 | 54 | """ 55 | Few things to note: 56 | 57 | 1. As you are only interested in the pdf files in the Design Review section, so we first locate the element using h6 tag 58 | 2. next, we iterate over all h6 tags and pick only the one with the Design Review text. 59 | 3. Then, we refer back to the parent element/tag of the filtered h6 tag, find all the files, and store them in a variable design_files. 60 | 4. Now, we get all the files under the Design Review and we easily filter out the files which end with .pdf 61 | 5. finally, click on the located pdf file to download. 62 | 63 | Downloading the files takes a bit of time, so we add incremental delay to wait for the current files to get downloaded before starting the next file download. 
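
As a side note, the parent hop and the file lookup can also be combined into a single XPath
(a sketch using the same selectors as above):

    design_files = driver.find_elements(
        By.XPATH, '//h6[text()="Design Review"]/parent::*//div[contains(@class, "css-16uqhx7")]')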
64 | """
-------------------------------------------------------------------------------- /selenium_hover_click.py: --------------------------------------------------------------------------------
1 | 
2 | import time
3 | from selenium import webdriver
4 | from selenium.webdriver import ChromeOptions
5 | from selenium.webdriver.common.by import By
6 | from selenium.webdriver.common.action_chains import ActionChains
7 | 
8 | 
9 | options = ChromeOptions()
10 | 
11 | # start maximized and remove the automation infobar
12 | options.add_argument("--start-maximized")
13 | options.add_experimental_option("useAutomationExtension", False)
14 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
15 | 
16 | driver = webdriver.Chrome(options=options)
17 | 
18 | url = "https://www.kbb.com/"
19 | driver.get(url)
20 | 
21 | # ---------------------------------------------------------------------------------------------
22 | element_to_hover_over = driver.find_element(By.XPATH, '//*[@id="app"]/header/div/nav/div[2]')
23 | hover = ActionChains(driver).move_to_element(element_to_hover_over)
24 | hover.perform()
25 | # ---------------------------------------------------------------------------------------------
26 | 
27 | driver.find_element(By.XPATH, '//*[@id="app"]/header/div/nav/div[2]/ul/li[1]').click()
28 | time.sleep(2)
-------------------------------------------------------------------------------- /selenium_hover_click_text.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium.webdriver import Chrome, ChromeOptions
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.common.action_chains import ActionChains
5 | 
6 | options = ChromeOptions()
7 | 
8 | options.add_argument("--start-maximized")
9 | options.add_experimental_option("useAutomationExtension", False)
10 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
11 | 
12 | driver = Chrome(options=options)
13 | # Here I've taken the URL of this same stackoverflow page
14 | driver.get("https://stackoverflow.com/questions/75945977/how-to-get-mouse-hover-message-in-selenium-webdriver-which-is-not-given-in-html")
15 | time.sleep(1)
16 | # and let's, for example, take the java tag in your post
17 | element_to_hover_over = driver.find_element(By.XPATH, '//*[@id="question"]/div/div[2]/div[2]/div/div/ul/li[1]')
18 | hover = ActionChains(driver).move_to_element(element_to_hover_over)
19 | hover.perform()
20 | time.sleep(2)
21 | hover_tag_all_detail = element_to_hover_over.find_element(By.CSS_SELECTOR, 'div.esc-remove').text
22 | print(f"all details:\n{hover_tag_all_detail}")
23 | hover_tag_description = element_to_hover_over.find_element(By.CSS_SELECTOR, 'div.fc-light').text
24 | print(f"tag description only:\n{hover_tag_description}")
25 | 
26 | 
-------------------------------------------------------------------------------- /selenium_iframe_excercise.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | 
5 | options = webdriver.ChromeOptions()
6 | options.add_argument('--start-maximized')
7 | options.add_argument('--disable-extensions')
8 | options.add_experimental_option("useAutomationExtension", False)
9 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
10 | 
11 | driver = webdriver.Chrome(options=options)
12 | 
13 | 
driver.get('https://www.ifsc-climbing.org/index.php/world-competition/calendar?task=ranking-complete&category=3') 14 | time.sleep(2) 15 | # ------------------------------------------------------------------------------------------------------------------- 16 | driver.switch_to.frame("calendar") 17 | # ------------------------------------------------------------------------------------------------------------------- 18 | table_wrapper = driver.find_element(By.CSS_SELECTOR, 'div[id="table_id_wrapper"]') 19 | results = table_wrapper.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr') 20 | 21 | data = [] 22 | for result in results: 23 | details = result.find_elements(By.TAG_NAME, 'td') 24 | temp_dict = { 25 | "name": f"{details[1].text} {details[2].text}", 26 | "country": details[3].text, 27 | "points": details[4].text 28 | } 29 | data.append(temp_dict) 30 | 31 | print(data) 32 | 33 | -------------------------------------------------------------------------------- /selenium_iframe_excercise_2.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium.webdriver import Chrome 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | import selenium.webdriver.support.expected_conditions as EC 6 | 7 | driver = Chrome() 8 | 9 | driver.get("https://www.northamericanstainless.com/NAS_App/Surcharge1?language=E&type=F") 10 | 11 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'iframe.surcharge-iframe'))) 12 | # ------------------------------------------------------------------------------------------------------------------- 13 | driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, 'iframe.surcharge-iframe')) 14 | # ------------------------------------------------------------------------------------------------------------------- 15 | # click on submit button 16 | driver.find_element(By.ID, 'submitStylev2').click() 17 | time.sleep(5) 18 | -------------------------------------------------------------------------------- /selenium_iframe_excercise_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 12, 2023 5 | """ 6 | 7 | import time 8 | from selenium.webdriver import Chrome, ChromeOptions 9 | from selenium.webdriver.common.by import By 10 | from selenium.common.exceptions import NoAlertPresentException 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | 14 | url = "https://bdap-opendata.rgs.mef.gov.it/opendata/spd_mop_prg_mon_reg18_01_9999?t=Scarica" 15 | 16 | chrome_options = ChromeOptions() 17 | chrome_options.add_experimental_option("excludeSwitches", ['enable-automation']) 18 | driver = Chrome(options=chrome_options) 19 | driver.get(url) 20 | wait = WebDriverWait(driver, 20) 21 | # ---------------------------------------------------------------------------------------------------------------------- 22 | # wait for the target iframe to get loaded in order to switch to it 23 | iframe = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'iframe.tabIframe.dinamically-tab-iframe-content'))) 24 | # switch to the target iframe 25 | driver.switch_to.frame(iframe) 26 | # ---------------------------------------------------------------------------------------------------------------------- 27 | 28 | wait.until(EC.element_to_be_clickable((By.XPATH, 
'//div[@title="Excel file format."]'))).click()
29 | 
30 | try:
31 |     driver.switch_to.alert.accept()
32 | except NoAlertPresentException:
33 |     pass
34 | 
35 | time.sleep(5)
36 | 
37 | """
38 | Steps to follow:
39 | 1. First, wait for the desired iframe tag to get loaded/located on the page.
40 | 2. After making sure that it's loaded, switch to this iframe as mentioned in the code above (using switch_to.frame()).
41 | 3. Once you're inside the iframe, you can easily locate the desired element using XPATH, but make sure it is clickable before clicking, as the website takes some time to load this particular section on the page.
42 | 4. Sometimes, when you click the desired button/element, an alert box appears; you can simply accept it, as the try/except block above does.
43 | 
44 | reference:
45 | https://stackoverflow.com/questions/76454460/webcrawling-with-selenium-couldnt-extract-the-xpath-of-a-button
46 | """
-------------------------------------------------------------------------------- /selenium_iframe_excercise_linkedin.py: --------------------------------------------------------------------------------
1 | """
2 | This script prints the total number of pages of the document that is attached to a LinkedIn post.
3 | """
4 | 
5 | from selenium.webdriver.common.by import By
6 | from selenium.webdriver.support import expected_conditions as EC
7 | from selenium.webdriver.support.wait import WebDriverWait
8 | from linkedIn_base import Linkedin
9 | 
10 | obj = Linkedin()
11 | driver = obj.load_cookies(path="linkedin_cookies.json")
12 | 
13 | # for example, this post has a doc with 7 pages
14 | post_url = "https://www.linkedin.com/feed/update/urn:li:activity:7050104978106974208"
15 | driver.get(post_url)
16 | 
17 | driver.execute_script("window.scrollBy(0,900);")
18 | WebDriverWait(driver, 10).until(
19 |     EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe[class='document-s-container__document-element document-s-container__document-element--loaded']")))
20 | 
21 | element = driver.find_element(By.CSS_SELECTOR, 'div.ssplayer-actions.center-actions')
22 | pages = element.find_element(By.CSS_SELECTOR, 'div.ssplayer-progress-bar.meter-animated').get_attribute('aria-valuemax')
23 | print(pages)
24 | 
-------------------------------------------------------------------------------- /selenium_nth_css_selector.py: --------------------------------------------------------------------------------
1 | # driver.find_element(By.CSS_SELECTOR, "ul > li:nth-child(1)")  # >> home
2 | # driver.find_element(By.CSS_SELECTOR, "ul > li:nth-child(2)")  # >> posts
3 | # driver.find_element(By.CSS_SELECTOR, "ul > li:nth-child(3)")  # >> events
-------------------------------------------------------------------------------- /selenium_ok_alert.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium.webdriver import Chrome, ChromeOptions
3 | from selenium.webdriver.common.by import By
4 | from selenium.common import NoAlertPresentException
5 | 
6 | options = ChromeOptions()
7 | options.add_argument('--start-maximized')
8 | options.add_argument("force-device-scale-factor=0.95")
9 | 
10 | driver = Chrome(options=options)
11 | 
12 | urls = ['https://web.archive.org/web/20080221233711/http://www.berkshire.com/',
13 |         'https://web.archive.org/web/20171107004101/http://www.berkshirefunds.com/',
14 |         'https://web.archive.org/web/20200224044229/http://www.berkshirefunds.com/']
15 | 
16 | for i, url in enumerate(urls):
17 |     driver.get(url)
18 |     time.sleep(5)
19 | 
20 |     if 
url.endswith('www.berkshire.com/'):
21 |         target_element = driver.find_element(By.TAG_NAME, 'tbody')
22 |         target_element.screenshot(f'{i}_screen_capture.png')
23 | 
24 |     elif url.endswith('www.berkshirefunds.com/'):
25 |         try:
26 |             # ---------------------------------------------------------------------------------------------
27 |             driver.switch_to.alert.accept()
28 |             # ---------------------------------------------------------------------------------------------
29 |         except NoAlertPresentException:
30 |             pass
31 |         target_element = driver.find_element(By.CSS_SELECTOR, 'div#page-wrap')
32 |         target_element.screenshot(f'{i}_screen_capture.png')
33 | 
-------------------------------------------------------------------------------- /selenium_options.py: --------------------------------------------------------------------------------
1 | from selenium.webdriver import ChromeOptions
2 | options = ChromeOptions()
3 | 
4 | # start maximized and remove the automation infobar
5 | options.add_argument("--start-maximized")
6 | options.add_argument("--incognito")
7 | options.add_argument("--disable-infobars")
8 | options.add_argument("--disable-extensions")
9 | options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36")
10 | 
11 | options.add_experimental_option("useAutomationExtension", False)
12 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
13 | options.add_experimental_option("detach", True)
14 | 
15 | options.add_experimental_option(
16 |     "prefs",
17 |     {
18 |         "credentials_enable_service": False,
19 |         "profile.password_manager_enabled": False,
20 |         "profile.default_content_setting_values.notifications": 2
21 |         # 2 disables/blocks notifications and 1 allows them
22 |     },
23 | )
24 | 
25 | 
26 | 
27 | 
28 | 
-------------------------------------------------------------------------------- /selenium_partial_class_name.py: --------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver import ChromeOptions, Keys
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | import json
7 | import time
8 | 
9 | options = ChromeOptions()
10 | 
11 | # start maximized and remove the automation infobar
12 | options.add_argument("--start-maximized")
13 | options.add_experimental_option("useAutomationExtension", False)
14 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
15 | 
16 | driver = webdriver.Chrome(options=options)
17 | 
18 | url = "https://booking.bbdc.sg/#/login?redirect=%2Ftransactions%2Findex"
19 | 
20 | 
21 | driver.get(url)
22 | 
23 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[class="login-content d-flex justify-center flex-column"]')))
24 | username = driver.find_element(by=By.ID, value='input-8')
25 | username.send_keys("ajeet@123")
26 | password = driver.find_element(by=By.ID, value='input-15')
27 | password.send_keys("ajee")
28 | 
29 | # locate the button to click by using its partial class name
30 | driver.find_element(By.CSS_SELECTOR, 'button[class^="v-btn v-btn"]').click()
31 | time.sleep(5)
32 | 
33 | 
34 | driver.quit()
-------------------------------------------------------------------------------- /selenium_scrap_transcript.py: --------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome
2 | from 
selenium.webdriver.common.by import By
3 | 
4 | driver = Chrome()
5 | driver.get("https://www.luyennghetienganh.com/learn-by-listening-level-1/1060-learn-english-by-listening-level-1-unit-001.html")
6 | 
7 | container = driver.find_elements(By.CSS_SELECTOR, 'div.rabbit-lyrics__line')
8 | eng_sub = [i.get_attribute('innerHTML') for i in container]
9 | print(eng_sub)
10 | 
11 | 
12 | 
13 | 
14 | 
-------------------------------------------------------------------------------- /selenium_scrape_youtube_channel.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | from selenium.webdriver.support import expected_conditions as EC
6 | from selenium.webdriver.chrome.options import Options
7 | 
8 | # CHROME DRIVER
9 | options = Options()
10 | 
11 | options.add_argument("--start-maximized")
12 | # options.add_experimental_option("useAutomationExtension", False)
13 | # options.add_experimental_option("excludeSwitches", ["enable-automation"])
14 | 
15 | driver = webdriver.Chrome(options=options)
16 | 
17 | 
18 | def scrape_ytchannel(url):
19 |     driver.get(url)
20 | 
21 |     handle = driver.find_element(By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
22 |     subscriber_count = driver.find_element(By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text
23 | 
24 |     # SMALL SCRIPT TO SCROLL THE PAGE UNTIL IT ENDS
25 |     WAIT_IN_SECONDS = 5
26 |     last_height = driver.execute_script("return document.documentElement.scrollHeight")
27 | 
28 |     while True:
29 |         # Scroll to the bottom of page
30 |         driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
31 |         # Wait for new videos to show up
32 |         time.sleep(WAIT_IN_SECONDS)
33 | 
34 |         # Calculate new document height and compare it with last height
35 |         new_height = driver.execute_script("return document.documentElement.scrollHeight")
36 |         if new_height == last_height:
37 |             break
38 |         last_height = new_height
39 | 
40 |     thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
41 |     views = driver.find_elements(By.XPATH, '//div[@id="metadata-line"]/span[1]')
42 |     titles = driver.find_elements(By.ID, "video-title")
43 |     links = driver.find_elements(By.ID, "video-title-link")
44 | 
45 |     videos = []
46 |     for title, view, thumb, link in zip(titles, views, thumbnails, links):
47 |         video_dict = {
48 |             'title': title.text,
49 |             'views': view.text,
50 |             # 'thumbnail': thumb.get_attribute('src'),
51 |             'thumbnail': thumb.get_dom_attribute('src'),
52 |             'link': link.get_attribute('href')
53 |         }
54 |         videos.append(video_dict)
55 |     result = [videos, handle, subscriber_count]
56 | 
57 |     return result
58 | 
59 | 
60 | url_conf = "https://www.youtube.com/@confindustria/videos"
61 | print(scrape_ytchannel(url_conf))
-------------------------------------------------------------------------------- /selenium_scrape_youtube_search.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.chrome.options import Options
5 | 
6 | options = Options()
7 | options.add_argument("--start-maximized")
8 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
9 | driver = webdriver.Chrome(options=options)
10 | 
11 | 
12 | def scrape_yt(url):
13 |     driver.get(url)
14 |     # scroll the page until it ends
15 |     last_height = 
driver.execute_script("return document.documentElement.scrollHeight")
16 |     while True:
17 |         # Scroll to the bottom of page
18 |         driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
19 |         # Wait for new videos to show up
20 |         time.sleep(2)
21 |         # Calculate new document height and compare it with last height
22 |         new_height = driver.execute_script("return document.documentElement.scrollHeight")
23 |         if new_height == last_height:
24 |             break
25 |         last_height = new_height
26 | 
27 |     time.sleep(2)
28 |     videos = driver.find_elements(By.TAG_NAME, 'ytd-video-renderer')
29 |     print(f"total videos: {len(videos)}")
30 | 
31 |     links_list = []
32 |     for video in videos:
33 |         link = video.find_element(By.TAG_NAME, 'h3').find_element(By.TAG_NAME, 'a').get_attribute('href')
34 |         links_list.append(link)
35 | 
36 |     return links_list
37 | 
38 | 
39 | # get the search keyword manually from the user
40 | search_word = input("Enter the search keyword: ")
41 | url = f"https://www.youtube.com/results?search_query={search_word}"
42 | print(scrape_yt(url))
-------------------------------------------------------------------------------- /selenium_select_tag_dropdown.py: --------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome, ChromeOptions
2 | from selenium.webdriver.support.select import Select
3 | from selenium.webdriver.common.by import By
4 | 
5 | 
6 | url = 'https://cricos.education.gov.au/Course/CourseSearch.aspx'
7 | 
8 | options = ChromeOptions()
9 | options.add_argument("--start-maximized")
10 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
11 | 
12 | browser = Chrome(options=options)
13 | browser.get(url)
14 | 
15 | state = browser.find_element(By.ID, 'ctl00_cphDefaultPage_courseSearchCriteria_ddlCourseLocation')
16 | nsw = Select(state)
17 | nsw.select_by_value('NSW')
18 | browser.find_element(By.ID, 'ctl00_cphDefaultPage_btnSearch').click()
-------------------------------------------------------------------------------- /selenium_send_keys _excercise.py: --------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.common.keys import Keys
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | 
7 | driver = webdriver.Chrome()
8 | driver.maximize_window()
9 | driver.get('https://www.google.com/travel/flights')
10 | 
11 | wait = WebDriverWait(driver, 10)
12 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[aria-placeholder='Where from?'] input"))).click()
13 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[aria-label='Enter your origin'] input"))).send_keys("Sydney" + Keys.ARROW_DOWN + Keys.ENTER)
14 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[aria-placeholder='Where to?'] input"))).click()
15 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[aria-label='Enter your destination'] input"))).send_keys("Auckland" + Keys.ARROW_DOWN + Keys.ENTER)
16 | wait.until(EC.element_to_be_clickable((By.XPATH, "//span[text()='Search']"))).click()
17 | 
18 | driver.quit()
-------------------------------------------------------------------------------- /selenium_shadow_open_excercise.py: --------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome
2 | from selenium.webdriver.common.by import By
3 | 
from selenium.webdriver.support import expected_conditions as EC 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | 6 | driver = Chrome() 7 | 8 | url = "https://bluechip.io/sport?bt-path=%2Fschedule%3FscheduleSport%3Dsoccer-1" 9 | driver.get(url) 10 | 11 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div#bt-inner-page"))) 12 | # -------------------------------------------------------------------------------------------------------------------- 13 | # inner_page = driver.execute_script('''return document.getElementById('bt-inner-page').shadowRoot''') 14 | # or 15 | inner_page = driver.find_element(By.CSS_SELECTOR, "div#bt-inner-page").shadow_root 16 | # -------------------------------------------------------------------------------------------------------------------- 17 | eventCard = WebDriverWait(inner_page, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[data-editor-id="eventCard"]'))) 18 | print(len(eventCard)) 19 | -------------------------------------------------------------------------------- /selenium_shadow_root.py: -------------------------------------------------------------------------------- 1 | # https://stackoverflow.com/questions/36141681/does-anybody-know-how-to-identify-shadow-dom-web-elements-using-selenium-webdriv 2 | # https://stackoverflow.com/questions/28911799/accessing-elements-in-the-shadow-dom 3 | """ 4 | 5 | #shadow-root (open) 6 |
7 |     <div class="flex">
8 |       .....
9 |     </div>
10 | """
11 | from selenium.webdriver import Chrome, ChromeOptions
12 | driver = Chrome()
13 | 
14 | shadow_section = driver.execute_script('''return document.querySelector("neon-animatable").shadowRoot''')
15 | shadow_section.find_element_by_css_selector(".flex")
-------------------------------------------------------------------------------- /selenium_take_screenshot.py: --------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.by import By
3 | 
4 | import ddddocr
5 | 
6 | driver = webdriver.Chrome()
7 | 
8 | driver.get('https://ma.mohw.gov.tw/masearch/')
9 | 
10 | captcha = driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_ImageCheck")
11 | 
12 | # ----------------------------------------------------------------------------------------------------------------
13 | captcha.screenshot('captcha.png')
14 | # ----------------------------------------------------------------------------------------------------------------
15 | 
16 | ocr = ddddocr.DdddOcr()
17 | # open and read the image
18 | with open('captcha.png', 'rb') as f:
19 |     img_bytes = f.read()
20 | 
21 | res = ocr.classification(img_bytes)
22 | print(res.upper())
23 | 
-------------------------------------------------------------------------------- /selenium_twitter_login.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver import ChromeOptions, Keys
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support import expected_conditions as EC
6 | from selenium.webdriver.support.wait import WebDriverWait
7 | 
8 | 
9 | options = ChromeOptions()
10 | 
11 | # start maximized and remove the automation infobar
12 | options.add_argument("--start-maximized")
13 | options.add_experimental_option("useAutomationExtension", False)
14 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
15 | 
16 | driver = webdriver.Chrome(options=options)
17 | 
18 | 
19 | url = "https://twitter.com/login"
20 | 
21 | driver.get(url)
22 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "css-1dbjc4n")))
23 | login = driver.find_element(By.CLASS_NAME, "css-1dbjc4n")
24 | time.sleep(2)
25 | username = login.find_element(By.CSS_SELECTOR, 'input[autocomplete="username"]')
26 | username.send_keys("xxxxxxxxxxx")
27 | username.send_keys(Keys.ENTER)
28 | time.sleep(1)
29 | password = login.find_element(By.CSS_SELECTOR, 'input[name="password"]')
30 | password.send_keys("xxxxxxxx")
31 | password.send_keys(Keys.ENTER)
32 | 
33 | time.sleep(2)
-------------------------------------------------------------------------------- /selenium_work_shadow_closed.pyi: --------------------------------------------------------------------------------
1 | from selenium.webdriver import ActionChains
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.support.ui import WebDriverWait
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium import webdriver
6 | 
7 | driver = webdriver.Chrome()
8 | driver.implicitly_wait(10)
9 | driver.get("https://www.sreality.cz/")
10 | driver.maximize_window()
11 | 
12 | # Below line creates an instance of the ActionChains class
13 | action = ActionChains(driver)
14 | # Below line locates and stores an element which is outside the shadow-root
15 | element_outside_shadow = driver.find_element(By.XPATH, "//div[@class='szn-cmp-dialog-container']")
16 | # The next two lines 
click on the browser at an offset of coordinates x=5, y=5 and then perform the queued actions 17 | action.move_to_element_with_offset(element_outside_shadow, 5, 5) 18 | action.click() 19 | action.perform() 20 | -------------------------------------------------------------------------------- /selenium_workday_login.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium.webdriver import Chrome, ChromeOptions 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | import selenium.webdriver.support.expected_conditions as EC 6 | from selenium.webdriver.common.action_chains import ActionChains 7 | 8 | options = ChromeOptions() 9 | options.add_argument('--start-maximized') 10 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 11 | options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36") 12 | 13 | 14 | driver = Chrome(options=options) 15 | wait = WebDriverWait(driver, 10) 16 | 17 | url = "https://walmart.wd5.myworkdayjobs.com/en-US/WalmartExternal/login" 18 | driver.get(url) 19 | 20 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[data-automation-id="email"]'))) 21 | email = driver.find_element(By.CSS_SELECTOR, 'input[data-automation-id="email"]') 22 | email.send_keys('your_username') 23 | 24 | password = driver.find_element(By.CSS_SELECTOR, 'input[data-automation-id="password"]') 25 | password.send_keys('your_password') 26 | 27 | submit = driver.find_element(By.CSS_SELECTOR, 'div[aria-label="Sign In"]') 28 | 29 | hover = ActionChains(driver).move_to_element(submit) 30 | hover.click().perform() 31 | 32 | time.sleep(10) 33 | 34 | """ 35 | A few things to note: 36 | 37 | 1. We need to wait for the Sign In box to appear on the page. 38 | 2. We must pass a realistic user-agent to the Chrome options. 39 | 3. Use ActionChains to perform the click that reaches the logged-in profile; a simple click() will not work here. 
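4. A minimal, hedged sketch of the same wait-then-hover-click pattern as a reusable helper (the selector is the one
   used above; the helper itself is illustrative and not part of the original script):

       def hover_click(driver, wait, css):
           # wait for the element to be present, then hover over it and click via ActionChains
           element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css)))
           ActionChains(driver).move_to_element(element).click().perform()

       hover_click(driver, wait, 'div[aria-label="Sign In"]')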
40 | """ -------------------------------------------------------------------------------- /shein_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : 09/06/2023 5 | """ 6 | 7 | import time 8 | from selenium.webdriver import Chrome, ChromeOptions 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.wait import WebDriverWait 11 | import selenium.webdriver.support.expected_conditions as EC 12 | 13 | options = ChromeOptions() 14 | options.add_argument("--start-maximized") 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 2}) 17 | 18 | driver = Chrome(options=options) 19 | wait = WebDriverWait(driver, 10) 20 | url = "https://us.shein.com/Men-Playing-Card-Print-Tee-p-9847947-cat-1980.html?src_identifier=on%3DIMAGE_COMPONENT%60cn%3Dcat%60hz%3DhotZone_16%60ps%3D4_10%60jc%3DitemPicking_001121429&src_module=Women&src_tab_page_id=page_home1685728955945&mallCode=1" 21 | driver.get(url) 22 | 23 | # wait and close the coupon-box 24 | coupon_box = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.c-coupon-box'))) 25 | coupon_box.find_element(By.CSS_SELECTOR, 'i.iconfont.icon-close.she-close').click() 26 | 27 | # # wait and close the register container side box 28 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.quickg-outside'))) 29 | driver.execute_script("document.querySelector('i.svgicon.svgicon-arrow-left').click();") 30 | 31 | for color in driver.find_elements(By.CSS_SELECTOR, "div[class^='product-intro__color-radio']"): 32 | 33 | color.click() 34 | time.sleep(2) 35 | name = color.get_attribute("aria-label") 36 | colorPic = color.find_element(By.TAG_NAME, "img").get_attribute("src") 37 | price = driver.find_element(By.CLASS_NAME, "from").get_attribute("aria-label") 38 | 39 | pictures = [] 40 | for pic in driver.find_element(By.CLASS_NAME, "product-intro__thumbs-inner").find_elements(By.TAG_NAME, "img"): 41 | pictures.append(pic.get_attribute("src")) 42 | 43 | print(f"color name: {name}, color link: {colorPic}, price: {price}, pictures: {pictures}") 44 | 45 | """ 46 | steps to follow: 47 | 48 | 1. First, as the page loads, it coupon box pops up and we need to close it to proceed. Therefore we wait for the 49 | coupon-box web element to appear and then click to close it. 50 | 51 | 2. Next, A register container appears from the right side over the web element containing the radio buttons of color 52 | options. Thus, we wait for it to appear and minimize it by clicking on the arrow. 53 | 54 | 3. Now, we simply find all the available color radio button(here 6) for the product, iterate over them one-by-one and in 55 | every iteration click on the respective color radio button to extract all the details of the product with the specific chosen color. 56 | 57 | As you can see, it outputs the product details (color name, color pic, price of the product for the color, and all the 58 | pictures of the product available for the color). 
59 | 60 | reference: 61 | https://stackoverflow.com/questions/76436659/python-selenium-how-do-i-click-on-a-radio-button 62 | """ -------------------------------------------------------------------------------- /stackoverflow_login_and_save_cookies.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver import ChromeOptions 3 | from selenium.webdriver.common.by import By 4 | import time 5 | import json 6 | 7 | options = ChromeOptions() 8 | # open and maximize the screen 9 | options.add_argument("--start-maximized") 10 | # the two lines below disable the info bar 11 | options.add_experimental_option("useAutomationExtension", False) 12 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 13 | 14 | driver = webdriver.Chrome(options=options) 15 | 16 | driver.get("https://stackoverflow.com") 17 | 18 | # find and click the "Log in" button 19 | driver.find_element(By.XPATH, "//a[normalize-space()='Log in']").click() 20 | # fill in the email account and password 21 | email = driver.find_element(By.XPATH, "//input[@id='email']") 22 | password = driver.find_element(By.XPATH, "//input[@id='password']") 23 | email.send_keys("your_mail_id") 24 | password.send_keys("your_password") 25 | time.sleep(2) 26 | 27 | # click the login submit button 28 | driver.find_element(By.XPATH, "//button[@id='submit-button']").click() 29 | time.sleep(2) 30 | # print(driver.get_cookies()) 31 | 32 | json_object = json.dumps(driver.get_cookies()) 33 | 34 | # write the session cookies to stackoverflow_cookies.json 35 | with open("stackoverflow_cookies.json", "w") as outfile: 36 | outfile.write(json_object) 37 | -------------------------------------------------------------------------------- /stackoverflow_login_with_cookies.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | from selenium import webdriver 4 | from selenium.webdriver import ChromeOptions 5 | 6 | 7 | def login(): 8 | options = ChromeOptions() 9 | options.add_argument("--start-maximized") 10 | options.add_argument("--incognito") 11 | options.add_argument("--disable-infobars") 12 | options.add_argument("--disable-extensions") 13 | options.add_experimental_option("useAutomationExtension", False) 14 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 15 | 16 | # create a chrome driver object with the options 17 | driver = webdriver.Chrome(options=options) 18 | 19 | # open the website 20 | driver.get("https://stackoverflow.com") 21 | 22 | # read the saved cookies from the JSON file 23 | with open('stackoverflow_cookies.json') as f: 24 | cookies = json.load(f) 25 | # load the cookies into the driver 26 | for cookie in cookies: 27 | driver.add_cookie(cookie) 28 | 29 | time.sleep(2) 30 | # refresh the browser so the cookies take effect 31 | driver.refresh() 32 | 33 | return driver 34 | 35 | 36 | if __name__ == '__main__': 37 | login() 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /stackoverflow_track.py: -------------------------------------------------------------------------------- 1 | import time 2 | import winsound 3 | from selenium.webdriver import Chrome, ChromeOptions 4 | from selenium.webdriver.common.by import By 5 | 6 | options = ChromeOptions() 7 | # open and maximize the screen 8 | options.add_argument("--start-maximized") 9 | # the two lines below disable the info bar 10 | options.add_experimental_option("useAutomationExtension", False) 11 | 
options.add_experimental_option("excludeSwitches", ["enable-automation"]) 12 | 13 | driver = Chrome(options=options) 14 | 15 | url_to_track = 'https://stackoverflow.com/search?tab=Newest&pagesize=15&q=web-scraping&searchOn=3' 16 | driver.get(url_to_track) 17 | 18 | questions = driver.find_elements(By.CSS_SELECTOR, 'div.s-post-summary.js-post-summary') 19 | 20 | try: 21 | top_of_list = questions[0].find_element(By.CSS_SELECTOR, 'div.s-post-summary--content') 22 | title = top_of_list.find_element(By.TAG_NAME, 'h3').text 23 | print(title) 24 | top_title = title 25 | 26 | flag = True 27 | time_now = time.time() 28 | while flag: 29 | q = driver.find_elements(By.CSS_SELECTOR, 'div.s-post-summary.js-post-summary')[0] 30 | ti = q.find_element(By.CSS_SELECTOR, 'div.s-post-summary--content').find_element(By.TAG_NAME, 'h3').text 31 | # print(ti) 32 | cat = q.find_element(By.CSS_SELECTOR, 'div.s-post-summary--content').find_element(By.CSS_SELECTOR, 33 | 'h3>span').get_attribute( 34 | 'title') 35 | # print(cat) 36 | tg = [tag.text for tag in 37 | q.find_element(By.CSS_SELECTOR, 'div.s-post-summary--content').find_element(By.CSS_SELECTOR, 38 | 'div.s-post-summary--meta').find_element( 39 | By.CSS_SELECTOR, 'ul.ml0.list-ls-none.js-post-tag-list-wrapper.d-inline').find_elements(By.TAG_NAME, 40 | 'li')] 41 | # print(tg) 42 | 43 | if ti != top_title and cat =='Question' and 'python' in tg: 44 | # winsound.Beep(frequency=350, duration=1000) 45 | print(f"new post arrives") 46 | winsound.PlaySound('delightful-4.wav', winsound.SND_FILENAME) 47 | flag = False 48 | 49 | # refresh the browser every 2 minutes 50 | if time.time() > time_now+120: 51 | driver.refresh() 52 | time_now = time.time() 53 | 54 | except IndexError as e: 55 | print(e) 56 | time.sleep(10) 57 | driver.quit() 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /store_pagination_element_to_click.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver import ChromeOptions, Keys 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | import json 7 | import time 8 | 9 | options = ChromeOptions() 10 | 11 | # maximized and disable forbar 12 | options.add_argument("--start-maximized") 13 | options.add_experimental_option("useAutomationExtension", False) 14 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 15 | 16 | driver = webdriver.Chrome(options=options) 17 | 18 | url = "https://www.google.com/search?q=toi" 19 | 20 | 21 | driver.get(url) 22 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 23 | 24 | pages = driver.find_element(by=By.CLASS_NAME, value="AaVjTc").find_element(by=By.TAG_NAME, value='tr').find_elements(by=By.TAG_NAME, value='td') 25 | 26 | lst = [page.find_element(by=By.TAG_NAME, value='a') for page in pages[2:]] 27 | 28 | print(lst) 29 | print(len(lst)) 30 | lst[2].click() 31 | time.sleep(5) 32 | 33 | driver.quit() -------------------------------------------------------------------------------- /sustainalytics_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 21, 2023 5 | """ 6 | import time 7 | from bs4 import BeautifulSoup 8 | from selenium.webdriver import Chrome, ChromeOptions 9 | from selenium.webdriver.common.by import 
By 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support import expected_conditions as EC 12 | 13 | options = ChromeOptions() 14 | options.add_argument('--start-maximized') 15 | 16 | driver = Chrome(options=options) 17 | wait = WebDriverWait(driver, 10) 18 | url = "https://www.sustainalytics.com/esg-ratings" 19 | driver.get(url) 20 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a#hs-eu-confirmation-button'))).click() 21 | 22 | data = [] 23 | 24 | 25 | def data_processing(source): 26 | soup = BeautifulSoup(source, "html.parser") 27 | selected_page = soup.select_one('span.pagination-page.selected').text 28 | print(f"---------------------- This is page {selected_page} ----------------------") 29 | 30 | container = soup.select_one('section#company_ratings') 31 | company_rows = container.find_all(class_='company-row') 32 | 33 | for company_row in company_rows: 34 | company_name = company_row.find(class_='primary-color').get_text() 35 | esg_risk_rating = company_row.find(class_='col-2').get_text() 36 | 37 | print(f"Company: {company_name} | Rating: {esg_risk_rating}") 38 | data.append({"Company": company_name, "Rating": esg_risk_rating}) 39 | 40 | 41 | def first_page(): 42 | # process the 1st page 43 | data_processing(driver.page_source) 44 | return f"data:\n{data}" 45 | 46 | 47 | def multiple_page(page_num): 48 | # process the first page 49 | data_processing(driver.page_source) 50 | 51 | # click and process next pages 52 | for i in range(2, page_num+1): 53 | driver.execute_script(f""" 54 | function getElementByXpath(path) {{ 55 | return document.evaluate(path, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; 56 | }}; 57 | getElementByXpath('//*[@id="victor-pagination"]/a[@class="pagination-page" and text()="{i}"]').click(); 58 | """) 59 | 60 | time.sleep(2) 61 | data_processing(driver.page_source) 62 | 63 | return f"data:\n{data}" 64 | 65 | 66 | if __name__ == '__main__': 67 | # print(first_page()) 68 | print(multiple_page(4)) 69 | 70 | """ 71 | output: 72 | 73 | ---------------------- This is page 1 ---------------------- 74 | Company: 1-800-FLOWERS.COM, Inc. | Rating: 23.6 75 | Company: 1&1 AG | Rating: 22.2 76 | Company: 10X Genomics, Inc. | Rating: 22.6 77 | Company: 11 Bit Studios SA | Rating: 16.3 78 | Company: 1Life Healthcare, Inc. | Rating: 22.5 79 | Company: 1st Source Corp. | Rating: 31.7 80 | Company: 1stdibs.com, Inc. | Rating: 26.7 81 | Company: 22nd Century Group, Inc. | Rating: 35.4 82 | Company: 23andMe Holding Co. | Rating: 25.6 83 | Company: 29metals Ltd. | Rating: 42.8 84 | ---------------------- This is page 2 ---------------------- 85 | Company: 2i Rete Gas SpA | Rating: 25.2 86 | Company: 2seventy Bio, Inc. | Rating: 32.0 87 | Company: 2U, Inc. | Rating: 26.8 88 | Company: 360 DigiTech, Inc. | Rating: 28.4 89 | Company: 360 One Wam Ltd. | Rating: 33.3 90 | Company: 360 Security Technology, Inc. | Rating: 23.1 91 | Company: 361 Degrees International Ltd. | Rating: 18.6 92 | Company: 37 Interactive Entertainment Network Technology Group Co. Ltd. | Rating: 14.3 93 | Company: 3D Systems Corp. | Rating: 23.0 94 | Company: 3i Group Plc | Rating: 11.1 95 | ---------------------- This is page 3 ---------------------- 96 | Company: 3M Co. | Rating: 33.9 97 | Company: 3M India Ltd. | Rating: 23.4 98 | Company: 3R Petroleum Óleo e Gás SA | Rating: 56.7 99 | Company: 3SBio, Inc. | Rating: 27.1 100 | Company: 407 East Development Group GP | Rating: 45.7 101 | Company: 407 International, Inc. 
| Rating: 11.4 102 | Company: 4D Molecular Therapeutics, Inc. | Rating: 28.4 103 | Company: 4imprint Group Plc | Rating: 17.2 104 | Company: 5E Advanced Materials, Inc. | Rating: 42.0 105 | Company: 5I5J Holding Group Co. Ltd. | Rating: 15.0 106 | ---------------------- This is page 4 ---------------------- 107 | Company: 7-Eleven Malaysia Holdings Bhd. | Rating: 24.6 108 | Company: 7-Eleven, Inc. | Rating: 35.1 109 | Company: 888 Holdings Plc | Rating: 18.7 110 | Company: 8x8, Inc. | Rating: 29.9 111 | Company: 908 Devices, Inc. | Rating: 36.8 112 | Company: 91APP, Inc. | Rating: 25.8 113 | Company: A-Living Smart City Services Co., Ltd. | Rating: 9.3 114 | Company: A-Mark Precious Metals, Inc. | Rating: 30.3 115 | Company: A. O. Smith Corp. | Rating: 25.4 116 | Company: A.G. BARR Plc | Rating: 23.7 117 | data: 118 | [{'Company': '1-800-FLOWERS.COM, Inc.', 'Rating': '23.6'}, {'Company': '1&1 AG', 'Rating': '22.2'}, {'Company': '10X Genomics, Inc.', 'Rating': '22.6'}, {'Company': '11 Bit Studios SA', 'Rating': '16.3'}, {'Company': '1Life Healthcare, Inc.', 'Rating': '22.5'}, {'Company': '1st Source Corp.', 'Rating': '31.7'}, {'Company': '1stdibs.com, Inc.', 'Rating': '26.7'}, {'Company': '22nd Century Group, Inc.', 'Rating': '35.4'}, {'Company': '23andMe Holding Co.', 'Rating': '25.6'}, {'Company': '29metals Ltd.', 'Rating': '42.8'}, {'Company': '2i Rete Gas SpA', 'Rating': '25.2'}, {'Company': '2seventy Bio, Inc.', 'Rating': '32.0'}, {'Company': '2U, Inc.', 'Rating': '26.8'}, {'Company': '360 DigiTech, Inc.', 'Rating': '28.4'}, {'Company': '360 One Wam Ltd.', 'Rating': '33.3'}, {'Company': '360 Security Technology, Inc.', 'Rating': '23.1'}, {'Company': '361 Degrees International Ltd.', 'Rating': '18.6'}, {'Company': '37 Interactive Entertainment Network Technology Group Co. Ltd.', 'Rating': '14.3'}, {'Company': '3D Systems Corp.', 'Rating': '23.0'}, {'Company': '3i Group Plc', 'Rating': '11.1'}, {'Company': '3M Co.', 'Rating': '33.9'}, {'Company': '3M India Ltd.', 'Rating': '23.4'}, {'Company': '3R Petroleum Óleo e Gás SA', 'Rating': '56.7'}, {'Company': '3SBio, Inc.', 'Rating': '27.1'}, {'Company': '407 East Development Group GP', 'Rating': '45.7'}, {'Company': '407 International, Inc.', 'Rating': '11.4'}, {'Company': '4D Molecular Therapeutics, Inc.', 'Rating': '28.4'}, {'Company': '4imprint Group Plc', 'Rating': '17.2'}, {'Company': '5E Advanced Materials, Inc.', 'Rating': '42.0'}, {'Company': '5I5J Holding Group Co. Ltd.', 'Rating': '15.0'}, {'Company': '7-Eleven Malaysia Holdings Bhd.', 'Rating': '24.6'}, {'Company': '7-Eleven, Inc.', 'Rating': '35.1'}, {'Company': '888 Holdings Plc', 'Rating': '18.7'}, {'Company': '8x8, Inc.', 'Rating': '29.9'}, {'Company': '908 Devices, Inc.', 'Rating': '36.8'}, {'Company': '91APP, Inc.', 'Rating': '25.8'}, {'Company': 'A-Living Smart City Services Co., Ltd.', 'Rating': '9.3'}, {'Company': 'A-Mark Precious Metals, Inc.', 'Rating': '30.3'}, {'Company': 'A. O. Smith Corp.', 'Rating': '25.4'}, {'Company': 'A.G. 
BARR Plc', 'Rating': '23.7'}] 119 | """ 120 | 121 | """ 122 | reference: 123 | https://stackoverflow.com/questions/76513303/scraping-a-website-for-multiple-pages-that-url-does-not-c 124 | """ -------------------------------------------------------------------------------- /switching_bw_windows.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver import ChromeOptions 4 | from selenium.webdriver.common.by import By 5 | 6 | options = ChromeOptions() 7 | 8 | options.add_argument("--start-maximized") 9 | options.add_experimental_option("useAutomationExtension", False) 10 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 11 | 12 | driver = webdriver.Chrome(options=options) 13 | 14 | url = "http://www.hamiltoncountyherald.com/PublicNotices.aspx" 15 | 16 | 17 | def scrape_data(): 18 | # Create a list of the label ids of the data you want to scrape 19 | labels = ["lbl1", "lbl2", "lbl3", "lbl4", "lbl5", "lbl6", "lbl7", "lbl8", "lbl9", "lbl10", "lbl11"] 20 | 21 | # Empty list to append data values to 22 | list_of_data = [] 23 | 24 | # Loop through the list and append the text value of each label 25 | for items in labels: 26 | link = driver.find_element("id", items) 27 | link_label = link.text 28 | list_of_data.append(link_label) 29 | 30 | # Create a list of titles to use as dict keys 31 | # (note: labels has 11 entries but titles only 10, so zip() silently drops the last value) 32 | titles = ["Borrower", "Address", "Original Trustee", "Attorney", "Instrumental No.", "Substitute Trustee", 33 | "Advertised Auction Date", "Date of First Public Notice", "Trust Date", "DR No."] 34 | 35 | # Zip the titles and label values together into one dict 36 | zipped_data = dict(zip(titles, list_of_data)) 37 | 38 | return zipped_data 39 | 40 | 41 | driver.get(url) 42 | tables = driver.find_elements(By.TAG_NAME, 'table')[0] 43 | foreclosure_table = tables.find_elements(By.TAG_NAME, 'table')[7] 44 | views = foreclosure_table.find_elements(By.TAG_NAME, 'tr')[1:] 45 | 46 | final_data = [] 47 | for view in views: 48 | # Store the current window handle 49 | win_handle_before = driver.current_window_handle 50 | 51 | # Perform the click operation that opens a new window 52 | view.find_element(By.TAG_NAME, 'a').click() 53 | time.sleep(2) 54 | 55 | # Switch to the newly opened window 56 | for win_handle in driver.window_handles: 57 | driver.switch_to.window(win_handle) 58 | 59 | # Perform the actions on the new window 60 | final_data.append(scrape_data()) 61 | 62 | # Close the new window once it is no longer required 63 | driver.close() 64 | 65 | # Switch back to the original browser (first window) 66 | driver.switch_to.window(win_handle_before) 67 | time.sleep(2) 68 | 69 | print(final_data) 70 | -------------------------------------------------------------------------------- /switching_bw_windows_excercise_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Dart Korea 4 | Date : July 10, 2023 5 | """ 6 | 7 | from selenium.webdriver import Chrome, Keys 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.select import Select 10 | from selenium.webdriver.support.wait import WebDriverWait 11 | import selenium.webdriver.support.expected_conditions as EC 12 | import time 13 | 14 | driver = Chrome() 15 | 16 | 17 | url = "https://dart.fss.or.kr/dsab007/main.do" 18 | driver.get(url) 19 | 20 | # select the search-by option '회사명' (company name) 21 | state = driver.find_element(By.ID, 'option') 22 | nsw = Select(state) 23 | nsw.select_by_visible_text('회사명') 24 | 25 | search = 
driver.find_element(By.ID, 'textCrpNm') 26 | search.send_keys('조이푸드')  # the company name to search ("Joy Food") 27 | search.send_keys(Keys.ENTER) 28 | 29 | 30 | # Store the current window handle 31 | win_handle_before = driver.current_window_handle 32 | 33 | # Click the link titled "감사보고서 공시뷰어 새창" (audit report disclosure viewer, new window), which opens a new window 34 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[title="감사보고서 공시뷰어 새창"]'))).click() 35 | time.sleep(5) 36 | 37 | # Switch to the newly opened window 38 | driver.switch_to.window(driver.window_handles[1]) 39 | 40 | # Perform the actions on the new window 41 | con = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'ul[class="jstree-children"]'))) 42 | con.find_elements(By.TAG_NAME, 'li')[-1].find_element(By.TAG_NAME, 'a').click() 43 | 44 | # do the page parsing; the heading "1. 회사의 개요" means "1. Company overview" 45 | driver.switch_to.frame(driver.find_element(By.ID, 'ifrm')) 46 | overview = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//p[text()="1. 회사의 개요"]'))) 47 | print(overview.text) 48 | 49 | # Close the new window once it is no longer required 50 | driver.close() 51 | 52 | # Switch back to the original browser (first window) 53 | driver.switch_to.window(win_handle_before) 54 | -------------------------------------------------------------------------------- /text_option_under_select.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 22, 2023 5 | """ 6 | from selenium.webdriver import Chrome 7 | from selenium.webdriver.support.ui import Select 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.wait import WebDriverWait 10 | import selenium.webdriver.support.expected_conditions as EC 11 | 12 | driver = Chrome() 13 | url = "https://vahan.parivahan.gov.in/vahan4dashboard/vahan/view/reportview.xhtml" 14 | driver.get(url) 15 | dropdown_element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "j_idt31_input"))) 16 | dropdown = Select(dropdown_element) 17 | option_names = [option.get_attribute('innerHTML') for option in dropdown.options] 18 | print(len(option_names)) 19 | print(option_names) 20 | 21 | 22 | """ 23 | output: 24 | 25 | 35 26 | ['All Vahan4 Running States (34/36)', 'Andaman & Nicobar Island(8)', 'Andhra Pradesh(80)', 'Arunachal Pradesh(26)', 'Assam(36)', 'Bihar(49)', 'Chhattisgarh(30)', 'Chandigarh(1)', 'UT of DNH and DD(3)', 'Delhi(23)', 'Goa(13)', 'Gujarat(37)', 'Himachal Pradesh(113)', 'Haryana(179)', 'Jharkhand(30)', 'Jammu and Kashmir(21)', 'Karnataka(68)', 'Kerala(87)', 'Ladakh(3)', 'Maharashtra(53)', 'Meghalaya(13)', 'Manipur(12)', 'Madhya Pradesh(52)', 'Mizoram(10)', 'Nagaland(9)', 'Odisha(39)', 'Punjab(93)', 'Puducherry(8)', 'Rajasthan(142)', 'Sikkim(8)', 'Tamil Nadu(146)', 'Tripura(9)', 'Uttarakhand(21)', 'Uttar Pradesh(78)', 'West Bengal(56)'] 27 | 28 | reference: 29 | https://stackoverflow.com/questions/76528109/how-do-i-obtain-a-list-of-values-from-a-website-dropdown-using-selenium 30 | """ -------------------------------------------------------------------------------- /tiktok_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : TikTok video's view count 3 | Author : Ajeet 4 | Date : July 19, 2023 5 | """ 6 | 7 | import time 8 | import json 9 | from selenium.webdriver import Chrome 10 | from selenium.webdriver.chrome.service import Service 11 | from webdriver_manager.chrome import ChromeDriverManager 12 | from selenium.webdriver.common.by import By 13 | from 
selenium.webdriver.support.wait import WebDriverWait 14 | import selenium.webdriver.support.expected_conditions as EC 15 | 16 | 17 | def save_view_counts(urls, filename): 18 | data = {} 19 | driver = Chrome(service=Service(ChromeDriverManager().install())) 20 | 21 | for url in urls: 22 | 23 | driver.get(url) 24 | recent_videos = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'strong[data-e2e="video-views"]'))) 25 | print(f"number of recent videos: {len(recent_videos)}") 26 | data[url] = [i.get_attribute('innerHTML') for i in recent_videos] 27 | 28 | time.sleep(3) # delay between requests 29 | 30 | driver.quit() 31 | print(data) 32 | 33 | # save the data 34 | with open(filename, 'w') as f: 35 | f.write(json.dumps(data, indent=4)) 36 | 37 | # URLs to scrape 38 | urls = [ 39 | 'https://www.tiktok.com/@netflix', 40 | 'https://www.tiktok.com/@twitter' 41 | ] 42 | 43 | save_view_counts(urls, 'views.txt') 44 | 45 | 46 | """ 47 | output: 48 | 49 | number of recent videos: 34 50 | number of recent videos: 23 51 | {'https://www.tiktok.com/@netflix': ['99.7K', '136.7K', '27.6K', '18.1K', '12.8K', '7670', '87K', '15.8K', '14.5K', '102.1K', '25.7K', '203.2K', '4.1M', '43K', '32.9K', '101.5K', '2.3M', '233K', '440.9K', '92.4K', '25.9K', '53.3K', '33.3K', '449.5K', '92K', '53.2K', '215.5K', '32.1K', '1.6M', '415K', '224K', '319.1K', '469.8K', '420.1K'], 'https://www.tiktok.com/@twitter': ['361.4K', '138.5K', '54.4K', '169.3K', '67.6K', '90.4K', '4.6M', '115.4K', '48.4K', '45.6K', '73K', '223.8K', '107K', '11.8M', '155.7K', '100K', '1.4M', '94.6K', '55.3K', '67.4K', '48K', '40.7K', '40.4K']} 52 | 53 | A few things to note: 54 | 55 | - we can locate the view-count element directly with the CSS selector strong[data-e2e="video-views"] 56 | - to get the view-count text, use i.get_attribute('innerHTML') instead of i.text 57 | 58 | reference: 59 | https://stackoverflow.com/questions/76716861/bulk-scraping-tiktok-view-count-from-20-most-recent-posts 60 | """ -------------------------------------------------------------------------------- /tiktok_com_video_post.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : TikTok video post 3 | Author : Ajeet 4 | Date : April 30, 2025 5 | 6 | Description: 7 | This script automates the process of uploading a video to TikTok using Selenium. 8 | It loads a set of saved cookies to bypass the login, navigates to the upload section, 9 | selects a video from the local system, and posts it. 10 | """ 11 | 12 | import json 13 | from time import sleep 14 | from selenium.webdriver import Chrome, ChromeOptions 15 | from selenium.webdriver.common.by import By 16 | from selenium.webdriver.support.wait import WebDriverWait 17 | import selenium.webdriver.support.expected_conditions as EC 18 | 19 | 20 | def upload_video_to_tiktok(video_path: str, cookie_file: str = 'tiktok_cookies.json') -> None: 21 | """ 22 | Uploads a video to TikTok using Selenium automation. 23 | 24 | Args: 25 | video_path (str): Full path to the video file to upload. 26 | cookie_file (str): Path to the JSON file containing TikTok session cookies. 
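Returns:
    None

Note: the cookie file is assumed to hold cookies exported earlier from a logged-in TikTok session
(for instance via a cookie-export tool or driver.get_cookies()); the exact source of the file is an
assumption, and any export containing the fields read below works.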
27 | """ 28 | # ===== SETUP CHROME OPTIONS ===== 29 | options = ChromeOptions() 30 | options.add_argument('--start-maximized') 31 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 32 | 33 | # Initialize driver and wait 34 | driver = Chrome(options=options) 35 | wait = WebDriverWait(driver, 10) 36 | url = "https://www.tiktok.com/" 37 | 38 | try: 39 | # Step 1: Open TikTok 40 | driver.get(url) 41 | 42 | # Step 2: Load cookies 43 | with open(cookie_file) as f: 44 | cookies = json.load(f) 45 | 46 | for cookie in cookies: 47 | driver.add_cookie({ 48 | "domain": cookie['domain'], 49 | "value": cookie['value'], 50 | "id": cookie['id'], 51 | "name": cookie['name'] 52 | }) 53 | 54 | # Step 3: Reload with authenticated session 55 | sleep(2) 56 | driver.get(url) 57 | 58 | # Step 4: Click Upload button 59 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button[aria-label="Upload"]'))).click() 60 | 61 | # Step 5: Upload the video file 62 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type="file"]'))).send_keys(video_path) 63 | 64 | # Step 6: Click Post (enabled) button 65 | wait.until(EC.presence_of_element_located(( 66 | By.XPATH, '//button[(@data-e2e="post_video_button") and (@aria-disabled="false")]' 67 | ))).click() 68 | 69 | print("Video upload initiated successfully.") 70 | sleep(5) 71 | 72 | except Exception as e: 73 | print(f"An error occurred during upload: {e}") 74 | finally: 75 | driver.quit() 76 | 77 | 78 | # Example usage 79 | if __name__ == "__main__": 80 | upload_video_to_tiktok("D:\\IMG_4070.mp4") 81 | 82 | """ 83 | reference: https://stackoverflow.com/a/79599064/11179336 84 | 85 | """ -------------------------------------------------------------------------------- /tiktok_video_post.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/tiktok_video_post.gif -------------------------------------------------------------------------------- /transat_com.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from selenium import webdriver 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.webdriver.chrome.options import Options 7 | 8 | 9 | def main(): 10 | options = Options() 11 | options.add_argument('--start-maximized') 12 | options.add_argument("--disable-notifications") 13 | options.add_argument("--disable-popup-blocking") 14 | 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | options.add_experimental_option("useAutomationExtension", False) 17 | 18 | driver = webdriver.Chrome(options=options) 19 | wait = WebDriverWait(driver, 5) 20 | 21 | driver.get(f"https://www.transat.com/fr-CA?search=package") 22 | 23 | wait.until(EC.presence_of_element_located((By.ID, 'FROMSEARCH'))).click() 24 | sleep(2) 25 | driver.find_element(By.CSS_SELECTOR, '#YUL-FROMSEARCH > span.code').click() 26 | 27 | wait.until(EC.presence_of_element_located((By.ID, 'TOSEARCH'))).click() 28 | sleep(2) 29 | driver.find_element(By.CSS_SELECTOR, '#City-13-TOSEARCH > div > span.name').click() 30 | sleep(2) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /twitter_login.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Project : Twitter Login 3 | Author : Ajeet 4 | Date : August 7, 2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | from selenium.webdriver import ChromeOptions, Keys 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support import expected_conditions as EC 12 | from selenium.webdriver.support.wait import WebDriverWait 13 | 14 | 15 | def login_twitter(username: str, password: str) -> None: 16 | """ 17 | Log in to Twitter using the provided username and password. 18 | 19 | This function automates the login process on Twitter using Selenium WebDriver. 20 | It opens the Twitter login page, enters the provided username and password, and submits the form. 21 | 22 | Parameters: 23 | username (str): The Twitter username to log in with. 24 | password (str): The Twitter password for the specified username. 25 | 26 | Returns: 27 | None 28 | """ 29 | options = ChromeOptions() 30 | options.add_argument("--start-maximized") 31 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 32 | driver = webdriver.Chrome(options=options) 33 | 34 | # Open the Twitter login page 35 | url = "https://twitter.com/i/flow/login" 36 | driver.get(url) 37 | 38 | # Find and input the username 39 | username_input = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[autocomplete="username"]'))) 40 | username_input.send_keys(username) 41 | username_input.send_keys(Keys.ENTER) 42 | 43 | # Find and input the password 44 | password_input = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[name="password"]'))) 45 | password_input.send_keys(password) 46 | password_input.send_keys(Keys.ENTER) 47 | 48 | # Wait for a short period (e.g., 10 seconds) to ensure the login process completes 49 | time.sleep(10) 50 | 51 | 52 | if __name__ == "__main__": 53 | your_username = "your_twitter_username_here" 54 | your_password = "your_twitter_password_here" 55 | 56 | # Call the login_twitter function with your Twitter credentials 57 | login_twitter(your_username, your_password) 58 | -------------------------------------------------------------------------------- /usa_visa_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 9, 2023 5 | """ 6 | from selenium import webdriver 7 | from selenium.webdriver.common.by import By 8 | from selenium.webdriver.support.wait import WebDriverWait 9 | from selenium.webdriver.support import expected_conditions as EC 10 | from selenium.webdriver.common.keys import Keys 11 | from time import sleep 12 | 13 | 14 | driver = webdriver.Chrome() 15 | driver.get('https://usa.visa.com/support/consumer/travel-support/exchange-rate-calculator.html') 16 | wait = WebDriverWait(driver, 30) 17 | 18 | # click to Accept 19 | wait.until(EC.element_to_be_clickable((By.XPATH, "//a[text()='Accept']"))).click() 20 | 21 | shadow_root = driver.find_element(By.XPATH, "//dm-calculator").shadow_root 22 | # enter_amount 23 | shadow_root.find_element(By.ID, "input_amount_paid").send_keys("1") 24 | 25 | # from_dropdown 26 | shadow_root.find_element(By.ID, "autosuggestinput_from").click() 27 | shadow_root.find_element(By.ID, "listbox-item-157").click() 28 | 29 | # to_dropdown 30 | shadow_root.find_element(By.ID, "autosuggestinput_to").click() 31 | shadow_root.find_element(By.ID, "listbox-item-0").click() 32 | 33 | # 
fee_edit: open the editor for the bank fee 34 | shadow_root.find_element(By.CLASS_NAME, 'vs-link-cta.vs-calculator-edit-link').click() 35 | 36 | bank_rate = shadow_root.find_element(By.ID, "input_bank_rate") 37 | # select the existing value, clear it, and type 0 38 | bank_rate.send_keys(Keys.CONTROL, 'a') 39 | bank_rate.send_keys(Keys.BACKSPACE) 40 | bank_rate.send_keys('0') 41 | 42 | # click on the Calculate Conversion button 43 | shadow_root.find_elements(By.CSS_SELECTOR, 'div.vs-container')[-1].find_elements(By.TAG_NAME, 'button')[0].click() 44 | sleep(2) 45 | 46 | 47 | -------------------------------------------------------------------------------- /wallet_polygon_technology.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Wallet Polygon Technology 3 | Author : Ajeet 4 | Date : July 12, 2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | from selenium.webdriver import ChromeOptions 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support import expected_conditions as EC 12 | from selenium.webdriver.support.wait import WebDriverWait 13 | 14 | options = ChromeOptions() 15 | options.add_argument("--start-maximized") 16 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 17 | 18 | driver = webdriver.Chrome(options=options) 19 | wait = WebDriverWait(driver, 10) 20 | url = "https://wallet.polygon.technology/?redirectOnConnect=zkEVM_bridge" 21 | 22 | driver.get(url) 23 | # click on the "Connect to a Wallet" button 24 | wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.navbar__apps-section__auth__login"))).click() 25 | time.sleep(2) 26 | driver.execute_script("""document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-connect-wallet-view').shadowRoot.querySelector('w3m-desktop-wallet-selection').shadowRoot.querySelector('w3m-modal-footer').querySelectorAll('w3m-wallet-button')[0].shadowRoot.querySelector('button').click();""") 27 | time.sleep(5) 28 | 29 | """ 30 | - Various elements on this website are embedded inside a shadow-root. 31 | - For example, the target/desired button here is embedded five nested shadow roots deep. 32 | - After clicking on Connect to a Wallet, we wait for 1-2 seconds just to make sure that the overlay window is 33 | visibly present, although it appears very quickly. 34 | - The JavaScript query used to locate and click on the desired button: 35 | 36 | document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-connect-wallet-view').shadowRoot.querySelector('w3m-desktop-wallet-selection').shadowRoot.querySelector('w3m-modal-footer').querySelectorAll('w3m-wallet-button')[0].shadowRoot.querySelector('button').click(); 37 | 38 | clicks on the very first wallet. If you would like to click on the 2nd or 3rd wallet option instead, simply replace 39 | the querySelectorAll('w3m-wallet-button')[0] with querySelectorAll('w3m-wallet-button')[1] or 40 | querySelectorAll('w3m-wallet-button')[2] respectively in the above-mentioned JavaScript query. 
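- A hedged generalization of the same technique (a sketch; it assumes the same w3m-* tag names as the chain above):
  instead of one long hard-coded chain, a small script can walk a list of selectors and descend into each element's
  shadowRoot whenever one exists:

      driver.execute_script('''
          let node = document;
          for (const sel of arguments[0]) {
              const el = node.querySelector(sel);
              if (!el) return;              // selector not found, give up
              node = el.shadowRoot || el;   // pierce the shadow root when present
          }
          node.click();                     // the last selector targets the plain <button>
      ''', ["w3m-modal", "w3m-modal-router", "w3m-connect-wallet-view",
            "w3m-desktop-wallet-selection", "w3m-modal-footer",
            "w3m-wallet-button", "button"])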
41 | 42 | reference: 43 | https://stackoverflow.com/questions/76658230/selenium-how-to-get-element-in-shadow-root-of-html-page-code 44 | """ -------------------------------------------------------------------------------- /wallet_sendit_arcana_network.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : https://sendit.arcana.network/app/login 3 | Author : Ajeet 4 | Date : August 18, 2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | from selenium.webdriver import ChromeOptions 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support import expected_conditions as EC 12 | from selenium.webdriver.support.wait import WebDriverWait 13 | 14 | options = ChromeOptions() 15 | options.add_argument("--start-maximized") 16 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 17 | 18 | driver = webdriver.Chrome(options=options) 19 | wait = WebDriverWait(driver, 20) 20 | driver.get(url="https://sendit.arcana.network/app/login") 21 | 22 | # Click on the "Connect Wallet" button on the page 23 | wait.until(EC.element_to_be_clickable((By.XPATH, '//span[text()=" Connect Wallet "]'))).click() 24 | time.sleep(2) 25 | 26 | # Click on the "View All" to see all wallet options 27 | driver.execute_script( 28 | """document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-connect-wallet-view').shadowRoot.querySelector('w3m-desktop-wallet-selection').shadowRoot.querySelector('w3m-modal-footer').querySelector('div.w3m-grid').querySelector('w3m-view-all-wallets-button').shadowRoot.querySelector('button').click();""") 29 | 30 | time.sleep(2) 31 | # Click on the "MetaMask" wallet option 32 | driver.execute_script( 33 | """document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-wallet-explorer-view').shadowRoot.querySelector('div.w3m-grid').querySelector('[name="MetaMask"]').shadowRoot.querySelector('button').click();""") 34 | 35 | time.sleep(2) 36 | 37 | """ 38 | reference: 39 | https://stackoverflow.com/questions/76922866/how-to-authorise-in-walletconnect-using-python 40 | """ -------------------------------------------------------------------------------- /yomiuri_co_jp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Yomiuri 3 | Author : Ajeet 4 | Date : July 10, 2023 5 | """ 6 | 7 | from selenium.webdriver import Chrome, ChromeOptions 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.wait import WebDriverWait 10 | import selenium.webdriver.support.expected_conditions as EC 11 | import time 12 | 13 | options = ChromeOptions() 14 | options.add_argument('--start-maximized') 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | options.add_experimental_option("prefs", { 17 | "profile.default_content_setting_values.notifications": 2}) 18 | 19 | driver = Chrome(options=options) 20 | wait = WebDriverWait(driver, 10) 21 | driver.get('https://www.yomiuri.co.jp/editorial/') 22 | 23 | element = wait.until(EC.presence_of_element_located((By.ID, "ajax_more_button"))) 24 | 25 | count = len(wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'ul#latest_list>li[class="p-list-item "]')))) 26 | print(f"initial number of articles: {count}") 27 | 28 | while True: 29 | 30 | driver.execute_script("return arguments[0].click()", element) 31 | time.sleep(1) 32 | new_count = 
len(driver.find_elements(By.CSS_SELECTOR, 'ul#latest_list>li[class="p-list-item "]')) 33 | print(f"articles after clicking read/load more button: {new_count}") 34 | if new_count>count: 35 | count = new_count 36 | else: 37 | break 38 | 39 | if count==100: 40 | break 41 | 42 | news_articles = [i.find_element(By.TAG_NAME, 'a').get_attribute('href') for i in driver.find_elements(By.CSS_SELECTOR, 'ul#latest_list>li[class="p-list-item "]')] 43 | 44 | print(news_articles) 45 | print(f"Total articles {len(news_articles)}") 46 | 47 | """ 48 | output: 49 | initial number of articles: 20 50 | articles after clicking read/load more button: 30 51 | articles after clicking read/load more button: 40 52 | articles after clicking read/load more button: 50 53 | articles after clicking read/load more button: 60 54 | articles after clicking read/load more button: 70 55 | articles after clicking read/load more button: 80 56 | articles after clicking read/load more button: 90 57 | articles after clicking read/load more button: 100 58 | ['https://www.yomiuri.co.jp/editorial/20230707-OYT1T50249/', 'https://www.yomiuri.co.jp/editorial/20230707-OYT1T50246/', 'https://www.yomiuri.co.jp/editorial/20230706-OYT1T50325/', 'https://www.yomiuri.co.jp/editorial/20230706-OYT1T50322/', 'https://www.yomiuri.co.jp/editorial/20230705-OYT1T50244/', 'https://www.yomiuri.co.jp/editorial/20230705-OYT1T50241/', 'https://www.yomiuri.co.jp/editorial/20230704-OYT1T50239/', 'https://www.yomiuri.co.jp/editorial/20230704-OYT1T50236/', 'https://www.yomiuri.co.jp/editorial/20230703-OYT1T50226/', 'https://www.yomiuri.co.jp/editorial/20230703-OYT1T50223/', 'https://www.yomiuri.co.jp/editorial/20230702-OYT1T50218/', 'https://www.yomiuri.co.jp/editorial/20230702-OYT1T50215/', 'https://www.yomiuri.co.jp/editorial/20230701-OYT1T50247/', 'https://www.yomiuri.co.jp/editorial/20230701-OYT1T50244/', 'https://www.yomiuri.co.jp/editorial/20230630-OYT1T50265/', 'https://www.yomiuri.co.jp/editorial/20230630-OYT1T50259/', 'https://www.yomiuri.co.jp/editorial/20230629-OYT1T50195/', 'https://www.yomiuri.co.jp/editorial/20230629-OYT1T50192/', 'https://www.yomiuri.co.jp/editorial/20230628-OYT1T50240/', 'https://www.yomiuri.co.jp/editorial/20230628-OYT1T50237/', 'https://www.yomiuri.co.jp/editorial/20230627-OYT1T50257/', 'https://www.yomiuri.co.jp/editorial/20230627-OYT1T50254/', 'https://www.yomiuri.co.jp/editorial/20230626-OYT1T50297/', 'https://www.yomiuri.co.jp/editorial/20230626-OYT1T50292/', 'https://www.yomiuri.co.jp/editorial/20230625-OYT1T50191/', 'https://www.yomiuri.co.jp/editorial/20230625-OYT1T50188/', 'https://www.yomiuri.co.jp/editorial/20230624-OYT1T50186/', 'https://www.yomiuri.co.jp/editorial/20230624-OYT1T50183/', 'https://www.yomiuri.co.jp/editorial/20230623-OYT1T50305/', 'https://www.yomiuri.co.jp/editorial/20230623-OYT1T50302/', 'https://www.yomiuri.co.jp/editorial/20230623-OYT1T50083/', 'https://www.yomiuri.co.jp/editorial/20230623-OYT1T50070/', 'https://www.yomiuri.co.jp/editorial/20230621-OYT1T50273/', 'https://www.yomiuri.co.jp/editorial/20230621-OYT1T50270/', 'https://www.yomiuri.co.jp/editorial/20230620-OYT1T50203/', 'https://www.yomiuri.co.jp/editorial/20230620-OYT1T50200/', 'https://www.yomiuri.co.jp/editorial/20230619-OYT1T50253/', 'https://www.yomiuri.co.jp/editorial/20230619-OYT1T50250/', 'https://www.yomiuri.co.jp/editorial/20230618-OYT1T50138/', 'https://www.yomiuri.co.jp/editorial/20230618-OYT1T50135/', 'https://www.yomiuri.co.jp/editorial/20230617-OYT1T50290/', 'https://www.yomiuri.co.jp/editorial/20230617-OYT1T50287/', 
'https://www.yomiuri.co.jp/editorial/20230616-OYT1T50258/', 'https://www.yomiuri.co.jp/editorial/20230616-OYT1T50254/', 'https://www.yomiuri.co.jp/editorial/20230616-OYT1T50013/', 'https://www.yomiuri.co.jp/editorial/20230616-OYT1T50010/', 'https://www.yomiuri.co.jp/editorial/20230614-OYT1T50286/', 'https://www.yomiuri.co.jp/editorial/20230614-OYT1T50284/', 'https://www.yomiuri.co.jp/editorial/20230613-OYT1T50164/', 'https://www.yomiuri.co.jp/editorial/20230613-OYT1T50161/', 'https://www.yomiuri.co.jp/editorial/20230612-OYT1T50193/', 'https://www.yomiuri.co.jp/editorial/20230612-OYT1T50189/', 'https://www.yomiuri.co.jp/editorial/20230610-OYT1T50273/', 'https://www.yomiuri.co.jp/editorial/20230610-OYT1T50270/', 'https://www.yomiuri.co.jp/editorial/20230609-OYT1T50270/', 'https://www.yomiuri.co.jp/editorial/20230609-OYT1T50267/', 'https://www.yomiuri.co.jp/editorial/20230608-OYT1T50261/', 'https://www.yomiuri.co.jp/editorial/20230608-OYT1T50257/', 'https://www.yomiuri.co.jp/editorial/20230607-OYT1T50239/', 'https://www.yomiuri.co.jp/editorial/20230607-OYT1T50236/', 'https://www.yomiuri.co.jp/editorial/20230606-OYT1T50228/', 'https://www.yomiuri.co.jp/editorial/20230606-OYT1T50225/', 'https://www.yomiuri.co.jp/editorial/20230605-OYT1T50252/', 'https://www.yomiuri.co.jp/editorial/20230605-OYT1T50244/', 'https://www.yomiuri.co.jp/editorial/20230604-OYT1T50144/', 'https://www.yomiuri.co.jp/editorial/20230604-OYT1T50141/', 'https://www.yomiuri.co.jp/editorial/20230603-OYT1T50230/', 'https://www.yomiuri.co.jp/editorial/20230603-OYT1T50227/', 'https://www.yomiuri.co.jp/editorial/20230602-OYT1T50262/', 'https://www.yomiuri.co.jp/editorial/20230602-OYT1T50259/', 'https://www.yomiuri.co.jp/editorial/20230601-OYT1T50232/', 'https://www.yomiuri.co.jp/editorial/20230601-OYT1T50229/', 'https://www.yomiuri.co.jp/editorial/20230531-OYT1T50307/', 'https://www.yomiuri.co.jp/editorial/20230531-OYT1T50304/', 'https://www.yomiuri.co.jp/editorial/20230530-OYT1T50254/', 'https://www.yomiuri.co.jp/editorial/20230530-OYT1T50251/', 'https://www.yomiuri.co.jp/editorial/20230529-OYT1T50201/', 'https://www.yomiuri.co.jp/editorial/20230529-OYT1T50198/', 'https://www.yomiuri.co.jp/editorial/20230528-OYT1T50116/', 'https://www.yomiuri.co.jp/editorial/20230528-OYT1T50113/', 'https://www.yomiuri.co.jp/editorial/20230527-OYT1T50305/', 'https://www.yomiuri.co.jp/editorial/20230527-OYT1T50301/', 'https://www.yomiuri.co.jp/editorial/20230526-OYT1T50307/', 'https://www.yomiuri.co.jp/editorial/20230526-OYT1T50304/', 'https://www.yomiuri.co.jp/editorial/20230525-OYT1T50378/', 'https://www.yomiuri.co.jp/editorial/20230525-OYT1T50371/', 'https://www.yomiuri.co.jp/editorial/20230524-OYT1T50273/', 'https://www.yomiuri.co.jp/editorial/20230524-OYT1T50270/', 'https://www.yomiuri.co.jp/editorial/20230523-OYT1T50272/', 'https://www.yomiuri.co.jp/editorial/20230523-OYT1T50269/', 'https://www.yomiuri.co.jp/editorial/20230522-OYT1T50192/', 'https://www.yomiuri.co.jp/editorial/20230522-OYT1T50189/', 'https://www.yomiuri.co.jp/editorial/20230521-OYT1T50225/', 'https://www.yomiuri.co.jp/editorial/20230520-OYT1T50354/', 'https://www.yomiuri.co.jp/editorial/20230520-OYT1T50351/', 'https://www.yomiuri.co.jp/editorial/20230519-OYT1T50244/', 'https://www.yomiuri.co.jp/editorial/20230518-OYT1T50235/', 'https://www.yomiuri.co.jp/editorial/20230518-OYT1T50232/', 'https://www.yomiuri.co.jp/editorial/20230517-OYT1T50298/', 'https://www.yomiuri.co.jp/editorial/20230517-OYT1T50295/'] 59 | Total articles 100 60 | 61 | Few things to note: 62 | 63 | - as we 
load the home page, it initially contains 20 articles under the 最新ニュース (latest news) section. 64 | - every click on the さらに読み込む (load more) button loads 10 more articles, and so on. 65 | - as you may notice, to click on the desired button we used driver.execute_script("return arguments[0].click()", element) 66 | - there could be thousands of articles on the page. If you wish to load more, simply remove the if count==100: statement 67 | or raise the limit to load a given number of articles. Note that since every click loads 10 more articles, 68 | the value of the variable count will be a multiple of 10 starting from 20 (20, 30, 40, 50, and so on). 69 | 70 | reference: 71 | https://stackoverflow.com/questions/76643641/how-to-click-a-button-with-selenium-on-a-javascript-page 72 | """ --------------------------------------------------------------------------------