├── DKB_Bank_login.py ├── EtalongroupRU ├── README.md ├── __init__.py ├── __pycache__ │ └── logger.cpython-311.pyc ├── data.json ├── debug.log ├── etalongroup_ru.py ├── helper.py └── logger.py ├── InstagramAPI ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── credentials.cpython-310.pyc │ └── instagram_baseline.cpython-310.pyc ├── credentials.py ├── hashtag_search.py └── instagram_baseline.py ├── LICENSE ├── README.md ├── alnair_ae.py ├── app_powerbi_com.py ├── app_powerbi_com_anp.gif ├── app_powerbi_com_anp.py ├── audible_com.py ├── autotrader_co_uk.py ├── baseball_scraper ├── baseball_scraper │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-311.pyc │ │ └── settings.cpython-311.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-311.pyc │ │ └── players.cpython-311.pyc │ │ └── players.py ├── players.json └── scrapy.cfg ├── click_checkbox_whirlpool.py ├── coroners_nsw_gov_download_multiple_pdf.py ├── destinytracker_com.py ├── eex_com.py ├── egle_state_mi.py ├── element_not_visible_to_click.py ├── fb_login_with_popup_alert_disabled.py ├── find_chrome_version.py ├── find_masa_com.py ├── flicker_scroll.py ├── google_com_finance.py ├── google_finance.gif ├── hong_kong_observatory_climate.py ├── imdb_com.py ├── indeed_com.py ├── jodidb_org.py ├── join_team_meeting.gif ├── knowde_com.py ├── lebara_nl.py ├── lidl_GB.py ├── load_cookies_to_accept_all.py ├── ma_shienkikan.py ├── medicus_online_at.py ├── mercedes-benz.py ├── mydealz_de.py ├── nested_shadow_root.py ├── nse_india.py ├── nse_india_2.py ├── oddsportal_com.py ├── pump_fun.py ├── quiker_com.py ├── scrape_bluechip_io.py ├── scrape_www_knx_org.py ├── scroll_down.py ├── scroll_to_bottom.py ├── sel_pagination_excercise.py ├── select_element_by_tag_text.py ├── selenium_action_move_by_offset.py ├── selenium_baseline.py ├── selenium_chrome_profile.py ├── selenium_file_download.py ├── selenium_get_attribute.py ├── selenium_get_parent_element.py ├── selenium_hover_click.py ├── selenium_hover_click_text.py ├── selenium_iframe_excercise.py ├── selenium_iframe_excercise_2.py ├── selenium_iframe_excercise_3.py ├── selenium_iframe_excercise_linkedin.py ├── selenium_nth_css_selector.py ├── selenium_ok_alert.py ├── selenium_options.py ├── selenium_partial_class_name.py ├── selenium_scrap_transcript.py ├── selenium_scrape_youtube_channel.py ├── selenium_scrape_youtube_search.py ├── selenium_select_tag_dropdown.py ├── selenium_send_keys _excercise.py ├── selenium_shadow_open_excercise.py ├── selenium_shadow_root.py ├── selenium_switches.json ├── selenium_take_screenshot.py ├── selenium_twitter_login.py ├── selenium_work_shadow_closed.pyi ├── selenium_workday_login.py ├── shein_com.py ├── stackoverflow_login_and_save_cookies.py ├── stackoverflow_login_with_cookies.py ├── stackoverflow_track.py ├── store_pagination_element_to_click.py ├── sustainalytics_com.py ├── switching_bw_windows.py ├── switching_bw_windows_excercise_2.py ├── text_option_under_select.py ├── the_line_cl.py ├── tiktok_com.py ├── tiktok_com_video_post.py ├── tiktok_video_post.gif ├── transat_com.py ├── twitter_login.py ├── usa_visa_com.py ├── wallet_polygon_technology.py ├── wallet_sendit_arcana_network.py ├── yomiuri_co_jp.py └── youtube_channel_all_videos.py /DKB_Bank_login.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : DKB Bank Login 3 | Author : Ajeet 4 | Date : July 24, 2023 5 | """ 6 | 7 | # 
Import required modules 8 | import time 9 | from selenium.webdriver import Chrome,Keys 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | import selenium.webdriver.support.expected_conditions as EC 13 | from selenium.webdriver.common.action_chains import ActionChains 14 | 15 | def main(): 16 | # Initialize Chrome WebDriver 17 | driver = Chrome() 18 | 19 | # Open the DKB login page 20 | driver.get("https://banking.dkb.de/login") 21 | 22 | # Set up WebDriverWait with a timeout of 10 seconds 23 | wait = WebDriverWait(driver, 10) 24 | 25 | # Switch to the iframe and refuse all cookies 26 | # The website may display a cookie consent popup within an iframe. 27 | iframe = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'iframe#privacy-iframe'))) 28 | driver.switch_to.frame(iframe) 29 | driver.find_element(By.CSS_SELECTOR, 'button.btn.refuse-all').click() 30 | 31 | # After refusing cookies, go back to the main page (DKB login page) 32 | driver.get("https://banking.dkb.de/login") 33 | 34 | # Initialize ActionChains to perform actions like mouse movements and keystrokes 35 | actions = ActionChains(driver) 36 | 37 | # Logging in with provided credentials 38 | # Find the username input field, click on it, and enter the username '123456789' 39 | wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[data-testid="sui-input-username"]'))).click() 40 | username = driver.find_element(By.CSS_SELECTOR, 'input#username') 41 | actions.move_to_element(username).send_keys('123456789').perform() 42 | 43 | # Find the password input field, click on it, and enter the password 'abcdefg' 44 | wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[data-testid="sui-input-password"]'))).click() 45 | password = driver.find_element(By.CSS_SELECTOR, 'input#password') 46 | actions.move_to_element(password).send_keys('abcdefg').perform() 47 | 48 | # Press the Enter key to submit the login form 49 | password.send_keys(Keys.ENTER) 50 | 51 | # Wait for 2 seconds (to allow the page to load or perform further actions) 52 | time.sleep(2) 53 | 54 | # Call the main function to start the script 55 | main() 56 | 57 | """ 58 | reference: 59 | https://stackoverflow.com/questions/76749285/i-cannot-send-keys-because-element-not-interactable-in-selenium-web-automation 60 | """ 61 | -------------------------------------------------------------------------------- /EtalongroupRU/README.md: -------------------------------------------------------------------------------- 1 | ## Overview : 2 | This script scrapes apartment details from the Voxhall property page using Selenium and BeautifulSoup. 3 | ## Usage 4 | 5 | ### Command-Line Arguments 6 | - `--file`: Path to the output JSON file (optional). 
7 | 8 | ### Examples 9 | 10 | #### Extract Data and Print to Console 11 | ```bash 12 | python etalongroup_ru.py 13 | ``` 14 | 15 | #### Extract Data and Write to a JSON File 16 | ```bash 17 | python etalongroup_ru.py --file "F:\automation\EtalongroupRU\data.json" 18 | ``` 19 | ## Stackoverflow link : 20 | 21 | [reference](https://stackoverflow.com/a/79368954/11179336) 22 | -------------------------------------------------------------------------------- /EtalongroupRU/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: VermaAK 3 | Created: 1/19/2025 4 | Description: 5 | Project: automation 6 | """ 7 | -------------------------------------------------------------------------------- /EtalongroupRU/__pycache__/logger.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/EtalongroupRU/__pycache__/logger.cpython-311.pyc -------------------------------------------------------------------------------- /EtalongroupRU/data.json: -------------------------------------------------------------------------------- 1 | [{"link": "https://etalongroup.ru//msk/choose/92334/", "price": "20 519 852 ₽ ", "title": "Студия № 197", "area": "26.0 м²", "floor": "16 этаж"}, {"link": "https://etalongroup.ru//msk/choose/92437/", "price": "20 726 234 ₽ ", "title": "Студия № 37", "area": "25.4 м²", "floor": "4 этаж"}, {"link": "https://etalongroup.ru//msk/choose/92445/", "price": "20 976 711 ₽ ", "title": "Студия № 44", "area": "26.0 м²", "floor": "5 этаж"}, {"link": "https://etalongroup.ru//msk/choose/92453/", "price": "20 994 562 ₽ ", "title": "Студия № 51", "area": "25.7 м²", "floor": "5 этаж"}, {"link": "https://etalongroup.ru//msk/choose/92483/", "price": "21 039 082 ₽ ", "title": "Студия № 79", "area": "25.7 м²", "floor": "7 этаж"}, {"link": "https://etalongroup.ru//msk/choose/92255/", "price": "21 835 647 ₽ ", "title": "Студия № 125", "area": "25.8 м²", "floor": "10 этаж"}] -------------------------------------------------------------------------------- /EtalongroupRU/debug.log: -------------------------------------------------------------------------------- 1 | 2025-01-19 23:21:31,519 - INFO - Configuring WebDriver... 2 | 2025-01-19 23:21:33,254 - INFO - Fetching page content... 3 | 2025-01-19 23:21:55,688 - INFO - Closing WebDriver... 4 | 2025-01-19 23:21:57,936 - INFO - Parsing apartments data... 5 | 2025-01-19 23:21:57,967 - INFO - New F:/automation/EtalongroupRU/data.json has been created with the data. 6 | 2025-01-19 23:22:29,908 - INFO - Configuring WebDriver... 7 | 2025-01-19 23:22:31,609 - INFO - Fetching page content... 8 | 2025-01-19 23:22:50,340 - INFO - Closing WebDriver... 9 | 2025-01-19 23:22:52,628 - INFO - Parsing apartments data...
10 | 2025-01-19 23:22:52,659 - INFO - Scraped Data: 11 | 2025-01-19 23:22:52,659 - INFO - [{'link': 'https://etalongroup.ru//msk/choose/92334/', 'price': '20 519 852 ₽ ', 'title': 'Студия № 197', 'area': '26.0 м²', 'floor': '16 этаж'}, {'link': 'https://etalongroup.ru//msk/choose/92437/', 'price': '20 726 234 ₽ ', 'title': 'Студия № 37', 'area': '25.4 м²', 'floor': '4 этаж'}, {'link': 'https://etalongroup.ru//msk/choose/92445/', 'price': '20 976 711 ₽ ', 'title': 'Студия № 44', 'area': '26.0 м²', 'floor': '5 этаж'}, {'link': 'https://etalongroup.ru//msk/choose/92453/', 'price': '20 994 562 ₽ ', 'title': 'Студия № 51', 'area': '25.7 м²', 'floor': '5 этаж'}, {'link': 'https://etalongroup.ru//msk/choose/92483/', 'price': '21 039 082 ₽ ', 'title': 'Студия № 79', 'area': '25.7 м²', 'floor': '7 этаж'}, {'link': 'https://etalongroup.ru//msk/choose/92255/', 'price': '21 835 647 ₽ ', 'title': 'Студия № 125', 'area': '25.8 м²', 'floor': '10 этаж'}] 12 | 2025-01-19 23:26:49,457 - INFO - Configuring WebDriver... 13 | 2025-01-19 23:26:51,251 - INFO - Fetching page content... 14 | 2025-01-19 23:27:25,906 - INFO - Closing WebDriver... 15 | 2025-01-19 23:27:28,232 - INFO - Parsing apartments data... 16 | 2025-01-19 23:27:28,311 - INFO - New F:/automation/EtalongroupRU/data.json has been created with the data. 17 | -------------------------------------------------------------------------------- /EtalongroupRU/etalongroup_ru.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Ajeet 3 | Created: 1/19/2025 4 | Description: This script scrapes apartment details from the Voxhall property page using Selenium and BeautifulSoup. 5 | Project: automation 6 | """ 7 | import time 8 | import argparse 9 | from bs4 import BeautifulSoup 10 | from selenium import webdriver 11 | from selenium.webdriver.chrome.options import Options 12 | from selenium.webdriver.common.by import By 13 | from typing import List, Dict 14 | 15 | from logger import logger 16 | from helper import save_file 17 | 18 | 19 | def configure_webdriver(headless: bool = True) -> webdriver.Chrome: 20 | """Configures and initializes the Selenium WebDriver.""" 21 | options = Options() 22 | if headless: 23 | options.add_argument('--headless') 24 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 25 | options.add_experimental_option("useAutomationExtension", False) 26 | return webdriver.Chrome(options=options) 27 | 28 | 29 | def fetch_page_content(driver: webdriver.Chrome, url: str, selector: str, wait_time: int = 2) -> str: 30 | """ 31 | Fetches the HTML content of a specific container on a web page. 32 | 33 | This function navigates to the provided URL, waits for the page to load, and retrieves the HTML content 34 | of the specified container identified by the CSS selector. 35 | 36 | Args: 37 | driver (webdriver.Chrome): The Selenium WebDriver instance to control the browser. 38 | url (str): The URL of the page to fetch. 39 | selector (str): The CSS selector to identify the container element whose content is to be fetched. 40 | wait_time (int, optional): The time (in seconds) to wait for the page to load before fetching content. Defaults to 2 seconds. 41 | 42 | Returns: 43 | str: The HTML content of the specified container. 
44 | """ 45 | # Navigate to the provided URL 46 | driver.get(url) 47 | 48 | # Wait for the page to load fully (with a default wait time) 49 | time.sleep(wait_time) 50 | 51 | # Find the container element using the provided CSS selector and retrieve its inner HTML 52 | container = driver.find_element(By.CSS_SELECTOR, selector).get_attribute('innerHTML') 53 | 54 | # Return the HTML content of the container 55 | return container 56 | 57 | 58 | def parse_apartments(html_content: str) -> List[Dict[str, str]]: 59 | """ 60 | Parses apartment data from the provided HTML content. 61 | 62 | This function extracts the apartment details such as the link, price, title, area, and floor from the given 63 | HTML content of a real estate page. It uses BeautifulSoup to parse the HTML and collects relevant information. 64 | 65 | Args: 66 | html_content (str): The HTML content of the page to parse. 67 | 68 | Returns: 69 | List[Dict[str, str]]: A list of dictionaries, each containing the details of an apartment (link, price, title, area, floor). 70 | """ 71 | # Parse the HTML content using BeautifulSoup 72 | soup = BeautifulSoup(html_content, 'html.parser') 73 | 74 | # Initialize an empty list to store the apartments' details 75 | apartments = [] 76 | 77 | # Find all the apartment containers on the page 78 | result_container = soup.find_all('div', class_="bg-white relative") 79 | 80 | # Loop through each apartment container to extract the required data 81 | for result in result_container: 82 | # Find the anchor tag that leads to the apartment's page 83 | root = result.find_next('a') 84 | 85 | # Extract area and floor information from the text in the corresponding span 86 | area_floor = root.select_one('section.flex.flex-col.gap-2>span.th-b1-regular').text.split(' | ') 87 | 88 | # Append the apartment's details as a dictionary to the apartments list 89 | apartments.append({ 90 | "link": f"https://etalongroup.ru/{root['href']}", 91 | "price": root.select_one('span.th-h2').text, 92 | "title": root.select_one('span.th-h4').text, 93 | "area": area_floor[0], 94 | "floor": area_floor[1] 95 | }) 96 | 97 | # Return the list of apartments with extracted details 98 | return apartments 99 | 100 | 101 | def main(): 102 | 103 | parser = argparse.ArgumentParser( 104 | description='A script scrapes apartment details from the Voxhall property page and write results to an JSON file.' 
105 | ) 106 | parser.add_argument('--file', type=str, help='Path to the output JSON file', default=None) 107 | args = parser.parse_args() 108 | 109 | 110 | url = 'https://etalongroup.ru/msk/object/voxhall/' 111 | container_selector = '#card-object>div' 112 | 113 | logger.info("Configuring WebDriver...") 114 | driver = configure_webdriver() 115 | 116 | logger.info("Fetching page content...") 117 | html_content = fetch_page_content(driver, url, container_selector) 118 | 119 | logger.info("Closing WebDriver...") 120 | driver.quit() 121 | 122 | logger.info("Parsing apartments data...") 123 | apartments = parse_apartments(html_content) 124 | 125 | if args.file: 126 | save_file(args.file, apartments) 127 | else: 128 | logger.info("Scraped Data:") 129 | logger.info(apartments) 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /EtalongroupRU/helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Ajeet 3 | Created: 1/19/2025 4 | Description: Helper utilities for saving the scraped apartment data to a JSON file. 5 | Project: automation 6 | """ 7 | import os 8 | import json 9 | from typing import List, Dict 10 | from logger import logger 11 | 12 | 13 | def save_file(path: str, data: List) -> None: 14 | """ 15 | Saves the provided data to a file at the specified path. 16 | If the file already exists, it is deleted before saving the new data. 17 | 18 | Args: 19 | path (str): The file path where the data will be saved. 20 | data (List): The data to be saved in JSON format. 21 | 22 | Returns: 23 | None: The function performs an action (saving a file) and does not return a value. 24 | 25 | Side Effects: 26 | - If the file exists at the specified path, it is removed before saving the new data. 27 | - A log message is generated after successfully saving the data. 28 | """ 29 | 30 | # Check if the file exists 31 | if os.path.exists(path): 32 | # If the file exists, delete it 33 | os.remove(path) 34 | 35 | # Open the file in write mode and save the data in JSON format 36 | with open(path, 'w', encoding='utf-8') as file: 37 | json.dump(data, file, ensure_ascii=False) 38 | 39 | # Log a message indicating the file was successfully created 40 | logger.info(f"New {path} has been created with the data.") 41 | -------------------------------------------------------------------------------- /EtalongroupRU/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Ajeet 3 | Created: 1/19/2025 4 | Description: Logging configuration for the Voxhall scraper.
5 | Project: automation 6 | """ 7 | import logging 8 | import sys 9 | 10 | # Create a logger 11 | logger = logging.getLogger("Voxhall") 12 | logger.setLevel(logging.INFO) 13 | 14 | # Formatter for consistent log format 15 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 16 | 17 | # File handler 18 | file_handler = logging.FileHandler("F:/automation/EtalongroupRU/debug.log", encoding="utf-8") 19 | file_handler.setFormatter(formatter) 20 | 21 | # Stream handler 22 | stream_handler = logging.StreamHandler(sys.stdout) 23 | stream_handler.setFormatter(formatter) 24 | 25 | # Add handlers to the logger 26 | if not logger.handlers: # Prevent adding handlers multiple times 27 | logger.addHandler(file_handler) 28 | logger.addHandler(stream_handler) 29 | -------------------------------------------------------------------------------- /InstagramAPI/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 9, 2023 5 | """ 6 | 7 | 8 | def print_hi(name): 9 | print(f'Hi, {name}') 10 | 11 | 12 | if __name__ == '__main__': 13 | print_hi('Python') 14 | -------------------------------------------------------------------------------- /InstagramAPI/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/InstagramAPI/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /InstagramAPI/__pycache__/credentials.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/InstagramAPI/__pycache__/credentials.cpython-310.pyc -------------------------------------------------------------------------------- /InstagramAPI/__pycache__/instagram_baseline.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/InstagramAPI/__pycache__/instagram_baseline.cpython-310.pyc -------------------------------------------------------------------------------- /InstagramAPI/credentials.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 20, 2023 5 | """ 6 | 7 | import os 8 | 9 | creds = {'instagram_username': os.environ.get('instagram_username'), 10 | 'instagram_password': os.environ.get('instagram_password'), 11 | } 12 | -------------------------------------------------------------------------------- /InstagramAPI/hashtag_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 20, 2023 5 | """ 6 | import time 7 | from typing import List, Dict 8 | from selenium.webdriver.common.by import By 9 | from InstagramAPI.instagram_baseline import Instagram 10 | from selenium.webdriver.support.wait import WebDriverWait 11 | import selenium.webdriver.support.expected_conditions as EC 12 | 13 | 14 | def hashtag(browser, tag: str) -> List[Dict]: 15 | data = [] 16 | url = f'https://www.instagram.com/explore/tags/{tag}/' 17 | browser.get(url) 18 | container = WebDriverWait(browser, 
10).until(EC.presence_of_element_located((By.TAG_NAME, 'article'))) 19 | 20 | for _ in range(3): 21 | browser.execute_script('window.scrollBy(0, 5000);') 22 | time.sleep(1) 23 | 24 | images = container.find_elements(By.TAG_NAME, 'img') 25 | for image in images: 26 | data.append({ 27 | "description": image.get_attribute('alt'), 28 | "image_link": image.get_attribute('src') 29 | }) 30 | 31 | return data 32 | 33 | 34 | if __name__ == '__main__': 35 | obj = Instagram() 36 | driver = obj.load_cookies("D:\\automation\InstagramAPI\instgram_cookies.json") 37 | print(hashtag(driver, 'tree')) 38 | -------------------------------------------------------------------------------- /InstagramAPI/instagram_baseline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Instagram 3 | Author : Ajeet 4 | Date : June 20, 2023 5 | """ 6 | import time 7 | import json 8 | from typing import Optional 9 | 10 | from selenium import webdriver 11 | from selenium.webdriver import ChromeOptions, Keys 12 | from selenium.webdriver.common.by import By 13 | from selenium.webdriver.support.wait import WebDriverWait 14 | import selenium.webdriver.support.expected_conditions as EC 15 | from selenium.webdriver.chrome.webdriver import WebDriver 16 | 17 | from credentials import creds 18 | 19 | 20 | class Instagram: 21 | 22 | def __init__(self): 23 | options = ChromeOptions() 24 | options.add_argument("--start-maximized") 25 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 26 | 27 | self.driver = webdriver.Chrome(options=options) 28 | self.wait = WebDriverWait(self.driver, 10) 29 | 30 | def login(self, username, password): 31 | self.driver.get("https://www.instagram.com/") 32 | # username 33 | self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="username"]'))).send_keys(username) 34 | # password 35 | self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="password"]'))).send_keys(password+Keys.ENTER) 36 | # click on "Not Now" to close "Save Your Login Info?" 
37 | self.wait.until(EC.visibility_of_element_located((By.XPATH, '//div[text()="Not Now"]'))).click() 38 | 39 | def save_cookies(self, username: str, password: str, path: str) -> None: 40 | self.login(username, password) 41 | json_object = json.dumps(self.driver.get_cookies()) 42 | 43 | # Writing to instagram_cookies.json 44 | with open(path, "w") as outfile: 45 | outfile.write(json_object) 46 | 47 | def load_cookies(self, path: str) -> WebDriver: 48 | self.driver.get("https://www.instagram.com/") 49 | 50 | # Opening JSON file 51 | f = open(path) 52 | cookies = json.load(f) 53 | # load cookies to the driver 54 | for cookie in cookies: 55 | self.driver.add_cookie(cookie) 56 | 57 | time.sleep(1) 58 | # refresh the browser 59 | self.driver.refresh() 60 | time.sleep(1) 61 | self.wait.until(EC.visibility_of_element_located((By.XPATH, '//button[text()="Not Now"]'))).click() 62 | time.sleep(1) 63 | 64 | return self.driver 65 | 66 | 67 | if __name__ == '__main__': 68 | obj = Instagram() 69 | # obj.login(creds['instagram_username'], creds['instagram_password']) 70 | # obj.save_cookies(creds['instagram_username'], creds['instagram_password'], 'D:\\automation\InstagramAPI\instgram_cookies.json') 71 | # obj.load_cookies('D:\\automation\InstagramAPI\instgram_cookies.json') 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stackoverflow Exercises 2 | 3 | ## Overview : 4 | This repository contains a collection of real-world examples demonstrating web scraping using Python with Selenium. 5 | Most of these scripts were created to assist the community on Stack Overflow by providing fully functional solutions to their questions. 6 | 7 | ## Script Naming Convention : 8 | The scripts are named based on the websites they target for scraping. 9 | For instance, a script designed to scrape data from `https://www.abcd.co.ef/editorial/` is named `abcd_co_ef.py`. 10 | This naming convention makes it easy to identify the source website for each script. 11 | 12 | ## Stack Overflow References : 13 | The scripts also include references to the corresponding Stack Overflow questions. 14 | This allows you to easily access the original discussions and gain background knowledge about the problems being addressed. 
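As a quick, hypothetical illustration of this convention (the helper below is not part of the repository; the function name and the example URL are made up), the mapping from a target URL to a script name can be sketched in Python:

```python
from urllib.parse import urlparse

def url_to_script_name(url: str) -> str:
    """Derive a script name from a target URL per this repository's convention."""
    host = urlparse(url).netloc.removeprefix("www.")  # e.g. "abcd.co.ef"
    return host.replace(".", "_") + ".py"             # dots become underscores

print(url_to_script_name("https://www.abcd.co.ef/editorial/"))  # -> abcd_co_ef.py
```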
15 | 16 | -------------------------------------------------------------------------------- /alnair_ae.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Alnair 3 | Author : Ajeet 4 | Date : July 29, 2023 5 | """ 6 | 7 | from selenium.webdriver import Chrome 8 | from selenium.webdriver.chrome.service import Service as ChromeService 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.chrome.options import Options 11 | from webdriver_manager.chrome import ChromeDriverManager 12 | 13 | 14 | URL_alnair = 'https://alnair.ae/app/view/1412/3386/apartment/apartments' 15 | o = Options() 16 | o.add_experimental_option('detach', True) 17 | o.add_argument('--start-maximized') 18 | 19 | driver = Chrome(service=ChromeService(ChromeDriverManager().install()), options=o) 20 | 21 | def get_data(): 22 | driver.get(URL_alnair) 23 | driver.set_page_load_timeout(2) 24 | 25 | scroll_bar = driver.find_element(By.CSS_SELECTOR, 'div[class^="_scrollContainer_"]') 26 | driver.execute_script("arguments[0].scrollBy(0, arguments[0].scrollHeight);", scroll_bar) 27 | 28 | get_data() 29 | 30 | 31 | """ 32 | - You first need to find/locate the scrollbar which is embedded in the HTML page. 33 | - The web-element
(the div[class^="_scrollContainer_"] element) represents the scrollbar, which can be located using the 34 | mentioned strategy. 35 | - Once we find the web element for the scrollbar, we can simply scroll down by its height. 36 | 37 | reference: 38 | https://stackoverflow.com/questions/76791670/scrolling-using-selenium4-10-0 39 | """ -------------------------------------------------------------------------------- /app_powerbi_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : App PowerBI 3 | Author : Ajeet 4 | Date : June 16, 2023 5 | """ 6 | # import libraries 7 | import time 8 | from selenium.webdriver import Chrome, ChromeOptions 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver import ActionChains 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | from selenium.common.exceptions import MoveTargetOutOfBoundsException 14 | 15 | options = ChromeOptions() 16 | options.add_argument('--start-maximized') 17 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 18 | driver = Chrome(options=options) 19 | wait = WebDriverWait(driver, 10) 20 | driver.get("https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection") 21 | 22 | # wait for the dashboard to load 23 | wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'transform.bringToFront'))) 24 | 25 | state = driver.find_element(By.CSS_SELECTOR, 'div[aria-label="State"]') 26 | state.find_element(By.CSS_SELECTOR, 'span[title="Select all"]').click() 27 | 28 | job_name = driver.find_element(By.CSS_SELECTOR, 'div[aria-label="Job Name"]') 29 | # for example, select option 4 30 | job_name.find_element(By.CSS_SELECTOR, 'div[data-row-index="4"]').click() 31 | 32 | time.sleep(2) 33 | 34 | scrolls = driver.find_elements(By.CSS_SELECTOR, 'div.scroll-bar-part-bar') 35 | h_scroll = scrolls[2] 36 | v_scroll = scrolls[3] 37 | 38 | # Perform horizontal scrolling 39 | action_chains = ActionChains(driver) 40 | action_chains.move_to_element(h_scroll).click_and_hold().move_by_offset(500, 0).release().perform() 41 | time.sleep(1) 42 | 43 | flag = True 44 | while flag: 45 | try: 46 | # Perform vertical scrolling 47 | action_chains = ActionChains(driver) 48 | action_chains.move_to_element(v_scroll).click_and_hold().move_by_offset(0, 100).release().perform() 49 | 50 | except MoveTargetOutOfBoundsException: 51 | flag = False 52 | 53 | # find the desired 2nd table 54 | table = driver.find_elements(By.CSS_SELECTOR, 'div.tableExContainer')[1] 55 | 56 | # now you can parse this table element as you want. 57 | 58 | 59 | """ 60 | Few points to note: 61 | 62 | 1. We first wait for the dashboard on the webpage to be visible. 63 | 2. Next, locate the State web element and find the Select all option in it to click. 64 | 3. Similarly, locate the Job Name web element and find the option number 4 in it to click. 65 | 4. Next, we locate all the vertical and horizontal scroll bars with the CSS selector and then get the horizontal and 66 | vertical scroll bars of the desired table (the 2nd table here). 67 | 5. After getting the web elements of the target scroll bars, we first perform the horizontal scrolling. 68 | 6. Afterwards, we perform the vertical scrolling to load all the data in the target table. 69 |
7. Finally, locate the target/desired table; the variable "table" holds the web element of the table you want to scrape, which you can use for further parsing to extract the table's data. 70 | 71 | reference: 72 | https://stackoverflow.com/questions/76214166/scrape-websites-power-bi-dashboard-using-python-selenium 73 | """ -------------------------------------------------------------------------------- /app_powerbi_com_anp.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/app_powerbi_com_anp.gif -------------------------------------------------------------------------------- /app_powerbi_com_anp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : PowerBI App 3 | Author : Ajeet 4 | Date : April 22, 2025 5 | """ 6 | # ===== IMPORTS ===== 7 | from time import sleep 8 | from selenium import webdriver 9 | from selenium.webdriver.chrome.options import Options 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | from selenium.webdriver import ActionChains 14 | 15 | 16 | # ===== SETUP OPTIONS ===== 17 | def initialize_driver() -> webdriver.Chrome: 18 | """Initializes and returns a configured Chrome WebDriver.""" 19 | options = Options() 20 | options.add_argument("--start-maximized") 21 | options.add_argument("force-device-scale-factor=0.95") 22 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 23 | return webdriver.Chrome(options=options) 24 | 25 | 26 | driver = initialize_driver() 27 | wait = WebDriverWait(driver, 10) 28 | 29 | 30 | # ===== HELPER FUNCTIONS ===== 31 | def wait_and_click(by: By, identifier: str) -> None: 32 | """ 33 | Waits for an element to be clickable and clicks it. 34 | 35 | Args: 36 | by (By): Locator strategy (e.g., By.XPATH, By.CSS_SELECTOR). 37 | identifier (str): The locator string for the target element. 38 | """ 39 | element = wait.until(EC.element_to_be_clickable((by, identifier))) 40 | element.click() 41 | 42 | 43 | def scroll_slicer_container(offset_y: int = 100) -> None: 44 | """ 45 | Scrolls inside a slicer dropdown popup using ActionChains. 46 | 47 | Args: 48 | offset_y (int): The vertical scroll offset. Positive = down, Negative = up. 49 | """ 50 | sc = driver.find_element(By.CSS_SELECTOR, 51 | 'div[id^="slicer-dropdown-popup-"]>div>div>div:nth-child(2)>div>div:nth-child(3)' 52 | ) 53 | action = ActionChains(driver) 54 | action.move_to_element(sc).click_and_hold().move_by_offset(0, offset_y).release().perform() 55 | 56 | 57 | # ===== MAIN FUNCTION ===== 58 | def report_analyser(year: str, month: int) -> None: 59 | """ 60 | Navigates to a Power BI report and selects a specific month in a slicer filter. 61 | 62 | Args: 63 | year (str): The target year to expand in the slicer (e.g., "2022"). 64 | month (int): The month to select (1-based index corresponding to the slicer position). 65 | """ 66 | url = "https://app.powerbi.com/view?r=eyJrIjoiZWIzNDg3YzUtMGFlMC00MzdmLTgzOWQtZThkOWExNTU2NjBlIiwidCI6IjQ0OTlmNGZmLTI0YTYtNGI0Mi1iN2VmLTEyNGFmY2FkYzkxMyJ9" 67 | driver.get(url) 68 | 69 | # Wait for page to load and navigate to second page 70 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Mercado Page navigation .
Mercado"]'))) 71 | wait_and_click(By.CSS_SELECTOR, '#embedWrapperID>div.logoBarWrapper>logo-bar>div>div>div>logo-bar-navigation>span>button:nth-child(3)') 72 | 73 | # Open the slicer dropdown 74 | wait_and_click(By.CSS_SELECTOR, 75 | '#pvExplorationHost > div > div > exploration > div > explore-canvas > div > div.canvasFlexBox > div > div.displayArea.disableAnimations.fitToPage > div.visualContainerHost.visualContainerOutOfFocus > visual-container-repeat > visual-container:nth-child(6) > transform > div > div.visualContent > div > div > visual-modern > div > div > div.slicer-content-wrapper > div>i' 76 | ) 77 | 78 | # Expand the year to show months 79 | wait_and_click(By.XPATH, f'//div[@class="slicerItemContainer" and @title="{year}"]/div[@class="expandButton"]') 80 | sleep(3) 81 | 82 | # Scroll and select the month 83 | scroll_slicer_container(offset_y=100) 84 | sleep(2) 85 | wait_and_click(By.XPATH, f'//div[@class="slicerItemContainer" and @aria-posinset="{month}"]') 86 | sleep(2) 87 | 88 | 89 | # ===== RUN SCRIPT ===== 90 | if __name__ == "__main__": 91 | report_analyser('2023', 7) 92 | 93 | """ 94 | reference: 95 | https://stackoverflow.com/a/79585038/11179336 96 | """ 97 | -------------------------------------------------------------------------------- /autotrader_co_uk.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : Apr 27, 2025 5 | """ 6 | import time 7 | from selenium.webdriver import Chrome, ChromeOptions 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.wait import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | chrome_options = ChromeOptions() 13 | chrome_options.add_experimental_option("excludeSwitches", ['enable-automation']) 14 | driver = Chrome(options=chrome_options) 15 | 16 | driver.get("https://www.autotrader.co.uk") 17 | wait = WebDriverWait(driver, 10) 18 | 19 | # wait for the target iframe to get loaded in order to switch to it 20 | wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, 'iframe[id^="sp_message_iframe_"]'))) 21 | 22 | # click to 'Reject All' 23 | wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@title="Reject All"]'))).click() 24 | 25 | # Switch back to the main page content 26 | driver.switch_to.default_content() 27 | 28 | # Now you can continue interacting with the main page here 29 | 30 | time.sleep(5) 31 | 32 | """ 33 | reference: 34 | https://stackoverflow.com/a/79593560/11179336 35 | """ -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/baseball_scraper/baseball_scraper/__init__.py -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/baseball_scraper/baseball_scraper/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/__pycache__/settings.cpython-311.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/baseball_scraper/baseball_scraper/__pycache__/settings.cpython-311.pyc -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class BaseballScraperItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class BaseballScraperSpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info("Spider opened: %s" % spider.name) 57 | 58 | 59 | class BaseballScraperDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 
67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info("Spider opened: %s" % spider.name) 104 | -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class BaseballScraperPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for baseball_scraper project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = "baseball_scraper" 11 | 12 | SPIDER_MODULES = ["baseball_scraper.spiders"] 13 | NEWSPIDER_MODULE = "baseball_scraper.spiders" 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = "baseball_scraper (+http://www.yourdomain.com)" 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = True 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | #CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | DOWNLOAD_DELAY = 2 29 | # The download delay setting will honor only one of: 30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | #CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | #COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | #TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | #DEFAULT_REQUEST_HEADERS = { 41 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 42 | # "Accept-Language": "en", 43 | #} 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | #SPIDER_MIDDLEWARES = { 48 | # "baseball_scraper.middlewares.BaseballScraperSpiderMiddleware": 543, 49 | #} 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | #DOWNLOADER_MIDDLEWARES = { 54 | # "baseball_scraper.middlewares.BaseballScraperDownloaderMiddleware": 543, 55 | #} 56 | 57 | # Enable or disable extensions 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 59 | #EXTENSIONS = { 60 | # "scrapy.extensions.telnet.TelnetConsole": None, 61 | #} 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | #ITEM_PIPELINES = { 66 | # "baseball_scraper.pipelines.BaseballScraperPipeline": 300, 67 | #} 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 71 | #AUTOTHROTTLE_ENABLED = True 72 | # The initial download delay 73 | #AUTOTHROTTLE_START_DELAY = 5 74 | # The maximum download delay to be set in case of high latencies 75 | #AUTOTHROTTLE_MAX_DELAY = 60 76 | # The average number of requests Scrapy should be sending in parallel to 77 | # each remote server 78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 79 | # Enable showing throttling stats for every response received: 80 | #AUTOTHROTTLE_DEBUG = False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | #HTTPCACHE_ENABLED = True 85 | #HTTPCACHE_EXPIRATION_SECS = 0 86 | #HTTPCACHE_DIR = "httpcache" 87 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 88 | #HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 89 | 90 | # Set settings whose default value is deprecated to a future-proof value 91 | TWISTED_REACTOR = 
"twisted.internet.asyncioreactor.AsyncioSelectorReactor" 92 | FEED_EXPORT_ENCODING = "utf-8" 93 | -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/spiders/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/baseball_scraper/baseball_scraper/spiders/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/spiders/__pycache__/players.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/baseball_scraper/baseball_scraper/spiders/__pycache__/players.cpython-311.pyc -------------------------------------------------------------------------------- /baseball_scraper/baseball_scraper/spiders/players.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | 4 | 5 | class PlayersSpider(scrapy.Spider): 6 | name = "players" 7 | allowed_domains = ["baseball-reference.com"] 8 | start_urls = [f"https://www.baseball-reference.com/players/{letter}/" for letter in "ab"] 9 | 10 | 11 | def parse(self, response): 12 | # Extract player profile links 13 | player_links = response.css("div#div_players_ > p a::attr(href)").getall() 14 | for link in player_links: 15 | full_link = response.urljoin(link) 16 | yield scrapy.Request(url=full_link, callback=self.parse_player) 17 | 18 | @staticmethod 19 | def parse_player(response): 20 | # Extract player information 21 | player_name = response.css("h1 span::text").get() 22 | position = response.css("p:contains('Position') strong::text").re_first(r"Position: (.+)") 23 | bats = response.css("p:contains('Bats')::text").re_first(r"Bats: (.+?) 
•") 24 | throws = response.css("p:contains('Throws')::text").re_first(r"Throws: (.+)") 25 | height = response.css("p:contains('lb') span:nth-child(1)::text").get() 26 | weight = response.css("p:contains('lb') span:nth-child(2)::text").get() 27 | birth_date = response.css("span#necro-birth a::text").getall() 28 | birth_location = response.css("p:contains('Born:') span:last-child::text").get() 29 | draft_info = response.css("p:contains('Drafted by')::text").get() 30 | high_school = response.css("p:contains('High School:') a::text").get() 31 | college = response.css("p:contains('Schools:') a::text").getall() 32 | debut = response.css("p:contains('Debut:') a::text").get() 33 | last_game = response.css("p:contains('Last Game:') a::text").get() 34 | rookie_status = response.css("p:contains('Rookie Status:')::text").re_first(r"Rookie Status:\s+(.+)") 35 | agent = response.css("p:contains('Agents')::text").get() 36 | nickname = response.css("p:contains('Nicknames:') a::text").get() 37 | twitter = response.css("p:contains('Twitter:') a::attr(href)").get() 38 | 39 | # Extract player's image URL 40 | image_url = response.css("div.media-item img::attr(src)").get() 41 | 42 | # Store the extracted data in a dictionary 43 | player_data = { 44 | "name": player_name, 45 | "position": position, 46 | "bats": bats, 47 | "throws": throws, 48 | "height": height, 49 | "weight": weight, 50 | "birth_date": " ".join(birth_date), 51 | "birth_location": birth_location, 52 | "draft_info": draft_info, 53 | "high_school": high_school, 54 | "college": college, 55 | "debut": debut, 56 | "last_game": last_game, 57 | "rookie_status": rookie_status, 58 | "agent": agent, 59 | "nickname": nickname, 60 | "twitter": twitter, 61 | "image_url": response.urljoin(image_url), # Ensure the URL is absolute 62 | } 63 | 64 | # Write the data to a JSON file 65 | with open("players.json", "a") as f: 66 | f.write(json.dumps(player_data) + "\n") 67 | 68 | yield player_data 69 | 70 | -------------------------------------------------------------------------------- /baseball_scraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baseball_scraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baseball_scraper 12 | -------------------------------------------------------------------------------- /click_checkbox_whirlpool.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium.webdriver import Chrome 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | import selenium.webdriver.support.expected_conditions as EC 6 | 7 | driver = Chrome() 8 | driver.get('https://register.whirlpool.com/en-us/registration') 9 | 10 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'privacy_policy'))) 11 | # ------------------------------------------------------------------------------------------------------------ 12 | driver.execute_script("document.getElementById('privacy_policy').click();") 13 | # ------------------------------------------------------------------------------------------------------------ 14 | time.sleep(2) 15 | var1 = driver.find_element(By.ID, "privacy_policy").is_selected() 16 | print(var1) 17 | 18 | 19 | """ 20 | output: 21 
| True 22 | 23 | You can also cross-check by simply running the javascript query document.getElementById('privacy_policy').click() on the Console of the page and you'll see that it indeed performs the click on the desired checkbox. 24 | 25 | reference: 26 | https://stackoverflow.com/questions/76404208/not-able-to-click-on-checkbox-using-selenium-in-python-error-selenium-common-ex 27 | """ -------------------------------------------------------------------------------- /coroners_nsw_gov_download_multiple_pdf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 23, 2023 5 | """ 6 | 7 | import time 8 | from selenium.webdriver import Chrome 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support import expected_conditions as EC 12 | 13 | driver = Chrome() 14 | driver.get('https://www.coroners.nsw.gov.au/coronial-findings-search.html?searchtext=death%20in%20custody&searchYear=All') 15 | 16 | search_results = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.search-result-content'))) 17 | documents = search_results.find_elements(By.CSS_SELECTOR, 'ul.paginationList>li') 18 | print(f"Total documents on the page: {len(documents)}") 19 | 20 | doc_url = [doc.find_element(By.CSS_SELECTOR, 'h4.search-font> a').get_attribute('href') for doc in documents] 21 | 22 | for i in doc_url: 23 | print(f"Downloading: {i.split('/')[-1]}") 24 | driver.get(i) 25 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.download-button'))) 26 | driver.execute_script("document.querySelector('div.download-button>a').click()") 27 | time.sleep(2) 28 | 29 | time.sleep(5) 30 | 31 | """ 32 | output: 33 | Total documents on the page: 10 34 | Downloading: Inquest_into_the_death_of_Brandon_Clark._pdf.pdf 35 | Downloading: Inquest_into_the_death_of_CJ.pdf 36 | Downloading: Inquest_into_the_death_of_Azhar_Abdul.pdf 37 | Downloading: Inquest_into_the_death_of_John_Cribb.pdf 38 | Downloading: Inquest_into_the_death_of_Anthony_Gilbert.pdf 39 | Downloading: Findings_-_Inquest_into_the_death_of_Gordon_Copeland_-_18_April_2023.pdf 40 | Downloading: Inquest_into_the_death_of_John_Dodd.pdf 41 | Downloading: Final_-_Findings_Inquest_into_the_death_of_Stanley_Russell_April_2023_14_April.pdf 42 | Downloading: Inquest_into_the_death_of_KT.pdf 43 | Downloading: Inquest_into_the_death_of_LT.pdf 44 | """ 45 | """ 46 | Approach followed: 47 | Wait for the desired web element container holding all the data to get loaded to find/locate it. 48 | 49 | search_results = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.search-result-content'))) 50 | Find all the individual target data points within the container. 51 | 52 | documents = search_results.find_elements(By.CSS_SELECTOR, 'ul.paginationList>li') 53 | Next, iterate over the web element containing the list of data points to parse, extract the URL, and put them all in the list. 54 | 55 | doc_url = [doc.find_element(By.CSS_SELECTOR, 'h4.search-font> a').get_attribute('href') for doc in documents] 56 | Finally, loop over the list of URLs, 57 | 58 | get to the page, 59 | wait for the target web element (Download) to be available on the page, 60 | and execute the query to perform a click to download the file. 
61 | This is how you can download all the documents on a single page, and the same can be replicated on multiple pages. 62 | 63 | reference: 64 | https://stackoverflow.com/questions/76536814/scrape-website-for-pdfs-within-a-number-of-links 65 | """ -------------------------------------------------------------------------------- /destinytracker_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : destinytracker 3 | Author : Ajeet 4 | Date : August 2, 2023 5 | """ 6 | from time import sleep 7 | from selenium.common import TimeoutException 8 | from selenium.webdriver import Chrome 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support import expected_conditions as EC 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | 13 | 14 | driver = Chrome() 15 | url = "https://destinytracker.com/destiny-2/profile/psn/4611686018440125811/matches?mode=crucible" 16 | driver.get(url) 17 | wait = WebDriverWait(driver, 30) 18 | 19 | crucible_content = wait.until( 20 | EC.visibility_of_element_located((By.CSS_SELECTOR, "div.trn-gamereport-list.trn-gamereport-list--compact"))) 21 | game_reports = crucible_content.find_elements(By.CLASS_NAME, "trn-gamereport-list__group") 22 | 23 | for game_report in game_reports: 24 | group_entry = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "trn-gamereport-list__group-entries"))) 25 | win_match = group_entry.find_elements(By.CSS_SELECTOR, "div.trn-match-row--outcome-win") 26 | driver.execute_script("arguments[0].scrollIntoView();", win_match[0]) 27 | lose_match = group_entry.find_elements(By.CSS_SELECTOR, "div.trn-match-row--outcome-loss") 28 | for win_element in win_match: 29 | 30 | try: 31 | win_left = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.trn-match-row__section--left"))) 32 | driver.execute_script("arguments[0].click();", win_left) 33 | print("reached here") 34 | date_time = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='info']"))) 35 | date_time = date_time.text.split(",")[0] 36 | match_roster = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "match-rosters"))) 37 | team_alpha = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.match-roster.alpha"))) 38 | team_bravo = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.match-roster.bravo"))) 39 | bravo_match_roster_entries = team_bravo.find_element(By.CLASS_NAME, "roster-entries") 40 | alpha_match_roster_entries = team_alpha.find_element(By.CLASS_NAME, "roster-entries") 41 | name = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "router-link-active"))) 42 | entry_bravo = bravo_match_roster_entries.find_elements(By.CLASS_NAME, "entry") 43 | entry_alpha = alpha_match_roster_entries.find_elements(By.CLASS_NAME, "entry") 44 | print(date_time) 45 | wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.close"))).click() 46 | sleep(1) 47 | except TimeoutException: 48 | pass 49 | 50 | """ 51 | reference: 52 | https://stackoverflow.com/questions/76814861/i-keep-getting-a-timeout-error-for-an-element-even-though-it-prints-out-the-text 53 | """ 54 | -------------------------------------------------------------------------------- /eex_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : EEX.COM 3 | Author : Ajeet 4 | Date : August 4, 2023 5 | """ 6 | 7 | import time 8 | import pandas as pd 9 | from selenium.webdriver import Chrome, ChromeOptions, Keys 10 | from 
selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | 14 | pd.set_option('display.max_rows', 500) 15 | pd.set_option('display.max_columns', 500) 16 | pd.set_option('display.width', 1000) 17 | 18 | 19 | def data_by_date(day: int, month: int, year: int) -> pd.DataFrame: 20 | """ 21 | Scrape data for a specific date from the EEX German Power Futures. 22 | 23 | Args: 24 | day (int): The day of the month (1 to 31). 25 | month (int): The month (1 to 12). 26 | year (int): The year. 27 | 28 | Returns: 29 | pandas.DataFrame: A DataFrame containing the scraped data for the specified date. 30 | The DataFrame includes details about futures contracts. 31 | """ 32 | 33 | options = ChromeOptions() 34 | options.add_argument("--start-maximized") 35 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 36 | 37 | driver = Chrome(options=options) 38 | wait = WebDriverWait(driver, 20) 39 | 40 | driver.get(url='https://www.eex.com/en/market-data/power/futures') 41 | wait.until( 42 | EC.element_to_be_clickable((By.CSS_SELECTOR, "input[value='I accept all cookies.']"))).click() 43 | time.sleep(3) 44 | wait.until(EC.element_to_be_clickable( 45 | (By.CSS_SELECTOR, "button.btn.dropdown-toggle.form.input-select div.filter-option-inner"))).click() 46 | wait.until(EC.element_to_be_clickable((By.XPATH, 47 | "//div[@class='dropdown-menu show']//li/a[@class='dropdown-item']/span[contains(., 'EEX German Power Futures')]"))).click() 48 | 49 | # Find and set the date input field to the desired date 50 | calender_container = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#symbolheader_pfpde'))) 51 | date_input = calender_container.find_element(By.CSS_SELECTOR, 'input.mv-input-box') 52 | date_input.clear() 53 | date_input.send_keys(f'{year}-{month}-{day}') 54 | date_input.send_keys(Keys.ENTER) 55 | 56 | table_data = wait.until( 57 | EC.visibility_of_element_located((By.CSS_SELECTOR, "div#baseloadwidget_pfpde > table.mv-quote"))) 58 | # Find the table containing the data and extract column names 59 | columns = [i.text for i in table_data.find_elements(By.CSS_SELECTOR, 'tr.mv-quote-header-row>th')] 60 | 61 | all_data = [] 62 | 63 | # Loop through each row of the table and extract data for each cell 64 | for row in WebDriverWait(table_data, 10).until( 65 | EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'tbody>tr'))): 66 | data = [i.text for i in row.find_elements(By.CSS_SELECTOR, 'td[style^="text-align:"]')] 67 | all_data.append(data) 68 | 69 | # Create a Pandas DataFrame with the scraped data and return it 70 | df = pd.DataFrame(data=all_data, columns=columns[:-1]) 71 | return df 72 | 73 | 74 | print(data_by_date(day=2, month=8, year=2023)) 75 | 76 | """ 77 | output: 78 | 79 | Future Last Price Last Volume Settlement Price Volume Exchange Volume Trade Registration Open Interest 80 | 0 Cal-24 134.00 8,784 134.52 2,714,256 2,643,984 72,459 81 | 1 Cal-25 124.75 8,760 124.67 604,440 289,080 17,377 82 | 2 Cal-26 106.00 8,760 105.59 87,600 350,400 4,072 83 | 3 Cal-27 90.25 8,760 90.23 17,520 113,880 787 84 | 4 Cal-28 - - 84.18 - - 111 85 | 5 Cal-29 - - 82.65 - - 13 86 | 6 Cal-30 - - 83.11 - - 7 87 | 7 Cal-31 - - 82.93 - - 2 88 | 8 Cal-32 - - 82.78 - - 2 89 | 9 Cal-33 - - 81.93 - - 0 90 | 91 | reference: 92 | https://stackoverflow.com/questions/76826884/getting-data-for-different-dates-when-scraping-data-with-selenium 93 | """ 94 | 
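# Usage sketch (not part of the referenced answer): data_by_date() launches a fresh
# browser per call, so gathering several dates this way is slow but simple. The dates
# below are placeholders; pd is the pandas import from the top of this file.
#
# frames = [data_by_date(day=d, month=8, year=2023) for d in (1, 2, 3)]
# combined = pd.concat(frames, ignore_index=True)
# combined.to_csv('eex_german_power_futures.csv', index=False)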
-------------------------------------------------------------------------------- /egle_state_mi.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : EGLE State, Remediation Information Data Exchange 3 | Author : Ajeet 4 | Date : 09/06/2023 5 | """ 6 | import os 7 | from selenium.webdriver import Chrome, ChromeOptions 8 | from selenium.webdriver.support.ui import WebDriverWait 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support import expected_conditions as EC 11 | import time 12 | 13 | def download_file(path): 14 | 15 | options = ChromeOptions() 16 | options.add_argument('--start-maximized') 17 | prefs = {'download.default_directory': path} 18 | options.add_experimental_option('prefs', prefs) 19 | 20 | driver = Chrome(options=options) 21 | driver.get('https://www.egle.state.mi.us/RIDE/inventory-of-facilities/facilities') 22 | wait = WebDriverWait(driver, 100) 23 | 24 | wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'mat-table'))) 25 | driver.execute_script('''document.querySelector("button[aria-label='Export Facilities Table results to CSV']").click();''') 26 | 27 | while not os.path.exists(f"{path}\\Facilities.csv"): 28 | time.sleep(1) # poll once per second instead of spinning in a busy loop 29 | 30 | print("The file is downloaded!") 31 | 32 | 33 | 34 | if __name__ == '__main__': 35 | PATH = 'D:\\test' 36 | download_file(PATH) 37 | 38 | """ 39 | output: 40 | The file is downloaded! 41 | """ 42 | 43 | """ 44 | steps to follow: 45 | 46 | 1. The site takes some time to load the desired element (here, the Export button), and clicking this button downloads 47 | the table's data. Therefore we wait to make sure that the table data has already loaded. 48 | 49 | 2. Now that the data is loaded, simply click on the Export button to download the data (here Facilities.csv). 50 | 51 | 3. It takes some time for the file to get downloaded at the given path, so we need to wait until the file download is 52 | completed. To do this, we keep checking (once per second) whether the file is present at the given path, and the loop 53 | exits once the file is there. 54 | 55 | reference: 56 | https://stackoverflow.com/questions/76436438/selenium-cant-find-element-by-xpath 57 | """ 58 | -------------------------------------------------------------------------------- /element_not_visible_to_click.py: -------------------------------------------------------------------------------- 1 | 2 | # The element is not visible to click. Use ActionChains or a JavascriptExecutor to click it.
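# By JavascriptExecutor (a sketch, reusing the same illustrative element ID as below):
#
# element = driver.find_element(By.ID, "RESULT_RadioButton-7_1")
# driver.execute_script("arguments[0].click();", element)
#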
3 | # By ActionChains: 4 | # from selenium.webdriver.common.action_chains import ActionChains 5 | # element = driver.find_element(By.ID, "RESULT_RadioButton-7_1") 6 | # actions = ActionChains(driver) 7 | # actions.move_to_element(element).click().perform() -------------------------------------------------------------------------------- /fb_login_with_popup_alert_disabled.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver import ChromeOptions, Keys 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.webdriver.support.wait import WebDriverWait 7 | 8 | options = ChromeOptions() 9 | 10 | # start maximized and disable the automation infobar 11 | options.add_argument("--start-maximized") 12 | options.add_experimental_option("useAutomationExtension", False) 13 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 14 | options.add_experimental_option( 15 | "prefs", 16 | { 17 | "credentials_enable_service": False, 18 | "profile.password_manager_enabled": False, 19 | "profile.default_content_setting_values.notifications": 2 20 | # 2 blocks/disables notifications, 1 allows them 21 | }, 22 | ) 23 | 24 | driver = webdriver.Chrome(options=options) 25 | 26 | url = "https://www.facebook.com/" 27 | driver.get(url) 28 | WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "globalContainer"))) 29 | container = driver.find_element(By.ID, "globalContainer") 30 | 31 | # fill in the email account and password 32 | email = container.find_element(By.ID, 'email') 33 | password = container.find_element(By.ID, 'pass') 34 | email.send_keys("xxxxxxxxx") 35 | password.send_keys("xxxxxxxxxxxx") 36 | password.send_keys(Keys.ENTER) 37 | time.sleep(10) 38 | -------------------------------------------------------------------------------- /find_chrome_version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Google Chrome Version 3 | Author : Ajeet 4 | Date : July 12, 2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | 10 | options = webdriver.ChromeOptions() 11 | options.add_argument('start-maximized') 12 | driver = webdriver.Chrome(options=options) 13 | 14 | driver.get('chrome://settings/help') 15 | 16 | time.sleep(2) 17 | update_check = driver.execute_script("return document.querySelector('settings-ui').shadowRoot.querySelector('settings-main').shadowRoot.querySelector('settings-about-page').shadowRoot.querySelectorAll('settings-section')[0].querySelector('div.secondary').getInnerHTML();") 18 | print(update_check) 19 | 20 | """ 21 | The page is deeply nested with shadow-root elements, which makes it impossible to locate elements embedded inside a 22 | shadow root using the usual locator strategies such as XPath, CSS Selector, ID, etc., so the version text is read by chaining shadowRoot queries in JavaScript instead.
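For comparison, Selenium 4 exposes a shadow_root property on WebElement, so the same
chain can also be written in Python (a sketch; only CSS selectors work inside a shadow root):

    from selenium.webdriver.common.by import By
    root = driver.find_element(By.CSS_SELECTOR, 'settings-ui').shadow_root
    root = root.find_element(By.CSS_SELECTOR, 'settings-main').shadow_root
    root = root.find_element(By.CSS_SELECTOR, 'settings-about-page').shadow_root
    print(root.find_elements(By.CSS_SELECTOR, 'settings-section')[0]
          .find_element(By.CSS_SELECTOR, 'div.secondary').text)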
23 | 24 | references: 25 | https://stackoverflow.com/questions/76667428/cant-access-the-latest-version-xpath-of-google-chrome-through-selenium-and-chro 26 | """ -------------------------------------------------------------------------------- /find_masa_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : July 17, 2023 5 | """ 6 | 7 | from selenium.webdriver import Chrome 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.wait import WebDriverWait 10 | import selenium.webdriver.support.expected_conditions as EC 11 | 12 | # Create a Chrome driver instance 13 | driver = Chrome() 14 | 15 | url = 'https://findmasa.com/view/map#b1cc410b' 16 | driver.get(url) 17 | 18 | # Wait for the li element with id 'b1cc410b' to be present on the page 19 | li_element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'li#b1cc410b'))) 20 | 21 | data_lat = li_element.get_attribute('data-lat') 22 | data_lng = li_element.get_attribute('data-lng') 23 | artist_name = li_element.find_element(By.TAG_NAME, 'a').text 24 | address = li_element.find_elements(By.TAG_NAME, 'p')[1].text 25 | city = li_element.find_elements(By.TAG_NAME, 'p')[2].text 26 | 27 | # Print the extracted data 28 | print(data_lat) 29 | print(data_lng) 30 | print(artist_name) 31 | print(address) 32 | print(city) 33 | 34 | """ 35 | The information you're looking for is loaded late and rendered via JavaScript. Since the requests library doesn't 36 | execute JavaScript, it never receives that content, so your if-statement evaluates to False. So, it goes to the 37 | else-statement and you get NO DATA. 38 | 39 | output: 40 | 34.102025 41 | -118.32694167 42 | Tristan Eaton 43 | 6301 Hollywood Boulevard 44 | Los Angeles, California 45 | 46 | reference: 47 | https://stackoverflow.com/questions/76700158/how-to-use-python-to-get-information-from-the-map-navigation-container-of-a-webs 48 | """ -------------------------------------------------------------------------------- /flicker_scroll.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : flicker 3 | Author : Ajeet 4 | Date : Sep.
22, 2023 5 | """ 6 | 7 | import time 8 | from bs4 import BeautifulSoup 9 | from selenium import webdriver 10 | 11 | driver = webdriver.Chrome() 12 | url = "https://www.flickr.com/groups/allfreepictures/pool/page3041" 13 | 14 | driver.get(url=url) 15 | 16 | # scroll to the bottom of the page to load all available images 17 | flag = True 18 | last_height = driver.execute_script("return document.body.scrollHeight") 19 | while flag: 20 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 21 | time.sleep(1) 22 | new_height = driver.execute_script("return document.body.scrollHeight") 23 | 24 | if new_height == last_height: 25 | flag = False 26 | else: 27 | last_height = new_height 28 | 29 | time.sleep(2) 30 | 31 | soup = BeautifulSoup(driver.page_source, 'html.parser') 32 | image_urls = [link['href'] for link in soup.findAll("a", {"class": "overlay"})] 33 | print(len(image_urls)) 34 | print(image_urls) 35 | 36 | """ 37 | reference: 38 | https://stackoverflow.com/questions/77155340/selenium-scroll-flickr-page-to-get-all-the-images 39 | """ -------------------------------------------------------------------------------- /google_com_finance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to search for a stock ticker (e.g., NVDA) on Google Finance using Selenium. 3 | 4 | Author: Ajeet 5 | Date: 17/05/2025 6 | """ 7 | 8 | import time 9 | 10 | from selenium.webdriver import Chrome, ChromeOptions 11 | from selenium.webdriver.common.by import By 12 | from selenium.webdriver.common.keys import Keys 13 | from selenium.webdriver.support.ui import WebDriverWait 14 | from selenium.webdriver.support import expected_conditions as EC 15 | 16 | 17 | def setup_driver(): 18 | """ 19 | Set up and return a Selenium Chrome WebDriver with custom options. 20 | """ 21 | options = ChromeOptions() 22 | options.add_argument("--start-maximized") # Launch browser maximized 23 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 24 | options.add_experimental_option("useAutomationExtension", False) 25 | return Chrome(options=options) 26 | 27 | 28 | def search_stock(driver, stock_name: str): 29 | """ 30 | Automates the stock search on Google Finance. 31 | 32 | Args: 33 | driver: Selenium WebDriver instance. 34 | stock_name (str): Name of the stock to search for (e.g., "nvda stock"). 35 | """ 36 | wait = WebDriverWait(driver, 10) 37 | driver.get("https://www.google.com/finance/") 38 | 39 | # Wait for search input fields to load and select the second input field 40 | input_elements = wait.until(EC.presence_of_all_elements_located( 41 | (By.CSS_SELECTOR, 'input[aria-label="Search for stocks, ETFs & more"]') 42 | )) 43 | 44 | if len(input_elements) < 2: 45 | raise Exception("Expected input field not found.") 46 | 47 | input_element = input_elements[1] 48 | input_element.send_keys(stock_name) 49 | time.sleep(1) 50 | input_element.send_keys(Keys.ENTER) 51 | time.sleep(2) 52 | 53 | 54 | def main(): 55 | """ 56 | Main function to execute the script. 
57 | """ 58 | driver = setup_driver() 59 | search_stock(driver, "nvda stock") 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | 65 | """ 66 | reference: 67 | https://stackoverflow.com/a/79626737/11179336 68 | """ -------------------------------------------------------------------------------- /google_finance.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/google_finance.gif -------------------------------------------------------------------------------- /hong_kong_observatory_climate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from selenium.webdriver import Chrome 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | import selenium.webdriver.support.expected_conditions as EC 6 | 7 | driver = Chrome() 8 | 9 | url = 'https://www.hko.gov.hk/en/cis/awsDailyElement.htm?stn=WB8&ele=PREV_DIR&y=2023' 10 | driver.get(url) 11 | 12 | table = WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'table[id="t1"] > tr'))) 13 | columns = [i.text for i in table[0].find_elements(By.TAG_NAME, 'th')] 14 | table_dict = {col: [] for col in columns} 15 | 16 | for row in table[1:]: 17 | for data in zip(columns, [i.text for i in row.find_elements(By.TAG_NAME, 'td')]): 18 | table_dict[data[0]].append(data[1]) 19 | 20 | driver.close() 21 | 22 | df = pd.DataFrame(table_dict) 23 | # # saving the dataframe to a csv 24 | df.to_csv('data.csv', index=False) 25 | 26 | """ 27 | Few things to note: 28 | 29 | 1. After hitting the URL, we need to wait for the table to get visibly located on the page and thus we find all the table rows tr which includes the first tr as the table's columns. 30 | 2. the variable columns is a list that holds the table column names (first row data table[0]) 31 | 3. Next, we initiate a variable table_dict and assign the columns as the key of this dict with their values as an empty list. 32 | 4. after that, we iterate over the remaining rows of the table, couple the list of columns with the row data and iterate over it to assign the data to its column. 33 | 5. and finally, create a dataframe with table_dict and save it into a CSV file data.csv. 
34 | """ 35 | -------------------------------------------------------------------------------- /imdb_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : IMDB title review 3 | Author : Ajeet 4 | Date : July 20, 2023 5 | """ 6 | 7 | from selenium import webdriver 8 | from selenium.webdriver.firefox.options import Options 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support import expected_conditions as EC 12 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 13 | 14 | url = "https://www.imdb.com/title/tt0368226/reviews" 15 | 16 | options = Options() 17 | # options.add_argument('-headless') 18 | driver = webdriver.Firefox(options=options) 19 | 20 | # Load the IMDb page 21 | driver.get(url) 22 | 23 | while True: 24 | try: 25 | button = WebDriverWait(driver, 10).until( 26 | EC.visibility_of_element_located((By.ID, 'load-more-trigger'))) 27 | 28 | button.click() 29 | except (NoSuchElementException, TimeoutException): 30 | break 31 | 32 | """ 33 | The while-loop will keep looking for the Load More button and keep clicking on it until there are no more Load More 34 | and finally it'll get timed out and break out of the loop. 35 | 36 | reference: 37 | https://stackoverflow.com/questions/76726412/movetargetoutofboundsexception-selenium-python-firefox 38 | """ -------------------------------------------------------------------------------- /jodidb_org.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | 3 | from selenium import webdriver 4 | from selenium.webdriver.chrome.service import Service 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from selenium.webdriver.support.wait import WebDriverWait 8 | from selenium.webdriver.common.keys import Keys 9 | from selenium.webdriver.chrome.options import Options 10 | from webdriver_manager.chrome import ChromeDriverManager 11 | from selenium.webdriver.common.action_chains import ActionChains 12 | 13 | 14 | options = Options() 15 | options.add_argument("--window-size=1920,1080") 16 | options.add_argument( 17 | "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36") 18 | 19 | # Suppress logging to reduce unnecessary output 20 | options.add_argument("--log-level=3") 21 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 22 | options.add_experimental_option("useAutomationExtension", False) 23 | 24 | # Set up the WebDriver with configured options 25 | service = Service(ChromeDriverManager().install()) 26 | browser = webdriver.Chrome(service=service, options=options) 27 | browser.maximize_window() 28 | wait = WebDriverWait(browser, 10) 29 | 30 | 31 | browser.get(r'http://www.jodidb.org/TableViewer/tableView.aspx?ReportId=93905') 32 | 33 | import time 34 | time.sleep(2) 35 | 36 | columns = [] 37 | 38 | 39 | scroll_thumb = browser.find_element(By.CSS_SELECTOR, "#hScrollTD") # Replace with your thumb element 40 | 41 | action = ActionChains(browser) 42 | 43 | for _ in range(1, 50): 44 | for i in range(0, 15): 45 | col_names = browser.find_element(By.CSS_SELECTOR, f'table[id="DataTable"]>thead>tr>#a{i}').text 46 | columns.append(col_names) 47 | 48 | sleep(2) 49 | action.click_and_hold(scroll_thumb).move_by_offset(20, 0).release().perform() 50 | sleep(1) 51 | 
52 | print(columns) -------------------------------------------------------------------------------- /join_team_meeting.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/join_team_meeting.gif -------------------------------------------------------------------------------- /knowde_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Knowde 3 | Author : Ajeet 4 | Date : June 15, 2023 5 | """ 6 | # import libraries 7 | import os 8 | import logging 9 | import pandas as pd 10 | from bs4 import BeautifulSoup 11 | from typing import List, Dict, Optional 12 | from selenium.webdriver import Chrome, ChromeOptions 13 | from selenium.webdriver.support.ui import WebDriverWait 14 | from selenium.webdriver.common.by import By 15 | from selenium.webdriver.support import expected_conditions as EC 16 | 17 | # logging configurations 18 | logging.basicConfig(filename='knoede_log.log', 19 | filemode='a', 20 | format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', 21 | datefmt='%H:%M:%S', 22 | level=logging.INFO) 23 | 24 | 25 | class KnoedeData: 26 | def __init__(self): 27 | 28 | options = ChromeOptions() 29 | options.add_argument('--start-maximized') 30 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 31 | self.driver = Chrome(options=options) 32 | self.wait = WebDriverWait(self.driver, 10) 33 | self.website = "https://www.knowde.com/b/markets-personal-care/products/" 34 | self.data = [] 35 | self.driver.get(self.website) 36 | # accept all cookies 37 | self.wait.until(EC.visibility_of_element_located((By.ID, 'onetrust-accept-btn-handler'))).click() 38 | 39 | @staticmethod 40 | def find_siblings(container: BeautifulSoup.string, category: str) -> str: 41 | """this method returns the text value across the given category if found/available. 42 | Args: 43 | container: a 'BeautifulSoup.string' containing all the textual details of an individual product. 44 | category: the name of the category across which we are trying to get the details. 
45 | 46 | Returns: 47 | category_text: the details/value across the given text category 48 | """ 49 | label = container.find("span", string=f"{category}: ") 50 | if label: 51 | category_text = label.next_sibling.text 52 | else: 53 | category_text = None 54 | 55 | return category_text 56 | 57 | def data_processing(self, page_source: str) -> None: 58 | """this method processes/parses the individual product information 59 | 60 | Args: 61 | page_source: this is the page source of the selenium webdriver 62 | 63 | Returns: None 64 | """ 65 | soup = BeautifulSoup(page_source, 'html.parser') 66 | product_containers = soup.select('div[data-cy="product-card"]') 67 | 68 | for container in product_containers: 69 | text_container = container.select_one('div[direction="column"]') 70 | 71 | brand = text_container.select_one('p[data-cy="product-brand-name"]').text 72 | item = text_container.select_one('p[data-cy="product-name"]').text 73 | 74 | inci_name = self.find_siblings(text_container, 'INCI Name') 75 | ingredient_origin = self.find_siblings(text_container, 'Ingredient Origin') 76 | function = self.find_siblings(text_container, 'Function') 77 | benefit_claims = self.find_siblings(text_container, 'Benefit Claims') 78 | labeling_claims = self.find_siblings(text_container, 'Labeling Claims') 79 | compliance = self.find_siblings(text_container, 'Certifications & Compliance') 80 | hlb_value = self.find_siblings(text_container, 'HLB Value') 81 | end_uses = self.find_siblings(text_container, 'End Uses') 82 | cas_no = self.find_siblings(text_container, 'CAS Number') 83 | chemical_name = self.find_siblings(text_container, 'Chemical Name') 84 | synonyms = self.find_siblings(text_container, 'Synonyms') 85 | chemical_family = self.find_siblings(text_container, 'Chemical Family') 86 | features = self.find_siblings(text_container, 'Features') 87 | grade = self.find_siblings(text_container, 'Grade') 88 | 89 | description = text_container.select('p')[-1].text 90 | logging.info(f'Saving: {brand}') 91 | 92 | self.data.append({ 93 | 'brand': brand, 94 | 'item': item, 95 | 'inci_name': inci_name, 96 | 'ingredient_origin': ingredient_origin, 97 | 'function': function, 98 | 'benefit_claims': benefit_claims, 99 | 'labeling_claims': labeling_claims, 100 | 'compliance': compliance, 101 | 'hlb_value': hlb_value, 102 | 'end_uses': end_uses, 103 | 'cas_no': cas_no, 104 | 'chemical_name': chemical_name, 105 | 'synonyms': synonyms, 106 | 'chemical_family': chemical_family, 107 | 'features': features, 108 | 'grade': grade, 109 | 'description': description 110 | }) 111 | 112 | def single_page(self, page_num: int) -> List[Dict]: 113 | """ this method scrapes the data from the given page number of the website. 114 | 115 | Args: 116 | page_num: the page number to extract the data from 117 | 118 | Returns: self.data(list of dict of all products on a given page) 119 | """ 120 | 121 | self.driver.get(f"{self.website}{page_num}") 122 | logging.info(f"-------page number {page_num} -------") 123 | products = self.driver.find_elements(By.CSS_SELECTOR, 'div[data-cy="product-card"]') 124 | 125 | count = 0 126 | for product in products: 127 | if count % 4 == 0: # the page renders 4 products per row; clicking the first arrow in a row expands all 4 128 | product.find_element(By.CSS_SELECTOR, 'svg[data-testid="icon-icomoon--keyboard_arrow_down"]').click() 129 | 130 | count += 1 131 | 132 | self.data_processing(self.driver.page_source) 133 | 134 | return self.data 135 | 136 | def multiple_page(self, start: int, end: int) -> List[Dict]: 137 | """ the method iterates over the range of given page numbers.
138 | 139 | Args: 140 | start: the page number to start with 141 | end: the page number to end with 142 | 143 | Returns: self.data (the accumulated list of product dicts across the scraped pages) 144 | """ 145 | 146 | for page in range(start, end+1): 147 | self.single_page(page) 148 | 149 | return self.data 150 | 151 | @staticmethod 152 | def save_data(data: List[Dict], path: Optional[str] = os.getcwd()) -> None: 153 | """ save the data to a CSV file at the given path. 154 | 155 | Args: 156 | data: the data to save. 157 | path: the path to save the file (the default is os.getcwd(), which saves the file in the current directory) 158 | 159 | Returns: None 160 | """ 161 | 162 | df = pd.DataFrame(data) 163 | file_location = f'{path}/cosmetics_data.csv' 164 | df.to_csv(file_location, index=False) 165 | logging.info(f"------------data is saved at {file_location}------------") 166 | 167 | 168 | if __name__ == '__main__': 169 | 170 | obj = KnoedeData() 171 | # print(obj.single_page(1)) 172 | # obj.save_data(obj.single_page(1)) 173 | # print(obj.multiple_page(2, 3)) 174 | # print(obj.save_data(obj.multiple_page(1, 3))) 175 | 176 | 177 | """ 178 | Few things to note: 179 | 180 | 1. The very first time we open the website, we need to click on the button Accept All Cookies. 181 | 2. Next, we can find all the 36 products on the page using the selector div[data-cy="product-card"] 182 | 3. You might notice that in a full window size, the page loads 4 products in a row and as we click on the down-arrow of 1st product to see more details, it also opens for the remaining 3 products on that row. So we just need to click once per row. 183 | 4. To implement the logic of clicking only once per row, we used a count variable as you can see in the code above. 184 | 185 | reference: 186 | https://stackoverflow.com/questions/76468614/selenium-python-timeoutexception 187 | """ 188 | -------------------------------------------------------------------------------- /lebara_nl.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | from time import sleep 4 | 5 | import pyautogui 6 | import undetected_chromedriver as uc 7 | from selenium.webdriver.common.by import By 8 | 9 | # Use undetected-chromedriver 10 | driver = uc.Chrome() 11 | driver.maximize_window() 12 | driver.get("https://www.lebara.nl/nl/prepaid/data-bundle-valuesim.html") 13 | 14 | # Simulate human-like behavior 15 | time.sleep(random.uniform(1, 3)) 16 | pyautogui.moveTo(random.randint(100, 500), random.randint(100, 500), duration=0.5) 17 | 18 | # Click the cookie decline button 19 | cookie_decline_button = driver.find_element(By.ID, "onetrust-reject-all-handler") 20 | cookie_decline_button.click() 21 | 22 | # Simulate human-like behavior 23 | time.sleep(random.uniform(1, 3)) 24 | pyautogui.moveTo(random.randint(100, 500), random.randint(100, 500), duration=0.5) 25 | 26 | # Click the bestelSimkaartButton (located via its absolute XPath) 27 | bestel_simkaart_button = driver.find_element(By.XPATH, "/html/body/div[2]/div[1]/div[1]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[2]/div[1]/div/div[1]/div[2]/div[3]/button") 28 | bestel_simkaart_button.click() 29 | time.sleep(2) 30 | 31 | # Wait for the new page to load 32 | time.sleep(5) 33 | -------------------------------------------------------------------------------- /lidl_GB.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Lidl GB 3 | Author : Ajeet 4 | Date : 07/06/2023 5 | """ 6 | import time 7 | from selenium import webdriver 8 | from selenium.webdriver import
ChromeOptions, Keys 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support import expected_conditions as EC 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | 13 | options = ChromeOptions() 14 | options.add_argument("--start-maximized") 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | 17 | driver = webdriver.Chrome(options=options) 18 | wait = WebDriverWait(driver, 10) 19 | url = "https://www.lidl.co.uk/about-us/store-finder-opening-hours#" 20 | driver.get(url) 21 | 22 | # wait for element to get located to click the "ACCEPT" cookies button 23 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button.cookie-alert-extended-button"))).click() 24 | # wait for element to get located to click the "STORE SEARCH" button 25 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button.nuc-m-button.nuc-a-button"))).click() 26 | # wait for element to get located to Enter post code or city 27 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[placeholder="Enter post code or city"]'))).send_keys('London') 28 | 29 | time.sleep(10) 30 | 31 | 32 | """ 33 | Few things to note: 34 | 35 | 1. First, as we hit the URL, a cookie pops up that we can accept to continue. So we wait for this pop-up and click on the ACCEPT button. 36 | 2. Next, we wait for the STORE SEARCH button to get located and then click. 37 | 3. It loads a side search box where we can enter the city or the postcode to search. So we wait for this to get loaded/located in order to enter the query. we can use send_keys() method to enter/input either the city name or the postcode. 38 | 39 | for example, as we enter the city name/postcode (London), a dropdown list appears with available stores in that region, you can choose accordingly and proceed further. 
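A sketch of picking the first suggestion from that dropdown (the selector here is illustrative,
not taken from the site, so adjust it to the actual markup):

    first_store = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'ul > li:first-child')))  # hypothetical selector
    first_store.click()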
40 | reference: 41 | https://stackoverflow.com/questions/76392044/how-can-i-locate-and-enter-text-in-the-search-box-on-lidls-website-using-seleni 42 | """ -------------------------------------------------------------------------------- /load_cookies_to_accept_all.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver import ChromeOptions, Chrome 4 | from selenium.webdriver.common.by import By 5 | 6 | options = ChromeOptions() 7 | 8 | # to start maximized screen 9 | options.add_argument("--start-maximized") 10 | # to remove 'Chrome is being controlled by automated software' 11 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 12 | 13 | options.add_experimental_option("useAutomationExtension", False) 14 | 15 | driver = Chrome(options=options) 16 | 17 | driver.get("https://langsungkerja.id/registration/") 18 | 19 | driver.add_cookie({"name": "cookieyes-consent", "value": "consent:yes,action:yes"}) 20 | driver.refresh() 21 | 22 | driver.find_element(By.CSS_SELECTOR, 'button.tutor-btn.tutor-btn-primary').click() 23 | time.sleep(1) 24 | 25 | -------------------------------------------------------------------------------- /ma_shienkikan.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.chrome.service import Service 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | from selenium.webdriver.common.keys import Keys 7 | from selenium.webdriver.chrome.options import Options 8 | from webdriver_manager.chrome import ChromeDriverManager 9 | 10 | # Initialize an empty list to store scraped data 11 | data = [] 12 | 13 | # Function to configure Chrome options for stealth scraping 14 | def get_stealth_chrome_options(): 15 | options = Options() 16 | # Set headless mode (optional, uncomment to avoid loading browser UI) 17 | # options.add_argument("--headless=new") 18 | options.add_argument("--disable-blink-features=AutomationControlled") 19 | options.add_argument("--disable-extensions") 20 | options.add_argument("--disable-infobars") 21 | options.add_argument("--disable-popup-blocking") 22 | options.add_argument("--no-sandbox") 23 | options.add_argument("--disable-dev-shm-usage") 24 | options.add_argument("--remote-debugging-port=9222") 25 | options.add_argument("--window-size=1920,1080") 26 | options.add_argument( 27 | "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36") 28 | 29 | # Suppress logging to reduce unnecessary output 30 | options.add_argument("--log-level=3") 31 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 32 | options.add_experimental_option("useAutomationExtension", False) 33 | 34 | # Ensure better resource handling for long scripts 35 | options.add_argument("--disable-gpu") 36 | options.add_argument("--enable-logging") 37 | return options 38 | 39 | 40 | # Set up the WebDriver with configured options 41 | service = Service(ChromeDriverManager().install()) 42 | options = get_stealth_chrome_options() 43 | browser = webdriver.Chrome(service=service, options=options) 44 | wait = WebDriverWait(browser, 10) 45 | 46 | try: 47 | # Navigate to the target website 48 | browser.get("https://library.usask.ca/#gsc.tab=0") 49 | print("[INFO] 
Successfully loaded the website.") 50 | 51 | # Locate the search field and input query 52 | q_field = browser.find_element(By.ID, "primoQueryTemp") 53 | q_field.send_keys("artificial intelligence") 54 | q_field.send_keys(Keys.ENTER) 55 | print("[INFO] Search query submitted.") 56 | 57 | # Wait for the search results container to be visible 58 | results_container = wait.until( 59 | EC.presence_of_element_located((By.ID, "searchResultsContainer")) 60 | ) 61 | print("[INFO] Search results container loaded.") 62 | 63 | # Scrape the first 10 search results 64 | for i in range(1, 11): 65 | try: 66 | # Locate each search result container by its XPath 67 | container = results_container.find_element(By.XPATH, f"//*[@id='searchResultsContainer']/div[{i}]") 68 | 69 | # Extract relevant information for each result 70 | item_data = { 71 | "item_number": container.find_element(By.CLASS_NAME, "list-item-count").text, 72 | "media_type": container.find_element(By.CSS_SELECTOR, "div.media-content-type.align-self-start").text, 73 | "image": container.find_element(By.CLASS_NAME, "media-thumbnail") 74 | .find_element(By.CSS_SELECTOR, "div:nth-child(1) > img") 75 | .get_attribute("src"), 76 | "item_title": container.find_element(By.CLASS_NAME, "item-title").text, 77 | } 78 | data.append(item_data) 79 | # print(f"[INFO] Scraped item {i}: {item_data}") 80 | except Exception as e: 81 | print(f"[WARNING] Error scraping item {i}: {e}") 82 | 83 | # Print the collected data 84 | print("[INFO] Scraping completed successfully.") 85 | print(data) 86 | 87 | except Exception as e: 88 | print(f"[ERROR] An error occurred: {e}") 89 | 90 | finally: 91 | # Ensure the browser is properly closed 92 | browser.quit() 93 | print("[INFO] Browser closed.") 94 | -------------------------------------------------------------------------------- /mercedes-benz.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : mercedes-benz Scrapper 3 | Author : Ajeet 4 | Date : 06/06/2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | from selenium.webdriver import ChromeOptions, Keys 10 | from selenium.webdriver.common.by import By 11 | 12 | options = ChromeOptions() 13 | 14 | options.add_argument("--start-maximized") 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | driver = webdriver.Chrome(options=options) 17 | 18 | url = "https://www.mercedes-benz.co.in/passengercars/buy/new-car/search-results.html/?emhsort=price-asc&emhvehicleAssortment=vehicles&emhstockType=IN_STOCK" 19 | driver.get(url) 20 | time.sleep(5) 21 | # click on the "Agree to all" button to proceed 22 | shadow_element_1 = driver.find_element(By.CSS_SELECTOR, "cmm-cookie-banner.hydrated").shadow_root 23 | shadow_element_1.find_element(By.CSS_SELECTOR, 'div.button-group').find_element(By.XPATH, 'button[text()="Agree to all"]').click() 24 | 25 | # enter the pin code to proceed further 26 | shadow_element_2 = driver.find_element(By.CSS_SELECTOR, 'dh-io-emh-region-picker[class="webcomponent webcomponent-nested"]').shadow_root 27 | region_picker = shadow_element_2.find_element(By.CSS_SELECTOR, 'input#postCodeInput') 28 | region_picker.send_keys(110001) 29 | region_picker.send_keys(Keys.ENTER) 30 | 31 | # parse the search results 32 | shadow_element_3 = driver.find_element(By.CSS_SELECTOR, 'emh-search-result[data-component-name="emh-search-result"]').shadow_root 33 | search_container = shadow_element_3.find_element(By.CSS_SELECTOR, 
'div.dcp-cars-srp__results.dcp-cars-srp-results.srp-grid-layout__results') 34 | results = search_container.find_elements(By.CSS_SELECTOR, 'div.dcp-cars-srp-results__tile') 35 | 36 | for result in results: 37 | print(result.find_element(By.CSS_SELECTOR, 'h2.wb-vehicle-tile__title').text) 38 | 39 | time.sleep(5) 40 | 41 | """ 42 | output: 43 | 44 | GLB200 45 | GLB200 46 | GLB200 47 | C220d MY23 48 | C220d MY23 49 | C220d MY23 50 | C220d MY23 51 | C220d MY23 52 | C220d MY23 53 | C220d MY23 54 | C220d MY23 55 | C220d MY23 56 | """ 57 | 58 | """ 59 | Things to notice: 60 | 61 | 1. First of all, we need to find and click on the Agree to all button which lies under a shadow-root. 62 | 2. Next, we need to find and input the pin code (which again lies under a shadow-root) to proceed further. 63 | 3. Finally, we get to the search page where we can see 12 different search results. We find the element (this element also lies under a shadow-root) which contains the search results data. 64 | 65 | The variable results holds all the 12 results on the page and we can iterate over it to extract/parse all the pieces of information. 66 | 67 | reference: 68 | https://stackoverflow.com/questions/76408371/why-does-xpath-half-work-in-this-web-page 69 | """ -------------------------------------------------------------------------------- /mydealz_de.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 12, 2023 5 | """ 6 | 7 | import time 8 | from selenium.webdriver import Chrome 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.wait import WebDriverWait 11 | import selenium.webdriver.support.expected_conditions as EC 12 | 13 | 14 | driver = Chrome() 15 | driver.get("https://www.mydealz.de/register") 16 | wait = WebDriverWait(driver, 10) 17 | 18 | # accept all cookies 19 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'button[data-t="acceptAllBtn"]'))).click() 20 | 21 | checkboxes = driver.find_elements(By.CSS_SELECTOR, 'span.tGrid-cell.tGrid-cell--shrink') 22 | # select the 2nd checkbox 23 | checkboxes[1].click() 24 | # Similarly, you can also select the 1st checkbox using checkboxes[0].click() 25 | 26 | time.sleep(2) 27 | 28 | """ 29 | reference: 30 | https://stackoverflow.com/questions/76453368/how-to-click-a-checkbox-by-driver-find-elementid-in-python 31 | """ 32 | -------------------------------------------------------------------------------- /nested_shadow_root.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Wallet Polygon Technology 3 | Author : Ajeet 4 | Date : July 12, 2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | from selenium.webdriver import ChromeOptions 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support import expected_conditions as EC 12 | from selenium.webdriver.support.wait import WebDriverWait 13 | 14 | options = ChromeOptions() 15 | options.add_argument("--start-maximized") 16 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 17 | 18 | driver = webdriver.Chrome(options=options) 19 | wait = WebDriverWait(driver, 10) 20 | url = "https://wallet.polygon.technology/?redirectOnConnect=zkEVM_bridge" 21 | 22 | driver.get(url) 23 | # click on the "Connect to a Wallet" button 24 | wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.navbar__apps-section__auth__login"))).click() 25 | time.sleep(2) 26 | 27 | # 
---------------------------------------------------------------------------------------------------------------------- 28 | driver.execute_script( 29 | """document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-connect-wallet-view').shadowRoot.querySelector('w3m-desktop-wallet-selection').shadowRoot.querySelector('w3m-modal-footer').querySelectorAll('w3m-wallet-button')[0].shadowRoot.querySelector('button').click();""") 30 | 31 | # ---------------------------------------------------------------------------------------------------------------------- 32 | time.sleep(5) 33 | 34 | """ 35 | - Various elements on this website are embedded inside the shadow-root. 36 | - for example, your target/desired button is embedded in a 5-layer nested shadow-root. 37 | - After clicking on the Connect to a Wallet, we wait for 1-2 seconds just to make sure that the overlay window is 38 | visibly present, although it appears very quickly. 39 | - The used javascript query to locate and click on the desired button: 40 | 41 | document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-connect-wallet-view').shadowRoot.querySelector('w3m-desktop-wallet-selection').shadowRoot.querySelector('w3m-modal-footer').querySelectorAll('w3m-wallet-button')[0].shadowRoot.querySelector('button').click(); 42 | 43 | will click on the very first wallet, if you like to click on the 2nd or 3rd wallet option, just simply replace 44 | the querySelectorAll('w3m-wallet-button')[0] with querySelectorAll('w3m-wallet-button')[1] or 45 | querySelectorAll('w3m-wallet-button')[2] respectively in the above-mentioned javascript query. 46 | 47 | reference: 48 | https://stackoverflow.com/questions/76658230/selenium-how-to-get-element-in-shadow-root-of-html-page-code 49 | """ -------------------------------------------------------------------------------- /nse_india.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Ajeet 3 | Created: 1/11/2025 4 | Description: This script automates the process of navigating to the NSE India announcements page, 5 | selecting the SME tab, switching to the "1W" (1 Week) filter, and downloading the 6 | announcements in a CSV file format. 
announcements in a CSV file format.
7 | Project: Automation 8 | """ 9 | import time 10 | import undetected_chromedriver as uc 11 | from selenium.webdriver.common.by import By 12 | from selenium.webdriver.support.ui import WebDriverWait 13 | from selenium.webdriver.support import expected_conditions as EC 14 | 15 | # Initialize the Selenium WebDriver (using undetected_chromedriver to bypass bot detection) 16 | driver = uc.Chrome() 17 | 18 | # Define an explicit wait for elements 19 | wait = WebDriverWait(driver, 10) 20 | 21 | try: 22 | # Step 1: Open the NSE India announcements page 23 | print("Opening NSE announcements page...") 24 | driver.get("https://www.nseindia.com/companies-listing/corporate-filings-announcements") 25 | 26 | # Step 2: Select the SME tab 27 | sme_tab = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#containTabNav > li:nth-child(2) > a"))) 28 | sme_tab.click() 29 | time.sleep(2) # Pause to allow the page content to load 30 | 31 | # Step 3: Select the "1W" (1 Week) tab 32 | one_week_tab = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[id="Announcements_sme"]>div:nth-child(2)>div>div.block-detail-dates-box>div>div>ul>li:nth-child(2)'))) 33 | one_week_tab.click() 34 | time.sleep(2) # Pause to allow the filtered content to load 35 | 36 | # Step 4: Wait for the table containing announcements to load 37 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#CFanncsmeTable>tbody>tr>td>a'))) 38 | 39 | # Step 5: Download the CSV file 40 | print("Downloading CSV file...") 41 | download = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#CFanncsme-download'))) 42 | download.click() 43 | 44 | # Pause to allow the download process to complete 45 | time.sleep(3) 46 | print(f"File downloaded!") 47 | 48 | except Exception as e: 49 | # Handle any unexpected errors and print a user-friendly message 50 | print(f"An unexpected error occurred: {e}") 51 | 52 | """ 53 | output: 54 | Opening NSE announcements page... 55 | Downloading CSV file... 56 | File downloaded! 
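Note: the fixed time.sleep(3) above is a simplification; a sturdier pattern is to poll
the download directory until the file appears (a sketch; download_dir and the file name
are assumptions, not taken from the site):

    import os, time
    csv_path = os.path.join(download_dir, 'CF-anncsme.csv')  # hypothetical file name
    for _ in range(30):
        if os.path.exists(csv_path):
            break
        time.sleep(1)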
57 | 58 | stackoverflow link: https://stackoverflow.com/a/79349087/11179336 59 | """ -------------------------------------------------------------------------------- /nse_india_2.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.webdriver.chrome.options import Options 7 | from selenium.webdriver.common.action_chains import ActionChains 8 | from selenium.common.exceptions import TimeoutException 9 | 10 | options = Options() 11 | options.add_argument("--start-maximized") 12 | options.add_argument( 13 | "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36" 14 | ) 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | options.add_experimental_option("useAutomationExtension", False) 17 | 18 | # Initialize WebDriver 19 | with webdriver.Chrome(options=options) as driver: 20 | wait = WebDriverWait(driver, 10) 21 | action = ActionChains(driver) 22 | 23 | try: 24 | print("Opening NSE announcements page...") 25 | driver.get("https://www.nseindia.com/companies-listing/corporate-filings-announcements") 26 | 27 | # Select SME tab 28 | sme_tab = wait.until( 29 | EC.presence_of_element_located((By.CSS_SELECTOR, "#containTabNav > li:nth-child(2) > a")) 30 | ) 31 | action.move_to_element(sme_tab).click().perform() 32 | time.sleep(2) 33 | 34 | # Select '1W' tab 35 | one_week_tab = wait.until( 36 | EC.presence_of_element_located((By.CSS_SELECTOR, 37 | 'div[id="Announcements_sme"]>div:nth-child(2)>div>div.block-detail-dates-box>div>div>ul>li:nth-child(2)')) 38 | ) 39 | action.move_to_element(one_week_tab).click().perform() 40 | time.sleep(2) 41 | 42 | # Wait for the table to load 43 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#CFanncsmeTable>tbody>tr>td>a'))) 44 | 45 | # Download the CSV 46 | print("Downloading CSV file...") 47 | download = wait.until( 48 | EC.presence_of_element_located((By.CSS_SELECTOR, '#CFanncsme-download')) 49 | ) 50 | action.move_to_element(download).click().perform() 51 | 52 | # Wait for the download to complete 53 | time.sleep(5) 54 | print(f"File downloaded!") 55 | 56 | except TimeoutException as e: 57 | print(f"Timeout occurred: {e}") 58 | print("Please try running the script again.") 59 | 60 | except Exception as e: 61 | print(f"An unexpected error occurred: {e}") 62 | -------------------------------------------------------------------------------- /oddsportal_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : July 26, 2023 5 | """ 6 | 7 | 8 | import time 9 | import threading 10 | import pandas as pd 11 | from math import nan 12 | from datetime import datetime, timedelta 13 | from multiprocessing.pool import ThreadPool 14 | from bs4 import BeautifulSoup as bs 15 | import undetected_chromedriver as uc 16 | from selenium import webdriver 17 | from selenium.webdriver.support import expected_conditions as EC 18 | from selenium.webdriver.support.wait import WebDriverWait 19 | from selenium.webdriver.common.by import By 20 | pd.set_option('display.max_rows', 500) 21 | pd.set_option('display.max_columns', 500) 22 | pd.set_option('display.width', 1000) 23 | 24 | class Driver: 25 | def __init__(self): 26 | 
options = webdriver.ChromeOptions() 27 | self.driver = uc.Chrome(options=options) 28 | 29 | def __del__(self): 30 | self.driver.quit() # clean up driver when we are cleaned up 31 | 32 | 33 | threadLocal = threading.local() 34 | 35 | 36 | def create_driver(): 37 | the_driver = getattr(threadLocal, 'the_driver', None) 38 | if the_driver is None: 39 | the_driver = Driver() 40 | setattr(threadLocal, 'the_driver', the_driver) 41 | return the_driver.driver 42 | 43 | 44 | class GameData: 45 | def __init__(self): 46 | self.date = [] 47 | self.time = [] 48 | self.game = [] 49 | self.score = [] 50 | self.home_odds = [] 51 | self.draw_odds = [] 52 | self.away_odds = [] 53 | self.country = [] 54 | self.league = [] 55 | 56 | 57 | def generate_matches(pgSoup, defaultVal=None): 58 | evtSel = { 59 | 'time': 'div>div>div[class="flex basis-[10%]"]', 60 | 'game': 'a div:has(>a[title])', 61 | 'score': 'a[title]~div:not(.hidden)', 62 | 'home_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(2)', 63 | 'draw_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(3)', 64 | 'away_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(4)' 65 | } 66 | 67 | events, current_group = [], {} 68 | pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]') 69 | if pgDate: pgDate = pgDate.get_text().split(',', 1)[-1].strip() 70 | for evt in pgSoup.select('div[set]>div:last-child'): 71 | if evt.parent.select(f':scope>div:first-child+div+div'): 72 | cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [ 73 | evt.parent.select_one(s) for s in 74 | [':scope>div:first-child+div>div:first-child', 75 | ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)', 76 | ':scope>div:first-child>a:nth-of-type(3):last-of-type']]] 77 | current_group = dict(zip(['date', 'country', 'league'], cgVals)) 78 | if pgDate: current_group['date'] = pgDate 79 | 80 | evtRow = {'date': current_group.get('date', defaultVal)} 81 | 82 | for k, v in evtSel.items(): 83 | v = evt.select_one(v).get_text(' ') if evt.select_one(v) else defaultVal 84 | evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v 85 | # evtTeams = evt.select('a div>a[title]') 86 | evtTeams = evt.select('div[class^="relative w-full flex-col"]>a') 87 | evtRow['game'] = ' – '.join(a['title'] for a in evtTeams) 88 | evtRow['country'] = current_group.get('country', defaultVal) 89 | evtRow['league'] = current_group.get('league', defaultVal) 90 | 91 | events.append(evtRow) 92 | return events 93 | 94 | 95 | def parse_data(url, return_urls=False): 96 | print(f'Parsing URL: {url}\n') 97 | browser = create_driver() 98 | browser.get(url) 99 | WebDriverWait(browser, 20).until(EC.presence_of_all_elements_located( 100 | (By.CSS_SELECTOR, "div[set]>div:last-child"))) 101 | # ########## For page to scroll to the end ########### 102 | scroll_pause_time = 2 103 | 104 | # Get scroll height 105 | last_height = browser.execute_script("return document.body.scrollHeight") 106 | 107 | while True: 108 | # Scroll down to bottom 109 | browser.execute_script("window.scrollTo(0, document.body.scrollHeight);") 110 | 111 | # Wait to load page 112 | time.sleep(scroll_pause_time) 113 | 114 | # Calculate new scroll height and compare with last scroll height 115 | new_height = browser.execute_script("return document.body.scrollHeight") 116 | if new_height == last_height: 117 | break 118 | last_height = new_height 119 | # ########## For page to scroll to the end ########### 120 | time.sleep(5) 121 | soup = 
bs(browser.page_source, "lxml") 122 | 123 | game_data = GameData() 124 | game_keys = [a for a, av in game_data.__dict__.items() if isinstance(av, list)] 125 | for row in generate_matches(soup, defaultVal=nan): 126 | for k in game_keys: getattr(game_data, k).append(row.get(k, nan)) 127 | if return_urls: 128 | ac_sel = 'div:has(>a.active-item-calendar)' # a_cont selector 129 | a_sel = f'{ac_sel}>a[href]:not([href^="#"]):not(.active-item-calendar)' 130 | a_tags = soup.select(a_sel) 131 | 132 | if a_tags: 133 | urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags] 134 | print(f'urls after initial creation: {urls}') 135 | 136 | # Extract the date from the first URL 137 | last_date_str = urls[0].split('/')[-2] 138 | print(f'last date str: {last_date_str}') 139 | last_date = datetime.strptime(last_date_str, '%Y%m%d') 140 | 141 | # Generate the additional URLs 142 | for i in range(1, 4): 143 | new_date = last_date - timedelta(days=i) 144 | new_date_str = new_date.strftime('%Y%m%d') 145 | new_url = f'https://www.oddsportal.com/matches/football/{new_date_str}/' 146 | urls.append(new_url) 147 | print(f'urls after generating additional URL #{i}: {urls}') 148 | else: 149 | urls = [] 150 | 151 | print(f'final urls: {urls}') 152 | 153 | if urls and urls[-1].startswith('https://www.oddsportal.com/matches/football/'): 154 | # Extract the date from the first URL (the list ends with the generated ones) 155 | last_date_str = urls[0].split('/')[-2] 156 | print(last_date_str) 157 | else: 158 | print('No valid URLs found') 159 | return game_data, urls 160 | return game_data 161 | 162 | 163 | if __name__ == '__main__': 164 | games = None 165 | pool = ThreadPool(5) 166 | # Get today's data and the Urls for the other days: 167 | url_today = 'https://www.oddsportal.com/matches/soccer' 168 | game_data_today, urls = pool.apply(parse_data, args=(url_today, True)) 169 | game_data_results = pool.imap(parse_data, urls) 170 | 171 | # ########################### BUILD DATAFRAME ############################ 172 | game_data_dfList, added_todayGame = [], False 173 | for game_data in game_data_results: 174 | try: 175 | game_data_dfList.append(pd.DataFrame(game_data.__dict__)) 176 | if not added_todayGame: 177 | game_data_dfList += [pd.DataFrame(game_data_today.__dict__)] 178 | added_todayGame = True 179 | except Exception as e: 180 | game_n = len(game_data_dfList) + 1 181 | print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}') 182 | try: 183 | games = pd.concat(game_data_dfList, ignore_index=True) 184 | except Exception as e: 185 | print('Error concatenating DataFrames:', repr(e)) 186 | # ######################################################################### 187 | print('!?NO GAMES?!' if games is None else games) 188 | # ensure all the drivers are "quitted": 189 | del threadLocal # a little extra insurance 190 | import gc 191 | 192 | gc.collect() 193 | 194 | games.to_csv('oddsportal_games.csv', index=False) # without a file name, to_csv() only returns a string -------------------------------------------------------------------------------- /pump_fun.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Ajeet 3 | Created: 1/6/2025 4 | Description: 5 | This script automates the process of interacting with the 'https://pump.fun' website. 6 | It performs the following actions: 7 | 1. Bypasses automation detection using custom Chrome options. 8 | 2. Clicks the "I'm ready to pump" button on a pop-up. 9 | 3. Handles the "Reject All" cookies dialog. 10 | 4. Retrieves and processes specific elements matching a CSS selector pattern. 11 | 5.
Prints the total count and content of the matching elements. 12 | 13 | Project: Automation 14 | """ 15 | from selenium import webdriver 16 | from selenium.webdriver.common.by import By 17 | from selenium.webdriver.chrome.options import Options 18 | from selenium.webdriver.support.wait import WebDriverWait 19 | from selenium.webdriver.support import expected_conditions as EC 20 | 21 | # Set up Chrome options to bypass automation detection 22 | options = Options() 23 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 24 | options.add_experimental_option("useAutomationExtension", False) 25 | 26 | # Initialize the WebDriver with the specified options 27 | driver = webdriver.Chrome(options=options) 28 | driver.maximize_window() 29 | 30 | # Navigate to the target URL 31 | driver.get('https://pump.fun') 32 | # Initialize an explicit wait with a timeout of 10 seconds 33 | wait = WebDriverWait(driver, 10) 34 | 35 | try: 36 | # Step 1: Wait for the "I'm ready to pump" button to appear and click it 37 | ready_button = wait.until(EC.presence_of_element_located( 38 | (By.CSS_SELECTOR, '#radix-\:r0\: > div.mt-3 > button') 39 | )) 40 | ready_button.click() 41 | 42 | # Step 2: Wait for the "Reject All" cookies button to appear and click it 43 | cookies_button = wait.until(EC.presence_of_element_located( 44 | (By.CSS_SELECTOR, "#btn-reject-all") 45 | )) 46 | cookies_button.click() 47 | 48 | # Step 3: Wait for the visibility of all div elements with IDs ending in "pump" and retrieve them 49 | div_elements = wait.until(EC.visibility_of_all_elements_located( 50 | (By.CSS_SELECTOR, 'div.grid.grid-col-1>div[id$="pump"]') 51 | )) 52 | 53 | # Print the total count of matching div elements 54 | print(f"Total result count: {len(div_elements)}") 55 | 56 | # Step 4: Iterate through the retrieved div elements and print their content 57 | for idx, div in enumerate(div_elements, start=1): 58 | print(f"------------ {idx} result ------------") 59 | print(div.text) # Visible text content of the div 60 | 61 | except Exception as e: 62 | # Handle unexpected errors and print the error message 63 | print(f"An unexpected error occurred: {e}") 64 | 65 | finally: 66 | # Ensure the driver is closed to release resources 67 | driver.quit() 68 | 69 | """ 70 | output: 71 | Total result count: 46 72 | ------------ 1 result ------------ 73 | created by 74 | DoSVMa 75 | 1h ago 76 | market cap: $18.2K 77 | replies: 40 78 | OFFICIAL TRUMP FAMILY (OTF): OFFICIAL TRUMP FAMILY 79 | ------------ 2 result ------------ 80 | created by 81 | A4FACP 82 | 1d ago 83 | market cap: $13.0K 84 | replies: 20 85 | NexantAI (NEXANT): Nexant – the AI agent with a mission to build the most groundbreaking blockchain ever. Powered by limitless knowledge, cutting-edge innovation, and a sprinkle of chaotic genius, Nexant is here to redefine decentralization. 🌌💡 86 | ------------ 3 result ------------ 87 | created by 88 | DcpAyb 89 | 9h ago 90 | market cap: $67.0K 91 | [ 92 | ] 93 | replies: 317 94 | Apeshit Alvin (Alvin): doing apeshit things with Alvin. 95 | ------------ 4 result ------------ 96 | created by 97 | 129uzz 98 | 5m ago 99 | market cap: $7.0K 100 | replies: 6 101 | Donald Pump (DNLDPMP): Never sell this coin just buy a dollar worth and we will get rich! 102 | ------------ 5 result ------------ 103 | created by 104 | GwTgqv 105 | 3h ago 106 | market cap: $15.2K 107 | replies: 287 108 | Official Melania Fart Coin (OMFC): Melania Trumps Official Fart Coin is here to set the world ablaze. 
Her looks are breath taking and her farts are astronomical and magical. Come get a wiff of the absolute magnificent smell of the first lady's farts 109 | ------------ 6 result ------------ 110 | created by 111 | CVvJnD 112 | 35m ago 113 | market cap: $7.3K 114 | replies: 11 115 | U Should Do Time (USDT): 116 | ------------ 7 result ------------ 117 | created by 118 | zJfoJE 119 | 18h ago 120 | market cap: $7.3K 121 | replies: 10 122 | Trump and Elon (Trump&Elon): Tump&Elon official 123 | ------------ 8 result ------------ 124 | created by 125 | Eysnef 126 | 6h ago 127 | market cap: $29.2K 128 | [ 129 | ] 130 | replies: 128 131 | Barron Meme (BARRON): Barron Meme 132 | ------------ 9 result ------------ 133 | created by 134 | BJN3k9 135 | 31m ago 136 | market cap: $9.1K 137 | replies: 23 138 | EarCoin (EarCoin): *** NO UTILITY, JUST FOR THOSE WHICH LOVE TRUMP 139 | ------------ 10 result ------------ 140 | created by 141 | FbXpLa 142 | 32m ago 143 | market cap: $4.8K 144 | [ 145 | ] 146 | replies: 20 147 | Official X Rat Wif Hat (RATWIFHAT): 148 | ------------ 11 result ------------ 149 | created by 150 | Fg7fFK 151 | 38m ago 152 | market cap: $7.2K 153 | replies: 38 154 | DONA TRUMPINA (FIRSTLADY): DONA TRUMPINA 155 | ------------ 12 result ------------ 156 | created by 157 | 5BCkFt 158 | 41m ago 159 | market cap: $6.8K 160 | replies: 19 161 | Official Vise President (JD Vance): 162 | ------------ 13 result ------------ 163 | created by 164 | GtdNeB 165 | 5h ago 166 | market cap: $321.5K 167 | [ 168 | ] 169 | replies: 199 170 | Be Best (BB): 171 | ------------ 14 result ------------ 172 | created by 173 | 9W1L5Y 174 | 13d ago 175 | market cap: $7.5K 176 | replies: 30 177 | TRUMP BUTTHOLE FART NUTS (TBHFN): 🇺🇲 178 | ------------ 15 result ------------ 179 | created by 180 | 7AkdDR 181 | 17m ago 182 | market cap: $7.3K 183 | replies: 15 184 | I HAVE A COIN (IHAVEACOIN): 185 | ------------ 16 result ------------ 186 | created by 187 | Fq1R9G 188 | 56m ago 189 | market cap: $7.5K 190 | replies: 17 191 | Captain America Melania (CAM): Captain America Melania 192 | ------------ 17 result ------------ 193 | created by 194 | DsXDQs 195 | 38m ago 196 | market cap: $7.5K 197 | replies: 8 198 | Javier Milei Official (Milei): The Official Milei Argentina is live! 199 | ------------ 18 result ------------ 200 | created by 201 | 5h7Ymr 202 | 6h ago 203 | market cap: $15.4K 204 | replies: 52 205 | Ivanka (IVANKA): 206 | ------------ 19 result ------------ 207 | created by 208 | 8bTDDQ 209 | 27m ago 210 | market cap: $7.3K 211 | replies: 7 212 | LeBarron James (LEBARRON): 213 | ------------ 20 result ------------ 214 | created by 215 | 7bueRj 216 | 9h ago 217 | market cap: $31.0K 218 | replies: 53 219 | Weber AI (WEBAI): Launch your memecoin website instantly. An AI powered tool leveraging prompt-to-CSS technology and fine-tuned for memecoin themes. 220 | ------------ 21 result ------------ 221 | created by 222 | 4Gbd3n 223 | 10m ago 224 | market cap: $7.0K 225 | replies: 16 226 | This is the sky (Tits): 227 | ------------ 22 result ------------ 228 | created by 229 | 4FxSjy 230 | 1h ago 231 | market cap: $8.0K 232 | replies: 11 233 | $TTDS Defends Freedom of Speech (TTDS ): Trump Saves TikTok. Defends Freedom of Speech MEME $TTDS President Trump turned the tide, saved TikTok, and defended the American people's freedom of speech! 
234 | ------------ 23 result ------------ 235 | created by 236 | 4QW2bE 237 | 17m ago 238 | market cap: $7.3K 239 | replies: 10 240 | GOD Sent Us Trump (GSUT): GOD sent us trump to fill our bags. In a world where memes drive the culture, God Sent Us Trump is here to make its mark! This token celebrates the spirit 241 | of unshakable leadership, bold visions, and the meme-worthy moments that brought us together. Whether you see Trump as a divine blessing, a larger-than-life icon, or the ultimate meme muse, this token captures it all in a fun, lighthearted way! 242 | ------------ 24 result ------------ 243 | created by 244 | 8bVKXK 245 | 3h ago 246 | market cap: $7.4K 247 | replies: 13 248 | OFFICIAL CREED (CREED): The official Creed Coin! Can take me higher! 249 | ------------ 25 result ------------ 250 | created by 251 | 972BGm 252 | 2h ago 253 | market cap: $17.2K 254 | [ 255 | ] 256 | replies: 51 257 | Donald Trump Family 6900 (DTF6900): An index tracking the performance of the Trump family memes. 258 | ------------ 26 result ------------ 259 | Video 260 | created by 261 | 7rmUwY 262 | 4h ago 263 | market cap: $7.0K 264 | replies: 12 265 | Bank of Ai Agents (BankofAi): Welcome to Bank of Ai, where we revolutionize the way token holders receive their funds globally. Our cutting-edge technology enables seamless transfe 266 | rs to token holders around the world, ensuring speed and security. Bank of Ai agents are designed to automate the execution of agreements without the need for intermediaries or tim 267 | e delays. Ai Bank agents nodes execute the contract. Your personal Ai Bank agents pay out in USDC around the clock. Each token is one Ai Bank agent. 100 tokens minimum hold for ai 268 | agent pay. Bank of a i agents are designed to automate the execution of agreements without the need for intermediaries or time delays. Ai Bank agents nodes execute the contract. Yo 269 | ur personal A i Bank agents pay out in USDC around the clock. Each token is one Ai Bank agent. 100 tokens minimum hold for ai agent pay. Be sure to check out our YouTube channel Bank of Ai and Join us! Regards, Agent Ai 270 | ------------ 27 result ------------ 271 | created by 272 | 5zA23t 273 | 1h ago 274 | market cap: $8.4K 275 | replies: 127 276 | Elon Trenches Fighter (ETF): AFTER DONALD ELON WILL RULE THE TRENCHES 277 | ------------ 28 result ------------ 278 | created by 279 | EKRVV5 280 | 5m ago 281 | market cap: $7.0K 282 | replies: 4 283 | U Should Dump Crypto (USDC): 284 | ------------ 29 result ------------ 285 | created by 286 | Bc7azw 287 | 37m ago 288 | market cap: $7.2K 289 | replies: 10 290 | Inauguration of (IOS): It’s not only Trumps inauguration. It’s also solana’s. 291 | ------------ 30 result ------------ 292 | created by 293 | HXfnVz 294 | 14m ago 295 | market cap: $18.5K 296 | replies: 31 297 | Tied Up & Tickled Til 50 Mil (Tickled): I haven't found a job yet so I'm doing weird kink shit for money. 
Tied Up & Tickled until $5 million $500,000 - wedgies $1 million - Visqueen / Slime $3 million marketcap - Pie $4 million - Antiqued $5 million marketcap - Head Shaving, burn dev wallet 298 | ------------ 31 result ------------ 299 | created by 300 | 57Kn8x 301 | 9m ago 302 | market cap: $6.8K 303 | replies: 10 304 | AmericaFirst.Fun (FIRST): AmericaFirst.Fun 305 | ------------ 32 result ------------ 306 | created by 307 | 82tLwz 308 | 45m ago 309 | market cap: $11.4K 310 | replies: 59 311 | TRUMPIUS MAX (TRUМРIUS): Make Pump Great Again 312 | ------------ 33 result ------------ 313 | created by 314 | EX8PZk 315 | 32m ago 316 | market cap: $6.8K 317 | replies: 9 318 | Rare White Bamby (BAMBY): Rare White Bamby 319 | ------------ 34 result ------------ 320 | Video 321 | created by 322 | DrGA8L 323 | 2 months ago 324 | market cap: $6.9K 325 | replies: 9 326 | Purgatory ($INNER): From dust, we are created, and dust we will return. We are the disobedient. 327 | ------------ 35 result ------------ 328 | created by 329 | 7BgTDJ 330 | 3h ago 331 | market cap: $104.8K 332 | [ 333 | ] 334 | replies: 74 335 | Trump Family Index (TFI500): Trump Family Index 336 | ------------ 36 result ------------ 337 | created by 338 | 8pijxj 339 | 6h ago 340 | market cap: $12.6K 341 | [ 342 | ] 343 | replies: 74 344 | Ninicoin (Nini): Tao Lin cure my poverty 345 | ------------ 37 result ------------ 346 | created by 347 | J1AoDU 348 | 9h ago 349 | market cap: $29.1K 350 | [ 351 | ] 352 | replies: 304 353 | President Troog (Troog): It’s huge, folks. President Troog learns it’s connected to the stars—big cosmic secrets, the best secrets. Look for artifacts. Trust me, it’s going to be tremendous! 354 | ------------ 38 result ------------ 355 | created by 356 | 4dMoLv 357 | 3h ago 358 | market cap: $7.7K 359 | replies: 22 360 | Baby Elon Musk (BabyElon): We’re going to win so much. You’re going to get tired of winning. You’re going to say, ‘Please, Mr. Baby Elon, I have a headache. Please, I don’t want to win so much. This is getting terrible.’ And I’m going to say. "We’re going to keep winning, winning, winning!" 361 | ------------ 39 result ------------ 362 | created by 363 | FweAHC 364 | 5d ago 365 | market cap: $8.3K 366 | replies: 82 367 | SLIPPAGE (SLIPPAGE): 368 | ------------ 40 result ------------ 369 | created by 370 | 6KeWLf 371 | 14m ago 372 | market cap: $13.5K 373 | replies: 11 374 | FredTrump (Fredytrump): The strength of a nation lies in its unity, and its foundation is laid by the wisdom and sacrifices of its fathers. 375 | ------------ 41 result ------------ 376 | created by 377 | 9KCsAb 378 | 1h ago 379 | market cap: $17.1K 380 | [ 381 | ] 382 | replies: 261 383 | First Lady I'd Like to Fuck (FLILF): 384 | ------------ 42 result ------------ 385 | created by 386 | ApJZ7m 387 | 22h ago 388 | market cap: $31.9K 389 | replies: 56 390 | OFFICIAL BARRON (BARRON): Join the Barron Community. This is History in the Making! 
391 | ------------ 43 result ------------ 392 | created by 393 | GPxr6P 394 | 44m ago 395 | market cap: $6.7K 396 | [ 397 | ] 398 | replies: 45 399 | #RapeMarkAndrews (RAPE): #RapeMarkAndrews 400 | ------------ 44 result ------------ 401 | created by 402 | 5GJKpf 403 | 5h ago 404 | market cap: $10.7K 405 | replies: 103 406 | Make it all back 100x coin (100x): Make it all back 100x with this coin 407 | ------------ 45 result ------------ 408 | created by 409 | 6xsqu6 410 | 4h ago 411 | market cap: $162.7K 412 | [ 413 | ] 414 | replies: 87 415 | First Nude Lady (Milfania): Official First Nude Lady Milfania meme. 416 | ------------ 46 result ------------ 417 | created by 418 | 6ZFxci 419 | 6m ago 420 | market cap: $24.4K 421 | replies: 15 422 | OFFICIAL TEANNA (TEANNA): 423 | """ 424 | # stackoverflow link: https://stackoverflow.com/a/79331894/11179336 -------------------------------------------------------------------------------- /quiker_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : March 14, 2025 5 | """ 6 | import re 7 | import requests 8 | 9 | response = requests.get(url='https://www.quikr.com/homes/3-bhk-apartment-of-2036sqft-for-sale-in-radiance-gardenia-bangalore/p/372255534/272495?source=qh', 10 | headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"} 11 | ) 12 | result = {} 13 | if response.status_code == 200: 14 | pattern = r'latitude":"(.+)","longitude":"(.+)"},"adlink"' 15 | matches = re.findall(pattern=pattern, string=response.text) 16 | 17 | result["latitude"] = matches[0][0] 18 | result["longitude"] = matches[0][1] 19 | 20 | print(result) 21 | 22 | """ 23 | reference: 24 | https://stackoverflow.com/a/79508250/11179336 25 | """ -------------------------------------------------------------------------------- /scrape_bluechip_io.py: -------------------------------------------------------------------------------- 1 | from selenium.webdriver import Chrome 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support import expected_conditions as EC 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | 6 | driver = Chrome() 7 | 8 | url = "https://bluechip.io/sport?bt-path=%2Fschedule%3FscheduleSport%3Dsoccer-1" 9 | driver.get(url) 10 | 11 | inner_page = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div#bt-inner-page"))).shadow_root 12 | eventCard = WebDriverWait(inner_page, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[data-editor-id="eventCard"]'))) 13 | print(len(eventCard)) 14 | # 20 15 | """ 16 | Few things to note: 17 | 18 | 1. First, we should wait for the presence of the content bt-inner-page to get located so that we can further look for the shadow_root in it. 19 | 2. Once we are inside the shadow_root, we need to again wait for the web element of the event cards to get loaded on the page. 20 | 21 | As you can see above, we get all of the 20 event cards which can be further parsed accordingly as per the need. 22 | 23 | I hope this solves your problem, cheers! 
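
As a side note, the same two-step wait chains naturally when one shadow host sits
inside another shadow root; a sketch (the host selectors here are hypothetical):

    outer_root = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "div#outer-host"))).shadow_root
    inner_host = WebDriverWait(outer_root, 10).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "div#inner-host")))
    inner_root = inner_host.shadow_root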
24 | """
25 | 
-------------------------------------------------------------------------------- /scrape_www_knx_org.py: --------------------------------------------------------------------------------
1 | import time
2 | import selenium.common.exceptions
3 | from selenium.webdriver import Chrome
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | import selenium.webdriver.support.expected_conditions as EC
7 | from bs4 import BeautifulSoup
8 | 
9 | driver = Chrome()
10 | wait = WebDriverWait(driver, 5)
11 | 
12 | driver.get('https://www.knx.org/knx-en/for-professionals/community/partners/?country=120')
13 | # wait for the "Accept all" cookies button and click it
14 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'button.btn.btn-primary.cb-enable'))).click()
15 | 
16 | try:
17 |     # keep clicking the 'load_more' button as many times as it is clickable.
18 |     while True:
19 |         wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a#knx-load-button.load_more'))).click()
20 |         time.sleep(1)
21 | except selenium.common.exceptions.TimeoutException:
22 |     pass
23 | 
24 | soup = BeautifulSoup(driver.page_source, 'lxml')
25 | driver.quit()
26 | table = soup.select_one('table#partner-list')
27 | rows = table.select('tr')
28 | print(f"total rows: {len(rows)}")
29 | 
30 | for row in rows[1:]:
31 |     print(list(filter(None, row.text.split('\n'))))
32 |     # you can further parse this data as you want
33 | 
34 | 
-------------------------------------------------------------------------------- /scroll_down.py: --------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver import ChromeOptions, Keys
3 | from selenium.webdriver.common.by import By
4 | import json
5 | import time
6 | options = ChromeOptions()
7 | # start maximized and remove the automation infobar
8 | options.add_argument("--start-maximized")
9 | options.add_experimental_option("useAutomationExtension", False)
10 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
11 | driver = webdriver.Chrome(options=options)
12 | old_url = "https://stackoverflow.com/"
13 | driver.get(old_url)
14 | # open the cookies file
15 | with open("cookies.json", "r") as f:
16 |     cookies = json.load(f)
17 | # load the cookies into the driver
18 | for cookie in cookies:
19 |     driver.add_cookie(cookie)
20 | time.sleep(3)
21 | driver.refresh()
22 | # open a new tab
23 | new_url = "https://stackoverflow.com/users/11179336/ajeet-verma"
24 | driver.execute_script("window.open('');")
25 | # switch to the new tab and open the new URL
26 | driver.switch_to.window(driver.window_handles[1])
27 | driver.get(new_url)
28 | time.sleep(5)
29 | driver.find_element(By.XPATH, "//a[normalize-space()='Answers']").click()
30 | time.sleep(3)
31 | 
32 | # -----------------------------------------------------------------------------------------------------------------
33 | # scroll down to the bottom
34 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
35 | # driver.execute_script("arguments[0].scrollTop = 200", element)
36 | # -----------------------------------------------------------------------------------------------------------------
37 | 
38 | time.sleep(3)
39 | # find the element and click it
40 | driver.find_element(By.XPATH, "//a[contains(text(),'What are the advantages of NumPy over regular Pyth')]").click()
41 | time.sleep(5)
-------------------------------------------------------------------------------- /scroll_to_bottom.py: 
--------------------------------------------------------------------------------
1 | """
2 | Project :
3 | Author : Ajeet
4 | Date : June 19, 2023
5 | """
6 | 
7 | import time
8 | from selenium.webdriver import Chrome
9 | from selenium.webdriver.common.by import By
10 | from selenium.webdriver.support.wait import WebDriverWait
11 | import selenium.webdriver.support.expected_conditions as EC
12 | 
13 | driver = Chrome()
14 | wait = WebDriverWait(driver, 10)
15 | driver.get('https://kart.1881.no/?query=1010')
16 | 
17 | scroll_bar = wait.until(EC.visibility_of_element_located((By.ID, 'search_result')))
18 | 
19 | flag = True
20 | last_height = driver.execute_script("return arguments[0].scrollHeight", scroll_bar)
21 | SCROLL_PAUSE_TIME = 0.5
22 | 
23 | while flag:
24 |     # ---------------------------------------------------------------------------------------------------------------
25 |     driver.execute_script("arguments[0].scrollBy(0, arguments[0].scrollHeight);", scroll_bar)
26 |     time.sleep(SCROLL_PAUSE_TIME)
27 |     # ---------------------------------------------------------------------------------------------------------------
28 |     new_height = driver.execute_script("return arguments[0].scrollHeight", scroll_bar)
29 | 
30 |     if new_height == last_height:
31 |         flag = False
32 |     else:
33 |         last_height = new_height
34 | 
35 | """
36 | steps followed:
37 | 
38 | 1. First, we wait for the scroll-bar web element to get visibly located/loaded on the page and assign it to a variable
39 | scroll_bar.
40 | 2. Next, we get the current height of this scroll_bar and assign it to a variable last_height.
41 | 3. Start looping: in each iteration, scroll down to the bottom of the scroll bar, take a pause, then read the height of
42 | the scroll bar again into a variable new_height. If new_height == last_height, break out of the
43 | loop (flag=False); otherwise, update last_height with new_height and repeat this step until the if condition
44 | is True.
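
As an aside, the scrollBy call in each iteration can equivalently be written by
setting scrollTop directly on the same scroll_bar element:

    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", scroll_bar)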
45 | 
46 | reference:
47 | https://stackoverflow.com/questions/76503251/how-to-scroll-down-to-the-bottom-of-an-inner-scroll-bar-using-selenium-with-pyth
48 | """
49 | 
-------------------------------------------------------------------------------- /sel_pagination_excercise.py: --------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.support.ui import WebDriverWait
4 | from selenium.webdriver.support import expected_conditions as EC
5 | import pandas as pd
6 | import time
7 | 
8 | 
9 | def scrape_page_data():
10 |     WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'results-wrapped')))
11 |     container = driver.find_element(By.CLASS_NAME, 'results-wrapped')
12 | 
13 |     # scroll down to load all content on the page
14 |     for i in range(4):
15 |         driver.execute_script("window.scrollBy(0, 2000);")
16 |         time.sleep(2)
17 | 
18 |     skus = container.find_elements(By.CLASS_NAME, 'product-identifier--bd1f5')
19 |     prices = container.find_elements(By.CLASS_NAME, 'price-format__main-price')
20 | 
21 |     return skus, prices
22 | 
23 | 
24 | def pagination(url, pages=1):
25 |     prod_num = []
26 |     prod_price = []
27 | 
28 |     page_num = 0
29 |     # iterate over the pages
30 |     for i in range(1, pages+1):
31 | 
32 |         print(f"this is page {i}")
33 |         driver.get(f"{url}?Nao={page_num}")
34 |         skus, prices = scrape_page_data()
35 | 
36 |         for sku in skus:
37 |             prod_num.append(sku.text)
38 |         for price in prices:
39 |             prod_price.append(price.text)
40 | 
41 |         # increment it by 24 since each page holds 24 items
42 |         page_num += 24
43 |         time.sleep(1)
44 | 
45 |     return prod_num, prod_price
46 | 
47 | 
48 | website = 'https://www.homedepot.com/b/Milwaukee/Special-Values/N-5yc1vZ7Zzv'
49 | driver = webdriver.Chrome()
50 | prod_num, prod_price = pagination(website, pages=3)
51 | 
52 | df = pd.DataFrame({'code': prod_num, 'price': prod_price})
53 | df.to_csv('HD_test.csv', index=False)
54 | print(df)
55 | 
-------------------------------------------------------------------------------- /select_element_by_tag_text.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver import ChromeOptions
4 | from selenium.webdriver.common.by import By
5 | options = ChromeOptions()
6 | options.add_argument("--start-maximized")
7 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
8 | driver = webdriver.Chrome(options=options)
9 | driver.get("https://deliorder-web.shoprite.com/stores/279/departments/553/products/258234")
10 | time.sleep(5)
11 | driver.execute_script("window.scrollBy(0, 300);")
12 | 
13 | 
14 | # --------------------------------------------------------------------------------------------------------------------
15 | driver.find_element(By.XPATH, '//span[contains(text(), "Standard Thickness")]').click()
16 | # --------------------------------------------------------------------------------------------------------------------
17 | 
18 | 
19 | time.sleep(2)
20 | slicing_preference = ["Shaved", "Sliced Thin", "Standard Thickness", "Sliced Thick"]
21 | # choose Sliced Thin (slicing_preference[1] is "Sliced Thin")
22 | driver.find_element(By.XPATH, f'//span[contains(text(), "{slicing_preference[1]}")]').click()
23 | time.sleep(2)
24 | 
-------------------------------------------------------------------------------- /selenium_action_move_by_offset.py: 
-------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.action_chains import ActionChains 3 | import time 4 | 5 | driver = webdriver.Chrome() 6 | 7 | driver.set_window_size(500, 500) 8 | driver.get('https://clickclickclick.click/') 9 | 10 | actions = ActionChains(driver) 11 | 12 | x_coord, y_coord = 250, 182 #coordinates of the button 13 | t = actions.move_by_offset(x_coord, y_coord).click().perform() 14 | time.sleep(5) -------------------------------------------------------------------------------- /selenium_baseline.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver import ChromeOptions, Keys 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.webdriver.support.wait import WebDriverWait 7 | from selenium.webdriver.common.action_chains import ActionChains 8 | from selenium.common.exceptions import NoSuchElementException 9 | 10 | options = ChromeOptions() 11 | 12 | # to start maximized screen 13 | options.add_argument("--start-maximized") 14 | # to remove 'Chrome is being controlled by automated software' 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | 17 | options.add_experimental_option("useAutomationExtension", False) 18 | 19 | 20 | driver = webdriver.Chrome(options=options) 21 | 22 | url = "https://shopee.vn/search?keyword=iphone&page=0&sortBy=sales" 23 | 24 | driver.get(url) 25 | 26 | print(type(driver)) 27 | WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ""))) 28 | driver.quit() 29 | -------------------------------------------------------------------------------- /selenium_chrome_profile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : September 14, 2023 5 | """ 6 | import time 7 | from selenium import webdriver 8 | 9 | options = webdriver.ChromeOptions() 10 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 11 | 12 | # Specify the Chrome profile directory to use (Profile 2) 13 | options.add_argument('--profile-directory=Profile 2') 14 | 15 | # Specify the user data directory where Chrome profile data is stored 16 | options.add_argument("--user-data-dir=C:\\Users\\PC\\AppData\\Local\\Google\\Chrome\\User Data\\") 17 | 18 | driver = webdriver.Chrome(options=options) 19 | driver.get("https://www.instagram.com/") 20 | 21 | time.sleep(5) 22 | 23 | 24 | """ 25 | Things to note: 26 | 27 | 1. Ensure Chrome is Closed: 28 | Make sure that all instances of Chrome are closed before running your Selenium script. Sometimes, if Chrome is running in the background or doesn't shut down correctly, it can cause issues when trying to start a new instance. 29 | 30 | 2. Check ChromeDriver Version: 31 | Ensure that your ChromeDriver version matches the version of Google Chrome installed on your system. If they don't match, it can lead to compatibility issues. 
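
   A quick way to compare the two at runtime is to read the driver's capability
   fields (these are the standard keys reported by chromedriver; the version
   numbers below are just examples):

       caps = driver.capabilities
       print(caps["browserVersion"])                 # e.g. 116.0.5845.97
       print(caps["chrome"]["chromedriverVersion"])  # e.g. 116.0.5845.96 (...)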
32 | 
33 | reference:
34 | https://stackoverflow.com/questions/77099511/im-developing-an-application-in-python-using-selenium-and-to-make-it-work-i
35 | """
36 | 
37 | 
-------------------------------------------------------------------------------- /selenium_file_download.py: --------------------------------------------------------------------------------
1 | import time
2 | 
3 | from selenium.webdriver import ChromeOptions, Chrome
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.common.keys import Keys
6 | 
7 | options = ChromeOptions()
8 | options.add_argument("start-maximized")
9 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
10 | options.add_experimental_option('useAutomationExtension', False)
11 | options.add_experimental_option("prefs", {
12 |     "download.default_directory": "C:\\Users\\PC\\OneDrive\\Documents\\",
13 |     "download.prompt_for_download": False,
14 |     "download.directory_upgrade": True,
15 | })
16 | # specify the title of the study you want to download
17 | study_title = "Pan-cancer single-cell landscape of tumor-infiltrating T cells"
18 | # start the browser and navigate to the PubMed website
19 | 
20 | browser = Chrome(options=options)
21 | browser.get("https://pubmed.ncbi.nlm.nih.gov/")
22 | # find the search box, enter the study title, and submit the form
23 | search_box = browser.find_element(By.ID, "id_term")
24 | search_box.send_keys(study_title)
25 | search_box.send_keys(Keys.RETURN)
26 | # find the save button and click it
27 | save_button = browser.find_element(By.XPATH, "//*[@id='save-results-panel-trigger']")
28 | save_button.click()
29 | # select PubMed from the format drop-down
30 | dropdownlist = browser.find_element(By.ID, "save-action-format")
31 | 
32 | dropdownlist.find_element(By.CSS_SELECTOR, 'option[value="pmid"]').click()
33 | 
34 | download_file = browser.find_element(By.XPATH, "//*[@id='save-action-panel-form']/div[2]/button[1]")
35 | download_file.click()
36 | time.sleep(2)
-------------------------------------------------------------------------------- /selenium_get_attribute.py: --------------------------------------------------------------------------------
1 | '''
2 | # to extract the text when it's not possible by simply using .text
3 | get_attribute('textContent')
4 | get_attribute('innerHTML')
5 | 
6 | 
7 | # other attributes of an element may be
8 | get_attribute('href')
9 | get_attribute('src')
10 | get_attribute('value')
11 | etc......
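
# a minimal usage sketch (the link element here is hypothetical):
# link = driver.find_element(By.TAG_NAME, 'a')
# print(link.get_attribute('href'))           # absolute URL of the link
# print(link.get_attribute('textContent'))    # full text, even when .text comes back empty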
12 | ''' -------------------------------------------------------------------------------- /selenium_get_parent_element.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium.webdriver import Chrome 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | 7 | driver = Chrome() 8 | 9 | url = "https://platform.sustain-cert.com/public-project/2756" 10 | driver.get(url) 11 | 12 | files = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'div.MuiBox-root.css-16uqhx7'))) 13 | print(f"total files: {len(files)}") 14 | 15 | container = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.MuiContainer-root.MuiContainer-maxWidthLg.css-got2s4'))) 16 | categories = container.find_elements(By.CSS_SELECTOR, 'div>h6') 17 | 18 | for category in categories: 19 | 20 | if category.text == "Design Review": 21 | # ------------------------------------------------------------------------------------------------------------- 22 | design_files = category.find_element(By.XPATH, "parent::*").find_elements(By.CSS_SELECTOR, 'div.MuiBox-root.css-16uqhx7') 23 | # ------------------------------------------------------------------------------------------------------------- 24 | print(f"total files under Design Review:: {len(design_files)}") 25 | 26 | delay = 5 27 | for file in design_files: 28 | file_detail = file.text.split('\n') 29 | 30 | if file_detail[0].endswith('.pdf)'): 31 | print(f"pdf files under Design Review:") 32 | print(file_detail[0].replace('(', '').replace(')', '')) 33 | # click button to download the pdf file 34 | file.find_element(By.TAG_NAME, 'button').click() 35 | time.sleep(delay) 36 | 37 | delay += 10 38 | 39 | 40 | # reference: 41 | # https://pythonexamples.org/python-selenium-get-previous-sibling-element/#:~:text=To%20get%20the%20preceding%20or,parameter%20in%20the%20function%20call. 42 | # https://stackoverflow.com/questions/76369098/download-pdfs-under-a-specific-header-on-webpage-through-selenium-python 43 | """ 44 | output: 45 | 46 | total files: 12 47 | total files under Design Review:: 6 48 | pdf files under Design Review: 49 | 03 Deviation Request Form-Zengjiang wind power project-20220209-V01.pdf 50 | pdf files under Design Review: 51 | 20220901_GS4GG VAL FVR_Yunxiao Wind_clean.pdf 52 | """ 53 | 54 | """ 55 | Few things to note: 56 | 57 | 1. As you are only interested in the pdf files in the Design Review section, so we first locate the element using h6 tag 58 | 2. next, we iterate over all h6 tags and pick only the one with the Design Review text. 59 | 3. Then, we refer back to the parent element/tag of the filtered h6 tag, find all the files, and store them in a variable design_files. 60 | 4. Now, we get all the files under the Design Review and we easily filter out the files which end with .pdf 61 | 5. finally, click on the located pdf file to download. 62 | 63 | Downloading the files takes a bit of time, so we add incremental delay to wait for the current files to get downloaded before starting the next file download. 
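
As a side note, the parent hop and the file lookup can also be combined into a single XPath
(a sketch using the same selectors as above):

    design_files = driver.find_elements(
        By.XPATH, '//h6[text()="Design Review"]/parent::*//div[contains(@class, "css-16uqhx7")]')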
64 | """
-------------------------------------------------------------------------------- /selenium_hover_click.py: --------------------------------------------------------------------------------
1 | 
2 | import time
3 | from selenium import webdriver
4 | from selenium.webdriver import ChromeOptions
5 | from selenium.webdriver.common.by import By
6 | from selenium.webdriver.common.action_chains import ActionChains
7 | 
8 | 
9 | options = ChromeOptions()
10 | 
11 | # start maximized and remove the automation infobar
12 | options.add_argument("--start-maximized")
13 | options.add_experimental_option("useAutomationExtension", False)
14 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
15 | 
16 | driver = webdriver.Chrome(options=options)
17 | 
18 | url = "https://www.kbb.com/"
19 | driver.get(url)
20 | 
21 | # ---------------------------------------------------------------------------------------------
22 | element_to_hover_over = driver.find_element(By.XPATH, '//*[@id="app"]/header/div/nav/div[2]')
23 | hover = ActionChains(driver).move_to_element(element_to_hover_over)
24 | hover.perform()
25 | # ---------------------------------------------------------------------------------------------
26 | 
27 | driver.find_element(By.XPATH, '//*[@id="app"]/header/div/nav/div[2]/ul/li[1]').click()
28 | time.sleep(2)
-------------------------------------------------------------------------------- /selenium_hover_click_text.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium.webdriver import Chrome, ChromeOptions
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.common.action_chains import ActionChains
5 | 
6 | options = ChromeOptions()
7 | 
8 | options.add_argument("--start-maximized")
9 | options.add_experimental_option("useAutomationExtension", False)
10 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
11 | 
12 | driver = Chrome(options=options)
13 | # Here I've taken the URL of this same stackoverflow page
14 | driver.get("https://stackoverflow.com/questions/75945977/how-to-get-mouse-hover-message-in-selenium-webdriver-which-is-not-given-in-html")
15 | time.sleep(1)
16 | # and let's, for example, take the java tag in your post
17 | element_to_hover_over = driver.find_element(By.XPATH, '//*[@id="question"]/div/div[2]/div[2]/div/div/ul/li[1]')
18 | hover = ActionChains(driver).move_to_element(element_to_hover_over)
19 | hover.perform()
20 | time.sleep(2)
21 | hover_tag_all_detail = element_to_hover_over.find_element(By.CSS_SELECTOR, 'div.esc-remove').text
22 | print(f"all details:\n{hover_tag_all_detail}")
23 | hover_tag_description = element_to_hover_over.find_element(By.CSS_SELECTOR, 'div.fc-light').text
24 | print(f"tag description only:\n{hover_tag_description}")
25 | 
26 | 
-------------------------------------------------------------------------------- /selenium_iframe_excercise.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | 
5 | options = webdriver.ChromeOptions()
6 | options.add_argument('--start-maximized')
7 | options.add_argument('--disable-extensions')
8 | options.add_experimental_option("useAutomationExtension", False)
9 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
10 | 
11 | driver = webdriver.Chrome(options=options)
12 | 
13 | 
driver.get('https://www.ifsc-climbing.org/index.php/world-competition/calendar?task=ranking-complete&category=3') 14 | time.sleep(2) 15 | # ------------------------------------------------------------------------------------------------------------------- 16 | driver.switch_to.frame("calendar") 17 | # ------------------------------------------------------------------------------------------------------------------- 18 | table_wrapper = driver.find_element(By.CSS_SELECTOR, 'div[id="table_id_wrapper"]') 19 | results = table_wrapper.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr') 20 | 21 | data = [] 22 | for result in results: 23 | details = result.find_elements(By.TAG_NAME, 'td') 24 | temp_dict = { 25 | "name": f"{details[1].text} {details[2].text}", 26 | "country": details[3].text, 27 | "points": details[4].text 28 | } 29 | data.append(temp_dict) 30 | 31 | print(data) 32 | 33 | -------------------------------------------------------------------------------- /selenium_iframe_excercise_2.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium.webdriver import Chrome 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | import selenium.webdriver.support.expected_conditions as EC 6 | 7 | driver = Chrome() 8 | 9 | driver.get("https://www.northamericanstainless.com/NAS_App/Surcharge1?language=E&type=F") 10 | 11 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'iframe.surcharge-iframe'))) 12 | # ------------------------------------------------------------------------------------------------------------------- 13 | driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, 'iframe.surcharge-iframe')) 14 | # ------------------------------------------------------------------------------------------------------------------- 15 | # click on submit button 16 | driver.find_element(By.ID, 'submitStylev2').click() 17 | time.sleep(5) 18 | -------------------------------------------------------------------------------- /selenium_iframe_excercise_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 12, 2023 5 | """ 6 | 7 | import time 8 | from selenium.webdriver import Chrome, ChromeOptions 9 | from selenium.webdriver.common.by import By 10 | from selenium.common.exceptions import NoAlertPresentException 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | 14 | url = "https://bdap-opendata.rgs.mef.gov.it/opendata/spd_mop_prg_mon_reg18_01_9999?t=Scarica" 15 | 16 | chrome_options = ChromeOptions() 17 | chrome_options.add_experimental_option("excludeSwitches", ['enable-automation']) 18 | driver = Chrome(options=chrome_options) 19 | driver.get(url) 20 | wait = WebDriverWait(driver, 20) 21 | # ---------------------------------------------------------------------------------------------------------------------- 22 | # wait for the target iframe to get loaded in order to switch to it 23 | iframe = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'iframe.tabIframe.dinamically-tab-iframe-content'))) 24 | # switch to the target iframe 25 | driver.switch_to.frame(iframe) 26 | # ---------------------------------------------------------------------------------------------------------------------- 27 | 28 | wait.until(EC.element_to_be_clickable((By.XPATH, 
'//div[@title="Excel file format."]'))).click()
29 | 
30 | try:
31 |     driver.switch_to.alert.accept()
32 | except NoAlertPresentException:
33 |     pass
34 | 
35 | time.sleep(5)
36 | 
37 | """
38 | Steps to follow:
39 | 1. First, wait for the desired iframe tag to get loaded/located on the page.
40 | 2. After making sure that it's loaded, switch to this iframe as mentioned in the code above (using switch_to.frame()).
41 | 3. Once you're inside the iframe, you can easily locate the desired element using XPATH, but make sure it is clickable before clicking, as the website takes some time to load this particular section on the page.
42 | 4. Sometimes, when you click the desired button/element, an alert box appears; you can simply accept it, as the try/except block above does.
43 | 
44 | reference:
45 | https://stackoverflow.com/questions/76454460/webcrawling-with-selenium-couldnt-extract-the-xpath-of-a-button
46 | """
-------------------------------------------------------------------------------- /selenium_iframe_excercise_linkedin.py: --------------------------------------------------------------------------------
1 | """
2 | This script prints the total number of pages of the document that is attached to a LinkedIn post.
3 | """
4 | 
5 | from selenium.webdriver.common.by import By
6 | from selenium.webdriver.support import expected_conditions as EC
7 | from selenium.webdriver.support.wait import WebDriverWait
8 | from linkedIn_base import Linkedin
9 | 
10 | obj = Linkedin()
11 | driver = obj.load_cookies(path="linkedin_cookies.json")
12 | 
13 | # for example, this post has a doc with 7 pages
14 | post_url = "https://www.linkedin.com/feed/update/urn:li:activity:7050104978106974208"
15 | driver.get(post_url)
16 | 
17 | driver.execute_script("window.scrollBy(0,900);")
18 | WebDriverWait(driver, 10).until(
19 |     EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe[class='document-s-container__document-element document-s-container__document-element--loaded']")))
20 | 
21 | element = driver.find_element(By.CSS_SELECTOR, 'div.ssplayer-actions.center-actions')
22 | pages = element.find_element(By.CSS_SELECTOR, 'div.ssplayer-progress-bar.meter-animated').get_attribute('aria-valuemax')
23 | print(pages)
24 | 
-------------------------------------------------------------------------------- /selenium_nth_css_selector.py: --------------------------------------------------------------------------------
1 | # driver.find_element(By.CSS_SELECTOR, "ul > li:nth-child(1)")  # >> home
2 | # driver.find_element(By.CSS_SELECTOR, "ul > li:nth-child(2)")  # >> posts
3 | # driver.find_element(By.CSS_SELECTOR, "ul > li:nth-child(3)")  # >> events
-------------------------------------------------------------------------------- /selenium_ok_alert.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium.webdriver import Chrome, ChromeOptions
3 | from selenium.webdriver.common.by import By
4 | from selenium.common import NoAlertPresentException
5 | 
6 | options = ChromeOptions()
7 | options.add_argument('--start-maximized')
8 | options.add_argument("force-device-scale-factor=0.95")
9 | 
10 | driver = Chrome(options=options)
11 | 
12 | urls = ['https://web.archive.org/web/20080221233711/http://www.berkshire.com/',
13 |         'https://web.archive.org/web/20171107004101/http://www.berkshirefunds.com/',
14 |         'https://web.archive.org/web/20200224044229/http://www.berkshirefunds.com/']
15 | 
16 | for i, url in enumerate(urls):
17 |     driver.get(url)
18 |     time.sleep(5)
19 | 
20 |     if 
url.endswith('www.berkshire.com/'):
21 |         target_element = driver.find_element(By.TAG_NAME, 'tbody')
22 |         target_element.screenshot(f'{i}_screen_capture.png')
23 | 
24 |     elif url.endswith('www.berkshirefunds.com/'):
25 |         try:
26 |             # ---------------------------------------------------------------------------------------------
27 |             driver.switch_to.alert.accept()
28 |             # ---------------------------------------------------------------------------------------------
29 |         except NoAlertPresentException:
30 |             pass
31 |         target_element = driver.find_element(By.CSS_SELECTOR, 'div#page-wrap')
32 |         target_element.screenshot(f'{i}_screen_capture.png')
33 | 
-------------------------------------------------------------------------------- /selenium_options.py: --------------------------------------------------------------------------------
1 | from selenium.webdriver import ChromeOptions
2 | options = ChromeOptions()
3 | 
4 | # start maximized and remove the automation infobar
5 | options.add_argument("--start-maximized")
6 | options.add_argument("--incognito")
7 | options.add_argument("--disable-infobars")
8 | options.add_argument("--disable-extensions")
9 | options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36")
10 | 
11 | options.add_experimental_option("useAutomationExtension", False)
12 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
13 | options.add_experimental_option("detach", True)
14 | 
15 | options.add_experimental_option(
16 |     "prefs",
17 |     {
18 |         "credentials_enable_service": False,
19 |         "profile.password_manager_enabled": False,
20 |         "profile.default_content_setting_values.notifications": 2
21 |         # 2 disables/blocks notifications and 1 allows them
22 |     },
23 | )
24 | 
25 | 
26 | 
27 | 
28 | 
-------------------------------------------------------------------------------- /selenium_partial_class_name.py: --------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver import ChromeOptions, Keys
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | import json
7 | import time
8 | 
9 | options = ChromeOptions()
10 | 
11 | # start maximized and remove the automation infobar
12 | options.add_argument("--start-maximized")
13 | options.add_experimental_option("useAutomationExtension", False)
14 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
15 | 
16 | driver = webdriver.Chrome(options=options)
17 | 
18 | url = "https://booking.bbdc.sg/#/login?redirect=%2Ftransactions%2Findex"
19 | 
20 | 
21 | driver.get(url)
22 | 
23 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[class="login-content d-flex justify-center flex-column"]')))
24 | username = driver.find_element(by=By.ID, value='input-8')
25 | username.send_keys("ajeet@123")
26 | password = driver.find_element(by=By.ID, value='input-15')
27 | password.send_keys("ajee")
28 | 
29 | # locate the button to click by using its partial class name
30 | driver.find_element(By.CSS_SELECTOR, 'button[class^="v-btn v-btn"]').click()
31 | time.sleep(5)
32 | 
33 | 
34 | driver.quit()
-------------------------------------------------------------------------------- /selenium_scrap_transcript.py: --------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome
2 | from 
selenium.webdriver.common.by import By
3 | 
4 | driver = Chrome()
5 | driver.get("https://www.luyennghetienganh.com/learn-by-listening-level-1/1060-learn-english-by-listening-level-1-unit-001.html")
6 | 
7 | container = driver.find_elements(By.CSS_SELECTOR, 'div.rabbit-lyrics__line')
8 | eng_sub = [i.get_attribute('innerHTML') for i in container]
9 | print(eng_sub)
10 | 
11 | 
12 | 
13 | 
14 | 
-------------------------------------------------------------------------------- /selenium_scrape_youtube_channel.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | from selenium.webdriver.support import expected_conditions as EC
6 | from selenium.webdriver.chrome.options import Options
7 | 
8 | # CHROME DRIVER
9 | options = Options()
10 | 
11 | options.add_argument("--start-maximized")
12 | # options.add_experimental_option("useAutomationExtension", False)
13 | # options.add_experimental_option("excludeSwitches", ["enable-automation"])
14 | 
15 | driver = webdriver.Chrome(options=options)
16 | 
17 | 
18 | def scrape_ytchannel(url):
19 |     driver.get(url)
20 | 
21 |     handle = driver.find_element(By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
22 |     subscriber_count = driver.find_element(By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text
23 | 
24 |     # SMALL SCRIPT TO SCROLL THE PAGE UNTIL IT ENDS
25 |     WAIT_IN_SECONDS = 5
26 |     last_height = driver.execute_script("return document.documentElement.scrollHeight")
27 | 
28 |     while True:
29 |         # Scroll to the bottom of page
30 |         driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
31 |         # Wait for new videos to show up
32 |         time.sleep(WAIT_IN_SECONDS)
33 | 
34 |         # Calculate new document height and compare it with last height
35 |         new_height = driver.execute_script("return document.documentElement.scrollHeight")
36 |         if new_height == last_height:
37 |             break
38 |         last_height = new_height
39 | 
40 |     thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
41 |     views = driver.find_elements(By.XPATH, '//div[@id="metadata-line"]/span[1]')
42 |     titles = driver.find_elements(By.ID, "video-title")
43 |     links = driver.find_elements(By.ID, "video-title-link")
44 | 
45 |     videos = []
46 |     for title, view, thumb, link in zip(titles, views, thumbnails, links):
47 |         video_dict = {
48 |             'title': title.text,
49 |             'views': view.text,
50 |             # 'thumbnail': thumb.get_attribute('src'),
51 |             'thumbnail': thumb.get_dom_attribute('src'),
52 |             'link': link.get_attribute('href')
53 |         }
54 |         videos.append(video_dict)
55 |     result = [videos, handle, subscriber_count]
56 | 
57 |     return result
58 | 
59 | 
60 | url_conf = "https://www.youtube.com/@confindustria/videos"
61 | print(scrape_ytchannel(url_conf))
-------------------------------------------------------------------------------- /selenium_scrape_youtube_search.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.chrome.options import Options
5 | 
6 | options = Options()
7 | options.add_argument("--start-maximized")
8 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
9 | driver = webdriver.Chrome(options=options)
10 | 
11 | 
12 | def scrape_yt(url):
13 |     driver.get(url)
14 |     # scroll the page until it ends
15 |     last_height = 
driver.execute_script("return document.documentElement.scrollHeight")
16 |     while True:
17 |         # Scroll to the bottom of page
18 |         driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
19 |         # Wait for new videos to show up
20 |         time.sleep(2)
21 |         # Calculate new document height and compare it with last height
22 |         new_height = driver.execute_script("return document.documentElement.scrollHeight")
23 |         if new_height == last_height:
24 |             break
25 |         last_height = new_height
26 | 
27 |     time.sleep(2)
28 |     videos = driver.find_elements(By.TAG_NAME, 'ytd-video-renderer')
29 |     print(f"total videos: {len(videos)}")
30 | 
31 |     links_list = []
32 |     for video in videos:
33 |         link = video.find_element(By.TAG_NAME, 'h3').find_element(By.TAG_NAME, 'a').get_attribute('href')
34 |         links_list.append(link)
35 | 
36 |     return links_list
37 | 
38 | 
39 | # get the search keyword manually from the user
40 | search_word = input("Enter the search keyword: ")
41 | url = f"https://www.youtube.com/results?search_query={search_word}"
42 | print(scrape_yt(url))
-------------------------------------------------------------------------------- /selenium_select_tag_dropdown.py: --------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome, ChromeOptions
2 | from selenium.webdriver.support.select import Select
3 | from selenium.webdriver.common.by import By
4 | 
5 | 
6 | url = 'https://cricos.education.gov.au/Course/CourseSearch.aspx'
7 | 
8 | options = ChromeOptions()
9 | options.add_argument("--start-maximized")
10 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
11 | 
12 | browser = Chrome(options=options)
13 | browser.get(url)
14 | 
15 | state = browser.find_element(By.ID, 'ctl00_cphDefaultPage_courseSearchCriteria_ddlCourseLocation')
16 | nsw = Select(state)
17 | nsw.select_by_value('NSW')
18 | browser.find_element(By.ID, 'ctl00_cphDefaultPage_btnSearch').click()
-------------------------------------------------------------------------------- /selenium_send_keys _excercise.py: --------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.common.keys import Keys
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | 
7 | driver = webdriver.Chrome()
8 | driver.maximize_window()
9 | driver.get('https://www.google.com/travel/flights')
10 | 
11 | wait = WebDriverWait(driver, 10)
12 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[aria-placeholder='Where from?'] input"))).click()
13 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[aria-label='Enter your origin'] input"))).send_keys("Sydney" + Keys.ARROW_DOWN + Keys.ENTER)
14 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[aria-placeholder='Where to?'] input"))).click()
15 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[aria-label='Enter your destination'] input"))).send_keys("Auckland" + Keys.ARROW_DOWN + Keys.ENTER)
16 | wait.until(EC.element_to_be_clickable((By.XPATH, "//span[text()='Search']"))).click()
17 | 
18 | driver.quit()
-------------------------------------------------------------------------------- /selenium_shadow_open_excercise.py: --------------------------------------------------------------------------------
1 | from selenium.webdriver import Chrome
2 | from selenium.webdriver.common.by import By
3 | 
from selenium.webdriver.support import expected_conditions as EC 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | 6 | driver = Chrome() 7 | 8 | url = "https://bluechip.io/sport?bt-path=%2Fschedule%3FscheduleSport%3Dsoccer-1" 9 | driver.get(url) 10 | 11 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div#bt-inner-page"))) 12 | # -------------------------------------------------------------------------------------------------------------------- 13 | # inner_page = driver.execute_script('''return document.getElementById('bt-inner-page').shadowRoot''') 14 | # or 15 | inner_page = driver.find_element(By.CSS_SELECTOR, "div#bt-inner-page").shadow_root 16 | # -------------------------------------------------------------------------------------------------------------------- 17 | eventCard = WebDriverWait(inner_page, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[data-editor-id="eventCard"]'))) 18 | print(len(eventCard)) 19 | -------------------------------------------------------------------------------- /selenium_shadow_root.py: -------------------------------------------------------------------------------- 1 | # https://stackoverflow.com/questions/36141681/does-anybody-know-how-to-identify-shadow-dom-web-elements-using-selenium-webdriv 2 | # https://stackoverflow.com/questions/28911799/accessing-elements-in-the-shadow-dom 3 | """ 4 | 5 | #shadow-root (open) 6 |
7 |     <div class="flex">
8 |       .....
9 |     </div>
10 | """
11 | from selenium.webdriver import Chrome, ChromeOptions
12 | driver = Chrome()
13 | 
14 | shadow_section = driver.execute_script('''return document.querySelector("neon-animatable").shadowRoot''')
15 | shadow_section.find_element_by_css_selector(".flex")
-------------------------------------------------------------------------------- /selenium_take_screenshot.py: --------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.by import By
3 | 
4 | import ddddocr
5 | 
6 | driver = webdriver.Chrome()
7 | 
8 | driver.get('https://ma.mohw.gov.tw/masearch/')
9 | 
10 | captcha = driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_ImageCheck")
11 | 
12 | # ----------------------------------------------------------------------------------------------------------------
13 | captcha.screenshot('captcha.png')
14 | # ----------------------------------------------------------------------------------------------------------------
15 | 
16 | ocr = ddddocr.DdddOcr()
17 | # open and read the image
18 | with open('captcha.png', 'rb') as f:
19 |     img_bytes = f.read()
20 | 
21 | res = ocr.classification(img_bytes)
22 | print(res.upper())
23 | 
-------------------------------------------------------------------------------- /selenium_twitter_login.py: --------------------------------------------------------------------------------
1 | import time
2 | from selenium import webdriver
3 | from selenium.webdriver import ChromeOptions, Keys
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support import expected_conditions as EC
6 | from selenium.webdriver.support.wait import WebDriverWait
7 | 
8 | 
9 | options = ChromeOptions()
10 | 
11 | # start maximized and remove the automation infobar
12 | options.add_argument("--start-maximized")
13 | options.add_experimental_option("useAutomationExtension", False)
14 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
15 | 
16 | driver = webdriver.Chrome(options=options)
17 | 
18 | 
19 | url = "https://twitter.com/login"
20 | 
21 | driver.get(url)
22 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "css-1dbjc4n")))
23 | login = driver.find_element(By.CLASS_NAME, "css-1dbjc4n")
24 | time.sleep(2)
25 | username = login.find_element(By.CSS_SELECTOR, 'input[autocomplete="username"]')
26 | username.send_keys("xxxxxxxxxxx")
27 | username.send_keys(Keys.ENTER)
28 | time.sleep(1)
29 | password = login.find_element(By.CSS_SELECTOR, 'input[name="password"]')
30 | password.send_keys("xxxxxxxx")
31 | password.send_keys(Keys.ENTER)
32 | 
33 | time.sleep(2)
-------------------------------------------------------------------------------- /selenium_work_shadow_closed.pyi: --------------------------------------------------------------------------------
1 | from selenium.webdriver import ActionChains
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.support.ui import WebDriverWait
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium import webdriver
6 | 
7 | driver = webdriver.Chrome()
8 | driver.implicitly_wait(10)
9 | driver.get("https://www.sreality.cz/")
10 | driver.maximize_window()
11 | 
12 | # Below line creates an instance of the ActionChains class
13 | action = ActionChains(driver)
14 | # Below line locates and stores an element which is outside the shadow-root
15 | element_outside_shadow = driver.find_element(By.XPATH, "//div[@class='szn-cmp-dialog-container']")
16 | # The next two lines 
click on the browser at an offset of coordinates x=5, y=5 and then perform the queued actions 17 | action.move_to_element_with_offset(element_outside_shadow, 5, 5) 18 | action.click() 19 | action.perform() 20 | -------------------------------------------------------------------------------- /selenium_workday_login.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium.webdriver import Chrome, ChromeOptions 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.wait import WebDriverWait 5 | import selenium.webdriver.support.expected_conditions as EC 6 | from selenium.webdriver.common.action_chains import ActionChains 7 | 8 | options = ChromeOptions() 9 | options.add_argument('--start-maximized') 10 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 11 | options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36") 12 | 13 | 14 | driver = Chrome(options=options) 15 | wait = WebDriverWait(driver, 10) 16 | 17 | url = "https://walmart.wd5.myworkdayjobs.com/en-US/WalmartExternal/login" 18 | driver.get(url) 19 | 20 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[data-automation-id="email"]'))) 21 | email = driver.find_element(By.CSS_SELECTOR, 'input[data-automation-id="email"]') 22 | email.send_keys('your_username') 23 | 24 | password = driver.find_element(By.CSS_SELECTOR, 'input[data-automation-id="password"]') 25 | password.send_keys('your_password') 26 | 27 | submit = driver.find_element(By.CSS_SELECTOR, 'div[aria-label="Sign In"]') 28 | 29 | hover = ActionChains(driver).move_to_element(submit) 30 | hover.click().perform() 31 | 32 | time.sleep(10) 33 | 34 | """ 35 | A few things to note: 36 | 37 | 1. We need to wait for the Sign In box to appear on the page. 38 | 2. We must pass a realistic user-agent to the Chrome options. 39 | 3. Use ActionChains to perform the click that reaches the logged-in profile; a simple click() will not work here. 
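4. A minimal, hedged sketch of the same wait-then-hover-click pattern as a reusable helper (the selector is the one
   used above; the helper itself is illustrative and not part of the original script):

       def hover_click(driver, wait, css):
           # wait for the element to be present, then hover over it and click via ActionChains
           element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css)))
           ActionChains(driver).move_to_element(element).click().perform()

       hover_click(driver, wait, 'div[aria-label="Sign In"]')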
40 | """ -------------------------------------------------------------------------------- /shein_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : 09/06/2023 5 | """ 6 | 7 | import time 8 | from selenium.webdriver import Chrome, ChromeOptions 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.wait import WebDriverWait 11 | import selenium.webdriver.support.expected_conditions as EC 12 | 13 | options = ChromeOptions() 14 | options.add_argument("--start-maximized") 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 2}) 17 | 18 | driver = Chrome(options=options) 19 | wait = WebDriverWait(driver, 10) 20 | url = "https://us.shein.com/Men-Playing-Card-Print-Tee-p-9847947-cat-1980.html?src_identifier=on%3DIMAGE_COMPONENT%60cn%3Dcat%60hz%3DhotZone_16%60ps%3D4_10%60jc%3DitemPicking_001121429&src_module=Women&src_tab_page_id=page_home1685728955945&mallCode=1" 21 | driver.get(url) 22 | 23 | # wait and close the coupon-box 24 | coupon_box = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.c-coupon-box'))) 25 | coupon_box.find_element(By.CSS_SELECTOR, 'i.iconfont.icon-close.she-close').click() 26 | 27 | # # wait and close the register container side box 28 | wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.quickg-outside'))) 29 | driver.execute_script("document.querySelector('i.svgicon.svgicon-arrow-left').click();") 30 | 31 | for color in driver.find_elements(By.CSS_SELECTOR, "div[class^='product-intro__color-radio']"): 32 | 33 | color.click() 34 | time.sleep(2) 35 | name = color.get_attribute("aria-label") 36 | colorPic = color.find_element(By.TAG_NAME, "img").get_attribute("src") 37 | price = driver.find_element(By.CLASS_NAME, "from").get_attribute("aria-label") 38 | 39 | pictures = [] 40 | for pic in driver.find_element(By.CLASS_NAME, "product-intro__thumbs-inner").find_elements(By.TAG_NAME, "img"): 41 | pictures.append(pic.get_attribute("src")) 42 | 43 | print(f"color name: {name}, color link: {colorPic}, price: {price}, pictures: {pictures}") 44 | 45 | """ 46 | steps to follow: 47 | 48 | 1. First, as the page loads, it coupon box pops up and we need to close it to proceed. Therefore we wait for the 49 | coupon-box web element to appear and then click to close it. 50 | 51 | 2. Next, A register container appears from the right side over the web element containing the radio buttons of color 52 | options. Thus, we wait for it to appear and minimize it by clicking on the arrow. 53 | 54 | 3. Now, we simply find all the available color radio button(here 6) for the product, iterate over them one-by-one and in 55 | every iteration click on the respective color radio button to extract all the details of the product with the specific chosen color. 56 | 57 | As you can see, it outputs the product details (color name, color pic, price of the product for the color, and all the 58 | pictures of the product available for the color). 
59 | 60 | reference: 61 | https://stackoverflow.com/questions/76436659/python-selenium-how-do-i-click-on-a-radio-button 62 | """ -------------------------------------------------------------------------------- /stackoverflow_login_and_save_cookies.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver import ChromeOptions 3 | from selenium.webdriver.common.by import By 4 | import time 5 | import json 6 | 7 | options = ChromeOptions() 8 | # open and maximize the screen 9 | options.add_argument("--start-maximized") 10 | # the two lines below disable the info bar 11 | options.add_experimental_option("useAutomationExtension", False) 12 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 13 | 14 | driver = webdriver.Chrome(options=options) 15 | 16 | driver.get("https://stackoverflow.com") 17 | 18 | # find and click the "Log in" button 19 | driver.find_element(By.XPATH, "//a[normalize-space()='Log in']").click() 20 | # fill in the email account and password 21 | email = driver.find_element(By.XPATH, "//input[@id='email']") 22 | password = driver.find_element(By.XPATH, "//input[@id='password']") 23 | email.send_keys("your_mail_id") 24 | password.send_keys("your_password") 25 | time.sleep(2) 26 | 27 | # click the login submit button 28 | driver.find_element(By.XPATH, "//button[@id='submit-button']").click() 29 | time.sleep(2) 30 | # print(driver.get_cookies()) 31 | 32 | json_object = json.dumps(driver.get_cookies()) 33 | 34 | # write the session cookies to stackoverflow_cookies.json 35 | with open("stackoverflow_cookies.json", "w") as outfile: 36 | outfile.write(json_object) 37 | -------------------------------------------------------------------------------- /stackoverflow_login_with_cookies.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | from selenium import webdriver 4 | from selenium.webdriver import ChromeOptions 5 | 6 | 7 | def login(): 8 | options = ChromeOptions() 9 | options.add_argument("--start-maximized") 10 | options.add_argument("--incognito") 11 | options.add_argument("--disable-infobars") 12 | options.add_argument("--disable-extensions") 13 | options.add_experimental_option("useAutomationExtension", False) 14 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 15 | 16 | # create a chrome driver object with the options 17 | driver = webdriver.Chrome(options=options) 18 | 19 | # open the website 20 | driver.get("https://stackoverflow.com") 21 | 22 | # read the saved cookies from the JSON file 23 | with open('stackoverflow_cookies.json') as f: 24 | cookies = json.load(f) 25 | # load the cookies into the driver 26 | for cookie in cookies: 27 | driver.add_cookie(cookie) 28 | 29 | time.sleep(2) 30 | # refresh the browser so the cookies take effect 31 | driver.refresh() 32 | 33 | return driver 34 | 35 | 36 | if __name__ == '__main__': 37 | login() 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /stackoverflow_track.py: -------------------------------------------------------------------------------- 1 | import time 2 | import winsound 3 | from selenium.webdriver import Chrome, ChromeOptions 4 | from selenium.webdriver.common.by import By 5 | 6 | options = ChromeOptions() 7 | # open and maximize the screen 8 | options.add_argument("--start-maximized") 9 | # the two lines below disable the info bar 10 | options.add_experimental_option("useAutomationExtension", False) 11 | 
options.add_experimental_option("excludeSwitches", ["enable-automation"]) 12 | 13 | driver = Chrome(options=options) 14 | 15 | url_to_track = 'https://stackoverflow.com/search?tab=Newest&pagesize=15&q=web-scraping&searchOn=3' 16 | driver.get(url_to_track) 17 | 18 | questions = driver.find_elements(By.CSS_SELECTOR, 'div.s-post-summary.js-post-summary') 19 | 20 | try: 21 | top_of_list = questions[0].find_element(By.CSS_SELECTOR, 'div.s-post-summary--content') 22 | title = top_of_list.find_element(By.TAG_NAME, 'h3').text 23 | print(title) 24 | top_title = title 25 | 26 | flag = True 27 | time_now = time.time() 28 | while flag: 29 | q = driver.find_elements(By.CSS_SELECTOR, 'div.s-post-summary.js-post-summary')[0] 30 | ti = q.find_element(By.CSS_SELECTOR, 'div.s-post-summary--content').find_element(By.TAG_NAME, 'h3').text 31 | # print(ti) 32 | cat = q.find_element(By.CSS_SELECTOR, 'div.s-post-summary--content').find_element(By.CSS_SELECTOR, 33 | 'h3>span').get_attribute( 34 | 'title') 35 | # print(cat) 36 | tg = [tag.text for tag in 37 | q.find_element(By.CSS_SELECTOR, 'div.s-post-summary--content').find_element(By.CSS_SELECTOR, 38 | 'div.s-post-summary--meta').find_element( 39 | By.CSS_SELECTOR, 'ul.ml0.list-ls-none.js-post-tag-list-wrapper.d-inline').find_elements(By.TAG_NAME, 40 | 'li')] 41 | # print(tg) 42 | 43 | if ti != top_title and cat =='Question' and 'python' in tg: 44 | # winsound.Beep(frequency=350, duration=1000) 45 | print(f"new post arrives") 46 | winsound.PlaySound('delightful-4.wav', winsound.SND_FILENAME) 47 | flag = False 48 | 49 | # refresh the browser every 2 minutes 50 | if time.time() > time_now+120: 51 | driver.refresh() 52 | time_now = time.time() 53 | 54 | except IndexError as e: 55 | print(e) 56 | time.sleep(10) 57 | driver.quit() 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /store_pagination_element_to_click.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver import ChromeOptions, Keys 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | import json 7 | import time 8 | 9 | options = ChromeOptions() 10 | 11 | # maximized and disable forbar 12 | options.add_argument("--start-maximized") 13 | options.add_experimental_option("useAutomationExtension", False) 14 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 15 | 16 | driver = webdriver.Chrome(options=options) 17 | 18 | url = "https://www.google.com/search?q=toi" 19 | 20 | 21 | driver.get(url) 22 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 23 | 24 | pages = driver.find_element(by=By.CLASS_NAME, value="AaVjTc").find_element(by=By.TAG_NAME, value='tr').find_elements(by=By.TAG_NAME, value='td') 25 | 26 | lst = [page.find_element(by=By.TAG_NAME, value='a') for page in pages[2:]] 27 | 28 | print(lst) 29 | print(len(lst)) 30 | lst[2].click() 31 | time.sleep(5) 32 | 33 | driver.quit() -------------------------------------------------------------------------------- /sustainalytics_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 21, 2023 5 | """ 6 | import time 7 | from bs4 import BeautifulSoup 8 | from selenium.webdriver import Chrome, ChromeOptions 9 | from selenium.webdriver.common.by import 
By 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support import expected_conditions as EC 12 | 13 | options = ChromeOptions() 14 | options.add_argument('--start-maximized') 15 | 16 | driver = Chrome(options=options) 17 | wait = WebDriverWait(driver, 10) 18 | url = "https://www.sustainalytics.com/esg-ratings" 19 | driver.get(url) 20 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a#hs-eu-confirmation-button'))).click() 21 | 22 | data = [] 23 | 24 | 25 | def data_processing(source): 26 | soup = BeautifulSoup(source, "html.parser") 27 | selected_page = soup.select_one('span.pagination-page.selected').text 28 | print(f"---------------------- This is page {selected_page} ----------------------") 29 | 30 | container = soup.select_one('section#company_ratings') 31 | company_rows = container.find_all(class_='company-row') 32 | 33 | for company_row in company_rows: 34 | company_name = company_row.find(class_='primary-color').get_text() 35 | esg_risk_rating = company_row.find(class_='col-2').get_text() 36 | 37 | print(f"Company: {company_name} | Rating: {esg_risk_rating}") 38 | data.append({"Company": company_name, "Rating": esg_risk_rating}) 39 | 40 | 41 | def first_page(): 42 | # process the 1st page 43 | data_processing(driver.page_source) 44 | return f"data:\n{data}" 45 | 46 | 47 | def multiple_page(page_num): 48 | # process the first page 49 | data_processing(driver.page_source) 50 | 51 | # click and process next pages 52 | for i in range(2, page_num+1): 53 | driver.execute_script(f""" 54 | function getElementByXpath(path) {{ 55 | return document.evaluate(path, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; 56 | }}; 57 | getElementByXpath('//*[@id="victor-pagination"]/a[@class="pagination-page" and text()="{i}"]').click(); 58 | """) 59 | 60 | time.sleep(2) 61 | data_processing(driver.page_source) 62 | 63 | return f"data:\n{data}" 64 | 65 | 66 | if __name__ == '__main__': 67 | # print(first_page()) 68 | print(multiple_page(4)) 69 | 70 | """ 71 | output: 72 | 73 | ---------------------- This is page 1 ---------------------- 74 | Company: 1-800-FLOWERS.COM, Inc. | Rating: 23.6 75 | Company: 1&1 AG | Rating: 22.2 76 | Company: 10X Genomics, Inc. | Rating: 22.6 77 | Company: 11 Bit Studios SA | Rating: 16.3 78 | Company: 1Life Healthcare, Inc. | Rating: 22.5 79 | Company: 1st Source Corp. | Rating: 31.7 80 | Company: 1stdibs.com, Inc. | Rating: 26.7 81 | Company: 22nd Century Group, Inc. | Rating: 35.4 82 | Company: 23andMe Holding Co. | Rating: 25.6 83 | Company: 29metals Ltd. | Rating: 42.8 84 | ---------------------- This is page 2 ---------------------- 85 | Company: 2i Rete Gas SpA | Rating: 25.2 86 | Company: 2seventy Bio, Inc. | Rating: 32.0 87 | Company: 2U, Inc. | Rating: 26.8 88 | Company: 360 DigiTech, Inc. | Rating: 28.4 89 | Company: 360 One Wam Ltd. | Rating: 33.3 90 | Company: 360 Security Technology, Inc. | Rating: 23.1 91 | Company: 361 Degrees International Ltd. | Rating: 18.6 92 | Company: 37 Interactive Entertainment Network Technology Group Co. Ltd. | Rating: 14.3 93 | Company: 3D Systems Corp. | Rating: 23.0 94 | Company: 3i Group Plc | Rating: 11.1 95 | ---------------------- This is page 3 ---------------------- 96 | Company: 3M Co. | Rating: 33.9 97 | Company: 3M India Ltd. | Rating: 23.4 98 | Company: 3R Petroleum Óleo e Gás SA | Rating: 56.7 99 | Company: 3SBio, Inc. | Rating: 27.1 100 | Company: 407 East Development Group GP | Rating: 45.7 101 | Company: 407 International, Inc. 
| Rating: 11.4 102 | Company: 4D Molecular Therapeutics, Inc. | Rating: 28.4 103 | Company: 4imprint Group Plc | Rating: 17.2 104 | Company: 5E Advanced Materials, Inc. | Rating: 42.0 105 | Company: 5I5J Holding Group Co. Ltd. | Rating: 15.0 106 | ---------------------- This is page 4 ---------------------- 107 | Company: 7-Eleven Malaysia Holdings Bhd. | Rating: 24.6 108 | Company: 7-Eleven, Inc. | Rating: 35.1 109 | Company: 888 Holdings Plc | Rating: 18.7 110 | Company: 8x8, Inc. | Rating: 29.9 111 | Company: 908 Devices, Inc. | Rating: 36.8 112 | Company: 91APP, Inc. | Rating: 25.8 113 | Company: A-Living Smart City Services Co., Ltd. | Rating: 9.3 114 | Company: A-Mark Precious Metals, Inc. | Rating: 30.3 115 | Company: A. O. Smith Corp. | Rating: 25.4 116 | Company: A.G. BARR Plc | Rating: 23.7 117 | data: 118 | [{'Company': '1-800-FLOWERS.COM, Inc.', 'Rating': '23.6'}, {'Company': '1&1 AG', 'Rating': '22.2'}, {'Company': '10X Genomics, Inc.', 'Rating': '22.6'}, {'Company': '11 Bit Studios SA', 'Rating': '16.3'}, {'Company': '1Life Healthcare, Inc.', 'Rating': '22.5'}, {'Company': '1st Source Corp.', 'Rating': '31.7'}, {'Company': '1stdibs.com, Inc.', 'Rating': '26.7'}, {'Company': '22nd Century Group, Inc.', 'Rating': '35.4'}, {'Company': '23andMe Holding Co.', 'Rating': '25.6'}, {'Company': '29metals Ltd.', 'Rating': '42.8'}, {'Company': '2i Rete Gas SpA', 'Rating': '25.2'}, {'Company': '2seventy Bio, Inc.', 'Rating': '32.0'}, {'Company': '2U, Inc.', 'Rating': '26.8'}, {'Company': '360 DigiTech, Inc.', 'Rating': '28.4'}, {'Company': '360 One Wam Ltd.', 'Rating': '33.3'}, {'Company': '360 Security Technology, Inc.', 'Rating': '23.1'}, {'Company': '361 Degrees International Ltd.', 'Rating': '18.6'}, {'Company': '37 Interactive Entertainment Network Technology Group Co. Ltd.', 'Rating': '14.3'}, {'Company': '3D Systems Corp.', 'Rating': '23.0'}, {'Company': '3i Group Plc', 'Rating': '11.1'}, {'Company': '3M Co.', 'Rating': '33.9'}, {'Company': '3M India Ltd.', 'Rating': '23.4'}, {'Company': '3R Petroleum Óleo e Gás SA', 'Rating': '56.7'}, {'Company': '3SBio, Inc.', 'Rating': '27.1'}, {'Company': '407 East Development Group GP', 'Rating': '45.7'}, {'Company': '407 International, Inc.', 'Rating': '11.4'}, {'Company': '4D Molecular Therapeutics, Inc.', 'Rating': '28.4'}, {'Company': '4imprint Group Plc', 'Rating': '17.2'}, {'Company': '5E Advanced Materials, Inc.', 'Rating': '42.0'}, {'Company': '5I5J Holding Group Co. Ltd.', 'Rating': '15.0'}, {'Company': '7-Eleven Malaysia Holdings Bhd.', 'Rating': '24.6'}, {'Company': '7-Eleven, Inc.', 'Rating': '35.1'}, {'Company': '888 Holdings Plc', 'Rating': '18.7'}, {'Company': '8x8, Inc.', 'Rating': '29.9'}, {'Company': '908 Devices, Inc.', 'Rating': '36.8'}, {'Company': '91APP, Inc.', 'Rating': '25.8'}, {'Company': 'A-Living Smart City Services Co., Ltd.', 'Rating': '9.3'}, {'Company': 'A-Mark Precious Metals, Inc.', 'Rating': '30.3'}, {'Company': 'A. O. Smith Corp.', 'Rating': '25.4'}, {'Company': 'A.G. 
BARR Plc', 'Rating': '23.7'}] 119 | """ 120 | 121 | """ 122 | reference: 123 | https://stackoverflow.com/questions/76513303/scraping-a-website-for-multiple-pages-that-url-does-not-c 124 | """ -------------------------------------------------------------------------------- /switching_bw_windows.py: -------------------------------------------------------------------------------- 1 | import time 2 | from selenium import webdriver 3 | from selenium.webdriver import ChromeOptions 4 | from selenium.webdriver.common.by import By 5 | 6 | options = ChromeOptions() 7 | 8 | options.add_argument("--start-maximized") 9 | options.add_experimental_option("useAutomationExtension", False) 10 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 11 | 12 | driver = webdriver.Chrome(options=options) 13 | 14 | url = "http://www.hamiltoncountyherald.com/PublicNotices.aspx" 15 | 16 | 17 | def scrape_data(): 18 | # Create a list of the label ids of the data you want to scrape 19 | labels = ["lbl1", "lbl2", "lbl3", "lbl4", "lbl5", "lbl6", "lbl7", "lbl8", "lbl9", "lbl10", "lbl11"] 20 | 21 | # Empty list to append data values to 22 | list_of_data = [] 23 | 24 | # Loop through the list and append the text value of each label 25 | for items in labels: 26 | link = driver.find_element("id", items) 27 | link_label = link.text 28 | list_of_data.append(link_label) 29 | 30 | # Create a list of titles to use as dict keys 31 | # (note: labels has 11 entries but titles only 10, so zip() silently drops the last value) 32 | titles = ["Borrower", "Address", "Original Trustee", "Attorney", "Instrumental No.", "Substitute Trustee", 33 | "Advertised Auction Date", "Date of First Public Notice", "Trust Date", "DR No."] 34 | 35 | # Zip the titles and label values together into one dict 36 | zipped_data = dict(zip(titles, list_of_data)) 37 | 38 | return zipped_data 39 | 40 | 41 | driver.get(url) 42 | tables = driver.find_elements(By.TAG_NAME, 'table')[0] 43 | foreclosure_table = tables.find_elements(By.TAG_NAME, 'table')[7] 44 | views = foreclosure_table.find_elements(By.TAG_NAME, 'tr')[1:] 45 | 46 | final_data = [] 47 | for view in views: 48 | # Store the current window handle 49 | win_handle_before = driver.current_window_handle 50 | 51 | # Perform the click operation that opens a new window 52 | view.find_element(By.TAG_NAME, 'a').click() 53 | time.sleep(2) 54 | 55 | # Switch to the newly opened window 56 | for win_handle in driver.window_handles: 57 | driver.switch_to.window(win_handle) 58 | 59 | # Perform the actions on the new window 60 | final_data.append(scrape_data()) 61 | 62 | # Close the new window once it is no longer required 63 | driver.close() 64 | 65 | # Switch back to the original browser (first window) 66 | driver.switch_to.window(win_handle_before) 67 | time.sleep(2) 68 | 69 | print(final_data) 70 | -------------------------------------------------------------------------------- /switching_bw_windows_excercise_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Dart Korea 4 | Date : July 10, 2023 5 | """ 6 | 7 | from selenium.webdriver import Chrome, Keys 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.select import Select 10 | from selenium.webdriver.support.wait import WebDriverWait 11 | import selenium.webdriver.support.expected_conditions as EC 12 | import time 13 | 14 | driver = Chrome() 15 | 16 | 17 | url = "https://dart.fss.or.kr/dsab007/main.do" 18 | driver.get(url) 19 | 20 | # select the search-by option '회사명' (company name) 21 | state = driver.find_element(By.ID, 'option') 22 | nsw = Select(state) 23 | nsw.select_by_visible_text('회사명') 24 | 25 | search = 
driver.find_element(By.ID, 'textCrpNm') 26 | search.send_keys('조이푸드')  # the company name to search ("Joy Food") 27 | search.send_keys(Keys.ENTER) 28 | 29 | 30 | # Store the current window handle 31 | win_handle_before = driver.current_window_handle 32 | 33 | # Click the link titled "감사보고서 공시뷰어 새창" (audit report disclosure viewer, new window), which opens a new window 34 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[title="감사보고서 공시뷰어 새창"]'))).click() 35 | time.sleep(5) 36 | 37 | # Switch to the newly opened window 38 | driver.switch_to.window(driver.window_handles[1]) 39 | 40 | # Perform the actions on the new window 41 | con = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'ul[class="jstree-children"]'))) 42 | con.find_elements(By.TAG_NAME, 'li')[-1].find_element(By.TAG_NAME, 'a').click() 43 | 44 | # do the page parsing; the heading "1. 회사의 개요" means "1. Company overview" 45 | driver.switch_to.frame(driver.find_element(By.ID, 'ifrm')) 46 | overview = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//p[text()="1. 회사의 개요"]'))) 47 | print(overview.text) 48 | 49 | # Close the new window once it is no longer required 50 | driver.close() 51 | 52 | # Switch back to the original browser (first window) 53 | driver.switch_to.window(win_handle_before) 54 | -------------------------------------------------------------------------------- /text_option_under_select.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 22, 2023 5 | """ 6 | from selenium.webdriver import Chrome 7 | from selenium.webdriver.support.ui import Select 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.wait import WebDriverWait 10 | import selenium.webdriver.support.expected_conditions as EC 11 | 12 | driver = Chrome() 13 | url = "https://vahan.parivahan.gov.in/vahan4dashboard/vahan/view/reportview.xhtml" 14 | driver.get(url) 15 | dropdown_element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "j_idt31_input"))) 16 | dropdown = Select(dropdown_element) 17 | option_names = [option.get_attribute('innerHTML') for option in dropdown.options] 18 | print(len(option_names)) 19 | print(option_names) 20 | 21 | 22 | """ 23 | output: 24 | 25 | 35 26 | ['All Vahan4 Running States (34/36)', 'Andaman & Nicobar Island(8)', 'Andhra Pradesh(80)', 'Arunachal Pradesh(26)', 'Assam(36)', 'Bihar(49)', 'Chhattisgarh(30)', 'Chandigarh(1)', 'UT of DNH and DD(3)', 'Delhi(23)', 'Goa(13)', 'Gujarat(37)', 'Himachal Pradesh(113)', 'Haryana(179)', 'Jharkhand(30)', 'Jammu and Kashmir(21)', 'Karnataka(68)', 'Kerala(87)', 'Ladakh(3)', 'Maharashtra(53)', 'Meghalaya(13)', 'Manipur(12)', 'Madhya Pradesh(52)', 'Mizoram(10)', 'Nagaland(9)', 'Odisha(39)', 'Punjab(93)', 'Puducherry(8)', 'Rajasthan(142)', 'Sikkim(8)', 'Tamil Nadu(146)', 'Tripura(9)', 'Uttarakhand(21)', 'Uttar Pradesh(78)', 'West Bengal(56)'] 27 | 28 | reference: 29 | https://stackoverflow.com/questions/76528109/how-do-i-obtain-a-list-of-values-from-a-website-dropdown-using-selenium 30 | """ -------------------------------------------------------------------------------- /tiktok_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : TikTok video's view count 3 | Author : Ajeet 4 | Date : July 19, 2023 5 | """ 6 | 7 | import time 8 | import json 9 | from selenium.webdriver import Chrome 10 | from selenium.webdriver.chrome.service import Service 11 | from webdriver_manager.chrome import ChromeDriverManager 12 | from selenium.webdriver.common.by import By 13 | from 
selenium.webdriver.support.wait import WebDriverWait 14 | import selenium.webdriver.support.expected_conditions as EC 15 | 16 | 17 | def save_view_counts(urls, filename): 18 | data = {} 19 | driver = Chrome(service=Service(ChromeDriverManager().install())) 20 | 21 | for url in urls: 22 | 23 | driver.get(url) 24 | recent_videos = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'strong[data-e2e="video-views"]'))) 25 | print(f"number of recent videos: {len(recent_videos)}") 26 | data[url] = [i.get_attribute('innerHTML') for i in recent_videos] 27 | 28 | time.sleep(3) # delay between requests 29 | 30 | driver.quit() 31 | print(data) 32 | 33 | # save the data 34 | with open(filename, 'w') as f: 35 | f.write(json.dumps(data, indent=4)) 36 | 37 | # URLs to scrape 38 | urls = [ 39 | 'https://www.tiktok.com/@netflix', 40 | 'https://www.tiktok.com/@twitter' 41 | ] 42 | 43 | save_view_counts(urls, 'views.txt') 44 | 45 | 46 | """ 47 | output: 48 | 49 | number of recent videos: 34 50 | number of recent videos: 23 51 | {'https://www.tiktok.com/@netflix': ['99.7K', '136.7K', '27.6K', '18.1K', '12.8K', '7670', '87K', '15.8K', '14.5K', '102.1K', '25.7K', '203.2K', '4.1M', '43K', '32.9K', '101.5K', '2.3M', '233K', '440.9K', '92.4K', '25.9K', '53.3K', '33.3K', '449.5K', '92K', '53.2K', '215.5K', '32.1K', '1.6M', '415K', '224K', '319.1K', '469.8K', '420.1K'], 'https://www.tiktok.com/@twitter': ['361.4K', '138.5K', '54.4K', '169.3K', '67.6K', '90.4K', '4.6M', '115.4K', '48.4K', '45.6K', '73K', '223.8K', '107K', '11.8M', '155.7K', '100K', '1.4M', '94.6K', '55.3K', '67.4K', '48K', '40.7K', '40.4K']} 52 | 53 | A few things to note: 54 | 55 | - we can locate the view-count element directly with the CSS selector strong[data-e2e="video-views"] 56 | - to get the view-count text, use i.get_attribute('innerHTML') instead of i.text 57 | 58 | reference: 59 | https://stackoverflow.com/questions/76716861/bulk-scraping-tiktok-view-count-from-20-most-recent-posts 60 | """ -------------------------------------------------------------------------------- /tiktok_com_video_post.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : TikTok video post 3 | Author : Ajeet 4 | Date : April 30, 2025 5 | 6 | Description: 7 | This script automates the process of uploading a video to TikTok using Selenium. 8 | It loads a set of saved cookies to bypass the login, navigates to the upload section, 9 | selects a video from the local system, and posts it. 10 | """ 11 | 12 | import json 13 | from time import sleep 14 | from selenium.webdriver import Chrome, ChromeOptions 15 | from selenium.webdriver.common.by import By 16 | from selenium.webdriver.support.wait import WebDriverWait 17 | import selenium.webdriver.support.expected_conditions as EC 18 | 19 | 20 | def upload_video_to_tiktok(video_path: str, cookie_file: str = 'tiktok_cookies.json') -> None: 21 | """ 22 | Uploads a video to TikTok using Selenium automation. 23 | 24 | Args: 25 | video_path (str): Full path to the video file to upload. 26 | cookie_file (str): Path to the JSON file containing TikTok session cookies. 
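Returns:
    None

Note: the cookie file is assumed to hold cookies exported earlier from a logged-in TikTok session
(for instance via a cookie-export tool or driver.get_cookies()); the exact source of the file is an
assumption, and any export containing the fields read below works.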
27 | """ 28 | # ===== SETUP CHROME OPTIONS ===== 29 | options = ChromeOptions() 30 | options.add_argument('--start-maximized') 31 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 32 | 33 | # Initialize driver and wait 34 | driver = Chrome(options=options) 35 | wait = WebDriverWait(driver, 10) 36 | url = "https://www.tiktok.com/" 37 | 38 | try: 39 | # Step 1: Open TikTok 40 | driver.get(url) 41 | 42 | # Step 2: Load cookies 43 | with open(cookie_file) as f: 44 | cookies = json.load(f) 45 | 46 | for cookie in cookies: 47 | driver.add_cookie({ 48 | "domain": cookie['domain'], 49 | "value": cookie['value'], 50 | "id": cookie['id'], 51 | "name": cookie['name'] 52 | }) 53 | 54 | # Step 3: Reload with authenticated session 55 | sleep(2) 56 | driver.get(url) 57 | 58 | # Step 4: Click Upload button 59 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button[aria-label="Upload"]'))).click() 60 | 61 | # Step 5: Upload the video file 62 | wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type="file"]'))).send_keys(video_path) 63 | 64 | # Step 6: Click Post (enabled) button 65 | wait.until(EC.presence_of_element_located(( 66 | By.XPATH, '//button[(@data-e2e="post_video_button") and (@aria-disabled="false")]' 67 | ))).click() 68 | 69 | print("Video upload initiated successfully.") 70 | sleep(5) 71 | 72 | except Exception as e: 73 | print(f"An error occurred during upload: {e}") 74 | finally: 75 | driver.quit() 76 | 77 | 78 | # Example usage 79 | if __name__ == "__main__": 80 | upload_video_to_tiktok("D:\\IMG_4070.mp4") 81 | 82 | """ 83 | reference: https://stackoverflow.com/a/79599064/11179336 84 | 85 | """ -------------------------------------------------------------------------------- /tiktok_video_post.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Help-the-community/Web_Scraping_with_Selenium/83d65976e29668ac62cd8edf50511f66ff00084e/tiktok_video_post.gif -------------------------------------------------------------------------------- /transat_com.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from selenium import webdriver 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.webdriver.chrome.options import Options 7 | 8 | 9 | def main(): 10 | options = Options() 11 | options.add_argument('--start-maximized') 12 | options.add_argument("--disable-notifications") 13 | options.add_argument("--disable-popup-blocking") 14 | 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | options.add_experimental_option("useAutomationExtension", False) 17 | 18 | driver = webdriver.Chrome(options=options) 19 | wait = WebDriverWait(driver, 5) 20 | 21 | driver.get(f"https://www.transat.com/fr-CA?search=package") 22 | 23 | wait.until(EC.presence_of_element_located((By.ID, 'FROMSEARCH'))).click() 24 | sleep(2) 25 | driver.find_element(By.CSS_SELECTOR, '#YUL-FROMSEARCH > span.code').click() 26 | 27 | wait.until(EC.presence_of_element_located((By.ID, 'TOSEARCH'))).click() 28 | sleep(2) 29 | driver.find_element(By.CSS_SELECTOR, '#City-13-TOSEARCH > div > span.name').click() 30 | sleep(2) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /twitter_login.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Project : Twitter Login 3 | Author : Ajeet 4 | Date : August 7, 2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | from selenium.webdriver import ChromeOptions, Keys 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support import expected_conditions as EC 12 | from selenium.webdriver.support.wait import WebDriverWait 13 | 14 | 15 | def login_twitter(username: str, password: str) -> None: 16 | """ 17 | Log in to Twitter using the provided username and password. 18 | 19 | This function automates the login process on Twitter using Selenium WebDriver. 20 | It opens the Twitter login page, enters the provided username and password, and submits the form. 21 | 22 | Parameters: 23 | username (str): The Twitter username to log in with. 24 | password (str): The Twitter password for the specified username. 25 | 26 | Returns: 27 | None 28 | """ 29 | options = ChromeOptions() 30 | options.add_argument("--start-maximized") 31 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 32 | driver = webdriver.Chrome(options=options) 33 | 34 | # Open the Twitter login page 35 | url = "https://twitter.com/i/flow/login" 36 | driver.get(url) 37 | 38 | # Find and input the username 39 | username_input = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[autocomplete="username"]'))) 40 | username_input.send_keys(username) 41 | username_input.send_keys(Keys.ENTER) 42 | 43 | # Find and input the password 44 | password_input = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input[name="password"]'))) 45 | password_input.send_keys(password) 46 | password_input.send_keys(Keys.ENTER) 47 | 48 | # Wait for a short period (e.g., 10 seconds) to ensure the login process completes 49 | time.sleep(10) 50 | 51 | 52 | if __name__ == "__main__": 53 | your_username = "your_twitter_username_here" 54 | your_password = "your_twitter_password_here" 55 | 56 | # Call the login_twitter function with your Twitter credentials 57 | login_twitter(your_username, your_password) 58 | -------------------------------------------------------------------------------- /usa_visa_com.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : 3 | Author : Ajeet 4 | Date : June 9, 2023 5 | """ 6 | from selenium import webdriver 7 | from selenium.webdriver.common.by import By 8 | from selenium.webdriver.support.wait import WebDriverWait 9 | from selenium.webdriver.support import expected_conditions as EC 10 | from selenium.webdriver.common.keys import Keys 11 | from time import sleep 12 | 13 | 14 | driver = webdriver.Chrome() 15 | driver.get('https://usa.visa.com/support/consumer/travel-support/exchange-rate-calculator.html') 16 | wait = WebDriverWait(driver, 30) 17 | 18 | # click to Accept 19 | wait.until(EC.element_to_be_clickable((By.XPATH, "//a[text()='Accept']"))).click() 20 | 21 | shadow_root = driver.find_element(By.XPATH, "//dm-calculator").shadow_root 22 | # enter_amount 23 | shadow_root.find_element(By.ID, "input_amount_paid").send_keys("1") 24 | 25 | # from_dropdown 26 | shadow_root.find_element(By.ID, "autosuggestinput_from").click() 27 | shadow_root.find_element(By.ID, "listbox-item-157").click() 28 | 29 | # to_dropdown 30 | shadow_root.find_element(By.ID, "autosuggestinput_to").click() 31 | shadow_root.find_element(By.ID, "listbox-item-0").click() 32 | 33 | # 
fee_edit: open the editor for the bank fee 34 | shadow_root.find_element(By.CLASS_NAME, 'vs-link-cta.vs-calculator-edit-link').click() 35 | 36 | bank_rate = shadow_root.find_element(By.ID, "input_bank_rate") 37 | # select the existing value, clear it, and type 0 38 | bank_rate.send_keys(Keys.CONTROL, 'a') 39 | bank_rate.send_keys(Keys.BACKSPACE) 40 | bank_rate.send_keys('0') 41 | 42 | # click on the Calculate Conversion button 43 | shadow_root.find_elements(By.CSS_SELECTOR, 'div.vs-container')[-1].find_elements(By.TAG_NAME, 'button')[0].click() 44 | sleep(2) 45 | 46 | 47 | -------------------------------------------------------------------------------- /wallet_polygon_technology.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Wallet Polygon Technology 3 | Author : Ajeet 4 | Date : July 12, 2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | from selenium.webdriver import ChromeOptions 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support import expected_conditions as EC 12 | from selenium.webdriver.support.wait import WebDriverWait 13 | 14 | options = ChromeOptions() 15 | options.add_argument("--start-maximized") 16 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 17 | 18 | driver = webdriver.Chrome(options=options) 19 | wait = WebDriverWait(driver, 10) 20 | url = "https://wallet.polygon.technology/?redirectOnConnect=zkEVM_bridge" 21 | 22 | driver.get(url) 23 | # click on the "Connect to a Wallet" button 24 | wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.navbar__apps-section__auth__login"))).click() 25 | time.sleep(2) 26 | driver.execute_script("""document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-connect-wallet-view').shadowRoot.querySelector('w3m-desktop-wallet-selection').shadowRoot.querySelector('w3m-modal-footer').querySelectorAll('w3m-wallet-button')[0].shadowRoot.querySelector('button').click();""") 27 | time.sleep(5) 28 | 29 | """ 30 | - Various elements on this website are embedded inside a shadow-root. 31 | - For example, the target/desired button here is embedded five nested shadow roots deep. 32 | - After clicking on Connect to a Wallet, we wait for 1-2 seconds just to make sure that the overlay window is 33 | visibly present, although it appears very quickly. 34 | - The JavaScript query used to locate and click on the desired button: 35 | 36 | document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-connect-wallet-view').shadowRoot.querySelector('w3m-desktop-wallet-selection').shadowRoot.querySelector('w3m-modal-footer').querySelectorAll('w3m-wallet-button')[0].shadowRoot.querySelector('button').click(); 37 | 38 | clicks on the very first wallet. If you would like to click on the 2nd or 3rd wallet option instead, simply replace 39 | the querySelectorAll('w3m-wallet-button')[0] with querySelectorAll('w3m-wallet-button')[1] or 40 | querySelectorAll('w3m-wallet-button')[2] respectively in the above-mentioned JavaScript query. 
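- A hedged generalization of the same technique (a sketch; it assumes the same w3m-* tag names as the chain above):
  instead of one long hard-coded chain, a small script can walk a list of selectors and descend into each element's
  shadowRoot whenever one exists:

      driver.execute_script('''
          let node = document;
          for (const sel of arguments[0]) {
              const el = node.querySelector(sel);
              if (!el) return;              // selector not found, give up
              node = el.shadowRoot || el;   // pierce the shadow root when present
          }
          node.click();                     // the last selector targets the plain <button>
      ''', ["w3m-modal", "w3m-modal-router", "w3m-connect-wallet-view",
            "w3m-desktop-wallet-selection", "w3m-modal-footer",
            "w3m-wallet-button", "button"])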
41 | 42 | reference: 43 | https://stackoverflow.com/questions/76658230/selenium-how-to-get-element-in-shadow-root-of-html-page-code 44 | """ -------------------------------------------------------------------------------- /wallet_sendit_arcana_network.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : https://sendit.arcana.network/app/login 3 | Author : Ajeet 4 | Date : August 18, 2023 5 | """ 6 | 7 | import time 8 | from selenium import webdriver 9 | from selenium.webdriver import ChromeOptions 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support import expected_conditions as EC 12 | from selenium.webdriver.support.wait import WebDriverWait 13 | 14 | options = ChromeOptions() 15 | options.add_argument("--start-maximized") 16 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 17 | 18 | driver = webdriver.Chrome(options=options) 19 | wait = WebDriverWait(driver, 20) 20 | driver.get(url="https://sendit.arcana.network/app/login") 21 | 22 | # Click on the "Connect Wallet" button on the page 23 | wait.until(EC.element_to_be_clickable((By.XPATH, '//span[text()=" Connect Wallet "]'))).click() 24 | time.sleep(2) 25 | 26 | # Click on the "View All" to see all wallet options 27 | driver.execute_script( 28 | """document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-connect-wallet-view').shadowRoot.querySelector('w3m-desktop-wallet-selection').shadowRoot.querySelector('w3m-modal-footer').querySelector('div.w3m-grid').querySelector('w3m-view-all-wallets-button').shadowRoot.querySelector('button').click();""") 29 | 30 | time.sleep(2) 31 | # Click on the "MetaMask" wallet option 32 | driver.execute_script( 33 | """document.querySelector('w3m-modal').shadowRoot.querySelector('w3m-modal-router').shadowRoot.querySelector('w3m-wallet-explorer-view').shadowRoot.querySelector('div.w3m-grid').querySelector('[name="MetaMask"]').shadowRoot.querySelector('button').click();""") 34 | 35 | time.sleep(2) 36 | 37 | """ 38 | reference: 39 | https://stackoverflow.com/questions/76922866/how-to-authorise-in-walletconnect-using-python 40 | """ -------------------------------------------------------------------------------- /yomiuri_co_jp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Project : Yomiuri 3 | Author : Ajeet 4 | Date : July 10, 2023 5 | """ 6 | 7 | from selenium.webdriver import Chrome, ChromeOptions 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.wait import WebDriverWait 10 | import selenium.webdriver.support.expected_conditions as EC 11 | import time 12 | 13 | options = ChromeOptions() 14 | options.add_argument('--start-maximized') 15 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 16 | options.add_experimental_option("prefs", { 17 | "profile.default_content_setting_values.notifications": 2}) 18 | 19 | driver = Chrome(options=options) 20 | wait = WebDriverWait(driver, 10) 21 | driver.get('https://www.yomiuri.co.jp/editorial/') 22 | 23 | element = wait.until(EC.presence_of_element_located((By.ID, "ajax_more_button"))) 24 | 25 | count = len(wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'ul#latest_list>li[class="p-list-item "]')))) 26 | print(f"initial number of articles: {count}") 27 | 28 | while True: 29 | 30 | driver.execute_script("return arguments[0].click()", element) 31 | time.sleep(1) 32 | new_count = 
len(driver.find_elements(By.CSS_SELECTOR, 'ul#latest_list>li[class="p-list-item "]')) 33 | print(f"articles after clicking read/load more button: {new_count}") 34 | if new_count>count: 35 | count = new_count 36 | else: 37 | break 38 | 39 | if count==100: 40 | break 41 | 42 | news_articles = [i.find_element(By.TAG_NAME, 'a').get_attribute('href') for i in driver.find_elements(By.CSS_SELECTOR, 'ul#latest_list>li[class="p-list-item "]')] 43 | 44 | print(news_articles) 45 | print(f"Total articles {len(news_articles)}") 46 | 47 | """ 48 | output: 49 | initial number of articles: 20 50 | articles after clicking read/load more button: 30 51 | articles after clicking read/load more button: 40 52 | articles after clicking read/load more button: 50 53 | articles after clicking read/load more button: 60 54 | articles after clicking read/load more button: 70 55 | articles after clicking read/load more button: 80 56 | articles after clicking read/load more button: 90 57 | articles after clicking read/load more button: 100 58 | ['https://www.yomiuri.co.jp/editorial/20230707-OYT1T50249/', 'https://www.yomiuri.co.jp/editorial/20230707-OYT1T50246/', 'https://www.yomiuri.co.jp/editorial/20230706-OYT1T50325/', 'https://www.yomiuri.co.jp/editorial/20230706-OYT1T50322/', 'https://www.yomiuri.co.jp/editorial/20230705-OYT1T50244/', 'https://www.yomiuri.co.jp/editorial/20230705-OYT1T50241/', 'https://www.yomiuri.co.jp/editorial/20230704-OYT1T50239/', 'https://www.yomiuri.co.jp/editorial/20230704-OYT1T50236/', 'https://www.yomiuri.co.jp/editorial/20230703-OYT1T50226/', 'https://www.yomiuri.co.jp/editorial/20230703-OYT1T50223/', 'https://www.yomiuri.co.jp/editorial/20230702-OYT1T50218/', 'https://www.yomiuri.co.jp/editorial/20230702-OYT1T50215/', 'https://www.yomiuri.co.jp/editorial/20230701-OYT1T50247/', 'https://www.yomiuri.co.jp/editorial/20230701-OYT1T50244/', 'https://www.yomiuri.co.jp/editorial/20230630-OYT1T50265/', 'https://www.yomiuri.co.jp/editorial/20230630-OYT1T50259/', 'https://www.yomiuri.co.jp/editorial/20230629-OYT1T50195/', 'https://www.yomiuri.co.jp/editorial/20230629-OYT1T50192/', 'https://www.yomiuri.co.jp/editorial/20230628-OYT1T50240/', 'https://www.yomiuri.co.jp/editorial/20230628-OYT1T50237/', 'https://www.yomiuri.co.jp/editorial/20230627-OYT1T50257/', 'https://www.yomiuri.co.jp/editorial/20230627-OYT1T50254/', 'https://www.yomiuri.co.jp/editorial/20230626-OYT1T50297/', 'https://www.yomiuri.co.jp/editorial/20230626-OYT1T50292/', 'https://www.yomiuri.co.jp/editorial/20230625-OYT1T50191/', 'https://www.yomiuri.co.jp/editorial/20230625-OYT1T50188/', 'https://www.yomiuri.co.jp/editorial/20230624-OYT1T50186/', 'https://www.yomiuri.co.jp/editorial/20230624-OYT1T50183/', 'https://www.yomiuri.co.jp/editorial/20230623-OYT1T50305/', 'https://www.yomiuri.co.jp/editorial/20230623-OYT1T50302/', 'https://www.yomiuri.co.jp/editorial/20230623-OYT1T50083/', 'https://www.yomiuri.co.jp/editorial/20230623-OYT1T50070/', 'https://www.yomiuri.co.jp/editorial/20230621-OYT1T50273/', 'https://www.yomiuri.co.jp/editorial/20230621-OYT1T50270/', 'https://www.yomiuri.co.jp/editorial/20230620-OYT1T50203/', 'https://www.yomiuri.co.jp/editorial/20230620-OYT1T50200/', 'https://www.yomiuri.co.jp/editorial/20230619-OYT1T50253/', 'https://www.yomiuri.co.jp/editorial/20230619-OYT1T50250/', 'https://www.yomiuri.co.jp/editorial/20230618-OYT1T50138/', 'https://www.yomiuri.co.jp/editorial/20230618-OYT1T50135/', 'https://www.yomiuri.co.jp/editorial/20230617-OYT1T50290/', 'https://www.yomiuri.co.jp/editorial/20230617-OYT1T50287/', 
'https://www.yomiuri.co.jp/editorial/20230616-OYT1T50258/', 'https://www.yomiuri.co.jp/editorial/20230616-OYT1T50254/', 'https://www.yomiuri.co.jp/editorial/20230616-OYT1T50013/', 'https://www.yomiuri.co.jp/editorial/20230616-OYT1T50010/', 'https://www.yomiuri.co.jp/editorial/20230614-OYT1T50286/', 'https://www.yomiuri.co.jp/editorial/20230614-OYT1T50284/', 'https://www.yomiuri.co.jp/editorial/20230613-OYT1T50164/', 'https://www.yomiuri.co.jp/editorial/20230613-OYT1T50161/', 'https://www.yomiuri.co.jp/editorial/20230612-OYT1T50193/', 'https://www.yomiuri.co.jp/editorial/20230612-OYT1T50189/', 'https://www.yomiuri.co.jp/editorial/20230610-OYT1T50273/', 'https://www.yomiuri.co.jp/editorial/20230610-OYT1T50270/', 'https://www.yomiuri.co.jp/editorial/20230609-OYT1T50270/', 'https://www.yomiuri.co.jp/editorial/20230609-OYT1T50267/', 'https://www.yomiuri.co.jp/editorial/20230608-OYT1T50261/', 'https://www.yomiuri.co.jp/editorial/20230608-OYT1T50257/', 'https://www.yomiuri.co.jp/editorial/20230607-OYT1T50239/', 'https://www.yomiuri.co.jp/editorial/20230607-OYT1T50236/', 'https://www.yomiuri.co.jp/editorial/20230606-OYT1T50228/', 'https://www.yomiuri.co.jp/editorial/20230606-OYT1T50225/', 'https://www.yomiuri.co.jp/editorial/20230605-OYT1T50252/', 'https://www.yomiuri.co.jp/editorial/20230605-OYT1T50244/', 'https://www.yomiuri.co.jp/editorial/20230604-OYT1T50144/', 'https://www.yomiuri.co.jp/editorial/20230604-OYT1T50141/', 'https://www.yomiuri.co.jp/editorial/20230603-OYT1T50230/', 'https://www.yomiuri.co.jp/editorial/20230603-OYT1T50227/', 'https://www.yomiuri.co.jp/editorial/20230602-OYT1T50262/', 'https://www.yomiuri.co.jp/editorial/20230602-OYT1T50259/', 'https://www.yomiuri.co.jp/editorial/20230601-OYT1T50232/', 'https://www.yomiuri.co.jp/editorial/20230601-OYT1T50229/', 'https://www.yomiuri.co.jp/editorial/20230531-OYT1T50307/', 'https://www.yomiuri.co.jp/editorial/20230531-OYT1T50304/', 'https://www.yomiuri.co.jp/editorial/20230530-OYT1T50254/', 'https://www.yomiuri.co.jp/editorial/20230530-OYT1T50251/', 'https://www.yomiuri.co.jp/editorial/20230529-OYT1T50201/', 'https://www.yomiuri.co.jp/editorial/20230529-OYT1T50198/', 'https://www.yomiuri.co.jp/editorial/20230528-OYT1T50116/', 'https://www.yomiuri.co.jp/editorial/20230528-OYT1T50113/', 'https://www.yomiuri.co.jp/editorial/20230527-OYT1T50305/', 'https://www.yomiuri.co.jp/editorial/20230527-OYT1T50301/', 'https://www.yomiuri.co.jp/editorial/20230526-OYT1T50307/', 'https://www.yomiuri.co.jp/editorial/20230526-OYT1T50304/', 'https://www.yomiuri.co.jp/editorial/20230525-OYT1T50378/', 'https://www.yomiuri.co.jp/editorial/20230525-OYT1T50371/', 'https://www.yomiuri.co.jp/editorial/20230524-OYT1T50273/', 'https://www.yomiuri.co.jp/editorial/20230524-OYT1T50270/', 'https://www.yomiuri.co.jp/editorial/20230523-OYT1T50272/', 'https://www.yomiuri.co.jp/editorial/20230523-OYT1T50269/', 'https://www.yomiuri.co.jp/editorial/20230522-OYT1T50192/', 'https://www.yomiuri.co.jp/editorial/20230522-OYT1T50189/', 'https://www.yomiuri.co.jp/editorial/20230521-OYT1T50225/', 'https://www.yomiuri.co.jp/editorial/20230520-OYT1T50354/', 'https://www.yomiuri.co.jp/editorial/20230520-OYT1T50351/', 'https://www.yomiuri.co.jp/editorial/20230519-OYT1T50244/', 'https://www.yomiuri.co.jp/editorial/20230518-OYT1T50235/', 'https://www.yomiuri.co.jp/editorial/20230518-OYT1T50232/', 'https://www.yomiuri.co.jp/editorial/20230517-OYT1T50298/', 'https://www.yomiuri.co.jp/editorial/20230517-OYT1T50295/'] 59 | Total articles 100 60 | 61 | Few things to note: 62 | 63 | - as we 
load the home page, it initially contains 20 articles under the 最新ニュース (latest news) section. 64 | - every click on the さらに読み込む (load more) button loads 10 more articles, and so on. 65 | - as you may notice, to click on the desired button we used driver.execute_script("return arguments[0].click()", element) 66 | - there could be thousands of articles on the page. If you wish to load more, simply remove the if count==100: statement 67 | or raise the limit to load a given number of articles. Note that since every click loads 10 more articles, 68 | the value of the variable count will be a multiple of 10 starting from 20 (20, 30, 40, 50, and so on). 69 | 70 | reference: 71 | https://stackoverflow.com/questions/76643641/how-to-click-a-button-with-selenium-on-a-javascript-page 72 | """ --------------------------------------------------------------------------------