├── .DS_Store ├── .gitignore ├── README.md ├── api_scraper ├── .DS_Store ├── remote_jobs.xls └── remoteok_scraper.py ├── html_scraper ├── amazon_products_urls.csv ├── amazon_scraper.py ├── amazon_scraper_single.py └── output-01-08-2022.csv └── web_bot ├── .DS_Store ├── chromedriver ├── config.json └── trello_bot.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hussain-mustafa990/python_web_scraping_and_automation/15de5134b4d860d0c9c55b0b366faf7adca85fdd/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Python Web Scraping and Automation 2 | 3 |

Selenium and BeautifulSoup Based Web Scraping and Automation Scripts

4 |
9 | 10 |
11 | 12 | Selenium is primarily for automating web applications for testing purposes, but it is certainly not limited to just that. 13 | Boring web-based administration tasks can (and should) also be automated. 14 | Beautiful Soup is a Python library for pulling data out of HTML and XML files. It works with your favorite parser to provide idiomatic ways of navigating, searching, and modifying the parse tree. It commonly saves programmers hours or days of work. 15 |
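For a quick feel of how the two fit together, here is a minimal, illustrative sketch. It is not taken from the scripts in this repo: the URL and tags are placeholders, and it assumes Chrome plus a compatible chromedriver are available (newer Selenium releases can fetch one automatically via Selenium Manager).

```python
# Render a page with headless Chrome, then hand the HTML to Beautiful Soup.
# https://example.com and the tags below are placeholders only.
import bs4
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

driver.get('https://example.com')
soup = bs4.BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

print(soup.title.text.strip())       # navigate the parse tree
for link in soup.find_all('a'):      # search it
    print(link.get('href'))
```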

16 | 17 | ### Discord 18 | 19 | 20 | feed example 21 | 22 | 23 | ## License 24 | 25 | [![CC0](http://mirrors.creativecommons.org/presskit/buttons/88x31/svg/cc-zero.svg)](https://creativecommons.org/publicdomain/zero/1.0/) 26 | -------------------------------------------------------------------------------- /api_scraper/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hussain-mustafa990/python_web_scraping_and_automation/15de5134b4d860d0c9c55b0b366faf7adca85fdd/api_scraper/.DS_Store -------------------------------------------------------------------------------- /api_scraper/remote_jobs.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hussain-mustafa990/python_web_scraping_and_automation/15de5134b4d860d0c9c55b0b366faf7adca85fdd/api_scraper/remote_jobs.xls -------------------------------------------------------------------------------- /api_scraper/remoteok_scraper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import xlwt 3 | from xlwt import Workbook 4 | import smtplib 5 | from os.path import basename 6 | from email.mime.application import MIMEApplication 7 | from email.mime.multipart import MIMEMultipart 8 | from email.mime.text import MIMEText 9 | from email.utils import COMMASPACE, formatdate 10 | 11 | # https://myaccount.google.com/lesssecureapps 12 | BASE_URL = 'https://remoteok.com/api/' 13 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36' 14 | REQUEST_HEADER = { 15 | 'User-Agent': USER_AGENT, 16 | 'Accept-Language': 'en-US, en;q=0.5', 17 | } 18 | 19 | 20 | def get_job_postings(): 21 | res = requests.get(url=BASE_URL, headers=REQUEST_HEADER) 22 | return res.json() 23 | 24 | 25 | def output_jobs_to_xls(data): 26 | wb = Workbook() 27 | job_sheet = wb.add_sheet('Jobs') 28 | headers = list(data[0].keys()) 29 | for i in range(0, len(headers)): 30 | job_sheet.write(0, i, headers[i]) 31 | for i in range(0, len(data)): 32 | job = data[i] 33 | values = list(job.values()) 34 | for x in range(0, len(values)): 35 | job_sheet.write(i+1, x, values[x]) 36 | wb.save('remote_jobs.xls') 37 | 38 | 39 | def send_email(send_from, send_to, subject, text, files=None): 40 | assert isinstance(send_to, list) 41 | msg = MIMEMultipart() 42 | msg['From'] = send_from 43 | msg['To'] = COMMASPACE.join(send_to) 44 | msg['Date'] = formatdate(localtime=True) 45 | msg['Subject'] = subject 46 | 47 | msg.attach(MIMEText(text)) 48 | 49 | for f in files or []: 50 | with open(f, "rb") as fil: 51 | part = MIMEApplication(fil.read(), Name=basename(f)) 52 | part['Content-Disposition'] = f'attachment; filename="{basename(f)}"' 53 | msg.attach(part) 54 | 55 | smtp = smtplib.SMTP('smtp.gmail.com: 587') 56 | smtp.starttls() 57 | smtp.login(send_from, 'Enter Password Here') 58 | smtp.sendmail(send_from, send_to, msg.as_string()) 59 | smtp.close() 60 | 61 | 62 | if __name__ == "__main__": 63 | json = get_job_postings()[1:] 64 | output_jobs_to_xls(json) 65 | send_email('Enter Sending Email Here', ['contact@preneure.com'], 66 | 'Jobs Posting', 'Please, find attached a list of job posting to this email', 67 | files=['remote_jobs.xls']) 68 | -------------------------------------------------------------------------------- /html_scraper/amazon_products_urls.csv: -------------------------------------------------------------------------------- 1 | 
https://www.amazon.com/Hisense-Premium-65-Inch-Compatibility-65U8G/dp/B091XWTGXL/?th=1, 2 | https://www.amazon.com/dp/B08TKSMQSY/?th=1, 3 | https://www.amazon.com/dp/B08WJMQ5TG/?th=1, 4 | https://www.amazon.com/SAMSUNG-86-inch-Crystal-TU9010-Built/dp/B094C627M5/ -------------------------------------------------------------------------------- /html_scraper/amazon_scraper.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import requests 3 | import csv 4 | import bs4 5 | import concurrent.futures 6 | from tqdm import tqdm 7 | 8 | USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36" 9 | REQUEST_HEADER = { 10 | 'User-Agent': USER_AGENT, 11 | 'Accept-Language': 'en-US, en;q=0.5', 12 | } 13 | NO_THREADS = 10 14 | 15 | 16 | def get_page_html(url): 17 | res = requests.get(url=url, headers=REQUEST_HEADER) 18 | return res.content 19 | 20 | 21 | def get_product_price(soup): 22 | main_price_span = soup.find('span', attrs={ 23 | 'class': 'a-price a-text-price a-size-medium apexPriceToPay' 24 | }) 25 | price_spans = main_price_span.findAll('span') 26 | for span in price_spans: 27 | price = span.text.strip().replace('$', '').replace(',', '') 28 | try: 29 | return float(price) 30 | except ValueError: 31 | print("Value Obtained For Price Could Not Be Parsed") 32 | exit() 33 | 34 | 35 | def get_product_title(soup): 36 | product_title = soup.find('span', id='productTitle') 37 | return product_title.text.strip() 38 | 39 | 40 | def get_product_rating(soup): 41 | product_ratings_div = soup.find('div', attrs={ 42 | 'id': 'averageCustomerReviews' 43 | }) 44 | product_rating_section = product_ratings_div.find( 45 | 'i', attrs={'class': 'a-icon-star'}) 46 | product_rating_span = product_rating_section.find('span') 47 | try: 48 | rating = product_rating_span.text.strip().split() 49 | return float(rating[0]) 50 | except ValueError: 51 | print("Value Obtained For Rating Could Not Be Parsed") 52 | exit() 53 | 54 | 55 | def get_product_technical_details(soup): 56 | details = {} 57 | technical_details_section = soup.find('div', id='prodDetails') 58 | data_tables = technical_details_section.findAll( 59 | 'table', class_='prodDetTable') 60 | for table in data_tables: 61 | table_rows = table.findAll('tr') 62 | for row in table_rows: 63 | row_key = row.find('th').text.strip() 64 | row_value = row.find('td').text.strip().replace('\u200e', '') 65 | details[row_key] = row_value 66 | return details 67 | 68 | 69 | def extract_product_info(url, output): 70 | product_info = {} 71 | #print(f'Scraping URL: {url}') 72 | html = get_page_html(url=url) 73 | soup = bs4.BeautifulSoup(html, 'lxml') 74 | product_info['price'] = get_product_price(soup) 75 | product_info['title'] = get_product_title(soup) 76 | product_info['rating'] = get_product_rating(soup) 77 | product_info.update(get_product_technical_details(soup)) 78 | output.append(product_info) 79 | 80 | 81 | if __name__ == "__main__": 82 | products_data = [] 83 | urls = [] 84 | with open('amazon_products_urls.csv', newline='') as csvfile: 85 | urls = list(csv.reader(csvfile, delimiter=',')) 86 | with concurrent.futures.ThreadPoolExecutor(max_workers=NO_THREADS) as executor: 87 | for wkn in tqdm(range(0, len(urls))): 88 | executor.submit(extract_product_info, urls[wkn][0], products_data) 89 | output_file_name = 'output-{}.csv'.format( 90 | datetime.today().strftime("%m-%d-%Y")) 91 | with open(output_file_name, 'w') as outputfile: 92 | 
        writer = csv.writer(outputfile)
93 |         writer.writerow(products_data[0].keys())
94 |         for product in products_data:
95 |             writer.writerow(product.values())
96 | 
--------------------------------------------------------------------------------
/html_scraper/amazon_scraper_single.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import requests
3 | import csv
4 | import bs4
5 | 
6 | USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
7 | REQUEST_HEADER = {'User-Agent':
8 |                   USER_AGENT,
9 |                   'Accept-Language': 'en-US, en;q=0.5'}
10 | 
11 | 
12 | def get_page_html(url):
13 |     res = requests.get(url=url, headers=REQUEST_HEADER)
14 |     return res.content
15 | 
16 | 
17 | def get_product_price(soup):
18 |     main_price_span = soup.find('span', attrs={
19 |         'class': 'a-price a-text-price a-size-medium apexPriceToPay', })
20 |     price_spans = main_price_span.findAll('span')
21 |     for span in price_spans:
22 |         price = span.text.strip().replace('$', '').replace(',', '')
23 |         try:
24 |             return float(price)
25 |         except ValueError:
26 |             print("Value Obtained For Price Could Not Be Parsed")
27 |             exit()
28 | 
29 | 
30 | def get_product_title(soup):
31 |     product_title = soup.find('span', id='productTitle')
32 |     return product_title.text.strip()
33 | 
34 | 
35 | def get_product_rating(soup):
36 |     product_ratings_div = soup.find(
37 |         'div', attrs={'id': 'averageCustomerReviews'})
38 |     product_rating_section = product_ratings_div.find(
39 |         'i', attrs={'class': 'a-icon-star'})
40 |     product_rating_span = product_rating_section.find('span')
41 |     try:
42 |         rating = product_rating_span.text.strip().split()[0]
43 |         return float(rating)
44 |     except ValueError:
45 |         print("Value Obtained For Rating Could Not Be Parsed")
46 |         exit()
47 | 
48 | 
49 | def get_product_technical_details(soup):
50 |     details = {}
51 |     technical_details_section = soup.find(
52 |         'div', id='prodDetails')
53 |     data_tables = technical_details_section.findAll(
54 |         'table', class_='prodDetTable')
55 |     for table in data_tables:
56 |         table_rows = table.findAll('tr')
57 |         for row in table_rows:
58 |             row_key = row.find('th').text.strip()
59 |             row_value = row.find('td').text.strip().replace("\u200e", '')
60 |             details[row_key] = row_value
61 |     return details
62 | 
63 | 
64 | def extract_product_info(url):
65 |     product_info = {}
66 |     print(f'Scraping URL: {url}')
67 |     html = get_page_html(url)
68 |     soup = bs4.BeautifulSoup(html, 'lxml')
69 |     product_info['price'] = get_product_price(soup)
70 |     product_info['title'] = get_product_title(soup)
71 |     product_info['rating'] = get_product_rating(soup)
72 |     product_info.update(get_product_technical_details(soup))
73 |     return product_info
74 | 
75 | 
76 | if __name__ == "__main__":
77 |     products_data = []
78 |     with open('amazon_products_urls.csv', newline='') as csvfile:
79 |         reader = csv.reader(csvfile, delimiter=',')
80 |         for row in reader:
81 |             url = row[0]
82 |             products_data.append(extract_product_info(url))
83 | 
84 |     output_file_name = 'output-{}.csv'.format(
85 |         datetime.today().strftime("%m-%d-%Y"))
86 |     with open(output_file_name, 'w', newline='') as outputfile:  # newline='' stops csv.writer from adding blank rows on Windows
87 |         writer = csv.writer(outputfile)
88 |         writer.writerow(products_data[0].keys())  # header from the first product; pop() here would silently drop one product's row
89 |         for product in products_data:
90 |             writer.writerow(product.values())
91 | 
--------------------------------------------------------------------------------
/html_scraper/output-01-08-2022.csv:
-------------------------------------------------------------------------------- 1 | price,title,rating,Brand Name,Item Weight,Product Dimensions,Country of Origin,Item model number,Is Discontinued By Manufacturer,Output Wattage,Color Name,Specification Met,Special Features,Speaker Type,ASIN,Customer Reviews,Best Sellers Rank,Date First Available 2 | 1698.0,Sony X90J 75 Inch TV: BRAVIA XR Full Array LED 4K Ultra HD Smart Google TV with Dolby Vision HDR and Alexa Compatibility XR75X90J- 2021 Model,4.6,Sony,73.4 pounds,66 x 16.25 x 37.88 inches,Mexico,XR75X90J,No,20 Watts,Black,Energy Star,"Cognitive Intelligence with the Cognitive Processor XR; XR HDR Remaster; XR Contrast; 4K XR Super Resolution; XR 4K Upscaling; 4K XR Smoothing, XR Sound Position; Voice Search; Smart Remote; Chromecast Built-in; Works with Apple AirPlay 2 and Apple HomeKit",Built-In,B08TKSMQSY,"4.6 out of 5 stars 3 | 990 ratings 4 | 5 | 6 | 4.6 out of 5 stars","#6,508 in Electronics (See Top 100 in Electronics) #100 in LED & LCD TVs","April 19, 2021" 7 | 1897.99,"SAMSUNG 86-inch Class Crystal UHD TU9010 Series - 4K UHD LED Smart TV with Alexa Built-in (UN86TU9010FXZA, 2021 Model)",4.7,SAMSUNG,93 pounds,15.4 x 75.8 x 47.4 inches,Mexico,UN86TU9010FXZA,3 AAA batteries required. (included),PurColor,B094C627M5,"4.7 out of 5 stars 8 | 151 ratings 9 | 10 | 11 | 4.7 out of 5 stars","#6,236 in Electronics (See Top 100 in Electronics) #95 in LED & LCD TVs","June 21, 2021" 12 | 798.0,"Sony X85J 50 Inch TV: 4K Ultra HD LED Smart Google TV with Native 120HZ Refresh Rate, Dolby Vision HDR, and Alexa Compatibility KD50X85J- 2021 Model, Black",4.6,Sony,27.9 pounds,44.25 x 11.38 x 25.63 inches,KD50X85J,Black,"4K HDR Processor X1; TRILUMINOS PRO; Object-Based HDR Remaster; 4K X-Reality™ PRO; Motionflow™ XR; X-Balanced Speaker, Voice Search; Google Assistant; Works with Alexa; Works with Hey Google; Dolby Vision and Dolby Atmos",B08WJMQ5TG,"4.6 out of 5 stars 13 | 737 ratings 14 | 15 | 16 | 4.6 out of 5 stars","#6,858 in Electronics (See Top 100 in Electronics) #106 in LED & LCD TVs","May 24, 2021" 17 | -------------------------------------------------------------------------------- /web_bot/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hussain-mustafa990/python_web_scraping_and_automation/15de5134b4d860d0c9c55b0b366faf7adca85fdd/web_bot/.DS_Store -------------------------------------------------------------------------------- /web_bot/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hussain-mustafa990/python_web_scraping_and_automation/15de5134b4d860d0c9c55b0b366faf7adca85fdd/web_bot/chromedriver -------------------------------------------------------------------------------- /web_bot/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "USERNAME": "", 3 | "PASSWORD": "" 4 | } 5 | -------------------------------------------------------------------------------- /web_bot/trello_bot.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.keys import Keys 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | import time 6 | from datetime import date 7 | import os 8 | import json 9 | 10 | CHROME_DRIVER_PATH = os.path.join(os.getcwd(), "chromedriver") 11 | OP = 
webdriver.ChromeOptions()
12 | OP.add_argument('--headless')
13 | DRIVER = webdriver.Chrome(CHROME_DRIVER_PATH, options=OP)  # pass the headless options to the driver; they were never applied before
14 | 
15 | 
16 | def screenshotPage():
17 |     time.sleep(2)
18 |     date_str = date.today().strftime("%m-%d-%Y")
19 |     fpath = os.path.join(os.getcwd(), 'downloads/{}.png'.format(date_str))
20 |     DRIVER.get_screenshot_as_file(fpath)
21 | 
22 | 
23 | def addTask():
24 |     time.sleep(2)
25 |     DRIVER.find_element(
26 |         By.XPATH, value="//textarea[@aria-label='To Do']/ancestor::div/descendant::div[@class='card-composer-container js-card-composer-container']/child::a").click()
27 |     task_text_area = DRIVER.find_element(
28 |         By.XPATH, value="//div[@class='card-composer']/descendant::textarea")
29 |     task_text_area.clear()
30 |     task_text_area.send_keys("Bot Added Task")
31 |     DRIVER.find_element(By.XPATH, value="//input[@value='Add card']").click()
32 |     time.sleep(5)
33 | 
34 | 
35 | def login():
36 |     with open('config.json') as configFile:
37 |         credentials = json.load(configFile)
38 |     time.sleep(2)
39 |     DRIVER.find_element(By.XPATH, value="//a[@href='/login']").click()
40 |     time.sleep(2)
41 |     username = DRIVER.find_element(
42 |         By.CSS_SELECTOR, value="input[name='user']")
43 |     password = DRIVER.find_element(
44 |         By.CSS_SELECTOR, value="input[name='password']")
45 |     username.clear()
46 |     password.clear()
47 |     username.send_keys(credentials["USERNAME"])
48 |     password.send_keys(credentials["PASSWORD"])
49 |     DRIVER.find_element(By.CSS_SELECTOR, value="input[type='submit']").click()
50 |     time.sleep(5)  # wait for the second password screen to load
51 |     password = DRIVER.find_element(
52 |         By.CSS_SELECTOR, value="input[name='password']")
53 |     password.clear()
54 |     password.send_keys(credentials["PASSWORD"])
55 |     time.sleep(5)
56 |     DRIVER.find_element(By.CSS_SELECTOR, value="button[type='submit']").click()
57 | 
58 | 
59 | def navigateToBoard():
60 |     time.sleep(5)
61 |     DRIVER.find_element(
62 |         By.XPATH, value="//div[@title='{}']/ancestor::a".format('Bot Board')).click()
63 |     time.sleep(5)
64 | 
65 | 
66 | def main():
67 |     try:
68 |         DRIVER.get("https://trello.com")
69 |         login()
70 |         navigateToBoard()
71 |         addTask()
72 |         screenshotPage()
73 |         input("Bot Operation Completed. Press Enter to exit...")
74 |         DRIVER.quit()
75 |     except Exception as e:
76 |         print(e)
77 |         DRIVER.quit()
78 | 
79 | 
80 | if __name__ == "__main__":
81 |     main()
82 | 
--------------------------------------------------------------------------------