├── .DS_Store
├── .gitignore
├── README.md
├── api_scraper
│   ├── .DS_Store
│   ├── remote_jobs.xls
│   └── remoteok_scraper.py
├── html_scraper
│   ├── amazon_products_urls.csv
│   ├── amazon_scraper.py
│   ├── amazon_scraper_single.py
│   └── output-01-08-2022.csv
└── web_bot
    ├── .DS_Store
    ├── chromedriver
    ├── config.json
    └── trello_bot.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hussain-mustafa990/python_web_scraping_and_automation/15de5134b4d860d0c9c55b0b366faf7adca85fdd/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Python Web Scraping and Automation
2 |
3 | Selenium and BeautifulSoup Based Web Scraping and Automation Scripts
4 |
9 |
10 |
11 |
12 | Selenium automates browsers. Primarily it is used for automating web applications for testing purposes, but it is certainly not limited to just that.
13 | Boring web-based administration tasks can (and should) be automated as well.
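For example, a browser session can be driven with just a few lines. This is a minimal sketch assuming a matching `chromedriver` binary sits next to the script (as in `web_bot/`); the URL is a placeholder:

```python
import os
from selenium import webdriver
from selenium.webdriver.common.by import By

# Assumes a chromedriver binary in the working directory, as in web_bot/.
driver = webdriver.Chrome(os.path.join(os.getcwd(), "chromedriver"))
driver.get("https://example.com")  # placeholder URL

# Locate an element and read it, just as a user looking at the page would.
heading = driver.find_element(By.TAG_NAME, "h1")
print(heading.text)
driver.quit()
```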
14 | Beautiful Soup is a Python library for pulling data out of HTML and XML files. It works with your favorite parser to provide idiomatic ways of navigating, searching, and modifying the parse tree. It commonly saves programmers hours or days of work.
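A minimal sketch of that parsing workflow, similar to what the scripts in `html_scraper/` do (the URL below is a placeholder, and `requests`, `beautifulsoup4`, and `lxml` are assumed to be installed):

```python
import bs4
import requests

# Placeholder URL; any publicly reachable HTML page works for this demo.
URL = "https://example.com"

# Download the raw HTML, then hand it to Beautiful Soup for parsing.
html = requests.get(URL, timeout=10).content
soup = bs4.BeautifulSoup(html, "lxml")

# Navigate the parse tree: print the page title and every link target.
print(soup.title.text.strip())
for anchor in soup.find_all("a", href=True):
    print(anchor["href"])
```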
15 |
16 |
17 | ### Discord
18 |
19 |
20 |
21 |
22 |
23 | ## License
24 |
25 | [CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/)
26 |
--------------------------------------------------------------------------------
/api_scraper/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hussain-mustafa990/python_web_scraping_and_automation/15de5134b4d860d0c9c55b0b366faf7adca85fdd/api_scraper/.DS_Store
--------------------------------------------------------------------------------
/api_scraper/remote_jobs.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hussain-mustafa990/python_web_scraping_and_automation/15de5134b4d860d0c9c55b0b366faf7adca85fdd/api_scraper/remote_jobs.xls
--------------------------------------------------------------------------------
/api_scraper/remoteok_scraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from xlwt import Workbook
3 | import smtplib
4 | from os.path import basename
5 | from email.mime.application import MIMEApplication
6 | from email.mime.multipart import MIMEMultipart
7 | from email.mime.text import MIMEText
8 | from email.utils import COMMASPACE, formatdate
9 |
10 | # https://myaccount.google.com/lesssecureapps
11 | BASE_URL = 'https://remoteok.com/api/'
12 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
13 | REQUEST_HEADER = {
14 |     'User-Agent': USER_AGENT,
15 |     'Accept-Language': 'en-US, en;q=0.5',
16 | }
17 |
18 |
19 | def get_job_postings():
20 |     res = requests.get(url=BASE_URL, headers=REQUEST_HEADER)
21 |     return res.json()
22 |
23 |
24 | def output_jobs_to_xls(data):
25 |     wb = Workbook()
26 |     job_sheet = wb.add_sheet('Jobs')
27 |     headers = list(data[0].keys())
28 |     for i in range(0, len(headers)):
29 |         job_sheet.write(0, i, headers[i])
30 |     for i in range(0, len(data)):
31 |         job = data[i]
32 |         values = list(job.values())
33 |         for x in range(0, len(values)):
34 |             job_sheet.write(i + 1, x, values[x])
35 |     wb.save('remote_jobs.xls')
36 |
37 |
38 | def send_email(send_from, send_to, subject, text, files=None):
39 |     assert isinstance(send_to, list)
40 |     msg = MIMEMultipart()
41 |     msg['From'] = send_from
42 |     msg['To'] = COMMASPACE.join(send_to)
43 |     msg['Date'] = formatdate(localtime=True)
44 |     msg['Subject'] = subject
45 |
46 |     msg.attach(MIMEText(text))
47 |
48 |     for f in files or []:
49 |         with open(f, "rb") as fil:
50 |             part = MIMEApplication(fil.read(), Name=basename(f))
51 |         part['Content-Disposition'] = f'attachment; filename="{basename(f)}"'
52 |         msg.attach(part)
53 |
54 |     smtp = smtplib.SMTP('smtp.gmail.com', 587)
55 |     smtp.starttls()
56 |     smtp.login(send_from, 'Enter Password Here')
57 |     smtp.sendmail(send_from, send_to, msg.as_string())
58 |     smtp.close()
59 |
60 |
61 | if __name__ == "__main__":
62 |     # Skip the first element of the API response (legal notice, not a job posting).
63 |     json = get_job_postings()[1:]
64 |     output_jobs_to_xls(json)
65 |     send_email('Enter Sending Email Here', ['contact@preneure.com'],
66 |                'Jobs Posting', 'Please, find attached a list of job postings to this email',
67 |                files=['remote_jobs.xls'])
68 |
--------------------------------------------------------------------------------
/html_scraper/amazon_products_urls.csv:
--------------------------------------------------------------------------------
1 | https://www.amazon.com/Hisense-Premium-65-Inch-Compatibility-65U8G/dp/B091XWTGXL/?th=1,
2 | https://www.amazon.com/dp/B08TKSMQSY/?th=1,
3 | https://www.amazon.com/dp/B08WJMQ5TG/?th=1,
4 | https://www.amazon.com/SAMSUNG-86-inch-Crystal-TU9010-Built/dp/B094C627M5/
--------------------------------------------------------------------------------
/html_scraper/amazon_scraper.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import requests
3 | import csv
4 | import bs4
5 | import concurrent.futures
6 | from tqdm import tqdm
7 |
8 | USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
9 | REQUEST_HEADER = {
10 |     'User-Agent': USER_AGENT,
11 |     'Accept-Language': 'en-US, en;q=0.5',
12 | }
13 | NO_THREADS = 10
14 |
15 |
16 | def get_page_html(url):
17 |     res = requests.get(url=url, headers=REQUEST_HEADER)
18 |     return res.content
19 |
20 |
21 | def get_product_price(soup):
22 |     main_price_span = soup.find('span', attrs={
23 |         'class': 'a-price a-text-price a-size-medium apexPriceToPay'
24 |     })
25 |     price_spans = main_price_span.findAll('span')
26 |     for span in price_spans:
27 |         price = span.text.strip().replace('$', '').replace(',', '')
28 |         try:
29 |             return float(price)
30 |         except ValueError:
31 |             print("Value Obtained For Price Could Not Be Parsed")
32 |             exit()
33 |
34 |
35 | def get_product_title(soup):
36 |     product_title = soup.find('span', id='productTitle')
37 |     return product_title.text.strip()
38 |
39 |
40 | def get_product_rating(soup):
41 |     product_ratings_div = soup.find('div', attrs={
42 |         'id': 'averageCustomerReviews'
43 |     })
44 |     product_rating_section = product_ratings_div.find(
45 |         'i', attrs={'class': 'a-icon-star'})
46 |     product_rating_span = product_rating_section.find('span')
47 |     try:
48 |         rating = product_rating_span.text.strip().split()
49 |         return float(rating[0])
50 |     except ValueError:
51 |         print("Value Obtained For Rating Could Not Be Parsed")
52 |         exit()
53 |
54 |
55 | def get_product_technical_details(soup):
56 |     details = {}
57 |     technical_details_section = soup.find('div', id='prodDetails')
58 |     data_tables = technical_details_section.findAll(
59 |         'table', class_='prodDetTable')
60 |     for table in data_tables:
61 |         table_rows = table.findAll('tr')
62 |         for row in table_rows:
63 |             row_key = row.find('th').text.strip()
64 |             row_value = row.find('td').text.strip().replace('\u200e', '')
65 |             details[row_key] = row_value
66 |     return details
67 |
68 |
69 | def extract_product_info(url, output):
70 |     product_info = {}
71 |     # print(f'Scraping URL: {url}')
72 |     html = get_page_html(url=url)
73 |     soup = bs4.BeautifulSoup(html, 'lxml')
74 |     product_info['price'] = get_product_price(soup)
75 |     product_info['title'] = get_product_title(soup)
76 |     product_info['rating'] = get_product_rating(soup)
77 |     product_info.update(get_product_technical_details(soup))
78 |     output.append(product_info)
79 |
80 |
81 | if __name__ == "__main__":
82 |     products_data = []
83 |     urls = []
84 |     with open('amazon_products_urls.csv', newline='') as csvfile:
85 |         urls = list(csv.reader(csvfile, delimiter=','))
86 |     with concurrent.futures.ThreadPoolExecutor(max_workers=NO_THREADS) as executor:
87 |         for wkn in tqdm(range(0, len(urls))):
88 |             executor.submit(extract_product_info, urls[wkn][0], products_data)
89 |     output_file_name = 'output-{}.csv'.format(
90 |         datetime.today().strftime("%m-%d-%Y"))
91 |     with open(output_file_name, 'w') as outputfile:
92 |         writer = csv.writer(outputfile)
93 |         writer.writerow(products_data[0].keys())
94 |         for product in products_data:
95 |             writer.writerow(product.values())
96 |
--------------------------------------------------------------------------------
/html_scraper/amazon_scraper_single.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import requests
3 | import csv
4 | import bs4
5 |
6 | USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
7 | REQUEST_HEADER = {
8 |     'User-Agent': USER_AGENT,
9 |     'Accept-Language': 'en-US, en;q=0.5',
10 | }
11 |
12 |
13 | def get_page_html(url):
14 |     res = requests.get(url=url, headers=REQUEST_HEADER)
15 |     return res.content
16 |
17 |
18 | def get_product_price(soup):
19 |     main_price_span = soup.find('span', attrs={
20 |         'class': 'a-price a-text-price a-size-medium apexPriceToPay', })
21 |     price_spans = main_price_span.findAll('span')
22 |     for span in price_spans:
23 |         price = span.text.strip().replace('$', '').replace(',', '')
24 |         try:
25 |             return float(price)
26 |         except ValueError:
27 |             print("Value Obtained For Price Could Not Be Parsed")
28 |             exit()
29 |
30 |
31 | def get_product_title(soup):
32 |     product_title = soup.find('span', id='productTitle')
33 |     return product_title.text.strip()
34 |
35 |
36 | def get_product_rating(soup):
37 |     product_ratings_div = soup.find(
38 |         'div', attrs={'id': 'averageCustomerReviews'})
39 |     product_rating_section = product_ratings_div.find(
40 |         'i', attrs={'class': 'a-icon-star'})
41 |     product_rating_span = product_rating_section.find('span')
42 |     try:
43 |         rating = product_rating_span.text.strip().split()[0]
44 |         return float(rating)
45 |     except ValueError:
46 |         print("Value Obtained For Rating Could Not Be Parsed")
47 |         exit()
48 |
49 |
50 | def get_product_technical_details(soup):
51 |     details = {}
52 |     technical_details_section = soup.find(
53 |         'div', id='prodDetails')
54 |     data_tables = technical_details_section.findAll(
55 |         'table', class_='prodDetTable')
56 |     for table in data_tables:
57 |         table_rows = table.findAll('tr')
58 |         for row in table_rows:
59 |             row_key = row.find('th').text.strip()
60 |             row_value = row.find('td').text.strip().replace("\u200e", '')
61 |             details[row_key] = row_value
62 |     return details
63 |
64 |
65 | def extract_product_info(url):
66 |     product_info = {}
67 |     print(f'Scraping URL: {url}')
68 |     html = get_page_html(url)
69 |     soup = bs4.BeautifulSoup(html, 'lxml')
70 |     product_info['price'] = get_product_price(soup)
71 |     product_info['title'] = get_product_title(soup)
72 |     product_info['rating'] = get_product_rating(soup)
73 |     product_info.update(get_product_technical_details(soup))
74 |     return product_info
75 |
76 |
77 | if __name__ == "__main__":
78 |     products_data = []
79 |     with open('amazon_products_urls.csv', newline='') as csvfile:
80 |         reader = csv.reader(csvfile, delimiter=',')
81 |         for row in reader:
82 |             url = row[0]
83 |             products_data.append(extract_product_info(url))
84 |
85 |     output_file_name = 'output-{}.csv'.format(
86 |         datetime.today().strftime("%m-%d-%Y"))
87 |     with open(output_file_name, 'w') as outputfile:
88 |         writer = csv.writer(outputfile)
89 |         # Take the header from the first product; pop() would silently drop the last row.
90 |         writer.writerow(products_data[0].keys())
91 |         for product in products_data:
92 |             writer.writerow(product.values())
93 |
--------------------------------------------------------------------------------
/html_scraper/output-01-08-2022.csv:
--------------------------------------------------------------------------------
1 | price,title,rating,Brand Name,Item Weight,Product Dimensions,Country of Origin,Item model number,Is Discontinued By Manufacturer,Output Wattage,Color Name,Specification Met,Special Features,Speaker Type,ASIN,Customer Reviews,Best Sellers Rank,Date First Available
2 | 1698.0,Sony X90J 75 Inch TV: BRAVIA XR Full Array LED 4K Ultra HD Smart Google TV with Dolby Vision HDR and Alexa Compatibility XR75X90J- 2021 Model,4.6,Sony,73.4 pounds,66 x 16.25 x 37.88 inches,Mexico,XR75X90J,No,20 Watts,Black,Energy Star,"Cognitive Intelligence with the Cognitive Processor XR; XR HDR Remaster; XR Contrast; 4K XR Super Resolution; XR 4K Upscaling; 4K XR Smoothing, XR Sound Position; Voice Search; Smart Remote; Chromecast Built-in; Works with Apple AirPlay 2 and Apple HomeKit",Built-In,B08TKSMQSY,"4.6 out of 5 stars
3 | 990 ratings
4 |
5 |
6 | 4.6 out of 5 stars","#6,508 in Electronics (See Top 100 in Electronics) #100 in LED & LCD TVs","April 19, 2021"
7 | 1897.99,"SAMSUNG 86-inch Class Crystal UHD TU9010 Series - 4K UHD LED Smart TV with Alexa Built-in (UN86TU9010FXZA, 2021 Model)",4.7,SAMSUNG,93 pounds,15.4 x 75.8 x 47.4 inches,Mexico,UN86TU9010FXZA,3 AAA batteries required. (included),PurColor,B094C627M5,"4.7 out of 5 stars
8 | 151 ratings
9 |
10 |
11 | 4.7 out of 5 stars","#6,236 in Electronics (See Top 100 in Electronics) #95 in LED & LCD TVs","June 21, 2021"
12 | 798.0,"Sony X85J 50 Inch TV: 4K Ultra HD LED Smart Google TV with Native 120HZ Refresh Rate, Dolby Vision HDR, and Alexa Compatibility KD50X85J- 2021 Model, Black",4.6,Sony,27.9 pounds,44.25 x 11.38 x 25.63 inches,KD50X85J,Black,"4K HDR Processor X1; TRILUMINOS PRO; Object-Based HDR Remaster; 4K X-Reality™ PRO; Motionflow™ XR; X-Balanced Speaker, Voice Search; Google Assistant; Works with Alexa; Works with Hey Google; Dolby Vision and Dolby Atmos",B08WJMQ5TG,"4.6 out of 5 stars
13 | 737 ratings
14 |
15 |
16 | 4.6 out of 5 stars","#6,858 in Electronics (See Top 100 in Electronics) #106 in LED & LCD TVs","May 24, 2021"
17 |
--------------------------------------------------------------------------------
/web_bot/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hussain-mustafa990/python_web_scraping_and_automation/15de5134b4d860d0c9c55b0b366faf7adca85fdd/web_bot/.DS_Store
--------------------------------------------------------------------------------
/web_bot/chromedriver:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hussain-mustafa990/python_web_scraping_and_automation/15de5134b4d860d0c9c55b0b366faf7adca85fdd/web_bot/chromedriver
--------------------------------------------------------------------------------
/web_bot/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "USERNAME": "",
3 | "PASSWORD": ""
4 | }
5 |
--------------------------------------------------------------------------------
/web_bot/trello_bot.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.keys import Keys
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | import time
6 | from datetime import date
7 | import os
8 | import json
9 |
10 | CHROME_DRIVER_PATH = os.path.join(os.getcwd(), "chromedriver")
11 | OP = webdriver.ChromeOptions()
12 | OP.add_argument('--headless')
13 | # Pass the prepared options so the '--headless' flag actually takes effect.
14 | DRIVER = webdriver.Chrome(CHROME_DRIVER_PATH, options=OP)
15 |
16 |
17 | def screenshotPage():
18 |     time.sleep(2)
19 |     date_str = date.today().strftime("%m-%d-%Y")
20 |     # Make sure the downloads directory exists before saving the screenshot.
21 |     os.makedirs(os.path.join(os.getcwd(), 'downloads'), exist_ok=True)
22 |     fpath = os.path.join(os.getcwd(), 'downloads/{}.png'.format(date_str))
23 |     DRIVER.get_screenshot_as_file(fpath)
24 |
25 |
26 | def addTask():
27 |     time.sleep(2)
28 |     DRIVER.find_element(
29 |         By.XPATH, value="//textarea[@aria-label='To Do']/ancestor::div/descendant::div[@class='card-composer-container js-card-composer-container']/child::a").click()
30 |     task_text_area = DRIVER.find_element(
31 |         By.XPATH, value="//div[@class='card-composer']/descendant::textarea")
32 |     task_text_area.clear()
33 |     task_text_area.send_keys("Bot Added Task")
34 |     DRIVER.find_element(By.XPATH, value="//input[@value='Add card']").click()
35 |     time.sleep(5)
36 |
37 |
38 | def login():
39 |     with open('config.json') as configFile:
40 |         credentials = json.load(configFile)
41 |     time.sleep(2)
42 |     DRIVER.find_element(By.XPATH, value="//a[@href='/login']").click()
43 |     time.sleep(2)
44 |     username = DRIVER.find_element(
45 |         By.CSS_SELECTOR, value="input[name='user']")
46 |     password = DRIVER.find_element(
47 |         By.CSS_SELECTOR, value="input[name='password']")
48 |     username.clear()
49 |     password.clear()
50 |     username.send_keys(credentials["USERNAME"])
51 |     password.send_keys(credentials["PASSWORD"])
52 |     DRIVER.find_element(By.CSS_SELECTOR, value="input[type='submit']").click()
53 |     time.sleep(5)
54 |     # The second login screen asks for the password again before submitting.
55 |     password = DRIVER.find_element(
56 |         By.CSS_SELECTOR, value="input[name='password']")
57 |     password.clear()
58 |     password.send_keys(credentials["PASSWORD"])
59 |     time.sleep(5)
60 |     DRIVER.find_element(By.CSS_SELECTOR, value="button[type='submit']").click()
61 |
62 |
63 | def navigateToBoard():
64 |     time.sleep(5)
65 |     DRIVER.find_element(
66 |         By.XPATH, value="//div[@title='{}']/ancestor::a".format('Bot Board')).click()
67 |     time.sleep(5)
68 |
69 |
70 | def main():
71 |     try:
72 |         DRIVER.get("https://trello.com")
73 |         login()
74 |         navigateToBoard()
75 |         addTask()
76 |         screenshotPage()
77 |         input("Bot Operation Completed. Press any key...")
78 |         DRIVER.close()
79 |     except Exception as e:
80 |         print(e)
81 |         DRIVER.close()
82 |
83 |
84 | if __name__ == "__main__":
85 |     main()
86 |
--------------------------------------------------------------------------------