├── README.md
├── basic_scrape.py
├── basic_scrape_csv_export.py
├── scraping_logins_pagination_selenium.py
└── scraping_logins_selenium.py

/README.md:
--------------------------------------------------------------------------------
# basic_python_scraping
Basic Python web scraping code.
--------------------------------------------------------------------------------
/basic_scrape.py:
--------------------------------------------------------------------------------
#IMPORT LIBRARIES
from bs4 import BeautifulSoup
import requests

#REQUEST THE WEBPAGE AND STORE THE RESPONSE AS A VARIABLE
page_to_scrape = requests.get("http://quotes.toscrape.com")

#USE BEAUTIFULSOUP TO PARSE THE HTML AND STORE IT AS A VARIABLE
soup = BeautifulSoup(page_to_scrape.text, 'html.parser')

#FIND ALL THE ITEMS ON THE PAGE WITH A CLASS ATTRIBUTE OF 'text'
#AND STORE THE LIST AS A VARIABLE
quotes = soup.find_all('span', attrs={'class': 'text'})

#FIND ALL THE ITEMS ON THE PAGE WITH A CLASS ATTRIBUTE OF 'author'
#AND STORE THE LIST AS A VARIABLE
authors = soup.find_all('small', attrs={'class': 'author'})

#LOOP THROUGH BOTH LISTS USING THE 'ZIP' FUNCTION
#AND PRINT AND FORMAT THE RESULTS
for quote, author in zip(quotes, authors):
    print(quote.text + " - " + author.text)
--------------------------------------------------------------------------------
/basic_scrape_csv_export.py:
--------------------------------------------------------------------------------
#IMPORT LIBRARIES
from bs4 import BeautifulSoup
import requests

#IMPORT THE CSV LIBRARY
import csv

#OPEN A NEW CSV FILE. IT CAN BE CALLED ANYTHING.
#newline='' PREVENTS BLANK ROWS BETWEEN ENTRIES ON WINDOWS
file = open('scraped_quotes.csv', 'w', newline='')
#CREATE A VARIABLE FOR WRITING TO THE CSV
writer = csv.writer(file)

#CREATE THE HEADER ROW OF THE CSV
writer.writerow(['Quote', 'Author'])

#REQUEST THE WEBPAGE AND STORE THE RESPONSE AS A VARIABLE
page_to_scrape = requests.get("http://quotes.toscrape.com")
#USE BEAUTIFULSOUP TO PARSE THE HTML AND STORE IT AS A VARIABLE
soup = BeautifulSoup(page_to_scrape.text, 'html.parser')
#FIND ALL THE ITEMS ON THE PAGE WITH A CLASS ATTRIBUTE OF 'text'
#AND STORE THE LIST AS A VARIABLE
quotes = soup.find_all('span', attrs={'class': 'text'})

#FIND ALL THE ITEMS ON THE PAGE WITH A CLASS ATTRIBUTE OF 'author'
#AND STORE THE LIST AS A VARIABLE
authors = soup.find_all('small', attrs={'class': 'author'})

#LOOP THROUGH BOTH LISTS USING THE 'ZIP' FUNCTION
#AND PRINT AND FORMAT THE RESULTS
for quote, author in zip(quotes, authors):
    print(quote.text + " - " + author.text)
    #WRITE EACH PAIR AS A NEW ROW IN THE CSV
    writer.writerow([quote.text, author.text])
#CLOSE THE CSV FILE
file.close()
--------------------------------------------------------------------------------
/scraping_logins_pagination_selenium.py:
--------------------------------------------------------------------------------
#INSTALL SELENIUM BEFORE RUNNING THIS CODE
#pip3 install selenium
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import getpass
from selenium.common.exceptions import NoSuchElementException

#IF USING A RASPBERRY PI, FIRST INSTALL THIS OPTIMIZED CHROME DRIVER
#sudo apt-get install chromium-chromedriver
browser_driver = Service('/usr/lib/chromium-browser/chromedriver')
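#NOTE: ON A DESKTOP WITH SELENIUM 4.6 OR NEWER, SELENIUM MANAGER CAN USUALLY
#DOWNLOAD A MATCHING DRIVER AUTOMATICALLY, SO YOU CAN LIKELY SKIP THE SERVICE
#PATH AND JUST CALL webdriver.Chrome() WITH NO ARGUMENTS.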
page_to_scrape = webdriver.Chrome(service=browser_driver)
page_to_scrape.get("http://quotes.toscrape.com")

page_to_scrape.find_element(By.LINK_TEXT, "Login").click()

#GIVE THE LOGIN PAGE A MOMENT TO LOAD BEFORE LOOKING FOR THE FORM FIELDS
time.sleep(3)
username = page_to_scrape.find_element(By.ID, "username")
password = page_to_scrape.find_element(By.ID, "password")
username.send_keys("admin")
#USING GETPASS WILL PROMPT YOU TO ENTER YOUR PASSWORD INSTEAD OF STORING
#IT IN CODE. YOU'RE ALSO WELCOME TO USE A PYTHON KEYRING TO STORE PASSWORDS.
my_pass = getpass.getpass()
password.send_keys(my_pass)
page_to_scrape.find_element(By.CSS_SELECTOR, "input.btn-primary").click()

#OPEN A CSV FOR THE RESULTS. newline='' PREVENTS BLANK ROWS ON WINDOWS.
file = open("scraped_quotes.csv", "w", newline='')
writer = csv.writer(file)

writer.writerow(["QUOTES", "AUTHORS"])
#SCRAPE THE CURRENT PAGE, THEN CLICK 'Next' UNTIL NO 'Next' LINK REMAINS
while True:
    quotes = page_to_scrape.find_elements(By.CLASS_NAME, "text")
    authors = page_to_scrape.find_elements(By.CLASS_NAME, "author")
    for quote, author in zip(quotes, authors):
        print(quote.text + " - " + author.text)
        writer.writerow([quote.text, author.text])
    try:
        page_to_scrape.find_element(By.PARTIAL_LINK_TEXT, "Next").click()
    except NoSuchElementException:
        break
file.close()
#CLOSE THE BROWSER WHEN DONE
page_to_scrape.quit()
--------------------------------------------------------------------------------
/scraping_logins_selenium.py:
--------------------------------------------------------------------------------
#INSTALL SELENIUM BEFORE RUNNING THIS CODE
#pip3 install selenium
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import time
import getpass

#IF USING A RASPBERRY PI, FIRST INSTALL THIS OPTIMIZED CHROME DRIVER
#sudo apt-get install chromium-chromedriver
browser_driver = Service('/usr/lib/chromium-browser/chromedriver')
page_to_scrape = webdriver.Chrome(service=browser_driver)
page_to_scrape.get("https://quotes.toscrape.com")

page_to_scrape.find_element(By.LINK_TEXT, "Login").click()

#GIVE THE LOGIN PAGE A MOMENT TO LOAD BEFORE LOOKING FOR THE FORM FIELDS
time.sleep(3)
username = page_to_scrape.find_element(By.ID, "username")
password = page_to_scrape.find_element(By.ID, "password")
username.send_keys("admin")
#USING GETPASS WILL PROMPT YOU TO ENTER YOUR PASSWORD INSTEAD OF STORING
#IT IN CODE. YOU'RE ALSO WELCOME TO USE A PYTHON KEYRING TO STORE PASSWORDS.
my_pass = getpass.getpass()
password.send_keys(my_pass)
page_to_scrape.find_element(By.CSS_SELECTOR, "input.btn-primary").click()

#GRAB ALL THE QUOTES AND AUTHORS ON THE FIRST PAGE AFTER LOGGING IN
quotes = page_to_scrape.find_elements(By.CLASS_NAME, "text")
authors = page_to_scrape.find_elements(By.CLASS_NAME, "author")

#OPEN A CSV FOR THE RESULTS. newline='' PREVENTS BLANK ROWS ON WINDOWS.
file = open("scraped_quotes2.csv", "w", newline='')
writer = csv.writer(file)

writer.writerow(["QUOTES", "AUTHORS"])

for quote, author in zip(quotes, authors):
    print(quote.text + " - " + author.text)
    writer.writerow([quote.text, author.text])
file.close()
#CLOSE THE BROWSER WHEN DONE
page_to_scrape.quit()

--------------------------------------------------------------------------------
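Both login scripts mention a Python keyring as an alternative to getpass. A minimal
sketch of that idea, assuming the third-party keyring package is installed
(pip3 install keyring); the service name "quotes_scraper" is just a placeholder label:

import keyring

#STORE THE PASSWORD ONCE, E.G. FROM AN INTERACTIVE PYTHON SESSION
keyring.set_password("quotes_scraper", "admin", "the-real-password")

#THEN, IN THE SCRAPER, FETCH IT INSTEAD OF PROMPTING WITH GETPASS
my_pass = keyring.get_password("quotes_scraper", "admin")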