├── README.md
├── basic_scrape.py
├── basic_scrape_csv_export.py
├── scraping_logins_pagination_selenium.py
└── scraping_logins_selenium.py

/README.md:
--------------------------------------------------------------------------------
# basic_python_scraping
Basic Python web scraping code.
--------------------------------------------------------------------------------
/basic_scrape.py:
--------------------------------------------------------------------------------
#IMPORT LIBRARIES
from bs4 import BeautifulSoup
import requests

#REQUEST THE WEBPAGE AND STORE THE RESPONSE AS A VARIABLE
page_to_scrape = requests.get("http://quotes.toscrape.com")

#USE BEAUTIFULSOUP TO PARSE THE HTML AND STORE IT AS A VARIABLE
soup = BeautifulSoup(page_to_scrape.text, 'html.parser')

#FIND ALL THE ITEMS ON THE PAGE WITH A CLASS ATTRIBUTE OF 'text'
#AND STORE THE LIST AS A VARIABLE
quotes = soup.find_all('span', attrs={'class': 'text'})

#FIND ALL THE ITEMS ON THE PAGE WITH A CLASS ATTRIBUTE OF 'author'
#AND STORE THE LIST AS A VARIABLE
authors = soup.find_all('small', attrs={'class': 'author'})

#LOOP THROUGH BOTH LISTS USING THE 'ZIP' FUNCTION
#AND PRINT AND FORMAT THE RESULTS
for quote, author in zip(quotes, authors):
    print(quote.text + " - " + author.text)
--------------------------------------------------------------------------------
/basic_scrape_csv_export.py:
--------------------------------------------------------------------------------
#IMPORT LIBRARIES
from bs4 import BeautifulSoup
import requests

#IMPORT THE CSV LIBRARY
import csv

#OPEN A NEW CSV FILE. IT CAN BE CALLED ANYTHING.
#newline='' PREVENTS BLANK ROWS BETWEEN ENTRIES ON WINDOWS
file = open('scraped_quotes.csv', 'w', newline='')
#CREATE A VARIABLE FOR WRITING TO THE CSV
writer = csv.writer(file)

#CREATE THE HEADER ROW OF THE CSV
writer.writerow(['Quote', 'Author'])

#REQUEST THE WEBPAGE AND STORE THE RESPONSE AS A VARIABLE
page_to_scrape = requests.get("http://quotes.toscrape.com")
#USE BEAUTIFULSOUP TO PARSE THE HTML AND STORE IT AS A VARIABLE
soup = BeautifulSoup(page_to_scrape.text, 'html.parser')
#FIND ALL THE ITEMS ON THE PAGE WITH A CLASS ATTRIBUTE OF 'text'
#AND STORE THE LIST AS A VARIABLE
quotes = soup.find_all('span', attrs={'class': 'text'})

#FIND ALL THE ITEMS ON THE PAGE WITH A CLASS ATTRIBUTE OF 'author'
#AND STORE THE LIST AS A VARIABLE
authors = soup.find_all('small', attrs={'class': 'author'})

#LOOP THROUGH BOTH LISTS USING THE 'ZIP' FUNCTION
#AND PRINT AND FORMAT THE RESULTS
for quote, author in zip(quotes, authors):
    print(quote.text + " - " + author.text)
    #WRITE EACH PAIR AS A NEW ROW IN THE CSV
    writer.writerow([quote.text, author.text])
#CLOSE THE CSV FILE
file.close()
--------------------------------------------------------------------------------
/scraping_logins_pagination_selenium.py:
--------------------------------------------------------------------------------
#INSTALL SELENIUM BEFORE RUNNING THIS CODE
#pip3 install selenium
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import getpass
from selenium.common.exceptions import NoSuchElementException

#IF USING A RASPBERRY PI, FIRST INSTALL THIS OPTIMIZED CHROME DRIVER
#sudo apt-get install chromium-chromedriver
browser_driver = Service('/usr/lib/chromium-browser/chromedriver')
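#NOTE: ON A DESKTOP WITH SELENIUM 4.6 OR NEWER, SELENIUM MANAGER CAN USUALLY
#DOWNLOAD A MATCHING DRIVER AUTOMATICALLY, SO YOU CAN LIKELY SKIP THE SERVICE
#PATH AND JUST CALL webdriver.Chrome() WITH NO ARGUMENTS.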
page_to_scrape = webdriver.Chrome(service=browser_driver)
page_to_scrape.get("http://quotes.toscrape.com")

page_to_scrape.find_element(By.LINK_TEXT, "Login").click()

#GIVE THE LOGIN PAGE A MOMENT TO LOAD BEFORE LOOKING FOR THE FORM FIELDS
time.sleep(3)
username = page_to_scrape.find_element(By.ID, "username")
password = page_to_scrape.find_element(By.ID, "password")
username.send_keys("admin")
#USING GETPASS WILL PROMPT YOU TO ENTER YOUR PASSWORD INSTEAD OF STORING
#IT IN CODE. YOU'RE ALSO WELCOME TO USE A PYTHON KEYRING TO STORE PASSWORDS.
my_pass = getpass.getpass()
password.send_keys(my_pass)
page_to_scrape.find_element(By.CSS_SELECTOR, "input.btn-primary").click()

#OPEN A CSV FOR THE RESULTS. newline='' PREVENTS BLANK ROWS ON WINDOWS.
file = open("scraped_quotes.csv", "w", newline='')
writer = csv.writer(file)

writer.writerow(["QUOTES", "AUTHORS"])
#SCRAPE THE CURRENT PAGE, THEN CLICK 'Next' UNTIL NO 'Next' LINK REMAINS
while True:
    quotes = page_to_scrape.find_elements(By.CLASS_NAME, "text")
    authors = page_to_scrape.find_elements(By.CLASS_NAME, "author")
    for quote, author in zip(quotes, authors):
        print(quote.text + " - " + author.text)
        writer.writerow([quote.text, author.text])
    try:
        page_to_scrape.find_element(By.PARTIAL_LINK_TEXT, "Next").click()
    except NoSuchElementException:
        break
file.close()
#CLOSE THE BROWSER WHEN DONE
page_to_scrape.quit()
--------------------------------------------------------------------------------
/scraping_logins_selenium.py:
--------------------------------------------------------------------------------
#INSTALL SELENIUM BEFORE RUNNING THIS CODE
#pip3 install selenium
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import time
import getpass

#IF USING A RASPBERRY PI, FIRST INSTALL THIS OPTIMIZED CHROME DRIVER
#sudo apt-get install chromium-chromedriver
browser_driver = Service('/usr/lib/chromium-browser/chromedriver')
page_to_scrape = webdriver.Chrome(service=browser_driver)
page_to_scrape.get("https://quotes.toscrape.com")

page_to_scrape.find_element(By.LINK_TEXT, "Login").click()

#GIVE THE LOGIN PAGE A MOMENT TO LOAD BEFORE LOOKING FOR THE FORM FIELDS
time.sleep(3)
username = page_to_scrape.find_element(By.ID, "username")
password = page_to_scrape.find_element(By.ID, "password")
username.send_keys("admin")
#USING GETPASS WILL PROMPT YOU TO ENTER YOUR PASSWORD INSTEAD OF STORING
#IT IN CODE. YOU'RE ALSO WELCOME TO USE A PYTHON KEYRING TO STORE PASSWORDS.
my_pass = getpass.getpass()
password.send_keys(my_pass)
page_to_scrape.find_element(By.CSS_SELECTOR, "input.btn-primary").click()

#GRAB ALL THE QUOTES AND AUTHORS ON THE FIRST PAGE AFTER LOGGING IN
quotes = page_to_scrape.find_elements(By.CLASS_NAME, "text")
authors = page_to_scrape.find_elements(By.CLASS_NAME, "author")

#OPEN A CSV FOR THE RESULTS. newline='' PREVENTS BLANK ROWS ON WINDOWS.
file = open("scraped_quotes2.csv", "w", newline='')
writer = csv.writer(file)

writer.writerow(["QUOTES", "AUTHORS"])

for quote, author in zip(quotes, authors):
    print(quote.text + " - " + author.text)
    writer.writerow([quote.text, author.text])
file.close()
#CLOSE THE BROWSER WHEN DONE
page_to_scrape.quit()

--------------------------------------------------------------------------------
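Both login scripts mention a Python keyring as an alternative to getpass. A minimal
sketch of that idea, assuming the third-party keyring package is installed
(pip3 install keyring); the service name "quotes_scraper" is just a placeholder label:

import keyring

#STORE THE PASSWORD ONCE, E.G. FROM AN INTERACTIVE PYTHON SESSION
keyring.set_password("quotes_scraper", "admin", "the-real-password")

#THEN, IN THE SCRAPER, FETCH IT INSTEAD OF PROMPTING WITH GETPASS
my_pass = keyring.get_password("quotes_scraper", "admin")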