"""Web scraping with Beautiful Soup.

This project scrapes books from https://www.goodreads.com: it walks the
paginated index of lists tagged "romance", follows every list linked from
those pages, extracts the books found on each list, and writes them to
``data.json``.
"""

import json
from urllib.request import urlopen

from bs4 import BeautifulSoup


BASE_URL = "https://www.goodreads.com"

# Number of index pages of the "romance" tag to walk (pages 1..PAGE_COUNT).
PAGE_COUNT = 100

# File the collected books are written to.
OUTPUT_PATH = "data.json"


def fetch_soup(url):
    """Download *url* and return its HTML parsed with ``html.parser``."""
    # The context manager closes the HTTP response once the body is read.
    with urlopen(url) as response:
        html = response.read().decode("utf-8")
    return BeautifulSoup(html, "html.parser")


def build_page_urls(page_count=PAGE_COUNT):
    """Return the URLs of index pages 1..page_count of the romance tag."""
    page_urls = []
    for page_number in range(1, page_count + 1):
        page_url = BASE_URL + "/list/tag/romance?page=" + str(page_number)
        page_urls.append(page_url)
        print("Page URL Appended: " + page_url)
    return page_urls


def collect_category_urls(page_urls):
    """Return the absolute URL of every ``a.listTitle`` link on the pages."""
    category_urls = []
    for page_url in page_urls:
        soup = fetch_soup(page_url)
        for link in soup.find_all("a", {"class": "listTitle"}):
            category_url = BASE_URL + link["href"]
            category_urls.append(category_url)
            print("Category URL Appended: " + category_url)
    return category_urls


def parse_book(row):
    """Build a book record from one ``<tr>`` element.

    Returns ``None`` when the row lacks the title link, the author link or
    the rating span, so callers can skip rows that are not book entries.
    """
    title_link = row.find("a", {"class": "bookTitle"})
    author_link = row.find("a", {"class": "authorName"})
    rating = row.find("span", {"class": "minirating"})

    # ``find`` returns None when nothing matches; dereferencing it would
    # abort the whole run on the first row that is not a book entry.
    if title_link is None or author_link is None or rating is None:
        return None

    return {
        "title": title_link.get_text(),
        "url": BASE_URL + title_link["href"],
        "auth": {
            "name": author_link.get_text(),
            # Stored exactly as found in the page (no BASE_URL prefix).
            "url": author_link["href"],
        },
        "rating": rating.get_text(),
    }


def collect_books(category_urls):
    """Return the book records found in the table rows of every list page."""
    books = []
    for category_url in category_urls:
        soup = fetch_soup(category_url)
        for row in soup.find_all("tr"):
            book = parse_book(row)
            if book is None:
                continue
            books.append(book)
            print("Book Appended: " + book["title"])
    return books


def main():
    """Scrape all lists and save the collected books as JSON."""
    page_urls = build_page_urls()
    category_urls = collect_category_urls(page_urls)
    books = collect_books(category_urls)

    with open(OUTPUT_PATH, "w") as fp:
        json.dump(books, fp)


if __name__ == "__main__":
    main()