├── README.md
└── main.py


/README.md:
--------------------------------------------------------------------------------
1 | # WEB SCRAPING - Beautiful Soup
2 | 
3 | ## Description
4 | 
5 | This project scrapes books from the romance-tagged lists on https://www.goodreads.com using Beautiful Soup and saves the results to `data.json`.
6 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | from urllib.request import urlopen
 2 | from bs4 import BeautifulSoup
 3 | import json
 4 | 
 5 | 
 6 | BASE_URL = "https://www.goodreads.com"
 7 | 
 8 | 
 9 | # Making Page URLs
10 | page_urls = []
11 | 
12 | for i in range(1, 101):
13 |   page_urls.append(BASE_URL + "/list/tag/romance?page=" + str(i))
14 |   print("Page URL Appended: " + BASE_URL+"/list/tag/romance?page=" + str(i))
15 | 
16 | 
# Making Category URLs
# Fetch every index page and collect the absolute URL of each list it links to.
category_urls = []

for page_url in page_urls:
    # The context manager closes the HTTP response as soon as it has been read
    with urlopen(page_url) as page:
        html = page.read().decode("utf-8")

    soup = BeautifulSoup(html, "html.parser")
    # href=True keeps only anchors that carry an href attribute, so the
    # link["href"] lookup below cannot raise KeyError
    links = soup.find_all("a", {"class": "listTitle"}, href=True)

    for link in links:
        url = BASE_URL + link["href"]
        category_urls.append(url)
        print("Category URL Appended: " + url)
30 | 
31 | 
# Save Book Data by Category URL
# Fetch every list page and turn each table row that describes a book into a
# dict with the keys "title", "url", "auth" ({"name", "url"}) and "rating".
books = []

for category_url in category_urls:
    # The context manager closes the HTTP response as soon as it has been read
    with urlopen(category_url) as page:
        html = page.read().decode("utf-8")

    soup = BeautifulSoup(html, "html.parser")

    for row in soup.find_all("tr"):
        # Look each element up once and reuse the result
        title_link = row.find("a", {"class": "bookTitle"})
        author_link = row.find("a", {"class": "authorName"})
        rating_span = row.find("span", {"class": "minirating"})

        # find() returns None when the row lacks the element; skip such rows
        # instead of crashing with AttributeError on None
        if title_link is None or author_link is None or rating_span is None:
            continue

        book = {
            "title": title_link.get_text(),
            "url": BASE_URL + title_link["href"],
            "auth": {
                "name": author_link.get_text(),
                # Stored exactly as found in the page (no BASE_URL prefix)
                "url": author_link["href"],
            },
            "rating": rating_span.get_text(),
        }

        books.append(book)
        # book is a dict, so the title is read by key, not by attribute
        print("Book Appended: " + book["title"])
55 | 
56 | 
# Save Data to Json
# Serialize the collected books to a JSON string and write it to data.json
# in the current working directory, replacing any existing file.
serialized_books = json.dumps(books)

with open("data.json", "w") as out_file:
    out_file.write(serialized_books)


--------------------------------------------------------------------------------