├── cleaning.py
├── data
│   ├── README.md
│   ├── preprocessing
│   │   └── bestsellers_cleaning.ipynb
│   └── raw
│       ├── CSVtoDF.py
│       ├── RawDataCollector.py
│       ├── best_books_scraper.py
│       ├── bestsellers_combining.ipynb
│       ├── bestsellers_generating.ipynb
│       └── times_api_call.ipynb
├── expl
│   └── bestsellers_EDA.ipynb
├── README.md
├── requirements.txt
└── .gitignore
/cleaning.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | Data pipeline for the project: the **raw** folder collects weekly bestseller lists from the NYT Books API and scrapes book details from Goodreads.com; the **preprocessing** folder cleans the merged result into `bestsellers.csv`.
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Best-Seller-Books
2 | Analysis of *The New York Times* best-selling books: weekly lists are pulled from the NYT Books API, enriched with metadata scraped from Goodreads.com, cleaned, and explored.
3 |
4 | ## Project layout
5 | * `data/raw`: API calls (`times_api_call.ipynb`), Goodreads scraping (`RawDataCollector.py`, `best_books_scraper.py`), and the notebooks that generate and combine the raw data.
6 | * `data/preprocessing`: `bestsellers_cleaning.ipynb` cleans the merged data into `bestsellers.csv`.
7 | * `expl`: exploratory data analysis (`bestsellers_EDA.ipynb`).
8 |
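9 | ## Quickstart
10 | A minimal setup sketch (assumes Python 3.7 and Jupyter are available):
11 |
12 | ```
13 | pip install -r requirements.txt
14 | jupyter notebook data/raw/times_api_call.ipynb
15 | ```
16 |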
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.9.1
2 | bs4==0.0.1
3 | certifi==2020.4.5.1
4 | chardet==3.0.4
5 | cycler==0.10.0
6 | DateTime==4.3
7 | idna==2.9
8 | kiwisolver==1.2.0
9 | matplotlib==3.2.1
10 | numpy==1.18.4
11 | pandas==1.0.3
12 | pyparsing==2.4.7
13 | python-dateutil==2.8.1
14 | pytz==2020.1
15 | requests==2.23.0
16 | six==1.15.0
17 | soupsieve==2.0.1
18 | tqdm==4.46.0
19 | urllib3==1.26.5
20 | zope.interface==5.1.0
--------------------------------------------------------------------------------
/data/raw/CSVtoDF.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | class CSVtoDF(object):
5 |     """
6 |     Context manager that temporarily loads a .csv file into a dataframe.
7 |     On exit it drops all of the file's original columns, so only the
8 |     columns assigned inside the `with` block survive.
9 |     """
10 |
11 |     def __init__(self, file):
12 |         self.df = pd.read_csv(file)
13 |         self.columns = list(self.df.columns.values)
14 |
15 |     def __enter__(self):
16 |         return self.df
17 |
18 |     def __exit__(self, exc_type, exc_val, exc_tb):
19 |         # Drop every column that was present in the original file.
20 |         self.df.drop(self.columns, axis=1, inplace=True)
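21 |
22 |
23 | if __name__ == "__main__":
24 |     # Minimal usage sketch; 'books.csv' and its column names are
25 |     # hypothetical, not part of the project's data. Columns assigned
26 |     # inside the `with` block survive; the originals are dropped on exit.
27 |     with CSVtoDF('books.csv') as df:
28 |         df['title'] = df['original_title']
29 |     print(df.columns)  # only ['title'] remains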
--------------------------------------------------------------------------------
/data/raw/best_books_scraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | from bs4 import BeautifulSoup
4 |
5 |
6 | class BestBooks():
7 |
8 |     def __init__(self, pages):
9 |         self.pages = pages
10 |         # Fetch and parse each page of the "Best Books Ever" list.
11 |         self.soups = [BeautifulSoup(requests.get(f'https://www.goodreads.com/list/show/1.Best_Books_Ever?page={n}').text,
12 |                                     'html.parser') for n in range(1, self.pages + 1)]
13 |
14 |     def __str__(self):
15 |         return f"You just scraped {self.pages} pages from Goodreads.com!"
16 |
17 |     def store_html(self):
18 |         # If you want to store the scraped HTML as a list
19 |         return self.soups
20 |
21 |     def books_authors(self):
22 |         titles, authors = [], []
23 |         for page in self.soups:
24 |             for title, author in zip(page.find_all(class_="bookTitle"), page.find_all(class_="authorName")):
25 |                 titles.append(title.get_text())
26 |                 authors.append(author.get_text())
27 |         return [titles, authors]
28 |
29 |
30 | if __name__ == "__main__":
31 |     # Guarded so importing this module does not trigger a scrape.
32 |     s = BestBooks(1)
33 |     print(s.books_authors())
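34 |
35 |     # A follow-on sketch (pandas is an assumption here; this module does
36 |     # not otherwise use it): load the paired lists into a DataFrame.
37 |     import pandas as pd
38 |     titles, authors = s.books_authors()
39 |     print(pd.DataFrame({'title': titles, 'author': authors}).head())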
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Specific to the project
2 | *.[oa]
3 | *~
4 | venv
5 | .ipynb_checkpoints
6 | *.pkl
7 | *.csv
8 | goodreads_metadata.txt
9 | test.py
10 | practice.py
11 | *.exe
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 |
17 | # C extensions
18 | *.so
19 |
20 | # Distribution / packaging
21 | .Python
22 | env/
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | *.egg-info/
35 | .installed.cfg
36 | *.egg
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
--------------------------------------------------------------------------------
/data/raw/times_api_call.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Calling Times API\n",
8 | "\n",
9 | "We will extract the weekly best seller's list for the last 10 years from the Times API."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import pandas as pd\n",
20 | "\n",
21 | "from RawDataCollector import TimesExtractor"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "I use the manually defined class `TimesExtractor`, which will take `start_date`, `end_date`, and `frequencies` as an argument and will make calls to the Times Best Seller Books API based on the number of weeks within those dates range."
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "year = TimesExtractor(start_date='2011-05-01', end_date='2011-05-01', frq='W')\n",
38 | "dic = year.make_call()"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "Because the Times API has a limit of maximum 20 calls per minute with 3 seconds lapses between calls it takes time for the module to make all the given calls.\n",
46 | "\n",
47 | "* In our case it took 27 minutes to make 470 calls.\n",
48 | "\n",
49 | "-----------\n",
50 | "Now I will extract only the dictionaries containing the data about bestseller books.\n",
51 | "In order to do that I use personally defined function `converter` which will extract values based on given `keys`."
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "keys = ['weeks_on_list', 'primary_isbn10', 'primary_isbn13',\n",
61 | " 'publisher', 'description', 'title', 'author']"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "def converter(dic, keys):\n",
71 | " converted = {}\n",
72 | " for n in range(0, len(dic)):\n",
73 | " for m in range(0, len(dic[n])):\n",
74 | " for k in keys:\n",
75 | " converted.setdefault(k, []).append(dic[n][m][k])\n",
76 | " return converted"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "# Calling function with extracted data from the Times API and hand picked keys\n",
86 | "result = converter(dic, keys)"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Finally, I convert the dictionary into the `pandas` dataframe before saving it."
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "scrolled": true
101 | },
102 | "outputs": [],
103 | "source": [
104 | "df = pd.DataFrame.from_dict(result)\n",
105 | "df.head()"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "I also add the column which will indicate that the book is from Bestsellers list."
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "df['best_seller'] = 'yes'"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "Let's check our DataFrame:"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "print('dimensions: ', df.shape)\n",
138 | "print()\n",
139 | "df.head()"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "We have extracted 8555 rows among seven columns from the Times API containing books that have been on their best-sellers list.\n",
147 | "\n",
148 | "Final step: I save the dataframe to the pickle file for more convenient transferring into the notebook where I will perform additional work on the dataframe."
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "# df.to_pickle('bestsellers_raw.pkl')\n",
158 | "df.to_csv('bestsellers_raw.csv', index=False)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": []
167 | }
168 | ],
169 | "metadata": {
170 | "kernelspec": {
171 | "display_name": "Python 3",
172 | "language": "python",
173 | "name": "python3"
174 | },
175 | "language_info": {
176 | "codemirror_mode": {
177 | "name": "ipython",
178 | "version": 3
179 | },
180 | "file_extension": ".py",
181 | "mimetype": "text/x-python",
182 | "name": "python",
183 | "nbconvert_exporter": "python",
184 | "pygments_lexer": "ipython3",
185 | "version": "3.7.7"
186 | }
187 | },
188 | "nbformat": 4,
189 | "nbformat_minor": 4
190 | }
191 |
--------------------------------------------------------------------------------
/data/preprocessing/bestsellers_cleaning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import re\n",
12 | "import sys"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "I define the path to the **raw** folder to use `CSVtoDF` module, which is located there."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "sys.path.insert(0, \"C:/Users/Hellrox/Desktop/Projects/Best-Seller-Books/data/raw\")\n",
29 | "from CSVtoDF import CSVtoDF"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "I load everything except _cover picture url_ column."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "with CSVtoDF(\"C:/Users/Hellrox/Desktop/Projects/Best-Seller-Books/data/raw/bestsellers_merged.csv\") as df:\n",
46 | " df['title'] = df['title1']\n",
47 | " df['author'] = df['author1']\n",
48 | " df['pages_number'] = df['pages']\n",
49 | " df['format'] = df['edition']\n",
50 | " df['genre'] = df['genres']\n",
51 | " df['score'] = df['rating']\n",
52 | " df['total_count'] = df['count']\n",
53 | "\n",
54 | "df.head()"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "# Cleaning the Dataframe\n",
62 | "\n",
63 | "I start by cleaning the genres column, which still is a text from a CSS format I scraped from Goodreads.com\n",
64 | "\n",
65 | "I start by defining function `genre_to_list` which will take `pandas` series and `regex` pattern as an argument and return list of extracted strings."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "def genre_to_list(series, pattern):\n",
75 | " return [re.findall(pattern, txt) for txt in series]\n",
76 | "\n",
77 | "genr = genre_to_list(df['genre'], r'>(\\w+)<')"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "After that I iterate through the list and join them to the original dataframe."
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "for g, i in zip(genr, range(0, len(genr))):\n",
94 | " df['genre'][i] = ', '.join(g)\n",
95 | " \n",
96 | "df.head()"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "Next comes the `pages_number` column. I strip only the digit values and convert them to the _float_."
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "df[['pages', 'drop']] = df['pages_number'].str.split(expand=True)\n",
113 | "df['pages'] = df['pages'].astype(float)\n",
114 | "df.drop(['drop', 'pages_number'], axis=1, inplace=True)\n",
115 | "df.head()"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "I strip numerical value from the `total_count`, remove commas and covnert to float value."
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "df[['count', 'drop']] = df['total_count'].str.strip().str.split('\\\\n', expand=True)\n",
132 | "df['count'] = df['count'].str.replace(',', '').astype(float)\n",
133 | "df.drop(['drop', 'total_count'], axis=1, inplace=True)\n",
134 | "df = df[['title', 'author', 'format', 'pages', 'score', 'count']]\n",
135 | "df.head()"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "I convert the `title` and `author` values into title to capitalize first letters of each word. To make for pleasently readable"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "df['title'] = df['title'].str.title()\n",
152 | "df['author'] = df['author'].str.title()\n",
153 | "df.head()"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "Let's take a look and clean version of previosu dataframe."
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "df.info()"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "Finally, I save it to the new csv file."
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "df.to_csv('bestsellers.csv')"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": []
194 | }
195 | ],
196 | "metadata": {
197 | "kernelspec": {
198 | "display_name": "Python 3",
199 | "language": "python",
200 | "name": "python3"
201 | },
202 | "language_info": {
203 | "codemirror_mode": {
204 | "name": "ipython",
205 | "version": 3
206 | },
207 | "file_extension": ".py",
208 | "mimetype": "text/x-python",
209 | "name": "python",
210 | "nbconvert_exporter": "python",
211 | "pygments_lexer": "ipython3",
212 | "version": "3.7.7"
213 | }
214 | },
215 | "nbformat": 4,
216 | "nbformat_minor": 4
217 | }
218 |
--------------------------------------------------------------------------------
/data/raw/RawDataCollector.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import requests
4 | import time
5 | import re
6 |
7 | from tqdm import tqdm
8 | from bs4 import BeautifulSoup
12 |
13 |
14 | class GoodReadsScraper():
15 | """ This module scraps the information from the Goodreads webpage.
16 | You provide a list of ISBN's as the argument and then magic happens.
17 |
18 | attributes obtained: author, title, number of pages, edition, cover_url, genres
19 | """
20 |
21 | def __init__(self, isbns):
22 | self.isbns = isbns
23 | # Iterates over the list of ISBNs
24 | # Gets HTMLs and stores as a list
25 | self.soups = [BeautifulSoup(requests.get(f'https://www.goodreads.com/book/isbn/{i}').text, 'html.parser') for i in self.isbns]
26 |
27 | def __str__(self):
28 | return f"You just scrapped {len(self.isbns)} books from the Goodreads.com!"
29 |
30 | def store_html(self):
31 | # If you want to store scraped HTML as a list
32 | return self.soups
33 |
34 | def _goodreads_scraping(self):
35 | """ This method extracts the number of pages,
36 | book's edition from accrued HTML texts.
37 |
38 | """
39 | isbn_13, pages, edition, released = [], [], [], []
40 | for s, i in zip(self.soups, self.isbns):
41 | isbn_13.append(i)
42 | pages.append(getattr(s.find(itemprop="numberOfPages"), 'text', None))
43 | released.append(getattr(s.select_one('nobr'), 'text', None))
44 | edition.append(getattr(s.find(itemprop="bookFormat"), 'text', None))
45 | return isbn_13, pages, released, edition
46 |
47 | def data_converter(self):
48 |         # Converts the previously extracted lists into a dataframe.
49 | i, p, r, e = self._goodreads_scraping()
50 | return pd.DataFrame({'isbn13': i, 'pages': p, 'released': r, 'edition': e})
51 |
52 | def _meta_scraping(self):
53 | """ This method extracts the metadata of the book.
54 |         That is, the book's title, author's full name, and top genres.
55 |
56 | """
57 | isbn_13, author, title, genre = [], [], [], []
58 | for s, i in zip(self.soups, self.isbns):
59 | isbn_13.append(i)
60 | author.append(getattr(s.find(class_='authorName'), 'text', None))
61 | title.append(getattr(s.find(class_="gr-h1 gr-h1--serif"), 'text', None))
62 | genre.append(s.find_all(class_='actionLinkLite bookPageGenreLink')[:3])
63 | return isbn_13, author, title, genre
64 |
65 | def meta_converter(self):
66 |         # Converts the previously extracted lists into a dataframe.
67 | i, a, t, g = self._meta_scraping()
68 | return pd.DataFrame({'isbn13': i, 'author': a, 'title': t, 'genres': g})
69 |
70 | def _pop_scraping(self):
71 | """
72 |         This method extracts a book's rating and ratings count.
73 |
74 | """
75 | isbn_13, rating, count = [], [], []
76 | for s, i in zip(self.soups, self.isbns):
77 | isbn_13.append(i)
78 | rating.append(getattr(s.find(itemprop="ratingValue"), 'text', None))
79 | count.append(getattr(s.find(itemprop="ratingCount"), 'text', None))
80 | return isbn_13, rating, count
81 |
82 | def pop_converter(self):
83 |         # Converts the previously extracted lists into a dataframe.
84 | i, r, c = self._pop_scraping()
85 | return pd.DataFrame({'isbn13': i, 'rating': r, 'count': c})
86 |
87 | def _cover_scraper(self):
88 | """ This method grabs cover picture URL.
89 |
90 |         This portion of the page is sometimes missing,
91 |         so it's handled with a try-except block.
92 | """
93 | isbn_13, cover_url = [], []
94 | for s, i in zip(self.soups, self.isbns):
95 | isbn_13.append(i)
96 | try:
97 | cover_url.append(s.find(id='coverImage')['src'])
98 | except TypeError:
99 | cover_url.append(np.NaN)
100 | return isbn_13, cover_url
101 |
102 | def cover_url_converter(self):
103 |         # Converts the previously extracted lists into a dataframe.
104 | i, cu = self._cover_scraper()
105 | return pd.DataFrame({'isbn13': i, 'cover_url': cu})
106 |
107 |     def description(self):
108 |         """ Extracts the description text in raw format.
109 |         Pages with no description produce a NaN entry.
110 |         """
111 |         isbn_13, descr = [], []
112 |         for s, i in zip(self.soups, self.isbns):
113 |             isbn_13.append(i)
114 |             try:
115 |                 descr.append(s.find(id='description').text)
116 |             except AttributeError:
117 |                 descr.append(np.NaN)
118 |         return pd.DataFrame({'isbn13': isbn_13, 'description': descr})
119 |
120 |
121 | class TimesExtractor:
122 | """ The purpose of the module is to extract books data from the Times API.
123 |
124 | Times has a limit on how many requests you can send, so we have to do
125 | with time lapses and then combine all the collected data into one list.
126 |
127 | """
128 | # Default api-key which can be changed.
129 | api_key = 'djDLXwAoSfreMrzYGE5iacl7GUifIRrV'
130 | # Default lapse between calls
131 | lapse = 3
132 |
133 | def __init__(self, start_date, end_date, frq):
134 | """ Please enter dates in 'yyyy-mm-dd' format.
135 |
136 | For frq -> 'D'=day; 'W'=week; 'M'=month; 'Y'=year.
137 | """
138 | self.start_date = start_date
139 | self.end_date = end_date
140 | self.frq = frq
141 | # Raise Error if entered value doesn't match the pattern.
142 | if not re.search(r"^\d{4}-\d{2}-\d{2}$", self.start_date):
143 | raise ValueError('please enter start_date in \'yyyy-mm-dd\' format')
144 | elif not re.search(r"^\d{4}-\d{2}-\d{2}$", self.end_date):
145 | raise ValueError('please enter end_date in \'yyyy-mm-dd\' format')
146 | elif self.frq not in ('D', 'W', 'M', 'Y'):
147 | raise ValueError('please enter frequency in correct format')
150 |
151 | def __str__(self):
152 | return f"The bestellers list is from {self.start_date} to {self.end_date}"
153 |
154 | def _datesrange(self):
155 | if self.start_date >= self.end_date:
156 | return [self.end_date]
157 | else:
158 | dates = [d for d in pd.date_range(start=self.start_date,
159 | end=self.end_date,
160 | freq=self.frq).strftime('%Y-%m-%d')]
161 | return dates
162 |
163 | @classmethod
164 | def key(cls, key):
165 | cls.api_key = key
166 |
167 | @classmethod
168 | def seconds(cls, second):
169 | cls.lapse = second
170 |
171 | def make_call(self):
172 | """ This function iterates through the dates and sends the request
173 | to the Times API for the given set of dates.
174 |
175 | Next, it returns all the dictionaries as a combined list.
176 | """
177 | super_list = []
178 | print(f'Due to API\'s limitation, there is a {self.lapse} second lapse between calls')
179 |         for date in tqdm(self._datesrange()):
180 |             # The Times API has a daily call limit and requires
181 |             # intervals between calls, so we sleep before each request.
182 |             time.sleep(self.lapse)
187 | print(date)
188 | res = requests.get(f"https://api.nytimes.com/svc/books/v3/lists/{date}/combined-print-and-e-book-fiction.json?",
189 | params = {'api-key': self.api_key}).json()
190 | super_list.append(res['results']['books'])
191 | return super_list
192 |
193 |
194 | if __name__ == "__main__":
195 |     print('This module is intended to be used within a Jupyter notebook...')
196 |
197 |
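198 | # A minimal usage sketch (assumes network access and a valid NYT API key;
199 | # the date range and ISBN below are illustrative):
200 | #
201 | #   extractor = TimesExtractor(start_date='2011-01-02', end_date='2011-03-01', frq='W')
202 | #   weekly_lists = extractor.make_call()
203 | #
204 | #   scraper = GoodReadsScraper(['9780307588388'])
205 | #   books_df = scraper.data_converter().merge(scraper.pop_converter(), on='isbn13')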
--------------------------------------------------------------------------------
/expl/bestsellers_EDA.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "import seaborn as sns"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 23,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/html": [
23 | "
\n",
24 | "\n",
37 | "
\n",
38 | " \n",
39 | " \n",
40 | " | \n",
41 | " title | \n",
42 | " author | \n",
43 | " format | \n",
44 | " pages | \n",
45 | " score | \n",
46 | " count | \n",
47 | "
\n",
48 | " \n",
49 | " \n",
50 | " \n",
51 | " | 0 | \n",
52 | " Water For Elephants | \n",
53 | " Sara Gruen | \n",
54 | " ebook | \n",
55 | " 297.0 | \n",
56 | " 4.09 | \n",
57 | " 1340594.0 | \n",
58 | "
\n",
59 | " \n",
60 | " | 1 | \n",
61 | " Chasing Fire | \n",
62 | " Nora Roberts | \n",
63 | " ebook | \n",
64 | " 480.0 | \n",
65 | " 4.12 | \n",
66 | " 34591.0 | \n",
67 | "
\n",
68 | " \n",
69 | " | 2 | \n",
70 | " The Lincoln Lawyer | \n",
71 | " Michael Connelly | \n",
72 | " ebook | \n",
73 | " 528.0 | \n",
74 | " 4.16 | \n",
75 | " 186048.0 | \n",
76 | "
\n",
77 | " \n",
78 | " | 3 | \n",
79 | " The Fifth Witness | \n",
80 | " Michael Connelly | \n",
81 | " ebook | \n",
82 | " 448.0 | \n",
83 | " 4.19 | \n",
84 | " 54599.0 | \n",
85 | "
\n",
86 | " \n",
87 | " | 4 | \n",
88 | " The Help | \n",
89 | " Kathryn Stockett | \n",
90 | " Paperback | \n",
91 | " 544.0 | \n",
92 | " 4.47 | \n",
93 | " 2079528.0 | \n",
94 | "
\n",
95 | " \n",
96 | "
\n",
97 | "
"
98 | ],
99 | "text/plain": [
100 | " title author format pages score count\n",
101 | "0 Water For Elephants Sara Gruen ebook 297.0 4.09 1340594.0\n",
102 | "1 Chasing Fire Nora Roberts ebook 480.0 4.12 34591.0\n",
103 | "2 The Lincoln Lawyer Michael Connelly ebook 528.0 4.16 186048.0\n",
104 | "3 The Fifth Witness Michael Connelly ebook 448.0 4.19 54599.0\n",
105 | "4 The Help Kathryn Stockett Paperback 544.0 4.47 2079528.0"
106 | ]
107 | },
108 | "execution_count": 23,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "df = pd.read_csv('C:/Users/Hellrox/Desktop/Projects/Best-Seller-Books/data/preprocessing/bestsellers.csv',\n",
115 | " index_col=[0])\n",
116 | "df.head()"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {},
122 | "source": [
123 | "Graphs"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 24,
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "data": {
133 | "text/html": [
134 | "\n",
135 | "\n",
148 | "
\n",
149 | " \n",
150 | " \n",
151 | " | \n",
152 | " count | \n",
153 | " pages | \n",
154 | " score | \n",
155 | "
\n",
156 | " \n",
157 | " | format | \n",
158 | " | \n",
159 | " | \n",
160 | " | \n",
161 | "
\n",
162 | " \n",
163 | " \n",
164 | " \n",
165 | " | Audiobook | \n",
166 | " 557956.500000 | \n",
167 | " 324.000000 | \n",
168 | " 4.195000 | \n",
169 | "
\n",
170 | " \n",
171 | " | Edición Kindle | \n",
172 | " 53068.600000 | \n",
173 | " 332.400000 | \n",
174 | " 4.120000 | \n",
175 | "
\n",
176 | " \n",
177 | " | Hardcover | \n",
178 | " 65059.028200 | \n",
179 | " 407.181223 | \n",
180 | " 3.964403 | \n",
181 | "
\n",
182 | " \n",
183 | " | Kindle Edition | \n",
184 | " 6363.500000 | \n",
185 | " 475.636364 | \n",
186 | " 4.075000 | \n",
187 | "
\n",
188 | " \n",
189 | " | Mass Market Paperback | \n",
190 | " 115290.131148 | \n",
191 | " 453.383333 | \n",
192 | " 4.046393 | \n",
193 | "
\n",
194 | " \n",
195 | " | Nook | \n",
196 | " 27151.636364 | \n",
197 | " 258.500000 | \n",
198 | " 3.927273 | \n",
199 | "
\n",
200 | " \n",
201 | " | Paperback | \n",
202 | " 295383.494382 | \n",
203 | " 426.359551 | \n",
204 | " 3.994888 | \n",
205 | "
\n",
206 | " \n",
207 | " | Trade Paperback | \n",
208 | " 630470.000000 | \n",
209 | " 254.000000 | \n",
210 | " 3.970000 | \n",
211 | "
\n",
212 | " \n",
213 | " | Unknown Binding | \n",
214 | " 4521.578947 | \n",
215 | " NaN | \n",
216 | " 4.032632 | \n",
217 | "
\n",
218 | " \n",
219 | " | eBook | \n",
220 | " 151774.000000 | \n",
221 | " 432.000000 | \n",
222 | " 4.020000 | \n",
223 | "
\n",
224 | " \n",
225 | " | eBook Kindle | \n",
226 | " 29939.600000 | \n",
227 | " 430.800000 | \n",
228 | " 4.182000 | \n",
229 | "
\n",
230 | " \n",
231 | " | ebook | \n",
232 | " 53692.752699 | \n",
233 | " 384.626810 | \n",
234 | " 4.017154 | \n",
235 | "
\n",
236 | " \n",
237 | " | mp3 Audiobook | \n",
238 | " 90080.000000 | \n",
239 | " NaN | \n",
240 | " 3.820000 | \n",
241 | "
\n",
242 | " \n",
243 | "
\n",
244 | "
"
245 | ],
246 | "text/plain": [
247 | " count pages score\n",
248 | "format \n",
249 | "Audiobook 557956.500000 324.000000 4.195000\n",
250 | "Edición Kindle 53068.600000 332.400000 4.120000\n",
251 | "Hardcover 65059.028200 407.181223 3.964403\n",
252 | "Kindle Edition 6363.500000 475.636364 4.075000\n",
253 | "Mass Market Paperback 115290.131148 453.383333 4.046393\n",
254 | "Nook 27151.636364 258.500000 3.927273\n",
255 | "Paperback 295383.494382 426.359551 3.994888\n",
256 | "Trade Paperback 630470.000000 254.000000 3.970000\n",
257 | "Unknown Binding 4521.578947 NaN 4.032632\n",
258 | "eBook 151774.000000 432.000000 4.020000\n",
259 | "eBook Kindle 29939.600000 430.800000 4.182000\n",
260 | "ebook 53692.752699 384.626810 4.017154\n",
261 | "mp3 Audiobook 90080.000000 NaN 3.820000"
262 | ]
263 | },
264 | "execution_count": 24,
265 | "metadata": {},
266 | "output_type": "execute_result"
267 | }
268 | ],
269 | "source": [
270 | "pd.pivot_table(df, index='format', aggfunc=np.mean)"
271 | ]
272 | },
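273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "A minimal plotting sketch (an illustrative addition; it uses only `matplotlib`, which is already imported above):"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "# Distribution of average Goodreads scores among the bestsellers\n",
287 | "plt.hist(df['score'].dropna(), bins=20)\n",
288 | "plt.xlabel('Goodreads score')\n",
289 | "plt.ylabel('Number of books')\n",
290 | "plt.show()"
291 | ]
292 | },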
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": []
279 | }
280 | ],
281 | "metadata": {
282 | "kernelspec": {
283 | "display_name": "Python 3",
284 | "language": "python",
285 | "name": "python3"
286 | },
287 | "language_info": {
288 | "codemirror_mode": {
289 | "name": "ipython",
290 | "version": 3
291 | },
292 | "file_extension": ".py",
293 | "mimetype": "text/x-python",
294 | "name": "python",
295 | "nbconvert_exporter": "python",
296 | "pygments_lexer": "ipython3",
297 | "version": "3.7.7"
298 | }
299 | },
300 | "nbformat": 4,
301 | "nbformat_minor": 4
302 | }
303 |
--------------------------------------------------------------------------------
/data/raw/bestsellers_generating.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Generating ISBNs and Metadata"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import re\n",
17 | "import pandas as pd\n",
18 | "import numpy as np\n",
19 | "\n",
20 | "from CSVtoDF import CSVtoDF"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "I will use personally defined module `CSVtoDF`, which will temporary open csv file with with statement and only load manually picked columns and drop the rest and close the file."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/html": [
38 | "\n",
39 | "\n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " | \n",
56 | " isbn13 | \n",
57 | " title1 | \n",
58 | " author1 | \n",
59 | " weeks | \n",
60 | "
\n",
61 | " \n",
62 | " \n",
63 | " \n",
64 | " | 0 | \n",
65 | " 9781616200817 | \n",
66 | " water for elephants | \n",
67 | " sara gruen | \n",
68 | " 12 | \n",
69 | "
\n",
70 | " \n",
71 | " | 1 | \n",
72 | " 9781101513781 | \n",
73 | " chasing fire | \n",
74 | " nora roberts | \n",
75 | " 1 | \n",
76 | "
\n",
77 | " \n",
78 | " | 2 | \n",
79 | " 9780759514713 | \n",
80 | " the lincoln lawyer | \n",
81 | " michael connelly | \n",
82 | " 6 | \n",
83 | "
\n",
84 | " \n",
85 | " | 3 | \n",
86 | " 9780316069380 | \n",
87 | " the fifth witness | \n",
88 | " michael connelly | \n",
89 | " 2 | \n",
90 | "
\n",
91 | " \n",
92 | " | 4 | \n",
93 | " 9780425232200 | \n",
94 | " the help | \n",
95 | " kathryn stockett | \n",
96 | " 9 | \n",
97 | "
\n",
98 | " \n",
99 | "
\n",
100 | "
"
101 | ],
102 | "text/plain": [
103 | " isbn13 title1 author1 weeks\n",
104 | "0 9781616200817 water for elephants sara gruen 12\n",
105 | "1 9781101513781 chasing fire nora roberts 1\n",
106 | "2 9780759514713 the lincoln lawyer michael connelly 6\n",
107 | "3 9780316069380 the fifth witness michael connelly 2\n",
108 | "4 9780425232200 the help kathryn stockett 9"
109 | ]
110 | },
111 | "execution_count": 2,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "with CSVtoDF('bestsellers_raw.csv') as df:\n",
118 | " df['isbn13'] = df['primary_isbn13']\n",
119 | " df['title1'] = df['title'].str.lower()\n",
120 | " df['author1'] = df['author'].str.lower()\n",
121 | " df['weeks'] = df['weeks_on_list']\n",
122 | "df.head()"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "Actually we don't need ISBN10 as ISBN13 can better do the job.\n",
130 | "\n",
131 | "We drop the `isbn` column and keep the remaining 3."
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 3,
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "\n",
144 | "\n",
145 | "RangeIndex: 8555 entries, 0 to 8554\n",
146 | "Data columns (total 4 columns):\n",
147 | " # Column Non-Null Count Dtype \n",
148 | "--- ------ -------------- ----- \n",
149 | " 0 isbn13 8552 non-null object\n",
150 | " 1 title1 8555 non-null object\n",
151 | " 2 author1 8555 non-null object\n",
152 | " 3 weeks 8555 non-null int64 \n",
153 | "dtypes: int64(1), object(3)\n",
154 | "memory usage: 267.5+ KB\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "df = df[['isbn13', 'title1', 'author1', 'weeks']]\n",
160 | "print()\n",
161 | "df.info()"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "We see that there are some _Null_ values in our dataframe.\n",
169 | "But at this moment we are only interested with ISBN13 column, as the values found in that column will be used as a indexes to connect to the goodreads webpage for scraping."
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 4,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "data": {
179 | "text/html": [
180 | "\n",
181 | "\n",
194 | "
\n",
195 | " \n",
196 | " \n",
197 | " | \n",
198 | " isbn13 | \n",
199 | " title1 | \n",
200 | " author1 | \n",
201 | " weeks | \n",
202 | "
\n",
203 | " \n",
204 | " \n",
205 | " \n",
206 | " | 143 | \n",
207 | " NaN | \n",
208 | " summer secrets | \n",
209 | " barbara freethy | \n",
210 | " 1 | \n",
211 | "
\n",
212 | " \n",
213 | " | 169 | \n",
214 | " NaN | \n",
215 | " summer secrets | \n",
216 | " barbara freethy | \n",
217 | " 2 | \n",
218 | "
\n",
219 | " \n",
220 | " | 196 | \n",
221 | " NaN | \n",
222 | " summer secrets | \n",
223 | " barbara freethy | \n",
224 | " 0 | \n",
225 | "
\n",
226 | " \n",
227 | "
\n",
228 | "
"
229 | ],
230 | "text/plain": [
231 | " isbn13 title1 author1 weeks\n",
232 | "143 NaN summer secrets barbara freethy 1\n",
233 | "169 NaN summer secrets barbara freethy 2\n",
234 | "196 NaN summer secrets barbara freethy 0"
235 | ]
236 | },
237 | "execution_count": 4,
238 | "metadata": {},
239 | "output_type": "execute_result"
240 | }
241 | ],
242 | "source": [
243 | "df1 = df[df['isbn13'].isna()]\n",
244 | "df1"
245 | ]
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {},
250 | "source": [
251 | "Fortunatelly, Summer Secrets by Barbara Freethy is the only book which has no ISBN13 information.\n",
252 | "I will simply replace the empty value with ISBN number."
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 5,
258 | "metadata": {},
259 | "outputs": [
260 | {
261 | "name": "stdout",
262 | "output_type": "stream",
263 | "text": [
264 | "\n",
265 | "(8555, 4)\n",
266 | "\n",
267 | "RangeIndex: 8555 entries, 0 to 8554\n",
268 | "Data columns (total 4 columns):\n",
269 | " # Column Non-Null Count Dtype \n",
270 | "--- ------ -------------- ----- \n",
271 | " 0 isbn13 8555 non-null object\n",
272 | " 1 title1 8555 non-null object\n",
273 | " 2 author1 8555 non-null object\n",
274 | " 3 weeks 8555 non-null int64 \n",
275 | "dtypes: int64(1), object(3)\n",
276 | "memory usage: 267.5+ KB\n"
277 | ]
278 | }
279 | ],
280 | "source": [
281 | "df['isbn13'].replace(np.NaN, 'B003K15AKQ', inplace=True)\n",
282 | "print()\n",
283 | "print(df.shape)\n",
284 | "df.info()"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "Next we have to see at the duplicate values in our dataframe.\n",
292 | "\n",
293 | "First I combine Author's name, Title and ISBN13 number into one column. This way we will check for absolute duplicates (books that are exactly same editions, otherwise even if the same book is published with different cover or as revised version its ISBN will be changed)."
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 6,
299 | "metadata": {},
300 | "outputs": [
301 | {
302 | "name": "stdout",
303 | "output_type": "stream",
304 | "text": [
305 | "\n"
306 | ]
307 | },
308 | {
309 | "data": {
310 | "text/plain": [
311 | "0 sara gruen water for elephants 9781616200817\n",
312 | "1 nora roberts chasing fire 9781101513781\n",
313 | "2 michael connelly the lincoln lawyer 9780759514713\n",
314 | "3 michael connelly the fifth witness 9780316069380\n",
315 | "4 kathryn stockett the help 9780425232200\n",
316 | "Name: author_title_isbn13, dtype: object"
317 | ]
318 | },
319 | "execution_count": 6,
320 | "metadata": {},
321 | "output_type": "execute_result"
322 | }
323 | ],
324 | "source": [
325 | "df['author_title_isbn13'] = df['author1'] + ' ' + df['title1'] + ' ' + df['isbn13']\n",
326 | "print()\n",
327 | "df['author_title_isbn13'].head()"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 7,
333 | "metadata": {
334 | "scrolled": true
335 | },
336 | "outputs": [
337 | {
338 | "name": "stdout",
339 | "output_type": "stream",
340 | "text": [
341 | "\n",
342 | "author_title_isbn13\n",
343 | "gillian flynn gone girl 9780307588388 112\n",
344 | "george rr martin a game of thrones 9780553897845 77\n",
345 | "kristin hannah the nightingale 9781466850606 74\n",
346 | "e l james fifty shades of grey 9781612130293 71\n",
347 | "e l james fifty shades darker 9781612130590 70\n",
348 | " ... \n",
349 | "kristen ashley walk through fire 9781455533244 1\n",
350 | "kristen ashley the will A00B00HYIF9FW 1\n",
351 | "kristen ashley the slow burn A00B07P1HZQDH 1\n",
352 | "kristen ashley the promise A00B00JXW6GFE 1\n",
353 | "jo nesbo the son 9780385351386 1\n",
354 | "Length: 3084, dtype: int64\n"
355 | ]
356 | }
357 | ],
358 | "source": [
359 | "dups = df.pivot_table(index=['author_title_isbn13'], aggfunc='size')\n",
360 | "print()\n",
361 | "print(dups.sort_values(ascending=False))"
362 | ]
363 | },
364 | {
365 | "cell_type": "markdown",
366 | "metadata": {},
367 | "source": [
368 | "Now we can observe that Gone Girl and GOT and some other books appear several times in out df. That is because as mentioned earlier some books have been on the bestseller list for tens of weeks and their data came along with each week they have been featured.\n",
369 | "\n",
370 | "We will drop those duplicates using `pandas` `drop_duplicates`."
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 8,
376 | "metadata": {},
377 | "outputs": [
378 | {
379 | "data": {
380 | "text/plain": [
381 | "(3084, 3)"
382 | ]
383 | },
384 | "execution_count": 8,
385 | "metadata": {},
386 | "output_type": "execute_result"
387 | }
388 | ],
389 | "source": [
390 | "df.drop_duplicates('author_title_isbn13', keep='last', ignore_index=True, inplace=True)\n",
391 | "df = df[['title1', 'author1', 'isbn13']]\n",
392 | "df.shape"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "metadata": {},
398 | "source": [
399 | "We also observed in a previous cell that some books have ASIN code instead of ISBN, this can be issue as Goodreads can't identify books based on ASIN. So I'll filter them out as well."
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 9,
405 | "metadata": {},
406 | "outputs": [
407 | {
408 | "name": "stdout",
409 | "output_type": "stream",
410 | "text": [
411 | "149\n",
412 | "2935\n"
413 | ]
414 | }
415 | ],
416 | "source": [
417 | "aisbn = []\n",
418 | "i13 = []\n",
419 | "\n",
420 | "for i in list(df['isbn13']):\n",
421 | " if re.search(r'^[\\dB]+', i):\n",
422 | " i13.append(i)\n",
423 | " else:\n",
424 | " aisbn.append(i)\n",
425 | " \n",
426 | "print(len(aisbn))\n",
427 | "print(len(i13))"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 10,
433 | "metadata": {},
434 | "outputs": [
435 | {
436 | "data": {
437 | "text/plain": [
438 | "(2935, 3)"
439 | ]
440 | },
441 | "execution_count": 10,
442 | "metadata": {},
443 | "output_type": "execute_result"
444 | }
445 | ],
446 | "source": [
447 | "df_final = df[df['isbn13'].str.contains(r'^[\\dB]+')]\n",
448 | "df_final.shape"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "We are down to wooping 2935 entries, but this are original titles which can be used for further analysis.\n",
456 | "\n",
457 | "We see that there are no more `NaN` values in `isbn13` column and no more duplicates, thus we can proceed and extract it as a list for web scraping."
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": null,
463 | "metadata": {},
464 | "outputs": [],
465 | "source": []
466 | }
467 | ],
468 | "metadata": {
469 | "kernelspec": {
470 | "display_name": "Python 3",
471 | "language": "python",
472 | "name": "python3"
473 | },
474 | "language_info": {
475 | "codemirror_mode": {
476 | "name": "ipython",
477 | "version": 3
478 | },
479 | "file_extension": ".py",
480 | "mimetype": "text/x-python",
481 | "name": "python",
482 | "nbconvert_exporter": "python",
483 | "pygments_lexer": "ipython3",
484 | "version": "3.7.7"
485 | }
486 | },
487 | "nbformat": 4,
488 | "nbformat_minor": 4
489 | }
490 |
--------------------------------------------------------------------------------
/data/raw/bestsellers_combining.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Scraping Data from Goodreads.com"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "\n",
19 | "from RawDataCollector import GoodReadsScraper"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "First I import another jupyter notebook to have list of ISBNs and metadata about books"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "name": "stdout",
36 | "output_type": "stream",
37 | "text": [
38 | "\n",
39 | "\n",
40 | "RangeIndex: 8555 entries, 0 to 8554\n",
41 | "Data columns (total 4 columns):\n",
42 | " # Column Non-Null Count Dtype \n",
43 | "--- ------ -------------- ----- \n",
44 | " 0 isbn13 8552 non-null object\n",
45 | " 1 title1 8555 non-null object\n",
46 | " 2 author1 8555 non-null object\n",
47 | " 3 weeks 8555 non-null int64 \n",
48 | "dtypes: int64(1), object(3)\n",
49 | "memory usage: 267.5+ KB\n",
50 | "\n",
51 | "(8555, 4)\n",
52 | "\n",
53 | "RangeIndex: 8555 entries, 0 to 8554\n",
54 | "Data columns (total 4 columns):\n",
55 | " # Column Non-Null Count Dtype \n",
56 | "--- ------ -------------- ----- \n",
57 | " 0 isbn13 8555 non-null object\n",
58 | " 1 title1 8555 non-null object\n",
59 | " 2 author1 8555 non-null object\n",
60 | " 3 weeks 8555 non-null int64 \n",
61 | "dtypes: int64(1), object(3)\n",
62 | "memory usage: 267.5+ KB\n",
63 | "\n",
64 | "\n",
65 | "author_title_isbn13\n",
66 | "gillian flynn gone girl 9780307588388 112\n",
67 | "george rr martin a game of thrones 9780553897845 77\n",
68 | "kristin hannah the nightingale 9781466850606 74\n",
69 | "e l james fifty shades of grey 9781612130293 71\n",
70 | "e l james fifty shades darker 9781612130590 70\n",
71 | " ... \n",
72 | "kristen ashley walk through fire 9781455533244 1\n",
73 | "kristen ashley the will A00B00HYIF9FW 1\n",
74 | "kristen ashley the slow burn A00B07P1HZQDH 1\n",
75 | "kristen ashley the promise A00B00JXW6GFE 1\n",
76 | "jo nesbo the son 9780385351386 1\n",
77 | "Length: 3084, dtype: int64\n",
78 | "149\n",
79 | "2935\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 | "# Importing df_final from other notebook\n",
85 | "%run \"..\\raw\\bestsellers_generating.ipynb\""
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "With available list of ISBNs and Dataframe with books metadata I start:\n",
93 | "### Web Scraping"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "First we check if _bestsellers_ dataframe with _ISBNs_ is defined and is not empty.\n",
101 | "Otherwise raise the `ValueError`."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 3,
107 | "metadata": {},
108 | "outputs": [
109 | {
110 | "name": "stdout",
111 | "output_type": "stream",
112 | "text": [
113 | "All cool!\n"
114 | ]
115 | }
116 | ],
117 | "source": [
118 | "if df_final is not None:\n",
119 | " df1 = df_final\n",
120 | " isbns = df_final['isbn13']\n",
121 | " print('All cool!')\n",
122 | "else:\n",
123 | " raise ValueError('variables not found...')"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "If everything is in order we proceed scraping additional data from the Goodreads.com"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 4,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "name": "stdout",
140 | "output_type": "stream",
141 | "text": [
142 | "You just scrapped 5 books from the Goodreads.com!\n"
143 | ]
144 | }
145 | ],
146 | "source": [
147 | "scrapped = GoodReadsScraper(isbns)\n",
148 | "html_list = scrapped.store_html()\n",
149 | "print(scrapped)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "I instantiate `GoodReadsScraper`, personal module, which takes isbn numbers and scraps data for each book and stores html text as a list.\n",
157 | "\n",
158 | "This will take some time as we are performing thousands of calls."
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "After scrapping is done we extract **number of pages, edition, cover picture url** and **genres** and convert them into `pandas` dataframe."
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 5,
171 | "metadata": {},
172 | "outputs": [
173 | {
174 | "name": "stdout",
175 | "output_type": "stream",
176 | "text": [
177 | "(5, 4)\n"
178 | ]
179 | }
180 | ],
181 | "source": [
182 | "df2 = scrapped.data_converter()\n",
183 | "print(df2.shape)"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 7,
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "name": "stdout",
193 | "output_type": "stream",
194 | "text": [
195 | "(5, 2)\n"
196 | ]
197 | }
198 | ],
199 | "source": [
200 | "df3 = scrapped.cover_url_converter()\n",
201 | "print(df3.shape)"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 8,
207 | "metadata": {},
208 | "outputs": [
209 | {
210 | "name": "stdout",
211 | "output_type": "stream",
212 | "text": [
213 | "(5, 3)\n"
214 | ]
215 | }
216 | ],
217 | "source": [
218 | "df4 = scrapped.pop_converter()\n",
219 | "print(df4.shape)"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 9,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "data": {
229 | "text/html": [
230 | "\n",
231 | "\n",
244 | "
\n",
245 | " \n",
246 | " \n",
247 | " | \n",
248 | " isbn13 | \n",
249 | " description | \n",
250 | "
\n",
251 | " \n",
252 | " \n",
253 | " \n",
254 | " | 0 | \n",
255 | " 9780345541444 | \n",
256 | " \\nAt nearly one hundred years old, Thalia Mars... | \n",
257 | "
\n",
258 | " \n",
259 | "
\n",
260 | "
"
261 | ],
262 | "text/plain": [
263 | " isbn13 description\n",
264 | "0 9780345541444 \\nAt nearly one hundred years old, Thalia Mars..."
265 | ]
266 | },
267 | "execution_count": 9,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "df6 = scrapped.description()\n",
274 | "df6"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "After all neccessary methods are executed we have four different dataframes, which we will join on _ISBN13_ number and merge into one."
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 11,
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "data": {
291 | "text/html": [
292 | "\n",
293 | "\n",
306 | "
\n",
307 | " \n",
308 | " \n",
309 | " | \n",
310 | " title1 | \n",
311 | " author1 | \n",
312 | " isbn13 | \n",
313 | " pages | \n",
314 | " released | \n",
315 | " edition | \n",
316 | " cover_url | \n",
317 | " rating | \n",
318 | " count | \n",
319 | "
\n",
320 | " \n",
321 | " \n",
322 | " \n",
323 | " | 0 | \n",
324 | " never never | \n",
325 | " james patterson and candice fox | \n",
326 | " 9780316433174 | \n",
327 | " 363 pages | \n",
328 | " \\n (first published August 25th 2... | \n",
329 | " Hardcover | \n",
330 | " https://i.gr-assets.com/images/S/compressed.ph... | \n",
331 | " \\n 3.62\\n | \n",
332 | " \\n 13,917\\n ratings\\n | \n",
333 | "
\n",
334 | " \n",
335 | " | 1 | \n",
336 | " devil in spring | \n",
337 | " lisa kleypas | \n",
338 | " 9780062371904 | \n",
339 | " 384 pages | \n",
340 | " \\n —\\n 37 likes\\n | \n",
341 | " ebook | \n",
342 | " https://i.gr-assets.com/images/S/compressed.ph... | \n",
343 | " \\n 4.08\\n | \n",
344 | " \\n 22,036\\n ratings\\n | \n",
345 | "
\n",
346 | " \n",
347 | " | 2 | \n",
348 | " aftermath:: empire's end | \n",
349 | " chuck wendig | \n",
350 | " 9781101966969 | \n",
351 | " 423 pages | \n",
352 | " \\n —\\n 7 likes\\n | \n",
353 | " Hardcover | \n",
354 | " https://i.gr-assets.com/images/S/compressed.ph... | \n",
355 | " \\n 3.79\\n | \n",
356 | " \\n 9,247\\n ratings\\n | \n",
357 | "
\n",
358 | " \n",
359 | " | 3 | \n",
360 | " echoes in death | \n",
361 | " j d robb | \n",
362 | " 9781250123145 | \n",
363 | " 400 pages | \n",
364 | " \\n —\\n 12 likes\\n | \n",
365 | " ebook | \n",
366 | " https://i.gr-assets.com/images/S/compressed.ph... | \n",
367 | " \\n 4.41\\n | \n",
368 | " \\n 19,576\\n ratings\\n | \n",
369 | "
\n",
370 | " \n",
371 | " | 4 | \n",
372 | " heartbreak hotel | \n",
373 | " jonathan kellerman | \n",
374 | " 9780345541444 | \n",
375 | " 325 pages | \n",
376 | " \\n —\\n 0 likes\\n | \n",
377 | " Nook | \n",
378 | " https://i.gr-assets.com/images/S/compressed.ph... | \n",
379 | " \\n 3.84\\n | \n",
380 | " \\n 11,852\\n ratings\\n | \n",
381 | "
\n",
382 | " \n",
383 | "
\n",
384 | "
"
385 | ],
386 | "text/plain": [
387 | " title1 author1 isbn13 \\\n",
388 | "0 never never james patterson and candice fox 9780316433174 \n",
389 | "1 devil in spring lisa kleypas 9780062371904 \n",
390 | "2 aftermath:: empire's end chuck wendig 9781101966969 \n",
391 | "3 echoes in death j d robb 9781250123145 \n",
392 | "4 heartbreak hotel jonathan kellerman 9780345541444 \n",
393 | "\n",
394 | " pages released edition \\\n",
395 | "0 363 pages \\n (first published August 25th 2... Hardcover \n",
396 | "1 384 pages \\n —\\n 37 likes\\n ebook \n",
397 | "2 423 pages \\n —\\n 7 likes\\n Hardcover \n",
398 | "3 400 pages \\n —\\n 12 likes\\n ebook \n",
399 | "4 325 pages \\n —\\n 0 likes\\n Nook \n",
400 | "\n",
401 | " cover_url rating \\\n",
402 | "0 https://i.gr-assets.com/images/S/compressed.ph... \\n 3.62\\n \n",
403 | "1 https://i.gr-assets.com/images/S/compressed.ph... \\n 4.08\\n \n",
404 | "2 https://i.gr-assets.com/images/S/compressed.ph... \\n 3.79\\n \n",
405 | "3 https://i.gr-assets.com/images/S/compressed.ph... \\n 4.41\\n \n",
406 | "4 https://i.gr-assets.com/images/S/compressed.ph... \\n 3.84\\n \n",
407 | "\n",
408 | " count \n",
409 | "0 \\n 13,917\\n ratings\\n \n",
410 | "1 \\n 22,036\\n ratings\\n \n",
411 | "2 \\n 9,247\\n ratings\\n \n",
412 | "3 \\n 19,576\\n ratings\\n \n",
413 | "4 \\n 11,852\\n ratings\\n "
414 | ]
415 | },
416 | "execution_count": 11,
417 | "metadata": {},
418 | "output_type": "execute_result"
419 | }
420 | ],
421 | "source": [
422 | "result = df1.merge(df2, on='isbn13').merge(df3, on='isbn13').merge(df4, on='isbn13')\n",
423 | "result.tail()"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 12,
429 | "metadata": {},
430 | "outputs": [
431 | {
432 | "data": {
433 | "text/plain": [
434 | "(5, 9)"
435 | ]
436 | },
437 | "execution_count": 12,
438 | "metadata": {},
439 | "output_type": "execute_result"
440 | }
441 | ],
442 | "source": [
443 | "result.shape"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 13,
449 | "metadata": {},
450 | "outputs": [
451 | {
452 | "name": "stdout",
453 | "output_type": "stream",
454 | "text": [
455 | "\n",
456 | "Int64Index: 5 entries, 0 to 4\n",
457 | "Data columns (total 9 columns):\n",
458 | " # Column Non-Null Count Dtype \n",
459 | "--- ------ -------------- ----- \n",
460 | " 0 title1 5 non-null object\n",
461 | " 1 author1 5 non-null object\n",
462 | " 2 isbn13 5 non-null object\n",
463 | " 3 pages 5 non-null object\n",
464 | " 4 released 5 non-null object\n",
465 | " 5 edition 5 non-null object\n",
466 | " 6 cover_url 5 non-null object\n",
467 | " 7 rating 5 non-null object\n",
468 | " 8 count 5 non-null object\n",
469 | "dtypes: object(9)\n",
470 | "memory usage: 400.0+ bytes\n"
471 | ]
472 | }
473 | ],
474 | "source": [
475 | "result.info()"
476 | ]
477 | },
478 | {
479 | "cell_type": "markdown",
480 | "metadata": {},
481 | "source": [
482 | "I have some missing values but that's ok.\n",
483 | "\n",
484 | "Finally I save data as a pickle file."
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "# result.to_csv('bestsellers_merged.csv', index=False)"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": [
502 | "# result.to_pickle('complete_bestsellers.pkl')"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": null,
508 | "metadata": {},
509 | "outputs": [],
510 | "source": []
511 | }
512 | ],
513 | "metadata": {
514 | "kernelspec": {
515 | "display_name": "Python 3",
516 | "language": "python",
517 | "name": "python3"
518 | },
519 | "language_info": {
520 | "codemirror_mode": {
521 | "name": "ipython",
522 | "version": 3
523 | },
524 | "file_extension": ".py",
525 | "mimetype": "text/x-python",
526 | "name": "python",
527 | "nbconvert_exporter": "python",
528 | "pygments_lexer": "ipython3",
529 | "version": "3.7.7"
530 | }
531 | },
532 | "nbformat": 4,
533 | "nbformat_minor": 4
534 | }
535 |
--------------------------------------------------------------------------------