├── cleaning.py
├── data
│   ├── README.md
│   ├── preprocessing
│   │   └── bestsellers_cleaning.ipynb
│   └── raw
│       ├── CSVtoDF.py
│       ├── RawDataCollector.py
│       ├── best_books_scraper.py
│       ├── bestsellers_combining.ipynb
│       ├── bestsellers_generating.ipynb
│       └── times_api_call.ipynb
├── expl
│   └── bestsellers_EDA.ipynb
├── README.md
├── requirements.txt
└── .gitignore
/cleaning.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | Data pipeline for the project: the **raw** folder collects weekly bestseller lists from the NYT Books API and scrapes book details from Goodreads.com; the **preprocessing** folder cleans the merged result into `bestsellers.csv`.
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Best-Seller-Books
2 | Analysis of *The New York Times* best-selling books: weekly lists are pulled from the NYT Books API, enriched with metadata scraped from Goodreads.com, cleaned, and explored.
3 |
4 | ## Project layout
5 | * `data/raw`: API calls (`times_api_call.ipynb`), Goodreads scraping (`RawDataCollector.py`, `best_books_scraper.py`), and the notebooks that generate and combine the raw data.
6 | * `data/preprocessing`: `bestsellers_cleaning.ipynb` cleans the merged data into `bestsellers.csv`.
7 | * `expl`: exploratory data analysis (`bestsellers_EDA.ipynb`).
8 |
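9 | ## Quickstart
10 | A minimal setup sketch (assumes Python 3.7 and Jupyter are available):
11 |
12 | ```
13 | pip install -r requirements.txt
14 | jupyter notebook data/raw/times_api_call.ipynb
15 | ```
16 |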
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.9.1
2 | bs4==0.0.1
3 | certifi==2020.4.5.1
4 | chardet==3.0.4
5 | cycler==0.10.0
6 | DateTime==4.3
7 | idna==2.9
8 | kiwisolver==1.2.0
9 | matplotlib==3.2.1
10 | numpy==1.18.4
11 | pandas==1.0.3
12 | pyparsing==2.4.7
13 | python-dateutil==2.8.1
14 | pytz==2020.1
15 | requests==2.23.0
16 | six==1.15.0
17 | soupsieve==2.0.1
18 | tqdm==4.46.0
19 | urllib3==1.26.5
20 | zope.interface==5.1.0
--------------------------------------------------------------------------------
/data/raw/CSVtoDF.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | class CSVtoDF(object):
5 |     """
6 |     Context manager that temporarily loads a .csv file into a dataframe.
7 |     On exit it drops all of the file's original columns, so only the
8 |     columns assigned inside the `with` block survive.
9 |     """
10 |
11 |     def __init__(self, file):
12 |         self.df = pd.read_csv(file)
13 |         self.columns = list(self.df.columns.values)
14 |
15 |     def __enter__(self):
16 |         return self.df
17 |
18 |     def __exit__(self, exc_type, exc_val, exc_tb):
19 |         # Drop every column that was present in the original file.
20 |         self.df.drop(self.columns, axis=1, inplace=True)
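21 |
22 |
23 | if __name__ == "__main__":
24 |     # Minimal usage sketch; 'books.csv' and its column names are
25 |     # hypothetical, not part of the project's data. Columns assigned
26 |     # inside the `with` block survive; the originals are dropped on exit.
27 |     with CSVtoDF('books.csv') as df:
28 |         df['title'] = df['original_title']
29 |     print(df.columns)  # only ['title'] remains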
--------------------------------------------------------------------------------
/data/raw/best_books_scraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | from bs4 import BeautifulSoup
4 |
5 |
6 | class BestBooks():
7 |
8 |     def __init__(self, pages):
9 |         self.pages = pages
10 |         # Fetch and parse each page of the "Best Books Ever" list.
11 |         self.soups = [BeautifulSoup(requests.get(f'https://www.goodreads.com/list/show/1.Best_Books_Ever?page={n}').text,
12 |                                     'html.parser') for n in range(1, self.pages + 1)]
13 |
14 |     def __str__(self):
15 |         return f"You just scraped {self.pages} pages from Goodreads.com!"
16 |
17 |     def store_html(self):
18 |         # If you want to store the scraped HTML as a list
19 |         return self.soups
20 |
21 |     def books_authors(self):
22 |         titles, authors = [], []
23 |         for page in self.soups:
24 |             for title, author in zip(page.find_all(class_="bookTitle"), page.find_all(class_="authorName")):
25 |                 titles.append(title.get_text())
26 |                 authors.append(author.get_text())
27 |         return [titles, authors]
28 |
29 |
30 | if __name__ == "__main__":
31 |     # Guarded so importing this module does not trigger a scrape.
32 |     s = BestBooks(1)
33 |     print(s.books_authors())
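34 |
35 |     # A follow-on sketch (pandas is an assumption here; this module does
36 |     # not otherwise use it): load the paired lists into a DataFrame.
37 |     import pandas as pd
38 |     titles, authors = s.books_authors()
39 |     print(pd.DataFrame({'title': titles, 'author': authors}).head())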
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Specific to the project
2 | *.[oa]
3 | *~
4 | venv
5 | .ipynb_checkpoints
6 | *.pkl
7 | *.csv
8 | goodreads_metadata.txt
9 | test.py
10 | practice.py
11 | *.exe
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 |
17 | # C extensions
18 | *.so
19 |
20 | # Distribution / packaging
21 | .Python
22 | env/
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | *.egg-info/
35 | .installed.cfg
36 | *.egg
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
--------------------------------------------------------------------------------
/data/raw/times_api_call.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Calling Times API\n",
8 | "\n",
9 | "We will extract the weekly best seller's list for the last 10 years from the Times API."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import pandas as pd\n",
20 | "\n",
21 | "from RawDataCollector import TimesExtractor"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "I use the manually defined class `TimesExtractor`, which will take `start_date`, `end_date`, and `frequencies` as an argument and will make calls to the Times Best Seller Books API based on the number of weeks within those dates range."
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "year = TimesExtractor(start_date='2011-05-01', end_date='2011-05-01', frq='W')\n",
38 | "dic = year.make_call()"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "Because the Times API has a limit of maximum 20 calls per minute with 3 seconds lapses between calls it takes time for the module to make all the given calls.\n",
46 | "\n",
47 | "* In our case it took 27 minutes to make 470 calls.\n",
48 | "\n",
49 | "-----------\n",
50 | "Now I will extract only the dictionaries containing the data about bestseller books.\n",
51 | "In order to do that I use personally defined function `converter` which will extract values based on given `keys`."
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "keys = ['weeks_on_list', 'primary_isbn10', 'primary_isbn13',\n",
61 | " 'publisher', 'description', 'title', 'author']"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "def converter(dic, keys):\n",
71 | " converted = {}\n",
72 | " for n in range(0, len(dic)):\n",
73 | " for m in range(0, len(dic[n])):\n",
74 | " for k in keys:\n",
75 | " converted.setdefault(k, []).append(dic[n][m][k])\n",
76 | " return converted"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "# Calling function with extracted data from the Times API and hand picked keys\n",
86 | "result = converter(dic, keys)"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Finally, I convert the dictionary into the `pandas` dataframe before saving it."
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "scrolled": true
101 | },
102 | "outputs": [],
103 | "source": [
104 | "df = pd.DataFrame.from_dict(result)\n",
105 | "df.head()"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "I also add the column which will indicate that the book is from Bestsellers list."
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "df['best_seller'] = 'yes'"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "Let's check our DataFrame:"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "print('dimensions: ', df.shape)\n",
138 | "print()\n",
139 | "df.head()"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "We have extracted 8555 rows among seven columns from the Times API containing books that have been on their best-sellers list.\n",
147 | "\n",
148 | "Final step: I save the dataframe to the pickle file for more convenient transferring into the notebook where I will perform additional work on the dataframe."
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "# df.to_pickle('bestsellers_raw.pkl')\n",
158 | "df.to_csv('bestsellers_raw.csv', index=False)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": []
167 | }
168 | ],
169 | "metadata": {
170 | "kernelspec": {
171 | "display_name": "Python 3",
172 | "language": "python",
173 | "name": "python3"
174 | },
175 | "language_info": {
176 | "codemirror_mode": {
177 | "name": "ipython",
178 | "version": 3
179 | },
180 | "file_extension": ".py",
181 | "mimetype": "text/x-python",
182 | "name": "python",
183 | "nbconvert_exporter": "python",
184 | "pygments_lexer": "ipython3",
185 | "version": "3.7.7"
186 | }
187 | },
188 | "nbformat": 4,
189 | "nbformat_minor": 4
190 | }
191 |
--------------------------------------------------------------------------------
/data/preprocessing/bestsellers_cleaning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import re\n",
12 | "import sys"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "I define the path to the **raw** folder to use `CSVtoDF` module, which is located there."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "sys.path.insert(0, \"C:/Users/Hellrox/Desktop/Projects/Best-Seller-Books/data/raw\")\n",
29 | "from CSVtoDF import CSVtoDF"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "I load everything except _cover picture url_ column."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "with CSVtoDF(\"C:/Users/Hellrox/Desktop/Projects/Best-Seller-Books/data/raw/bestsellers_merged.csv\") as df:\n",
46 | " df['title'] = df['title1']\n",
47 | " df['author'] = df['author1']\n",
48 | " df['pages_number'] = df['pages']\n",
49 | " df['format'] = df['edition']\n",
50 | " df['genre'] = df['genres']\n",
51 | " df['score'] = df['rating']\n",
52 | " df['total_count'] = df['count']\n",
53 | "\n",
54 | "df.head()"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "# Cleaning the Dataframe\n",
62 | "\n",
63 | "I start by cleaning the genres column, which still is a text from a CSS format I scraped from Goodreads.com\n",
64 | "\n",
65 | "I start by defining function `genre_to_list` which will take `pandas` series and `regex` pattern as an argument and return list of extracted strings."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "def genre_to_list(series, pattern):\n",
75 | " return [re.findall(pattern, txt) for txt in series]\n",
76 | "\n",
77 | "genr = genre_to_list(df['genre'], r'>(\\w+)<')"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "After that I iterate through the list and join them to the original dataframe."
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "for g, i in zip(genr, range(0, len(genr))):\n",
94 | " df['genre'][i] = ', '.join(g)\n",
95 | " \n",
96 | "df.head()"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "Next comes the `pages_number` column. I strip only the digit values and convert them to the _float_."
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "df[['pages', 'drop']] = df['pages_number'].str.split(expand=True)\n",
113 | "df['pages'] = df['pages'].astype(float)\n",
114 | "df.drop(['drop', 'pages_number'], axis=1, inplace=True)\n",
115 | "df.head()"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "I strip numerical value from the `total_count`, remove commas and covnert to float value."
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "df[['count', 'drop']] = df['total_count'].str.strip().str.split('\\\\n', expand=True)\n",
132 | "df['count'] = df['count'].str.replace(',', '').astype(float)\n",
133 | "df.drop(['drop', 'total_count'], axis=1, inplace=True)\n",
134 | "df = df[['title', 'author', 'format', 'pages', 'score', 'count']]\n",
135 | "df.head()"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "I convert the `title` and `author` values into title to capitalize first letters of each word. To make for pleasently readable"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "df['title'] = df['title'].str.title()\n",
152 | "df['author'] = df['author'].str.title()\n",
153 | "df.head()"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "Let's take a look and clean version of previosu dataframe."
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "df.info()"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "Finally, I save it to the new csv file."
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "df.to_csv('bestsellers.csv')"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": []
194 | }
195 | ],
196 | "metadata": {
197 | "kernelspec": {
198 | "display_name": "Python 3",
199 | "language": "python",
200 | "name": "python3"
201 | },
202 | "language_info": {
203 | "codemirror_mode": {
204 | "name": "ipython",
205 | "version": 3
206 | },
207 | "file_extension": ".py",
208 | "mimetype": "text/x-python",
209 | "name": "python",
210 | "nbconvert_exporter": "python",
211 | "pygments_lexer": "ipython3",
212 | "version": "3.7.7"
213 | }
214 | },
215 | "nbformat": 4,
216 | "nbformat_minor": 4
217 | }
218 |
--------------------------------------------------------------------------------
/data/raw/RawDataCollector.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import requests
4 | import time
5 | import re
6 |
7 | from tqdm import tqdm
8 | from bs4 import BeautifulSoup
12 |
13 |
14 | class GoodReadsScraper():
15 | """ This module scraps the information from the Goodreads webpage.
16 | You provide a list of ISBN's as the argument and then magic happens.
17 |
18 | attributes obtained: author, title, number of pages, edition, cover_url, genres
19 | """
20 |
21 | def __init__(self, isbns):
22 | self.isbns = isbns
23 | # Iterates over the list of ISBNs
24 | # Gets HTMLs and stores as a list
25 | self.soups = [BeautifulSoup(requests.get(f'https://www.goodreads.com/book/isbn/{i}').text, 'html.parser') for i in self.isbns]
26 |
27 | def __str__(self):
28 | return f"You just scrapped {len(self.isbns)} books from the Goodreads.com!"
29 |
30 | def store_html(self):
31 | # If you want to store scraped HTML as a list
32 | return self.soups
33 |
34 | def _goodreads_scraping(self):
35 | """ This method extracts the number of pages,
36 | book's edition from accrued HTML texts.
37 |
38 | """
39 | isbn_13, pages, edition, released = [], [], [], []
40 | for s, i in zip(self.soups, self.isbns):
41 | isbn_13.append(i)
42 | pages.append(getattr(s.find(itemprop="numberOfPages"), 'text', None))
43 | released.append(getattr(s.select_one('nobr'), 'text', None))
44 | edition.append(getattr(s.find(itemprop="bookFormat"), 'text', None))
45 | return isbn_13, pages, released, edition
46 |
47 | def data_converter(self):
48 |         # Converts the previously extracted lists into a dataframe.
49 | i, p, r, e = self._goodreads_scraping()
50 | return pd.DataFrame({'isbn13': i, 'pages': p, 'released': r, 'edition': e})
51 |
52 | def _meta_scraping(self):
53 | """ This method extracts the metadata of the book.
54 |         That is, the book's title, author's full name, and top genres.
55 |
56 | """
57 | isbn_13, author, title, genre = [], [], [], []
58 | for s, i in zip(self.soups, self.isbns):
59 | isbn_13.append(i)
60 | author.append(getattr(s.find(class_='authorName'), 'text', None))
61 | title.append(getattr(s.find(class_="gr-h1 gr-h1--serif"), 'text', None))
62 | genre.append(s.find_all(class_='actionLinkLite bookPageGenreLink')[:3])
63 | return isbn_13, author, title, genre
64 |
65 | def meta_converter(self):
66 |         # Converts the previously extracted lists into a dataframe.
67 | i, a, t, g = self._meta_scraping()
68 | return pd.DataFrame({'isbn13': i, 'author': a, 'title': t, 'genres': g})
69 |
70 | def _pop_scraping(self):
71 | """
72 |         This method extracts a book's rating and ratings count.
73 |
74 | """
75 | isbn_13, rating, count = [], [], []
76 | for s, i in zip(self.soups, self.isbns):
77 | isbn_13.append(i)
78 | rating.append(getattr(s.find(itemprop="ratingValue"), 'text', None))
79 | count.append(getattr(s.find(itemprop="ratingCount"), 'text', None))
80 | return isbn_13, rating, count
81 |
82 | def pop_converter(self):
83 |         # Converts the previously extracted lists into a dataframe.
84 | i, r, c = self._pop_scraping()
85 | return pd.DataFrame({'isbn13': i, 'rating': r, 'count': c})
86 |
87 | def _cover_scraper(self):
88 | """ This method grabs cover picture URL.
89 |
90 |         This portion of the page is sometimes missing,
91 |         so it's handled with a try-except block.
92 | """
93 | isbn_13, cover_url = [], []
94 | for s, i in zip(self.soups, self.isbns):
95 | isbn_13.append(i)
96 | try:
97 | cover_url.append(s.find(id='coverImage')['src'])
98 | except TypeError:
99 | cover_url.append(np.NaN)
100 | return isbn_13, cover_url
101 |
102 | def cover_url_converter(self):
103 |         # Converts the previously extracted lists into a dataframe.
104 | i, cu = self._cover_scraper()
105 | return pd.DataFrame({'isbn13': i, 'cover_url': cu})
106 |
107 |     def description(self):
108 |         """ Extracts the description text in raw format.
109 |         Pages with no description produce a NaN entry.
110 |         """
111 |         isbn_13, descr = [], []
112 |         for s, i in zip(self.soups, self.isbns):
113 |             isbn_13.append(i)
114 |             try:
115 |                 descr.append(s.find(id='description').text)
116 |             except AttributeError:
117 |                 descr.append(np.NaN)
118 |         return pd.DataFrame({'isbn13': isbn_13, 'description': descr})
119 |
120 |
121 | class TimesExtractor:
122 | """ The purpose of the module is to extract books data from the Times API.
123 |
124 | Times has a limit on how many requests you can send, so we have to do
125 | with time lapses and then combine all the collected data into one list.
126 |
127 | """
128 | # Default api-key which can be changed.
129 | api_key = 'djDLXwAoSfreMrzYGE5iacl7GUifIRrV'
130 | # Default lapse between calls
131 | lapse = 3
132 |
133 | def __init__(self, start_date, end_date, frq):
134 | """ Please enter dates in 'yyyy-mm-dd' format.
135 |
136 | For frq -> 'D'=day; 'W'=week; 'M'=month; 'Y'=year.
137 | """
138 | self.start_date = start_date
139 | self.end_date = end_date
140 | self.frq = frq
141 | # Raise Error if entered value doesn't match the pattern.
142 | if not re.search(r"^\d{4}-\d{2}-\d{2}$", self.start_date):
143 | raise ValueError('please enter start_date in \'yyyy-mm-dd\' format')
144 | elif not re.search(r"^\d{4}-\d{2}-\d{2}$", self.end_date):
145 | raise ValueError('please enter end_date in \'yyyy-mm-dd\' format')
146 | elif self.frq not in ('D', 'W', 'M', 'Y'):
147 | raise ValueError('please enter frequency in correct format')
150 |
151 | def __str__(self):
152 | return f"The bestellers list is from {self.start_date} to {self.end_date}"
153 |
154 | def _datesrange(self):
155 | if self.start_date >= self.end_date:
156 | return [self.end_date]
157 | else:
158 | dates = [d for d in pd.date_range(start=self.start_date,
159 | end=self.end_date,
160 | freq=self.frq).strftime('%Y-%m-%d')]
161 | return dates
162 |
163 | @classmethod
164 | def key(cls, key):
165 | cls.api_key = key
166 |
167 | @classmethod
168 | def seconds(cls, second):
169 | cls.lapse = second
170 |
171 | def make_call(self):
172 | """ This function iterates through the dates and sends the request
173 | to the Times API for the given set of dates.
174 |
175 | Next, it returns all the dictionaries as a combined list.
176 | """
177 | super_list = []
178 | print(f'Due to API\'s limitation, there is a {self.lapse} second lapse between calls')
179 |         for date in tqdm(self._datesrange()):
180 |             # The Times API has a daily call limit and requires
181 |             # intervals between calls, so we sleep before each request.
182 |             time.sleep(self.lapse)
187 | print(date)
188 | res = requests.get(f"https://api.nytimes.com/svc/books/v3/lists/{date}/combined-print-and-e-book-fiction.json?",
189 | params = {'api-key': self.api_key}).json()
190 | super_list.append(res['results']['books'])
191 | return super_list
192 |
193 |
194 | if __name__ == "__main__":
195 |     print('This module is intended to be used within a Jupyter notebook...')
196 |
197 |
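198 | # A minimal usage sketch (assumes network access and a valid NYT API key;
199 | # the date range and ISBN below are illustrative):
200 | #
201 | #   extractor = TimesExtractor(start_date='2011-01-02', end_date='2011-03-01', frq='W')
202 | #   weekly_lists = extractor.make_call()
203 | #
204 | #   scraper = GoodReadsScraper(['9780307588388'])
205 | #   books_df = scraper.data_converter().merge(scraper.pop_converter(), on='isbn13')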
--------------------------------------------------------------------------------
/expl/bestsellers_EDA.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "import seaborn as sns"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 23,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/html": [
23 | "
\n",
24 | "\n",
37 | "
\n",
38 | " \n",
39 | " \n",
40 | " | \n",
41 | " title | \n",
42 | " author | \n",
43 | " format | \n",
44 | " pages | \n",
45 | " score | \n",
46 | " count | \n",
47 | "
\n",
48 | " \n",
49 | " \n",
50 | " \n",
51 | " | 0 | \n",
52 | " Water For Elephants | \n",
53 | " Sara Gruen | \n",
54 | " ebook | \n",
55 | " 297.0 | \n",
56 | " 4.09 | \n",
57 | " 1340594.0 | \n",
58 | "
\n",
59 | " \n",
60 | " | 1 | \n",
61 | " Chasing Fire | \n",
62 | " Nora Roberts | \n",
63 | " ebook | \n",
64 | " 480.0 | \n",
65 | " 4.12 | \n",
66 | " 34591.0 | \n",
67 | "
\n",
68 | " \n",
69 | " | 2 | \n",
70 | " The Lincoln Lawyer | \n",
71 | " Michael Connelly | \n",
72 | " ebook | \n",
73 | " 528.0 | \n",
74 | " 4.16 | \n",
75 | " 186048.0 | \n",
76 | "
\n",
77 | " \n",
78 | " | 3 | \n",
79 | " The Fifth Witness | \n",
80 | " Michael Connelly | \n",
81 | " ebook | \n",
82 | " 448.0 | \n",
83 | " 4.19 | \n",
84 | " 54599.0 | \n",
85 | "
\n",
86 | " \n",
87 | " | 4 | \n",
88 | " The Help | \n",
89 | " Kathryn Stockett | \n",
90 | " Paperback | \n",
91 | " 544.0 | \n",
92 | " 4.47 | \n",
93 | " 2079528.0 | \n",
94 | "
\n",
95 | " \n",
96 | "
\n",
97 | "
"
98 | ],
99 | "text/plain": [
100 | " title author format pages score count\n",
101 | "0 Water For Elephants Sara Gruen ebook 297.0 4.09 1340594.0\n",
102 | "1 Chasing Fire Nora Roberts ebook 480.0 4.12 34591.0\n",
103 | "2 The Lincoln Lawyer Michael Connelly ebook 528.0 4.16 186048.0\n",
104 | "3 The Fifth Witness Michael Connelly ebook 448.0 4.19 54599.0\n",
105 | "4 The Help Kathryn Stockett Paperback 544.0 4.47 2079528.0"
106 | ]
107 | },
108 | "execution_count": 23,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "df = pd.read_csv('C:/Users/Hellrox/Desktop/Projects/Best-Seller-Books/data/preprocessing/bestsellers.csv',\n",
115 | " index_col=[0])\n",
116 | "df.head()"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {},
122 | "source": [
123 | "Graphs"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 24,
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "data": {
133 | "text/html": [
134 | "\n",
135 | "\n",
148 | "
\n",
149 | " \n",
150 | " \n",
151 | " | \n",
152 | " count | \n",
153 | " pages | \n",
154 | " score | \n",
155 | "
\n",
156 | " \n",
157 | " | format | \n",
158 | " | \n",
159 | " | \n",
160 | " | \n",
161 | "
\n",
162 | " \n",
163 | " \n",
164 | " \n",
165 | " | Audiobook | \n",
166 | " 557956.500000 | \n",
167 | " 324.000000 | \n",
168 | " 4.195000 | \n",
169 | "
\n",
170 | " \n",
171 | " | Edición Kindle | \n",
172 | " 53068.600000 | \n",
173 | " 332.400000 | \n",
174 | " 4.120000 | \n",
175 | "
\n",
176 | " \n",
177 | " | Hardcover | \n",
178 | " 65059.028200 | \n",
179 | " 407.181223 | \n",
180 | " 3.964403 | \n",
181 | "
\n",
182 | " \n",
183 | " | Kindle Edition | \n",
184 | " 6363.500000 | \n",
185 | " 475.636364 | \n",
186 | " 4.075000 | \n",
187 | "
\n",
188 | " \n",
189 | " | Mass Market Paperback | \n",
190 | " 115290.131148 | \n",
191 | " 453.383333 | \n",
192 | " 4.046393 | \n",
193 | "
\n",
194 | " \n",
195 | " | Nook | \n",
196 | " 27151.636364 | \n",
197 | " 258.500000 | \n",
198 | " 3.927273 | \n",
199 | "
\n",
200 | " \n",
201 | " | Paperback | \n",
202 | " 295383.494382 | \n",
203 | " 426.359551 | \n",
204 | " 3.994888 | \n",
205 | "
\n",
206 | " \n",
207 | " | Trade Paperback | \n",
208 | " 630470.000000 | \n",
209 | " 254.000000 | \n",
210 | " 3.970000 | \n",
211 | "
\n",
212 | " \n",
213 | " | Unknown Binding | \n",
214 | " 4521.578947 | \n",
215 | " NaN | \n",
216 | " 4.032632 | \n",
217 | "
\n",
218 | " \n",
219 | " | eBook | \n",
220 | " 151774.000000 | \n",
221 | " 432.000000 | \n",
222 | " 4.020000 | \n",
223 | "
\n",
224 | " \n",
225 | " | eBook Kindle | \n",
226 | " 29939.600000 | \n",
227 | " 430.800000 | \n",
228 | " 4.182000 | \n",
229 | "
\n",
230 | " \n",
231 | " | ebook | \n",
232 | " 53692.752699 | \n",
233 | " 384.626810 | \n",
234 | " 4.017154 | \n",
235 | "
\n",
236 | " \n",
237 | " | mp3 Audiobook | \n",
238 | " 90080.000000 | \n",
239 | " NaN | \n",
240 | " 3.820000 | \n",
241 | "
\n",
242 | " \n",
243 | "
\n",
244 | "
"
245 | ],
246 | "text/plain": [
247 | " count pages score\n",
248 | "format \n",
249 | "Audiobook 557956.500000 324.000000 4.195000\n",
250 | "Edición Kindle 53068.600000 332.400000 4.120000\n",
251 | "Hardcover 65059.028200 407.181223 3.964403\n",
252 | "Kindle Edition 6363.500000 475.636364 4.075000\n",
253 | "Mass Market Paperback 115290.131148 453.383333 4.046393\n",
254 | "Nook 27151.636364 258.500000 3.927273\n",
255 | "Paperback 295383.494382 426.359551 3.994888\n",
256 | "Trade Paperback 630470.000000 254.000000 3.970000\n",
257 | "Unknown Binding 4521.578947 NaN 4.032632\n",
258 | "eBook 151774.000000 432.000000 4.020000\n",
259 | "eBook Kindle 29939.600000 430.800000 4.182000\n",
260 | "ebook 53692.752699 384.626810 4.017154\n",
261 | "mp3 Audiobook 90080.000000 NaN 3.820000"
262 | ]
263 | },
264 | "execution_count": 24,
265 | "metadata": {},
266 | "output_type": "execute_result"
267 | }
268 | ],
269 | "source": [
270 | "pd.pivot_table(df, index='format', aggfunc=np.mean)"
271 | ]
272 | },
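273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "A minimal plotting sketch (an illustrative addition; it uses only `matplotlib`, which is already imported above):"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "# Distribution of average Goodreads scores among the bestsellers\n",
287 | "plt.hist(df['score'].dropna(), bins=20)\n",
288 | "plt.xlabel('Goodreads score')\n",
289 | "plt.ylabel('Number of books')\n",
290 | "plt.show()"
291 | ]
292 | },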
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": []
279 | }
280 | ],
281 | "metadata": {
282 | "kernelspec": {
283 | "display_name": "Python 3",
284 | "language": "python",
285 | "name": "python3"
286 | },
287 | "language_info": {
288 | "codemirror_mode": {
289 | "name": "ipython",
290 | "version": 3
291 | },
292 | "file_extension": ".py",
293 | "mimetype": "text/x-python",
294 | "name": "python",
295 | "nbconvert_exporter": "python",
296 | "pygments_lexer": "ipython3",
297 | "version": "3.7.7"
298 | }
299 | },
300 | "nbformat": 4,
301 | "nbformat_minor": 4
302 | }
303 |
--------------------------------------------------------------------------------
/data/raw/bestsellers_generating.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Generating ISBNs and Metadata"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import re\n",
17 | "import pandas as pd\n",
18 | "import numpy as np\n",
19 | "\n",
20 | "from CSVtoDF import CSVtoDF"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "I will use personally defined module `CSVtoDF`, which will temporary open csv file with with statement and only load manually picked columns and drop the rest and close the file."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/html": [
38 | "\n",
39 | "\n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " | \n",
56 | " isbn13 | \n",
57 | " title1 | \n",
58 | " author1 | \n",
59 | " weeks | \n",
60 | "
\n",
61 | " \n",
62 | " \n",
63 | " \n",
64 | " | 0 | \n",
65 | " 9781616200817 | \n",
66 | " water for elephants | \n",
67 | " sara gruen | \n",
68 | " 12 | \n",
69 | "
\n",
70 | " \n",
71 | " | 1 | \n",
72 | " 9781101513781 | \n",
73 | " chasing fire | \n",
74 | " nora roberts | \n",
75 | " 1 | \n",
76 | "
\n",
77 | " \n",
78 | " | 2 | \n",
79 | " 9780759514713 | \n",
80 | " the lincoln lawyer | \n",
81 | " michael connelly | \n",
82 | " 6 | \n",
83 | "
\n",
84 | " \n",
85 | " | 3 | \n",
86 | " 9780316069380 | \n",
87 | " the fifth witness | \n",
88 | " michael connelly | \n",
89 | " 2 | \n",
90 | "
\n",
91 | " \n",
92 | " | 4 | \n",
93 | " 9780425232200 | \n",
94 | " the help | \n",
95 | " kathryn stockett | \n",
96 | " 9 | \n",
97 | "
\n",
98 | " \n",
99 | "
\n",
100 | "
"
101 | ],
102 | "text/plain": [
103 | " isbn13 title1 author1 weeks\n",
104 | "0 9781616200817 water for elephants sara gruen 12\n",
105 | "1 9781101513781 chasing fire nora roberts 1\n",
106 | "2 9780759514713 the lincoln lawyer michael connelly 6\n",
107 | "3 9780316069380 the fifth witness michael connelly 2\n",
108 | "4 9780425232200 the help kathryn stockett 9"
109 | ]
110 | },
111 | "execution_count": 2,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "with CSVtoDF('bestsellers_raw.csv') as df:\n",
118 | " df['isbn13'] = df['primary_isbn13']\n",
119 | " df['title1'] = df['title'].str.lower()\n",
120 | " df['author1'] = df['author'].str.lower()\n",
121 | " df['weeks'] = df['weeks_on_list']\n",
122 | "df.head()"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "Actually we don't need ISBN10 as ISBN13 can better do the job.\n",
130 | "\n",
131 | "We drop the `isbn` column and keep the remaining 3."
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 3,
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "\n",
144 | "\n",
145 | "RangeIndex: 8555 entries, 0 to 8554\n",
146 | "Data columns (total 4 columns):\n",
147 | " # Column Non-Null Count Dtype \n",
148 | "--- ------ -------------- ----- \n",
149 | " 0 isbn13 8552 non-null object\n",
150 | " 1 title1 8555 non-null object\n",
151 | " 2 author1 8555 non-null object\n",
152 | " 3 weeks 8555 non-null int64 \n",
153 | "dtypes: int64(1), object(3)\n",
154 | "memory usage: 267.5+ KB\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "df = df[['isbn13', 'title1', 'author1', 'weeks']]\n",
160 | "print()\n",
161 | "df.info()"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "We see that there are some _Null_ values in our dataframe.\n",
169 | "But at this moment we are only interested with ISBN13 column, as the values found in that column will be used as a indexes to connect to the goodreads webpage for scraping."
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 4,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "data": {
179 | "text/html": [
180 | "\n",
181 | "\n",
194 | "
\n",
195 | " \n",
196 | " \n",
197 | " | \n",
198 | " isbn13 | \n",
199 | " title1 | \n",
200 | " author1 | \n",
201 | " weeks | \n",
202 | "
\n",
203 | " \n",
204 | " \n",
205 | " \n",
206 | " | 143 | \n",
207 | " NaN | \n",
208 | " summer secrets | \n",
209 | " barbara freethy | \n",
210 | " 1 | \n",
211 | "
\n",
212 | " \n",
213 | " | 169 | \n",
214 | " NaN | \n",
215 | " summer secrets | \n",
216 | " barbara freethy | \n",
217 | " 2 | \n",
218 | "
\n",
219 | " \n",
220 | " | 196 | \n",
221 | " NaN | \n",
222 | " summer secrets | \n",
223 | " barbara freethy | \n",
224 | " 0 | \n",
225 | "
\n",
226 | " \n",
227 | "
\n",
228 | "
"
229 | ],
230 | "text/plain": [
231 | " isbn13 title1 author1 weeks\n",
232 | "143 NaN summer secrets barbara freethy 1\n",
233 | "169 NaN summer secrets barbara freethy 2\n",
234 | "196 NaN summer secrets barbara freethy 0"
235 | ]
236 | },
237 | "execution_count": 4,
238 | "metadata": {},
239 | "output_type": "execute_result"
240 | }
241 | ],
242 | "source": [
243 | "df1 = df[df['isbn13'].isna()]\n",
244 | "df1"
245 | ]
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {},
250 | "source": [
251 | "Fortunatelly, Summer Secrets by Barbara Freethy is the only book which has no ISBN13 information.\n",
252 | "I will simply replace the empty value with ISBN number."
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 5,
258 | "metadata": {},
259 | "outputs": [
260 | {
261 | "name": "stdout",
262 | "output_type": "stream",
263 | "text": [
264 | "\n",
265 | "(8555, 4)\n",
266 | "\n",
267 | "RangeIndex: 8555 entries, 0 to 8554\n",
268 | "Data columns (total 4 columns):\n",
269 | " # Column Non-Null Count Dtype \n",
270 | "--- ------ -------------- ----- \n",
271 | " 0 isbn13 8555 non-null object\n",
272 | " 1 title1 8555 non-null object\n",
273 | " 2 author1 8555 non-null object\n",
274 | " 3 weeks 8555 non-null int64 \n",
275 | "dtypes: int64(1), object(3)\n",
276 | "memory usage: 267.5+ KB\n"
277 | ]
278 | }
279 | ],
280 | "source": [
281 | "df['isbn13'].replace(np.NaN, 'B003K15AKQ', inplace=True)\n",
282 | "print()\n",
283 | "print(df.shape)\n",
284 | "df.info()"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "Next we have to see at the duplicate values in our dataframe.\n",
292 | "\n",
293 | "First I combine Author's name, Title and ISBN13 number into one column. This way we will check for absolute duplicates (books that are exactly same editions, otherwise even if the same book is published with different cover or as revised version its ISBN will be changed)."
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 6,
299 | "metadata": {},
300 | "outputs": [
301 | {
302 | "name": "stdout",
303 | "output_type": "stream",
304 | "text": [
305 | "\n"
306 | ]
307 | },
308 | {
309 | "data": {
310 | "text/plain": [
311 | "0 sara gruen water for elephants 9781616200817\n",
312 | "1 nora roberts chasing fire 9781101513781\n",
313 | "2 michael connelly the lincoln lawyer 9780759514713\n",
314 | "3 michael connelly the fifth witness 9780316069380\n",
315 | "4 kathryn stockett the help 9780425232200\n",
316 | "Name: author_title_isbn13, dtype: object"
317 | ]
318 | },
319 | "execution_count": 6,
320 | "metadata": {},
321 | "output_type": "execute_result"
322 | }
323 | ],
324 | "source": [
325 | "df['author_title_isbn13'] = df['author1'] + ' ' + df['title1'] + ' ' + df['isbn13']\n",
326 | "print()\n",
327 | "df['author_title_isbn13'].head()"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 7,
333 | "metadata": {
334 | "scrolled": true
335 | },
336 | "outputs": [
337 | {
338 | "name": "stdout",
339 | "output_type": "stream",
340 | "text": [
341 | "\n",
342 | "author_title_isbn13\n",
343 | "gillian flynn gone girl 9780307588388 112\n",
344 | "george rr martin a game of thrones 9780553897845 77\n",
345 | "kristin hannah the nightingale 9781466850606 74\n",
346 | "e l james fifty shades of grey 9781612130293 71\n",
347 | "e l james fifty shades darker 9781612130590 70\n",
348 | " ... \n",
349 | "kristen ashley walk through fire 9781455533244 1\n",
350 | "kristen ashley the will A00B00HYIF9FW 1\n",
351 | "kristen ashley the slow burn A00B07P1HZQDH 1\n",
352 | "kristen ashley the promise A00B00JXW6GFE 1\n",
353 | "jo nesbo the son 9780385351386 1\n",
354 | "Length: 3084, dtype: int64\n"
355 | ]
356 | }
357 | ],
358 | "source": [
359 | "dups = df.pivot_table(index=['author_title_isbn13'], aggfunc='size')\n",
360 | "print()\n",
361 | "print(dups.sort_values(ascending=False))"
362 | ]
363 | },
364 | {
365 | "cell_type": "markdown",
366 | "metadata": {},
367 | "source": [
368 | "Now we can observe that Gone Girl and GOT and some other books appear several times in out df. That is because as mentioned earlier some books have been on the bestseller list for tens of weeks and their data came along with each week they have been featured.\n",
369 | "\n",
370 | "We will drop those duplicates using `pandas` `drop_duplicates`."
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 8,
376 | "metadata": {},
377 | "outputs": [
378 | {
379 | "data": {
380 | "text/plain": [
381 | "(3084, 3)"
382 | ]
383 | },
384 | "execution_count": 8,
385 | "metadata": {},
386 | "output_type": "execute_result"
387 | }
388 | ],
389 | "source": [
390 | "df.drop_duplicates('author_title_isbn13', keep='last', ignore_index=True, inplace=True)\n",
391 | "df = df[['title1', 'author1', 'isbn13']]\n",
392 | "df.shape"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "metadata": {},
398 | "source": [
399 | "We also observed in a previous cell that some books have ASIN code instead of ISBN, this can be issue as Goodreads can't identify books based on ASIN. So I'll filter them out as well."
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 9,
405 | "metadata": {},
406 | "outputs": [
407 | {
408 | "name": "stdout",
409 | "output_type": "stream",
410 | "text": [
411 | "149\n",
412 | "2935\n"
413 | ]
414 | }
415 | ],
416 | "source": [
417 | "aisbn = []\n",
418 | "i13 = []\n",
419 | "\n",
420 | "for i in list(df['isbn13']):\n",
421 | " if re.search(r'^[\\dB]+', i):\n",
422 | " i13.append(i)\n",
423 | " else:\n",
424 | " aisbn.append(i)\n",
425 | " \n",
426 | "print(len(aisbn))\n",
427 | "print(len(i13))"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 10,
433 | "metadata": {},
434 | "outputs": [
435 | {
436 | "data": {
437 | "text/plain": [
438 | "(2935, 3)"
439 | ]
440 | },
441 | "execution_count": 10,
442 | "metadata": {},
443 | "output_type": "execute_result"
444 | }
445 | ],
446 | "source": [
447 | "df_final = df[df['isbn13'].str.contains(r'^[\\dB]+')]\n",
448 | "df_final.shape"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "We are down to wooping 2935 entries, but this are original titles which can be used for further analysis.\n",
456 | "\n",
457 | "We see that there are no more `NaN` values in `isbn13` column and no more duplicates, thus we can proceed and extract it as a list for web scraping."
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": null,
463 | "metadata": {},
464 | "outputs": [],
465 | "source": []
466 | }
467 | ],
468 | "metadata": {
469 | "kernelspec": {
470 | "display_name": "Python 3",
471 | "language": "python",
472 | "name": "python3"
473 | },
474 | "language_info": {
475 | "codemirror_mode": {
476 | "name": "ipython",
477 | "version": 3
478 | },
479 | "file_extension": ".py",
480 | "mimetype": "text/x-python",
481 | "name": "python",
482 | "nbconvert_exporter": "python",
483 | "pygments_lexer": "ipython3",
484 | "version": "3.7.7"
485 | }
486 | },
487 | "nbformat": 4,
488 | "nbformat_minor": 4
489 | }
490 |
--------------------------------------------------------------------------------
/data/raw/bestsellers_combining.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Scraping Data from Goodreads.com"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "\n",
19 | "from RawDataCollector import GoodReadsScraper"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "First I import another jupyter notebook to have list of ISBNs and metadata about books"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "name": "stdout",
36 | "output_type": "stream",
37 | "text": [
38 | "\n",
39 | "\n",
40 | "RangeIndex: 8555 entries, 0 to 8554\n",
41 | "Data columns (total 4 columns):\n",
42 | " # Column Non-Null Count Dtype \n",
43 | "--- ------ -------------- ----- \n",
44 | " 0 isbn13 8552 non-null object\n",
45 | " 1 title1 8555 non-null object\n",
46 | " 2 author1 8555 non-null object\n",
47 | " 3 weeks 8555 non-null int64 \n",
48 | "dtypes: int64(1), object(3)\n",
49 | "memory usage: 267.5+ KB\n",
50 | "\n",
51 | "(8555, 4)\n",
52 | "\n",
53 | "RangeIndex: 8555 entries, 0 to 8554\n",
54 | "Data columns (total 4 columns):\n",
55 | " # Column Non-Null Count Dtype \n",
56 | "--- ------ -------------- ----- \n",
57 | " 0 isbn13 8555 non-null object\n",
58 | " 1 title1 8555 non-null object\n",
59 | " 2 author1 8555 non-null object\n",
60 | " 3 weeks 8555 non-null int64 \n",
61 | "dtypes: int64(1), object(3)\n",
62 | "memory usage: 267.5+ KB\n",
63 | "\n",
64 | "\n",
65 | "author_title_isbn13\n",
66 | "gillian flynn gone girl 9780307588388 112\n",
67 | "george rr martin a game of thrones 9780553897845 77\n",
68 | "kristin hannah the nightingale 9781466850606 74\n",
69 | "e l james fifty shades of grey 9781612130293 71\n",
70 | "e l james fifty shades darker 9781612130590 70\n",
71 | " ... \n",
72 | "kristen ashley walk through fire 9781455533244 1\n",
73 | "kristen ashley the will A00B00HYIF9FW 1\n",
74 | "kristen ashley the slow burn A00B07P1HZQDH 1\n",
75 | "kristen ashley the promise A00B00JXW6GFE 1\n",
76 | "jo nesbo the son 9780385351386 1\n",
77 | "Length: 3084, dtype: int64\n",
78 | "149\n",
79 | "2935\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 | "# Importing df_final from other notebook\n",
85 | "%run \"..\\raw\\bestsellers_generating.ipynb\""
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "With available list of ISBNs and Dataframe with books metadata I start:\n",
93 | "### Web Scraping"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "First we check if _bestsellers_ dataframe with _ISBNs_ is defined and is not empty.\n",
101 | "Otherwise raise the `ValueError`."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 3,
107 | "metadata": {},
108 | "outputs": [
109 | {
110 | "name": "stdout",
111 | "output_type": "stream",
112 | "text": [
113 | "All cool!\n"
114 | ]
115 | }
116 | ],
117 | "source": [
118 | "if df_final is not None:\n",
119 | " df1 = df_final\n",
120 | " isbns = df_final['isbn13']\n",
121 | " print('All cool!')\n",
122 | "else:\n",
123 | " raise ValueError('variables not found...')"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "If everything is in order we proceed scraping additional data from the Goodreads.com"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 4,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "name": "stdout",
140 | "output_type": "stream",
141 | "text": [
142 | "You just scrapped 5 books from the Goodreads.com!\n"
143 | ]
144 | }
145 | ],
146 | "source": [
147 | "scrapped = GoodReadsScraper(isbns)\n",
148 | "html_list = scrapped.store_html()\n",
149 | "print(scrapped)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "I instantiate `GoodReadsScraper`, personal module, which takes isbn numbers and scraps data for each book and stores html text as a list.\n",
157 | "\n",
158 | "This will take some time as we are performing thousands of calls."
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "After scrapping is done we extract **number of pages, edition, cover picture url** and **genres** and convert them into `pandas` dataframe."
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 5,
171 | "metadata": {},
172 | "outputs": [
173 | {
174 | "name": "stdout",
175 | "output_type": "stream",
176 | "text": [
177 | "(5, 4)\n"
178 | ]
179 | }
180 | ],
181 | "source": [
182 | "df2 = scrapped.data_converter()\n",
183 | "print(df2.shape)"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 7,
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "name": "stdout",
193 | "output_type": "stream",
194 | "text": [
195 | "(5, 2)\n"
196 | ]
197 | }
198 | ],
199 | "source": [
200 | "df3 = scrapped.cover_url_converter()\n",
201 | "print(df3.shape)"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 8,
207 | "metadata": {},
208 | "outputs": [
209 | {
210 | "name": "stdout",
211 | "output_type": "stream",
212 | "text": [
213 | "(5, 3)\n"
214 | ]
215 | }
216 | ],
217 | "source": [
218 | "df4 = scrapped.pop_converter()\n",
219 | "print(df4.shape)"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 9,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "data": {
229 | "text/html": [
230 | "\n",
231 | "\n",
244 | "
\n",
245 | " \n",
246 | " \n",
247 | " | \n",
248 | " isbn13 | \n",
249 | " description | \n",
250 | "
\n",
251 | " \n",
252 | " \n",
253 | " \n",
254 | " | 0 | \n",
255 | " 9780345541444 | \n",
256 | " \\nAt nearly one hundred years old, Thalia Mars... | \n",
257 | "
\n",
258 | " \n",
259 | "
\n",
260 | "
"
261 | ],
262 | "text/plain": [
263 | " isbn13 description\n",
264 | "0 9780345541444 \\nAt nearly one hundred years old, Thalia Mars..."
265 | ]
266 | },
267 | "execution_count": 9,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "df6 = scrapped.description()\n",
274 | "df6"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "After all neccessary methods are executed we have four different dataframes, which we will join on _ISBN13_ number and merge into one."
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 11,
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "data": {
291 | "text/html": [
292 | "\n",
293 | "\n",
306 | "
\n",
307 | " \n",
308 | " \n",
309 | " | \n",
310 | " title1 | \n",
311 | " author1 | \n",
312 | " isbn13 | \n",
313 | " pages | \n",
314 | " released | \n",
315 | " edition | \n",
316 | " cover_url | \n",
317 | " rating | \n",
318 | " count | \n",
319 | "
\n",
320 | " \n",
321 | " \n",
322 | " \n",
323 | " | 0 | \n",
324 | " never never | \n",
325 | " james patterson and candice fox | \n",
326 | " 9780316433174 | \n",
327 | " 363 pages | \n",
328 | " \\n (first published August 25th 2... | \n",
329 | " Hardcover | \n",
330 | " https://i.gr-assets.com/images/S/compressed.ph... | \n",
331 | " \\n 3.62\\n | \n",
332 | " \\n 13,917\\n ratings\\n | \n",
333 | "
\n",
334 | " \n",
335 | " | 1 | \n",
336 | " devil in spring | \n",
337 | " lisa kleypas | \n",
338 | " 9780062371904 | \n",
339 | " 384 pages | \n",
340 | " \\n —\\n 37 likes\\n | \n",
341 | " ebook | \n",
342 | " https://i.gr-assets.com/images/S/compressed.ph... | \n",
343 | " \\n 4.08\\n | \n",
344 | " \\n 22,036\\n ratings\\n | \n",
345 | "
\n",
346 | " \n",
347 | " | 2 | \n",
348 | " aftermath:: empire's end | \n",
349 | " chuck wendig | \n",
350 | " 9781101966969 | \n",
351 | " 423 pages | \n",
352 | " \\n —\\n 7 likes\\n | \n",
353 | " Hardcover | \n",
354 | " https://i.gr-assets.com/images/S/compressed.ph... | \n",
355 | " \\n 3.79\\n | \n",
356 | " \\n 9,247\\n ratings\\n | \n",
357 | "
\n",
358 | " \n",
359 | " | 3 | \n",
360 | " echoes in death | \n",
361 | " j d robb | \n",
362 | " 9781250123145 | \n",
363 | " 400 pages | \n",
364 | " \\n —\\n 12 likes\\n | \n",
365 | " ebook | \n",
366 | " https://i.gr-assets.com/images/S/compressed.ph... | \n",
367 | " \\n 4.41\\n | \n",
368 | " \\n 19,576\\n ratings\\n | \n",
369 | "
\n",
370 | " \n",
371 | " | 4 | \n",
372 | " heartbreak hotel | \n",
373 | " jonathan kellerman | \n",
374 | " 9780345541444 | \n",
375 | " 325 pages | \n",
376 | " \\n —\\n 0 likes\\n | \n",
377 | " Nook | \n",
378 | " https://i.gr-assets.com/images/S/compressed.ph... | \n",
379 | " \\n 3.84\\n | \n",
380 | " \\n 11,852\\n ratings\\n | \n",
381 | "
\n",
382 | " \n",
383 | "
\n",
384 | "
"
385 | ],
386 | "text/plain": [
387 | " title1 author1 isbn13 \\\n",
388 | "0 never never james patterson and candice fox 9780316433174 \n",
389 | "1 devil in spring lisa kleypas 9780062371904 \n",
390 | "2 aftermath:: empire's end chuck wendig 9781101966969 \n",
391 | "3 echoes in death j d robb 9781250123145 \n",
392 | "4 heartbreak hotel jonathan kellerman 9780345541444 \n",
393 | "\n",
394 | " pages released edition \\\n",
395 | "0 363 pages \\n (first published August 25th 2... Hardcover \n",
396 | "1 384 pages \\n —\\n 37 likes\\n ebook \n",
397 | "2 423 pages \\n —\\n 7 likes\\n Hardcover \n",
398 | "3 400 pages \\n —\\n 12 likes\\n ebook \n",
399 | "4 325 pages \\n —\\n 0 likes\\n Nook \n",
400 | "\n",
401 | " cover_url rating \\\n",
402 | "0 https://i.gr-assets.com/images/S/compressed.ph... \\n 3.62\\n \n",
403 | "1 https://i.gr-assets.com/images/S/compressed.ph... \\n 4.08\\n \n",
404 | "2 https://i.gr-assets.com/images/S/compressed.ph... \\n 3.79\\n \n",
405 | "3 https://i.gr-assets.com/images/S/compressed.ph... \\n 4.41\\n \n",
406 | "4 https://i.gr-assets.com/images/S/compressed.ph... \\n 3.84\\n \n",
407 | "\n",
408 | " count \n",
409 | "0 \\n 13,917\\n ratings\\n \n",
410 | "1 \\n 22,036\\n ratings\\n \n",
411 | "2 \\n 9,247\\n ratings\\n \n",
412 | "3 \\n 19,576\\n ratings\\n \n",
413 | "4 \\n 11,852\\n ratings\\n "
414 | ]
415 | },
416 | "execution_count": 11,
417 | "metadata": {},
418 | "output_type": "execute_result"
419 | }
420 | ],
421 | "source": [
422 | "result = df1.merge(df2, on='isbn13').merge(df3, on='isbn13').merge(df4, on='isbn13')\n",
423 | "result.tail()"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 12,
429 | "metadata": {},
430 | "outputs": [
431 | {
432 | "data": {
433 | "text/plain": [
434 | "(5, 9)"
435 | ]
436 | },
437 | "execution_count": 12,
438 | "metadata": {},
439 | "output_type": "execute_result"
440 | }
441 | ],
442 | "source": [
443 | "result.shape"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 13,
449 | "metadata": {},
450 | "outputs": [
451 | {
452 | "name": "stdout",
453 | "output_type": "stream",
454 | "text": [
455 | "\n",
456 | "Int64Index: 5 entries, 0 to 4\n",
457 | "Data columns (total 9 columns):\n",
458 | " # Column Non-Null Count Dtype \n",
459 | "--- ------ -------------- ----- \n",
460 | " 0 title1 5 non-null object\n",
461 | " 1 author1 5 non-null object\n",
462 | " 2 isbn13 5 non-null object\n",
463 | " 3 pages 5 non-null object\n",
464 | " 4 released 5 non-null object\n",
465 | " 5 edition 5 non-null object\n",
466 | " 6 cover_url 5 non-null object\n",
467 | " 7 rating 5 non-null object\n",
468 | " 8 count 5 non-null object\n",
469 | "dtypes: object(9)\n",
470 | "memory usage: 400.0+ bytes\n"
471 | ]
472 | }
473 | ],
474 | "source": [
475 | "result.info()"
476 | ]
477 | },
478 | {
479 | "cell_type": "markdown",
480 | "metadata": {},
481 | "source": [
482 | "I have some missing values but that's ok.\n",
483 | "\n",
484 | "Finally I save data as a pickle file."
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "# result.to_csv('bestsellers_merged.csv', index=False)"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": [
502 | "# result.to_pickle('complete_bestsellers.pkl')"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": null,
508 | "metadata": {},
509 | "outputs": [],
510 | "source": []
511 | }
512 | ],
513 | "metadata": {
514 | "kernelspec": {
515 | "display_name": "Python 3",
516 | "language": "python",
517 | "name": "python3"
518 | },
519 | "language_info": {
520 | "codemirror_mode": {
521 | "name": "ipython",
522 | "version": 3
523 | },
524 | "file_extension": ".py",
525 | "mimetype": "text/x-python",
526 | "name": "python",
527 | "nbconvert_exporter": "python",
528 | "pygments_lexer": "ipython3",
529 | "version": "3.7.7"
530 | }
531 | },
532 | "nbformat": 4,
533 | "nbformat_minor": 4
534 | }
535 |
--------------------------------------------------------------------------------