├── .DS_Store
├── NY_restaurant_data
├── .DS_Store
├── .ipynb_checkpoints
│ ├── Untitled-checkpoint.ipynb
│ └── ny_restaurant_data-checkpoint.ipynb
├── gps_coords.pkl
├── ny_restaurant_data.ipynb
└── nyc_map.png
├── audible_eda
├── .ipynb_checkpoints
│ ├── audible_data_analysis-checkpoint.ipynb
│ ├── audible_prices-checkpoint.ipynb
│ ├── audible_review_classifier-checkpoint.ipynb
│ └── audible_reviews_scraper-checkpoint.ipynb
├── audible_data_analysis.ipynb
├── audible_prices.ipynb
├── audible_review_classifier.ipynb
├── audible_reviews_scraper.ipynb
└── audible_scraper.ipynb
├── county_politics_analysis.ipynb
├── mo health data.ipynb
└── oct_classification.ipynb
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobymanders/data-projects/249885ae53feb4bb65c0a42a30b8c448705bb0a5/.DS_Store
--------------------------------------------------------------------------------
/NY_restaurant_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobymanders/data-projects/249885ae53feb4bb65c0a42a30b8c448705bb0a5/NY_restaurant_data/.DS_Store
--------------------------------------------------------------------------------
/NY_restaurant_data/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/NY_restaurant_data/gps_coords.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobymanders/data-projects/249885ae53feb4bb65c0a42a30b8c448705bb0a5/NY_restaurant_data/gps_coords.pkl
--------------------------------------------------------------------------------
/NY_restaurant_data/nyc_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobymanders/data-projects/249885ae53feb4bb65c0a42a30b8c448705bb0a5/NY_restaurant_data/nyc_map.png
--------------------------------------------------------------------------------
/audible_eda/.ipynb_checkpoints/audible_prices-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from requests_html import HTMLSession, HTML\n",
10 | "import numpy as np\n",
11 | "import pandas as pd\n",
12 | "from datetime import datetime"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "url = 'https://www.audible.com/search?pf_rd_p=1d79b443-2f1d-43a3-b1dc-31a2cd242566&pf_rd_r=HK8P1MY097JB8VJ6PRTQ&ref=a_search_c4_pageSize_3&keywords=the+great+courses&pageSize=50'"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 3,
27 | "metadata": {
28 | "scrolled": true
29 | },
30 | "outputs": [],
31 | "source": [
32 | "def scrape_great_courses(url, pages=15):\n",
33 | "    \"\"\"Scrape `pages` result pages of the search at `url`; return a list with one dict per product list item.\"\"\"\n",
34 | "    def matching(fields, key):\n",
35 | "        # Every text field containing `key`; indexing [0] raises IndexError when a required field is missing.\n",
36 | "        return [s for s in fields if key in s]\n",
37 | "    sess = HTMLSession()\n",
38 | "    data = []\n",
39 | "    for page in range(pages):\n",
40 | "        r = sess.get(url + '&page=' + str(page + 1))  # page numbers sent are 1..pages\n",
41 | "        for item in r.html.find('li.bc-list-item.productListItem', first=False):\n",
42 | "            text_fields = item.text.split('\\n')\n",
43 | "            stars = matching(text_fields, 'stars')[0]\n",
44 | "            series = matching(text_fields, 'Series:')\n",
45 | "            released = matching(text_fields, 'Release date:')\n",
46 | "            dict_entry = {\n",
47 | "                'title': text_fields[0],\n",
48 | "                # Builtin float/int: the np.float / np.int aliases were removed in NumPy 1.24.\n",
49 | "                'price': float(matching(text_fields, 'Regular')[0].split('$')[1]),\n",
50 | "                'length': matching(text_fields, 'Length')[0].split(': ')[1],\n",
51 | "                'rating': float(stars.split(' out')[0]),\n",
52 | "                'rating_count': int(stars.split('stars ')[1].replace(',', '')),\n",
53 | "                'link': 'https://www.audible.com' + [link for link in item.links if '/pd/' in link][0],\n",
54 | "                'series': series[0].split('Series: ')[1] if series else 'N/A',\n",
55 | "            }\n",
56 | "            if released:  # 'release_date' is left out when no such field is present\n",
57 | "                dict_entry['release_date'] = datetime.strptime(released[0].split(': ')[1], '%m-%d-%y')\n",
58 | "            data.append(dict_entry)\n",
59 | "    return data"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 4,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "df = pd.DataFrame(data=scrape_great_courses(url))"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 5,
74 | "metadata": {
75 | "scrolled": true
76 | },
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/html": [
81 | "
\n",
82 | "\n",
95 | "
\n",
96 | " \n",
97 | " \n",
98 | " | \n",
99 | " length | \n",
100 | " link | \n",
101 | " price | \n",
102 | " rating | \n",
103 | " rating_count | \n",
104 | " release_date | \n",
105 | " series | \n",
106 | " title | \n",
107 | "
\n",
108 | " \n",
109 | " \n",
110 | " \n",
111 | " 0 | \n",
112 | " 43 hrs and 23 mins | \n",
113 | " https://www.audible.com/pd/The-History-of-the-... | \n",
114 | " 59.95 | \n",
115 | " 4.5 | \n",
116 | " 2315 | \n",
117 | " 2013-07-08 | \n",
118 | " The Great Courses: Modern History | \n",
119 | " The History of the United States, 2nd Edition | \n",
120 | "
\n",
121 | " \n",
122 | " 1 | \n",
123 | " 12 hrs and 39 mins | \n",
124 | " https://www.audible.com/pd/Your-Best-Brain-The... | \n",
125 | " 34.95 | \n",
126 | " 4.5 | \n",
127 | " 2629 | \n",
128 | " 2014-11-14 | \n",
129 | " The Great Courses: Psychology | \n",
130 | " Your Best Brain: The Science of Brain Improvement | \n",
131 | "
\n",
132 | " \n",
133 | " 2 | \n",
134 | " 18 hrs and 15 mins | \n",
135 | " https://www.audible.com/pd/The-Story-of-Human-... | \n",
136 | " 41.95 | \n",
137 | " 4.5 | \n",
138 | " 3613 | \n",
139 | " 2013-07-08 | \n",
140 | " The Great Courses: Linguistics | \n",
141 | " The Story of Human Language | \n",
142 | "
\n",
143 | " \n",
144 | " 3 | \n",
145 | " 36 hrs and 34 mins | \n",
146 | " https://www.audible.com/pd/How-to-Listen-to-an... | \n",
147 | " 59.95 | \n",
148 | " 4.5 | \n",
149 | " 2337 | \n",
150 | " 2013-07-08 | \n",
151 | " The Great Courses: Fine Arts & Music | \n",
152 | " How to Listen to and Understand Great Music, 3... | \n",
153 | "
\n",
154 | " \n",
155 | " 4 | \n",
156 | " 31 hrs and 18 mins | \n",
157 | " https://www.audible.com/pd/Critical-Business-S... | \n",
158 | " 59.95 | \n",
159 | " 4.5 | \n",
160 | " 2171 | \n",
161 | " 2015-04-08 | \n",
162 | " The Great Courses: Professional | \n",
163 | " Critical Business Skills for Success | \n",
164 | "
\n",
165 | " \n",
166 | "
\n",
167 | "
"
168 | ],
169 | "text/plain": [
170 | " length link \\\n",
171 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n",
172 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n",
173 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n",
174 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n",
175 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n",
176 | "\n",
177 | " price rating rating_count release_date \\\n",
178 | "0 59.95 4.5 2315 2013-07-08 \n",
179 | "1 34.95 4.5 2629 2014-11-14 \n",
180 | "2 41.95 4.5 3613 2013-07-08 \n",
181 | "3 59.95 4.5 2337 2013-07-08 \n",
182 | "4 59.95 4.5 2171 2015-04-08 \n",
183 | "\n",
184 | " series \\\n",
185 | "0 The Great Courses: Modern History \n",
186 | "1 The Great Courses: Psychology \n",
187 | "2 The Great Courses: Linguistics \n",
188 | "3 The Great Courses: Fine Arts & Music \n",
189 | "4 The Great Courses: Professional \n",
190 | "\n",
191 | " title \n",
192 | "0 The History of the United States, 2nd Edition \n",
193 | "1 Your Best Brain: The Science of Brain Improvement \n",
194 | "2 The Story of Human Language \n",
195 | "3 How to Listen to and Understand Great Music, 3... \n",
196 | "4 Critical Business Skills for Success "
197 | ]
198 | },
199 | "execution_count": 5,
200 | "metadata": {},
201 | "output_type": "execute_result"
202 | }
203 | ],
204 | "source": [
205 | "df.head()"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 6,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "def scrape_sale_courses(pages=5, page_dir='/Users/tobymanders/Desktop'):\n",
215 | "    \"\"\"Parse the saved sale pages `<page_dir>/1.html` .. `<page_dir>/<pages>.html`.\n",
216 | "\n",
217 | "    Returns a list of dicts holding 'title', 'sale' (always 'Yes') and, when a text\n",
218 | "    field containing 'Member' exists, 'member-price' as a float.\n",
219 | "    \"\"\"\n",
220 | "    data = []\n",
221 | "    for page in range(pages):\n",
222 | "        with open('{}/{}.html'.format(page_dir, page + 1)) as file:\n",
223 | "            r = HTML(html=file.read())\n",
224 | "        for item in r.find('li.bc-list-item.productListItem', first=False):\n",
225 | "            text_fields = item.text.split('\\n')\n",
226 | "            dict_entry = {'title': text_fields[0], 'sale': 'Yes'}\n",
227 | "            member = [s for s in text_fields if 'Member' in s]\n",
228 | "            if member:\n",
229 | "                # Builtin float: the np.float alias was removed in NumPy 1.24.\n",
230 | "                dict_entry['member-price'] = float(member[0].split('$')[1].split(' or')[0])\n",
231 | "            data.append(dict_entry)\n",
232 | "    return data"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 7,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "sale_df = pd.DataFrame(data=scrape_sale_courses())"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 8,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "df_merge = df.join(sale_df.set_index('title'), on='title')"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 9,
256 | "metadata": {},
257 | "outputs": [
258 | {
259 | "data": {
260 | "text/html": [
261 | "\n",
262 | "\n",
275 | "
\n",
276 | " \n",
277 | " \n",
278 | " | \n",
279 | " length | \n",
280 | " link | \n",
281 | " price | \n",
282 | " rating | \n",
283 | " rating_count | \n",
284 | " release_date | \n",
285 | " series | \n",
286 | " title | \n",
287 | " member-price | \n",
288 | " sale | \n",
289 | "
\n",
290 | " \n",
291 | " \n",
292 | " \n",
293 | " 0 | \n",
294 | " 43 hrs and 23 mins | \n",
295 | " https://www.audible.com/pd/The-History-of-the-... | \n",
296 | " 59.95 | \n",
297 | " 4.5 | \n",
298 | " 2315 | \n",
299 | " 2013-07-08 | \n",
300 | " The Great Courses: Modern History | \n",
301 | " The History of the United States, 2nd Edition | \n",
302 | " NaN | \n",
303 | " Yes | \n",
304 | "
\n",
305 | " \n",
306 | " 1 | \n",
307 | " 12 hrs and 39 mins | \n",
308 | " https://www.audible.com/pd/Your-Best-Brain-The... | \n",
309 | " 34.95 | \n",
310 | " 4.5 | \n",
311 | " 2629 | \n",
312 | " 2014-11-14 | \n",
313 | " The Great Courses: Psychology | \n",
314 | " Your Best Brain: The Science of Brain Improvement | \n",
315 | " 24.46 | \n",
316 | " Yes | \n",
317 | "
\n",
318 | " \n",
319 | " 2 | \n",
320 | " 18 hrs and 15 mins | \n",
321 | " https://www.audible.com/pd/The-Story-of-Human-... | \n",
322 | " 41.95 | \n",
323 | " 4.5 | \n",
324 | " 3613 | \n",
325 | " 2013-07-08 | \n",
326 | " The Great Courses: Linguistics | \n",
327 | " The Story of Human Language | \n",
328 | " NaN | \n",
329 | " Yes | \n",
330 | "
\n",
331 | " \n",
332 | " 3 | \n",
333 | " 36 hrs and 34 mins | \n",
334 | " https://www.audible.com/pd/How-to-Listen-to-an... | \n",
335 | " 59.95 | \n",
336 | " 4.5 | \n",
337 | " 2337 | \n",
338 | " 2013-07-08 | \n",
339 | " The Great Courses: Fine Arts & Music | \n",
340 | " How to Listen to and Understand Great Music, 3... | \n",
341 | " 41.96 | \n",
342 | " Yes | \n",
343 | "
\n",
344 | " \n",
345 | " 4 | \n",
346 | " 31 hrs and 18 mins | \n",
347 | " https://www.audible.com/pd/Critical-Business-S... | \n",
348 | " 59.95 | \n",
349 | " 4.5 | \n",
350 | " 2171 | \n",
351 | " 2015-04-08 | \n",
352 | " The Great Courses: Professional | \n",
353 | " Critical Business Skills for Success | \n",
354 | " 41.96 | \n",
355 | " Yes | \n",
356 | "
\n",
357 | " \n",
358 | "
\n",
359 | "
"
360 | ],
361 | "text/plain": [
362 | " length link \\\n",
363 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n",
364 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n",
365 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n",
366 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n",
367 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n",
368 | "\n",
369 | " price rating rating_count release_date \\\n",
370 | "0 59.95 4.5 2315 2013-07-08 \n",
371 | "1 34.95 4.5 2629 2014-11-14 \n",
372 | "2 41.95 4.5 3613 2013-07-08 \n",
373 | "3 59.95 4.5 2337 2013-07-08 \n",
374 | "4 59.95 4.5 2171 2015-04-08 \n",
375 | "\n",
376 | " series \\\n",
377 | "0 The Great Courses: Modern History \n",
378 | "1 The Great Courses: Psychology \n",
379 | "2 The Great Courses: Linguistics \n",
380 | "3 The Great Courses: Fine Arts & Music \n",
381 | "4 The Great Courses: Professional \n",
382 | "\n",
383 | " title member-price sale \n",
384 | "0 The History of the United States, 2nd Edition NaN Yes \n",
385 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n",
386 | "2 The Story of Human Language NaN Yes \n",
387 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n",
388 | "4 Critical Business Skills for Success 41.96 Yes "
389 | ]
390 | },
391 | "execution_count": 9,
392 | "metadata": {},
393 | "output_type": "execute_result"
394 | }
395 | ],
396 | "source": [
397 | "df_merge.head()"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 10,
403 | "metadata": {},
404 | "outputs": [],
405 | "source": [
406 | "df_merge['sale'] = df_merge['sale'].fillna('No')"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": 11,
412 | "metadata": {},
413 | "outputs": [
414 | {
415 | "data": {
416 | "text/html": [
417 | "\n",
418 | "\n",
431 | "
\n",
432 | " \n",
433 | " \n",
434 | " | \n",
435 | " length | \n",
436 | " link | \n",
437 | " price | \n",
438 | " rating | \n",
439 | " rating_count | \n",
440 | " release_date | \n",
441 | " series | \n",
442 | " title | \n",
443 | " member-price | \n",
444 | " sale | \n",
445 | "
\n",
446 | " \n",
447 | " \n",
448 | " \n",
449 | " 0 | \n",
450 | " 43 hrs and 23 mins | \n",
451 | " https://www.audible.com/pd/The-History-of-the-... | \n",
452 | " 59.95 | \n",
453 | " 4.5 | \n",
454 | " 2315 | \n",
455 | " 2013-07-08 | \n",
456 | " The Great Courses: Modern History | \n",
457 | " The History of the United States, 2nd Edition | \n",
458 | " NaN | \n",
459 | " Yes | \n",
460 | "
\n",
461 | " \n",
462 | " 1 | \n",
463 | " 12 hrs and 39 mins | \n",
464 | " https://www.audible.com/pd/Your-Best-Brain-The... | \n",
465 | " 34.95 | \n",
466 | " 4.5 | \n",
467 | " 2629 | \n",
468 | " 2014-11-14 | \n",
469 | " The Great Courses: Psychology | \n",
470 | " Your Best Brain: The Science of Brain Improvement | \n",
471 | " 24.46 | \n",
472 | " Yes | \n",
473 | "
\n",
474 | " \n",
475 | " 2 | \n",
476 | " 18 hrs and 15 mins | \n",
477 | " https://www.audible.com/pd/The-Story-of-Human-... | \n",
478 | " 41.95 | \n",
479 | " 4.5 | \n",
480 | " 3613 | \n",
481 | " 2013-07-08 | \n",
482 | " The Great Courses: Linguistics | \n",
483 | " The Story of Human Language | \n",
484 | " NaN | \n",
485 | " Yes | \n",
486 | "
\n",
487 | " \n",
488 | " 3 | \n",
489 | " 36 hrs and 34 mins | \n",
490 | " https://www.audible.com/pd/How-to-Listen-to-an... | \n",
491 | " 59.95 | \n",
492 | " 4.5 | \n",
493 | " 2337 | \n",
494 | " 2013-07-08 | \n",
495 | " The Great Courses: Fine Arts & Music | \n",
496 | " How to Listen to and Understand Great Music, 3... | \n",
497 | " 41.96 | \n",
498 | " Yes | \n",
499 | "
\n",
500 | " \n",
501 | " 4 | \n",
502 | " 31 hrs and 18 mins | \n",
503 | " https://www.audible.com/pd/Critical-Business-S... | \n",
504 | " 59.95 | \n",
505 | " 4.5 | \n",
506 | " 2171 | \n",
507 | " 2015-04-08 | \n",
508 | " The Great Courses: Professional | \n",
509 | " Critical Business Skills for Success | \n",
510 | " 41.96 | \n",
511 | " Yes | \n",
512 | "
\n",
513 | " \n",
514 | "
\n",
515 | "
"
516 | ],
517 | "text/plain": [
518 | " length link \\\n",
519 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n",
520 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n",
521 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n",
522 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n",
523 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n",
524 | "\n",
525 | " price rating rating_count release_date \\\n",
526 | "0 59.95 4.5 2315 2013-07-08 \n",
527 | "1 34.95 4.5 2629 2014-11-14 \n",
528 | "2 41.95 4.5 3613 2013-07-08 \n",
529 | "3 59.95 4.5 2337 2013-07-08 \n",
530 | "4 59.95 4.5 2171 2015-04-08 \n",
531 | "\n",
532 | " series \\\n",
533 | "0 The Great Courses: Modern History \n",
534 | "1 The Great Courses: Psychology \n",
535 | "2 The Great Courses: Linguistics \n",
536 | "3 The Great Courses: Fine Arts & Music \n",
537 | "4 The Great Courses: Professional \n",
538 | "\n",
539 | " title member-price sale \n",
540 | "0 The History of the United States, 2nd Edition NaN Yes \n",
541 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n",
542 | "2 The Story of Human Language NaN Yes \n",
543 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n",
544 | "4 Critical Business Skills for Success 41.96 Yes "
545 | ]
546 | },
547 | "execution_count": 11,
548 | "metadata": {},
549 | "output_type": "execute_result"
550 | }
551 | ],
552 | "source": [
553 | "df_merge.head()"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 12,
559 | "metadata": {},
560 | "outputs": [],
561 | "source": [
562 | "def fix_mins(x):\n",
563 | "    \"\"\"Rewrite a length string such as '5 hrs and 7 mins' as '05 hrs 07 mins'.\n",
564 | "\n",
565 | "    A component that is absent from `x` (no 'hr' or no 'min' substring) becomes '00'.\n",
566 | "    \"\"\"\n",
567 | "    has_hrs, has_mins = 'hr' in x, 'min' in x\n",
568 | "    hours = x.split(' hr')[0] if has_hrs else '00'\n",
569 | "    if has_hrs and has_mins:\n",
570 | "        # With both parts present the minutes sit after the 'and ' separator.\n",
571 | "        minutes = x.split(' min')[0].split('and ')[1]\n",
572 | "    elif has_mins:\n",
573 | "        minutes = x.split(' min')[0]\n",
574 | "    else:\n",
575 | "        minutes = '00'\n",
576 | "    return make_len(hours, minutes)\n",
577 | "\n",
578 | "def make_len(hrs, mins):\n",
579 | "    \"\"\"Prefix each part shorter than two characters with '0' and join as 'HH hrs MM mins'.\"\"\"\n",
580 | "    padded_hrs = hrs if len(hrs) >= 2 else '0' + hrs\n",
581 | "    padded_mins = mins if len(mins) >= 2 else '0' + mins\n",
582 | "    return padded_hrs + ' hrs ' + padded_mins + ' mins'"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": 13,
588 | "metadata": {},
589 | "outputs": [],
590 | "source": [
591 | "df_merge['length'] = df_merge['length'].apply(fix_mins)"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 14,
597 | "metadata": {},
598 | "outputs": [],
599 | "source": [
600 | "columns = ['title', 'sale', 'price', 'member-price', 'length', 'rating', 'rating_count', 'release_date', 'series', 'link']"
601 | ]
602 | },
603 | {
604 | "cell_type": "code",
605 | "execution_count": 15,
606 | "metadata": {},
607 | "outputs": [],
608 | "source": [
609 | "df_merge.to_csv('great_courses_all_titles_v2.csv', columns=columns, index=False)"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 16,
615 | "metadata": {},
616 | "outputs": [],
617 | "source": [
618 | "url_list = list(df_merge['link'])"
619 | ]
620 | },
621 | {
622 | "cell_type": "code",
623 | "execution_count": 17,
624 | "metadata": {},
625 | "outputs": [
626 | {
627 | "data": {
628 | "text/plain": [
629 | "724"
630 | ]
631 | },
632 | "execution_count": 17,
633 | "metadata": {},
634 | "output_type": "execute_result"
635 | }
636 | ],
637 | "source": [
638 | "len(df_merge)"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": 50,
644 | "metadata": {},
645 | "outputs": [],
646 | "source": [
647 | "def get_accurate_ratings(addresses, rating_dict):\n",
648 | "    \"\"\"Store the rating parsed from each page in `addresses` under `rating_dict[address]` (NaN on failure); return the dict.\"\"\"\n",
649 | "    sess = HTMLSession()  # one session per call, shared by every address in this call\n",
650 | "    for address in addresses:\n",
651 | "        r = sess.get(address)\n",
652 | "        try:  # builtin float: np.float was removed in NumPy 1.24 and the old bare except hid that as NaN\n",
653 | "            rating_dict[address] = float(r.html.find('div.bc-row.bc-spacing-small', first=False)[1].text.split(' stars ')[1][:3])\n",
654 | "        except Exception:  # still best-effort, but no longer swallows KeyboardInterrupt/SystemExit\n",
655 | "            rating_dict[address] = np.nan\n",
656 | "    return rating_dict"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": 51,
662 | "metadata": {},
663 | "outputs": [],
664 | "source": [
665 | "from threading import Thread\n",
666 | "\n",
667 | "def threaded_ratings(nthreads, addresses, rating_dict=None):\n",
668 | "    \"\"\"Run get_accurate_ratings over `addresses` on `nthreads` threads and return the dict they fill.\"\"\"\n",
669 | "    if rating_dict is None:  # identity test; `== None` misbehaves on objects that overload ==\n",
670 | "        rating_dict = {}\n",
671 | "    # Thread i gets every nthreads-th address starting at offset i; all threads share rating_dict.\n",
672 | "    threads = [Thread(target=get_accurate_ratings, args=(addresses[i::nthreads], rating_dict))\n",
673 | "               for i in range(nthreads)]\n",
674 | "    for t in threads:\n",
675 | "        t.start()\n",
676 | "    for t in threads:\n",
677 | "        t.join()\n",
678 | "    return rating_dict"
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": 52,
684 | "metadata": {},
685 | "outputs": [],
686 | "source": [
687 | "rating_dict = threaded_ratings(64, url_list)"
688 | ]
689 | },
690 | {
691 | "cell_type": "code",
692 | "execution_count": 58,
693 | "metadata": {},
694 | "outputs": [],
695 | "source": [
696 | "df_merge['rating'] = df_merge['link'].apply(lambda x: rating_dict[x])"
697 | ]
698 | },
699 | {
700 | "cell_type": "code",
701 | "execution_count": 59,
702 | "metadata": {},
703 | "outputs": [
704 | {
705 | "data": {
706 | "text/html": [
707 | "\n",
708 | "\n",
721 | "
\n",
722 | " \n",
723 | " \n",
724 | " | \n",
725 | " length | \n",
726 | " link | \n",
727 | " price | \n",
728 | " rating | \n",
729 | " rating_count | \n",
730 | " release_date | \n",
731 | " series | \n",
732 | " title | \n",
733 | " member-price | \n",
734 | " sale | \n",
735 | "
\n",
736 | " \n",
737 | " \n",
738 | " \n",
739 | " 0 | \n",
740 | " 43 hrs 23 mins | \n",
741 | " https://www.audible.com/pd/The-History-of-the-... | \n",
742 | " 59.95 | \n",
743 | " 4.7 | \n",
744 | " 2315 | \n",
745 | " 2013-07-08 | \n",
746 | " The Great Courses: Modern History | \n",
747 | " The History of the United States, 2nd Edition | \n",
748 | " NaN | \n",
749 | " Yes | \n",
750 | "
\n",
751 | " \n",
752 | " 1 | \n",
753 | " 12 hrs 39 mins | \n",
754 | " https://www.audible.com/pd/Your-Best-Brain-The... | \n",
755 | " 34.95 | \n",
756 | " 4.5 | \n",
757 | " 2629 | \n",
758 | " 2014-11-14 | \n",
759 | " The Great Courses: Psychology | \n",
760 | " Your Best Brain: The Science of Brain Improvement | \n",
761 | " 24.46 | \n",
762 | " Yes | \n",
763 | "
\n",
764 | " \n",
765 | " 2 | \n",
766 | " 18 hrs 15 mins | \n",
767 | " https://www.audible.com/pd/The-Story-of-Human-... | \n",
768 | " 41.95 | \n",
769 | " 4.7 | \n",
770 | " 3613 | \n",
771 | " 2013-07-08 | \n",
772 | " The Great Courses: Linguistics | \n",
773 | " The Story of Human Language | \n",
774 | " NaN | \n",
775 | " Yes | \n",
776 | "
\n",
777 | " \n",
778 | " 3 | \n",
779 | " 36 hrs 34 mins | \n",
780 | " https://www.audible.com/pd/How-to-Listen-to-an... | \n",
781 | " 59.95 | \n",
782 | " 4.7 | \n",
783 | " 2337 | \n",
784 | " 2013-07-08 | \n",
785 | " The Great Courses: Fine Arts & Music | \n",
786 | " How to Listen to and Understand Great Music, 3... | \n",
787 | " 41.96 | \n",
788 | " Yes | \n",
789 | "
\n",
790 | " \n",
791 | " 4 | \n",
792 | " 31 hrs 18 mins | \n",
793 | " https://www.audible.com/pd/Critical-Business-S... | \n",
794 | " 59.95 | \n",
795 | " 4.6 | \n",
796 | " 2171 | \n",
797 | " 2015-04-08 | \n",
798 | " The Great Courses: Professional | \n",
799 | " Critical Business Skills for Success | \n",
800 | " 41.96 | \n",
801 | " Yes | \n",
802 | "
\n",
803 | " \n",
804 | "
\n",
805 | "
"
806 | ],
807 | "text/plain": [
808 | " length link price \\\n",
809 | "0 43 hrs 23 mins https://www.audible.com/pd/The-History-of-the-... 59.95 \n",
810 | "1 12 hrs 39 mins https://www.audible.com/pd/Your-Best-Brain-The... 34.95 \n",
811 | "2 18 hrs 15 mins https://www.audible.com/pd/The-Story-of-Human-... 41.95 \n",
812 | "3 36 hrs 34 mins https://www.audible.com/pd/How-to-Listen-to-an... 59.95 \n",
813 | "4 31 hrs 18 mins https://www.audible.com/pd/Critical-Business-S... 59.95 \n",
814 | "\n",
815 | " rating rating_count release_date series \\\n",
816 | "0 4.7 2315 2013-07-08 The Great Courses: Modern History \n",
817 | "1 4.5 2629 2014-11-14 The Great Courses: Psychology \n",
818 | "2 4.7 3613 2013-07-08 The Great Courses: Linguistics \n",
819 | "3 4.7 2337 2013-07-08 The Great Courses: Fine Arts & Music \n",
820 | "4 4.6 2171 2015-04-08 The Great Courses: Professional \n",
821 | "\n",
822 | " title member-price sale \n",
823 | "0 The History of the United States, 2nd Edition NaN Yes \n",
824 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n",
825 | "2 The Story of Human Language NaN Yes \n",
826 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n",
827 | "4 Critical Business Skills for Success 41.96 Yes "
828 | ]
829 | },
830 | "execution_count": 59,
831 | "metadata": {},
832 | "output_type": "execute_result"
833 | }
834 | ],
835 | "source": [
836 | "df_merge.head()"
837 | ]
838 | },
839 | {
840 | "cell_type": "code",
841 | "execution_count": 62,
842 | "metadata": {},
843 | "outputs": [],
844 | "source": [
845 | "df_merge.to_csv('great_courses_list_v3.csv', columns=columns, index=False)"
846 | ]
847 | },
848 | {
849 | "cell_type": "code",
850 | "execution_count": null,
851 | "metadata": {},
852 | "outputs": [],
853 | "source": []
854 | }
855 | ],
856 | "metadata": {
857 | "kernelspec": {
858 | "display_name": "Python 3",
859 | "language": "python",
860 | "name": "python3"
861 | },
862 | "language_info": {
863 | "codemirror_mode": {
864 | "name": "ipython",
865 | "version": 3
866 | },
867 | "file_extension": ".py",
868 | "mimetype": "text/x-python",
869 | "name": "python",
870 | "nbconvert_exporter": "python",
871 | "pygments_lexer": "ipython3",
872 | "version": "3.7.2"
873 | }
874 | },
875 | "nbformat": 4,
876 | "nbformat_minor": 2
877 | }
878 |
--------------------------------------------------------------------------------
/audible_eda/.ipynb_checkpoints/audible_reviews_scraper-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "%matplotlib inline\n",
13 | "import nest_asyncio\n",
14 | "nest_asyncio.apply()\n",
15 | "from requests_html import HTML, HTMLSession, AsyncHTMLSession\n",
16 | "from threading import Thread\n",
17 | "import time"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 2,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "df = pd.read_csv('/Users/cytology/Documents/code/datasets/all_english_audible.csv')\n",
27 | "df = df[~df['asin'].isnull()]"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "64023215.0"
39 | ]
40 | },
41 | "execution_count": 3,
42 | "metadata": {},
43 | "output_type": "execute_result"
44 | }
45 | ],
46 | "source": [
47 | "total_ratings = df['rating_count'].sum()\n",
48 | "total_ratings"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 4,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "df = df.sort_values('rating_count', ascending=False)\n",
58 | "df.reset_index(inplace=True)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 5,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/html": [
69 | "\n",
70 | "\n",
83 | "
\n",
84 | " \n",
85 | " \n",
86 | " | \n",
87 | " index | \n",
88 | " asin | \n",
89 | " author | \n",
90 | " category | \n",
91 | " length | \n",
92 | " link | \n",
93 | " narrator | \n",
94 | " price | \n",
95 | " rating | \n",
96 | " rating_count | \n",
97 | " release_date | \n",
98 | " title | \n",
99 | "
\n",
100 | " \n",
101 | " \n",
102 | " \n",
103 | " 0 | \n",
104 | " 247928 | \n",
105 | " B005FRGT44 | \n",
106 | " Ernest Cline | \n",
107 | " Sci-Fi & Fantasy | \n",
108 | " 15 hrs and 40 mins | \n",
109 | " https://www.audible.com/pd/Ready-Player-One-Au... | \n",
110 | " Wil Wheaton | \n",
111 | " 31.50 | \n",
112 | " 5.0 | \n",
113 | " 216094.0 | \n",
114 | " 2011-08-16 | \n",
115 | " Ready Player One | \n",
116 | "
\n",
117 | " \n",
118 | " 1 | \n",
119 | " 248009 | \n",
120 | " B00B5HZGUG | \n",
121 | " Andy Weir | \n",
122 | " Sci-Fi & Fantasy | \n",
123 | " 10 hrs and 53 mins | \n",
124 | " https://www.audible.com/pd/The-Martian-Audiobo... | \n",
125 | " R. C. Bray | \n",
126 | " 29.99 | \n",
127 | " 5.0 | \n",
128 | " 164988.0 | \n",
129 | " 2013-03-22 | \n",
130 | " The Martian | \n",
131 | "
\n",
132 | " \n",
133 | " 2 | \n",
134 | " 142087 | \n",
135 | " B00QXW5GYY | \n",
136 | " Paula Hawkins | \n",
137 | " Mysteries & Thrillers | \n",
138 | " 10 hrs and 58 mins | \n",
139 | " https://www.audible.com/pd/The-Girl-on-the-Tra... | \n",
140 | " Clare Corbett, Louise Brealey, India Fisher | \n",
141 | " 28.00 | \n",
142 | " 4.5 | \n",
143 | " 133818.0 | \n",
144 | " 2015-01-13 | \n",
145 | " The Girl on the Train | \n",
146 | "
\n",
147 | " \n",
148 | " 3 | \n",
149 | " 4895 | \n",
150 | " B01IW9TQPK | \n",
151 | " Trevor Noah | \n",
152 | " Bios & Memoirs | \n",
153 | " 8 hrs and 44 mins | \n",
154 | " https://www.audible.com/pd/Born-a-Crime-Audiob... | \n",
155 | " Trevor Noah | \n",
156 | " 24.95 | \n",
157 | " 5.0 | \n",
158 | " 123838.0 | \n",
159 | " 2016-11-15 | \n",
160 | " Born a Crime | \n",
161 | "
\n",
162 | " \n",
163 | " 4 | \n",
164 | " 282008 | \n",
165 | " B01I28NFEE | \n",
166 | " Mark Manson | \n",
167 | " Self Development | \n",
168 | " 5 hrs and 17 mins | \n",
169 | " https://www.audible.com/pd/The-Subtle-Art-of-N... | \n",
170 | " Roger Wayne | \n",
171 | " 23.95 | \n",
172 | " 4.5 | \n",
173 | " 113261.0 | \n",
174 | " 2016-09-13 | \n",
175 | " The Subtle Art of Not Giving a F*ck | \n",
176 | "
\n",
177 | " \n",
178 | "
\n",
179 | "
"
180 | ],
181 | "text/plain": [
182 | " index asin author category \\\n",
183 | "0 247928 B005FRGT44 Ernest Cline Sci-Fi & Fantasy \n",
184 | "1 248009 B00B5HZGUG Andy Weir Sci-Fi & Fantasy \n",
185 | "2 142087 B00QXW5GYY Paula Hawkins Mysteries & Thrillers \n",
186 | "3 4895 B01IW9TQPK Trevor Noah Bios & Memoirs \n",
187 | "4 282008 B01I28NFEE Mark Manson Self Development \n",
188 | "\n",
189 | " length link \\\n",
190 | "0 15 hrs and 40 mins https://www.audible.com/pd/Ready-Player-One-Au... \n",
191 | "1 10 hrs and 53 mins https://www.audible.com/pd/The-Martian-Audiobo... \n",
192 | "2 10 hrs and 58 mins https://www.audible.com/pd/The-Girl-on-the-Tra... \n",
193 | "3 8 hrs and 44 mins https://www.audible.com/pd/Born-a-Crime-Audiob... \n",
194 | "4 5 hrs and 17 mins https://www.audible.com/pd/The-Subtle-Art-of-N... \n",
195 | "\n",
196 | " narrator price rating rating_count \\\n",
197 | "0 Wil Wheaton 31.50 5.0 216094.0 \n",
198 | "1 R. C. Bray 29.99 5.0 164988.0 \n",
199 | "2 Clare Corbett, Louise Brealey, India Fisher 28.00 4.5 133818.0 \n",
200 | "3 Trevor Noah 24.95 5.0 123838.0 \n",
201 | "4 Roger Wayne 23.95 4.5 113261.0 \n",
202 | "\n",
203 | " release_date title \n",
204 | "0 2011-08-16 Ready Player One \n",
205 | "1 2013-03-22 The Martian \n",
206 | "2 2015-01-13 The Girl on the Train \n",
207 | "3 2016-11-15 Born a Crime \n",
208 | "4 2016-09-13 The Subtle Art of Not Giving a F*ck "
209 | ]
210 | },
211 | "execution_count": 5,
212 | "metadata": {},
213 | "output_type": "execute_result"
214 | }
215 | ],
216 | "source": [
217 | "df.head()"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 10,
223 | "metadata": {},
224 | "outputs": [
225 | {
226 | "data": {
227 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAHzBJREFUeJzt3Xl81Xed7/HXh+x7AkkgZG0oe6FAI920m21taS3qWKX19jrakXHpqKNeL3XudbztzDjW61id6VWrdpmpnVqtC7d2c7qMLa1lKQUKlBJCIAmQhOz7dr7zx/mBISRwgJDfOb+8n49HHud3vud7ks83PXnz7fe3mXMOEREJlil+FyAiIuNP4S4iEkAKdxGRAFK4i4gEkMJdRCSAFO4iIgGkcBcRCSCFu4hIACncRUQCKN6vH5ybm+vKysr8+vEiIjFp06ZNh51zeSfr51u4l5WVsXHjRr9+vIhITDKzfZH007KMiEgAKdxFRAJI4S4iEkAKdxGRAFK4i4gEkMJdRCSAFO4iIgHk23HuIiJB1zswRGNHH42dfeFH7+u98/NZXJR9Vn+2wl1E5BQMDIVo7uo/JqyPhrf3eNhr7+gbHPV75GYkKdxFRCbC4FCIpq5+DrX1Ut/eS31HHw3tveHn3nZDRx/NXf2jvj8jKZ68jCRyM5KYX5DJZXOSyMtIIi/de8xIIjc9iWnpiSTEnf0VcYW7iASac47W7gHqO8JB3dDeR317L4fae6lv76PBaz/c2UfIHfveKQZ5GUlMz0ymKCeVZaU5x4V1vveYkhjnzwDHoHAXkZjlnKOtZ4C61h4OtvZysK2HOu/xYGsvB9t7qG/vo38wdNx7c1ITmJ6ZTH5mMvNmZBzdnpGZzPTMcKBPS0skfgJm2WeDwl1EolZ3/yAHhoV1XWtPeLut92ig9wwMHfOehDhjRlYyBVkpLCvJYcYooZ2XkURyQnTNtMebwl1EfNPeO0BNczc1zT3UtnRT2xJ+PDL7bu0eOKa/GeSlJ1GQncK8GRlcOTefgqxkZmanhL+ykslNT2LKFPNpRNFD4S4iZ01X3+DRwK5pDod3jRfiNc3dtPceezRJelI8RTkpFGancEFpNgVZ4e0jAT49M5nE+NhcJploCncROW2hkONQey/7mrrZ19RFddOREO+mpqXnuCNLkhOmUJyTSlFOCheU5lCUk+I9T6V4agpZKQmYadY9HhTuInJCg0Mh6lp7qG7qZr8X4PuausKB3tx9zM7KhDijyAvv6wqzhoV3CsVTU5mWlqjwniAKdxFhKOSoa+lhz+FOqhq7js7C9zd1UdvSw+CwYwSTE6ZQNi2N8rw0rpyXT+m0VMqmpVE6LZWCrBTitN4dFRTuIpNIe+8AVY1dVDV2sqex09vuYm9T1zEz8IykeMpy01hYmMUNiwsonZZ2NMDzM5I0+44BCneRgAmFHLUtPezxAnzP0TDv4nBn39F+cVOM0qmplOelcfncPGblpVGel055bhpTtXwS8xTuIjHKOUddaw+76zt5p76DXfUd7K7vpLKh85hjv3NSEyjPS+fKuXnMyg+Hd3leOiVTU3XkSYAp3EWinHPhI1Leqe9kd30Huw518E5DJ5X1HXT1/ynE8zOSmDsjg1uWlzBnejrn5qdTnpfO1LREH6sXvyjcRaJI78AQ79R3sONAOzsPtrPzYAc7D7XTMex48Nz0ROZMz+DmimJmT09nzvQM5uRnkJWa4GPlEm0U7iI+aejoZefBPwX5joPtVDV2Hr14VVpiHPMKMlm5ZCZzp2cwe3oGc6ZnaCYuEVG4i5xlzjn2NXWzta6N7XVt7DgYDvPDnX86wacwO4X5BZmsOG8G8wsyWTAzk+KcVJ1GL6ctonA3s+uA7wFxwE+cc/844vUS4GEg2+uzxjn31DjXKhL1nHPUNPewra6NrXWtbKtt4626tqOn2SfGTWH29HSunJt/NMTnz8jU
koqMu5OGu5nFAfcB1wC1wAYzW+uc2zGs2/8CHnfO/cDMFgBPAWVnoV6RqHHkaJVttW1srQuH+NbaNtp6whe7Sogz5s3I5MbzZ7K4MIvzCrOYMz1DR6jIhIhk5r4cqHTOVQGY2WPASmB4uDsg09vOAg6MZ5Ei0aC7f5AtNW1srmlh8/5WNu9vPXrcePwUY+6MDFYsmsF5hVksLsxmzox0kuKDfVlZiV6RhHshUDPseS1w4Yg+3wCeM7O/AtKAq8elOhGfOOfYe7iLzftbeWN/OMx31Xcw5O3tLM9N47I5uSwtzmZxUTZzZ2QE/vrgElsiCffR9uiMuBkVtwAPOee+Y2YXA/9mZuc55465/YmZrQZWA5SUlJxOvSJnRe/AEFtqWtlQ3cymfS1srmk9ei3x9KR4lhRn87krZrG0JIclxdnk6IgViXKRhHstUDzseRHHL7vcDlwH4Jx7zcySgVygYXgn59z9wP0AFRUVI/+BEJkwnX2DbNrXwvq9Tazf28yWmjb6h8Jzkdn56bxvwQyWlmSzrDSHWXnpuhiWxJxIwn0DMNvMzgHqgFXArSP67AfeCzxkZvOBZKBxPAsVORNNnX1sqG5h/d5mNlQ3s/1AGyEXvr7KeYVZ/PmlZSwvm0pFWQ7ZqZqVS+w7abg75wbN7A7gWcKHOT7gnNtuZncBG51za4EvAz82s78mvGTz5845zczFN519g7xe1cS6yiZe3XOYtw91AJAUP4WlJdnccdVslpdNZWlJNmlJOt1Dgsf8yuCKigq3ceNGX362BE/f4BCb97fyauVh1u1p4s2aVoZCjqT4KVSU5XDJrFwuKp/KeYVZOoJFYpqZbXLOVZysn6YsEpOcc+yq7+AP7zTySmUT6/c20TsQYorB4qJsPn15OZfOymVZaY6OYpFJSeEuMaOjd4B1lYd5aVcj//lOIwfbeoHwDtBV7yrhklnTuLB8GlkpOttTROEuUevI7PzFtxt5aVcDm/a1MBhyZCTF8+7ZuXzx6jwun5PPjKxkv0sViToKd4kq/YMh/ljVxHM7DvH8zoajs/N5MzL41GXlXDEnj2WlOSTE6RR+kRNRuIvvOnoH+M93Gnluez0vvt1AR98gKQlxXDYnly9ePVuzc5HToHAXXzR09PL7HfU8t72e1/Y00T8UYlpaIisWFXDtwulcem6udoSKnAGFu0yYw519PP3WIZ7ccoD11c04B6XTUvn4JaVcu3AGy0pydCaoyDhRuMtZ1dzVz7PbD/Hk1gO8tqeJkINZeWl8/qrZXL9oBnOnZ2CmQBcZbwp3GXedfYM889Yh1m45wLrKwwyFHOfkpvG5K8/lhsUFCnSRCaBwl3ExFHK8uucwv3qjjmfeOkTPwBDFU1NYfVk5NywqYOHMTAW6yARSuMsZeae+gyfeqOU3m+uob+8jMzmeDy4r5M+WFbKsJEeBLuIThbucso7eAdZuOcBj62vYVtdG3BTjyrl5/O37i7hqXr6OchGJAgp3iYhzjm11bTz6+n7WbjlAd/8Q82Zk8PUbF3DTkpnkpif5XaKIDKNwlxPq7Bvkt2/W8ejr+9l+oJ2UhDjef34Bt15YyvlFWVp2EYlSCncZVVVjJw+9Ws0Tm2rp8mbpd69cyMqlhWQm68JcItFO4S5HOed4efdhHly3lxd3NZIYN4Ubzy/gtotKWVKcrVm6SAxRuAvd/YP86o06Hnq1msqGTnLTk/ji1bP52IWl5GVoLV0kFincJ7GWrn4eerWah1+rprV7gEWFWfzTR87nhsUFuluRSIxTuE9CB9t6+PEf9vLv6/fTMzDE1fOn85eXl1NRquPSRYJC4T6J7Gns5Icv7eE3b9YRcrDy/Jl8+opZzJme4XdpIjLOFO6TQPXhLr73/G5++2YdCXFTuHV5CZ+6rJyinFS/SxORs0ThHmA1zd388wu7eeKNOhLijL94TzmrLyvXCUcik4DCPYAOtvXwLy9U8vjGGsyM2y4q5bNXziI/Q3czEpksFO4B
0tE7wA9e2sNPX9lLyDk++q5iPnfluRRkpfhdmohMMIV7AAwMhXhs/X7u/Y/dNHX184ElM/nytXMpnqo1dZHJSuEew5xz/H5HPf/49NtUHe7iovKpPLRiAYuKsvwuTUR8pnCPUZUNHfzt2u2sq2xiVl4aP/14BVfNy9dx6iICKNxjTlffIN9/YTc/fXkvqYlx3LVyIbcuLyE+borfpYlIFFG4xwjnHL/bdpC/e3Inh9p7+UhFEf/zunlM02GNIjIKhXsMqG3p5s5fbePl3YdZODOT+z62jAtKc/wuS0SimMI9ioVCjkde38e3nn4bB3zj/Qu47eIy4qZoXV1ETkzhHqWqGjtZ88Q21lc3857ZuXzzQ4t0uQARiZjCPcqEQo4H1u3l28/uIil+Cvd8eDE3X1Cko2BE5JQo3KPIobZevvKLLbxSeZir5+fz9x9cxPRMXTJARE6dwj1KPPPWIdb8ait9AyG++aFFrHpXsWbrInLaIjo42syuM7NdZlZpZmvG6PMRM9thZtvN7NHxLTO4uvsHWfPEVj79yCaKc1J58vPv5pblJQp2ETkjJ525m1kccB9wDVALbDCztc65HcP6zAbuBC51zrWYWf7ZKjhI9jR28plHNrG7oZPPXDGLv756DonxOhlJRM5cJMsyy4FK51wVgJk9BqwEdgzr8yngPudcC4BzrmG8Cw2a3209yFd/uYWkhDj+9ZPLec/sPL9LEpEAiSTcC4GaYc9rgQtH9JkDYGbrgDjgG865Z8alwoDpHwzxzad38uC6apaWZHPfrcuYma1L8orI+Iok3Edb/HWjfJ/ZwBVAEfCymZ3nnGs95huZrQZWA5SUlJxysbGuqbOPzzzyBuurm/nEpWXcef18LcOIyFkRSbjXAsXDnhcBB0bp80fn3ACw18x2EQ77DcM7OefuB+4HqKioGPkPRKDtOtTB7Q9voLGjj++tWsLKJYV+lyQiARbJtHEDMNvMzjGzRGAVsHZEn98AVwKYWS7hZZqq8Sw0lr34dgN/9oNX6RsM8fO/vFjBLiJn3Uln7s65QTO7A3iW8Hr6A8657WZ2F7DRObfWe+1aM9sBDAH/wznXdDYLjxUPrtvL3U/uYH5BJj/5eIVueSciE8Kc82d1pKKiwm3cuNGXnz0RnHPc8+wufvDSHq5dMJ17Vy0hNVHnjInImTGzTc65ipP1U9qcBYNDIb726208vrGWWy8s4e6V5+lKjiIyoRTu46x3YIg7Ht3Mf+ys5wvvnc0Xr56ts01FZMIp3MdRT/8Qtz+8gdeqmrh75UJuu7jM75JEZJJSuI+T7v5Bbn9oI6/vbeI7N5/Ph5YV+V2SiExiCvdx0N0/yCcf2sD6vc3800eW8IGlOtRRRPylcD9DPf1DfOLBDWyobua7H9XJSSISHRTuZ2BgKMRnf7aJ9dXN3KtgF5EoogubnKZQyPHVX27lxV2N/P0HFinYRSSqKNxPg3OOu3+3g19vruMr187h1gsn30XQRCS6KdxPw4/+UMWD66r5xKVlfO7Kc/0uR0TkOAr3U/Ts9kN865m3uWFxAf/7hgU6QUlEopLC/RS8VdfGFx97k8VF2Xzn5vOZoksKiEiUUrhHqL69l794eCM5qQn8+L9fQHJCnN8liYiMSYdCRqB/MMSnH9lEe+8AT3zmEvIzkv0uSUTkhBTuEfiHp3ayeX8r/+9jy5hfkOl3OSIiJ6VlmZNYu+UAD71aze3vPocViwr8LkdEJCIK9xOobOhgzRNbuaA0hzXXz/O7HBGRiCncx9A7MMTnfraZlIQ47rt1GQlx+lWJSOzQmvsY/u+zu9hV38GDn3gXM7K0A1VEYoumo6N4tfIwP3llL7ddVMqVc/P9LkdE5JQp3Edo6xngK7/YQnluGneu0Dq7iMQmLcuMcNf/30F9Rx9PfOYSUhP16xGR2KSZ+zCv7D7ME2/U8unLy1lSnO13OSIip03h7unpH+Jrv97GOblp/NVVs/0uR0TkjGjdwfP9F3az
v7mbRz91oa4bIyIxTzN34J36Dn78hypuvqCIS2bl+l2OiMgZm/Th7pzj7id3kJoYx9dWzPe7HBGRcTHpw/3FXQ28vPswX7h6DjlpiX6XIyIyLiZ1uA8Mhfi73+2kPDeN2y4q9bscEZFxM6nD/Wd/3EdVYxdfWzGfxPhJ/asQkYCZtInW1TfIP79QySWzpvHe+brEgIgEy6QN94dfq6apq5+vvG+ubnItIoEzKcO9o3eA+/9QxZVz81hWkuN3OSIi425ShvsDr1TT2j3Al66Z63cpIiJnxaQL9/beAX7yShXXLpjOoqIsv8sRETkrJl24P/r6fjp6B/n8e3X9GBEJrojC3cyuM7NdZlZpZmtO0O/DZubMrGL8Shw//YMhHly3l0vPncZ5hZq1i0hwnTTczSwOuA+4HlgA3GJmC0bplwF8Hnh9vIscL799s4769j5WXzbL71JERM6qSGbuy4FK51yVc64feAxYOUq/u4F7gN5xrG/chEKOH79cxbwZGVw2WxcHE5FgiyTcC4GaYc9rvbajzGwpUOyce3IcaxtXf9jdyDv1nay+rFzHtYtI4EUS7qMloTv6otkU4LvAl0/6jcxWm9lGM9vY2NgYeZXj4JE/7ic3PYkbF8+c0J8rIuKHSMK9Fige9rwIODDseQZwHvCSmVUDFwFrR9up6py73zlX4ZyryMvLO/2qT9GB1h5eeLuej76rSNeQEZFJIZKk2wDMNrNzzCwRWAWsPfKic67NOZfrnCtzzpUBfwRucs5tPCsVn4bHNtTggFXvKvG7FBGRCXHScHfODQJ3AM8CO4HHnXPbzewuM7vpbBd4pgaGQjy2fj9XzMmjeGqq3+WIiEyIiO6h6px7CnhqRNvXx+h7xZmXNX5eeLuBho4+/uFCXa9dRCaPwC9A//qNOnLTk7hi7sSt8YuI+C3Q4d7WPcALbzewcslM4uMCPVQRkWMEOvF+t+0g/UMhPri08OSdRUQCJNDh/pvNdZybn87CmZl+lyIiMqECG+61Ld2sr27mg0sLdUaqiEw6gQ33Z946BMCNiwt8rkREZOIFNtyf21HPvBkZlE5L87sUEZEJF8hwb+rsY2N1M9cumO53KSIivghkuD+/s4GQg2sXzvC7FBERXwQy3J/bcYjC7BQdJSMik1bgwr1vcIh1lU1cNS9fR8mIyKQVuHB/Y18rPQNDvEd3WxKRSSxw4f5KZSNxU4yLZk3zuxQREd8EL9x3H2ZJcTaZyQl+lyIi4ptAhXtrdz9b69p497lakhGRyS1Q4f7aniacQ+vtIjLpBSrc11c3k5wwhfOLs/0uRUTEV4EK9037Wji/KJsEXbtdRCa5wKRgd/8g2w+0U1GW43cpIiK+C0y4b6lpYyjkuKBU4S4iEphwf2N/CwDLShTuIiKBCfdN+1o4Nz+d7NREv0sREfFdIMLdOceWmlaW6CgZEREgIOFe395HU1c/5+kqkCIiQEDCfcfBNgAWFmb5XImISHQIRLhvr2sHYH6BZu4iIhCUcD/QTtm0VNKT4v0uRUQkKgQj3A+2sXCmlmRERI6I+XBv6xmgprmHBdqZKiJyVMyH+65DHQAKdxGRYWI+3CsbOgGYnZ/ucyUiItEj5sN9T2MnKQlxzMxK8bsUEZGoEfPhXtnQSXleGlOmmN+liIhEjZgP9z2NnczK05KMiMhwMR3uPf1D1LX2cK7W20VEjhFRuJvZdWa2y8wqzWzNKK9/ycx2mNlWM3vezErHv9TjVR3uxDk0cxcRGeGk4W5mccB9wPXAAuAWM1swottmoMI5txj4JXDPeBc6mv1N3QCU5aZOxI8TEYkZkczclwOVzrkq51w/8BiwcngH59yLzrlu7+kfgaLxLXN0tS09ABTlKNxFRIaLJNwLgZphz2u9trHcDjx9JkVFqralm8zkeLJSEibix4mIxIxIrrQ12jGGbtSOZv8NqAAuH+P11cBqgJKSkghLHFttSw+FmrWLiBwnkpl7LVA87HkRcGBkJzO7
Gvgb4CbnXN9o38g5d79zrsI5V5GXl3c69R5bWEsPRTk6eUlEZKRIwn0DMNvMzjGzRGAVsHZ4BzNbCvyIcLA3jH+Zx3POUdvSrXAXERnFScPdOTcI3AE8C+wEHnfObTezu8zsJq/bt4F04Bdm9qaZrR3j242b1u4BuvqHtDNVRGQUEd3dwjn3FPDUiLavD9u+epzrOqk/HSmjmbuIyEgxe4bqwbZwuOuCYSIix4vZcG/oCO+zzc9M8rkSEZHoE9PhbgbT0hL9LkVEJOrEbLg3dvQxLS2R+LiYHYKIyFkTs8nY2NFLXkay32WIiESlmA33ho4+8jK03i4iMpqYDffGjj7yFe4iIqOKyXAPhRyNmrmLiIwpJsO9tWeAwZAjL13hLiIympgM95bufgCmpeswSBGR0cRkuLd2DwCQqeu4i4iMKibDvb0nHO7ZCncRkVHFZLi39oSXZXQHJhGR0cVkuLd5yzLZqVpzFxEZTUyGe6u3LJOZHNEVi0VEJp2YDPe2ngEykuJ1XRkRkTHEZDq2dQ+Qlar1dhGRscRmuPcMaGeqiMgJxGS4d/QNkqH1dhGRMcVkuPf0D5GaqHAXERlLTIZ7d/8gKQlxfpchIhK1YjLce/qHSElUuIuIjCUmw717YIhUhbuIyJhiM9w1cxcROaGYC/ehkKN/MERqgnaoioiMJebCvbt/EEDLMiIiJxBz4d7TPwSgZRkRkROIuXDv9sJdM3cRkbHFXLj3DCjcRUROJubCvfvosox2qIqIjCXmwr3Xm7knx8dc6SIiEybmErJ/KARAosJdRGRMMZeQA4PhcE/QjTpERMYUcwk5MOQAzdxFRE4k5hJyYEgzdxGRk4m5hOw/Gu7mcyUiItEronA3s+vMbJeZVZrZmlFeTzKzn3uvv25mZeNd6BFHZu6JmrmLiIzppAlpZnHAfcD1wALgFjNbMKLb7UCLc+5c4LvAt8a70CO0Q1VE5OQiScjlQKVzrso51w88Bqwc0Wcl8LC3/UvgvWZ2VtZNjuxQjdeyjIjImCIJ90KgZtjzWq9t1D7OuUGgDZg2HgWOVJabxopFM3S0jIjICURyDv9oU2R3Gn0ws9XAaoCSkpIIfvTxrlkwnWsWTD+t94qITBaRTH9rgeJhz4uAA2P1MbN4IAtoHvmNnHP3O+cqnHMVeXl5p1exiIicVCThvgGYbWbnmFkisApYO6LPWuDj3vaHgRecc8fN3EVEZGKcdFnGOTdoZncAzwJxwAPOue1mdhew0Tm3Fvgp8G9mVkl4xr7qbBYtIiInFtF1c51zTwFPjWj7+rDtXuDm8S1NREROlw45EREJIIW7iEgAKdxFRAJI4S4iEkDm1xGLZtYI7DvNt+cCh8exnFigMU8OGvPkcCZjLnXOnfREId/C/UyY2UbnXIXfdUwkjXly0Jgnh4kYs5ZlREQCSOEuIhJAsRru9/tdgA805slBY54czvqYY3LNXURETixWZ+4iInICMRfuJ7ufa7QzswfMrMHM3hrWNtXMfm9mu73HHK/dzOz73li3mtmyYe/5uNd/t5l9fFj7BWa2zXvP98/WHbEiZWbFZvaime00s+1m9gWvPchjTjaz9Wa2xRvz//Haz/HuMbzbu+dwotc+5j2IzexOr32Xmb1vWHtU/h2YWZyZbTazJ73ngR6zmVV7n703zWyj1xYdn23nXMx8Eb4q5R6gHEgEtgAL/K7rFMdwGbAMeGtY2z3AGm97DfAtb3sF8DThm6FcBLzutU8FqrzHHG87x3ttPXCx956nget9Hm8BsMzbzgDeIXwv3iCP2YB0bzsBeN0by+PAKq/9h8BnvO3PAj/0tlcBP/e2F3if8STgHO+zHxfNfwfAl4BHgSe954EeM1AN5I5oi4rPtu8fhlP8RV4MPDvs+Z3AnX7XdRrjKOPYcN8FFHjbBcAub/tHwC0j+wG3AD8a1v4jr60AeHtY+zH9ouEL+C1wzWQZM5AKvAFcSPiklXiv/ehnmfDltC/2tuO9
fjby832kX7T+HRC+kc/zwFXAk94Ygj7mao4P96j4bMfaskwk93ONRdOdcwcBvMd8r32s8Z6ovXaU9qjg/a/3UsIz2UCP2VueeBNoAH5PeNbZ6sL3GIZj6xzrHsSn+rvw273AV4GQ93wawR+zA54zs00Wvo0oRMlnO6LruUeRiO7VGiBjjfdU231nZunAE8AXnXPtJ1g6DMSYnXNDwBIzywZ+DcwfrZv3eKpjG21S5uuYzexGoME5t8nMrjjSPErXwIzZc6lz7oCZ5QO/N7O3T9B3Qj/bsTZzj+R+rrGo3swKALzHBq99rPGeqL1olHZfmVkC4WD/mXPuV15zoMd8hHOuFXiJ8BprtoXvMQzH1jnWPYhP9Xfhp0uBm8ysGniM8NLMvQR7zDjnDniPDYT/EV9OtHy2/V6zOsX1rXjCOxvO4U87VRb6XddpjKOMY9fcv82xO2Du8bZv4NgdMOu99qnAXsI7X3K87aneaxu8vkd2wKzweawG/Ctw74j2II85D8j2tlOAl4EbgV9w7M7Fz3rbn+PYnYuPe9sLOXbnYhXhHYtR/XcAXMGfdqgGdsxAGpAxbPtV4Lpo+Wz7/kE4jV/oCsJHXOwB/sbvek6j/n8HDgIDhP9lvp3wWuPzwG7v8ch/WAPu88a6DagY9n0+CVR6X58Y1l4BvOW951/wTlTzcbzvJvy/kluBN72vFQEf82Jgszfmt4Cve+3lhI9+qPRCL8lrT/aeV3qvlw/7Xn/jjWsXw46UiOa/A44N98CO2RvbFu9r+5GaouWzrTNURUQCKNbW3EVEJAIKdxGRAFK4i4gEkMJdRCSAFO4iIgGkcBcRCSCFu4hIACncRUQC6L8AeMHSmicNQgUAAAAASUVORK5CYII=\n",
228 | "text/plain": [
229 | ""
230 | ]
231 | },
232 | "metadata": {
233 | "needs_background": "light"
234 | },
235 | "output_type": "display_data"
236 | }
237 | ],
238 | "source": [
239 | "# Cumulative share of all ratings captured by the first top_n rows of df.\n",
240 | "top_n = 50000\n",
241 | "cumsum = df.iloc[:top_n]['rating_count'].cumsum()/total_ratings\n",
242 | "# Size x from the rows actually selected: when df holds fewer than top_n rows (it is\n",
243 | "# truncated in a later cell, and this cell may be re-run afterwards) a fixed\n",
244 | "# range(top_n) would make plt.plot fail on mismatched x/y lengths.\n",
245 | "x = list(range(len(cumsum)))\n",
246 | "plt.plot(x, cumsum)\n",
247 | "plt.xlabel('number of titles')\n",
248 | "plt.ylabel('cumulative share of ratings');"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 7,
248 | "metadata": {},
249 | "outputs": [
250 | {
251 | "data": {
252 | "text/plain": [
253 | "436796"
254 | ]
255 | },
256 | "execution_count": 7,
257 | "metadata": {},
258 | "output_type": "execute_result"
259 | }
260 | ],
261 | "source": [
262 | "len(df)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 19,
268 | "metadata": {},
269 | "outputs": [
270 | {
271 | "data": {
272 | "text/plain": [
273 | "4417"
274 | ]
275 | },
276 | "execution_count": 19,
277 | "metadata": {},
278 | "output_type": "execute_result"
279 | }
280 | ],
281 | "source": [
282 | "# Keep only the leading rows of df that come before the point where the cumulative\n",
283 | "# share of ratings first exceeds one half.\n",
284 | "# Work with the *position* of that row rather than its index label: .iloc slices by\n",
285 | "# position, and label == position only holds for a default RangeIndex.\n",
286 | "over_half = (cumsum > 0.5).values\n",
287 | "assert over_half.any(), 'cumulative share never exceeds 0.5 within the top_n rows'\n",
288 | "cutoff_ind = int(over_half.argmax())\n",
289 | "df = df.iloc[:cutoff_ind]\n",
290 | "cutoff_ind"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 12,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "asin_list = list(df['asin'])"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 13,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 | "asin_list = asin_list[5000:5020]"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 14,
309 | "metadata": {},
310 | "outputs": [],
311 | "source": [
312 | "def get_overall_rating(asin, rating_dict):\n",
313 | "    \"\"\"Fetch the overall star rating for one title and record it in rating_dict.\n",
314 | "\n",
315 | "    Requests https://www.audible.com/pd/<asin>, takes the second\n",
316 | "    'div.bc-row.bc-spacing-small' element and parses the three characters after\n",
317 | "    ' stars ' in its text as a float.\n",
318 | "\n",
319 | "    Args:\n",
320 | "        asin: product identifier appended to the product-page URL.\n",
321 | "        rating_dict: dict updated in place with rating_dict[asin] = rating.\n",
322 | "\n",
323 | "    Returns:\n",
324 | "        The rating_dict object that was passed in.\n",
325 | "\n",
326 | "    Raises:\n",
327 | "        IndexError: the page has no second matching element or no ' stars ' text.\n",
328 | "        ValueError: the extracted characters are not a number.\n",
329 | "    \"\"\"\n",
330 | "    sess = HTMLSession()\n",
331 | "    try:\n",
332 | "        r = sess.get(f\"https://www.audible.com/pd/{asin}\")\n",
333 | "        rating_row = r.html.find('div.bc-row.bc-spacing-small', first=False)[1]\n",
334 | "        # Builtin float: np.float was only an alias for it and was removed in NumPy 1.24.\n",
335 | "        rating_dict[asin] = float(rating_row.text.split(' stars ')[1][:3])\n",
336 | "    finally:\n",
337 | "        # Close the session even when the request or the parsing above fails.\n",
338 | "        sess.close()\n",
339 | "    return rating_dict\n",
319 | "\n",
320 | "\n",
321 | "def get_reviews(asin):\n",
322 | "    \"\"\"Scrape the review pages of one title.\n",
323 | "\n",
324 | "    Pages are requested from\n",
325 | "    https://www.audible.com/pd/reviews?country=US&asin=<asin>&page=<n>\n",
326 | "    starting at n = 0 and counting up until handling a page raises an exception.\n",
327 | "    The page number reached at that point is printed.\n",
328 | "\n",
329 | "    Args:\n",
330 | "        asin: product identifier inserted into the reviews URL.\n",
331 | "\n",
332 | "    Returns:\n",
333 | "        List of tuples (review_text, *ratings), where each rating is the first\n",
334 | "        character of a 'span.bc-text' element found inside the review block.\n",
335 | "    \"\"\"\n",
336 | "    baseurl = f'https://www.audible.com/pd/reviews?country=US&asin={asin}&page='\n",
337 | "    page_num = 0\n",
338 | "    ratings_reviews = []\n",
339 | "    sess = HTMLSession()\n",
340 | "\n",
341 | "    try:\n",
342 | "        while True:\n",
343 | "            try:\n",
344 | "                url = baseurl + str(page_num)\n",
345 | "                r = sess.get(url).html\n",
346 | "                page_elements = r.find('div.bc-row-responsive.bc-spacing-top-medium', first=False)\n",
347 | "                for elem in page_elements:\n",
348 | "                    review = elem.find(f'div.bc-col-responsive.USreviews{page_num}.bc-col-9', first=True).text\n",
349 | "                    ratings = [item.text[0] for item in elem.find('span.bc-text')]\n",
350 | "                    ratings_reviews.append((review, *ratings))\n",
351 | "                page_num += 1\n",
352 | "\n",
353 | "            except Exception:\n",
354 | "                # An exception is the only way out of the loop. Presumably it fires on the\n",
355 | "                # first page past the last real one (no review div, so find() yields None\n",
356 | "                # and .text raises) - TODO confirm against a live page.\n",
357 | "                # Exception rather than a bare except, so Ctrl-C / SystemExit propagate\n",
358 | "                # instead of being mistaken for the end of the reviews.\n",
359 | "                print(page_num)\n",
360 | "                break\n",
361 | "    finally:\n",
362 | "        # Runs on every exit path, including an interrupt raised inside the loop.\n",
363 | "        sess.close()\n",
364 | "\n",
365 | "    return ratings_reviews\n",
344 | "\n",
345 | "\n",
346 | "def get_ratings_and_reviews(asins, rating_dict, reviews):\n",
347 | "    \"\"\"Collect the overall rating and the reviews for each ASIN, in order.\n",
348 | "\n",
349 | "    For every entry of asins, rating_dict is rebound to the value returned by\n",
350 | "    get_overall_rating and reviews is extended in place with the result of\n",
351 | "    get_reviews. Returns the pair (rating_dict, reviews).\n",
352 | "    \"\"\"\n",
353 | "    for product_id in asins:\n",
354 | "        rating_dict = get_overall_rating(product_id, rating_dict)\n",
355 | "        scraped = get_reviews(product_id)\n",
356 | "        reviews.extend(scraped)\n",
357 | "    return rating_dict, reviews"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 15,
357 | "metadata": {},
358 | "outputs": [],
359 | "source": [
360 | "def threaded_ratings(nthreads, asins, rating_dict=None, reviews=None):\n",
361 | "    \"\"\"Scrape ratings and reviews for asins using nthreads worker threads.\n",
362 | "\n",
363 | "    Worker i runs get_ratings_and_reviews on asins[i::nthreads]; all workers are\n",
364 | "    handed the same rating_dict and reviews objects.\n",
365 | "\n",
366 | "    Args:\n",
367 | "        nthreads: number of threads to create.\n",
368 | "        asins: sequence of ASINs; must support stepped slicing.\n",
369 | "        rating_dict: dict to fill; a new empty dict is used when None.\n",
370 | "        reviews: list to extend; a new empty list is used when None.\n",
371 | "\n",
372 | "    Returns:\n",
373 | "        (rating_dict, reviews) after every thread has been joined.\n",
374 | "    \"\"\"\n",
375 | "    # Identity test: '== None' goes through __eq__, which an argument passed here\n",
376 | "    # could define in a way that does not yield a plain bool.\n",
377 | "    if rating_dict is None:\n",
378 | "        rating_dict = {}\n",
379 | "\n",
380 | "    if reviews is None:\n",
381 | "        reviews = []\n",
382 | "\n",
383 | "    threads = []\n",
384 | "    for i in range(nthreads):\n",
385 | "        asin_group = asins[i::nthreads]\n",
386 | "        t = Thread(target=get_ratings_and_reviews, args=(asin_group, rating_dict, reviews))\n",
387 | "        threads.append(t)\n",
388 | "\n",
389 | "    # NOTE(review): the shared dict and list are mutated by several threads without a\n",
390 | "    # lock - presumably relying on the interpreter serialising those operations; confirm.\n",
391 | "    for t in threads:\n",
392 | "        t.start()\n",
393 | "    for t in threads:\n",
394 | "        t.join()\n",
395 | "\n",
396 | "    return rating_dict, reviews"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 16,
382 | "metadata": {},
383 | "outputs": [
384 | {
385 | "name": "stdout",
386 | "output_type": "stream",
387 | "text": [
388 | "6\n",
389 | "6\n",
390 | "10\n",
391 | "13\n",
392 | "5\n",
393 | "15\n",
394 | "15\n",
395 | "9\n",
396 | "9\n",
397 | "5\n",
398 | "13\n",
399 | "10\n",
400 | "13\n",
401 | "16\n",
402 | "15\n",
403 | "6\n",
404 | "15\n",
405 | "29\n",
406 | "9\n",
407 | "9\n",
408 | "29.16919183731079\n"
409 | ]
410 | }
411 | ],
412 | "source": [
413 | "start = time.time()\n",
414 | "rating_dict = {}\n",
415 | "rating_dict, reviews = threaded_ratings(6, asin_list, rating_dict)\n",
416 | "end = time.time()\n",
417 | "print(end-start)"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 17,
423 | "metadata": {},
424 | "outputs": [
425 | {
426 | "data": {
427 | "text/plain": [
428 | "2216"
429 | ]
430 | },
431 | "execution_count": 17,
432 | "metadata": {},
433 | "output_type": "execute_result"
434 | }
435 | ],
436 | "source": [
437 | "len(reviews)"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": null,
443 | "metadata": {},
444 | "outputs": [],
445 | "source": [
446 | "# url = 'https://www.audible.com/pd/reviews?country=US&asin=B00JU4QCMC&page=0'\n",
447 | "# sess = HTMLSession()\n",
448 | "# r = sess.get(url).html\n",
449 | "# page_elements = r.find('div.bc-row-responsive.bc-spacing-top-medium', first=False)\n",
450 | " "
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": null,
456 | "metadata": {},
457 | "outputs": [],
458 | "source": [
459 | "# page_elements"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": null,
465 | "metadata": {},
466 | "outputs": [],
467 | "source": [
468 | "rev_df = pd.DataFrame(data=reviews, columns=['text', 'overall', 'performance', 'story'])"
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": null,
474 | "metadata": {},
475 | "outputs": [],
476 | "source": [
477 | "rev_df['overall'] = rev_df['overall'].astype(str)"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": null,
483 | "metadata": {},
484 | "outputs": [],
485 | "source": [
486 | "nums = ['1','2','3','4','5']\n",
487 | "scores = rev_df[rev_df.isin(nums)].drop('text', axis=1).dropna().astype(int)"
488 | ]
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": null,
493 | "metadata": {},
494 | "outputs": [],
495 | "source": [
496 | "scores.hist()"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": null,
502 | "metadata": {},
503 | "outputs": [],
504 | "source": [
505 | "rev_df['text_length'] = rev_df['text'].apply(len)"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": null,
511 | "metadata": {},
512 | "outputs": [],
513 | "source": [
514 | "rev_df['text_length'].hist(bins=50, range=(0, 2000))"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "metadata": {},
521 | "outputs": [],
522 | "source": [
523 | "rev_df.sort_values('text_length', inplace=True)"
524 | ]
525 | },
526 | {
527 | "cell_type": "code",
528 | "execution_count": null,
529 | "metadata": {},
530 | "outputs": [],
531 | "source": [
532 | "y = rev_df['text_length'].cumsum()/rev_df['text_length'].sum()"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": null,
538 | "metadata": {},
539 | "outputs": [],
540 | "source": [
541 | "import matplotlib.pyplot as plt\n",
542 | "plt.plot(rev_df['text_length'], y)"
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": null,
548 | "metadata": {},
549 | "outputs": [],
550 | "source": [
551 | "txt = 'i was impressed by the overall narrative, but felt the author could have used some better organization'\n",
552 | "len(txt)"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": null,
558 | "metadata": {},
559 | "outputs": [],
560 | "source": [
561 | "alltext = list(rev_df['text'].values)"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": null,
567 | "metadata": {},
568 | "outputs": [],
569 | "source": [
570 | "allwords = ' '.join(alltext)"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": null,
576 | "metadata": {},
577 | "outputs": [],
578 | "source": [
579 | "allwords.split()[:50]"
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": null,
585 | "metadata": {},
586 | "outputs": [],
587 | "source": []
588 | }
589 | ],
590 | "metadata": {
591 | "kernelspec": {
592 | "display_name": "Python 3",
593 | "language": "python",
594 | "name": "python3"
595 | },
596 | "language_info": {
597 | "codemirror_mode": {
598 | "name": "ipython",
599 | "version": 3
600 | },
601 | "file_extension": ".py",
602 | "mimetype": "text/x-python",
603 | "name": "python",
604 | "nbconvert_exporter": "python",
605 | "pygments_lexer": "ipython3",
606 | "version": "3.7.2"
607 | }
608 | },
609 | "nbformat": 4,
610 | "nbformat_minor": 2
611 | }
612 |
--------------------------------------------------------------------------------
/audible_eda/audible_prices.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from requests_html import HTMLSession, HTML\n",
10 | "import numpy as np\n",
11 | "import pandas as pd\n",
12 | "from datetime import datetime\n",
13 | "import nest_asyncio"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "url = 'https://www.audible.com/search?pf_rd_p=1d79b443-2f1d-43a3-b1dc-31a2cd242566&pf_rd_r=HK8P1MY097JB8VJ6PRTQ&ref=a_search_c4_pageSize_3&keywords=the+great+courses&pageSize=50'"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 3,
28 | "metadata": {
29 | "scrolled": true
30 | },
31 | "outputs": [],
32 | "source": [
33 | "def scrape_great_courses(url, pages=15):\n",
34 | "    \"\"\"Scrape Audible search-result pages into a list of per-title dicts.\n",
35 | "\n",
36 | "    Args:\n",
37 | "        url: search URL; '&page=<n>' with n = 1..pages is appended per request.\n",
38 | "        pages: number of result pages to fetch (default 15, the value that used\n",
39 | "            to be hard-coded).\n",
40 | "\n",
41 | "    Returns:\n",
42 | "        List of dicts with keys 'title', 'price', 'length', 'rating',\n",
43 | "        'rating_count', 'link', 'series' ('N/A' when the listing has no series\n",
44 | "        line) and, only when the listing has a release-date line, 'release_date'.\n",
45 | "\n",
46 | "    Raises:\n",
47 | "        IndexError: a listing lacks a price, length, rating or '/pd/' link entry.\n",
48 | "        ValueError: a price, rating, count or date does not parse.\n",
49 | "    \"\"\"\n",
50 | "    def first_with(fields, token):\n",
51 | "        # First text line containing token; IndexError when there is none.\n",
52 | "        return [s for s in fields if token in s][0]\n",
53 | "\n",
54 | "    data = []\n",
55 | "    sess = HTMLSession()\n",
56 | "    try:\n",
57 | "        for page in range(pages):\n",
58 | "            pageurl = url + '&page=' + str(page+1)\n",
59 | "            r = sess.get(pageurl)\n",
60 | "            items = r.html.find('li.bc-list-item.productListItem', first=False)\n",
61 | "            for item in items:\n",
62 | "                text_fields = item.text.split('\\n')\n",
63 | "                # One line carries both the star rating and the rating count.\n",
64 | "                stars = first_with(text_fields, 'stars')\n",
65 | "                dict_entry = {\n",
66 | "                    'title': text_fields[0],\n",
67 | "                    # Builtin float/int: the np.float / np.int aliases were removed in NumPy 1.24.\n",
68 | "                    'price': float(first_with(text_fields, 'Regular').split('$')[1]),\n",
69 | "                    'length': first_with(text_fields, 'Length').split(': ')[1],\n",
70 | "                    'rating': float(stars.split(' out')[0]),\n",
71 | "                    'rating_count': int(stars.split('stars ')[1].replace(',', '')),\n",
72 | "                    'link': 'https://www.audible.com' + [link for link in item.links if '/pd/' in link][0],\n",
73 | "                }\n",
74 | "                series_lines = [s for s in text_fields if 'Series:' in s]\n",
75 | "                if series_lines:\n",
76 | "                    dict_entry['series'] = series_lines[0].split('Series: ')[1]\n",
77 | "                else:\n",
78 | "                    dict_entry['series'] = 'N/A'\n",
79 | "                release_lines = [s for s in text_fields if 'Release date:' in s]\n",
80 | "                if release_lines:\n",
81 | "                    dict_entry['release_date'] = datetime.strptime(release_lines[0].split(': ')[1], '%m-%d-%y')\n",
82 | "                data.append(dict_entry)\n",
83 | "    finally:\n",
84 | "        # The session was never closed before; release it on success and on error.\n",
85 | "        sess.close()\n",
86 | "    return data"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 4,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "df = pd.DataFrame(data=scrape_great_courses(url))"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 5,
75 | "metadata": {
76 | "scrolled": true
77 | },
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/html": [
82 | "\n",
83 | "\n",
96 | "
\n",
97 | " \n",
98 | " \n",
99 | " | \n",
100 | " length | \n",
101 | " link | \n",
102 | " price | \n",
103 | " rating | \n",
104 | " rating_count | \n",
105 | " release_date | \n",
106 | " series | \n",
107 | " title | \n",
108 | "
\n",
109 | " \n",
110 | " \n",
111 | " \n",
112 | " 0 | \n",
113 | " 43 hrs and 23 mins | \n",
114 | " https://www.audible.com/pd/The-History-of-the-... | \n",
115 | " 59.95 | \n",
116 | " 4.5 | \n",
117 | " 2315 | \n",
118 | " 2013-07-08 | \n",
119 | " The Great Courses: Modern History | \n",
120 | " The History of the United States, 2nd Edition | \n",
121 | "
\n",
122 | " \n",
123 | " 1 | \n",
124 | " 12 hrs and 39 mins | \n",
125 | " https://www.audible.com/pd/Your-Best-Brain-The... | \n",
126 | " 34.95 | \n",
127 | " 4.5 | \n",
128 | " 2629 | \n",
129 | " 2014-11-14 | \n",
130 | " The Great Courses: Psychology | \n",
131 | " Your Best Brain: The Science of Brain Improvement | \n",
132 | "
\n",
133 | " \n",
134 | " 2 | \n",
135 | " 18 hrs and 15 mins | \n",
136 | " https://www.audible.com/pd/The-Story-of-Human-... | \n",
137 | " 41.95 | \n",
138 | " 4.5 | \n",
139 | " 3613 | \n",
140 | " 2013-07-08 | \n",
141 | " The Great Courses: Linguistics | \n",
142 | " The Story of Human Language | \n",
143 | "
\n",
144 | " \n",
145 | " 3 | \n",
146 | " 36 hrs and 34 mins | \n",
147 | " https://www.audible.com/pd/How-to-Listen-to-an... | \n",
148 | " 59.95 | \n",
149 | " 4.5 | \n",
150 | " 2337 | \n",
151 | " 2013-07-08 | \n",
152 | " The Great Courses: Fine Arts & Music | \n",
153 | " How to Listen to and Understand Great Music, 3... | \n",
154 | "
\n",
155 | " \n",
156 | " 4 | \n",
157 | " 31 hrs and 18 mins | \n",
158 | " https://www.audible.com/pd/Critical-Business-S... | \n",
159 | " 59.95 | \n",
160 | " 4.5 | \n",
161 | " 2171 | \n",
162 | " 2015-04-08 | \n",
163 | " The Great Courses: Professional | \n",
164 | " Critical Business Skills for Success | \n",
165 | "
\n",
166 | " \n",
167 | "
\n",
168 | "
"
169 | ],
170 | "text/plain": [
171 | " length link \\\n",
172 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n",
173 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n",
174 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n",
175 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n",
176 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n",
177 | "\n",
178 | " price rating rating_count release_date \\\n",
179 | "0 59.95 4.5 2315 2013-07-08 \n",
180 | "1 34.95 4.5 2629 2014-11-14 \n",
181 | "2 41.95 4.5 3613 2013-07-08 \n",
182 | "3 59.95 4.5 2337 2013-07-08 \n",
183 | "4 59.95 4.5 2171 2015-04-08 \n",
184 | "\n",
185 | " series \\\n",
186 | "0 The Great Courses: Modern History \n",
187 | "1 The Great Courses: Psychology \n",
188 | "2 The Great Courses: Linguistics \n",
189 | "3 The Great Courses: Fine Arts & Music \n",
190 | "4 The Great Courses: Professional \n",
191 | "\n",
192 | " title \n",
193 | "0 The History of the United States, 2nd Edition \n",
194 | "1 Your Best Brain: The Science of Brain Improvement \n",
195 | "2 The Story of Human Language \n",
196 | "3 How to Listen to and Understand Great Music, 3... \n",
197 | "4 Critical Business Skills for Success "
198 | ]
199 | },
200 | "execution_count": 5,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": [
206 | "df.head()"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 6,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "def scrape_sale_courses(html_dir='/Users/tobymanders/Desktop', pages=5):\n",
216 | "    \"\"\"Parse saved sale-listing pages into a list of per-title dicts.\n",
217 | "\n",
218 | "    Reads <html_dir>/1.html through <html_dir>/<pages>.html from disk.\n",
219 | "\n",
220 | "    Args:\n",
221 | "        html_dir: directory holding the saved pages. Defaults to the location that\n",
222 | "            used to be hard-coded, so a call without arguments reads the same files.\n",
223 | "        pages: number of saved pages to read (default 5).\n",
224 | "\n",
225 | "    Returns:\n",
226 | "        List of dicts with 'title', 'sale' (always 'Yes') and, when the listing\n",
227 | "        has a line containing 'Member', 'member-price' as a float.\n",
228 | "    \"\"\"\n",
229 | "    data = []\n",
230 | "    for page in range(pages):\n",
231 | "        pageurl = '{}/{}.html'.format(html_dir, page+1)\n",
232 | "        with open(pageurl) as file:\n",
233 | "            html = file.read()\n",
234 | "        r = HTML(html=html)\n",
235 | "        items = r.find('li.bc-list-item.productListItem', first=False)\n",
236 | "        for item in items:\n",
237 | "            text_fields = item.text.split('\\n')\n",
238 | "            dict_entry = {\n",
239 | "                'title': text_fields[0],\n",
240 | "                'sale': 'Yes',\n",
241 | "            }\n",
242 | "            member_lines = [s for s in text_fields if 'Member' in s]\n",
243 | "            if member_lines:\n",
244 | "                # Builtin float: the np.float alias was removed in NumPy 1.24.\n",
245 | "                dict_entry['member-price'] = float(member_lines[0].split('$')[1].split(' or')[0])\n",
246 | "            data.append(dict_entry)\n",
247 | "    return data"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 7,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "sale_df = pd.DataFrame(data=scrape_sale_courses())"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 8,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "df_merge = df.join(sale_df.set_index('title'), on='title')"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 9,
257 | "metadata": {},
258 | "outputs": [
259 | {
260 | "data": {
261 | "text/html": [
262 | "\n",
263 | "\n",
276 | "
\n",
277 | " \n",
278 | " \n",
279 | " | \n",
280 | " length | \n",
281 | " link | \n",
282 | " price | \n",
283 | " rating | \n",
284 | " rating_count | \n",
285 | " release_date | \n",
286 | " series | \n",
287 | " title | \n",
288 | " member-price | \n",
289 | " sale | \n",
290 | "
\n",
291 | " \n",
292 | " \n",
293 | " \n",
294 | " 0 | \n",
295 | " 43 hrs and 23 mins | \n",
296 | " https://www.audible.com/pd/The-History-of-the-... | \n",
297 | " 59.95 | \n",
298 | " 4.5 | \n",
299 | " 2315 | \n",
300 | " 2013-07-08 | \n",
301 | " The Great Courses: Modern History | \n",
302 | " The History of the United States, 2nd Edition | \n",
303 | " NaN | \n",
304 | " Yes | \n",
305 | "
\n",
306 | " \n",
307 | " 1 | \n",
308 | " 12 hrs and 39 mins | \n",
309 | " https://www.audible.com/pd/Your-Best-Brain-The... | \n",
310 | " 34.95 | \n",
311 | " 4.5 | \n",
312 | " 2629 | \n",
313 | " 2014-11-14 | \n",
314 | " The Great Courses: Psychology | \n",
315 | " Your Best Brain: The Science of Brain Improvement | \n",
316 | " 24.46 | \n",
317 | " Yes | \n",
318 | "
\n",
319 | " \n",
320 | " 2 | \n",
321 | " 18 hrs and 15 mins | \n",
322 | " https://www.audible.com/pd/The-Story-of-Human-... | \n",
323 | " 41.95 | \n",
324 | " 4.5 | \n",
325 | " 3613 | \n",
326 | " 2013-07-08 | \n",
327 | " The Great Courses: Linguistics | \n",
328 | " The Story of Human Language | \n",
329 | " NaN | \n",
330 | " Yes | \n",
331 | "
\n",
332 | " \n",
333 | " 3 | \n",
334 | " 36 hrs and 34 mins | \n",
335 | " https://www.audible.com/pd/How-to-Listen-to-an... | \n",
336 | " 59.95 | \n",
337 | " 4.5 | \n",
338 | " 2337 | \n",
339 | " 2013-07-08 | \n",
340 | " The Great Courses: Fine Arts & Music | \n",
341 | " How to Listen to and Understand Great Music, 3... | \n",
342 | " 41.96 | \n",
343 | " Yes | \n",
344 | "
\n",
345 | " \n",
346 | " 4 | \n",
347 | " 31 hrs and 18 mins | \n",
348 | " https://www.audible.com/pd/Critical-Business-S... | \n",
349 | " 59.95 | \n",
350 | " 4.5 | \n",
351 | " 2171 | \n",
352 | " 2015-04-08 | \n",
353 | " The Great Courses: Professional | \n",
354 | " Critical Business Skills for Success | \n",
355 | " 41.96 | \n",
356 | " Yes | \n",
357 | "
\n",
358 | " \n",
359 | "
\n",
360 | "
"
361 | ],
362 | "text/plain": [
363 | " length link \\\n",
364 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n",
365 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n",
366 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n",
367 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n",
368 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n",
369 | "\n",
370 | " price rating rating_count release_date \\\n",
371 | "0 59.95 4.5 2315 2013-07-08 \n",
372 | "1 34.95 4.5 2629 2014-11-14 \n",
373 | "2 41.95 4.5 3613 2013-07-08 \n",
374 | "3 59.95 4.5 2337 2013-07-08 \n",
375 | "4 59.95 4.5 2171 2015-04-08 \n",
376 | "\n",
377 | " series \\\n",
378 | "0 The Great Courses: Modern History \n",
379 | "1 The Great Courses: Psychology \n",
380 | "2 The Great Courses: Linguistics \n",
381 | "3 The Great Courses: Fine Arts & Music \n",
382 | "4 The Great Courses: Professional \n",
383 | "\n",
384 | " title member-price sale \n",
385 | "0 The History of the United States, 2nd Edition NaN Yes \n",
386 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n",
387 | "2 The Story of Human Language NaN Yes \n",
388 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n",
389 | "4 Critical Business Skills for Success 41.96 Yes "
390 | ]
391 | },
392 | "execution_count": 9,
393 | "metadata": {},
394 | "output_type": "execute_result"
395 | }
396 | ],
397 | "source": [
398 | "df_merge.head()"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 10,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "df_merge['sale'] = df_merge['sale'].fillna('No')"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 11,
413 | "metadata": {},
414 | "outputs": [
415 | {
416 | "data": {
417 | "text/html": [
418 | "\n",
419 | "\n",
432 | "
\n",
433 | " \n",
434 | " \n",
435 | " | \n",
436 | " length | \n",
437 | " link | \n",
438 | " price | \n",
439 | " rating | \n",
440 | " rating_count | \n",
441 | " release_date | \n",
442 | " series | \n",
443 | " title | \n",
444 | " member-price | \n",
445 | " sale | \n",
446 | "
\n",
447 | " \n",
448 | " \n",
449 | " \n",
450 | " 0 | \n",
451 | " 43 hrs and 23 mins | \n",
452 | " https://www.audible.com/pd/The-History-of-the-... | \n",
453 | " 59.95 | \n",
454 | " 4.5 | \n",
455 | " 2315 | \n",
456 | " 2013-07-08 | \n",
457 | " The Great Courses: Modern History | \n",
458 | " The History of the United States, 2nd Edition | \n",
459 | " NaN | \n",
460 | " Yes | \n",
461 | "
\n",
462 | " \n",
463 | " 1 | \n",
464 | " 12 hrs and 39 mins | \n",
465 | " https://www.audible.com/pd/Your-Best-Brain-The... | \n",
466 | " 34.95 | \n",
467 | " 4.5 | \n",
468 | " 2629 | \n",
469 | " 2014-11-14 | \n",
470 | " The Great Courses: Psychology | \n",
471 | " Your Best Brain: The Science of Brain Improvement | \n",
472 | " 24.46 | \n",
473 | " Yes | \n",
474 | "
\n",
475 | " \n",
476 | " 2 | \n",
477 | " 18 hrs and 15 mins | \n",
478 | " https://www.audible.com/pd/The-Story-of-Human-... | \n",
479 | " 41.95 | \n",
480 | " 4.5 | \n",
481 | " 3613 | \n",
482 | " 2013-07-08 | \n",
483 | " The Great Courses: Linguistics | \n",
484 | " The Story of Human Language | \n",
485 | " NaN | \n",
486 | " Yes | \n",
487 | "
\n",
488 | " \n",
489 | " 3 | \n",
490 | " 36 hrs and 34 mins | \n",
491 | " https://www.audible.com/pd/How-to-Listen-to-an... | \n",
492 | " 59.95 | \n",
493 | " 4.5 | \n",
494 | " 2337 | \n",
495 | " 2013-07-08 | \n",
496 | " The Great Courses: Fine Arts & Music | \n",
497 | " How to Listen to and Understand Great Music, 3... | \n",
498 | " 41.96 | \n",
499 | " Yes | \n",
500 | "
\n",
501 | " \n",
502 | " 4 | \n",
503 | " 31 hrs and 18 mins | \n",
504 | " https://www.audible.com/pd/Critical-Business-S... | \n",
505 | " 59.95 | \n",
506 | " 4.5 | \n",
507 | " 2171 | \n",
508 | " 2015-04-08 | \n",
509 | " The Great Courses: Professional | \n",
510 | " Critical Business Skills for Success | \n",
511 | " 41.96 | \n",
512 | " Yes | \n",
513 | "
\n",
514 | " \n",
515 | "
\n",
516 | "
"
517 | ],
518 | "text/plain": [
519 | " length link \\\n",
520 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n",
521 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n",
522 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n",
523 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n",
524 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n",
525 | "\n",
526 | " price rating rating_count release_date \\\n",
527 | "0 59.95 4.5 2315 2013-07-08 \n",
528 | "1 34.95 4.5 2629 2014-11-14 \n",
529 | "2 41.95 4.5 3613 2013-07-08 \n",
530 | "3 59.95 4.5 2337 2013-07-08 \n",
531 | "4 59.95 4.5 2171 2015-04-08 \n",
532 | "\n",
533 | " series \\\n",
534 | "0 The Great Courses: Modern History \n",
535 | "1 The Great Courses: Psychology \n",
536 | "2 The Great Courses: Linguistics \n",
537 | "3 The Great Courses: Fine Arts & Music \n",
538 | "4 The Great Courses: Professional \n",
539 | "\n",
540 | " title member-price sale \n",
541 | "0 The History of the United States, 2nd Edition NaN Yes \n",
542 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n",
543 | "2 The Story of Human Language NaN Yes \n",
544 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n",
545 | "4 Critical Business Skills for Success 41.96 Yes "
546 | ]
547 | },
548 | "execution_count": 11,
549 | "metadata": {},
550 | "output_type": "execute_result"
551 | }
552 | ],
553 | "source": [
554 | "df_merge.head()"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": 12,
560 | "metadata": {},
561 | "outputs": [],
562 | "source": [
563 | "import re\n",
564 | "\n",
565 | "def fix_mins(x):\n",
566 | "    \"\"\"Normalize a length such as '43 hrs and 23 mins' to 'HH hrs MM mins'.\n",
567 | "\n",
568 | "    A missing hour or minute part, or a non-string value such as NaN,\n",
569 | "    counts as zero, so every result has the same fixed layout.\n",
570 | "    \"\"\"\n",
571 | "    text = x if isinstance(x, str) else ''\n",
572 | "    # Read the digits directly before each unit instead of splitting on\n",
573 | "    # literal separators, so stray words cannot leak into the output.\n",
574 | "    hrs = re.search(r'(\\d+)\\s*hr', text)\n",
575 | "    mins = re.search(r'(\\d+)\\s*min', text)\n",
576 | "    return make_len(hrs.group(1) if hrs else '00', mins.group(1) if mins else '00')\n",
577 | "\n",
578 | "def make_len(hrs, mins):\n",
579 | "    \"\"\"Format an hour and a minute digit string as 'HH hrs MM mins'.\n",
580 | "\n",
581 | "    Each part is left-padded with zeros to at least two characters.\n",
582 | "    \"\"\"\n",
583 | "    return hrs.zfill(2) + ' hrs ' + mins.zfill(2) + ' mins'"
584 | ]
585 | },
586 | {
587 | "cell_type": "code",
588 | "execution_count": 13,
589 | "metadata": {},
590 | "outputs": [],
591 | "source": [
592 | "df_merge['length'] = df_merge['length'].apply(fix_mins)"
593 | ]
594 | },
595 | {
596 | "cell_type": "code",
597 | "execution_count": 14,
598 | "metadata": {},
599 | "outputs": [],
600 | "source": [
601 | "columns = ['title', 'sale', 'price', 'member-price', 'length', 'rating', 'rating_count', 'release_date', 'series', 'link']"
602 | ]
603 | },
604 | {
605 | "cell_type": "code",
606 | "execution_count": 15,
607 | "metadata": {},
608 | "outputs": [],
609 | "source": [
610 | "df_merge.to_csv('great_courses_all_titles_v2.csv', columns=columns, index=False)"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": 16,
616 | "metadata": {},
617 | "outputs": [],
618 | "source": [
619 | "url_list = list(df_merge['link'])"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": 17,
625 | "metadata": {},
626 | "outputs": [
627 | {
628 | "data": {
629 | "text/plain": [
630 | "724"
631 | ]
632 | },
633 | "execution_count": 17,
634 | "metadata": {},
635 | "output_type": "execute_result"
636 | }
637 | ],
638 | "source": [
639 | "len(df_merge)"
640 | ]
641 | },
642 | {
643 | "cell_type": "code",
644 | "execution_count": 50,
645 | "metadata": {},
646 | "outputs": [],
647 | "source": [
648 | "def get_accurate_ratings(addresses, rating_dict):\n",
649 | "    \"\"\"Scrape the star rating of each page in `addresses` into `rating_dict` (NaN on failure) and return it.\"\"\"\n",
650 | "    for address in addresses:\n",
651 | "        try:  # the request sits inside the try so one bad page cannot end the whole run\n",
652 | "            with HTMLSession() as sess:  # close the session even when the request fails\n",
653 | "                row = sess.get(address).html.find('div.bc-row.bc-spacing-small', first=False)[1]\n",
654 | "                rating_dict[address] = float(row.text.split(' stars ')[1][:3])  # np.float was removed in NumPy 1.24\n",
655 | "        except Exception:  # best effort: network and parse errors are recorded as NaN\n",
656 | "            rating_dict[address] = np.nan\n",
657 | "    return rating_dict"
658 | ]
659 | },
660 | {
661 | "cell_type": "code",
662 | "execution_count": 51,
663 | "metadata": {},
664 | "outputs": [],
665 | "source": [
666 | "from threading import Thread\n",
667 | "\n",
668 | "def threaded_ratings(nthreads, addresses, rating_dict=None):\n",
669 | "    \"\"\"Scrape `addresses` with up to `nthreads` threads into `rating_dict` (a new dict if None) and return it.\"\"\"\n",
670 | "    if rating_dict is None:\n",
671 | "        rating_dict = {}\n",
672 | "    # Cap the workers at the number of addresses; the slice i::n hands each worker a disjoint share.\n",
673 | "    n_workers = min(nthreads, len(addresses))\n",
674 | "    threads = [Thread(target=get_accurate_ratings, args=(addresses[i::n_workers], rating_dict)) for i in range(n_workers)]\n",
675 | "    for t in threads:\n",
676 | "        t.start()\n",
677 | "    for t in threads:\n",
678 | "        t.join()\n",
679 | "    return rating_dict"
680 | ]
681 | },
682 | {
683 | "cell_type": "code",
684 | "execution_count": 52,
685 | "metadata": {},
686 | "outputs": [],
687 | "source": [
688 | "rating_dict = threaded_ratings(64, url_list)"
689 | ]
690 | },
691 | {
692 | "cell_type": "code",
693 | "execution_count": 58,
694 | "metadata": {},
695 | "outputs": [],
696 | "source": [
697 | "df_merge['rating'] = df_merge['link'].apply(lambda x: rating_dict[x])"
698 | ]
699 | },
700 | {
701 | "cell_type": "code",
702 | "execution_count": 59,
703 | "metadata": {},
704 | "outputs": [
705 | {
706 | "data": {
707 | "text/html": [
708 | "\n",
709 | "\n",
722 | "
\n",
723 | " \n",
724 | " \n",
725 | " | \n",
726 | " length | \n",
727 | " link | \n",
728 | " price | \n",
729 | " rating | \n",
730 | " rating_count | \n",
731 | " release_date | \n",
732 | " series | \n",
733 | " title | \n",
734 | " member-price | \n",
735 | " sale | \n",
736 | "
\n",
737 | " \n",
738 | " \n",
739 | " \n",
740 | " 0 | \n",
741 | " 43 hrs 23 mins | \n",
742 | " https://www.audible.com/pd/The-History-of-the-... | \n",
743 | " 59.95 | \n",
744 | " 4.7 | \n",
745 | " 2315 | \n",
746 | " 2013-07-08 | \n",
747 | " The Great Courses: Modern History | \n",
748 | " The History of the United States, 2nd Edition | \n",
749 | " NaN | \n",
750 | " Yes | \n",
751 | "
\n",
752 | " \n",
753 | " 1 | \n",
754 | " 12 hrs 39 mins | \n",
755 | " https://www.audible.com/pd/Your-Best-Brain-The... | \n",
756 | " 34.95 | \n",
757 | " 4.5 | \n",
758 | " 2629 | \n",
759 | " 2014-11-14 | \n",
760 | " The Great Courses: Psychology | \n",
761 | " Your Best Brain: The Science of Brain Improvement | \n",
762 | " 24.46 | \n",
763 | " Yes | \n",
764 | "
\n",
765 | " \n",
766 | " 2 | \n",
767 | " 18 hrs 15 mins | \n",
768 | " https://www.audible.com/pd/The-Story-of-Human-... | \n",
769 | " 41.95 | \n",
770 | " 4.7 | \n",
771 | " 3613 | \n",
772 | " 2013-07-08 | \n",
773 | " The Great Courses: Linguistics | \n",
774 | " The Story of Human Language | \n",
775 | " NaN | \n",
776 | " Yes | \n",
777 | "
\n",
778 | " \n",
779 | " 3 | \n",
780 | " 36 hrs 34 mins | \n",
781 | " https://www.audible.com/pd/How-to-Listen-to-an... | \n",
782 | " 59.95 | \n",
783 | " 4.7 | \n",
784 | " 2337 | \n",
785 | " 2013-07-08 | \n",
786 | " The Great Courses: Fine Arts & Music | \n",
787 | " How to Listen to and Understand Great Music, 3... | \n",
788 | " 41.96 | \n",
789 | " Yes | \n",
790 | "
\n",
791 | " \n",
792 | " 4 | \n",
793 | " 31 hrs 18 mins | \n",
794 | " https://www.audible.com/pd/Critical-Business-S... | \n",
795 | " 59.95 | \n",
796 | " 4.6 | \n",
797 | " 2171 | \n",
798 | " 2015-04-08 | \n",
799 | " The Great Courses: Professional | \n",
800 | " Critical Business Skills for Success | \n",
801 | " 41.96 | \n",
802 | " Yes | \n",
803 | "
\n",
804 | " \n",
805 | "
\n",
806 | "
"
807 | ],
808 | "text/plain": [
809 | " length link price \\\n",
810 | "0 43 hrs 23 mins https://www.audible.com/pd/The-History-of-the-... 59.95 \n",
811 | "1 12 hrs 39 mins https://www.audible.com/pd/Your-Best-Brain-The... 34.95 \n",
812 | "2 18 hrs 15 mins https://www.audible.com/pd/The-Story-of-Human-... 41.95 \n",
813 | "3 36 hrs 34 mins https://www.audible.com/pd/How-to-Listen-to-an... 59.95 \n",
814 | "4 31 hrs 18 mins https://www.audible.com/pd/Critical-Business-S... 59.95 \n",
815 | "\n",
816 | " rating rating_count release_date series \\\n",
817 | "0 4.7 2315 2013-07-08 The Great Courses: Modern History \n",
818 | "1 4.5 2629 2014-11-14 The Great Courses: Psychology \n",
819 | "2 4.7 3613 2013-07-08 The Great Courses: Linguistics \n",
820 | "3 4.7 2337 2013-07-08 The Great Courses: Fine Arts & Music \n",
821 | "4 4.6 2171 2015-04-08 The Great Courses: Professional \n",
822 | "\n",
823 | " title member-price sale \n",
824 | "0 The History of the United States, 2nd Edition NaN Yes \n",
825 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n",
826 | "2 The Story of Human Language NaN Yes \n",
827 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n",
828 | "4 Critical Business Skills for Success 41.96 Yes "
829 | ]
830 | },
831 | "execution_count": 59,
832 | "metadata": {},
833 | "output_type": "execute_result"
834 | }
835 | ],
836 | "source": [
837 | "df_merge.head()"
838 | ]
839 | },
840 | {
841 | "cell_type": "code",
842 | "execution_count": 62,
843 | "metadata": {},
844 | "outputs": [],
845 | "source": [
846 | "df_merge.to_csv('great_courses_list_v3.csv', columns=columns, index=False)"
847 | ]
848 | },
849 | {
850 | "cell_type": "code",
851 | "execution_count": null,
852 | "metadata": {},
853 | "outputs": [],
854 | "source": []
855 | }
856 | ],
857 | "metadata": {
858 | "kernelspec": {
859 | "display_name": "Python 3",
860 | "language": "python",
861 | "name": "python3"
862 | },
863 | "language_info": {
864 | "codemirror_mode": {
865 | "name": "ipython",
866 | "version": 3
867 | },
868 | "file_extension": ".py",
869 | "mimetype": "text/x-python",
870 | "name": "python",
871 | "nbconvert_exporter": "python",
872 | "pygments_lexer": "ipython3",
873 | "version": "3.7.2"
874 | }
875 | },
876 | "nbformat": 4,
877 | "nbformat_minor": 2
878 | }
879 |
--------------------------------------------------------------------------------
/audible_eda/audible_reviews_scraper.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "import seaborn as sns\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline\n",
15 | "# import nest_asyncio\n",
16 | "# nest_asyncio.apply()\n",
17 | "from requests_html import HTML, HTMLSession, AsyncHTMLSession\n",
18 | "from threading import Thread\n",
19 | "import time\n",
20 | "import string"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "df = pd.read_csv(r'C:\\Users\\Toby-PC\\Documents\\code\\audible\\all_english_audible.csv')\n",
30 | "df = df[~df['asin'].isnull()]\n",
31 | "\n",
32 | "image_path = r'C:\\Users\\Toby-PC\\Documents\\code\\audible\\figures'\n",
33 | "\n",
34 | "def save_fig(fig_name, tight_layout=True):\n",
35 | "    \"\"\"Save the current pyplot figure as `<fig_name>.png` under `image_path` (300 dpi); tight_layout() runs first unless disabled.\"\"\"\n",
36 | "    target = os.path.join(image_path, fig_name + '.png')\n",
37 | "    print(\"Saving figure\", fig_name)\n",
38 | "    if tight_layout:\n",
39 | "        plt.tight_layout()\n",
40 | "    plt.savefig(target, format='png', dpi=300)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "df = df.drop_duplicates()"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 4,
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/plain": [
59 | "64023215.0"
60 | ]
61 | },
62 | "execution_count": 4,
63 | "metadata": {},
64 | "output_type": "execute_result"
65 | }
66 | ],
67 | "source": [
68 | "total_ratings = df['rating_count'].sum()\n",
69 | "total_ratings"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 5,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "# Rank titles by number of ratings; the old row labels are kept as an 'index' column.\n",
79 | "df = df.sort_values('rating_count', ascending=False).reset_index()"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 6,
85 | "metadata": {
86 | "scrolled": false
87 | },
88 | "outputs": [
89 | {
90 | "data": {
91 | "text/html": [
92 | "\n",
93 | "\n",
106 | "
\n",
107 | " \n",
108 | " \n",
109 | " | \n",
110 | " index | \n",
111 | " asin | \n",
112 | " author | \n",
113 | " category | \n",
114 | " length | \n",
115 | " link | \n",
116 | " narrator | \n",
117 | " price | \n",
118 | " rating | \n",
119 | " rating_count | \n",
120 | " release_date | \n",
121 | " title | \n",
122 | "
\n",
123 | " \n",
124 | " \n",
125 | " \n",
126 | " 0 | \n",
127 | " 247928 | \n",
128 | " B005FRGT44 | \n",
129 | " Ernest Cline | \n",
130 | " Sci-Fi & Fantasy | \n",
131 | " 15 hrs and 40 mins | \n",
132 | " https://www.audible.com/pd/Ready-Player-One-Au... | \n",
133 | " Wil Wheaton | \n",
134 | " 31.50 | \n",
135 | " 5.0 | \n",
136 | " 216094.0 | \n",
137 | " 2011-08-16 | \n",
138 | " Ready Player One | \n",
139 | "
\n",
140 | " \n",
141 | " 1 | \n",
142 | " 248009 | \n",
143 | " B00B5HZGUG | \n",
144 | " Andy Weir | \n",
145 | " Sci-Fi & Fantasy | \n",
146 | " 10 hrs and 53 mins | \n",
147 | " https://www.audible.com/pd/The-Martian-Audiobo... | \n",
148 | " R. C. Bray | \n",
149 | " 29.99 | \n",
150 | " 5.0 | \n",
151 | " 164988.0 | \n",
152 | " 2013-03-22 | \n",
153 | " The Martian | \n",
154 | "
\n",
155 | " \n",
156 | " 2 | \n",
157 | " 142087 | \n",
158 | " B00QXW5GYY | \n",
159 | " Paula Hawkins | \n",
160 | " Mysteries & Thrillers | \n",
161 | " 10 hrs and 58 mins | \n",
162 | " https://www.audible.com/pd/The-Girl-on-the-Tra... | \n",
163 | " Clare Corbett, Louise Brealey, India Fisher | \n",
164 | " 28.00 | \n",
165 | " 4.5 | \n",
166 | " 133818.0 | \n",
167 | " 2015-01-13 | \n",
168 | " The Girl on the Train | \n",
169 | "
\n",
170 | " \n",
171 | " 3 | \n",
172 | " 4895 | \n",
173 | " B01IW9TQPK | \n",
174 | " Trevor Noah | \n",
175 | " Bios & Memoirs | \n",
176 | " 8 hrs and 44 mins | \n",
177 | " https://www.audible.com/pd/Born-a-Crime-Audiob... | \n",
178 | " Trevor Noah | \n",
179 | " 24.95 | \n",
180 | " 5.0 | \n",
181 | " 123838.0 | \n",
182 | " 2016-11-15 | \n",
183 | " Born a Crime | \n",
184 | "
\n",
185 | " \n",
186 | " 4 | \n",
187 | " 282008 | \n",
188 | " B01I28NFEE | \n",
189 | " Mark Manson | \n",
190 | " Self Development | \n",
191 | " 5 hrs and 17 mins | \n",
192 | " https://www.audible.com/pd/The-Subtle-Art-of-N... | \n",
193 | " Roger Wayne | \n",
194 | " 23.95 | \n",
195 | " 4.5 | \n",
196 | " 113261.0 | \n",
197 | " 2016-09-13 | \n",
198 | " The Subtle Art of Not Giving a F*ck | \n",
199 | "
\n",
200 | " \n",
201 | "
\n",
202 | "
"
203 | ],
204 | "text/plain": [
205 | " index asin author category \\\n",
206 | "0 247928 B005FRGT44 Ernest Cline Sci-Fi & Fantasy \n",
207 | "1 248009 B00B5HZGUG Andy Weir Sci-Fi & Fantasy \n",
208 | "2 142087 B00QXW5GYY Paula Hawkins Mysteries & Thrillers \n",
209 | "3 4895 B01IW9TQPK Trevor Noah Bios & Memoirs \n",
210 | "4 282008 B01I28NFEE Mark Manson Self Development \n",
211 | "\n",
212 | " length link \\\n",
213 | "0 15 hrs and 40 mins https://www.audible.com/pd/Ready-Player-One-Au... \n",
214 | "1 10 hrs and 53 mins https://www.audible.com/pd/The-Martian-Audiobo... \n",
215 | "2 10 hrs and 58 mins https://www.audible.com/pd/The-Girl-on-the-Tra... \n",
216 | "3 8 hrs and 44 mins https://www.audible.com/pd/Born-a-Crime-Audiob... \n",
217 | "4 5 hrs and 17 mins https://www.audible.com/pd/The-Subtle-Art-of-N... \n",
218 | "\n",
219 | " narrator price rating rating_count \\\n",
220 | "0 Wil Wheaton 31.50 5.0 216094.0 \n",
221 | "1 R. C. Bray 29.99 5.0 164988.0 \n",
222 | "2 Clare Corbett, Louise Brealey, India Fisher 28.00 4.5 133818.0 \n",
223 | "3 Trevor Noah 24.95 5.0 123838.0 \n",
224 | "4 Roger Wayne 23.95 4.5 113261.0 \n",
225 | "\n",
226 | " release_date title \n",
227 | "0 2011-08-16 Ready Player One \n",
228 | "1 2013-03-22 The Martian \n",
229 | "2 2015-01-13 The Girl on the Train \n",
230 | "3 2016-11-15 Born a Crime \n",
231 | "4 2016-09-13 The Subtle Art of Not Giving a F*ck "
232 | ]
233 | },
234 | "execution_count": 6,
235 | "metadata": {},
236 | "output_type": "execute_result"
237 | }
238 | ],
239 | "source": [
240 | "df.head()"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 7,
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "name": "stdout",
250 | "output_type": "stream",
251 | "text": [
252 | "Saving figure Index vs Cum Reviews\n"
253 | ]
254 | },
255 | {
256 | "data": {
257 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjgAAAGECAYAAAA7lVplAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3XtcVHX+P/DXXLnNcBlARRFEFPNGyGpqiNfQvCDeKLOlbcvuWW61pVarlbm2W+2W29dqt2zX+q2Wum73TNdbpGUmXlLwjtzlzswAczuf3x/gKCoiChw483o+Hj6GmXPmzHvmM3Befj6fc45KCCFAREREpCBquQsgIiIiamkMOERERKQ4DDhERESkOAw4REREpDgMOERERKQ4DDhERESkOAw41Or69OmD5ORkpKSkYNq0aZgwYQJmzpyJgwcPXvM2n332WXz//fctWGWdPn36oKysrFnPefHFF7FixYoWr6UlbN26FWlpaUhJScHkyZMxf/58FBQUtMlrjx07tsk2PnDgAP7whz8AAA4ePIjHHnusRV57w4YN+NWvfuX+zqWkpGD27NnYt29fi2z/WuXk5GDevHnNft7KlSsxevRoLFy4sMHjjz32GFJSUpCSktLg9ywtLe2K2ztz5gwef/zxJl/3L3/5C15++eVLHv/+++8RGxvrfu2UlBSMGzcODz30ECoqKprc7sKFC5GZmen+effu3U0+h6i5tHIXQJ7hn//8J0wmk/v+e++9h6VLl2Lt2rXXtL3L/dGlhj777DOsXLkSK1euRGRkJIQQePfdd3HXXXfhiy++gF6vl7tEHD9+HEVFRQCAgQMH4s0332yxbQ8ePBjvvPOO+/7//vc/zJs3D9u2bYNWK8+fvvz8fJw6darZz1u3bh1effVVDB48uMHjF35effr0ueT3rDG5ubk4ffp0s+u4UFRUFP773/+67zudTjzyyCP44IMPMH/+/Cs+Nz09HXfddRcA4I9//ON11UHUGPbgUJtzOp0oKChAQECA+7GVK1di+vTpSElJwcMPP4yioiKcOnUKQ4cOhd1uBwC4XC4kJibixIkTSEtLw9dffw0A+PnnnzFnzhxMnz4dM2fOxNatW+FyuTBs2DBkZ2cDAN555x2MGTPG/Xp33303tm/f3miNubm5uOWWW/DSSy9h1qxZGD9+PL799lsAgMViweOPP44JEyYgLS0NJ0+edD+vqKgIjzzyCGbMmIHk5GS8/fbbAIDdu3dj6NChKCoqgiRJSEtLw1tvvdXgNa/0fjdt2oTp06djxowZSE1NxZ49e5r8nP/yl7/g2WefRWRkJABApVLh/vvvx7x582C327FhwwY88MAD7vUvvL9gwQIsWbIEc+bMwYQJE/DKK6/gnXfewezZszFu3Djs2rXLvd57773n3sbF9wFAkiQsXboUqampmDRpEiZOnIi9e/eioKAAb775Jn766ScsXLgQP/zwA6ZMmQKz2Yz4+HgUFxe7t5Gamort27fDbrdj2bJlmD59OqZOnYoFCxbAYrE0+VkAwPDhw1FcXIyqqqorbmfs2LGYP38+Jk6ciG+//RanTp1CWloaJk+ejOTkZHz55ZcAGm/rxr47LpcLzz33HM6cOYN77733kvoKCwvx4IMPIjk5GVOmTME//vEPAMD8+fNRVFSEZ5991v3aV+ubb77BtGnTMHXqVMyZMwcHDx6E3W7H4sWLcerUKdx3330AgLfeeguzZs1CcnIybrnlFmzZsqVZrwPU/V6Ul5cjMDAQALB3717ceeedmDVrFkaNGoXnn38eAPDqq6+irKwM8+fPx8GDB3HHHXdg8+bNyM7OxoQJE7BkyRLMnDkT48ePd9dhtVrx1FNPYcKECZg1axaeeeYZPPvsswCADz/8EFOnTsXMmTNx55134sSJE82unRRKELWymJgYMWXKFDFlyhSRkJAgxo4dK1566SVRUlIihBDi
P//5j5g/f75wOBxCCCHWrFkj5s6dK4QQ4s477xRfffWVEEKIbdu2idmzZwshhPj1r38tvvrqK1FRUSHGjx8vcnJyhBBCFBYWipEjR4q8vDyxYMECsXr1avd2EhISxMmTJ0VVVZUYOnSosNlsl621tLRU5OTkiJiYGPG///1PCCHE119/LUaPHi2EEOLll18WTz/9tJAkSZSWloqRI0eKN998UwghRFpamtiyZYsQQoja2lqRlpYmvvjiCyGEEK+//rqYO3euWLFihbjnnnuEy+W65PUbe7/jxo0T+/btE0IIsXPnTrFixYorfuZlZWUiJiZGVFdXN7rO+vXrxf3333/Z+88884xITU0VdrtdnD17VsTExIh//etfQgghPvjgA/Hb3/7Wvd4//vEP9zYuvD9mzBhx4MAB8fPPP4t58+a53+8777wjHnjggUtec/fu3WLy5MlCCCGefvpp93aOHz8uRo8eLVwul1ixYoVYvny5kCRJCCHEa6+9JhYvXtzke5MkSaxatUpMmTJFCCGuuJ0xY8aIv/3tb+7nTps2TXz44YdCCCHy8/PFuHHjhNlsbrStr/TdufA9XuzOO+8U77//vhBCiKqqKpGcnCw+//zzBp/llZz77p5z9OhRkZCQ4P7d2Llzp0hISBAWi0Wkp6eLqVOnCiGEOHPmjPjNb34jamtrhRBCbNy4UaSkpAgh6r6zS5cuveS10tPTxcCBA8XUqVPFxIkTxbBhw8S0adPEu+++6/49fuyxx8SePXuEEEKYzWYxZMgQceTIESGEEImJieLw4cNCCCFmz54tvv32W3H69GkRExMjtm/fLoQQ4osvvhDjxo0TQgixfPly8dRTTwmXyyWqqqrE5MmTxaJFi4Tdbhf9+/d3v+/169eLjz/++IqfE3kODlFRmzjXdf7LL7/g/vvvx9ChQxEcHAygbp7IwYMHMXPmTAB1/+OvqakBAMyaNQv/+c9/cOutt2LDhg247bbbGmw3IyMDxcXFeOSRR9yPqVQqZGVlISkpCWvWrMG0adNQXFyMKVOm4Pvvv0dAQAASExObHKLR6XQYNWoUAKBfv37uuQW7du3CokWLoFKpYDKZkJSUBACorq7Gnj17UFlZiTfeeMP9WGZmJiZNmoR58+Zhzpw5+Pe//43PPvsMavWlHaiNvd/Jkyfj0UcfxahRo5CQkOD+n3djzm1bkqQrrnclY8aMgU6nQ2hoKHx9fZGYmAgAiIiIuKp5FucMGjQIAQEBWLNmDXJycvDDDz/Az8/vis9JTU3FCy+8gHvvvRfr16/HzJkzoVarsW3bNpjNZvf8K4fD4f4eXeynn35CSkoKVCoV7HY7evbs6R7SaWo754aCKioqkJmZidTUVABAWFgYNm/efMW2jo2NbfS705jq6mr8/PPPeP/99wEARqMRM2bMwI4dOzB58uQrPrcxu3btQkJCAsLDwwEAI0aMQEBAAI4cOdJgve7du2PZsmX49NNPkZ2djX379qG6urrJ7V84RPXJJ5/gjTfewMSJE93Df3/+85+xfft2rFy5EidPnoTNZoPVar3iNvV6vft71q9fP1RWVgIAduzYgcWLF0OtVsNoNCIlJQWnT5+GTqdDUlISUlNTMXr0aIwYMcL9uRMx4FCb6t+/PxYuXIgFCxagb9++CA8PhyRJmDt3LubMmQMAsNvt7j9sEydOxPLly3HixAns2bMHy5cvb7A9l8uF6OhofPLJJ+7HioqKYDKZIEkSnnvuOWzfvh1Dhw7FzTffjH//+9/w8fHBpEmTmqxVp9O5g4JKpWqwTFxwCTeNRgOgLkwIIbBmzRr4+PgAAMrKyuDl5QUAMJvNKC4uhkqlQnZ29mXnSjT2fn/3u99h5syZSE9Px4YNG/D+++9j3bp1jdYeEBCAHj16YP/+/bj55psbLHv88cfx0EMPQaVSNXgfDoejwXoXB8DLzVtpahtAXZh4+eWX8dvf/hbjxo1Dz5498emnnzZaO1AXMJxOJw4c
OIDPP//cPVdLkiQsWrTIvROzWq2w2WyNbuPCOTgXamo7vr6+Dd7zhe1/8uRJhIaGNtrW5eXlV/zuNFaPuOiygJIkwel0NvncK23z4teWJOmSNjp48CAeffRR/Pa3v8WIESPwq1/9CsuWLWvWa6WmpmLfvn2YP38+1q5dC7VajdmzZ2PAgAFITEzE5MmTsW/fvkve48W8vLzcNV/43dJoNJf9nQPqhmKzsrKwa9cuvP322/j888/x2muvNat+UibOwaE2N2XKFMTGxronF44YMQLr1q1zz4F444038PTTTwOo+4M3efJkLFiwAOPHj3fvTM6Ji4tDdna2e07KkSNHMGHCBBQVFcHLywtDhgzB3/72NyQkJOCmm25CRkYGfvrpJ/f/Eq9FYmIi1q1bB0mSUFlZ6Z4nYDAYEBcXh1WrVgEAqqqqcMcdd7iXP/vss5g6dSr++Mc/4qmnnoLZbL5k25d7v06nE2PHjkVNTQ3uuOMOLF68GFlZWe65Oo159NFH8fLLL7vnIblcLvzf//0fMjMz0bNnT5hMJhw7dgw2mw0OhwPffPNNsz+LoKAgHDp0CEBdsPzxxx8vWSc9PR1jxozBnDlzMGDAAGzevBkulwtA3Y6qsZ14amoqXnrpJfTp0wdhYWEA6r4rH330Eex2OyRJwvPPP4/XX3+92XVf7XYMBgP69++PjRs3AgAKCgpwxx13oLa29opt3RiNRnPZEGgwGHDjjTfio48+AlAXhjdu3HhJOG2O4cOHY/v27cjNzQUAfPfddygpKcHAgQMbfO4//vgjbrzxRtx9990YPHhwg/Zpjqeffho5OTlYs2YNysvLkZmZid///vdISkpCXl4ecnNz3T2KWq32sp9DY0aPHo0NGzZAkiRUV1fj888/h0qlQklJCUaPHo3g4GDcfffdeOyxx67r6ExSFvbgkCyef/55TJ06FTt37kRqaiqKiopw2223QaVSISwsrEFPTWpqKj788EMsWbLkku2YTCa8+eab+NOf/gSbzQYhBP70pz+5u+WTkpKwadMmDBs2DN7e3rjhhhsQEBDg7lW5FvPmzcPixYsxceJEmEwmxMTEuJe9+uqreOmll5CcnAy73Y4pU6Zg6tSp+Oijj1BQUIA33ngDOp0OI0aMwPPPP4+//vWvl2z/4ver1WqxaNEiPPXUU9BqtVCpVFi2bBn0ej22bNmCNWvW4O9///sl20lOToYQAk888QScTidsNhv69++Pf/7zn9Dr9UhISMCQIUMwceJEhIaGYujQocjKymrWZ5GWluae/BkeHo5hw4Zdss7s2bPx5JNPIjk5GU6nEwkJCdi0aRMkSUJcXBzeeustPProo5cc2jxt2jS8/vrrDYLHww8/jFdeeQXTp0+Hy+VC3759sWDBgmbV3NztvPbaa3jhhRewevVqqFQqvPzyywgNDW20rc8Fisvp1asXvLy8MGvWLHzyyScNelheffVVvPjii9iwYQPsdjuSk5MxY8aMZr+3c/r06YPnnnsOjzzyCFwuF3x8fPD222/DYDCgd+/eUKvVuP3227FixQps3rwZkyZNgiRJGD16NMrLy69qmOpCgYGBePLJJ/Hqq69i4sSJuPfee5GSkgIfHx+EhYVh0KBByM7Oxk033YSkpCQ88cQTeOmll65q2w899BBefPFFJCcnw2g0IiQkBN7e3ggJCcF9992HtLQ0+Pj4QKvV4oUXXriWj4sUSCWa6jMkIiKS0WeffYaAgACMHDkSkiTh4YcfxtixYy+Zk0d0IQYcIiJq1zIzM7FkyRLU1NTA4XBg+PDhWLhwoWznM6KOgQGHiIiIFIeTjImIiEhxGHCIiIhIcdp8ALO4+NJDY1tSUJAvysubN/ufWhbbQH5sA/mxDeTHNpBfW7RBaKjxso8rrgdHq9U0vRK1KraB/NgG8mMbyI9tID8520BxAYeIiIiIAYeIiIgUhwGHiIiIFIcBh4iIiBSHAYeIiIgUhwGHiIiIFIcBh4iIiBSH
AYeIiIgUhwGHiIiIFIcBh4iIiBSHAYeIiIgUhwGHiIiIFIcBh4iIiBRHK3cBRERE1PEIIWCtdaLKakel1Y5Kqw1VFjsqq+2ostphrnbAz1ePu5Ji4KVv+6uKM+AQERGRm83uqgsplrrQUmm1o9JSF2KqrOcfq7La4XSJK27L5O8Nm9PFgENEREQtz+mSYK521IUTy7kel4tCTP0/m911xW1pNSoE+OnRvZMRAX56BBj08Pc9f+vvp0eAX91t926BKCmxtNG7vKhOWV6ViIiIrptLklBldaDCYqv/Z0eF+YKf6x83VzuuuB0VAKOfHp0CfepCi58e/gY9Avy83GHlXJjx9dJCpVJdVX1Xu15rYMAhIiJqZyQhYKlx1IcV+yUBprz+fpXVDnGFUSJvvQaBBi90DfZDwLnAYtCfDzH1twZfHTRqZR13xIBDRETUhmwOF8rNNpRX1aLsXG+L+cIQUxdkXFLjyUWnVSPQoEevbgEINHghyOiFQIMXAg36ultjXc+Lj5fn7uY9950TERG1sIvDS9mFP1fZUG6uhbXW2ejzNWoVAg169OhirA8sXgg06s//bNAj0OjVrGEiT8WAQ0REdBVsDhcqzDaUXRhe6u+fu71SePHSa2AyeqFHmD+CjF4wGet6XoKM3u7gYvDRQc3g0iIYcIiIyONJQqDSYkdpZS1KqmpQWlmLsqrzYabcbIOlpvGJuu7w0sWIIH9vmIxeMPl7XxBkvOHrzV1uW+KnTUREiudwSig319YHmLrb0gtuy6psjc558dJpYPL3QmRnA4KM3jD51/W8nA8w3vDx0nDIqJ1hwCEiog6vxuZsEFhKK2thsbmQf9aMkqpaVFnsaGzKboCfHpFdjAj29677F3DhrRd8ON+lQ2LAISKids/pklBWVYviyloUV9SgpKLutriiBiWVtY0OH2nUKgQZvRDTPbBhcAnwRoh/XW+MTtv2Z9ml1seAQ0REshNCwFztqAstlTUorqhFSX2AKa6oRZm59rLne9FqVAgO8EGPLsaLel68ERMVAsnugFrN3hdPxIBDRERtwuZw1YeW2voQU98TU1l3a3Nc/hIBgQY9orsFIDTAB6GB3ggN9EFooA9CArwRaPRq9Kij0CAfFBc3flQTKRsDDhERtRib3YWzFTU4W16NovK627PlNSgqr0G52XbZ53jrNegUdD601AWYuttgf2/odRxCouZjwCEiomapsTlRXFFTH1zOBZm6nyst9kvWVwEw+Xuhb2RQg/By7p+fNyfxUstjwCEiokvYHC4UlVWjsOzSnpgqa2Mhxhv9egShU5AvOgX6oLPJp/5nb07kpTbHgENE5KGEECg321BQVo3C0rowU1j/c1lV7SWHVatUQLC/N/r3CEInky86B9YFmM4mH4QE+ECnVdbFGqljY8AhIlI4m93lDi8FpVb3z0VlNZed2Btg0KNPRCC6mHzRxeRbF2bq58hoNQwx1DEw4BARKYAQAlXVDuSXWJFfYkVBqRUF9b0yl5vcq9Oq0TnIF12C60JMmOn8z558BWpSDn6LiYg6ECEEKix25JfWB5kSK/LqQ83lLvQYZKyb3NsluGGIMfl786KOpGgMOERE7dC5+THnemTqAk018kusqLY1DDIqFdApyBcx3QPRNcQP3UL8EBbshy4mX3jpObmXPBMDDhGRzKqq7cg7a0FusRU5xRZ3qKm1N5wfo1Gr0CnIB317BKFrsJ87zHQ2+fAoJaKLMOAQEbURh9OF/JJq5BZb6v9ZkXvWgsqLDrvWqFXoYvJF1xC/8/+CfdHZ5MtJvkRXiQGHiKiFCSFQVFaNA8dKkFNsQV6xBTlnLSgqq4F00QWVgv29cWN0MMI7GRAeakB4JwM6B/FoJaLrxYBDRHQdHE4XcoutyC4yI6eoLsjklVhQY2s4vOSt16BnN3+EhxrQPdQP4Z0M6BZigK83/wwTtQb+ZhERXaXqWgfOFFlwpsiM7CILzpw1o6CkukGvjFqlQmeTDwb3
DUKIvxe6hxoQ3skPwf7evBwBURtiwCEiusi5Q7HremXMOFNkQXaRGSWVtQ3W0+vUiOpqRERnIyI7GxHR2YBuIX7QaTUIDTWiuNgs0zsgIgYcIvJoQgiUVtbiVKEZpwur3D005mpHg/UMPjr07xGEiM7G+n8GdA7yhVrNXhmi9ogBh4g8SoXFhlMFVThdYMapwrpbS03DMBPs741BvQPqe2XqwkyQ0YtDTEQdCAMOESmWpcaB0wVVdb0zBVU4XWi+5LIFIQHeuCEyCFFdjIjsUhdoDD46mSomopbSZMCRJAlLlixBVlYW9Ho9li5disjISPfy9957D1988QVUKhUefPBBJCUltWrBRESXY7O7cLqwCqcKzHU9NIVVKK5oOGcmwKBHXK8Q9AgzIirMH5FdjPD31ctUMRG1piYDzubNm2G327F27VpkZGRg+fLlWLlyJQCgqqoKq1evxqZNm1BTU4Np06Yx4BBRqxNC4Gx5DY7nVeJkfhVO5Fci96y1wdFMft5aDIgy1YWZLv7oEeaPIKOXjFUTUVtqMuDs3bsXiYmJAIC4uDgcOnTIvczHxwddu3ZFTU0Nampqrmp8OijIF9pWPqV4aKixVbdPTWMbyE9JbVBd68DRM+XIyi5HZnY5srLLGkwC1mnV6BMZhD6RQYiJCELv7oHobPKVfc6Mktqgo2IbyE+uNmgy4FgsFhgMBvd9jUYDp9MJrbbuqWFhYZg8eTJcLhceeOCBJl+wvLz6OsptGg/NlB/bQH4duQ0kIVBQWo0TeZU4mV+JE/lVyC+24sLz/4YEeKNfPxOiu/ojulsAuncyNDzzryShpMTS5rVfqCO3gVKwDeTXFm3QWIBqMuAYDAZYrVb3fUmS3OFmx44dOHv2LLZs2QIAuPfeexEfH4/Y2NiWqJmIPIDTJeF0oRnHcipwNKcCx/MqYa09f7VsvU6NPhGB6Nk1ANFd/dGzWwAC/DhvhoiurMmAEx8fj61bt2LSpEnIyMhATEyMe1lAQAC8vb2h1+uhUqlgNBpRVVXVqgUTUcdWY3PiRF4ljuZW4FhOJU4WVMHhlNzLQwK8ERsdjF7dAhDdLQDdQv2gUfO6TETUPE0GnKSkJKSnp2P27NkQQmDZsmVYtWoVIiIiMG7cOHz//fe47bbboFarER8fj4SEhLaom4g6iAqLDUdzKnAstxLHciuQc9aCc3OBVQDCOxkQEx6I3t0D0Ds8kBOBiahFqIS46NK2rawtxuI45iovtoH85GyDCosNmdnlyDxTjswzFThbXuNeptWo0TPMiN7dA9E7PBC9ugUo9mKT/D2QH9tAfu16Dg4R0ZVUWu3IOlNeH2oqUFh2/kACHy8NYqODEdM9EL3DA9Cjiz90Wg43EVHrY8AhomYxV9uRdabC3UOTX3L+IAQvvQYDewbjhshA3BARhMjORl6riYhkwYBDRFdks7uQlVOBw6fLcPh0GXKLzwcavU6N/lEm3BARiBsi6wJNg8O1iYhkwoBDRA1IQiCnyIJDp0px+HQ5juVWwOmqm6qn06rRNzLIHWiiwvwZaIioXWLAISKUVdXil9Nl+OVUGQ6fLm9wde2Izgb0jzKhfw8TeocHQNfKZyInImoJDDhEHsjmcCHrTDkOnaoLNQWl5ycGBxm9kDCwC/pHmdAv0gR/nlSPiDogBhwiD3G2ogYHT5Ri/4kSZGZXwOmqO7meXqdGbHQw+vUwoX+UCV2D5b+GExHR9WLAIVIop0vC0ZwKHDhRioMnSxv00oSH+mFgdDAGRgUjulsAD90mIsVhwCFSkHKzDQdPliIrtxI/Z52Fze4CUNdLE9crBLG9ghHbMxgmf2+ZKyUial0MOEQdmBAC+SVW/HysBPuOFuN04fkzhnYK8kFsbDBio4PRp3sgJwcTkUdhwCHqYCRJ4ER+JfYdLcHPx4rdl0LQqFXoGxmEG3uFYMyQCOjQpldhISJqVxhwiDoAh9OFw6fLse9YMTKOlaCquu4wbi+dBoP7hGJQ
TChio4Ph560DAISGGngNHiLyaAw4RO2Uze7C/hMl+CmrGAdPlMLmqJtP4++rw8gbwzCodyj69Qji0BMR0WUw4BC1I7V2Jw6cKMWezLM4eKIUdmfdodydgnwQHxOK+N6h6NnVn9d3IiJqAgMOkcwaCzVdTL4YfEMn3HRDJ3QL9eO5aYiImoEBh0gG54af9hw5iwMnS+G4INQMuaEThjDUEBFdFwYcojbikiQcPl2O3b8U4uejJe45NWHBvhjch6GGiKglMeAQtSIhBE4WVGH3L0XYc6TIffRTSIA3kvqH46a+ndEthKGGiKilMeAQtYKismrs+qUQuw8Xuc9TY/DRYUx8Nwzv1wXR3fwZaoiIWhEDDlELqa514IfDRfjuYCFOFVQBqLtEwtB+nTGsX2f0jzJBq+E1n4iI2gIDDtF1kITAkexyfHegAD8fLYbDKUGlAgb0NGF4vy4YFBMCbz1/zYiI2hr/8hJdg+KKGqQfLED6wQKUVtkA1B0BNSI2DMP7d0GQ0UvmComIPBsDDtFVsjtc2JtVjJ0H8pF5pgIA4KXXIDE2DImxXTmvhoioHWHAIWpCYVk1tu3LQ/rBAlhrnQCAPt0DMSI2DIP7dIKXnpdKICJqbxhwiC7D6ZKQcawEW/fl4Uh2OYC6a0BNGhaJxBvD0DnIV+YKiYjoShhwiC5QWlmL7fvzsXN/PiqtdgDADRGBGD2oG+JjQnkUFBFRB8GAQx5PCIHDp8uxZW8u9p8ogRCAr5cWtwwOx5hB3RAW7Cd3iURE1EwMOOSxbA4Xdh0qxOa9ucgvsQIAosL8MWZQNwzp2wleOs6tISLqqBhwyOOUVdViy8+52JGRD2utExq1CsP7d8Ytg7sjKsxf7vKIiKgFMOCQRxBC4ER+Fb7dk4O9WcWQhIDRV4fkm3tgTHw3BBp43hoiIiVhwCFFk4TAvqMl+OqHbJzMr7t8QvdOBiQN7o6h/TpBp+UwFBGREjHgkCI5nBJ2/VKIr344g6KyaqgADOodgvFDuiOmeyBPyEdEpHAMOKQo1bVObMvIw7d7clBptUOjVmFEbBgmDo3g0VBERB6EAYcUocJiw7d7crAtIw81Nhe89RrcOjQCSYO787pQREQeiAGHOrRysw1f7c7G9v3xwGGRAAAfi0lEQVT5cDgl+PvpMWlYJMYM6gZfb53c5RERkUwYcKhDKjfb8OWuumDjdEkI9vfG5OGRSBjYhROHiYiIAYc6lrKqWny5Oxs79ufD6RIICfDGlJt74OYBXXgZBSIicmPAoQ6h3GzD57tOYyeDDRERXQUGHGrXLDUOfLk7G1v25sLhlBAa6I0pw3tgOIMNERFdAQMOtUs2uwvf/pSDr344gxqbE0FGL6SMiGKPDRERXRUGHGpXnC4JO/bn49P006iy2mHw0eH2sb0wNr4bJw8TEdFVY8ChdkESAj8eLsJ/dp5EcUUtvHQaTE3ogQk3RcDHi19TIiJqHu45SHbHciuwZssxnCowQ6NW4ZZfhWPKzT3g76eXuzQiIuqgGHBINiUVNfhk2wnsyTwLALipbyfMHBWN0EAfmSsjIqKOjgGH2lyNzYkvdmVj054cOF0Senb1x+xxvdGrW4DcpRERkUIw4FCbkYTAdwcKsGH7CVRVO2Dy98KsUdG4qV9nqHl1byIiakEMONQmThdWYfU3R3GqoApeOg2mJ0Zh/E0R8NLxyCgiImp5DDjUqqy1DmzYcRLbfs6DADC0X2fcNqYXr/BNREStigGHWoUkBL4/WIhPth2HudqBsGBf/DopBn17mOQujYiIPAADDrW47IIq/HXNzzieWwm9To1Zo6Mxfkh3noGYiIjaDAMOtRinS8Ln35/Gl7uz4XQJ/ComFLPH9UZwgLfcpRERkYdhwKEWcSK/Eh98mYm8EiuCA7xxZ1IM4nqFyF0WERF5KAYcui42uwv/2XkS3+7JgQAwelA3PDTrRljNtXKXRkREHowBh67Z4dNl+OCrTJRU1qJzkA/unngD
+kQEwddbx4BDRESyYsChZrM5XFi39QS2/JwLtUqFicMikJIQBT3PaUNERO0EAw41y6mCKrz72WEUlVUjLNgX9yX3Q48u/nKXRURE1AADDl2Vc0dIff59NiQhMH5Id8wY2ZO9NkRE1C4x4FCTCkqt+Ptnh3G60AyTvxfundwPfSOD5C6LiIioUQw41ChRf3HMjzYfhd0h4eYBXTDnlhj4evNrQ0RE7Rv3VHRZNTYnVn+Thd2Hi+DjpcVD0/phyA2d5C6LiIjoqjDg0CWyC81Y+d9DOFteg55d/fHg1P4ICfSRuywiIqKrxoBDbkIIbNmbi4+3HofTJTBxaASmj+zJa0gREVGH02TAkSQJS5YsQVZWFvR6PZYuXYrIyEj38u3bt+Ott94CAPTr1w+LFy+GSqVqvYqpVdTYnHj/iyPYe7QYRl8d5k7ph4E9g+Uui4iI6Jo0GXA2b94Mu92OtWvXIiMjA8uXL8fKlSsBABaLBX/+85/xr3/9CyaTCX//+99RXl4Ok8nU6oVTy8krseKtDQdRWFaNPt0Dcf/U/ggyesldFhER0TVrMuDs3bsXiYmJAIC4uDgcOnTIvWzfvn2IiYnBK6+8gpycHKSmpjLcdDA/ZZ7Fe18egc3uwq03RWDm6J7QqDkkRUREHVuTAcdiscBgMLjvazQaOJ1OaLValJeX44cffsDGjRvh6+uLO++8E3FxcYiKimp0e0FBvtBqW/fkcKGhxlbdvhK4XBJWf3UE67ceh7deg6d/PRiJg7q12PbZBvJjG8iPbSA/toH85GqDJgOOwWCA1Wp135ckCVpt3dMCAwMxcOBAhIaGAgAGDx6MI0eOXDHglJdXX2/NVxQaakRxsblVX6Ojs9Q48PZ/D+Hw6XJ0DvLBIzMGIjzU0GKfG9tAfmwD+bEN5Mc2kF9btEFjAarJsYj4+Hjs2LEDAJCRkYGYmBj3sgEDBuDo0aMoKyuD0+nE/v370atXrxYqmVpDYVk1Xv7XTzh8uhxxvULw/G8GIzzU0PQTiYiIOpAme3CSkpKQnp6O2bNnQwiBZcuWYdWqVYiIiMC4cePw5JNPYu7cuQCAW2+9tUEAovblyOky/N/GQ7DWOjFpWCRmjOoJNY94IyIiBVIJIURbvmBbdFWxS/JS2zPy8OGmowCA39x6A0bEhrXaa7EN5Mc2kB/bQH5sA/nJOUTFE/0pnCQJfLz1ODbtyYHBR4dHZwxETPdAucsiIiJqVQw4CuZwuvD3zw7jp6xihAX74vFZsegU5Ct3WURERK2OAUehqmudWLH+ALJyKtCneyDmzRwIX2+d3GURERG1CQYcBSo32/CXj/cjt9iCX/UJxf3J/aBr5XMPERERtScMOApTWFaN19ZkoLSqFmPiu+HOW2KgVvNIKSIi8iwMOAqSXWjGa2szYKlxYFpiFJJv7sELnxIRkUdiwFGIk/lVeH1tBmpsTtx1ax+Mjmu5yy4QERF1NAw4CnAstwJ/+Xg/bA4X5k7ph+EDushdEhERkawYcDq4I9nleHPdAThdEh5MGYAhN3SSuyQiIiLZMeB0YIdOlWLF+oMQQuDh6QMwqHeo3CURERG1Cww4HdSR02VYsf4gAGDezFgM7Bksc0VERETtR5NXE6f251huBd5YfwBCCMybMZDhhoiI6CIMOB3MqYIq/OXj/XC5BB5KGYABDDdERESXYMDpQM4UmfH62gzYHC7cl9wPg2I454aIiOhyGHA6iKLyary2NgPVtU7cM6kvburbWe6SiIiI2i0GnA6g0mrH62szYK524M7xMUgYGCZ3SURERO0aA047V2Nz4q8f70dxRS2Sb+6BsfHhcpdERETU7jHgtGNOl4T/23gI2UVmJMaGYVpilNwlERERdQgMOO2UJATe//IIfjlVhhujg3HXrX144UwiIqKrxIDTTv135yns/qUI0V398eC0AdCo2VRERERXi3vNdmj3L4X47PvTCAnwxmOzYuGl08hdEhERUYfCgNPOnMivxPtf
ZsLHS4PHU2+E0Vcvd0lEREQdDgNOO1JWVYsV6w/CJUl4YOoAdAvxk7skIiKiDokBp52w2V14c90BVFntmD22N2KjeQkGIiKia8WA0w4IIfDPrzNx5qwFI2/silsG81w3RERE14MBpx3438952H247oipX4+P4eHgRERE14kBR2bH8yqxZssxGH11eGjaAGg1bBIiIqLrxb2pjKqsdqzceAiSEHhwan+Y/L3lLomIiEgRGHBk4pIkvP3fQyg32zBzVDT69jDJXRIREZFiMODI5PPvs5F5pgKDeodg4tAIucshIiJSFAYcGRzNqcCn6acQ7O+Feyf35aRiIiKiFsaA08astQ78/bNfAAD3T+0PX2+dzBUREREpDwNOG6o7300WSqtsmJoQhd7hgXKXREREpEgMOG3ouwMF+CnzLHqHB2DKzZFyl0NERKRYDDhtpKSiBv9vyzH4eGlxX3I/aNT86ImIiFoL97JtQBICq77KhM3uwp1JvRES4CN3SURERIrGgNMGtu/Lw5HscsT1CsHw/l3kLoeIiEjxGHBaWXFFDT7eegJ+3lrcdWsfHhJORETUBhhwWpEkBFZ9eQQ2hwtzbolBoMFL7pKIiIg8AgNOK9q5Px+ZZyoQ1ysEw/p3lrscIiIij8GA00qqrHas23YC3noN0iZwaIqIiKgtMeC0krX/Ow5rrRMzRvZEkJFDU0RERG2JAacVHMkux65fChHZxYix8eFyl0NERORxGHBamMMp4V/fZEGlAn5zax+o1RyaIiIiamsMOC1s054zKCqrxtj4cPTo4i93OURERB6JAacFVVps+GJXNgw+OkxPjJK7HCIiIo/FgNOC/rPzJGrtLkxPjIKvt07ucoiIiDwWA04LOVNkxs79BegW4oeRcV3lLoeIiMijMeC0ACEE1mw5BgHg9rG9eKVwIiIimXFP3AIyjpcg80wFYqODMaBnsNzlEBEReTwGnOskSQIbtp+ESgXcNqaX3OUQERERGHCu2w+Hi5BXYkXCgDB0DfGTuxwiIiICA851cbokbPzuJDRqFaYm9JC7HCIiIqrHgHMdvjtQgOKKWoyO64aQQB+5yyEiIqJ6DDjXyO5w4dP0U9Br1Zhyc6Tc5RAREdEFGHCu0bZ9eaiw2DFucDgCDLxaOBERUXvCgHMNHE4XvvrxDLz0Gkwcyt4bIiKi9oYB5xqkHyxEpcWOMYO6weDDSzIQERG1Nww4zeSSJHy5OxtajRoThnSXuxwiIiK6DAacZvrx8FmUVNYi8cYwzr0hIiJqpxhwmkESAl/szoZGrcLEoRFyl0NERESNYMBphn1HS5BfYsWwfp0REsDz3hAREbVXDDjNsGnPGQDAxGE8coqIiKg9azLgSJKEP/zhD7j99tuRlpaG7Ozsy64zd+5c/Pvf/26VItuDUwVVOJZbiYE9g3nNKSIionauyYCzefNm2O12rF27Fk8++SSWL19+yTp//etfUVlZ2SoFthff/pQDAEgaEi5zJURERNQUbVMr7N27F4mJiQCAuLg4HDp0qMHyr7/+GiqVCiNHjryqFwwK8oVWq7mGUq9eaKixRbdXWlmDPUfOontnI0YPiYRKpWrR7StRS7cBNR/bQH5sA/mxDeQnVxs0GXAsFgsMBoP7vkajgdPphFarxdGjR/H555/jzTffxFtvvXVVL1heXn3t1V6F0FAjiovNLbrN9dtPwCUJjB3UFSUllhbdthK1RhtQ87AN5Mc2kB/bQH5t0QaNBagmA47BYIDVanXflyQJWm3d0zZu3IiioiL85je/QV5eHnQ6Hbp163bVvTkdgcPpwvaMfBh8dBjev4vc5RAREdFVaDLgxMfHY+vWrZg0aRIyMjIQExPjXvb000+7f16xYgVCQkIUFW4AYG9WMSw1DkwcGgG9rnWH1oiIiKhlNBlwkpKSkJ6ejtmzZ0MIgWXLlmHVqlWIiIjAuHHj2qJGWW3LyAcAjIzrKnMlREREdLWaDDhqtRovvvhig8eio6Mv
WW/evHktV1U7UVBqxdGcCvSNDELnIF+5yyEiIqKrxBP9XcH2+t6bUey9ISIi6lAYcBrhcLrw/aFCGH11iI8JlbscIiIiagYGnEbsPVo3uThhYBi0Gn5MREREHQn33I347kABAGDkjRyeIiIi6mgYcC6j3GzDkdPl6NUtAF1MnFxMRETU0TDgXMYPh4sgAAzv31nuUoiIiOgaMOBcxveHCqFRqzCkLwMOERFRR8SAc5GcsxbkFlsQGx0Mg49O7nKIiIjoGjDgXGTXL4UAwOtOERERdWAMOBeQhMAPh4vg46XFjb2C5S6HiIiIrhEDzgVO5lWh3GzDr2JCodPywppEREQdFQPOBX7KOgsAGHwDz1xMRETUkTHg1BNCYG/WWfh4adE30iR3OURERHQdGHDqnS40o7TKhrhewdBp+bEQERF1ZNyT1/sps354qk8nmSshIiKi68WAg3PDU8Xw0mvQP4rDU0RERB0dAw6AwrJqnK2owcAoE/Q6Hj1FRETU0THgADhwohQAEBsdInMlRERE1BIYcHA+4AzsyeEpIiIiJfD4gFNjc+JoTgV6dDEiwOAldzlERETUAjw+4Bw+XQaXJBAbzUszEBERKYXHBxz38BQDDhERkWJ4dMARQuDgyVIYfHSICvOXuxwiIiJqIR4dcArLqlFhsaNfjyCoVSq5yyEiIqIW4tEB50h2OQCgb2SQzJUQERFRS2LAAQMOERGR0nhswJGEQGZ2OYL9vRAa6CN3OURERNSCPDbg5J61wFrrxA2RQVBx/g0REZGieGzA4fAUERGRcnl8wLkhggGHiIhIaTwy4EhC4EReJUICvGHy95a7HCIiImphHhlwCkurYa11ond4gNylEBERUSvwyIBzPK8SANCrGwMOERGREnl0wIlmwCEiIlIkjww4J/Iq4aXXIDzUIHcpRERE1Ao8LuBYahwoKK1GzzB/qNU8/w0REZESeVzAOZnP+TdERERK53EBxz3BmEdQERERKZbHBZzTBWYAQFSYv8yVEBERUWvxqIAjhEB2kRnB/t4w+OjkLoeIiIhaiUcFnHKzDeZqB3p0McpdChEREbUijwo42UV1w1MRDDhERESK5lkBp7Au4ER2ZsAhIiJSMo8KOGeKLACASPbgEBERKZpHBZzsIjMCDXoE+OnlLoWIiIhakccEnEqrHeVmG4eniIiIPIDHBJyccxOMGXCIiIgUz2MCTm6xFQDQvRMvsElERKR0HhNw8krqJhh3C/WTuRIiIiJqbR4TcPJLrNBqVOgU5CN3KURERNTKPCLgSEIgr8SKLiY/aNQe8ZaJiIg8mkfs7Usqa2F3SAjn8BQREZFH8IiAk18/wZjzb4iIiDyDRwQc9wTjEB5BRURE5Ak8I+CwB4eIiMijeETAKSithk6rRnCAt9ylEBERURtQfMARQqCovBqdgnygVqnkLoeIiIjagOIDTlW1A7V2FzoH+cpdChEREbURxQecorJqAEBnnuCPiIjIYyg+4JwtrwEAnsGYiIjIgyg+4BSVn+vB4RAVERGRp/CAgFPXg9PZxIBDRETkKbRNrSBJEpYsWYKsrCzo9XosXboUkZGR7uUffPABvvjiCwDAqFGj8Oijj7ZetdfgbFk19Fo1Ag16uUshIiKiNtJkD87mzZtht9uxdu1aPPnkk1i+fLl7WU5ODj799FOsWbMGa9euxXfffYfMzMxWLbg5hBAoqqhBpyAfqHiIOBERkcdosgdn7969SExMBADExcXh0KFD7mVdunTBP/7xD2g0GgCA0+mEl5dXK5XafJYaB2x2F0IDOcGYiIjIkzQZcCwWCwyG89dw0mg0cDqd0Gq10Ol0MJlMEELgT3/6E/r164eoqKgrbi8oyBdareb6K7+C0FAjAKAypwIAEN7F3/0YtQ1+3vJjG8iPbSA/toH85GqDJgOOwWCA1Wp135ckCVrt+afZbDYsWrQIfn5+WLx4cZMvWF5/VFNrCQ01orjYDAA4nl0GAPDVqd2PUeu7
sA1IHmwD+bEN5Mc2kF9btEFjAarJOTjx8fHYsWMHACAjIwMxMTHuZUIIPPzww+jTpw9efPFF91BVe1FaVQsACPbnNaiIiIg8SZM9OElJSUhPT8fs2bMhhMCyZcuwatUqREREQJIk/Pjjj7Db7di5cycA4IknnsCgQYNavfCrUVpZH3B4kU0iIiKP0mTAUavVePHFFxs8Fh0d7f754MGDLV9VCyljDw4REZFHUvSJ/kqqaqHTqmH01cldChEREbUhRQec0spamPy9eQ4cIiIiD6PYgGNzuGCpcSDEv/2cl4eIiIjahmIDzrn5NybOvyEiIvI4ig04FRY7ACDQwB4cIiIiT6PYgFNpsQEAL7JJRETkgRQbcM714ASwB4eIiMjjKDbgVFrrenAC2INDRETkcZQbcM7NwfFjDw4REZGnUWzAqbCwB4eIiMhTKTbgVFrtMPjooNUo9i0SERFRIxS796+w2Nl7Q0RE5KEUGXDsDhdqbE4E+jHgEBEReSJFBpwKKw8RJyIi8mSKDDhV5wIOe3CIiIg8kiIDjqXaAQAw+jLgEBEReSJFBhxzTV0Pjp+PVuZKiIiISA6KDDjWGicAwOjDHhwiIiJPpMiAc64Hx+Cjk7kSIiIikoMiA461pm4OjsGXAYeIiMgTKTLgmOsnGbMHh4iIyDMpMuBYaxxQqQBfL04yJiIi8kSKDDjmGgf8vHVQq1Vyl0JEREQyUGTAsdY4ODxFRETkwRQXcIQQsNQ4GXCIiIg8mOICTnWtE5IQ8PPm/BsiIiJPpciAAwA+DDhEREQeS3kBx1Z3iLgPj6AiIiLyWMoLOPWXafDRM+AQERF5KuUFHHcPjkbmSoiIiEguygs49XNwvNmDQ0RE5LEUG3B4FmMiIiLPpcCAUzdE5c0hKiIiIo+lwIDDScZERESeTnkBh4eJExEReTzFBZyacz04HKIiIiLyWIoLOO6jqNiDQ0RE5LEUF3BqbPUBR8ceHCIiIk+luIBjc7igAqDTKu6tERER0VVSXAqwOVzQ6dRQqVRyl0JEREQyUV7Asbug13J4ioiIyJMpL+A4XPDSKe5tERERUTMoLgnY7S7o2INDRETk0RQXcGwOF/TswSEiIvJoikoCQoj6gMMeHCIiIk+mqIDjkgQkSUDPQ8SJiIg8mqKSgN0hAQCPoiIiIvJwygo4ThcAcA4OERGRh1NUErA76gMOe3CIiIg8msICTv0QFXtwiIiIPJqikoDdeS7gsAeHiIjIkykr4LiHqBT1toiIiKiZFJUEzk8yZg8OERGRJ1NWwKmfg6NjDw4REZFHU1QScEp1AUerUdTbIiIiomZSVBJwuQQAQKtWyVwJERERyUlZAUeqCzgaDQMOERGRJ1NWwHFxiIqIiIgUFnCc9UNUGg5REREReTRFBZzzQ1SKeltERETUTIpKAq5zR1GxB4eIiMijKSrguIeo2INDRETk0RSVBM714HAODhERkWdrMuBIkoQ//OEPuP3225GWlobs7OwGyz/++GPMmDEDt912G7Zu3dpqhV4N93lw2INDRETk0bRNrbB582bY7XasXbsWGRkZWL58OVauXAkAKC4uxurVq7F+/XrYbDbMmTMHCQkJ0Ov1rV745YSHGhBo9EJIoLcsr09ERETtQ5MBZ+/evUhMTAQAxMXF4dChQ+5lBw4cwKBBg6DX66HX6xEREYHMzEzExsY2ur2gIF9ota1zMcypY4xIHt0LKhWHqOQWGmqUuwSPxzaQH9tAfmwD+cnVBk0GHIvFAoPB4L6v0WjgdDqh1WphsVhgNJ4v3M/PDxaL5YrbKy+vvo5ymxYaakRxsblVX4OujG0gP7aB/NgG8mMbyK8t2qCxANXkZBWDwQCr1eq+L0kStFrtZZdZrdYGgYeIiIhIDk0GnPj4eOzYsQMAkJGRgZiYGPey2NhY7N27FzabDWazGSdOnGiwnIiIiEgOTQ5RJSUlIT09HbNnz4YQAsuWLcOqVasQERGBcePGIS0tDXPm
zIEQAr/73e/g5eXVFnUTERERNUolhBBt+YJtMRbHMVd5sQ3kxzaQH9tAfmwD+bXrOThEREREHQ0DDhERESkOAw4REREpDgMOERERKQ4DDhERESkOAw4REREpDgMOERERKQ4DDhERESkOAw4REREpTpufyZiIiIiotbEHh4iIiBSHAYeIiIgUhwGHiIiIFIcBh4iIiBSHAYeIiIgUhwGHiIiIFIcBh4iIiBRHK3cBLUWSJCxZsgRZWVnQ6/VYunQpIiMj5S5LMfbv349XX30Vq1evRnZ2NhYsWACVSoXevXtj8eLFUKvV+Nvf/oZt27ZBq9Vi0aJFiI2Nbda6dHkOhwOLFi1CXl4e7HY7HnroIfTq1Ytt0IZcLheee+45nDp1ChqNBn/84x8hhGAbyKC0tBQzZszA+++/D61WyzZoY9OmTYPRaAQAhIeH4/bbb8fLL78MjUaDESNG4NFHH210f5yRkXHV67YIoRDffPONeOaZZ4QQQuzbt088+OCDMlekHO+++66YMmWKSE1NFUII8cADD4jdu3cLIYR4/vnnxaZNm8ShQ4dEWlqakCRJ5OXliRkzZjR7Xbq8devWiaVLlwohhCgrKxOjRo1iG7Sxb7/9VixYsEAIIcTu3bvFgw8+yDaQgd1uFw8//LAYP368OH78ONugjdXW1oqUlJQGj02dOlVkZ2cLSZLE3LlzxaFDhxrdHzdn3ZagmB6cvXv3IjExEQAQFxeHQ4cOyVyRckRERGDFihV4+umnAQC//PILbrrpJgDAyJEjkZ6ejqioKIwYMQIqlQpdu3aFy+VCWVlZs9Y1mUyyvcf27NZbb8WECRPc9zUaDdugjd1yyy0YPXo0ACA/Px8hISHYtm0b26CNvfLKK5g9ezbeffddAPxb1NYyMzNRU1ODe+65B06nE/PmzYPdbkdERAQAYMSIEdi1axeKi4sv2R9bLJarXrelKGYOjsVigcFgcN/XaDRwOp0yVqQcEyZMgFZ7PgsLIaBSqQAAfn5+MJvNl3z+5x5vzrp0eX5+fjAYDLBYLHjssccwf/58toEMtFotnnnmGbz00kuYMGEC26CNbdiwASaTyb0zBPi3qK15e3vj3nvvxXvvvYcXXngBCxcuhI+Pj3t5Y5+rRqNp9LNuzX23YnpwDAYDrFar+74kSQ12ytRy1OrzudhqtcLf3/+Sz99qtcJoNDZrXWpcQUEBHnnkEcyZMwfJycn485//7F7GNmg7r7zyCp566incdtttsNls7sfZBq1v/fr1UKlU2LVrF44cOYJnnnkGZWVl7uVsg9YXFRWFyMhIqFQqREVFwWg0oqKiwr383OdaW1t7yf74cp91Y+u21L5bMT048fHx2LFjBwAgIyMDMTExMlekXP369cMPP/wAANixYwcGDx6M+Ph4fPfdd5AkCfn5+ZAkCSaTqVnr0uWVlJTgnnvuwe9//3vMmjULANugrW3cuBHvvPMOAMDHxwcqlQoDBgxgG7Shjz76CB9++CFWr16Nvn374pVXXsHIkSPZBm1o3bp1WL58OQCgqKgINTU18PX1xZkzZyCEwHfffef+XC/eHxsMBuh0uqtat6Uo5mri52ZiHz16FEIILFu2DNHR0XKXpRi5ubl44okn8PHHH+PUqVN4/vnn4XA40LNnTyxduhQajQYrVqzAjh07IEkSFi5ciMGDBzdrXbq8pUuX4quvvkLPnj3djz377LNYunQp26CNVFdXY+HChSgpKYHT6cR9992H6Oho/h7IJC0tDUuWLIFarWYbtCG73Y6FCxciPz8fKpUKTz31FNRqNZYtWwaXy4URI0bgd7/7XaP744yMjKtetyUoJuAQERERnaOYISoiIiKicxhwiIiISHEYcIiIiEhxGHCIiIhIcRhwiIiISHEYcIiIiEhxGHCIiIhIcf4/MeH+2W1LvYQAAAAASUVORK5CYII=\n",
258 | "text/plain": [
259 | ""
260 | ]
261 | },
262 | "metadata": {},
263 | "output_type": "display_data"
264 | }
265 | ],
266 | "source": [
267 | "plt.style.use('seaborn')\n",
268 | "top_n = 50000\n",
269 | "x = list(range(top_n))\n",
270 | "cumsum = df.iloc[:top_n]['rating_count'].cumsum()/total_ratings\n",
271 | "plt.plot(x, cumsum);\n",
272 | "plt.title('Review Index vs. Cumulative Percent of Total Ratings')\n",
273 | "save_fig('Index vs Cum Reviews')"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 8,
279 | "metadata": {},
280 | "outputs": [
281 | {
282 | "data": {
283 | "text/plain": [
284 | "436796"
285 | ]
286 | },
287 | "execution_count": 8,
288 | "metadata": {},
289 | "output_type": "execute_result"
290 | }
291 | ],
292 | "source": [
293 | "len(df)"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 18,
299 | "metadata": {},
300 | "outputs": [
301 | {
302 | "data": {
303 | "text/plain": [
304 | "26101"
305 | ]
306 | },
307 | "execution_count": 18,
308 | "metadata": {},
309 | "output_type": "execute_result"
310 | }
311 | ],
312 | "source": [
313 | "cutoff_ind = cumsum[cumsum>0.8].index[0]\n",
314 | "df = df.iloc[7978:cutoff_ind]\n",
315 | "cutoff_ind"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 10,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "df_items = df[['asin', 'title', 'author']]\n",
325 | "program_list = [tuple(x) for x in df_items.values]"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 11,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "def normalize(s):\n",
335 | "    \"\"\"Lowercase and trim `s`: '.' becomes ' stop ', newlines become spaces, other punctuation is dropped.\"\"\"\n",
336 | "    spaced = s.replace('.', ' stop ').replace('\\n', ' ')\n",
337 | "    # A single translate pass removes every character in string.punctuation.\n",
338 | "    stripped = spaced.translate(str.maketrans('', '', string.punctuation))\n",
339 | "    return stripped.lower().strip()\n",
340 | "\n",
341 | "\n",
342 | "def get_overall_rating(asin, rating_dict):\n",
343 | "    \"\"\"Fetch the product page for `asin`, store its star rating in `rating_dict[asin]`, return the dict.\"\"\"\n",
344 | "    # The context manager closes the session even when the lookup below raises.\n",
345 | "    with HTMLSession() as sess:\n",
346 | "        rows = sess.get(f\"https://www.audible.com/pd/{asin}\").html.find('div.bc-row.bc-spacing-small', first=False)\n",
347 | "        rating_dict[asin] = float(rows[1].text.split(' stars ')[1][:3])  # builtin float: np.float was removed in NumPy 1.24\n",
348 | "    return rating_dict\n",
349 | "\n",
350 | "\n",
351 | "def get_reviews(asin, title, author):\n",
352 | "    \"\"\"Collect the reviews of `asin` as `(text, *rating_digits)` tuples, page by page.\n",
353 | "\n",
354 | "    The normalized title and author are removed from each normalized review text.\n",
355 | "    Paging ends only when fetching or parsing a page raises.\n",
356 | "    \"\"\"\n",
357 | "    baseurl = f'https://www.audible.com/pd/reviews?country=US&asin={asin}&page='\n",
358 | "    title = normalize(title)\n",
359 | "    try:\n",
360 | "        author = normalize(author)\n",
361 | "    except Exception:  # author could not be normalized; presumably a missing value - TODO confirm\n",
362 | "        author = ''\n",
363 | "    page_num = 0\n",
364 | "    ratings_reviews = []\n",
365 | "    # `with` closes the session on every exit path; Exception (not a bare except) lets SystemExit/KeyboardInterrupt through.\n",
366 | "    with HTMLSession() as sess:\n",
367 | "        while True:\n",
368 | "            try:\n",
369 | "                page = sess.get(baseurl + str(page_num)).html\n",
370 | "                page_elements = page.find('div.bc-row-responsive.bc-spacing-top-medium', first=False)\n",
371 | "                for elem in page_elements:\n",
372 | "                    try:\n",
373 | "                        review = elem.find(f'div.bc-col-responsive.USreviews{page_num}.bc-col-9', first=True).text\n",
374 | "                        review = normalize(review).replace(title, '').replace(author, '')\n",
375 | "                        ratings = [item.text[0] for item in elem.find('span.bc-text')]\n",
376 | "                        ratings_reviews.append((review, *ratings))\n",
377 | "                    except Exception:\n",
378 | "                        break  # an element that fails to parse ends this page\n",
379 | "                page_num += 1\n",
380 | "            except Exception:\n",
381 | "                break  # the request or the page parse failed: stop paging\n",
382 | "    return ratings_reviews\n",
383 | "\n",
384 | "\n",
385 | "def get_ratings_and_reviews(program_group, rating_dict, reviews):\n",
386 | "    \"\"\"For each (asin, title, author) in `program_group`, record its rating and extend `reviews` in place.\"\"\"\n",
387 | "    for asin, title, author in program_group:\n",
388 | "        try:\n",
389 | "            rating_dict = get_overall_rating(asin, rating_dict)\n",
390 | "        except Exception:\n",
391 | "            pass  # best effort: a program whose rating cannot be read is left out of rating_dict\n",
392 | "        reviews.extend(get_reviews(asin, title, author))\n",
393 | "    return rating_dict, reviews"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 12,
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
402 | "def threaded_ratings(nthreads, program_list, rating_dict=None, reviews=None):\n",
403 | "    \"\"\"Scrape ratings and reviews for `program_list` on `nthreads` threads.\n",
404 | "\n",
405 | "    Every worker writes into the same `rating_dict` and `reviews`, which are returned after all join.\n",
406 | "    \"\"\"\n",
407 | "    rating_dict = {} if rating_dict is None else rating_dict\n",
408 | "    reviews = [] if reviews is None else reviews\n",
409 | "\n",
410 | "    # Worker i handles every nthreads-th program, starting at index i.\n",
411 | "    threads = [Thread(target=get_ratings_and_reviews, args=(program_list[i::nthreads], rating_dict, reviews))\n",
412 | "               for i in range(nthreads)]\n",
413 | "    for t in threads:\n",
414 | "        t.start()\n",
415 | "    for t in threads:\n",
416 | "        t.join()\n",
417 | "\n",
418 | "    return rating_dict, reviews"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 13,
424 | "metadata": {
425 | "scrolled": true
426 | },
427 | "outputs": [
428 | {
429 | "name": "stdout",
430 | "output_type": "stream",
431 | "text": [
432 | "7308.237472772598\n"
433 | ]
434 | }
435 | ],
436 | "source": [
437 | "start = time.time()\n",
438 | "rating_dict = {}\n",
439 | "rating_dict, reviews = threaded_ratings(16, program_list[::-1], rating_dict)\n",
440 | "end = time.time()\n",
441 | "print(end-start)"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": 14,
447 | "metadata": {},
448 | "outputs": [
449 | {
450 | "data": {
451 | "text/plain": [
452 | "822922"
453 | ]
454 | },
455 | "execution_count": 14,
456 | "metadata": {},
457 | "output_type": "execute_result"
458 | }
459 | ],
460 | "source": [
461 | "len(reviews)"
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": 15,
467 | "metadata": {},
468 | "outputs": [],
469 | "source": [
470 | "df_reviews = pd.DataFrame(data=reviews, columns=['text', 'overall', 'performance', 'story'])"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 16,
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "df_reviews.to_csv('reviews2.csv', index=False)"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": 17,
485 | "metadata": {},
486 | "outputs": [],
487 | "source": [
488 | "df_ratings = pd.DataFrame.from_dict(data=rating_dict, orient='index', columns=['rating'])"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 18,
494 | "metadata": {},
495 | "outputs": [],
496 | "source": [
497 | "df_ratings['asin'] = df_ratings.index"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 19,
503 | "metadata": {},
504 | "outputs": [],
505 | "source": [
506 | "df_ratings.reset_index(drop=True, inplace=True)"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": 20,
512 | "metadata": {},
513 | "outputs": [],
514 | "source": [
515 | "df = df.drop(columns='rating')  # rebinds to a new frame instead of mutating `df` in place"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": 21,
521 | "metadata": {},
522 | "outputs": [],
523 | "source": [
524 | "df_merged = pd.merge(df, df_ratings, on='asin')"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": 22,
530 | "metadata": {},
531 | "outputs": [],
532 | "source": [
533 | "df_merged.to_csv('8k_top_granular_audible2.csv')"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": null,
539 | "metadata": {},
540 | "outputs": [],
541 | "source": []
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": null,
546 | "metadata": {},
547 | "outputs": [],
548 | "source": []
549 | }
550 | ],
551 | "metadata": {
552 | "kernelspec": {
553 | "display_name": "Python 3",
554 | "language": "python",
555 | "name": "python3"
556 | },
557 | "language_info": {
558 | "codemirror_mode": {
559 | "name": "ipython",
560 | "version": 3
561 | },
562 | "file_extension": ".py",
563 | "mimetype": "text/x-python",
564 | "name": "python",
565 | "nbconvert_exporter": "python",
566 | "pygments_lexer": "ipython3",
567 | "version": "3.7.2"
568 | }
569 | },
570 | "nbformat": 4,
571 | "nbformat_minor": 2
572 | }
573 |
--------------------------------------------------------------------------------
/audible_eda/audible_scraper.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from requests_html import HTMLSession, HTML\n",
10 | "import numpy as np\n",
11 | "import pandas as pd\n",
12 | "from datetime import datetime\n",
13 | "from threading import Thread\n",
14 | "import time\n",
15 | "import matplotlib.pyplot as plt\n",
16 | "import progressbar\n",
17 | "import warnings\n",
18 | "warnings.filterwarnings(\"ignore\")"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "base_url = 'https://www.audible.com/search?pf_rd_p=7fe4387b-4762-42a8-8d9a-a63254c74bb2&pf_rd_r=C7ENYKDADHMCH4KY12D4&ref=a_search_l1_feature_five_browse-bin_6&feature_six_browse-bin=9178177011&pageSize=50'"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {
34 | "scrolled": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "def build_dict(items, category, data):\n",
39 | "    \"\"\"Append one record per search-result item to `data` and return `data`.\n",
40 | "\n",
41 | "    'category', 'title' and 'link' are always set; every other key is added only when\n",
42 | "    its text line is found and parses. Items without a '/pd/' link are skipped.\n",
43 | "    \"\"\"\n",
44 | "    def first(fields, marker):\n",
45 | "        return [s for s in fields if marker in s][0]\n",
46 | "\n",
47 | "    # Builtin int/float stand in for np.int/np.float, which NumPy 1.24 removed; with the\n",
48 | "    # old aliases every numeric field below would be dropped silently by the except.\n",
49 | "    parsers = [\n",
50 | "        ('rating_count', lambda f, link: int(first(f, 'stars').split('stars ')[1].replace(',', ''))),\n",
51 | "        ('narrator', lambda f, link: first(f, 'Narrated by').split(': ')[1]),\n",
52 | "        ('asin', lambda f, link: [s for s in link.split('/') if 'B0' in s][0].split('?')[0]),\n",
53 | "        ('length', lambda f, link: first(f, 'Length').split(': ')[1]),\n",
54 | "        ('rating', lambda f, link: float([s for s in f if 'stars' in s][-1].split(' out')[0])),\n",
55 | "        ('author', lambda f, link: first(f, 'By').split(': ')[1]),\n",
56 | "        ('price', lambda f, link: float(first(f, 'Regular').split('$')[1])),\n",
57 | "        ('release_date', lambda f, link: datetime.strptime(\n",
58 | "            first(f, 'Release date:').split(': ')[1], '%m-%d-%y')),\n",
59 | "    ]\n",
60 | "\n",
61 | "    for item in items:\n",
62 | "        text_fields = item.text.split('\\n')\n",
63 | "        # Previously `[...][0]`: an item without a product link raised and ended the thread.\n",
64 | "        link = next((url for url in item.absolute_links if '/pd/' in url), None)\n",
65 | "        if link is None:\n",
66 | "            continue\n",
67 | "        dict_entry = {'category': category, 'title': text_fields[0], 'link': link}\n",
68 | "        for key, parse in parsers:\n",
69 | "            try:\n",
70 | "                dict_entry[key] = parse(text_fields, link)\n",
71 | "            except Exception:\n",
72 | "                pass  # field absent or malformed for this item: leave the key out\n",
73 | "        data.append(dict_entry)\n",
74 | "    return data\n",
75 | "\n",
76 | "def scrape_great_courses(mthreads, category, pages, url_list, data):\n",
77 | "    \"\"\"Fetch each URL in `url_list` and add its product items to `data` via `build_dict`.\n",
78 | "\n",
79 | "    A page that cannot be fetched (after one retry) or parsed is skipped, so a failure\n",
80 | "    neither reuses the previous response nor ends this worker. `pages` is unused.\n",
81 | "    \"\"\"\n",
82 | "    with HTMLSession() as sess:\n",
83 | "        for url in url_list:\n",
84 | "            try:\n",
85 | "                try:\n",
86 | "                    r = sess.get(url)\n",
87 | "                except Exception:\n",
88 | "                    time.sleep(0.5)  # brief pause, then a single retry\n",
89 | "                    r = sess.get(url)\n",
90 | "                items = r.html.find('li.bc-list-item.productListItem', first=False)\n",
91 | "            except Exception:\n",
92 | "                continue  # e.g. lxml's \"Document is empty\" on a blank response\n",
93 | "\n",
94 | "            # Split the page's items round-robin across mthreads parser threads.\n",
95 | "            threads = [Thread(target=build_dict, args=(items[j::mthreads], category, data))\n",
96 | "                       for j in range(mthreads)]\n",
97 | "            for t in threads:\n",
98 | "                t.start()\n",
99 | "            for t in threads:\n",
100 | "                t.join()\n",
101 | "    return data"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 4,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "def scrape_threader(nthreads, mthreads, category, pages, base_url, data=None):\n",
111 | "    \"\"\"Scrape `pages` result pages of one category on `nthreads` threads and return `data`.\n",
112 | "\n",
113 | "    Page URLs are `base_url` + '&page=N' for N = 1..pages, dealt round-robin to the workers.\n",
114 | "    \"\"\"\n",
115 | "    if data is None:\n",
116 | "        data = []\n",
117 | "\n",
118 | "    # Create url list\n",
119 | "    url_list = [base_url + '&page=' + str(page + 1) for page in range(pages)]\n",
120 | "\n",
121 | "    # Create threads\n",
122 | "    threads = [\n",
123 | "        Thread(target=scrape_great_courses, args=(mthreads, category, pages, url_list[i::nthreads], data))\n",
124 | "        for i in range(nthreads)\n",
125 | "    ]\n",
126 | "    # Run threads\n",
127 | "    for t in threads:\n",
128 | "        t.start()\n",
129 | "    for t in threads:\n",
130 | "        t.join()\n",
131 | "    return data"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 5,
137 | "metadata": {
138 | "scrolled": true
139 | },
140 | "outputs": [],
141 | "source": [
142 | "def loop_categories(nthreads, mthreads, cat_names, cat_page_nums, cat_links):\n",
143 | "    \"\"\"Scrape every category in turn and return all records in one list.\n",
144 | "\n",
145 | "    Args:\n",
146 | "        nthreads: number of page-fetching threads used per category.\n",
147 | "        mthreads: number of parsing threads used per fetched page.\n",
148 | "        cat_names, cat_page_nums, cat_links: parallel sequences with each category's\n",
149 | "            name, number of result pages and listing URL.\n",
150 | "\n",
151 | "    Returns:\n",
152 | "        The list of record dicts gathered across all categories.\n",
153 | "    \"\"\"\n",
154 | "    data = []\n",
155 | "    for category, pages, link in zip(cat_names, cat_page_nums, cat_links):\n",
156 | "        print('Scraping ', category, '...')\n",
157 | "        # scrape_threader appends to `data` in place and returns that same list; extending\n",
158 | "        # `data` with the return value duplicated every record collected so far.\n",
159 | "        scrape_threader(nthreads, mthreads, category, pages, link, data=data)\n",
160 | "    return data"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 6,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "# Category sidebar of the search page: names, item counts, page counts and listing links.\n",
170 | "with HTMLSession() as sess:  # closed even if the request or the parse fails\n",
171 | "    r = sess.get(base_url)\n",
172 | "    cat_items = r.html.find('div.bc-col-responsive.bc-col-3')[1].find('ul.bc-list')[0].find('li.bc-list-item')\n",
173 | "\n",
174 | "cat_names = [item.text.split(' (')[0] for item in cat_items]\n",
175 | "# Builtin int: np.int was removed in NumPy 1.24.\n",
176 | "cat_item_nums = [int(item.text.split(' (')[1][:-1].replace(',', '')) for item in cat_items]\n",
177 | "cat_page_nums = [int(np.ceil(item/50)) for item in cat_item_nums]  # 50 items per page (pageSize=50)\n",
178 | "cat_links = [item.absolute_links.pop() + '&pageSize=50' for item in cat_items]"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 7,
184 | "metadata": {
185 | "scrolled": true
186 | },
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "Scraping Classics ...\n",
193 | "Done. Scraped 11042 out of 11043 items at 1.1 pages/s. ETA: 118.2 min.\n",
194 | "Scraping Erotica & Sexuality ...\n",
195 | "Done. Scraped 14405 out of 14405 items at 1.1 pages/s. ETA: 119.0 min.\n",
196 | "Scraping Fiction ...\n",
197 | "Done. Scraped 55564 out of 55614 items at 1.1 pages/s. ETA: 103.4 min.\n",
198 | "Scraping History ...\n"
199 | ]
200 | },
201 | {
202 | "name": "stderr",
203 | "output_type": "stream",
204 | "text": [
205 | "Exception in thread Thread-3276:\n",
206 | "Traceback (most recent call last):\n",
207 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\pyquery\\pyquery.py\", line 95, in fromstring\n",
208 | " result = getattr(etree, meth)(context)\n",
209 | " File \"src/lxml/etree.pyx\", line 3213, in lxml.etree.fromstring\n",
210 | " File \"src/lxml/parser.pxi\", line 1877, in lxml.etree._parseMemoryDocument\n",
211 | " File \"src/lxml/parser.pxi\", line 1765, in lxml.etree._parseDoc\n",
212 | " File \"src/lxml/parser.pxi\", line 1127, in lxml.etree._BaseParser._parseDoc\n",
213 | " File \"src/lxml/parser.pxi\", line 601, in lxml.etree._ParserContext._handleParseResultDoc\n",
214 | " File \"src/lxml/parser.pxi\", line 711, in lxml.etree._handleParseResult\n",
215 | " File \"src/lxml/parser.pxi\", line 640, in lxml.etree._raiseParseError\n",
216 | " File \"\", line 1\n",
217 | "lxml.etree.XMLSyntaxError: Document is empty, line 1, column 1\n",
218 | "\n",
219 | "During handling of the above exception, another exception occurred:\n",
220 | "\n",
221 | "Traceback (most recent call last):\n",
222 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\threading.py\", line 917, in _bootstrap_inner\n",
223 | " self.run()\n",
224 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\threading.py\", line 865, in run\n",
225 | " self._target(*self._args, **self._kwargs)\n",
226 | " File \"\", line 52, in scrape_great_courses\n",
227 | " items = r.html.find('li.bc-list-item.productListItem', first=False)\n",
228 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\requests_html.py\", line 654, in html\n",
229 | " self._html = HTML(session=self.session, url=self.url, html=self.content, default_encoding=self.encoding)\n",
230 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\requests_html.py\", line 421, in __init__\n",
231 | " element=PyQuery(html)('html') or PyQuery(f'{html}')('html'),\n",
232 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\pyquery\\pyquery.py\", line 255, in __init__\n",
233 | " elements = fromstring(context, self.parser)\n",
234 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\pyquery\\pyquery.py\", line 99, in fromstring\n",
235 | " result = getattr(lxml.html, meth)(context)\n",
236 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\lxml\\html\\__init__.py\", line 876, in fromstring\n",
237 | " doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)\n",
238 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\lxml\\html\\__init__.py\", line 765, in document_fromstring\n",
239 | " \"Document is empty\")\n",
240 | "lxml.etree.ParserError: Document is empty\n",
241 | "\n"
242 | ]
243 | },
244 | {
245 | "name": "stdout",
246 | "output_type": "stream",
247 | "text": [
248 | "Done. Scraped 15500 out of 15621 items at 1.1 pages/s. ETA: 96.7 min.\n",
249 | "Scraping Mysteries & Thrillers ...\n",
250 | "Done. Scraped 47392 out of 47401 items at 1.1 pages/s. ETA: 83.2 min.\n",
251 | "Scraping Romance ...\n",
252 | "Done. Scraped 44607 out of 44623 items at 0.8 pages/s. ETA: 89.2 min.\n",
253 | "Scraping Science & Technology ...\n",
254 | "Done. Scraped 13984 out of 13984 items at 1.1 pages/s. ETA: 62.7 min.\n",
255 | "Scraping Sci-Fi & Fantasy ...\n",
256 | "Done. Scraped 34151 out of 34151 items at 1.1 pages/s. ETA: 56.0 min.\n",
257 | "Scraping Self Development ...\n",
258 | "Done. Scraped 44007 out of 44028 items at 1.1 pages/s. ETA: 39.9 min.\n",
259 | "Scraping Comedy ...\n",
260 | "Done. Scraped 5085 out of 5085 items at 1.2 pages/s. ETA: 36.9 min.\n",
261 | "Scraping Newspapers & Magazines ...\n",
262 | "Done. Scraped 10209 out of 10208 items at 1.2 pages/s. ETA: 32.1 min.\n",
263 | "Scraping Nostalgia Radio ...\n",
264 | "Done. Scraped 2053 out of 2104 items at 1.2 pages/s. ETA: 33.1 min.\n",
265 | "Scraping Radio & TV ...\n",
266 | "Done. Scraped 10697 out of 10697 items at 1.2 pages/s. ETA: 29.3 min.\n",
267 | "Scraping Sports ...\n",
268 | "Done. Scraped 3540 out of 3540 items at 1.1 pages/s. ETA: 30.0 min.\n",
269 | "Scraping Travel & Adventure ...\n",
270 | "Done. Scraped 3526 out of 3526 items at 1.2 pages/s. ETA: 28.7 min.\n",
271 | "Scraping Religion & Spirituality ...\n",
272 | "Done. Scraped 20785 out of 20785 items at 1.1 pages/s. ETA: 22.9 min.\n",
273 | "Scraping Nonfiction ...\n",
274 | "Done. Scraped 17573 out of 17573 items at 1.1 pages/s. ETA: 18.1 min.\n",
275 | "Scraping Live Events ...\n",
276 | "Done. Scraped 1071 out of 1071 items at 1.1 pages/s. ETA: 17.7 min.\n",
277 | "Scraping Language Instruction ...\n",
278 | "Done. Scraped 4454 out of 4454 items at 1.2 pages/s. ETA: 15.9 min.\n",
279 | "Scraping Drama & Poetry ...\n",
280 | "Done. Scraped 3512 out of 3512 items at 1.2 pages/s. ETA: 15.0 min.\n",
281 | "Scraping Health & Fitness ...\n",
282 | "Done. Scraped 7899 out of 7899 items at 1.1 pages/s. ETA: 12.9 min.\n",
283 | "Scraping Kids ...\n",
284 | "Done. Scraped 26689 out of 26689 items at 1.2 pages/s. ETA: 5.0 min.\n",
285 | "Scraping Teens ...\n",
286 | "Done. Scraped 17539 out of 17539 items at 1.1 pages/s. ETA: 0.0 min.\n"
287 | ]
288 | }
289 | ],
290 | "source": [
291 | "for i in range(3, len(cat_names)):\n",
292 | " start=time.time()\n",
293 | " df = pd.DataFrame(data=loop_categories(\n",
294 | " 8, 2, [cat_names[i]], [cat_page_nums[i]], [cat_links[i]]))\n",
295 | " df = df.drop_duplicates()\n",
296 | " df.to_csv('{}.csv'.format(i))\n",
297 | " end = time.time()\n",
298 | " rate = cat_page_nums[i]/(end-start)\n",
299 | " pages_left = np.sum(cat_page_nums[i+1:])\n",
300 | " eta = pages_left/rate/60\n",
301 | " print('Done. Scraped {} out of {} items at {:.1f} pages/s. ETA: {:.1f} min.'.format(\n",
302 | " len(df), cat_item_nums[i], rate, eta))"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 13,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "df = pd.read_csv('0.csv')\n",
312 | "for i in range(1, len(cat_names)):\n",
313 | " df = pd.concat([df, pd.read_csv('{}.csv'.format(i))], ignore_index=True)"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 16,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "df = df.drop_duplicates()\n",
323 | "df = df.drop('Unnamed: 0', axis=1)"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 19,
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "df.to_csv('all_english_audible.csv', index=False)"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [],
340 | "source": []
341 | }
342 | ],
343 | "metadata": {
344 | "kernelspec": {
345 | "display_name": "Python 3",
346 | "language": "python",
347 | "name": "python3"
348 | },
349 | "language_info": {
350 | "codemirror_mode": {
351 | "name": "ipython",
352 | "version": 3
353 | },
354 | "file_extension": ".py",
355 | "mimetype": "text/x-python",
356 | "name": "python",
357 | "nbconvert_exporter": "python",
358 | "pygments_lexer": "ipython3",
359 | "version": "3.7.2"
360 | }
361 | },
362 | "nbformat": 4,
363 | "nbformat_minor": 2
364 | }
365 |
--------------------------------------------------------------------------------