├── .DS_Store ├── NY_restaurant_data ├── .DS_Store ├── .ipynb_checkpoints │ ├── Untitled-checkpoint.ipynb │ └── ny_restaurant_data-checkpoint.ipynb ├── gps_coords.pkl ├── ny_restaurant_data.ipynb └── nyc_map.png ├── audible_eda ├── .ipynb_checkpoints │ ├── audible_data_analysis-checkpoint.ipynb │ ├── audible_prices-checkpoint.ipynb │ ├── audible_review_classifier-checkpoint.ipynb │ └── audible_reviews_scraper-checkpoint.ipynb ├── audible_data_analysis.ipynb ├── audible_prices.ipynb ├── audible_review_classifier.ipynb ├── audible_reviews_scraper.ipynb └── audible_scraper.ipynb ├── county_politics_analysis.ipynb ├── mo health data.ipynb └── oct_classification.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobymanders/data-projects/249885ae53feb4bb65c0a42a30b8c448705bb0a5/.DS_Store -------------------------------------------------------------------------------- /NY_restaurant_data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobymanders/data-projects/249885ae53feb4bb65c0a42a30b8c448705bb0a5/NY_restaurant_data/.DS_Store -------------------------------------------------------------------------------- /NY_restaurant_data/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /NY_restaurant_data/gps_coords.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobymanders/data-projects/249885ae53feb4bb65c0a42a30b8c448705bb0a5/NY_restaurant_data/gps_coords.pkl -------------------------------------------------------------------------------- /NY_restaurant_data/nyc_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobymanders/data-projects/249885ae53feb4bb65c0a42a30b8c448705bb0a5/NY_restaurant_data/nyc_map.png -------------------------------------------------------------------------------- /audible_eda/.ipynb_checkpoints/audible_prices-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from requests_html import HTMLSession, HTML\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from datetime import datetime" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "url = 'https://www.audible.com/search?pf_rd_p=1d79b443-2f1d-43a3-b1dc-31a2cd242566&pf_rd_r=HK8P1MY097JB8VJ6PRTQ&ref=a_search_c4_pageSize_3&keywords=the+great+courses&pageSize=50'" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": { 28 | "scrolled": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "def scrape_great_courses(url):\n", 33 | " sess = HTMLSession()\n", 34 | " data = []\n", 35 | " pages = 15\n", 36 | " for page in range(pages):\n", 37 | " pageurl = url + '&page=' + str(page+1)\n", 38 | " r = sess.get(pageurl)\n", 39 | " items = r.html.find('li.bc-list-item.productListItem', first=False)\n", 40 | " for i, item in enumerate(items):\n", 41 | " text_fields = item.text.split('\\n')\n", 42 | " dict_entry={\n", 43 | " 'title' : text_fields[0],\n", 44 | " 'price' : np.float([s for s in text_fields if 'Regular' in s][0].split('$')[1]),\n", 45 | " 'length' : [s for s in text_fields if 'Length' in s][0].split(': ')[1],\n", 46 | " 'rating' : np.float([s for s in text_fields if 'stars' in s][0].split(' out')[0]),\n", 47 | " 'rating_count' : np.int([s for s in text_fields if 'stars' in s][0].split(\n", 48 | " 'stars ')[1].replace(',','')),\n", 49 | " 'link' : 'https://www.audible.com' + [link for link in item.links if '/pd/' in link][0],\n", 50 | " }\n", 51 | " if any(['Series:' in s for s in text_fields]):\n", 52 | " dict_entry['series'] = [s for s in text_fields if 'Series:' in s][0].split('Series: ')[1]\n", 53 | " else:\n", 54 | " dict_entry['series'] = 'N/A'\n", 55 | " if any(['Release date:' in s for s in text_fields]):\n", 56 | " dict_entry['release_date'] = datetime.strptime([s for s in text_fields if 'Release date:' in s][0].split(\n", 57 | " ': ')[1], '%m-%d-%y')\n", 58 | " data.append(dict_entry)\n", 59 | " return data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "df = pd.DataFrame(data=scrape_great_courses(url))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 5, 74 | "metadata": { 75 | "scrolled": true 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/html": [ 81 | "
\n", 82 | "\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | "
lengthlinkpriceratingrating_countrelease_dateseriestitle
043 hrs and 23 minshttps://www.audible.com/pd/The-History-of-the-...59.954.523152013-07-08The Great Courses: Modern HistoryThe History of the United States, 2nd Edition
112 hrs and 39 minshttps://www.audible.com/pd/Your-Best-Brain-The...34.954.526292014-11-14The Great Courses: PsychologyYour Best Brain: The Science of Brain Improvement
218 hrs and 15 minshttps://www.audible.com/pd/The-Story-of-Human-...41.954.536132013-07-08The Great Courses: LinguisticsThe Story of Human Language
336 hrs and 34 minshttps://www.audible.com/pd/How-to-Listen-to-an...59.954.523372013-07-08The Great Courses: Fine Arts & MusicHow to Listen to and Understand Great Music, 3...
431 hrs and 18 minshttps://www.audible.com/pd/Critical-Business-S...59.954.521712015-04-08The Great Courses: ProfessionalCritical Business Skills for Success
\n", 167 | "
" 168 | ], 169 | "text/plain": [ 170 | " length link \\\n", 171 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n", 172 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n", 173 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n", 174 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n", 175 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n", 176 | "\n", 177 | " price rating rating_count release_date \\\n", 178 | "0 59.95 4.5 2315 2013-07-08 \n", 179 | "1 34.95 4.5 2629 2014-11-14 \n", 180 | "2 41.95 4.5 3613 2013-07-08 \n", 181 | "3 59.95 4.5 2337 2013-07-08 \n", 182 | "4 59.95 4.5 2171 2015-04-08 \n", 183 | "\n", 184 | " series \\\n", 185 | "0 The Great Courses: Modern History \n", 186 | "1 The Great Courses: Psychology \n", 187 | "2 The Great Courses: Linguistics \n", 188 | "3 The Great Courses: Fine Arts & Music \n", 189 | "4 The Great Courses: Professional \n", 190 | "\n", 191 | " title \n", 192 | "0 The History of the United States, 2nd Edition \n", 193 | "1 Your Best Brain: The Science of Brain Improvement \n", 194 | "2 The Story of Human Language \n", 195 | "3 How to Listen to and Understand Great Music, 3... \n", 196 | "4 Critical Business Skills for Success " 197 | ] 198 | }, 199 | "execution_count": 5, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "df.head()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 6, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "def scrape_sale_courses():\n", 215 | " data = []\n", 216 | " pages = 5\n", 217 | " for page in range(pages):\n", 218 | " pageurl = '/Users/tobymanders/Desktop/{}.html'.format(page+1)\n", 219 | " with open(pageurl) as file:\n", 220 | " html = file.read()\n", 221 | " r = HTML(html=html)\n", 222 | " items = r.find('li.bc-list-item.productListItem', first=False)\n", 223 | " for i, item in enumerate(items):\n", 224 | " text_fields = item.text.split('\\n')\n", 225 | " dict_entry={\n", 226 | " 'title' : text_fields[0],\n", 227 | " 'sale' : 'Yes',\n", 228 | " }\n", 229 | " if any(['Member' in s for s in text_fields]):\n", 230 | " dict_entry['member-price'] = np.float([s for s in text_fields if 'Member' in s][0].split('$')[1].split(' or')[0])\n", 231 | " data.append(dict_entry)\n", 232 | " return data" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 7, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "sale_df = pd.DataFrame(data=scrape_sale_courses())" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 8, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "df_merge = df.join(sale_df.set_index('title'), on='title')" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 9, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/html": [ 261 | "
\n", 262 | "\n", 275 | "\n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | "
lengthlinkpriceratingrating_countrelease_dateseriestitlemember-pricesale
043 hrs and 23 minshttps://www.audible.com/pd/The-History-of-the-...59.954.523152013-07-08The Great Courses: Modern HistoryThe History of the United States, 2nd EditionNaNYes
112 hrs and 39 minshttps://www.audible.com/pd/Your-Best-Brain-The...34.954.526292014-11-14The Great Courses: PsychologyYour Best Brain: The Science of Brain Improvement24.46Yes
218 hrs and 15 minshttps://www.audible.com/pd/The-Story-of-Human-...41.954.536132013-07-08The Great Courses: LinguisticsThe Story of Human LanguageNaNYes
336 hrs and 34 minshttps://www.audible.com/pd/How-to-Listen-to-an...59.954.523372013-07-08The Great Courses: Fine Arts & MusicHow to Listen to and Understand Great Music, 3...41.96Yes
431 hrs and 18 minshttps://www.audible.com/pd/Critical-Business-S...59.954.521712015-04-08The Great Courses: ProfessionalCritical Business Skills for Success41.96Yes
\n", 359 | "
" 360 | ], 361 | "text/plain": [ 362 | " length link \\\n", 363 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n", 364 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n", 365 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n", 366 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n", 367 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n", 368 | "\n", 369 | " price rating rating_count release_date \\\n", 370 | "0 59.95 4.5 2315 2013-07-08 \n", 371 | "1 34.95 4.5 2629 2014-11-14 \n", 372 | "2 41.95 4.5 3613 2013-07-08 \n", 373 | "3 59.95 4.5 2337 2013-07-08 \n", 374 | "4 59.95 4.5 2171 2015-04-08 \n", 375 | "\n", 376 | " series \\\n", 377 | "0 The Great Courses: Modern History \n", 378 | "1 The Great Courses: Psychology \n", 379 | "2 The Great Courses: Linguistics \n", 380 | "3 The Great Courses: Fine Arts & Music \n", 381 | "4 The Great Courses: Professional \n", 382 | "\n", 383 | " title member-price sale \n", 384 | "0 The History of the United States, 2nd Edition NaN Yes \n", 385 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n", 386 | "2 The Story of Human Language NaN Yes \n", 387 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n", 388 | "4 Critical Business Skills for Success 41.96 Yes " 389 | ] 390 | }, 391 | "execution_count": 9, 392 | "metadata": {}, 393 | "output_type": "execute_result" 394 | } 395 | ], 396 | "source": [ 397 | "df_merge.head()" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 10, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "df_merge['sale'] = df_merge['sale'].fillna('No')" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 11, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/html": [ 417 | "
\n", 418 | "\n", 431 | "\n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | "
lengthlinkpriceratingrating_countrelease_dateseriestitlemember-pricesale
043 hrs and 23 minshttps://www.audible.com/pd/The-History-of-the-...59.954.523152013-07-08The Great Courses: Modern HistoryThe History of the United States, 2nd EditionNaNYes
112 hrs and 39 minshttps://www.audible.com/pd/Your-Best-Brain-The...34.954.526292014-11-14The Great Courses: PsychologyYour Best Brain: The Science of Brain Improvement24.46Yes
218 hrs and 15 minshttps://www.audible.com/pd/The-Story-of-Human-...41.954.536132013-07-08The Great Courses: LinguisticsThe Story of Human LanguageNaNYes
336 hrs and 34 minshttps://www.audible.com/pd/How-to-Listen-to-an...59.954.523372013-07-08The Great Courses: Fine Arts & MusicHow to Listen to and Understand Great Music, 3...41.96Yes
431 hrs and 18 minshttps://www.audible.com/pd/Critical-Business-S...59.954.521712015-04-08The Great Courses: ProfessionalCritical Business Skills for Success41.96Yes
\n", 515 | "
" 516 | ], 517 | "text/plain": [ 518 | " length link \\\n", 519 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n", 520 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n", 521 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n", 522 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n", 523 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n", 524 | "\n", 525 | " price rating rating_count release_date \\\n", 526 | "0 59.95 4.5 2315 2013-07-08 \n", 527 | "1 34.95 4.5 2629 2014-11-14 \n", 528 | "2 41.95 4.5 3613 2013-07-08 \n", 529 | "3 59.95 4.5 2337 2013-07-08 \n", 530 | "4 59.95 4.5 2171 2015-04-08 \n", 531 | "\n", 532 | " series \\\n", 533 | "0 The Great Courses: Modern History \n", 534 | "1 The Great Courses: Psychology \n", 535 | "2 The Great Courses: Linguistics \n", 536 | "3 The Great Courses: Fine Arts & Music \n", 537 | "4 The Great Courses: Professional \n", 538 | "\n", 539 | " title member-price sale \n", 540 | "0 The History of the United States, 2nd Edition NaN Yes \n", 541 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n", 542 | "2 The Story of Human Language NaN Yes \n", 543 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n", 544 | "4 Critical Business Skills for Success 41.96 Yes " 545 | ] 546 | }, 547 | "execution_count": 11, 548 | "metadata": {}, 549 | "output_type": "execute_result" 550 | } 551 | ], 552 | "source": [ 553 | "df_merge.head()" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 12, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "def fix_mins(x):\n", 563 | " if 'min' in x and 'hr' in x:\n", 564 | " hrs = x.split(' hr')[0]\n", 565 | " mins = x.split(' min')[0].split('and ')[1]\n", 566 | " len_ = make_len(hrs, mins)\n", 567 | " elif 'min' in x:\n", 568 | " mins = x.split(' min')[0]\n", 569 | " len_ = make_len('00', mins)\n", 570 | " elif 'hr' in x:\n", 571 | " hrs = x.split(' hr')[0]\n", 572 | " len_ = make_len(hrs, '00')\n", 573 | " else:\n", 574 | " len_ = make_len('00', '00')\n", 575 | " return len_\n", 576 | "\n", 577 | "def make_len(hrs, mins):\n", 578 | " if len(hrs)<2:\n", 579 | " hrs = '0' + hrs\n", 580 | " if len(mins)<2:\n", 581 | " mins = '0' + mins\n", 582 | " return hrs + ' hrs ' + mins + ' mins'" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 13, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "df_merge['length'] = df_merge['length'].apply(fix_mins)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 14, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "columns = ['title', 'sale', 'price', 'member-price', 'length', 'rating', 'rating_count', 'release_date', 'series', 'link']" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 15, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "df_merge.to_csv('great_courses_all_titles_v2.csv', columns=columns, index=False)" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 16, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "url_list = list(df_merge['link'])" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": 17, 624 | "metadata": {}, 625 | "outputs": [ 626 | { 627 | "data": { 628 | "text/plain": [ 629 | "724" 630 | ] 631 | }, 632 | "execution_count": 17, 633 | "metadata": {}, 634 | "output_type": "execute_result" 635 | } 636 | ], 637 | "source": [ 638 | "len(df_merge)" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 50, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [ 647 | "def get_accurate_ratings(addresses, rating_dict):\n", 648 | " for address in addresses:\n", 649 | " sess = HTMLSession()\n", 650 | " r = sess.get(address)\n", 651 | " try:\n", 652 | " rating_dict[address] = np.float(r.html.find('div.bc-row.bc-spacing-small', first=False)[1].text.split(' stars ')[1][:3])\n", 653 | " except:\n", 654 | " rating_dict[address] = np.nan\n", 655 | " \n", 656 | " return rating_dict" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 51, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "from threading import Thread\n", 666 | "\n", 667 | "def threaded_ratings(nthreads, addresses, rating_dict=None):\n", 668 | " if rating_dict == None:\n", 669 | " rating_dict = {}\n", 670 | " threads = []\n", 671 | " for i in range(nthreads):\n", 672 | " add_subset = addresses[i::nthreads]\n", 673 | " t = Thread(target=get_accurate_ratings, args=(add_subset, rating_dict))\n", 674 | " threads.append(t)\n", 675 | " [t.start() for t in threads]\n", 676 | " [t.join() for t in threads]\n", 677 | " \n", 678 | " return rating_dict" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 52, 684 | "metadata": {}, 685 | "outputs": [], 686 | "source": [ 687 | "rating_dict = threaded_ratings(64, url_list)" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": 58, 693 | "metadata": {}, 694 | "outputs": [], 695 | "source": [ 696 | "df_merge['rating'] = df_merge['link'].apply(lambda x: rating_dict[x])" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": 59, 702 | "metadata": {}, 703 | "outputs": [ 704 | { 705 | "data": { 706 | "text/html": [ 707 | "
\n", 708 | "\n", 721 | "\n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | "
lengthlinkpriceratingrating_countrelease_dateseriestitlemember-pricesale
043 hrs 23 minshttps://www.audible.com/pd/The-History-of-the-...59.954.723152013-07-08The Great Courses: Modern HistoryThe History of the United States, 2nd EditionNaNYes
112 hrs 39 minshttps://www.audible.com/pd/Your-Best-Brain-The...34.954.526292014-11-14The Great Courses: PsychologyYour Best Brain: The Science of Brain Improvement24.46Yes
218 hrs 15 minshttps://www.audible.com/pd/The-Story-of-Human-...41.954.736132013-07-08The Great Courses: LinguisticsThe Story of Human LanguageNaNYes
336 hrs 34 minshttps://www.audible.com/pd/How-to-Listen-to-an...59.954.723372013-07-08The Great Courses: Fine Arts & MusicHow to Listen to and Understand Great Music, 3...41.96Yes
431 hrs 18 minshttps://www.audible.com/pd/Critical-Business-S...59.954.621712015-04-08The Great Courses: ProfessionalCritical Business Skills for Success41.96Yes
\n", 805 | "
" 806 | ], 807 | "text/plain": [ 808 | " length link price \\\n", 809 | "0 43 hrs 23 mins https://www.audible.com/pd/The-History-of-the-... 59.95 \n", 810 | "1 12 hrs 39 mins https://www.audible.com/pd/Your-Best-Brain-The... 34.95 \n", 811 | "2 18 hrs 15 mins https://www.audible.com/pd/The-Story-of-Human-... 41.95 \n", 812 | "3 36 hrs 34 mins https://www.audible.com/pd/How-to-Listen-to-an... 59.95 \n", 813 | "4 31 hrs 18 mins https://www.audible.com/pd/Critical-Business-S... 59.95 \n", 814 | "\n", 815 | " rating rating_count release_date series \\\n", 816 | "0 4.7 2315 2013-07-08 The Great Courses: Modern History \n", 817 | "1 4.5 2629 2014-11-14 The Great Courses: Psychology \n", 818 | "2 4.7 3613 2013-07-08 The Great Courses: Linguistics \n", 819 | "3 4.7 2337 2013-07-08 The Great Courses: Fine Arts & Music \n", 820 | "4 4.6 2171 2015-04-08 The Great Courses: Professional \n", 821 | "\n", 822 | " title member-price sale \n", 823 | "0 The History of the United States, 2nd Edition NaN Yes \n", 824 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n", 825 | "2 The Story of Human Language NaN Yes \n", 826 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n", 827 | "4 Critical Business Skills for Success 41.96 Yes " 828 | ] 829 | }, 830 | "execution_count": 59, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "df_merge.head()" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": 62, 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "df_merge.to_csv('great_courses_list_v3.csv', columns=columns, index=False)" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": null, 851 | "metadata": {}, 852 | "outputs": [], 853 | "source": [] 854 | } 855 | ], 856 | "metadata": { 857 | "kernelspec": { 858 | "display_name": "Python 3", 859 | "language": "python", 860 | "name": "python3" 861 | }, 862 | "language_info": { 863 | "codemirror_mode": { 864 | "name": "ipython", 865 | "version": 3 866 | }, 867 | "file_extension": ".py", 868 | "mimetype": "text/x-python", 869 | "name": "python", 870 | "nbconvert_exporter": "python", 871 | "pygments_lexer": "ipython3", 872 | "version": "3.7.2" 873 | } 874 | }, 875 | "nbformat": 4, 876 | "nbformat_minor": 2 877 | } 878 | -------------------------------------------------------------------------------- /audible_eda/.ipynb_checkpoints/audible_reviews_scraper-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "%matplotlib inline\n", 13 | "import nest_asyncio\n", 14 | "nest_asyncio.apply()\n", 15 | "from requests_html import HTML, HTMLSession, AsyncHTMLSession\n", 16 | "from threading import Thread\n", 17 | "import time" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "df = pd.read_csv('/Users/cytology/Documents/code/datasets/all_english_audible.csv')\n", 27 | "df = df[~df['asin'].isnull()]" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "64023215.0" 39 | ] 40 | }, 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "total_ratings = df['rating_count'].sum()\n", 48 | "total_ratings" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "df = df.sort_values('rating_count', ascending=False)\n", 58 | "df.reset_index(inplace=True)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/html": [ 69 | "
\n", 70 | "\n", 83 | "\n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | "
indexasinauthorcategorylengthlinknarratorpriceratingrating_countrelease_datetitle
0247928B005FRGT44Ernest ClineSci-Fi & Fantasy15 hrs and 40 minshttps://www.audible.com/pd/Ready-Player-One-Au...Wil Wheaton31.505.0216094.02011-08-16Ready Player One
1248009B00B5HZGUGAndy WeirSci-Fi & Fantasy10 hrs and 53 minshttps://www.audible.com/pd/The-Martian-Audiobo...R. C. Bray29.995.0164988.02013-03-22The Martian
2142087B00QXW5GYYPaula HawkinsMysteries & Thrillers10 hrs and 58 minshttps://www.audible.com/pd/The-Girl-on-the-Tra...Clare Corbett, Louise Brealey, India Fisher28.004.5133818.02015-01-13The Girl on the Train
34895B01IW9TQPKTrevor NoahBios & Memoirs8 hrs and 44 minshttps://www.audible.com/pd/Born-a-Crime-Audiob...Trevor Noah24.955.0123838.02016-11-15Born a Crime
4282008B01I28NFEEMark MansonSelf Development5 hrs and 17 minshttps://www.audible.com/pd/The-Subtle-Art-of-N...Roger Wayne23.954.5113261.02016-09-13The Subtle Art of Not Giving a F*ck
\n", 179 | "
" 180 | ], 181 | "text/plain": [ 182 | " index asin author category \\\n", 183 | "0 247928 B005FRGT44 Ernest Cline Sci-Fi & Fantasy \n", 184 | "1 248009 B00B5HZGUG Andy Weir Sci-Fi & Fantasy \n", 185 | "2 142087 B00QXW5GYY Paula Hawkins Mysteries & Thrillers \n", 186 | "3 4895 B01IW9TQPK Trevor Noah Bios & Memoirs \n", 187 | "4 282008 B01I28NFEE Mark Manson Self Development \n", 188 | "\n", 189 | " length link \\\n", 190 | "0 15 hrs and 40 mins https://www.audible.com/pd/Ready-Player-One-Au... \n", 191 | "1 10 hrs and 53 mins https://www.audible.com/pd/The-Martian-Audiobo... \n", 192 | "2 10 hrs and 58 mins https://www.audible.com/pd/The-Girl-on-the-Tra... \n", 193 | "3 8 hrs and 44 mins https://www.audible.com/pd/Born-a-Crime-Audiob... \n", 194 | "4 5 hrs and 17 mins https://www.audible.com/pd/The-Subtle-Art-of-N... \n", 195 | "\n", 196 | " narrator price rating rating_count \\\n", 197 | "0 Wil Wheaton 31.50 5.0 216094.0 \n", 198 | "1 R. C. Bray 29.99 5.0 164988.0 \n", 199 | "2 Clare Corbett, Louise Brealey, India Fisher 28.00 4.5 133818.0 \n", 200 | "3 Trevor Noah 24.95 5.0 123838.0 \n", 201 | "4 Roger Wayne 23.95 4.5 113261.0 \n", 202 | "\n", 203 | " release_date title \n", 204 | "0 2011-08-16 Ready Player One \n", 205 | "1 2013-03-22 The Martian \n", 206 | "2 2015-01-13 The Girl on the Train \n", 207 | "3 2016-11-15 Born a Crime \n", 208 | "4 2016-09-13 The Subtle Art of Not Giving a F*ck " 209 | ] 210 | }, 211 | "execution_count": 5, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "df.head()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 10, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAHzBJREFUeJzt3Xl81Xed7/HXh+x7AkkgZG0oe6FAI920m21taS3qWKX19jrakXHpqKNeL3XudbztzDjW61id6VWrdpmpnVqtC7d2c7qMLa1lKQUKlBJCIAmQhOz7dr7zx/mBISRwgJDfOb+8n49HHud3vud7ks83PXnz7fe3mXMOEREJlil+FyAiIuNP4S4iEkAKdxGRAFK4i4gEkMJdRCSAFO4iIgGkcBcRCSCFu4hIACncRUQCKN6vH5ybm+vKysr8+vEiIjFp06ZNh51zeSfr51u4l5WVsXHjRr9+vIhITDKzfZH007KMiEgAKdxFRAJI4S4iEkAKdxGRAFK4i4gEkMJdRCSAFO4iIgHk23HuIiJB1zswRGNHH42dfeFH7+u98/NZXJR9Vn+2wl1E5BQMDIVo7uo/JqyPhrf3eNhr7+gbHPV75GYkKdxFRCbC4FCIpq5+DrX1Ut/eS31HHw3tveHn3nZDRx/NXf2jvj8jKZ68jCRyM5KYX5DJZXOSyMtIIi/de8xIIjc9iWnpiSTEnf0VcYW7iASac47W7gHqO8JB3dDeR317L4fae6lv76PBaz/c2UfIHfveKQZ5GUlMz0ymKCeVZaU5x4V1vveYkhjnzwDHoHAXkZjlnKOtZ4C61h4OtvZysK2HOu/xYGsvB9t7qG/vo38wdNx7c1ITmJ6ZTH5mMvNmZBzdnpGZzPTMcKBPS0skfgJm2WeDwl1EolZ3/yAHhoV1XWtPeLut92ig9wwMHfOehDhjRlYyBVkpLCvJYcYooZ2XkURyQnTNtMebwl1EfNPeO0BNczc1zT3UtnRT2xJ+PDL7bu0eOKa/GeSlJ1GQncK8GRlcOTefgqxkZmanhL+ykslNT2LKFPNpRNFD4S4iZ01X3+DRwK5pDod3jRfiNc3dtPceezRJelI8RTkpFGancEFpNgVZ4e0jAT49M5nE+NhcJploCncROW2hkONQey/7mrrZ19RFddOREO+mpqXnuCNLkhOmUJyTSlFOCheU5lCUk+I9T6V4agpZKQmYadY9HhTuInJCg0Mh6lp7qG7qZr8X4PuausKB3tx9zM7KhDijyAvv6wqzhoV3CsVTU5mWlqjwniAKdxFhKOSoa+lhz+FOqhq7js7C9zd1UdvSw+CwYwSTE6ZQNi2N8rw0rpyXT+m0VMqmpVE6LZWCrBTitN4dFRTuIpNIe+8AVY1dVDV2sqex09vuYm9T1zEz8IykeMpy01hYmMUNiwsonZZ2NMDzM5I0+44BCneRgAmFHLUtPezxAnzP0TDv4nBn39F+cVOM0qmplOelcfncPGblpVGel055bhpTtXwS8xTuIjHKOUddaw+76zt5p76DXfUd7K7vpLKh85hjv3NSEyjPS+fKuXnMyg+Hd3leOiVTU3XkSYAp3EWinHPhI1Leqe9kd30Huw518E5DJ5X1HXT1/ynE8zOSmDsjg1uWlzBnejrn5qdTnpfO1LREH6sXvyjcRaJI78AQ79R3sONAOzsPtrPzYAc7D7XTMex48Nz0ROZMz+DmimJmT09nzvQM5uRnkJWa4GPlEm0U7iI+aejoZefBPwX5joPtVDV2Hr14VVpiHPMKMlm5ZCZzp2cwe3oGc6ZnaCYuEVG4i5xlzjn2NXWzta6N7XVt7DgYDvPDnX86wacwO4X5BZmsOG8G8wsyWTAzk+KcVJ1GL6ctonA3s+uA7wFxwE+cc/844vUS4GEg2+uzxjn31DjXKhL1nHPUNPewra6NrXWtbKtt4626tqOn2SfGTWH29HSunJt/NMTnz8jUkoqMu5OGu5nFAfcB1wC1wAYzW+uc2zGs2/8CHnfO/cDMFgBPAWVnoV6RqHHkaJVttW1srQuH+NbaNtp6whe7Sogz5s3I5MbzZ7K4MIvzCrOYMz1DR6jIhIhk5r4cqHTOVQGY2WPASmB4uDsg09vOAg6MZ5Ei0aC7f5AtNW1srmlh8/5WNu9vPXrcePwUY+6MDFYsmsF5hVksLsxmzox0kuKDfVlZiV6RhHshUDPseS1w4Yg+3wCeM7O/AtKAq8elOhGfOOfYe7iLzftbeWN/OMx31Xcw5O3tLM9N47I5uSwtzmZxUTZzZ2QE/vrgElsiCffR9uiMuBkVtwAPOee+Y2YXA/9mZuc55465/YmZrQZWA5SUlJxOvSJnRe/AEFtqWtlQ3cymfS1srmk9ei3x9KR4lhRn87krZrG0JIclxdnk6IgViXKRhHstUDzseRHHL7vcDlwH4Jx7zcySgVygYXgn59z9wP0AFRUVI/+BEJkwnX2DbNrXwvq9Tazf28yWmjb6h8Jzkdn56bxvwQyWlmSzrDSHWXnpuhiWxJxIwn0DMNvMzgHqgFXArSP67AfeCzxkZvOBZKBxPAsVORNNnX1sqG5h/d5mNlQ3s/1AGyEXvr7KeYVZ/PmlZSwvm0pFWQ7ZqZqVS+w7abg75wbN7A7gWcKHOT7gnNtuZncBG51za4EvAz82s78mvGTz5845zczFN519g7xe1cS6yiZe3XOYtw91AJAUP4WlJdnccdVslpdNZWlJNmlJOt1Dgsf8yuCKigq3ceNGX362BE/f4BCb97fyauVh1u1p4s2aVoZCjqT4KVSU5XDJrFwuKp/KeYVZOoJFYpqZbXLOVZysn6YsEpOcc+yq7+AP7zTySmUT6/c20TsQYorB4qJsPn15OZfOymVZaY6OYpFJSeEuMaOjd4B1lYd5aVcj//lOIwfbeoHwDtBV7yrhklnTuLB8GlkpOttTROEuUevI7PzFtxt5aVcDm/a1MBhyZCTF8+7ZuXzx6jwun5PPjKxkv0sViToKd4kq/YMh/ljVxHM7DvH8zoajs/N5MzL41GXlXDEnj2WlOSTE6RR+kRNRuIvvOnoH+M93Gnluez0vvt1AR98gKQlxXDYnly9ePVuzc5HToHAXXzR09PL7HfU8t72e1/Y00T8UYlpaIisWFXDtwulcem6udoSKnAGFu0yYw519PP3WIZ7ccoD11c04B6XTUvn4JaVcu3AGy0pydCaoyDhRuMtZ1dzVz7PbD/Hk1gO8tqeJkINZeWl8/qrZXL9oBnOnZ2CmQBcZbwp3GXedfYM889Yh1m45wLrKwwyFHOfkpvG5K8/lhsUFCnSRCaBwl3ExFHK8uucwv3qjjmfeOkTPwBDFU1NYfVk5NywqYOHMTAW6yARSuMsZeae+gyfeqOU3m+uob+8jMzmeDy4r5M+WFbKsJEeBLuIThbucso7eAdZuOcBj62vYVtdG3BTjyrl5/O37i7hqXr6OchGJAgp3iYhzjm11bTz6+n7WbjlAd/8Q82Zk8PUbF3DTkpnkpif5XaKIDKNwlxPq7Bvkt2/W8ejr+9l+oJ2UhDjef34Bt15YyvlFWVp2EYlSCncZVVVjJw+9Ws0Tm2rp8mbpd69cyMqlhWQm68JcItFO4S5HOed4efdhHly3lxd3NZIYN4Ubzy/gtotKWVKcrVm6SAxRuAvd/YP86o06Hnq1msqGTnLTk/ji1bP52IWl5GVoLV0kFincJ7GWrn4eerWah1+rprV7gEWFWfzTR87nhsUFuluRSIxTuE9CB9t6+PEf9vLv6/fTMzDE1fOn85eXl1NRquPSRYJC4T6J7Gns5Icv7eE3b9YRcrDy/Jl8+opZzJme4XdpIjLOFO6TQPXhLr73/G5++2YdCXFTuHV5CZ+6rJyinFS/SxORs0ThHmA1zd388wu7eeKNOhLijL94TzmrLyvXCUcik4DCPYAOtvXwLy9U8vjGGsyM2y4q5bNXziI/Q3czEpksFO4B0tE7wA9e2sNPX9lLyDk++q5iPnfluRRkpfhdmohMMIV7AAwMhXhs/X7u/Y/dNHX184ElM/nytXMpnqo1dZHJSuEew5xz/H5HPf/49NtUHe7iovKpPLRiAYuKsvwuTUR8pnCPUZUNHfzt2u2sq2xiVl4aP/14BVfNy9dx6iICKNxjTlffIN9/YTc/fXkvqYlx3LVyIbcuLyE+borfpYlIFFG4xwjnHL/bdpC/e3Inh9p7+UhFEf/zunlM02GNIjIKhXsMqG3p5s5fbePl3YdZODOT+z62jAtKc/wuS0SimMI9ioVCjkde38e3nn4bB3zj/Qu47eIy4qZoXV1ETkzhHqWqGjtZ88Q21lc3857ZuXzzQ4t0uQARiZjCPcqEQo4H1u3l28/uIil+Cvd8eDE3X1Cko2BE5JQo3KPIobZevvKLLbxSeZir5+fz9x9cxPRMXTJARE6dwj1KPPPWIdb8ait9AyG++aFFrHpXsWbrInLaIjo42syuM7NdZlZpZmvG6PMRM9thZtvN7NHxLTO4uvsHWfPEVj79yCaKc1J58vPv5pblJQp2ETkjJ525m1kccB9wDVALbDCztc65HcP6zAbuBC51zrWYWf7ZKjhI9jR28plHNrG7oZPPXDGLv756DonxOhlJRM5cJMsyy4FK51wVgJk9BqwEdgzr8yngPudcC4BzrmG8Cw2a3209yFd/uYWkhDj+9ZPLec/sPL9LEpEAiSTcC4GaYc9rgQtH9JkDYGbrgDjgG865Z8alwoDpHwzxzad38uC6apaWZHPfrcuYma1L8orI+Iok3Edb/HWjfJ/ZwBVAEfCymZ3nnGs95huZrQZWA5SUlJxysbGuqbOPzzzyBuurm/nEpWXcef18LcOIyFkRSbjXAsXDnhcBB0bp80fn3ACw18x2EQ77DcM7OefuB+4HqKioGPkPRKDtOtTB7Q9voLGjj++tWsLKJYV+lyQiARbJtHEDMNvMzjGzRGAVsHZEn98AVwKYWS7hZZqq8Sw0lr34dgN/9oNX6RsM8fO/vFjBLiJn3Uln7s65QTO7A3iW8Hr6A8657WZ2F7DRObfWe+1aM9sBDAH/wznXdDYLjxUPrtvL3U/uYH5BJj/5eIVueSciE8Kc82d1pKKiwm3cuNGXnz0RnHPc8+wufvDSHq5dMJ17Vy0hNVHnjInImTGzTc65ipP1U9qcBYNDIb726208vrGWWy8s4e6V5+lKjiIyoRTu46x3YIg7Ht3Mf+ys5wvvnc0Xr56ts01FZMIp3MdRT/8Qtz+8gdeqmrh75UJuu7jM75JEZJJSuI+T7v5Bbn9oI6/vbeI7N5/Ph5YV+V2SiExiCvdx0N0/yCcf2sD6vc3800eW8IGlOtRRRPylcD9DPf1DfOLBDWyobua7H9XJSSISHRTuZ2BgKMRnf7aJ9dXN3KtgF5EoogubnKZQyPHVX27lxV2N/P0HFinYRSSqKNxPg3OOu3+3g19vruMr187h1gsn30XQRCS6KdxPw4/+UMWD66r5xKVlfO7Kc/0uR0TkOAr3U/Ts9kN865m3uWFxAf/7hgU6QUlEopLC/RS8VdfGFx97k8VF2Xzn5vOZoksKiEiUUrhHqL69l794eCM5qQn8+L9fQHJCnN8liYiMSYdCRqB/MMSnH9lEe+8AT3zmEvIzkv0uSUTkhBTuEfiHp3ayeX8r/+9jy5hfkOl3OSIiJ6VlmZNYu+UAD71aze3vPocViwr8LkdEJCIK9xOobOhgzRNbuaA0hzXXz/O7HBGRiCncx9A7MMTnfraZlIQ47rt1GQlx+lWJSOzQmvsY/u+zu9hV38GDn3gXM7K0A1VEYoumo6N4tfIwP3llL7ddVMqVc/P9LkdE5JQp3Edo6xngK7/YQnluGneu0Dq7iMQmLcuMcNf/30F9Rx9PfOYSUhP16xGR2KSZ+zCv7D7ME2/U8unLy1lSnO13OSIip03h7unpH+Jrv97GOblp/NVVs/0uR0TkjGjdwfP9F3azv7mbRz91oa4bIyIxTzN34J36Dn78hypuvqCIS2bl+l2OiMgZm/Th7pzj7id3kJoYx9dWzPe7HBGRcTHpw/3FXQ28vPswX7h6DjlpiX6XIyIyLiZ1uA8Mhfi73+2kPDeN2y4q9bscEZFxM6nD/Wd/3EdVYxdfWzGfxPhJ/asQkYCZtInW1TfIP79QySWzpvHe+brEgIgEy6QN94dfq6apq5+vvG+ubnItIoEzKcO9o3eA+/9QxZVz81hWkuN3OSIi425ShvsDr1TT2j3Al66Z63cpIiJnxaQL9/beAX7yShXXLpjOoqIsv8sRETkrJl24P/r6fjp6B/n8e3X9GBEJrojC3cyuM7NdZlZpZmtO0O/DZubMrGL8Shw//YMhHly3l0vPncZ5hZq1i0hwnTTczSwOuA+4HlgA3GJmC0bplwF8Hnh9vIscL799s4769j5WXzbL71JERM6qSGbuy4FK51yVc64feAxYOUq/u4F7gN5xrG/chEKOH79cxbwZGVw2WxcHE5FgiyTcC4GaYc9rvbajzGwpUOyce3IcaxtXf9jdyDv1nay+rFzHtYtI4EUS7qMloTv6otkU4LvAl0/6jcxWm9lGM9vY2NgYeZXj4JE/7ic3PYkbF8+c0J8rIuKHSMK9Fige9rwIODDseQZwHvCSmVUDFwFrR9up6py73zlX4ZyryMvLO/2qT9GB1h5eeLuej76rSNeQEZFJIZKk2wDMNrNzzCwRWAWsPfKic67NOZfrnCtzzpUBfwRucs5tPCsVn4bHNtTggFXvKvG7FBGRCXHScHfODQJ3AM8CO4HHnXPbzewuM7vpbBd4pgaGQjy2fj9XzMmjeGqq3+WIiEyIiO6h6px7CnhqRNvXx+h7xZmXNX5eeLuBho4+/uFCXa9dRCaPwC9A//qNOnLTk7hi7sSt8YuI+C3Q4d7WPcALbzewcslM4uMCPVQRkWMEOvF+t+0g/UMhPri08OSdRUQCJNDh/pvNdZybn87CmZl+lyIiMqECG+61Ld2sr27mg0sLdUaqiEw6gQ33Z946BMCNiwt8rkREZOIFNtyf21HPvBkZlE5L87sUEZEJF8hwb+rsY2N1M9cumO53KSIivghkuD+/s4GQg2sXzvC7FBERXwQy3J/bcYjC7BQdJSMik1bgwr1vcIh1lU1cNS9fR8mIyKQVuHB/Y18rPQNDvEd3WxKRSSxw4f5KZSNxU4yLZk3zuxQREd8EL9x3H2ZJcTaZyQl+lyIi4ptAhXtrdz9b69p497lakhGRyS1Q4f7aniacQ+vtIjLpBSrc11c3k5wwhfOLs/0uRUTEV4EK9037Wji/KJsEXbtdRCa5wKRgd/8g2w+0U1GW43cpIiK+C0y4b6lpYyjkuKBU4S4iEphwf2N/CwDLShTuIiKBCfdN+1o4Nz+d7NREv0sREfFdIMLdOceWmlaW6CgZEREgIOFe395HU1c/5+kqkCIiQEDCfcfBNgAWFmb5XImISHQIRLhvr2sHYH6BZu4iIhCUcD/QTtm0VNKT4v0uRUQkKgQj3A+2sXCmlmRERI6I+XBv6xmgprmHBdqZKiJyVMyH+65DHQAKdxGRYWI+3CsbOgGYnZ/ucyUiItEj5sN9T2MnKQlxzMxK8bsUEZGoEfPhXtnQSXleGlOmmN+liIhEjZgP9z2NnczK05KMiMhwMR3uPf1D1LX2cK7W20VEjhFRuJvZdWa2y8wqzWzNKK9/ycx2mNlWM3vezErHv9TjVR3uxDk0cxcRGeGk4W5mccB9wPXAAuAWM1swottmoMI5txj4JXDPeBc6mv1N3QCU5aZOxI8TEYkZkczclwOVzrkq51w/8BiwcngH59yLzrlu7+kfgaLxLXN0tS09ABTlKNxFRIaLJNwLgZphz2u9trHcDjx9JkVFqralm8zkeLJSEibix4mIxIxIrrQ12jGGbtSOZv8NqAAuH+P11cBqgJKSkghLHFttSw+FmrWLiBwnkpl7LVA87HkRcGBkJzO7Gvgb4CbnXN9o38g5d79zrsI5V5GXl3c69R5bWEsPRTk6eUlEZKRIwn0DMNvMzjGzRGAVsHZ4BzNbCvyIcLA3jH+Zx3POUdvSrXAXERnFScPdOTcI3AE8C+wEHnfObTezu8zsJq/bt4F04Bdm9qaZrR3j242b1u4BuvqHtDNVRGQUEd3dwjn3FPDUiLavD9u+epzrOqk/HSmjmbuIyEgxe4bqwbZwuOuCYSIix4vZcG/oCO+zzc9M8rkSEZHoE9PhbgbT0hL9LkVEJOrEbLg3dvQxLS2R+LiYHYKIyFkTs8nY2NFLXkay32WIiESlmA33ho4+8jK03i4iMpqYDffGjj7yFe4iIqOKyXAPhRyNmrmLiIwpJsO9tWeAwZAjL13hLiIympgM95bufgCmpeswSBGR0cRkuLd2DwCQqeu4i4iMKibDvb0nHO7ZCncRkVHFZLi39oSXZXQHJhGR0cVkuLd5yzLZqVpzFxEZTUyGe6u3LJOZHNEVi0VEJp2YDPe2ngEykuJ1XRkRkTHEZDq2dQ+Qlar1dhGRscRmuPcMaGeqiMgJxGS4d/QNkqH1dhGRMcVkuPf0D5GaqHAXERlLTIZ7d/8gKQlxfpchIhK1YjLce/qHSElUuIuIjCUmw717YIhUhbuIyJhiM9w1cxcROaGYC/ehkKN/MERqgnaoioiMJebCvbt/EEDLMiIiJxBz4d7TPwSgZRkRkROIuXDv9sJdM3cRkbHFXLj3DCjcRUROJubCvfvosox2qIqIjCXmwr3Xm7knx8dc6SIiEybmErJ/KARAosJdRGRMMZeQA4PhcE/QjTpERMYUcwk5MOQAzdxFRE4k5hJyYEgzdxGRk4m5hOw/Gu7mcyUiItEronA3s+vMbJeZVZrZmlFeTzKzn3uvv25mZeNd6BFHZu6JmrmLiIzppAlpZnHAfcD1wALgFjNbMKLb7UCLc+5c4LvAt8a70CO0Q1VE5OQiScjlQKVzrso51w88Bqwc0Wcl8LC3/UvgvWZ2VtZNjuxQjdeyjIjImCIJ90KgZtjzWq9t1D7OuUGgDZg2HgWOVJabxopFM3S0jIjICURyDv9oU2R3Gn0ws9XAaoCSkpIIfvTxrlkwnWsWTD+t94qITBaRTH9rgeJhz4uAA2P1MbN4IAtoHvmNnHP3O+cqnHMVeXl5p1exiIicVCThvgGYbWbnmFkisApYO6LPWuDj3vaHgRecc8fN3EVEZGKcdFnGOTdoZncAzwJxwAPOue1mdhew0Tm3Fvgp8G9mVkl4xr7qbBYtIiInFtF1c51zTwFPjWj7+rDtXuDm8S1NREROlw45EREJIIW7iEgAKdxFRAJI4S4iEkDm1xGLZtYI7DvNt+cCh8exnFigMU8OGvPkcCZjLnXOnfREId/C/UyY2UbnXIXfdUwkjXly0Jgnh4kYs5ZlREQCSOEuIhJAsRru9/tdgA805slBY54czvqYY3LNXURETixWZ+4iInICMRfuJ7ufa7QzswfMrMHM3hrWNtXMfm9mu73HHK/dzOz73li3mtmyYe/5uNd/t5l9fFj7BWa2zXvP98/WHbEiZWbFZvaime00s+1m9gWvPchjTjaz9Wa2xRvz//Haz/HuMbzbu+dwotc+5j2IzexOr32Xmb1vWHtU/h2YWZyZbTazJ73ngR6zmVV7n703zWyj1xYdn23nXMx8Eb4q5R6gHEgEtgAL/K7rFMdwGbAMeGtY2z3AGm97DfAtb3sF8DThm6FcBLzutU8FqrzHHG87x3ttPXCx956nget9Hm8BsMzbzgDeIXwv3iCP2YB0bzsBeN0by+PAKq/9h8BnvO3PAj/0tlcBP/e2F3if8STgHO+zHxfNfwfAl4BHgSe954EeM1AN5I5oi4rPtu8fhlP8RV4MPDvs+Z3AnX7XdRrjKOPYcN8FFHjbBcAub/tHwC0j+wG3AD8a1v4jr60AeHtY+zH9ouEL+C1wzWQZM5AKvAFcSPiklXiv/ehnmfDltC/2tuO9fjby832kX7T+HRC+kc/zwFXAk94Ygj7mao4P96j4bMfaskwk93ONRdOdcwcBvMd8r32s8Z6ovXaU9qjg/a/3UsIz2UCP2VueeBNoAH5PeNbZ6sL3GIZj6xzrHsSn+rvw273AV4GQ93wawR+zA54zs00Wvo0oRMlnO6LruUeRiO7VGiBjjfdU231nZunAE8AXnXPtJ1g6DMSYnXNDwBIzywZ+DcwfrZv3eKpjG21S5uuYzexGoME5t8nMrjjSPErXwIzZc6lz7oCZ5QO/N7O3T9B3Qj/bsTZzj+R+rrGo3swKALzHBq99rPGeqL1olHZfmVkC4WD/mXPuV15zoMd8hHOuFXiJ8BprtoXvMQzH1jnWPYhP9Xfhp0uBm8ysGniM8NLMvQR7zDjnDniPDYT/EV9OtHy2/V6zOsX1rXjCOxvO4U87VRb6XddpjKOMY9fcv82xO2Du8bZv4NgdMOu99qnAXsI7X3K87aneaxu8vkd2wKzweawG/Ctw74j2II85D8j2tlOAl4EbgV9w7M7Fz3rbn+PYnYuPe9sLOXbnYhXhHYtR/XcAXMGfdqgGdsxAGpAxbPtV4Lpo+Wz7/kE4jV/oCsJHXOwB/sbvek6j/n8HDgIDhP9lvp3wWuPzwG7v8ch/WAPu88a6DagY9n0+CVR6X58Y1l4BvOW951/wTlTzcbzvJvy/kluBN72vFQEf82Jgszfmt4Cve+3lhI9+qPRCL8lrT/aeV3qvlw/7Xn/jjWsXw46UiOa/A44N98CO2RvbFu9r+5GaouWzrTNURUQCKNbW3EVEJAIKdxGRAFK4i4gEkMJdRCSAFO4iIgGkcBcRCSCFu4hIACncRUQC6L8AeMHSmicNQgUAAAAASUVORK5CYII=\n", 228 | "text/plain": [ 229 | "
" 230 | ] 231 | }, 232 | "metadata": { 233 | "needs_background": "light" 234 | }, 235 | "output_type": "display_data" 236 | } 237 | ], 238 | "source": [ 239 | "top_n = 50000\n", 240 | "x = list(range(top_n))\n", 241 | "cumsum = df.iloc[:top_n]['rating_count'].cumsum()/total_ratings\n", 242 | "plt.plot(x, cumsum);" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 7, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "436796" 254 | ] 255 | }, 256 | "execution_count": 7, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "len(df)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 19, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "4417" 274 | ] 275 | }, 276 | "execution_count": 19, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "cutoff_ind = cumsum[cumsum>0.5].index[0]\n", 283 | "df = df.iloc[:cutoff_ind]\n", 284 | "# df = df.iloc[800:820]\n", 285 | "cutoff_ind" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 12, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "asin_list = list(df['asin'])" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 13, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "asin_list = asin_list[5000:5020]" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 14, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "def get_overall_rating(asin, rating_dict):\n", 313 | " sess = HTMLSession()\n", 314 | " r = sess.get(f\"https://www.audible.com/pd/{asin}\")\n", 315 | " rating_dict[asin] = np.float(r.html.find('div.bc-row.bc-spacing-small', \n", 316 | " first=False)[1].text.split(' stars ')[1][:3])\n", 317 | " sess.close()\n", 318 | " return rating_dict\n", 319 | "\n", 320 | "\n", 321 | "def get_reviews(asin):\n", 322 | " baseurl = f'https://www.audible.com/pd/reviews?country=US&asin={asin}&page='\n", 323 | " page_num = 0\n", 324 | " ratings_reviews = []\n", 325 | " sess = HTMLSession()\n", 326 | " \n", 327 | " while True:\n", 328 | " try:\n", 329 | " url = baseurl + str(page_num)\n", 330 | " r = sess.get(url).html\n", 331 | " page_elements = r.find('div.bc-row-responsive.bc-spacing-top-medium', first=False)\n", 332 | " for elem in page_elements:\n", 333 | " review = elem.find(f'div.bc-col-responsive.USreviews{page_num}.bc-col-9', first=True).text\n", 334 | " ratings = [item.text[0] for item in elem.find('span.bc-text')]\n", 335 | " ratings_reviews.append((review, *ratings))\n", 336 | " page_num += 1\n", 337 | " \n", 338 | " except:\n", 339 | " print(page_num)\n", 340 | " break\n", 341 | " \n", 342 | " sess.close()\n", 343 | " return ratings_reviews\n", 344 | "\n", 345 | "\n", 346 | "def get_ratings_and_reviews(asins, rating_dict, reviews):\n", 347 | " for asin in asins:\n", 348 | " rating_dict = (get_overall_rating(asin, rating_dict))\n", 349 | " reviews.extend(get_reviews(asin))\n", 350 | "# print(reviews)\n", 351 | " return rating_dict, reviews" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 15, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "def threaded_ratings(nthreads, asins, rating_dict=None, reviews=None):\n", 361 | " if rating_dict == None:\n", 362 | " rating_dict = {}\n", 363 | " \n", 364 | " if reviews == None:\n", 365 | " reviews = []\n", 366 | " \n", 367 | " threads = []\n", 368 | " for i in range(nthreads):\n", 369 | " asin_group = asins[i::nthreads]\n", 370 | " t = Thread(target=get_ratings_and_reviews, args=(asin_group, rating_dict, reviews))\n", 371 | " threads.append(t)\n", 372 | " \n", 373 | " [t.start() for t in threads]\n", 374 | " [t.join() for t in threads]\n", 375 | " \n", 376 | " return rating_dict, reviews" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 16, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "6\n", 389 | "6\n", 390 | "10\n", 391 | "13\n", 392 | "5\n", 393 | "15\n", 394 | "15\n", 395 | "9\n", 396 | "9\n", 397 | "5\n", 398 | "13\n", 399 | "10\n", 400 | "13\n", 401 | "16\n", 402 | "15\n", 403 | "6\n", 404 | "15\n", 405 | "29\n", 406 | "9\n", 407 | "9\n", 408 | "29.16919183731079\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "start = time.time()\n", 414 | "rating_dict = {}\n", 415 | "rating_dict, reviews = threaded_ratings(6, asin_list, rating_dict)\n", 416 | "end = time.time()\n", 417 | "print(end-start)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 17, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "data": { 427 | "text/plain": [ 428 | "2216" 429 | ] 430 | }, 431 | "execution_count": 17, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "len(reviews)" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "# url = 'https://www.audible.com/pd/reviews?country=US&asin=B00JU4QCMC&page=0'\n", 447 | "# sess = HTMLSession()\n", 448 | "# r = sess.get(url).html\n", 449 | "# page_elements = r.find('div.bc-row-responsive.bc-spacing-top-medium', first=False)\n", 450 | " " 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "# page_elements" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "rev_df = pd.DataFrame(data=reviews, columns=['text', 'overall', 'performance', 'story'])" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "rev_df['overall'] = rev_df['overall'].astype(str)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "nums = ['1','2','3','4','5']\n", 487 | "scores = rev_df[rev_df.isin(nums)].drop('text', axis=1).dropna().astype(int)" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "scores.hist()" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "rev_df['text_length'] = rev_df['text'].apply(len)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "rev_df['text_length'].hist(bins=50, range=(0, 2000))" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "rev_df.sort_values('text_length', inplace=True)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "y = rev_df['text_length'].cumsum()/rev_df['text_length'].sum()" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "import matplotlib.pyplot as plt\n", 542 | "plt.plot(rev_df['text_length'], y)" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "txt = 'i was impressed by the overall narrative, but felt the author could have used some better organization'\n", 552 | "len(txt)" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "alltext = list(rev_df['text'].values)" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "allwords = ' '.join(alltext)" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "allwords.split()[:50]" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": null, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [] 588 | } 589 | ], 590 | "metadata": { 591 | "kernelspec": { 592 | "display_name": "Python 3", 593 | "language": "python", 594 | "name": "python3" 595 | }, 596 | "language_info": { 597 | "codemirror_mode": { 598 | "name": "ipython", 599 | "version": 3 600 | }, 601 | "file_extension": ".py", 602 | "mimetype": "text/x-python", 603 | "name": "python", 604 | "nbconvert_exporter": "python", 605 | "pygments_lexer": "ipython3", 606 | "version": "3.7.2" 607 | } 608 | }, 609 | "nbformat": 4, 610 | "nbformat_minor": 2 611 | } 612 | -------------------------------------------------------------------------------- /audible_eda/audible_prices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from requests_html import HTMLSession, HTML\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from datetime import datetime\n", 13 | "import nest_asyncio" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "url = 'https://www.audible.com/search?pf_rd_p=1d79b443-2f1d-43a3-b1dc-31a2cd242566&pf_rd_r=HK8P1MY097JB8VJ6PRTQ&ref=a_search_c4_pageSize_3&keywords=the+great+courses&pageSize=50'" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": { 29 | "scrolled": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "def scrape_great_courses(url):\n", 34 | " sess = HTMLSession()\n", 35 | " data = []\n", 36 | " pages = 15\n", 37 | " for page in range(pages):\n", 38 | " pageurl = url + '&page=' + str(page+1)\n", 39 | " r = sess.get(pageurl)\n", 40 | " items = r.html.find('li.bc-list-item.productListItem', first=False)\n", 41 | " for i, item in enumerate(items):\n", 42 | " text_fields = item.text.split('\\n')\n", 43 | " dict_entry={\n", 44 | " 'title' : text_fields[0],\n", 45 | " 'price' : np.float([s for s in text_fields if 'Regular' in s][0].split('$')[1]),\n", 46 | " 'length' : [s for s in text_fields if 'Length' in s][0].split(': ')[1],\n", 47 | " 'rating' : np.float([s for s in text_fields if 'stars' in s][0].split(' out')[0]),\n", 48 | " 'rating_count' : np.int([s for s in text_fields if 'stars' in s][0].split(\n", 49 | " 'stars ')[1].replace(',','')),\n", 50 | " 'link' : 'https://www.audible.com' + [link for link in item.links if '/pd/' in link][0],\n", 51 | " }\n", 52 | " if any(['Series:' in s for s in text_fields]):\n", 53 | " dict_entry['series'] = [s for s in text_fields if 'Series:' in s][0].split('Series: ')[1]\n", 54 | " else:\n", 55 | " dict_entry['series'] = 'N/A'\n", 56 | " if any(['Release date:' in s for s in text_fields]):\n", 57 | " dict_entry['release_date'] = datetime.strptime([s for s in text_fields if 'Release date:' in s][0].split(\n", 58 | " ': ')[1], '%m-%d-%y')\n", 59 | " data.append(dict_entry)\n", 60 | " return data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "df = pd.DataFrame(data=scrape_great_courses(url))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": { 76 | "scrolled": true 77 | }, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/html": [ 82 | "
\n", 83 | "\n", 96 | "\n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | "
lengthlinkpriceratingrating_countrelease_dateseriestitle
043 hrs and 23 minshttps://www.audible.com/pd/The-History-of-the-...59.954.523152013-07-08The Great Courses: Modern HistoryThe History of the United States, 2nd Edition
112 hrs and 39 minshttps://www.audible.com/pd/Your-Best-Brain-The...34.954.526292014-11-14The Great Courses: PsychologyYour Best Brain: The Science of Brain Improvement
218 hrs and 15 minshttps://www.audible.com/pd/The-Story-of-Human-...41.954.536132013-07-08The Great Courses: LinguisticsThe Story of Human Language
336 hrs and 34 minshttps://www.audible.com/pd/How-to-Listen-to-an...59.954.523372013-07-08The Great Courses: Fine Arts & MusicHow to Listen to and Understand Great Music, 3...
431 hrs and 18 minshttps://www.audible.com/pd/Critical-Business-S...59.954.521712015-04-08The Great Courses: ProfessionalCritical Business Skills for Success
\n", 168 | "
" 169 | ], 170 | "text/plain": [ 171 | " length link \\\n", 172 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n", 173 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n", 174 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n", 175 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n", 176 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n", 177 | "\n", 178 | " price rating rating_count release_date \\\n", 179 | "0 59.95 4.5 2315 2013-07-08 \n", 180 | "1 34.95 4.5 2629 2014-11-14 \n", 181 | "2 41.95 4.5 3613 2013-07-08 \n", 182 | "3 59.95 4.5 2337 2013-07-08 \n", 183 | "4 59.95 4.5 2171 2015-04-08 \n", 184 | "\n", 185 | " series \\\n", 186 | "0 The Great Courses: Modern History \n", 187 | "1 The Great Courses: Psychology \n", 188 | "2 The Great Courses: Linguistics \n", 189 | "3 The Great Courses: Fine Arts & Music \n", 190 | "4 The Great Courses: Professional \n", 191 | "\n", 192 | " title \n", 193 | "0 The History of the United States, 2nd Edition \n", 194 | "1 Your Best Brain: The Science of Brain Improvement \n", 195 | "2 The Story of Human Language \n", 196 | "3 How to Listen to and Understand Great Music, 3... \n", 197 | "4 Critical Business Skills for Success " 198 | ] 199 | }, 200 | "execution_count": 5, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "df.head()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 6, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "def scrape_sale_courses():\n", 216 | " data = []\n", 217 | " pages = 5\n", 218 | " for page in range(pages):\n", 219 | " pageurl = '/Users/tobymanders/Desktop/{}.html'.format(page+1)\n", 220 | " with open(pageurl) as file:\n", 221 | " html = file.read()\n", 222 | " r = HTML(html=html)\n", 223 | " items = r.find('li.bc-list-item.productListItem', first=False)\n", 224 | " for i, item in enumerate(items):\n", 225 | " text_fields = item.text.split('\\n')\n", 226 | " dict_entry={\n", 227 | " 'title' : text_fields[0],\n", 228 | " 'sale' : 'Yes',\n", 229 | " }\n", 230 | " if any(['Member' in s for s in text_fields]):\n", 231 | " dict_entry['member-price'] = np.float([s for s in text_fields if 'Member' in s][0].split('$')[1].split(' or')[0])\n", 232 | " data.append(dict_entry)\n", 233 | " return data" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 7, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "sale_df = pd.DataFrame(data=scrape_sale_courses())" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 8, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "df_merge = df.join(sale_df.set_index('title'), on='title')" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 9, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/html": [ 262 | "
\n", 263 | "\n", 276 | "\n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | "
lengthlinkpriceratingrating_countrelease_dateseriestitlemember-pricesale
043 hrs and 23 minshttps://www.audible.com/pd/The-History-of-the-...59.954.523152013-07-08The Great Courses: Modern HistoryThe History of the United States, 2nd EditionNaNYes
112 hrs and 39 minshttps://www.audible.com/pd/Your-Best-Brain-The...34.954.526292014-11-14The Great Courses: PsychologyYour Best Brain: The Science of Brain Improvement24.46Yes
218 hrs and 15 minshttps://www.audible.com/pd/The-Story-of-Human-...41.954.536132013-07-08The Great Courses: LinguisticsThe Story of Human LanguageNaNYes
336 hrs and 34 minshttps://www.audible.com/pd/How-to-Listen-to-an...59.954.523372013-07-08The Great Courses: Fine Arts & MusicHow to Listen to and Understand Great Music, 3...41.96Yes
431 hrs and 18 minshttps://www.audible.com/pd/Critical-Business-S...59.954.521712015-04-08The Great Courses: ProfessionalCritical Business Skills for Success41.96Yes
\n", 360 | "
" 361 | ], 362 | "text/plain": [ 363 | " length link \\\n", 364 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n", 365 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n", 366 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n", 367 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n", 368 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n", 369 | "\n", 370 | " price rating rating_count release_date \\\n", 371 | "0 59.95 4.5 2315 2013-07-08 \n", 372 | "1 34.95 4.5 2629 2014-11-14 \n", 373 | "2 41.95 4.5 3613 2013-07-08 \n", 374 | "3 59.95 4.5 2337 2013-07-08 \n", 375 | "4 59.95 4.5 2171 2015-04-08 \n", 376 | "\n", 377 | " series \\\n", 378 | "0 The Great Courses: Modern History \n", 379 | "1 The Great Courses: Psychology \n", 380 | "2 The Great Courses: Linguistics \n", 381 | "3 The Great Courses: Fine Arts & Music \n", 382 | "4 The Great Courses: Professional \n", 383 | "\n", 384 | " title member-price sale \n", 385 | "0 The History of the United States, 2nd Edition NaN Yes \n", 386 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n", 387 | "2 The Story of Human Language NaN Yes \n", 388 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n", 389 | "4 Critical Business Skills for Success 41.96 Yes " 390 | ] 391 | }, 392 | "execution_count": 9, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "df_merge.head()" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 10, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "df_merge['sale'] = df_merge['sale'].fillna('No')" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 11, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/html": [ 418 | "
\n", 419 | "\n", 432 | "\n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | "
lengthlinkpriceratingrating_countrelease_dateseriestitlemember-pricesale
043 hrs and 23 minshttps://www.audible.com/pd/The-History-of-the-...59.954.523152013-07-08The Great Courses: Modern HistoryThe History of the United States, 2nd EditionNaNYes
112 hrs and 39 minshttps://www.audible.com/pd/Your-Best-Brain-The...34.954.526292014-11-14The Great Courses: PsychologyYour Best Brain: The Science of Brain Improvement24.46Yes
218 hrs and 15 minshttps://www.audible.com/pd/The-Story-of-Human-...41.954.536132013-07-08The Great Courses: LinguisticsThe Story of Human LanguageNaNYes
336 hrs and 34 minshttps://www.audible.com/pd/How-to-Listen-to-an...59.954.523372013-07-08The Great Courses: Fine Arts & MusicHow to Listen to and Understand Great Music, 3...41.96Yes
431 hrs and 18 minshttps://www.audible.com/pd/Critical-Business-S...59.954.521712015-04-08The Great Courses: ProfessionalCritical Business Skills for Success41.96Yes
\n", 516 | "
" 517 | ], 518 | "text/plain": [ 519 | " length link \\\n", 520 | "0 43 hrs and 23 mins https://www.audible.com/pd/The-History-of-the-... \n", 521 | "1 12 hrs and 39 mins https://www.audible.com/pd/Your-Best-Brain-The... \n", 522 | "2 18 hrs and 15 mins https://www.audible.com/pd/The-Story-of-Human-... \n", 523 | "3 36 hrs and 34 mins https://www.audible.com/pd/How-to-Listen-to-an... \n", 524 | "4 31 hrs and 18 mins https://www.audible.com/pd/Critical-Business-S... \n", 525 | "\n", 526 | " price rating rating_count release_date \\\n", 527 | "0 59.95 4.5 2315 2013-07-08 \n", 528 | "1 34.95 4.5 2629 2014-11-14 \n", 529 | "2 41.95 4.5 3613 2013-07-08 \n", 530 | "3 59.95 4.5 2337 2013-07-08 \n", 531 | "4 59.95 4.5 2171 2015-04-08 \n", 532 | "\n", 533 | " series \\\n", 534 | "0 The Great Courses: Modern History \n", 535 | "1 The Great Courses: Psychology \n", 536 | "2 The Great Courses: Linguistics \n", 537 | "3 The Great Courses: Fine Arts & Music \n", 538 | "4 The Great Courses: Professional \n", 539 | "\n", 540 | " title member-price sale \n", 541 | "0 The History of the United States, 2nd Edition NaN Yes \n", 542 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n", 543 | "2 The Story of Human Language NaN Yes \n", 544 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n", 545 | "4 Critical Business Skills for Success 41.96 Yes " 546 | ] 547 | }, 548 | "execution_count": 11, 549 | "metadata": {}, 550 | "output_type": "execute_result" 551 | } 552 | ], 553 | "source": [ 554 | "df_merge.head()" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 12, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "def fix_mins(x):\n", 564 | " if 'min' in x and 'hr' in x:\n", 565 | " hrs = x.split(' hr')[0]\n", 566 | " mins = x.split(' min')[0].split('and ')[1]\n", 567 | " len_ = make_len(hrs, mins)\n", 568 | " elif 'min' in x:\n", 569 | " mins = x.split(' min')[0]\n", 570 | " len_ = make_len('00', mins)\n", 571 | " elif 'hr' in x:\n", 572 | " hrs = x.split(' hr')[0]\n", 573 | " len_ = make_len(hrs, '00')\n", 574 | " else:\n", 575 | " len_ = make_len('00', '00')\n", 576 | " return len_\n", 577 | "\n", 578 | "def make_len(hrs, mins):\n", 579 | " if len(hrs)<2:\n", 580 | " hrs = '0' + hrs\n", 581 | " if len(mins)<2:\n", 582 | " mins = '0' + mins\n", 583 | " return hrs + ' hrs ' + mins + ' mins'" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 13, 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [ 592 | "df_merge['length'] = df_merge['length'].apply(fix_mins)" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 14, 598 | "metadata": {}, 599 | "outputs": [], 600 | "source": [ 601 | "columns = ['title', 'sale', 'price', 'member-price', 'length', 'rating', 'rating_count', 'release_date', 'series', 'link']" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 15, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "df_merge.to_csv('great_courses_all_titles_v2.csv', columns=columns, index=False)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 16, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "url_list = list(df_merge['link'])" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 17, 625 | "metadata": {}, 626 | "outputs": [ 627 | { 628 | "data": { 629 | "text/plain": [ 630 | "724" 631 | ] 632 | }, 633 | "execution_count": 17, 634 | "metadata": {}, 635 | "output_type": "execute_result" 636 | } 637 | ], 638 | "source": [ 639 | "len(df_merge)" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 50, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "def get_accurate_ratings(addresses, rating_dict):\n", 649 | " for address in addresses:\n", 650 | " sess = HTMLSession()\n", 651 | " r = sess.get(address)\n", 652 | " try:\n", 653 | " rating_dict[address] = np.float(r.html.find('div.bc-row.bc-spacing-small', first=False)[1].text.split(' stars ')[1][:3])\n", 654 | " except:\n", 655 | " rating_dict[address] = np.nan\n", 656 | " \n", 657 | " return rating_dict" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 51, 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [ 666 | "from threading import Thread\n", 667 | "\n", 668 | "def threaded_ratings(nthreads, addresses, rating_dict=None):\n", 669 | " if rating_dict == None:\n", 670 | " rating_dict = {}\n", 671 | " threads = []\n", 672 | " for i in range(nthreads):\n", 673 | " add_subset = addresses[i::nthreads]\n", 674 | " t = Thread(target=get_accurate_ratings, args=(add_subset, rating_dict))\n", 675 | " threads.append(t)\n", 676 | " [t.start() for t in threads]\n", 677 | " [t.join() for t in threads]\n", 678 | " \n", 679 | " return rating_dict" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": 52, 685 | "metadata": {}, 686 | "outputs": [], 687 | "source": [ 688 | "rating_dict = threaded_ratings(64, url_list)" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 58, 694 | "metadata": {}, 695 | "outputs": [], 696 | "source": [ 697 | "df_merge['rating'] = df_merge['link'].apply(lambda x: rating_dict[x])" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 59, 703 | "metadata": {}, 704 | "outputs": [ 705 | { 706 | "data": { 707 | "text/html": [ 708 | "
\n", 709 | "\n", 722 | "\n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | "
lengthlinkpriceratingrating_countrelease_dateseriestitlemember-pricesale
043 hrs 23 minshttps://www.audible.com/pd/The-History-of-the-...59.954.723152013-07-08The Great Courses: Modern HistoryThe History of the United States, 2nd EditionNaNYes
112 hrs 39 minshttps://www.audible.com/pd/Your-Best-Brain-The...34.954.526292014-11-14The Great Courses: PsychologyYour Best Brain: The Science of Brain Improvement24.46Yes
218 hrs 15 minshttps://www.audible.com/pd/The-Story-of-Human-...41.954.736132013-07-08The Great Courses: LinguisticsThe Story of Human LanguageNaNYes
336 hrs 34 minshttps://www.audible.com/pd/How-to-Listen-to-an...59.954.723372013-07-08The Great Courses: Fine Arts & MusicHow to Listen to and Understand Great Music, 3...41.96Yes
431 hrs 18 minshttps://www.audible.com/pd/Critical-Business-S...59.954.621712015-04-08The Great Courses: ProfessionalCritical Business Skills for Success41.96Yes
\n", 806 | "
" 807 | ], 808 | "text/plain": [ 809 | " length link price \\\n", 810 | "0 43 hrs 23 mins https://www.audible.com/pd/The-History-of-the-... 59.95 \n", 811 | "1 12 hrs 39 mins https://www.audible.com/pd/Your-Best-Brain-The... 34.95 \n", 812 | "2 18 hrs 15 mins https://www.audible.com/pd/The-Story-of-Human-... 41.95 \n", 813 | "3 36 hrs 34 mins https://www.audible.com/pd/How-to-Listen-to-an... 59.95 \n", 814 | "4 31 hrs 18 mins https://www.audible.com/pd/Critical-Business-S... 59.95 \n", 815 | "\n", 816 | " rating rating_count release_date series \\\n", 817 | "0 4.7 2315 2013-07-08 The Great Courses: Modern History \n", 818 | "1 4.5 2629 2014-11-14 The Great Courses: Psychology \n", 819 | "2 4.7 3613 2013-07-08 The Great Courses: Linguistics \n", 820 | "3 4.7 2337 2013-07-08 The Great Courses: Fine Arts & Music \n", 821 | "4 4.6 2171 2015-04-08 The Great Courses: Professional \n", 822 | "\n", 823 | " title member-price sale \n", 824 | "0 The History of the United States, 2nd Edition NaN Yes \n", 825 | "1 Your Best Brain: The Science of Brain Improvement 24.46 Yes \n", 826 | "2 The Story of Human Language NaN Yes \n", 827 | "3 How to Listen to and Understand Great Music, 3... 41.96 Yes \n", 828 | "4 Critical Business Skills for Success 41.96 Yes " 829 | ] 830 | }, 831 | "execution_count": 59, 832 | "metadata": {}, 833 | "output_type": "execute_result" 834 | } 835 | ], 836 | "source": [ 837 | "df_merge.head()" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": 62, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "df_merge.to_csv('great_courses_list_v3.csv', columns=columns, index=False)" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": {}, 853 | "outputs": [], 854 | "source": [] 855 | } 856 | ], 857 | "metadata": { 858 | "kernelspec": { 859 | "display_name": "Python 3", 860 | "language": "python", 861 | "name": "python3" 862 | }, 863 | "language_info": { 864 | "codemirror_mode": { 865 | "name": "ipython", 866 | "version": 3 867 | }, 868 | "file_extension": ".py", 869 | "mimetype": "text/x-python", 870 | "name": "python", 871 | "nbconvert_exporter": "python", 872 | "pygments_lexer": "ipython3", 873 | "version": "3.7.2" 874 | } 875 | }, 876 | "nbformat": 4, 877 | "nbformat_minor": 2 878 | } 879 | -------------------------------------------------------------------------------- /audible_eda/audible_reviews_scraper.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import seaborn as sns\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "%matplotlib inline\n", 15 | "# import nest_asyncio\n", 16 | "# nest_asyncio.apply()\n", 17 | "from requests_html import HTML, HTMLSession, AsyncHTMLSession\n", 18 | "from threading import Thread\n", 19 | "import time\n", 20 | "import string" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "df = pd.read_csv(r'C:\\Users\\Toby-PC\\Documents\\code\\audible\\all_english_audible.csv')\n", 30 | "df = df[~df['asin'].isnull()]\n", 31 | "\n", 32 | "image_path = r'C:\\Users\\Toby-PC\\Documents\\code\\audible\\figures'\n", 33 | "\n", 34 | "def save_fig(fig_name, tight_layout=True):\n", 35 | " path = os.path.join(image_path, fig_name + '.png')\n", 36 | " print(\"Saving figure\", fig_name)\n", 37 | " if tight_layout:\n", 38 | " plt.tight_layout()\n", 39 | " plt.savefig(path, format='png', dpi=300)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "df = df.drop_duplicates()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "64023215.0" 60 | ] 61 | }, 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "total_ratings = df['rating_count'].sum()\n", 69 | "total_ratings" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df = df.sort_values('rating_count', ascending=False)\n", 79 | "df.reset_index(inplace=True)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 6, 85 | "metadata": { 86 | "scrolled": false 87 | }, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/html": [ 92 | "
\n", 93 | "\n", 106 | "\n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
indexasinauthorcategorylengthlinknarratorpriceratingrating_countrelease_datetitle
0247928B005FRGT44Ernest ClineSci-Fi & Fantasy15 hrs and 40 minshttps://www.audible.com/pd/Ready-Player-One-Au...Wil Wheaton31.505.0216094.02011-08-16Ready Player One
1248009B00B5HZGUGAndy WeirSci-Fi & Fantasy10 hrs and 53 minshttps://www.audible.com/pd/The-Martian-Audiobo...R. C. Bray29.995.0164988.02013-03-22The Martian
2142087B00QXW5GYYPaula HawkinsMysteries & Thrillers10 hrs and 58 minshttps://www.audible.com/pd/The-Girl-on-the-Tra...Clare Corbett, Louise Brealey, India Fisher28.004.5133818.02015-01-13The Girl on the Train
34895B01IW9TQPKTrevor NoahBios & Memoirs8 hrs and 44 minshttps://www.audible.com/pd/Born-a-Crime-Audiob...Trevor Noah24.955.0123838.02016-11-15Born a Crime
4282008B01I28NFEEMark MansonSelf Development5 hrs and 17 minshttps://www.audible.com/pd/The-Subtle-Art-of-N...Roger Wayne23.954.5113261.02016-09-13The Subtle Art of Not Giving a F*ck
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " index asin author category \\\n", 206 | "0 247928 B005FRGT44 Ernest Cline Sci-Fi & Fantasy \n", 207 | "1 248009 B00B5HZGUG Andy Weir Sci-Fi & Fantasy \n", 208 | "2 142087 B00QXW5GYY Paula Hawkins Mysteries & Thrillers \n", 209 | "3 4895 B01IW9TQPK Trevor Noah Bios & Memoirs \n", 210 | "4 282008 B01I28NFEE Mark Manson Self Development \n", 211 | "\n", 212 | " length link \\\n", 213 | "0 15 hrs and 40 mins https://www.audible.com/pd/Ready-Player-One-Au... \n", 214 | "1 10 hrs and 53 mins https://www.audible.com/pd/The-Martian-Audiobo... \n", 215 | "2 10 hrs and 58 mins https://www.audible.com/pd/The-Girl-on-the-Tra... \n", 216 | "3 8 hrs and 44 mins https://www.audible.com/pd/Born-a-Crime-Audiob... \n", 217 | "4 5 hrs and 17 mins https://www.audible.com/pd/The-Subtle-Art-of-N... \n", 218 | "\n", 219 | " narrator price rating rating_count \\\n", 220 | "0 Wil Wheaton 31.50 5.0 216094.0 \n", 221 | "1 R. C. Bray 29.99 5.0 164988.0 \n", 222 | "2 Clare Corbett, Louise Brealey, India Fisher 28.00 4.5 133818.0 \n", 223 | "3 Trevor Noah 24.95 5.0 123838.0 \n", 224 | "4 Roger Wayne 23.95 4.5 113261.0 \n", 225 | "\n", 226 | " release_date title \n", 227 | "0 2011-08-16 Ready Player One \n", 228 | "1 2013-03-22 The Martian \n", 229 | "2 2015-01-13 The Girl on the Train \n", 230 | "3 2016-11-15 Born a Crime \n", 231 | "4 2016-09-13 The Subtle Art of Not Giving a F*ck " 232 | ] 233 | }, 234 | "execution_count": 6, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "df.head()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 7, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "Saving figure Index vs Cum Reviews\n" 253 | ] 254 | }, 255 | { 256 | "data": { 257 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjgAAAGECAYAAAA7lVplAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3XtcVHX+P/DXXLnNcBlARRFEFPNGyGpqiNfQvCDeKLOlbcvuWW61pVarlbm2W+2W29dqt2zX+q2Wum73TNdbpGUmXlLwjtzlzswAczuf3x/gKCoiChw483o+Hj6GmXPmzHvmM3Befj6fc45KCCFAREREpCBquQsgIiIiamkMOERERKQ4DDhERESkOAw4REREpDgMOERERKQ4DDhERESkOAw41Or69OmD5ORkpKSkYNq0aZgwYQJmzpyJgwcPXvM2n332WXz//fctWGWdPn36oKysrFnPefHFF7FixYoWr6UlbN26FWlpaUhJScHkyZMxf/58FBQUtMlrjx07tsk2PnDgAP7whz8AAA4ePIjHHnusRV57w4YN+NWvfuX+zqWkpGD27NnYt29fi2z/WuXk5GDevHnNft7KlSsxevRoLFy4sMHjjz32GFJSUpCSktLg9ywtLe2K2ztz5gwef/zxJl/3L3/5C15++eVLHv/+++8RGxvrfu2UlBSMGzcODz30ECoqKprc7sKFC5GZmen+effu3U0+h6i5tHIXQJ7hn//8J0wmk/v+e++9h6VLl2Lt2rXXtL3L/dGlhj777DOsXLkSK1euRGRkJIQQePfdd3HXXXfhiy++gF6vl7tEHD9+HEVFRQCAgQMH4s0332yxbQ8ePBjvvPOO+/7//vc/zJs3D9u2bYNWK8+fvvz8fJw6darZz1u3bh1effVVDB48uMHjF35effr0ueT3rDG5ubk4ffp0s+u4UFRUFP773/+67zudTjzyyCP44IMPMH/+/Cs+Nz09HXfddRcA4I9//ON11UHUGPbgUJtzOp0oKChAQECA+7GVK1di+vTpSElJwcMPP4yioiKcOnUKQ4cOhd1uBwC4XC4kJibixIkTSEtLw9dffw0A+PnnnzFnzhxMnz4dM2fOxNatW+FyuTBs2DBkZ2cDAN555x2MGTPG/Xp33303tm/f3miNubm5uOWWW/DSSy9h1qxZGD9+PL799lsAgMViweOPP44JEyYgLS0NJ0+edD+vqKgIjzzyCGbMmIHk5GS8/fbbAIDdu3dj6NChKCoqgiRJSEtLw1tvvdXgNa/0fjdt2oTp06djxowZSE1NxZ49e5r8nP/yl7/g2WefRWRkJABApVLh/vvvx7x582C327FhwwY88MAD7vUvvL9gwQIsWbIEc+bMwYQJE/DKK6/gnXfewezZszFu3Djs2rXLvd57773n3sbF9wFAkiQsXboUqampmDRpEiZOnIi9e/eioKAAb775Jn766ScsXLgQP/zwA6ZMmQKz2Yz4+HgUFxe7t5Gamort27fDbrdj2bJlmD59OqZOnYoFCxbAYrE0+VkAwPDhw1FcXIyqqqorbmfs2LGYP38+Jk6ciG+//RanTp1CWloaJk+ejOTkZHz55ZcAGm/rxr47LpcLzz33HM6cOYN77733kvoKCwvx4IMPIjk5GVOmTME//vEPAMD8+fNRVFSEZ5991v3aV+ubb77BtGnTMHXqVMyZMwcHDx6E3W7H4sWLcerUKdx3330AgLfeeguzZs1CcnIybrnlFmzZsqVZrwPU/V6Ul5cjMDAQALB3717ceeedmDVrFkaNGoXnn38eAPDqq6+irKwM8+fPx8GDB3HHHXdg8+bNyM7OxoQJE7BkyRLMnDkT48ePd9dhtVrx1FNPYcKECZg1axaeeeYZPPvsswCADz/8EFOnTsXMmTNx55134sSJE82unRRKELWymJgYMWXKFDFlyhSRkJAgxo4dK1566SVRUlIihBDiP//5j5g/f75wOBxCCCHWrFkj5s6dK4QQ4s477xRfffWVEEKIbdu2idmzZwshhPj1r38tvvrqK1FRUSHGjx8vcnJyhBBCFBYWipEjR4q8vDyxYMECsXr1avd2EhISxMmTJ0VVVZUYOnSosNlsl621tLRU5OTkiJiYGPG///1PCCHE119/LUaPHi2EEOLll18WTz/9tJAkSZSWloqRI0eKN998UwghRFpamtiyZYsQQoja2lqRlpYmvvjiCyGEEK+//rqYO3euWLFihbjnnnuEy+W65PUbe7/jxo0T+/btE0IIsXPnTrFixYorfuZlZWUiJiZGVFdXN7rO+vXrxf3333/Z+88884xITU0VdrtdnD17VsTExIh//etfQgghPvjgA/Hb3/7Wvd4//vEP9zYuvD9mzBhx4MAB8fPPP4t58+a53+8777wjHnjggUtec/fu3WLy5MlCCCGefvpp93aOHz8uRo8eLVwul1ixYoVYvny5kCRJCCHEa6+9JhYvXtzke5MkSaxatUpMmTJFCCGuuJ0xY8aIv/3tb+7nTps2TXz44YdCCCHy8/PFuHHjhNlsbrStr/TdufA9XuzOO+8U77//vhBCiKqqKpGcnCw+//zzBp/llZz77p5z9OhRkZCQ4P7d2Llzp0hISBAWi0Wkp6eLqVOnCiGEOHPmjPjNb34jamtrhRBCbNy4UaSkpAgh6r6zS5cuveS10tPTxcCBA8XUqVPFxIkTxbBhw8S0adPEu+++6/49fuyxx8SePXuEEEKYzWYxZMgQceTIESGEEImJieLw4cNCCCFmz54tvv32W3H69GkRExMjtm/fLoQQ4osvvhDjxo0TQgixfPly8dRTTwmXyyWqqqrE5MmTxaJFi4Tdbhf9+/d3v+/169eLjz/++IqfE3kODlFRmzjXdf7LL7/g/vvvx9ChQxEcHAygbp7IwYMHMXPmTAB1/+OvqakBAMyaNQv/+c9/cOutt2LDhg247bbbGmw3IyMDxcXFeOSRR9yPqVQqZGVlISkpCWvWrMG0adNQXFyMKVOm4Pvvv0dAQAASExObHKLR6XQYNWoUAKBfv37uuQW7du3CokWLoFKpYDKZkJSUBACorq7Gnj17UFlZiTfeeMP9WGZmJiZNmoR58+Zhzpw5+Pe//43PPvsMavWlHaiNvd/Jkyfj0UcfxahRo5CQkOD+n3djzm1bkqQrrnclY8aMgU6nQ2hoKHx9fZGYmAgAiIiIuKp5FucMGjQIAQEBWLNmDXJycvDDDz/Az8/vis9JTU3FCy+8gHvvvRfr16/HzJkzoVarsW3bNpjNZvf8K4fD4f4eXeynn35CSkoKVCoV7HY7evbs6R7SaWo754aCKioqkJmZidTUVABAWFgYNm/efMW2jo2NbfS705jq6mr8/PPPeP/99wEARqMRM2bMwI4dOzB58uQrPrcxu3btQkJCAsLDwwEAI0aMQEBAAI4cOdJgve7du2PZsmX49NNPkZ2djX379qG6urrJ7V84RPXJJ5/gjTfewMSJE93Df3/+85+xfft2rFy5EidPnoTNZoPVar3iNvV6vft71q9fP1RWVgIAduzYgcWLF0OtVsNoNCIlJQWnT5+GTqdDUlISUlNTMXr0aIwYMcL9uRMx4FCb6t+/PxYuXIgFCxagb9++CA8PhyRJmDt3LubMmQMAsNvt7j9sEydOxPLly3HixAns2bMHy5cvb7A9l8uF6OhofPLJJ+7HioqKYDKZIEkSnnvuOWzfvh1Dhw7FzTffjH//+9/w8fHBpEmTmqxVp9O5g4JKpWqwTFxwCTeNRgOgLkwIIbBmzRr4+PgAAMrKyuDl5QUAMJvNKC4uhkqlQnZ29mXnSjT2fn/3u99h5syZSE9Px4YNG/D+++9j3bp1jdYeEBCAHj16YP/+/bj55psbLHv88cfx0EMPQaVSNXgfDoejwXoXB8DLzVtpahtAXZh4+eWX8dvf/hbjxo1Dz5498emnnzZaO1AXMJxOJw4cOIDPP//cPVdLkiQsWrTIvROzWq2w2WyNbuPCOTgXamo7vr6+Dd7zhe1/8uRJhIaGNtrW5eXlV/zuNFaPuOiygJIkwel0NvncK23z4teWJOmSNjp48CAeffRR/Pa3v8WIESPwq1/9CsuWLWvWa6WmpmLfvn2YP38+1q5dC7VajdmzZ2PAgAFITEzE5MmTsW/fvkve48W8vLzcNV/43dJoNJf9nQPqhmKzsrKwa9cuvP322/j888/x2muvNat+UibOwaE2N2XKFMTGxronF44YMQLr1q1zz4F444038PTTTwOo+4M3efJkLFiwAOPHj3fvTM6Ji4tDdna2e07KkSNHMGHCBBQVFcHLywtDhgzB3/72NyQkJOCmm25CRkYGfvrpJ/f/Eq9FYmIi1q1bB0mSUFlZ6Z4nYDAYEBcXh1WrVgEAqqqqcMcdd7iXP/vss5g6dSr++Mc/4qmnnoLZbL5k25d7v06nE2PHjkVNTQ3uuOMOLF68GFlZWe65Oo159NFH8fLLL7vnIblcLvzf//0fMjMz0bNnT5hMJhw7dgw2mw0OhwPffPNNsz+LoKAgHDp0CEBdsPzxxx8vWSc9PR1jxozBnDlzMGDAAGzevBkulwtA3Y6qsZ14amoqXnrpJfTp0wdhYWEA6r4rH330Eex2OyRJwvPPP4/XX3+92XVf7XYMBgP69++PjRs3AgAKCgpwxx13oLa29opt3RiNRnPZEGgwGHDjjTfio48+AlAXhjdu3HhJOG2O4cOHY/v27cjNzQUAfPfddygpKcHAgQMbfO4//vgjbrzxRtx9990YPHhwg/Zpjqeffho5OTlYs2YNysvLkZmZid///vdISkpCXl4ecnNz3T2KWq32sp9DY0aPHo0NGzZAkiRUV1fj888/h0qlQklJCUaPHo3g4GDcfffdeOyxx67r6ExSFvbgkCyef/55TJ06FTt37kRqaiqKiopw2223QaVSISwsrEFPTWpqKj788EMsWbLkku2YTCa8+eab+NOf/gSbzQYhBP70pz+5u+WTkpKwadMmDBs2DN7e3rjhhhsQEBDg7lW5FvPmzcPixYsxceJEmEwmxMTEuJe9+uqreOmll5CcnAy73Y4pU6Zg6tSp+Oijj1BQUIA33ngDOp0OI0aMwPPPP4+//vWvl2z/4ver1WqxaNEiPPXUU9BqtVCpVFi2bBn0ej22bNmCNWvW4O9///sl20lOToYQAk888QScTidsNhv69++Pf/7zn9Dr9UhISMCQIUMwceJEhIaGYujQocjKymrWZ5GWluae/BkeHo5hw4Zdss7s2bPx5JNPIjk5GU6nEwkJCdi0aRMkSUJcXBzeeustPProo5cc2jxt2jS8/vrrDYLHww8/jFdeeQXTp0+Hy+VC3759sWDBgmbV3NztvPbaa3jhhRewevVqqFQqvPzyywgNDW20rc8Fisvp1asXvLy8MGvWLHzyyScNelheffVVvPjii9iwYQPsdjuSk5MxY8aMZr+3c/r06YPnnnsOjzzyCFwuF3x8fPD222/DYDCgd+/eUKvVuP3227FixQps3rwZkyZNgiRJGD16NMrLy69qmOpCgYGBePLJJ/Hqq69i4sSJuPfee5GSkgIfHx+EhYVh0KBByM7Oxk033YSkpCQ88cQTeOmll65q2w899BBefPFFJCcnw2g0IiQkBN7e3ggJCcF9992HtLQ0+Pj4QKvV4oUXXriWj4sUSCWa6jMkIiKS0WeffYaAgACMHDkSkiTh4YcfxtixYy+Zk0d0IQYcIiJq1zIzM7FkyRLU1NTA4XBg+PDhWLhwoWznM6KOgQGHiIiIFIeTjImIiEhxGHCIiIhIcdp8ALO4+NJDY1tSUJAvysubN/ufWhbbQH5sA/mxDeTHNpBfW7RBaKjxso8rrgdHq9U0vRK1KraB/NgG8mMbyI9tID8520BxAYeIiIiIAYeIiIgUhwGHiIiIFIcBh4iIiBSHAYeIiIgUhwGHiIiIFIcBh4iIiBSHAYeIiIgUhwGHiIiIFIcBh4iIiBSHAYeIiIgUhwGHiIiIFIcBh4iIiBRHK3cBRERE1PEIIWCtdaLKakel1Y5Kqw1VFjsqq+2ostphrnbAz1ePu5Ji4KVv+6uKM+AQERGRm83uqgsplrrQUmm1o9JSF2KqrOcfq7La4XSJK27L5O8Nm9PFgENEREQtz+mSYK521IUTy7kel4tCTP0/m911xW1pNSoE+OnRvZMRAX56BBj08Pc9f+vvp0eAX91t926BKCmxtNG7vKhOWV6ViIiIrptLklBldaDCYqv/Z0eF+YKf6x83VzuuuB0VAKOfHp0CfepCi58e/gY9Avy83GHlXJjx9dJCpVJdVX1Xu15rYMAhIiJqZyQhYKlx1IcV+yUBprz+fpXVDnGFUSJvvQaBBi90DfZDwLnAYtCfDzH1twZfHTRqZR13xIBDRETUhmwOF8rNNpRX1aLsXG+L+cIQUxdkXFLjyUWnVSPQoEevbgEINHghyOiFQIMXAg36ultjXc+Lj5fn7uY9950TERG1sIvDS9mFP1fZUG6uhbXW2ejzNWoVAg169OhirA8sXgg06s//bNAj0OjVrGEiT8WAQ0REdBVsDhcqzDaUXRhe6u+fu71SePHSa2AyeqFHmD+CjF4wGet6XoKM3u7gYvDRQc3g0iIYcIiIyONJQqDSYkdpZS1KqmpQWlmLsqrzYabcbIOlpvGJuu7w0sWIIH9vmIxeMPl7XxBkvOHrzV1uW+KnTUREiudwSig319YHmLrb0gtuy6psjc558dJpYPL3QmRnA4KM3jD51/W8nA8w3vDx0nDIqJ1hwCEiog6vxuZsEFhKK2thsbmQf9aMkqpaVFnsaGzKboCfHpFdjAj29677F3DhrRd8ON+lQ2LAISKids/pklBWVYviyloUV9SgpKLutriiBiWVtY0OH2nUKgQZvRDTPbBhcAnwRoh/XW+MTtv2Z9ml1seAQ0REshNCwFztqAstlTUorqhFSX2AKa6oRZm59rLne9FqVAgO8EGPLsaLel68ERMVAsnugFrN3hdPxIBDRERtwuZw1YeW2voQU98TU1l3a3Nc/hIBgQY9orsFIDTAB6GB3ggN9EFooA9CArwRaPRq9Kij0CAfFBc3flQTKRsDDhERtRib3YWzFTU4W16NovK627PlNSgqr0G52XbZ53jrNegUdD601AWYuttgf2/odRxCouZjwCEiomapsTlRXFFTH1zOBZm6nyst9kvWVwEw+Xuhb2RQg/By7p+fNyfxUstjwCEiokvYHC4UlVWjsOzSnpgqa2Mhxhv9egShU5AvOgX6oLPJp/5nb07kpTbHgENE5KGEECg321BQVo3C0rowU1j/c1lV7SWHVatUQLC/N/r3CEInky86B9YFmM4mH4QE+ECnVdbFGqljY8AhIlI4m93lDi8FpVb3z0VlNZed2Btg0KNPRCC6mHzRxeRbF2bq58hoNQwx1DEw4BARKYAQAlXVDuSXWJFfYkVBqRUF9b0yl5vcq9Oq0TnIF12C60JMmOn8z558BWpSDn6LiYg6ECEEKix25JfWB5kSK/LqQ83lLvQYZKyb3NsluGGIMfl786KOpGgMOERE7dC5+THnemTqAk018kusqLY1DDIqFdApyBcx3QPRNcQP3UL8EBbshy4mX3jpObmXPBMDDhGRzKqq7cg7a0FusRU5xRZ3qKm1N5wfo1Gr0CnIB317BKFrsJ87zHQ2+fAoJaKLMOAQEbURh9OF/JJq5BZb6v9ZkXvWgsqLDrvWqFXoYvJF1xC/8/+CfdHZ5MtJvkRXiQGHiKiFCSFQVFaNA8dKkFNsQV6xBTlnLSgqq4F00QWVgv29cWN0MMI7GRAeakB4JwM6B/FoJaLrxYBDRHQdHE4XcoutyC4yI6eoLsjklVhQY2s4vOSt16BnN3+EhxrQPdQP4Z0M6BZigK83/wwTtQb+ZhERXaXqWgfOFFlwpsiM7CILzpw1o6CkukGvjFqlQmeTDwb3DUKIvxe6hxoQ3skPwf7evBwBURtiwCEiusi5Q7HremXMOFNkQXaRGSWVtQ3W0+vUiOpqRERnIyI7GxHR2YBuIX7QaTUIDTWiuNgs0zsgIgYcIvJoQgiUVtbiVKEZpwur3D005mpHg/UMPjr07xGEiM7G+n8GdA7yhVrNXhmi9ogBh4g8SoXFhlMFVThdYMapwrpbS03DMBPs741BvQPqe2XqwkyQ0YtDTEQdCAMOESmWpcaB0wVVdb0zBVU4XWi+5LIFIQHeuCEyCFFdjIjsUhdoDD46mSomopbSZMCRJAlLlixBVlYW9Ho9li5disjISPfy9957D1988QVUKhUefPBBJCUltWrBRESXY7O7cLqwCqcKzHU9NIVVKK5oOGcmwKBHXK8Q9AgzIirMH5FdjPD31ctUMRG1piYDzubNm2G327F27VpkZGRg+fLlWLlyJQCgqqoKq1evxqZNm1BTU4Np06Yx4BBRqxNC4Gx5DY7nVeJkfhVO5Fci96y1wdFMft5aDIgy1YWZLv7oEeaPIKOXjFUTUVtqMuDs3bsXiYmJAIC4uDgcOnTIvczHxwddu3ZFTU0Nampqrmp8OijIF9pWPqV4aKixVbdPTWMbyE9JbVBd68DRM+XIyi5HZnY5srLLGkwC1mnV6BMZhD6RQYiJCELv7oHobPKVfc6Mktqgo2IbyE+uNmgy4FgsFhgMBvd9jUYDp9MJrbbuqWFhYZg8eTJcLhceeOCBJl+wvLz6OsptGg/NlB/bQH4duQ0kIVBQWo0TeZU4mV+JE/lVyC+24sLz/4YEeKNfPxOiu/ojulsAuncyNDzzryShpMTS5rVfqCO3gVKwDeTXFm3QWIBqMuAYDAZYrVb3fUmS3OFmx44dOHv2LLZs2QIAuPfeexEfH4/Y2NiWqJmIPIDTJeF0oRnHcipwNKcCx/MqYa09f7VsvU6NPhGB6Nk1ANFd/dGzWwAC/DhvhoiurMmAEx8fj61bt2LSpEnIyMhATEyMe1lAQAC8vb2h1+uhUqlgNBpRVVXVqgUTUcdWY3PiRF4ljuZW4FhOJU4WVMHhlNzLQwK8ERsdjF7dAhDdLQDdQv2gUfO6TETUPE0GnKSkJKSnp2P27NkQQmDZsmVYtWoVIiIiMG7cOHz//fe47bbboFarER8fj4SEhLaom4g6iAqLDUdzKnAstxLHciuQc9aCc3OBVQDCOxkQEx6I3t0D0Ds8kBOBiahFqIS46NK2rawtxuI45iovtoH85GyDCosNmdnlyDxTjswzFThbXuNeptWo0TPMiN7dA9E7PBC9ugUo9mKT/D2QH9tAfu16Dg4R0ZVUWu3IOlNeH2oqUFh2/kACHy8NYqODEdM9EL3DA9Cjiz90Wg43EVHrY8AhomYxV9uRdabC3UOTX3L+IAQvvQYDewbjhshA3BARhMjORl6riYhkwYBDRFdks7uQlVOBw6fLcPh0GXKLzwcavU6N/lEm3BARiBsi6wJNg8O1iYhkwoBDRA1IQiCnyIJDp0px+HQ5juVWwOmqm6qn06rRNzLIHWiiwvwZaIioXWLAISKUVdXil9Nl+OVUGQ6fLm9wde2Izgb0jzKhfw8TeocHQNfKZyInImoJDDhEHsjmcCHrTDkOnaoLNQWl5ycGBxm9kDCwC/pHmdAv0gR/nlSPiDogBhwiD3G2ogYHT5Ri/4kSZGZXwOmqO7meXqdGbHQw+vUwoX+UCV2D5b+GExHR9WLAIVIop0vC0ZwKHDhRioMnSxv00oSH+mFgdDAGRgUjulsAD90mIsVhwCFSkHKzDQdPliIrtxI/Z52Fze4CUNdLE9crBLG9ghHbMxgmf2+ZKyUial0MOEQdmBAC+SVW/HysBPuOFuN04fkzhnYK8kFsbDBio4PRp3sgJwcTkUdhwCHqYCRJ4ER+JfYdLcHPx4rdl0LQqFXoGxmEG3uFYMyQCOjQpldhISJqVxhwiDoAh9OFw6fLse9YMTKOlaCquu4wbi+dBoP7hGJQTChio4Ph560DAISGGngNHiLyaAw4RO2Uze7C/hMl+CmrGAdPlMLmqJtP4++rw8gbwzCodyj69Qji0BMR0WUw4BC1I7V2Jw6cKMWezLM4eKIUdmfdodydgnwQHxOK+N6h6NnVn9d3IiJqAgMOkcwaCzVdTL4YfEMn3HRDJ3QL9eO5aYiImoEBh0gG54af9hw5iwMnS+G4INQMuaEThjDUEBFdFwYcojbikiQcPl2O3b8U4uejJe45NWHBvhjch6GGiKglMeAQtSIhBE4WVGH3L0XYc6TIffRTSIA3kvqH46a+ndEthKGGiKilMeAQtYKismrs+qUQuw8Xuc9TY/DRYUx8Nwzv1wXR3fwZaoiIWhEDDlELqa514IfDRfjuYCFOFVQBqLtEwtB+nTGsX2f0jzJBq+E1n4iI2gIDDtF1kITAkexyfHegAD8fLYbDKUGlAgb0NGF4vy4YFBMCbz1/zYiI2hr/8hJdg+KKGqQfLED6wQKUVtkA1B0BNSI2DMP7d0GQ0UvmComIPBsDDtFVsjtc2JtVjJ0H8pF5pgIA4KXXIDE2DImxXTmvhoioHWHAIWpCYVk1tu3LQ/rBAlhrnQCAPt0DMSI2DIP7dIKXnpdKICJqbxhwiC7D6ZKQcawEW/fl4Uh2OYC6a0BNGhaJxBvD0DnIV+YKiYjoShhwiC5QWlmL7fvzsXN/PiqtdgDADRGBGD2oG+JjQnkUFBFRB8GAQx5PCIHDp8uxZW8u9p8ogRCAr5cWtwwOx5hB3RAW7Cd3iURE1EwMOOSxbA4Xdh0qxOa9ucgvsQIAosL8MWZQNwzp2wleOs6tISLqqBhwyOOUVdViy8+52JGRD2utExq1CsP7d8Ytg7sjKsxf7vKIiKgFMOCQRxBC4ER+Fb7dk4O9WcWQhIDRV4fkm3tgTHw3BBp43hoiIiVhwCFFk4TAvqMl+OqHbJzMr7t8QvdOBiQN7o6h/TpBp+UwFBGREjHgkCI5nBJ2/VKIr344g6KyaqgADOodgvFDuiOmeyBPyEdEpHAMOKQo1bVObMvIw7d7clBptUOjVmFEbBgmDo3g0VBERB6EAYcUocJiw7d7crAtIw81Nhe89RrcOjQCSYO787pQREQeiAGHOrRysw1f7c7G9v3xwGGRAAAfi0lEQVT5cDgl+PvpMWlYJMYM6gZfb53c5RERkUwYcKhDKjfb8OWuumDjdEkI9vfG5OGRSBjYhROHiYiIAYc6lrKqWny5Oxs79ufD6RIICfDGlJt74OYBXXgZBSIicmPAoQ6h3GzD57tOYyeDDRERXQUGHGrXLDUOfLk7G1v25sLhlBAa6I0pw3tgOIMNERFdAQMOtUs2uwvf/pSDr344gxqbE0FGL6SMiGKPDRERXRUGHGpXnC4JO/bn49P006iy2mHw0eH2sb0wNr4bJw8TEdFVY8ChdkESAj8eLsJ/dp5EcUUtvHQaTE3ogQk3RcDHi19TIiJqHu45SHbHciuwZssxnCowQ6NW4ZZfhWPKzT3g76eXuzQiIuqgGHBINiUVNfhk2wnsyTwLALipbyfMHBWN0EAfmSsjIqKOjgGH2lyNzYkvdmVj054cOF0Senb1x+xxvdGrW4DcpRERkUIw4FCbkYTAdwcKsGH7CVRVO2Dy98KsUdG4qV9nqHl1byIiakEMONQmThdWYfU3R3GqoApeOg2mJ0Zh/E0R8NLxyCgiImp5DDjUqqy1DmzYcRLbfs6DADC0X2fcNqYXr/BNREStigGHWoUkBL4/WIhPth2HudqBsGBf/DopBn17mOQujYiIPAADDrW47IIq/HXNzzieWwm9To1Zo6Mxfkh3noGYiIjaDAMOtRinS8Ln35/Gl7uz4XQJ/ComFLPH9UZwgLfcpRERkYdhwKEWcSK/Eh98mYm8EiuCA7xxZ1IM4nqFyF0WERF5KAYcui42uwv/2XkS3+7JgQAwelA3PDTrRljNtXKXRkREHowBh67Z4dNl+OCrTJRU1qJzkA/unngD+kQEwddbx4BDRESyYsChZrM5XFi39QS2/JwLtUqFicMikJIQBT3PaUNERO0EAw41y6mCKrz72WEUlVUjLNgX9yX3Q48u/nKXRURE1AADDl2Vc0dIff59NiQhMH5Id8wY2ZO9NkRE1C4x4FCTCkqt+Ptnh3G60AyTvxfundwPfSOD5C6LiIioUQw41ChRf3HMjzYfhd0h4eYBXTDnlhj4evNrQ0RE7Rv3VHRZNTYnVn+Thd2Hi+DjpcVD0/phyA2d5C6LiIjoqjDg0CWyC81Y+d9DOFteg55d/fHg1P4ICfSRuywiIqKrxoBDbkIIbNmbi4+3HofTJTBxaASmj+zJa0gREVGH02TAkSQJS5YsQVZWFvR6PZYuXYrIyEj38u3bt+Ott94CAPTr1w+LFy+GSqVqvYqpVdTYnHj/iyPYe7QYRl8d5k7ph4E9g+Uui4iI6Jo0GXA2b94Mu92OtWvXIiMjA8uXL8fKlSsBABaLBX/+85/xr3/9CyaTCX//+99RXl4Ok8nU6oVTy8krseKtDQdRWFaNPt0Dcf/U/ggyesldFhER0TVrMuDs3bsXiYmJAIC4uDgcOnTIvWzfvn2IiYnBK6+8gpycHKSmpjLcdDA/ZZ7Fe18egc3uwq03RWDm6J7QqDkkRUREHVuTAcdiscBgMLjvazQaOJ1OaLValJeX44cffsDGjRvh6+uLO++8E3FxcYiKimp0e0FBvtBqW/fkcKGhxlbdvhK4XBJWf3UE67ceh7deg6d/PRiJg7q12PbZBvJjG8iPbSA/toH85GqDJgOOwWCA1Wp135ckCVpt3dMCAwMxcOBAhIaGAgAGDx6MI0eOXDHglJdXX2/NVxQaakRxsblVX6Ojs9Q48PZ/D+Hw6XJ0DvLBIzMGIjzU0GKfG9tAfmwD+bEN5Mc2kF9btEFjAarJsYj4+Hjs2LEDAJCRkYGYmBj3sgEDBuDo0aMoKyuD0+nE/v370atXrxYqmVpDYVk1Xv7XTzh8uhxxvULw/G8GIzzU0PQTiYiIOpAme3CSkpKQnp6O2bNnQwiBZcuWYdWqVYiIiMC4cePw5JNPYu7cuQCAW2+9tUEAovblyOky/N/GQ7DWOjFpWCRmjOoJNY94IyIiBVIJIURbvmBbdFWxS/JS2zPy8OGmowCA39x6A0bEhrXaa7EN5Mc2kB/bQH5sA/nJOUTFE/0pnCQJfLz1ODbtyYHBR4dHZwxETPdAucsiIiJqVQw4CuZwuvD3zw7jp6xihAX74vFZsegU5Ct3WURERK2OAUehqmudWLH+ALJyKtCneyDmzRwIX2+d3GURERG1CQYcBSo32/CXj/cjt9iCX/UJxf3J/aBr5XMPERERtScMOApTWFaN19ZkoLSqFmPiu+HOW2KgVvNIKSIi8iwMOAqSXWjGa2szYKlxYFpiFJJv7sELnxIRkUdiwFGIk/lVeH1tBmpsTtx1ax+Mjmu5yy4QERF1NAw4CnAstwJ/+Xg/bA4X5k7ph+EDushdEhERkawYcDq4I9nleHPdAThdEh5MGYAhN3SSuyQiIiLZMeB0YIdOlWLF+oMQQuDh6QMwqHeo3CURERG1Cww4HdSR02VYsf4gAGDezFgM7Bksc0VERETtR5NXE6f251huBd5YfwBCCMybMZDhhoiI6CIMOB3MqYIq/OXj/XC5BB5KGYABDDdERESXYMDpQM4UmfH62gzYHC7cl9wPg2I454aIiOhyGHA6iKLyary2NgPVtU7cM6kvburbWe6SiIiI2i0GnA6g0mrH62szYK524M7xMUgYGCZ3SURERO0aA047V2Nz4q8f70dxRS2Sb+6BsfHhcpdERETU7jHgtGNOl4T/23gI2UVmJMaGYVpilNwlERERdQgMOO2UJATe//IIfjlVhhujg3HXrX144UwiIqKrxIDTTv135yns/qUI0V398eC0AdCo2VRERERXi3vNdmj3L4X47PvTCAnwxmOzYuGl08hdEhERUYfCgNPOnMivxPtfZsLHS4PHU2+E0Vcvd0lEREQdDgNOO1JWVYsV6w/CJUl4YOoAdAvxk7skIiKiDokBp52w2V14c90BVFntmD22N2KjeQkGIiKia8WA0w4IIfDPrzNx5qwFI2/silsG81w3RERE14MBpx3438952H247oipX4+P4eHgRERE14kBR2bH8yqxZssxGH11eGjaAGg1bBIiIqLrxb2pjKqsdqzceAiSEHhwan+Y/L3lLomIiEgRGHBk4pIkvP3fQyg32zBzVDT69jDJXRIREZFiMODI5PPvs5F5pgKDeodg4tAIucshIiJSFAYcGRzNqcCn6acQ7O+Feyf35aRiIiKiFsaA08astQ78/bNfAAD3T+0PX2+dzBUREREpDwNOG6o7300WSqtsmJoQhd7hgXKXREREpEgMOG3ouwMF+CnzLHqHB2DKzZFyl0NERKRYDDhtpKSiBv9vyzH4eGlxX3I/aNT86ImIiFoL97JtQBICq77KhM3uwp1JvRES4CN3SURERIrGgNMGtu/Lw5HscsT1CsHw/l3kLoeIiEjxGHBaWXFFDT7eegJ+3lrcdWsfHhJORETUBhhwWpEkBFZ9eQQ2hwtzbolBoMFL7pKIiIg8AgNOK9q5Px+ZZyoQ1ysEw/p3lrscIiIij8GA00qqrHas23YC3noN0iZwaIqIiKgtMeC0krX/Ow5rrRMzRvZEkJFDU0RERG2JAacVHMkux65fChHZxYix8eFyl0NERORxGHBamMMp4V/fZEGlAn5zax+o1RyaIiIiamsMOC1s054zKCqrxtj4cPTo4i93OURERB6JAacFVVps+GJXNgw+OkxPjJK7HCIiIo/FgNOC/rPzJGrtLkxPjIKvt07ucoiIiDwWA04LOVNkxs79BegW4oeRcV3lLoeIiMijMeC0ACEE1mw5BgHg9rG9eKVwIiIimXFP3AIyjpcg80wFYqODMaBnsNzlEBEReTwGnOskSQIbtp+ESgXcNqaX3OUQERERGHCu2w+Hi5BXYkXCgDB0DfGTuxwiIiICA851cbokbPzuJDRqFaYm9JC7HCIiIqrHgHMdvjtQgOKKWoyO64aQQB+5yyEiIqJ6DDjXyO5w4dP0U9Br1Zhyc6Tc5RAREdEFGHCu0bZ9eaiw2DFucDgCDLxaOBERUXvCgHMNHE4XvvrxDLz0Gkwcyt4bIiKi9oYB5xqkHyxEpcWOMYO6weDDSzIQERG1Nww4zeSSJHy5OxtajRoThnSXuxwiIiK6DAacZvrx8FmUVNYi8cYwzr0hIiJqpxhwmkESAl/szoZGrcLEoRFyl0NERESNYMBphn1HS5BfYsWwfp0REsDz3hAREbVXDDjNsGnPGQDAxGE8coqIiKg9azLgSJKEP/zhD7j99tuRlpaG7Ozsy64zd+5c/Pvf/26VItuDUwVVOJZbiYE9g3nNKSIionauyYCzefNm2O12rF27Fk8++SSWL19+yTp//etfUVlZ2SoFthff/pQDAEgaEi5zJURERNQUbVMr7N27F4mJiQCAuLg4HDp0qMHyr7/+GiqVCiNHjryqFwwK8oVWq7mGUq9eaKixRbdXWlmDPUfOontnI0YPiYRKpWrR7StRS7cBNR/bQH5sA/mxDeQnVxs0GXAsFgsMBoP7vkajgdPphFarxdGjR/H555/jzTffxFtvvXVVL1heXn3t1V6F0FAjiovNLbrN9dtPwCUJjB3UFSUllhbdthK1RhtQ87AN5Mc2kB/bQH5t0QaNBagmA47BYIDVanXflyQJWm3d0zZu3IiioiL85je/QV5eHnQ6Hbp163bVvTkdgcPpwvaMfBh8dBjev4vc5RAREdFVaDLgxMfHY+vWrZg0aRIyMjIQExPjXvb000+7f16xYgVCQkIUFW4AYG9WMSw1DkwcGgG9rnWH1oiIiKhlNBlwkpKSkJ6ejtmzZ0MIgWXLlmHVqlWIiIjAuHHj2qJGWW3LyAcAjIzrKnMlREREdLWaDDhqtRovvvhig8eio6MvWW/evHktV1U7UVBqxdGcCvSNDELnIF+5yyEiIqKrxBP9XcH2+t6bUey9ISIi6lAYcBrhcLrw/aFCGH11iI8JlbscIiIiagYGnEbsPVo3uThhYBi0Gn5MREREHQn33I347kABAGDkjRyeIiIi6mgYcC6j3GzDkdPl6NUtAF1MnFxMRETU0TDgXMYPh4sgAAzv31nuUoiIiOgaMOBcxveHCqFRqzCkLwMOERFRR8SAc5GcsxbkFlsQGx0Mg49O7nKIiIjoGjDgXGTXL4UAwOtOERERdWAMOBeQhMAPh4vg46XFjb2C5S6HiIiIrhEDzgVO5lWh3GzDr2JCodPywppEREQdFQPOBX7KOgsAGHwDz1xMRETUkTHg1BNCYG/WWfh4adE30iR3OURERHQdGHDqnS40o7TKhrhewdBp+bEQERF1ZNyT1/sps354qk8nmSshIiKi68WAg3PDU8Xw0mvQP4rDU0RERB0dAw6AwrJqnK2owcAoE/Q6Hj1FRETU0THgADhwohQAEBsdInMlRERE1BIYcHA+4AzsyeEpIiIiJfD4gFNjc+JoTgV6dDEiwOAldzlERETUAjw+4Bw+XQaXJBAbzUszEBERKYXHBxz38BQDDhERkWJ4dMARQuDgyVIYfHSICvOXuxwiIiJqIR4dcArLqlFhsaNfjyCoVSq5yyEiIqIW4tEB50h2OQCgb2SQzJUQERFRS2LAAQMOERGR0nhswJGEQGZ2OYL9vRAa6CN3OURERNSCPDbg5J61wFrrxA2RQVBx/g0REZGieGzA4fAUERGRcnl8wLkhggGHiIhIaTwy4EhC4EReJUICvGHy95a7HCIiImphHhlwCkurYa11ond4gNylEBERUSvwyIBzPK8SANCrGwMOERGREnl0wIlmwCEiIlIkjww4J/Iq4aXXIDzUIHcpRERE1Ao8LuBYahwoKK1GzzB/qNU8/w0REZESeVzAOZnP+TdERERK53EBxz3BmEdQERERKZbHBZzTBWYAQFSYv8yVEBERUWvxqIAjhEB2kRnB/t4w+OjkLoeIiIhaiUcFnHKzDeZqB3p0McpdChEREbUijwo42UV1w1MRDDhERESK5lkBp7Au4ER2ZsAhIiJSMo8KOGeKLACASPbgEBERKZpHBZzsIjMCDXoE+OnlLoWIiIhakccEnEqrHeVmG4eniIiIPIDHBJyccxOMGXCIiIgUz2MCTm6xFQDQvRMvsElERKR0HhNw8krqJhh3C/WTuRIiIiJqbR4TcPJLrNBqVOgU5CN3KURERNTKPCLgSEIgr8SKLiY/aNQe8ZaJiIg8mkfs7Usqa2F3SAjn8BQREZFH8IiAk18/wZjzb4iIiDyDRwQc9wTjEB5BRURE5Ak8I+CwB4eIiMijeETAKSithk6rRnCAt9ylEBERURtQfMARQqCovBqdgnygVqnkLoeIiIjagOIDTlW1A7V2FzoH+cpdChEREbURxQecorJqAEBnnuCPiIjIYyg+4JwtrwEAnsGYiIjIgyg+4BSVn+vB4RAVERGRp/CAgFPXg9PZxIBDRETkKbRNrSBJEpYsWYKsrCzo9XosXboUkZGR7uUffPABvvjiCwDAqFGj8Oijj7ZetdfgbFk19Fo1Ag16uUshIiKiNtJkD87mzZtht9uxdu1aPPnkk1i+fLl7WU5ODj799FOsWbMGa9euxXfffYfMzMxWLbg5hBAoqqhBpyAfqHiIOBERkcdosgdn7969SExMBADExcXh0KFD7mVdunTBP/7xD2g0GgCA0+mEl5dXK5XafJYaB2x2F0IDOcGYiIjIkzQZcCwWCwyG89dw0mg0cDqd0Gq10Ol0MJlMEELgT3/6E/r164eoqKgrbi8oyBdareb6K7+C0FAjAKAypwIAEN7F3/0YtQ1+3vJjG8iPbSA/toH85GqDJgOOwWCA1Wp135ckCVrt+afZbDYsWrQIfn5+WLx4cZMvWF5/VFNrCQ01orjYDAA4nl0GAPDVqd2PUeu7sA1IHmwD+bEN5Mc2kF9btEFjAarJOTjx8fHYsWMHACAjIwMxMTHuZUIIPPzww+jTpw9efPFF91BVe1FaVQsACPbnNaiIiIg8SZM9OElJSUhPT8fs2bMhhMCyZcuwatUqREREQJIk/Pjjj7Db7di5cycA4IknnsCgQYNavfCrUVpZH3B4kU0iIiKP0mTAUavVePHFFxs8Fh0d7f754MGDLV9VCyljDw4REZFHUvSJ/kqqaqHTqmH01cldChEREbUhRQec0spamPy9eQ4cIiIiD6PYgGNzuGCpcSDEv/2cl4eIiIjahmIDzrn5NybOvyEiIvI4ig04FRY7ACDQwB4cIiIiT6PYgFNpsQEAL7JJRETkgRQbcM714ASwB4eIiMjjKDbgVFrrenAC2INDRETkcZQbcM7NwfFjDw4REZGnUWzAqbCwB4eIiMhTKTbgVFrtMPjooNUo9i0SERFRIxS796+w2Nl7Q0RE5KEUGXDsDhdqbE4E+jHgEBEReSJFBpwKKw8RJyIi8mSKDDhV5wIOe3CIiIg8kiIDjqXaAQAw+jLgEBEReSJFBhxzTV0Pjp+PVuZKiIiISA6KDDjWGicAwOjDHhwiIiJPpMiAc64Hx+Cjk7kSIiIikoMiA461pm4OjsGXAYeIiMgTKTLgmOsnGbMHh4iIyDMpMuBYaxxQqQBfL04yJiIi8kSKDDjmGgf8vHVQq1Vyl0JEREQyUGTAsdY4ODxFRETkwRQXcIQQsNQ4GXCIiIg8mOICTnWtE5IQ8PPm/BsiIiJPpciAAwA+DDhEREQeS3kBx1Z3iLgPj6AiIiLyWMoLOPWXafDRM+AQERF5KuUFHHcPjkbmSoiIiEguygs49XNwvNmDQ0RE5LEUG3B4FmMiIiLPpcCAUzdE5c0hKiIiIo+lwIDDScZERESeTnkBh4eJExEReTzFBZyacz04HKIiIiLyWIoLOO6jqNiDQ0RE5LEUF3BqbPUBR8ceHCIiIk+luIBjc7igAqDTKu6tERER0VVSXAqwOVzQ6dRQqVRyl0JEREQyUV7Asbug13J4ioiIyJMpL+A4XPDSKe5tERERUTMoLgnY7S7o2INDRETk0RQXcGwOF/TswSEiIvJoikoCQoj6gMMeHCIiIk+mqIDjkgQkSUDPQ8SJiIg8mqKSgN0hAQCPoiIiIvJwygo4ThcAcA4OERGRh1NUErA76gMOe3CIiIg8msICTv0QFXtwiIiIPJqikoDdeS7gsAeHiIjIkykr4LiHqBT1toiIiKiZFJUEzk8yZg8OERGRJ1NWwKmfg6NjDw4REZFHU1QScEp1AUerUdTbIiIiomZSVBJwuQQAQKtWyVwJERERyUlZAUeqCzgaDQMOERGRJ1NWwHFxiIqIiIgUFnCc9UNUGg5REREReTRFBZzzQ1SKeltERETUTIpKAq5zR1GxB4eIiMijKSrguIeo2INDRETk0RSVBM714HAODhERkWdrMuBIkoQ//OEPuP3225GWlobs7OwGyz/++GPMmDEDt912G7Zu3dpqhV4N93lw2INDRETk0bRNrbB582bY7XasXbsWGRkZWL58OVauXAkAKC4uxurVq7F+/XrYbDbMmTMHCQkJ0Ov1rV745YSHGhBo9EJIoLcsr09ERETtQ5MBZ+/evUhMTAQAxMXF4dChQ+5lBw4cwKBBg6DX66HX6xEREYHMzEzExsY2ur2gIF9ota1zMcypY4xIHt0LKhWHqOQWGmqUuwSPxzaQH9tAfmwD+cnVBk0GHIvFAoPB4L6v0WjgdDqh1WphsVhgNJ4v3M/PDxaL5YrbKy+vvo5ymxYaakRxsblVX4OujG0gP7aB/NgG8mMbyK8t2qCxANXkZBWDwQCr1eq+L0kStFrtZZdZrdYGgYeIiIhIDk0GnPj4eOzYsQMAkJGRgZiYGPey2NhY7N27FzabDWazGSdOnGiwnIiIiEgOTQ5RJSUlIT09HbNnz4YQAsuWLcOqVasQERGBcePGIS0tDXPmzIEQAr/73e/g5eXVFnUTERERNUolhBBt+YJtMRbHMVd5sQ3kxzaQH9tAfmwD+bXrOThEREREHQ0DDhERESkOAw4REREpDgMOERERKQ4DDhERESkOAw4REREpDgMOERERKQ4DDhERESkOAw4REREpTpufyZiIiIiotbEHh4iIiBSHAYeIiIgUhwGHiIiIFIcBh4iIiBSHAYeIiIgUhwGHiIiIFIcBh4iIiBRHK3cBLUWSJCxZsgRZWVnQ6/VYunQpIiMj5S5LMfbv349XX30Vq1evRnZ2NhYsWACVSoXevXtj8eLFUKvV+Nvf/oZt27ZBq9Vi0aJFiI2Nbda6dHkOhwOLFi1CXl4e7HY7HnroIfTq1Ytt0IZcLheee+45nDp1ChqNBn/84x8hhGAbyKC0tBQzZszA+++/D61WyzZoY9OmTYPRaAQAhIeH4/bbb8fLL78MjUaDESNG4NFHH210f5yRkXHV67YIoRDffPONeOaZZ4QQQuzbt088+OCDMlekHO+++66YMmWKSE1NFUII8cADD4jdu3cLIYR4/vnnxaZNm8ShQ4dEWlqakCRJ5OXliRkzZjR7Xbq8devWiaVLlwohhCgrKxOjRo1iG7Sxb7/9VixYsEAIIcTu3bvFgw8+yDaQgd1uFw8//LAYP368OH78ONugjdXW1oqUlJQGj02dOlVkZ2cLSZLE3LlzxaFDhxrdHzdn3ZagmB6cvXv3IjExEQAQFxeHQ4cOyVyRckRERGDFihV4+umnAQC//PILbrrpJgDAyJEjkZ6ejqioKIwYMQIqlQpdu3aFy+VCWVlZs9Y1mUyyvcf27NZbb8WECRPc9zUaDdugjd1yyy0YPXo0ACA/Px8hISHYtm0b26CNvfLKK5g9ezbeffddAPxb1NYyMzNRU1ODe+65B06nE/PmzYPdbkdERAQAYMSIEdi1axeKi4sv2R9bLJarXrelKGYOjsVigcFgcN/XaDRwOp0yVqQcEyZMgFZ7PgsLIaBSqQAAfn5+MJvNl3z+5x5vzrp0eX5+fjAYDLBYLHjssccwf/58toEMtFotnnnmGbz00kuYMGEC26CNbdiwASaTyb0zBPi3qK15e3vj3nvvxXvvvYcXXngBCxcuhI+Pj3t5Y5+rRqNp9LNuzX23YnpwDAYDrFar+74kSQ12ytRy1OrzudhqtcLf3/+Sz99qtcJoNDZrXWpcQUEBHnnkEcyZMwfJycn485//7F7GNmg7r7zyCp566incdtttsNls7sfZBq1v/fr1UKlU2LVrF44cOYJnnnkGZWVl7uVsg9YXFRWFyMhIqFQqREVFwWg0oqKiwr383OdaW1t7yf74cp91Y+u21L5bMT048fHx2LFjBwAgIyMDMTExMlekXP369cMPP/wAANixYwcGDx6M+Ph4fPfdd5AkCfn5+ZAkCSaTqVnr0uWVlJTgnnvuwe9//3vMmjULANugrW3cuBHvvPMOAMDHxwcqlQoDBgxgG7Shjz76CB9++CFWr16Nvn374pVXXsHIkSPZBm1o3bp1WL58OQCgqKgINTU18PX1xZkzZyCEwHfffef+XC/eHxsMBuh0uqtat6Uo5mri52ZiHz16FEIILFu2DNHR0XKXpRi5ubl44okn8PHHH+PUqVN4/vnn4XA40LNnTyxduhQajQYrVqzAjh07IEkSFi5ciMGDBzdrXbq8pUuX4quvvkLPnj3djz377LNYunQp26CNVFdXY+HChSgpKYHT6cR9992H6Oho/h7IJC0tDUuWLIFarWYbtCG73Y6FCxciPz8fKpUKTz31FNRqNZYtWwaXy4URI0bgd7/7XaP744yMjKtetyUoJuAQERERnaOYISoiIiKicxhwiIiISHEYcIiIiEhxGHCIiIhIcRhwiIiISHEYcIiIiEhxGHCIiIhIcf4/MeH+2W1LvYQAAAAASUVORK5CYII=\n", 258 | "text/plain": [ 259 | "
" 260 | ] 261 | }, 262 | "metadata": {}, 263 | "output_type": "display_data" 264 | } 265 | ], 266 | "source": [ 267 | "plt.style.use('seaborn')\n", 268 | "top_n = 50000\n", 269 | "x = list(range(top_n))\n", 270 | "cumsum = df.iloc[:top_n]['rating_count'].cumsum()/total_ratings\n", 271 | "plt.plot(x, cumsum);\n", 272 | "plt.title('Review Index vs. Cumulative Percent of Total Ratings')\n", 273 | "save_fig('Index vs Cum Reviews')" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 8, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "text/plain": [ 284 | "436796" 285 | ] 286 | }, 287 | "execution_count": 8, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "len(df)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 18, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "26101" 305 | ] 306 | }, 307 | "execution_count": 18, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "cutoff_ind = cumsum[cumsum>0.8].index[0]\n", 314 | "df = df.iloc[7978:cutoff_ind]\n", 315 | "cutoff_ind" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 10, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "df_items = df[['asin', 'title', 'author']]\n", 325 | "program_list = [tuple(x) for x in df_items.values]" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 11, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "def normalize(s):\n", 335 | " s = s.replace('.', ' stop ')\n", 336 | " s = s.replace('\\n', ' ')\n", 337 | " for p in string.punctuation:\n", 338 | " s = s.replace(p, '')\n", 339 | " return s.lower().strip()\n", 340 | "\n", 341 | "\n", 342 | "def get_overall_rating(asin, rating_dict):\n", 343 | " sess = HTMLSession()\n", 344 | " r = sess.get(f\"https://www.audible.com/pd/{asin}\")\n", 345 | " rating_dict[asin] = np.float(r.html.find('div.bc-row.bc-spacing-small', \n", 346 | " first=False)[1].text.split(' stars ')[1][:3])\n", 347 | " sess.close()\n", 348 | " return rating_dict\n", 349 | "\n", 350 | "\n", 351 | "def get_reviews(asin, title, author):\n", 352 | " baseurl = f'https://www.audible.com/pd/reviews?country=US&asin={asin}&page='\n", 353 | " page_num = 0\n", 354 | " ratings_reviews = []\n", 355 | " sess = HTMLSession()\n", 356 | " title = normalize(title)\n", 357 | " try:\n", 358 | " author = normalize(author)\n", 359 | " except:\n", 360 | " author = ''\n", 361 | " \n", 362 | " while True:\n", 363 | " try:\n", 364 | " url = baseurl + str(page_num)\n", 365 | " r = sess.get(url).html\n", 366 | " page_elements = r.find('div.bc-row-responsive.bc-spacing-top-medium', first=False)\n", 367 | " for elem in page_elements:\n", 368 | " try:\n", 369 | " review = elem.find(f'div.bc-col-responsive.USreviews{page_num}.bc-col-9', first=True).text\n", 370 | " review = normalize(review)\n", 371 | " review = review.replace(title, '').replace(author, '')\n", 372 | " ratings = [item.text[0] for item in elem.find('span.bc-text')]\n", 373 | " ratings_reviews.append((review, *ratings))\n", 374 | " except:\n", 375 | " break\n", 376 | " page_num += 1\n", 377 | " \n", 378 | " except:\n", 379 | " break\n", 380 | " \n", 381 | " sess.close()\n", 382 | " return ratings_reviews\n", 383 | "\n", 384 | "\n", 385 | "def get_ratings_and_reviews(program_group, rating_dict, reviews):\n", 386 | " for asin, title, author in program_group:\n", 387 | " try:\n", 388 | " rating_dict = (get_overall_rating(asin, rating_dict))\n", 389 | " except:\n", 390 | " pass\n", 391 | " reviews.extend(get_reviews(asin, title, author))\n", 392 | "# print(reviews)\n", 393 | " return rating_dict, reviews" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 12, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "def threaded_ratings(nthreads, program_list, rating_dict=None, reviews=None):\n", 403 | " if rating_dict == None:\n", 404 | " rating_dict = {}\n", 405 | " \n", 406 | " if reviews == None:\n", 407 | " reviews = []\n", 408 | " \n", 409 | " threads = []\n", 410 | " for i in range(nthreads):\n", 411 | " program_group = program_list[i::nthreads]\n", 412 | " t = Thread(target=get_ratings_and_reviews, args=(program_group, rating_dict, reviews))\n", 413 | " threads.append(t)\n", 414 | " \n", 415 | " [t.start() for t in threads]\n", 416 | " [t.join() for t in threads]\n", 417 | " \n", 418 | " return rating_dict, reviews" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 13, 424 | "metadata": { 425 | "scrolled": true 426 | }, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "7308.237472772598\n" 433 | ] 434 | } 435 | ], 436 | "source": [ 437 | "start = time.time()\n", 438 | "rating_dict = {}\n", 439 | "rating_dict, reviews = threaded_ratings(16, program_list[::-1], rating_dict)\n", 440 | "end = time.time()\n", 441 | "print(end-start)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 14, 447 | "metadata": {}, 448 | "outputs": [ 449 | { 450 | "data": { 451 | "text/plain": [ 452 | "822922" 453 | ] 454 | }, 455 | "execution_count": 14, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "len(reviews)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 15, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "df_reviews = pd.DataFrame(data=reviews, columns=['text', 'overall', 'performance', 'story'])" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 16, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "df_reviews.to_csv('reviews2.csv', index=False)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 17, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "df_ratings = pd.DataFrame.from_dict(data=rating_dict, orient='index', columns=['rating'])" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 18, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "df_ratings['asin'] = df_ratings.index" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 19, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "df_ratings.reset_index(drop=True, inplace=True)" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 20, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "df.drop('rating', inplace=True, axis=1)" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 21, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "df_merged = pd.merge(df, df_ratings, on='asin')" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 22, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "df_merged.to_csv('8k_top_granular_audible2.csv')" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [] 549 | } 550 | ], 551 | "metadata": { 552 | "kernelspec": { 553 | "display_name": "Python 3", 554 | "language": "python", 555 | "name": "python3" 556 | }, 557 | "language_info": { 558 | "codemirror_mode": { 559 | "name": "ipython", 560 | "version": 3 561 | }, 562 | "file_extension": ".py", 563 | "mimetype": "text/x-python", 564 | "name": "python", 565 | "nbconvert_exporter": "python", 566 | "pygments_lexer": "ipython3", 567 | "version": "3.7.2" 568 | } 569 | }, 570 | "nbformat": 4, 571 | "nbformat_minor": 2 572 | } 573 | -------------------------------------------------------------------------------- /audible_eda/audible_scraper.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from requests_html import HTMLSession, HTML\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from datetime import datetime\n", 13 | "from threading import Thread\n", 14 | "import time\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "import progressbar\n", 17 | "import warnings\n", 18 | "warnings.filterwarnings(\"ignore\")" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "base_url = 'https://www.audible.com/search?pf_rd_p=7fe4387b-4762-42a8-8d9a-a63254c74bb2&pf_rd_r=C7ENYKDADHMCH4KY12D4&ref=a_search_l1_feature_five_browse-bin_6&feature_six_browse-bin=9178177011&pageSize=50'" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": { 34 | "scrolled": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "def build_dict(items, category, data):\n", 39 | " for item in items:\n", 40 | " text_fields = item.text.split('\\n')\n", 41 | " link = [link for link in item.absolute_links if '/pd/' in link][0]\n", 42 | " dict_entry={\n", 43 | " 'category' : category,\n", 44 | " 'title' : text_fields[0],\n", 45 | " 'link' : link\n", 46 | " }\n", 47 | " try:\n", 48 | " dict_entry['rating_count'] = np.int([s for s in text_fields if 'stars' in s][0].split(\n", 49 | " 'stars ')[1].replace(',',''))\n", 50 | " except: pass\n", 51 | " try:\n", 52 | " dict_entry['narrator'] = [s for s in text_fields if 'Narrated by' in s][0].split(': ')[1]\n", 53 | " except: pass\n", 54 | " try:\n", 55 | " dict_entry['asin'] = [s for s in link.split('/') if 'B0' in s][0].split('?')[0]\n", 56 | " except: pass\n", 57 | " try:\n", 58 | " dict_entry['length'] = [s for s in text_fields if 'Length' in s][0].split(': ')[1]\n", 59 | " except: pass\n", 60 | " try:\n", 61 | " dict_entry['rating'] = np.float([s for s in text_fields if 'stars' in s][-1].split(' out')[0])\n", 62 | " except: pass\n", 63 | " try:\n", 64 | " dict_entry['author'] = [s for s in text_fields if 'By' in s][0].split(': ')[1]\n", 65 | " except: pass\n", 66 | " try:\n", 67 | " dict_entry['price'] = np.float([s for s in text_fields if 'Regular' in s][0].split('$')[1])\n", 68 | " except: pass\n", 69 | " try:\n", 70 | " dict_entry['release_date'] = datetime.strptime([s for s in text_fields if 'Release date:' in s][0].split(\n", 71 | " ': ')[1], '%m-%d-%y')\n", 72 | " except : pass\n", 73 | " data.append(dict_entry)\n", 74 | " return data\n", 75 | "\n", 76 | "def scrape_great_courses(mthreads, category, pages, url_list, data):\n", 77 | " sess = HTMLSession()\n", 78 | " \n", 79 | " for url in url_list:\n", 80 | " try:\n", 81 | " r = sess.get(url)\n", 82 | " except:\n", 83 | " try:\n", 84 | " time.sleep(0.5)\n", 85 | " r = sess.get(url)\n", 86 | " except:\n", 87 | " pass\n", 88 | " \n", 89 | " items = r.html.find('li.bc-list-item.productListItem', first=False)\n", 90 | " \n", 91 | " threads = []\n", 92 | " for j in range(mthreads):\n", 93 | " item_sublist = items[j::mthreads]\n", 94 | " t = Thread(target=build_dict, args=(item_sublist, category, data))\n", 95 | " threads.append(t)\n", 96 | " \n", 97 | " [t.start() for t in threads]\n", 98 | " [t.join() for t in threads]\n", 99 | " \n", 100 | " sess.close()\n", 101 | " return data" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "def scrape_threader(nthreads, mthreads, category, pages, base_url, data=None):\n", 111 | " if data == None:\n", 112 | " data = []\n", 113 | " \n", 114 | " # Create url list\n", 115 | " url_list = []\n", 116 | " for page in range(pages):\n", 117 | " pageurl = base_url + '&page=' + str(page+1)\n", 118 | " url_list.append(pageurl)\n", 119 | " \n", 120 | " # Create threads\n", 121 | " threads = []\n", 122 | " for i in range(nthreads):\n", 123 | " url_sublist = url_list[i::nthreads]\n", 124 | " t = Thread(target=scrape_great_courses, args=(mthreads, category, pages, url_sublist, data))\n", 125 | " threads.append(t)\n", 126 | " \n", 127 | " # Run threads\n", 128 | " [t.start() for t in threads]\n", 129 | " [t.join() for t in threads]\n", 130 | " \n", 131 | " return data" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 5, 137 | "metadata": { 138 | "scrolled": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "def loop_categories(nthreads, mthreads, cat_names, cat_page_nums, cat_links):\n", 143 | " data = []\n", 144 | " \n", 145 | "# widgets = [\n", 146 | "# progressbar.Percentage(), \n", 147 | "# progressbar.Bar(), \n", 148 | "# progressbar.ETA(),\n", 149 | "# progressbar.DynamicMessage('cat')]\n", 150 | "# bar = progressbar.ProgressBar(widgets=widgets, max_value=sum(cat_page_nums)).start()\n", 151 | " \n", 152 | " finished_pages = 0 \n", 153 | " for category, pages, link in zip(cat_names, cat_page_nums, cat_links):\n", 154 | " print('Scraping ', category, '...')\n", 155 | "# bar.update(finished_pages, cat=category)\n", 156 | " data.extend(scrape_threader(nthreads, mthreads, category, pages, link, data=data))\n", 157 | " finished_pages += pages\n", 158 | " \n", 159 | "# bar.finish() \n", 160 | " return data" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 6, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "sess = HTMLSession()\n", 170 | "r = sess.get(base_url)\n", 171 | "\n", 172 | "cat_items = r.html.find('div.bc-col-responsive.bc-col-3')[1].find('ul.bc-list')[0].find('li.bc-list-item')\n", 173 | "cat_names = [item.text.split(' (')[0] for item in cat_items]\n", 174 | "cat_item_nums = [np.int(item.text.split(' (')[1][:-1].replace(',', '')) for item in cat_items]\n", 175 | "cat_page_nums = [np.int(np.ceil(item/50)) for item in cat_item_nums]\n", 176 | "cat_links = [item.absolute_links.pop() + '&pageSize=50' for item in cat_items]\n", 177 | "\n", 178 | "sess.close()" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 7, 184 | "metadata": { 185 | "scrolled": true 186 | }, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "Scraping Classics ...\n", 193 | "Done. Scraped 11042 out of 11043 items at 1.1 pages/s. ETA: 118.2 min.\n", 194 | "Scraping Erotica & Sexuality ...\n", 195 | "Done. Scraped 14405 out of 14405 items at 1.1 pages/s. ETA: 119.0 min.\n", 196 | "Scraping Fiction ...\n", 197 | "Done. Scraped 55564 out of 55614 items at 1.1 pages/s. ETA: 103.4 min.\n", 198 | "Scraping History ...\n" 199 | ] 200 | }, 201 | { 202 | "name": "stderr", 203 | "output_type": "stream", 204 | "text": [ 205 | "Exception in thread Thread-3276:\n", 206 | "Traceback (most recent call last):\n", 207 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\pyquery\\pyquery.py\", line 95, in fromstring\n", 208 | " result = getattr(etree, meth)(context)\n", 209 | " File \"src/lxml/etree.pyx\", line 3213, in lxml.etree.fromstring\n", 210 | " File \"src/lxml/parser.pxi\", line 1877, in lxml.etree._parseMemoryDocument\n", 211 | " File \"src/lxml/parser.pxi\", line 1765, in lxml.etree._parseDoc\n", 212 | " File \"src/lxml/parser.pxi\", line 1127, in lxml.etree._BaseParser._parseDoc\n", 213 | " File \"src/lxml/parser.pxi\", line 601, in lxml.etree._ParserContext._handleParseResultDoc\n", 214 | " File \"src/lxml/parser.pxi\", line 711, in lxml.etree._handleParseResult\n", 215 | " File \"src/lxml/parser.pxi\", line 640, in lxml.etree._raiseParseError\n", 216 | " File \"\", line 1\n", 217 | "lxml.etree.XMLSyntaxError: Document is empty, line 1, column 1\n", 218 | "\n", 219 | "During handling of the above exception, another exception occurred:\n", 220 | "\n", 221 | "Traceback (most recent call last):\n", 222 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\threading.py\", line 917, in _bootstrap_inner\n", 223 | " self.run()\n", 224 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\threading.py\", line 865, in run\n", 225 | " self._target(*self._args, **self._kwargs)\n", 226 | " File \"\", line 52, in scrape_great_courses\n", 227 | " items = r.html.find('li.bc-list-item.productListItem', first=False)\n", 228 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\requests_html.py\", line 654, in html\n", 229 | " self._html = HTML(session=self.session, url=self.url, html=self.content, default_encoding=self.encoding)\n", 230 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\requests_html.py\", line 421, in __init__\n", 231 | " element=PyQuery(html)('html') or PyQuery(f'{html}')('html'),\n", 232 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\pyquery\\pyquery.py\", line 255, in __init__\n", 233 | " elements = fromstring(context, self.parser)\n", 234 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\pyquery\\pyquery.py\", line 99, in fromstring\n", 235 | " result = getattr(lxml.html, meth)(context)\n", 236 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\lxml\\html\\__init__.py\", line 876, in fromstring\n", 237 | " doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)\n", 238 | " File \"C:\\Users\\Toby-PC\\Anaconda3\\lib\\site-packages\\lxml\\html\\__init__.py\", line 765, in document_fromstring\n", 239 | " \"Document is empty\")\n", 240 | "lxml.etree.ParserError: Document is empty\n", 241 | "\n" 242 | ] 243 | }, 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "Done. Scraped 15500 out of 15621 items at 1.1 pages/s. ETA: 96.7 min.\n", 249 | "Scraping Mysteries & Thrillers ...\n", 250 | "Done. Scraped 47392 out of 47401 items at 1.1 pages/s. ETA: 83.2 min.\n", 251 | "Scraping Romance ...\n", 252 | "Done. Scraped 44607 out of 44623 items at 0.8 pages/s. ETA: 89.2 min.\n", 253 | "Scraping Science & Technology ...\n", 254 | "Done. Scraped 13984 out of 13984 items at 1.1 pages/s. ETA: 62.7 min.\n", 255 | "Scraping Sci-Fi & Fantasy ...\n", 256 | "Done. Scraped 34151 out of 34151 items at 1.1 pages/s. ETA: 56.0 min.\n", 257 | "Scraping Self Development ...\n", 258 | "Done. Scraped 44007 out of 44028 items at 1.1 pages/s. ETA: 39.9 min.\n", 259 | "Scraping Comedy ...\n", 260 | "Done. Scraped 5085 out of 5085 items at 1.2 pages/s. ETA: 36.9 min.\n", 261 | "Scraping Newspapers & Magazines ...\n", 262 | "Done. Scraped 10209 out of 10208 items at 1.2 pages/s. ETA: 32.1 min.\n", 263 | "Scraping Nostalgia Radio ...\n", 264 | "Done. Scraped 2053 out of 2104 items at 1.2 pages/s. ETA: 33.1 min.\n", 265 | "Scraping Radio & TV ...\n", 266 | "Done. Scraped 10697 out of 10697 items at 1.2 pages/s. ETA: 29.3 min.\n", 267 | "Scraping Sports ...\n", 268 | "Done. Scraped 3540 out of 3540 items at 1.1 pages/s. ETA: 30.0 min.\n", 269 | "Scraping Travel & Adventure ...\n", 270 | "Done. Scraped 3526 out of 3526 items at 1.2 pages/s. ETA: 28.7 min.\n", 271 | "Scraping Religion & Spirituality ...\n", 272 | "Done. Scraped 20785 out of 20785 items at 1.1 pages/s. ETA: 22.9 min.\n", 273 | "Scraping Nonfiction ...\n", 274 | "Done. Scraped 17573 out of 17573 items at 1.1 pages/s. ETA: 18.1 min.\n", 275 | "Scraping Live Events ...\n", 276 | "Done. Scraped 1071 out of 1071 items at 1.1 pages/s. ETA: 17.7 min.\n", 277 | "Scraping Language Instruction ...\n", 278 | "Done. Scraped 4454 out of 4454 items at 1.2 pages/s. ETA: 15.9 min.\n", 279 | "Scraping Drama & Poetry ...\n", 280 | "Done. Scraped 3512 out of 3512 items at 1.2 pages/s. ETA: 15.0 min.\n", 281 | "Scraping Health & Fitness ...\n", 282 | "Done. Scraped 7899 out of 7899 items at 1.1 pages/s. ETA: 12.9 min.\n", 283 | "Scraping Kids ...\n", 284 | "Done. Scraped 26689 out of 26689 items at 1.2 pages/s. ETA: 5.0 min.\n", 285 | "Scraping Teens ...\n", 286 | "Done. Scraped 17539 out of 17539 items at 1.1 pages/s. ETA: 0.0 min.\n" 287 | ] 288 | } 289 | ], 290 | "source": [ 291 | "for i in range(3, len(cat_names)):\n", 292 | " start=time.time()\n", 293 | " df = pd.DataFrame(data=loop_categories(\n", 294 | " 8, 2, [cat_names[i]], [cat_page_nums[i]], [cat_links[i]]))\n", 295 | " df = df.drop_duplicates()\n", 296 | " df.to_csv('{}.csv'.format(i))\n", 297 | " end = time.time()\n", 298 | " rate = cat_page_nums[i]/(end-start)\n", 299 | " pages_left = np.sum(cat_page_nums[i+1:])\n", 300 | " eta = pages_left/rate/60\n", 301 | " print('Done. Scraped {} out of {} items at {:.1f} pages/s. ETA: {:.1f} min.'.format(\n", 302 | " len(df), cat_item_nums[i], rate, eta))" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "df = pd.read_csv('0.csv')\n", 312 | "for i in range(1, len(cat_names)):\n", 313 | " df = pd.concat([df, pd.read_csv('{}.csv'.format(i))], ignore_index=True)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 16, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "df = df.drop_duplicates()\n", 323 | "df = df.drop('Unnamed: 0', axis=1)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 19, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "df.to_csv('all_english_audible.csv', index=False)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [] 341 | } 342 | ], 343 | "metadata": { 344 | "kernelspec": { 345 | "display_name": "Python 3", 346 | "language": "python", 347 | "name": "python3" 348 | }, 349 | "language_info": { 350 | "codemirror_mode": { 351 | "name": "ipython", 352 | "version": 3 353 | }, 354 | "file_extension": ".py", 355 | "mimetype": "text/x-python", 356 | "name": "python", 357 | "nbconvert_exporter": "python", 358 | "pygments_lexer": "ipython3", 359 | "version": "3.7.2" 360 | } 361 | }, 362 | "nbformat": 4, 363 | "nbformat_minor": 2 364 | } 365 | --------------------------------------------------------------------------------