├── .gitignore ├── Basic HTML.htm ├── LICENSE ├── README.md ├── Web Scraping Tutorial.ipynb └── Web Scraping.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /Basic HTML.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 |10 | WELCOME TO MY BLOG TheMenYouWantToBe. 11 |
12 | 13 | 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc.Here is some simple content for this page.
\\n \\n'" 124 | ] 125 | }, 126 | "execution_count": 21, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "page.content" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "#### You can see we extract the content but it's not look good." 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## We use BeautifulSoup library to parse the document and extract the text in beautiful manner" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 22, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "from bs4 import BeautifulSoup\n", 156 | "soup = BeautifulSoup(page.content,'html.parser')" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 23, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "\n", 168 | "\n", 169 | "\n", 170 | "\n", 171 | "Here is some simple content for this page.
\n", 175 | "\n", 176 | "" 177 | ] 178 | }, 179 | "execution_count": 23, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "soup" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "#### NOW IT'S LOOK BETTER!" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "#### To get even more beautiful we can use prettify method" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 25, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "\n", 212 | "\n", 213 | " \n", 214 | "\n", 220 | " Here is some simple content for this page.\n", 221 | "
\n", 222 | " \n", 223 | "\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "print(soup.prettify())" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "#### Note - If you don't use print() and directly try to print soup.prettify() you will end up with messy text!" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 26, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "'\\n\\n \\n\\n Here is some simple content for this page.\\n
\\n \\n'" 247 | ] 248 | }, 249 | "execution_count": 26, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "soup.prettify()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "#### LIKE THIS ..." 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "### Now, if you want to select all the elements at the top level of the page using the children property of soup. Note that children returns a list generator, so we need to call the list function on it" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 28, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "Here is some simple content for this page.
\n", 306 | " \n", 307 | " ]" 308 | ] 309 | }, 310 | "execution_count": 29, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "list(soup.children)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 30, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/plain": [ 327 | "'html'" 328 | ] 329 | }, 330 | "execution_count": 30, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "list(soup.children)[0]" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 31, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "'\\n'" 348 | ] 349 | }, 350 | "execution_count": 31, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "list(soup.children)[1]" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 33, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "\n", 368 | "\n", 369 | "Here is some simple content for this page.
\n", 373 | "\n", 374 | "" 375 | ] 376 | }, 377 | "execution_count": 33, 378 | "metadata": {}, 379 | "output_type": "execute_result" 380 | } 381 | ], 382 | "source": [ 383 | "list(soup.children)[2]" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 34, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/plain": [ 394 | "[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]" 395 | ] 396 | }, 397 | "execution_count": 34, 398 | "metadata": {}, 399 | "output_type": "execute_result" 400 | } 401 | ], 402 | "source": [ 403 | "# To see the type\n", 404 | "[type(item) for item in list(soup.children)]" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 35, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "html = list(soup.children)[2]" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 37, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/plain": [ 424 | "['\\n', \n", 425 | "Here is some simple content for this page.
\n", 428 | " , '\\n']" 429 | ] 430 | }, 431 | "execution_count": 37, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "list(html.children)" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "#### As you can see above, there are two tags here, head, and body. We want to extract the text inside the p tag, so we'll dive into the body" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 47, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "body = body = list(html.children)[3]" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 48, 459 | "metadata": {}, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "text/plain": [ 464 | "\n", 465 | "Here is some simple content for this page.
\n", 466 | "" 467 | ] 468 | }, 469 | "execution_count": 48, 470 | "metadata": {}, 471 | "output_type": "execute_result" 472 | } 473 | ], 474 | "source": [ 475 | "body" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 49, 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "['\\n',Here is some simple content for this page.
, '\\n']" 487 | ] 488 | }, 489 | "execution_count": 49, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | "list(body.children)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 50, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "data": { 505 | "text/plain": [ 506 | "Here is some simple content for this page.
" 507 | ] 508 | }, 509 | "execution_count": 50, 510 | "metadata": {}, 511 | "output_type": "execute_result" 512 | } 513 | ], 514 | "source": [ 515 | "list(body.children)[1]" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 51, 521 | "metadata": {}, 522 | "outputs": [ 523 | { 524 | "data": { 525 | "text/plain": [ 526 | "'Here is some simple content for this page.'" 527 | ] 528 | }, 529 | "execution_count": 51, 530 | "metadata": {}, 531 | "output_type": "execute_result" 532 | } 533 | ], 534 | "source": [ 535 | "p = list(body.children)[1]\n", 536 | "p.get_text()" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "#### We can use the get_text method to extract all of the text inside the tag" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "### What we did above was useful for figuring out how to navigate a page, but it took a lot of commands to do something fairly simple. If we want to extract a single tag, we can instead use the find_all method, which will find all the instances of a tag on a page" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 52, 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "data": { 560 | "text/plain": [ 561 | "[Here is some simple content for this page.
]" 562 | ] 563 | }, 564 | "execution_count": 52, 565 | "metadata": {}, 566 | "output_type": "execute_result" 567 | } 568 | ], 569 | "source": [ 570 | "soup = BeautifulSoup(page.content, 'html.parser')\n", 571 | "soup.find_all('p')" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 53, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "# Note it returns the list so we use list indexing" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 54, 586 | "metadata": {}, 587 | "outputs": [ 588 | { 589 | "data": { 590 | "text/plain": [ 591 | "Here is some simple content for this page.
" 592 | ] 593 | }, 594 | "execution_count": 54, 595 | "metadata": {}, 596 | "output_type": "execute_result" 597 | } 598 | ], 599 | "source": [ 600 | "soup.find_all('p')[0]" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 55, 606 | "metadata": {}, 607 | "outputs": [ 608 | { 609 | "data": { 610 | "text/plain": [ 611 | "'Here is some simple content for this page.'" 612 | ] 613 | }, 614 | "execution_count": 55, 615 | "metadata": {}, 616 | "output_type": "execute_result" 617 | } 618 | ], 619 | "source": [ 620 | "soup.find_all('p')[0].get_text()" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "#### You can also use find()" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 56, 633 | "metadata": {}, 634 | "outputs": [ 635 | { 636 | "data": { 637 | "text/plain": [ 638 | "Here is some simple content for this page.
" 639 | ] 640 | }, 641 | "execution_count": 56, 642 | "metadata": {}, 643 | "output_type": "execute_result" 644 | } 645 | ], 646 | "source": [ 647 | "soup.find('p')" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 57, 653 | "metadata": {}, 654 | "outputs": [ 655 | { 656 | "data": { 657 | "text/plain": [ 658 | "'Here is some simple content for this page.'" 659 | ] 660 | }, 661 | "execution_count": 57, 662 | "metadata": {}, 663 | "output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "soup.find('p').get_text()" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": {}, 673 | "source": [ 674 | "#### Now, you have a good idea how to do web scraping. I will highly recommend you to check out how web scraping done in web pages using python. Refer to this - https://themenyouwanttobe.wordpress.com " 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": null, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [] 683 | } 684 | ], 685 | "metadata": { 686 | "kernelspec": { 687 | "display_name": "Python 3", 688 | "language": "python", 689 | "name": "python3" 690 | }, 691 | "language_info": { 692 | "codemirror_mode": { 693 | "name": "ipython", 694 | "version": 3 695 | }, 696 | "file_extension": ".py", 697 | "mimetype": "text/x-python", 698 | "name": "python", 699 | "nbconvert_exporter": "python", 700 | "pygments_lexer": "ipython3", 701 | "version": "3.6.5" 702 | } 703 | }, 704 | "nbformat": 4, 705 | "nbformat_minor": 2 706 | } 707 | -------------------------------------------------------------------------------- /Web Scraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Import the Library" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from bs4 import BeautifulSoup\n", 17 | "import requests" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "#### Download the web page containing the forecast." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "page = requests.get(\"http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "#### Create a BeautifulSoup class to parse the page." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "soup = BeautifulSoup(page.content, 'html.parser')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "#### Find the div with id seven-day-forecast, and assign to seven_day" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "seven_day = soup.find(id=\"seven-day-forecast\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "#### Inside seven_day, find each individual forecast item." 
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "forecast_items = seven_day.find_all(class_=\"tombstone-container\")" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "#### Extract and print the first forecast item." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "\n",
102 | " Today\n",
103 | "
\n",
104 | "
\n",
105 | "
\n",
107 | " \n",
108 | "
\n", 110 | " Sunny\n", 111 | "
\n", 112 | "\n", 113 | " High: 69 °F\n", 114 | "
\n", 115 | "\n", 305 | " | period | \n", 306 | "short_desc | \n", 307 | "temp | \n", 308 | "desc | \n", 309 | "
---|---|---|---|---|
0 | \n", 314 | "Today | \n", 315 | "Sunny | \n", 316 | "High: 69 °F | \n", 317 | "Today: Sunny, with a high near 69. West wind 1... | \n", 318 | "
1 | \n", 321 | "Tonight | \n", 322 | "Partly Cloudy | \n", 323 | "Low: 55 °F | \n", 324 | "Tonight: Partly cloudy, with a low around 55. ... | \n", 325 | "
2 | \n", 328 | "Tuesday | \n", 329 | "Mostly Sunny | \n", 330 | "High: 67 °F | \n", 331 | "Tuesday: Mostly sunny, with a high near 67. We... | \n", 332 | "
3 | \n", 335 | "TuesdayNight | \n", 336 | "Partly Cloudy | \n", 337 | "Low: 54 °F | \n", 338 | "Tuesday Night: Partly cloudy, with a low aroun... | \n", 339 | "
4 | \n", 342 | "Wednesday | \n", 343 | "Mostly Sunny | \n", 344 | "High: 68 °F | \n", 345 | "Wednesday: Mostly sunny, with a high near 68. ... | \n", 346 | "
5 | \n", 349 | "WednesdayNight | \n", 350 | "Partly Cloudy | \n", 351 | "Low: 55 °F | \n", 352 | "Wednesday Night: Partly cloudy, with a low aro... | \n", 353 | "
6 | \n", 356 | "Thursday | \n", 357 | "Mostly Sunny | \n", 358 | "High: 68 °F | \n", 359 | "Thursday: Mostly sunny, with a high near 68. | \n", 360 | "
7 | \n", 363 | "ThursdayNight | \n", 364 | "Partly Cloudy | \n", 365 | "Low: 56 °F | \n", 366 | "Thursday Night: Partly cloudy, with a low arou... | \n", 367 | "
8 | \n", 370 | "Friday | \n", 371 | "Mostly Sunny | \n", 372 | "High: 70 °F | \n", 373 | "Friday: Mostly sunny, with a high near 70. | \n", 374 | "
\n", 534 | " | period | \n", 535 | "short_desc | \n", 536 | "temp | \n", 537 | "desc | \n", 538 | "temp_num | \n", 539 | "is_night | \n", 540 | "
---|---|---|---|---|---|---|
1 | \n", 545 | "Tonight | \n", 546 | "Partly Cloudy | \n", 547 | "Low: 55 °F | \n", 548 | "Tonight: Partly cloudy, with a low around 55. ... | \n", 549 | "55 | \n", 550 | "True | \n", 551 | "
3 | \n", 554 | "TuesdayNight | \n", 555 | "Partly Cloudy | \n", 556 | "Low: 54 °F | \n", 557 | "Tuesday Night: Partly cloudy, with a low aroun... | \n", 558 | "54 | \n", 559 | "True | \n", 560 | "
5 | \n", 563 | "WednesdayNight | \n", 564 | "Partly Cloudy | \n", 565 | "Low: 55 °F | \n", 566 | "Wednesday Night: Partly cloudy, with a low aro... | \n", 567 | "55 | \n", 568 | "True | \n", 569 | "
7 | \n", 572 | "ThursdayNight | \n", 573 | "Partly Cloudy | \n", 574 | "Low: 56 °F | \n", 575 | "Thursday Night: Partly cloudy, with a low arou... | \n", 576 | "56 | \n", 577 | "True | \n", 578 | "