├── .gitignore ├── 00_GET_POST ├── 00_GET_movie_board.ipynb ├── 01_POST_thsrc_time_table.ipynb ├── 02_google_search_result.ipynb └── get_post_diff.ipynb ├── 01_files_website ├── 00_image_crawling.ipynb ├── 01_image_crawling_and_check_format.ipynb ├── 02_file_crawling.ipynb ├── 03_website_crawling.ipynb ├── 04_image_crawling_check_last_modified.ipynb └── 05_website_crawling_valid_URL.ipynb ├── 02_selenium ├── 00_selenium_crawling_render_image.ipynb ├── 01_pchome_crawling_item.ipynb ├── 02_selenium_google_search.ipynb └── 03_crawling_reCAPTCHA_image.ipynb ├── 03_graph_api ├── 00_facebook_crawling_article_comments.ipynb ├── 01_facebook_crawling_fanpage_likes_shares.ipynb └── 02_facebook_crawling_article_all.ipynb ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── README.md └── appendix_ptt ├── 00_parse_article.ipynb ├── 01_search_api_by_title.ipynb ├── 02_today_articles.ipynb ├── 03_crawl_image.ipynb └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /00_GET_POST/00_GET_movie_board.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 http://www.boxofficemojo.com/yearly/ 並撰寫爬蟲程式\n", 10 | "- 抓取每年度冠軍排行榜\n", 11 | "- 使用 requests + BeautifulSoup 實作\n", 12 | "- 透過 pandas 輸出成 csv" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import requests\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from bs4 import BeautifulSoup\n", 28 | "\n", 29 | "url = 'http://www.boxofficemojo.com/yearly/'" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | 
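
Note on the exercise above: when the target is a single well-formed HTML table, as on this yearly chart page, pandas can parse it directly instead of walking the rows with BeautifulSoup. A minimal sketch, assuming the same URL as in the notebook (the site layout may have changed since this was written) and that lxml is installed for pandas.read_html; it is not a drop-in replacement for the notebook's cells:

import requests
import pandas as pd

resp = requests.get('http://www.boxofficemojo.com/yearly/', timeout=10)
resp.raise_for_status()

# parse only the table whose cellspacing attribute is "1" (the same table the
# notebook selects with BeautifulSoup) and treat its first row as the header
tables = pd.read_html(resp.text, attrs={'cellspacing': '1'}, header=0)
df = tables[0]
df.to_csv('boxofficemojo.csv', index=False)
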
"metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "resp = requests.get(url)\n", 41 | "resp.encoding = 'utf-8'\n", 42 | "soup = BeautifulSoup(resp.text, 'lxml')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "['Year',\n", 55 | " 'TotalGross*',\n", 56 | " 'Change',\n", 57 | " 'TicketsSold',\n", 58 | " 'Change',\n", 59 | " '# ofMovies',\n", 60 | " 'TotalScreens',\n", 61 | " 'Avg.TicketPrice',\n", 62 | " 'Avg.Cost^',\n", 63 | " '#1 Movie']\n" 64 | ] 65 | }, 66 | { 67 | "data": { 68 | "text/html": [ 69 | "
\n", 70 | "\n", 83 | "\n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " 
\n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | "
YearTotalGross*ChangeTicketsSoldChange# ofMoviesTotalScreensAvg.TicketPriceAvg.Cost^#1 Movie
02018$4,310.3-470.6-264-$9.16-Black Panther
12017$11,071.9-2.7%1,234.3-6.2%738-$8.97-Star Wars: The Last Jedi
22016$11,377.7+2.2%1,315.3-0.4%736-$8.65-Rogue One
32015$11,129.4+7.4%1,320.2+4.1%705-$8.43-Star Wars: The Force Awakens
42014$10,361.2-5.2%1,268.2-5.6%706-$8.17-American Sniper
52013$10,924.6+0.8%1,343.7-1.3%688-$8.13-Catching Fire
62012$10,837.6+6.5%1,361.5+6.1%669-$7.96-The Avengers
72011$10,174.2-3.7%1,283.0-4.2%602-$7.93-Harry Potter / Deathly Hallows (P2)
82010$10,565.6-0.3%1,339.1-5.2%537-$7.89-Toy Story 3
92009$10,595.5+10.0%1,412.7+5.3%521-$7.50-Avatar
102008$9,630.7-0.3%1,341.3-4.5%607-$7.18-The Dark Knight
112007$9,663.8+4.9%1,404.6-0.1%631-$6.88-Spider-Man 3
122006$9,209.5+4.2%1,406.0+2.0%608-$6.55-Dead Man's Chest
132005$8,840.5-5.8%1,379.2-8.7%547-$6.41-Revenge of the Sith
142004$9,380.5+1.5%1,510.5-1.4%551-$6.21-Shrek 2
152003$9,239.7+0.9%1,532.3-2.8%506-$6.03$63.8Return of the King
162002$9,155.1+8.8%1,575.7+6.0%48035,592$5.81$58.8Spider-Man
172001$8,412.5+9.8%1,487.3+4.7%48236,764$5.66$47.7Harry Potter / Sorcerer's Stone
182000$7,661.0+2.9%1,420.8-3.0%47837,396$5.39$54.8The Grinch
191999$7,448.0+7.2%1,465.2-1.1%46137,185$5.08$51.5The Phantom Menace
201998$6,949.0+9.2%1,480.7+6.7%50934,186$4.69$52.7Saving Private Ryan
211997$6,365.9+7.7%1,387.7+3.7%51031,640$4.59$53.4Titanic
221996$5,911.5+7.6%1,338.6+6.0%47129,690$4.42$39.8Independence Day
231995$5,493.5+1.8%1,262.6-2.3%41127,805$4.35$36.4Toy Story
241994$5,396.2+4.7%1,291.7+3.8%45326,586$4.18$34.3Forrest Gump
251993$5,154.2+5.8%1,244.0+6.0%46225,737$4.14$29.9Jurassic Park
261992$4,871.0+1.4%1,173.2+2.9%48025,105$4.15$28.9Aladdin
271991$4,803.2-4.4%1,140.6-4.0%45824,570$4.21$26.1Terminator 2
281990$5,021.8-0.2%1,188.6-5.9%41023,689$4.23$26.8Home Alone
291989$5,033.4+12.9%1,262.8+16.4%50223,132$3.97$23.5Batman
301988$4,458.4+4.8%1,084.8-0.3%51023,234$4.11$18.1Rain Man
311987$4,252.9+12.6%1,088.5+7.0%50923,555$3.91$20.1Three Men and a Baby
321986$3,778.0+0.8%1,017.2-3.7%45122,765$3.71$17.5Top Gun
331985$3,749.2-7.0%1,056.1-11.9%47021,147$3.55$16.8Back to the Future
341984$4,031.0+7.0%1,199.0+0.2%53620,200$3.36$14.4Beverly Hills Cop
351983$3,766.0+9.1%1,197.0+1.9%49518,884$3.15$11.9Return of the Jedi
361982$3,453.0+16.4%1,175.0+10.1%42818,020$2.94$11.8E.T.
371981$2,966.0+7.9%1,067.0+4.4%17318,040$2.78$11.3Raiders / Lost Ark
381980$2,749.0-1,022.0-16117,590$2.69$9.4The Empire Strikes Back
\n", 609 | "
" 610 | ], 611 | "text/plain": [ 612 | " Year TotalGross* Change TicketsSold Change # ofMovies TotalScreens \\\n", 613 | "0 2018 $4,310.3 - 470.6 - 264 - \n", 614 | "1 2017 $11,071.9 -2.7% 1,234.3 -6.2% 738 - \n", 615 | "2 2016 $11,377.7 +2.2% 1,315.3 -0.4% 736 - \n", 616 | "3 2015 $11,129.4 +7.4% 1,320.2 +4.1% 705 - \n", 617 | "4 2014 $10,361.2 -5.2% 1,268.2 -5.6% 706 - \n", 618 | "5 2013 $10,924.6 +0.8% 1,343.7 -1.3% 688 - \n", 619 | "6 2012 $10,837.6 +6.5% 1,361.5 +6.1% 669 - \n", 620 | "7 2011 $10,174.2 -3.7% 1,283.0 -4.2% 602 - \n", 621 | "8 2010 $10,565.6 -0.3% 1,339.1 -5.2% 537 - \n", 622 | "9 2009 $10,595.5 +10.0% 1,412.7 +5.3% 521 - \n", 623 | "10 2008 $9,630.7 -0.3% 1,341.3 -4.5% 607 - \n", 624 | "11 2007 $9,663.8 +4.9% 1,404.6 -0.1% 631 - \n", 625 | "12 2006 $9,209.5 +4.2% 1,406.0 +2.0% 608 - \n", 626 | "13 2005 $8,840.5 -5.8% 1,379.2 -8.7% 547 - \n", 627 | "14 2004 $9,380.5 +1.5% 1,510.5 -1.4% 551 - \n", 628 | "15 2003 $9,239.7 +0.9% 1,532.3 -2.8% 506 - \n", 629 | "16 2002 $9,155.1 +8.8% 1,575.7 +6.0% 480 35,592 \n", 630 | "17 2001 $8,412.5 +9.8% 1,487.3 +4.7% 482 36,764 \n", 631 | "18 2000 $7,661.0 +2.9% 1,420.8 -3.0% 478 37,396 \n", 632 | "19 1999 $7,448.0 +7.2% 1,465.2 -1.1% 461 37,185 \n", 633 | "20 1998 $6,949.0 +9.2% 1,480.7 +6.7% 509 34,186 \n", 634 | "21 1997 $6,365.9 +7.7% 1,387.7 +3.7% 510 31,640 \n", 635 | "22 1996 $5,911.5 +7.6% 1,338.6 +6.0% 471 29,690 \n", 636 | "23 1995 $5,493.5 +1.8% 1,262.6 -2.3% 411 27,805 \n", 637 | "24 1994 $5,396.2 +4.7% 1,291.7 +3.8% 453 26,586 \n", 638 | "25 1993 $5,154.2 +5.8% 1,244.0 +6.0% 462 25,737 \n", 639 | "26 1992 $4,871.0 +1.4% 1,173.2 +2.9% 480 25,105 \n", 640 | "27 1991 $4,803.2 -4.4% 1,140.6 -4.0% 458 24,570 \n", 641 | "28 1990 $5,021.8 -0.2% 1,188.6 -5.9% 410 23,689 \n", 642 | "29 1989 $5,033.4 +12.9% 1,262.8 +16.4% 502 23,132 \n", 643 | "30 1988 $4,458.4 +4.8% 1,084.8 -0.3% 510 23,234 \n", 644 | "31 1987 $4,252.9 +12.6% 1,088.5 +7.0% 509 23,555 \n", 645 | "32 1986 $3,778.0 +0.8% 1,017.2 -3.7% 451 22,765 \n", 646 | "33 1985 $3,749.2 -7.0% 1,056.1 -11.9% 470 21,147 \n", 647 | "34 1984 $4,031.0 +7.0% 1,199.0 +0.2% 536 20,200 \n", 648 | "35 1983 $3,766.0 +9.1% 1,197.0 +1.9% 495 18,884 \n", 649 | "36 1982 $3,453.0 +16.4% 1,175.0 +10.1% 428 18,020 \n", 650 | "37 1981 $2,966.0 +7.9% 1,067.0 +4.4% 173 18,040 \n", 651 | "38 1980 $2,749.0 - 1,022.0 - 161 17,590 \n", 652 | "\n", 653 | " Avg.TicketPrice Avg.Cost^ #1 Movie \n", 654 | "0 $9.16 - Black Panther \n", 655 | "1 $8.97 - Star Wars: The Last Jedi \n", 656 | "2 $8.65 - Rogue One \n", 657 | "3 $8.43 - Star Wars: The Force Awakens \n", 658 | "4 $8.17 - American Sniper \n", 659 | "5 $8.13 - Catching Fire \n", 660 | "6 $7.96 - The Avengers \n", 661 | "7 $7.93 - Harry Potter / Deathly Hallows (P2) \n", 662 | "8 $7.89 - Toy Story 3 \n", 663 | "9 $7.50 - Avatar \n", 664 | "10 $7.18 - The Dark Knight \n", 665 | "11 $6.88 - Spider-Man 3 \n", 666 | "12 $6.55 - Dead Man's Chest \n", 667 | "13 $6.41 - Revenge of the Sith \n", 668 | "14 $6.21 - Shrek 2 \n", 669 | "15 $6.03 $63.8 Return of the King \n", 670 | "16 $5.81 $58.8 Spider-Man \n", 671 | "17 $5.66 $47.7 Harry Potter / Sorcerer's Stone \n", 672 | "18 $5.39 $54.8 The Grinch \n", 673 | "19 $5.08 $51.5 The Phantom Menace \n", 674 | "20 $4.69 $52.7 Saving Private Ryan \n", 675 | "21 $4.59 $53.4 Titanic \n", 676 | "22 $4.42 $39.8 Independence Day \n", 677 | "23 $4.35 $36.4 Toy Story \n", 678 | "24 $4.18 $34.3 Forrest Gump \n", 679 | "25 $4.14 $29.9 Jurassic Park \n", 680 | "26 $4.15 $28.9 Aladdin \n", 681 | "27 $4.21 $26.1 Terminator 2 \n", 
682 | "28 $4.23 $26.8 Home Alone \n", 683 | "29 $3.97 $23.5 Batman \n", 684 | "30 $4.11 $18.1 Rain Man \n", 685 | "31 $3.91 $20.1 Three Men and a Baby \n", 686 | "32 $3.71 $17.5 Top Gun \n", 687 | "33 $3.55 $16.8 Back to the Future \n", 688 | "34 $3.36 $14.4 Beverly Hills Cop \n", 689 | "35 $3.15 $11.9 Return of the Jedi \n", 690 | "36 $2.94 $11.8 E.T. \n", 691 | "37 $2.78 $11.3 Raiders / Lost Ark \n", 692 | "38 $2.69 $9.4 The Empire Strikes Back " 693 | ] 694 | }, 695 | "execution_count": 3, 696 | "metadata": {}, 697 | "output_type": "execute_result" 698 | } 699 | ], 700 | "source": [ 701 | "table = soup.find('table', attrs={'cellspacing': '1'})\n", 702 | "rows = table.find_all('tr')\n", 703 | "from pprint import pprint\n", 704 | "\n", 705 | "colname = rows.pop(0)\n", 706 | "colname = [i.text for i in colname]\n", 707 | "rows = [list(row.stripped_strings) for row in rows]\n", 708 | "\n", 709 | "df = pd.DataFrame(rows, columns=colname)\n", 710 | "df" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 4, 716 | "metadata": {}, 717 | "outputs": [ 718 | { 719 | "name": "stdout", 720 | "output_type": "stream", 721 | "text": [ 722 | "Save csv to /home/dirl/github/Python-Crawling-Tutorial/results/boxofficemojo.csv\n" 723 | ] 724 | } 725 | ], 726 | "source": [ 727 | "results = os.path.abspath('../results')\n", 728 | "if not os.path.exists(results):\n", 729 | " os.makedirs(results)\n", 730 | "\n", 731 | "filename = os.path.join(results, 'boxofficemojo.csv')\n", 732 | "df.to_csv(filename, index=False)\n", 733 | "print('Save csv to {}'.format(filename))" 734 | ] 735 | } 736 | ], 737 | "metadata": { 738 | "kernelspec": { 739 | "display_name": "Python 3", 740 | "language": "python", 741 | "name": "python3" 742 | }, 743 | "language_info": { 744 | "codemirror_mode": { 745 | "name": "ipython", 746 | "version": 3 747 | }, 748 | "file_extension": ".py", 749 | "mimetype": "text/x-python", 750 | "name": "python", 751 | "nbconvert_exporter": "python", 752 | "pygments_lexer": "ipython3", 753 | "version": "3.5.2" 754 | } 755 | }, 756 | "nbformat": 4, 757 | "nbformat_minor": 2 758 | } 759 | -------------------------------------------------------------------------------- /00_GET_POST/01_POST_thsrc_time_table.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://www.thsrc.com.tw/tw/TimeTable/SearchResult 並撰寫爬蟲程式\n", 10 | "- 抓取一個禮拜後的高鐵時刻表\n", 11 | "- 台北到台南下午兩點的班次\n", 12 | "- 使用 requests + BeautifulSoup 實作\n", 13 | "- 透過 pandas 輸出成 csv" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import os\n", 23 | "import requests\n", 24 | "import pandas as pd\n", 25 | "import datetime\n", 26 | "\n", 27 | "from bs4 import BeautifulSoup\n", 28 | "\n", 29 | "url = 'https://www.thsrc.com.tw/tw/TimeTable/SearchResult'" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "The date after one week - 2018/02/28\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "after_one_week = datetime.datetime.now() + datetime.timedelta(weeks=1)\n", 47 | "after_one_week_format = after_one_week.strftime('%Y/%m/%d')\n", 48 | "print('The date after one week - {}'.format(after_one_week_format))\n", 49 | "\n", 50 | "form_data = {\n", 51 | " 
'StartStation': '977abb69-413a-4ccf-a109-0272c24fd490',\n", 52 | " 'EndStation': '9c5ac6ca-ec89-48f8-aab0-41b738cb1814',\n", 53 | " 'SearchDate': after_one_week_format,\n", 54 | " 'SearchTime': '14:00',\n", 55 | " 'SearchWay': 'DepartureInMandarin',\n", 56 | " 'RestTime': '',\n", 57 | " 'EarlyOrLater': ''\n", 58 | "}" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "resp = requests.post(url, data=form_data)\n", 68 | "resp.encoding = 'utf-8'\n", 69 | "soup = BeautifulSoup(resp.text, 'lxml')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/html": [ 80 | "
\n", 81 | "\n", 94 | "\n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | "
車次出發時間抵達時間行車時間早鳥
0083314:1116:1102:008折起
1065114:4616:3201:46
2083715:1117:1102:008折起
3065715:4617:3201:46
4084116:1118:1102:0065折起
5066116:2118:0601:458折起
6066316:4618:3201:46
7084517:1119:1102:0065折起
8066717:2119:0601:458折起
9066917:4619:3201:46
\n", 188 | "
" 189 | ], 190 | "text/plain": [ 191 | " 車次 出發時間 抵達時間 行車時間 早鳥\n", 192 | "0 0833 14:11 16:11 02:00 8折起\n", 193 | "1 0651 14:46 16:32 01:46 \n", 194 | "2 0837 15:11 17:11 02:00 8折起\n", 195 | "3 0657 15:46 17:32 01:46 \n", 196 | "4 0841 16:11 18:11 02:00 65折起\n", 197 | "5 0661 16:21 18:06 01:45 8折起\n", 198 | "6 0663 16:46 18:32 01:46 \n", 199 | "7 0845 17:11 19:11 02:00 65折起\n", 200 | "8 0667 17:21 19:06 01:45 8折起\n", 201 | "9 0669 17:46 19:32 01:46 " 202 | ] 203 | }, 204 | "execution_count": 4, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "rows = soup.table.find_all('tr', recursive=False)\n", 211 | "\n", 212 | "colname, rows = rows[1], rows[2:]\n", 213 | "colname = list(colname.stripped_strings)\n", 214 | "\n", 215 | "for i, row in enumerate(rows):\n", 216 | " trips = row.find('td', class_='column1')\n", 217 | " t_departure = row.find('td', class_='column3')\n", 218 | " t_arrive = row.find('td', class_='column4')\n", 219 | " duration = row.find('td', class_='column2')\n", 220 | " early_ticket = row.find('td', class_='Width1')\n", 221 | " \n", 222 | " trips = trips.text if trips else None\n", 223 | " t_departure = t_departure.text if t_departure else ''\n", 224 | " t_arrive = t_arrive.text if t_arrive else ''\n", 225 | " duration = duration.text if duration else ''\n", 226 | " early_ticket = list(early_ticket.stripped_strings) if early_ticket else ''\n", 227 | " early_ticket = early_ticket[0] if early_ticket else ''\n", 228 | " \n", 229 | " rows[i] = [trips, t_departure, t_arrive, duration, early_ticket]\n", 230 | "\n", 231 | "df = pd.DataFrame(rows, columns=colname)\n", 232 | "df" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 5, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "Save csv to /home/afun/github/Python-Crawling-Tutorial/results/thsrc_20180228.csv\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "results = os.path.abspath('../results')\n", 250 | "if not os.path.exists(results):\n", 251 | " os.makedirs(results)\n", 252 | "\n", 253 | "filename = os.path.join(results, 'thsrc_{}.csv'.format(after_one_week.strftime('%Y%m%d')))\n", 254 | "df.to_csv(filename, index=False)\n", 255 | "print('Save csv to {}'.format(filename))" 256 | ] 257 | } 258 | ], 259 | "metadata": { 260 | "kernelspec": { 261 | "display_name": "Python 3", 262 | "language": "python", 263 | "name": "python3" 264 | }, 265 | "language_info": { 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 3 269 | }, 270 | "file_extension": ".py", 271 | "mimetype": "text/x-python", 272 | "name": "python", 273 | "nbconvert_exporter": "python", 274 | "pygments_lexer": "ipython3", 275 | "version": "3.5.2" 276 | } 277 | }, 278 | "nbformat": 4, 279 | "nbformat_minor": 2 280 | } 281 | -------------------------------------------------------------------------------- /00_GET_POST/02_google_search_result.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 爬取 google 搜尋結果的第一個頁面標題\n", 8 | "\n", 9 | "- 練習使用 beautifulsoup css selector" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import requests\n", 19 | "\n", 20 | "from bs4 import BeautifulSoup\n", 21 | "from urllib3.exceptions import HTTPError\n", 22 | "from urllib.parse import urljoin" 23 | ] 
24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "base_url = 'https://www.google.com.tw/search'\n", 32 | "query = {'q': 'python'}" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "https://www.google.com.tw/search?q=python\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "try:\n", 50 | " resp = requests.get(base_url, params=query)\n", 51 | " soup = BeautifulSoup(resp.text, 'lxml')\n", 52 | " print(resp.url)\n", 53 | "except HTTPError as err:\n", 54 | " print(err)\n", 55 | "except AttributeError as err:\n", 56 | " print(err)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# print(soup.prettify())" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 5, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Welcome to Python.org\n", 78 | "https://www.google.com.tw/url?q=https://www.python.org/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFggoMAA&usg=AOvVaw348GGzSkqgB-FXPinUSErY\n", 79 | "=======================================================================================\n", 80 | "Download Python | Python.org\n", 81 | "https://www.google.com.tw/url?q=https://www.python.org/downloads/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFggzMAE&usg=AOvVaw2UHusa0FkZGKEoJRjlxYza\n", 82 | "=======================================================================================\n", 83 | "Python - 維基百科,自由的百科全書 - Wikipedia\n", 84 | "https://www.google.com.tw/url?q=https://zh.wikipedia.org/zh-tw/Python&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFgg5MAI&usg=AOvVaw1gsx_ugnMzjTP2nlH7zARm\n", 85 | "=======================================================================================\n", 86 | "一小時Python入門-part 1 - - 寫點科普\n", 87 | "https://www.google.com.tw/url?q=https://kopu.chat/2017/01/18/%25E4%25B8%2580%25E5%25B0%258F%25E6%2599%2582python%25E5%2585%25A5%25E9%2596%2580-part-1/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghEMAM&usg=AOvVaw1BLo112Hj6BBWauFDpnbQN\n", 88 | "=======================================================================================\n", 89 | "課程介紹- 成為python數據分析達人的第一課(自學課程) | 政治大學磨 ...\n", 90 | "https://www.google.com.tw/url?q=http://moocs.nccu.edu.tw/course/123&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghOMAQ&usg=AOvVaw3RXTAa5ochrAyo-2evVdhI\n", 91 | "=======================================================================================\n", 92 | "《經濟學人》專文探討:「為什麼Python 是世上最屌的程式語言 ...\n", 93 | "https://www.google.com.tw/url?q=https://buzzorange.com/techorange/2018/08/01/python-a-skr-language/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghTMAU&usg=AOvVaw2yA2hrrl61qBKnKoEeeTix\n", 94 | "=======================================================================================\n", 95 | "Python 入門| Django Girls Taipei\n", 96 | "https://www.google.com.tw/url?q=http://djangogirlstaipei.herokuapp.com/tutorials/python/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghZMAY&usg=AOvVaw0ha-itZMKnVgaSsRQlcutt\n", 97 | "=======================================================================================\n", 98 | "Python Tutorial: Learn Python For Free | Codecademy\n", 99 | 
"https://www.google.com.tw/url?q=https://www.codecademy.com/learn/learn-python&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghfMAc&usg=AOvVaw09DURYBaIbVzO6GSXKb0gH\n", 100 | "=======================================================================================\n", 101 | "Python Tutorial - W3Schools\n", 102 | "https://www.google.com.tw/url?q=https://www.w3schools.com/python/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghlMAg&usg=AOvVaw2kfYHx2obM5EhxwIrSMn-4\n", 103 | "=======================================================================================\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "search_results = soup.select('div.g > h3.r > a[href^=\"/url\"]')\n", 109 | "for search_item in search_results:\n", 110 | " print(search_item.text)\n", 111 | " print(urljoin(base_url, search_item['href']))\n", 112 | " print('='*87)" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.6.6" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 2 137 | } 138 | -------------------------------------------------------------------------------- /00_GET_POST/get_post_diff.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 觀察 GET / POST 的差別\n", 8 | "\n", 9 | "透過 postman 網站的測試觀察 GET 與 POST 之間的差別\n", 10 | "\n", 11 | "- https://docs.postman-echo.com/" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import requests\n", 21 | "from pprint import pformat" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## GET request\n", 29 | "\n", 30 | "- 觀察回傳的內容\n", 31 | "- 觀察 URL" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "get_url = 'https://postman-echo.com/get'\n", 41 | "query = {\n", 42 | " 'name': 'afun',\n", 43 | " 'msg': 'A Foolish Consistency is the Hobgoblin of Little Minds'\n", 44 | "}\n", 45 | "\n", 46 | "get_resp = requests.get(get_url, params=query)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "response text ('{\"args\":{\"name\":\"afun\",\"msg\":\"A Foolish Consistency is the Hobgoblin of '\n", 59 | " 'Little '\n", 60 | " 'Minds\"},\"headers\":{\"host\":\"postman-echo.com\",\"accept\":\"*/*\",\"accept-encoding\":\"gzip, '\n", 61 | " 'deflate\",\"user-agent\":\"python-requests/2.19.1\",\"x-forwarded-port\":\"443\",\"x-forwarded-proto\":\"https\"},\"url\":\"https://postman-echo.com/get?name=afun&msg=A+Foolish+Consistency+is+the+Hobgoblin+of+Little+Minds\"}')\n", 62 | "=======================================================================================\n", 63 | "original URL - https://postman-echo.com/get\n", 64 | "GET URL - https://postman-echo.com/get?name=afun&msg=A+Foolish+Consistency+is+the+Hobgoblin+of+Little+Minds\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "print('response text', 
pformat(get_resp.text))\n", 70 | "print('='*87)\n", 71 | "print('original URL -', get_url)\n", 72 | "print('GET URL -', get_resp.url)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## POST request\n", 80 | "\n", 81 | "- 觀察回傳的內容\n", 82 | "- 觀察 URL" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "post_url = 'https://postman-echo.com/post'\n", 92 | "payload = 'A Foolish Consistency is the Hobgoblin of Little Minds'\n", 93 | "\n", 94 | "post_resp = requests.post(post_url, data=payload)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "response text ('{\"args\":{},\"data\":{},\"files\":{},\"form\":{},\"headers\":{\"host\":\"postman-echo.com\",\"content-length\":\"54\",\"accept\":\"*/*\",\"accept-encoding\":\"gzip, '\n", 107 | " 'deflate\",\"user-agent\":\"python-requests/2.19.1\",\"x-forwarded-port\":\"443\",\"x-forwarded-proto\":\"https\"},\"json\":null,\"url\":\"https://postman-echo.com/post\"}')\n", 108 | "=======================================================================================\n", 109 | "original URL - https://postman-echo.com/post\n", 110 | "GET URL - https://postman-echo.com/post\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "print('response text', pformat(post_resp.text))\n", 116 | "print('='*87)\n", 117 | "print('original URL -', post_url)\n", 118 | "print('GET URL -', post_resp.url)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "query = {\n", 128 | " 'name': 'afun',\n", 129 | " 'msg': 'A Foolish Consistency is the Hobgoblin of Little Minds'\n", 130 | "}\n", 131 | "post_form_data_resp = requests.post(post_url, data=query)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "response text ('{\"args\":{},\"data\":\"\",\"files\":{},\"form\":{\"name\":\"afun\",\"msg\":\"A Foolish '\n", 144 | " 'Consistency is the Hobgoblin of Little '\n", 145 | " 'Minds\"},\"headers\":{\"host\":\"postman-echo.com\",\"content-length\":\"68\",\"accept\":\"*/*\",\"accept-encoding\":\"gzip, '\n", 146 | " 'deflate\",\"content-type\":\"application/x-www-form-urlencoded\",\"user-agent\":\"python-requests/2.19.1\",\"x-forwarded-port\":\"443\",\"x-forwarded-proto\":\"https\"},\"json\":{\"name\":\"afun\",\"msg\":\"A '\n", 147 | " 'Foolish Consistency is the Hobgoblin of Little '\n", 148 | " 'Minds\"},\"url\":\"https://postman-echo.com/post\"}')\n", 149 | "=======================================================================================\n", 150 | "original URL - https://postman-echo.com/post\n", 151 | "GET URL - https://postman-echo.com/post\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "print('response text', pformat(post_form_data_resp.text))\n", 157 | "print('='*87)\n", 158 | "print('original URL -', post_url)\n", 159 | "print('GET URL -', post_form_data_resp.url)" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | 
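
The cells above show the GET/POST contrast by echoing requests back from postman-echo.com. The same point can be seen without sending anything, by preparing both requests locally and inspecting where the parameters end up; a minimal sketch:

import requests

query = {'name': 'afun', 'msg': 'A Foolish Consistency is the Hobgoblin of Little Minds'}

get_req = requests.Request('GET', 'https://postman-echo.com/get', params=query).prepare()
post_req = requests.Request('POST', 'https://postman-echo.com/post', data=query).prepare()

print(get_req.url)    # params are URL-encoded into the query string: .../get?name=afun&msg=...
print(get_req.body)   # None - a plain GET carries no body
print(post_req.url)   # .../post - the URL is unchanged
print(post_req.body)  # name=afun&msg=... - form data travels in the request body
print(post_req.headers['Content-Type'])  # application/x-www-form-urlencoded
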
"file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.6.6" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /01_files_website/00_image_crawling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://www.pexels.com/ 並撰寫爬蟲程式\n", 10 | "- 下載 5 張桌布圖" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import requests\n", 22 | "import re\n", 23 | "import os\n", 24 | "\n", 25 | "from bs4 import BeautifulSoup\n", 26 | "from pprint import pprint\n", 27 | "\n", 28 | "url = 'https://www.pexels.com/'" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "resp = requests.get(url)\n", 40 | "soup = BeautifulSoup(resp.text, 'lxml')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "['https://images.pexels.com/photos/106606/pexels-photo-106606.jpeg?h=350&auto=compress&cs=tinysrgb',\n", 53 | " 'https://images.pexels.com/photos/405041/pexels-photo-405041.jpeg?h=350&auto=compress&cs=tinysrgb',\n", 54 | " 'https://images.pexels.com/photos/102170/pexels-photo-102170.jpeg?h=350&auto=compress&cs=tinysrgb',\n", 55 | " 'https://images.pexels.com/photos/583399/pexels-photo-583399.jpeg?h=350&auto=compress&cs=tinysrgb',\n", 56 | " 'https://images.pexels.com/photos/398533/pexels-photo-398533.jpeg?h=350&auto=compress&cs=tinysrgb']\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "article = soup.find('div', class_='photos').find_all('article', class_='photo-item')\n", 62 | "imgs = [a.find('a').find('img')['src'] for a in article]\n", 63 | "target = imgs[:5]\n", 64 | "\n", 65 | "pprint(target)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "regex catch the name pexels-photo-106606.jpeg\n", 78 | "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-106606.jpeg\n", 79 | "regex catch the name pexels-photo-405041.jpeg\n", 80 | "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-405041.jpeg\n", 81 | "regex catch the name pexels-photo-102170.jpeg\n", 82 | "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-102170.jpeg\n", 83 | "regex catch the name pexels-photo-583399.jpeg\n", 84 | "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-583399.jpeg\n", 85 | "regex catch the name pexels-photo-398533.jpeg\n", 86 | "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-398533.jpeg\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "results = os.path.abspath('../results')\n", 92 | "\n", 93 | "if not os.path.exists(results):\n", 94 | " os.makedirs(results)\n", 95 | "\n", 96 | "for i in target:\n", 97 | " img_resp = requests.get(i, stream=True) \n", 98 | " filename = 
re.match(r\".*(pexels-photo-([0-9]{6})\\.jpeg).*\", i).group(1)\n", 99 | " print('regex catch the name {}'.format(filename))\n", 100 | " \n", 101 | " filename = os.path.join(results, filename)\n", 102 | "\n", 103 | " with open(filename, 'wb') as f:\n", 104 | " for chunk in img_resp.iter_content(2048):\n", 105 | " f.write(chunk)\n", 106 | " print('Save the img at {}'.format(filename))" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.5.2" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 2 131 | } 132 | -------------------------------------------------------------------------------- /01_files_website/01_image_crawling_and_check_format.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html 並撰寫爬蟲程式\n", 10 | "- 下載 5 張圖片\n", 11 | "- 以正確的圖片格式存檔" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import requests\n", 23 | "import os\n", 24 | "\n", 25 | "from PIL import Image\n", 26 | "from bs4 import BeautifulSoup\n", 27 | "from pprint import pprint\n", 28 | "\n", 29 | "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html'" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "resp = requests.get(url)\n", 41 | "soup = BeautifulSoup(resp.text, 'lxml')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "imgs = soup.find_all('img')\n", 51 | "imgs = [i['src'] for i in imgs]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "catch the filename XgXT3Va.png and the real format is JPEG\n", 64 | "catch the real filename XgXT3Va.jpeg\n", 65 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/XgXT3Va.jpeg\n", 66 | "catch the filename Q3bkStv.png and the real format is PNG\n", 67 | "catch the real filename Q3bkStv.png\n", 68 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/Q3bkStv.png\n", 69 | "catch the filename IDPxvSl.jpg and the real format is PNG\n", 70 | "catch the real filename IDPxvSl.png\n", 71 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/IDPxvSl.png\n", 72 | "catch the filename ZEhBDs6.png and the real format is PNG\n", 73 | "catch the real filename ZEhBDs6.png\n", 74 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/ZEhBDs6.png\n", 75 | "catch the filename UKxK6FZ.gif and the real format is PNG\n", 76 | "catch the real filename UKxK6FZ.png\n", 77 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/UKxK6FZ.png\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "results = 
os.path.abspath('../results')\n", 83 | "if not os.path.exists(results):\n", 84 | " os.makedirs(results)\n", 85 | "\n", 86 | "for i in imgs:\n", 87 | " img_resp = requests.get(i, stream=True)\n", 88 | " image = Image.open(img_resp.raw)\n", 89 | " filename = os.path.basename(i)\n", 90 | " print('catch the filename {} and the real format is {}'.format(filename, image.format))\n", 91 | " \n", 92 | " real_filename = '{}.{}'.format(\n", 93 | " filename.split('.')[0],\n", 94 | " image.format.lower()\n", 95 | " )\n", 96 | " save_filename = os.path.join(results, real_filename)\n", 97 | " print('catch the real filename {}'.format(real_filename))\n", 98 | " \n", 99 | " image.save(save_filename)\n", 100 | " print('save image at {}'.format(save_filename))" 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.5.2" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 2 125 | } 126 | -------------------------------------------------------------------------------- /01_files_website/02_file_crawling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 http://exam.lib.ntu.edu.tw/graduate 並撰寫爬蟲程式\n", 10 | "- request 附上 User-Agent 資訊\n", 11 | "- 下載頁面上所有 pdf 考古題檔案" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import requests\n", 23 | "import re\n", 24 | "import os\n", 25 | "\n", 26 | "from PIL import Image\n", 27 | "from bs4 import BeautifulSoup\n", 28 | "from fake_useragent import UserAgent\n", 29 | "from urllib.parse import urljoin\n", 30 | "from pprint import pprint\n", 31 | "\n", 32 | "url = 'http://exam.lib.ntu.edu.tw/graduate'" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "fu = UserAgent()\n", 42 | "headers = {'User-Agent': fu.random}\n", 43 | "resp = requests.get(url, headers=headers)\n", 44 | "soup = BeautifulSoup(resp.text, 'lxml')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "(1/30) catch the filename 106_graduate_4.pdf\n", 57 | "(1/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_4.pdf\n", 58 | "(2/30) catch the filename 106_graduate_6.pdf\n", 59 | "(2/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_6.pdf\n", 60 | "(3/30) catch the filename 106_graduate_3.pdf\n", 61 | "(3/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_3.pdf\n", 62 | "(4/30) catch the filename 106_graduate_1.pdf\n", 63 | "(4/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_1.pdf\n", 64 | "(5/30) catch the filename 106_graduate_2.pdf\n", 65 | "(5/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_2.pdf\n", 66 | "(6/30) catch the filename 106_graduate_8.pdf\n", 67 | "(6/30) 
save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n", 68 | "(7/30) catch the filename 106_graduate_5.pdf\n", 69 | "(7/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_5.pdf\n", 70 | "(8/30) catch the filename 106_graduate_10.pdf\n", 71 | "(8/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_10.pdf\n", 72 | "(9/30) catch the filename 106_graduate_7.pdf\n", 73 | "(9/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_7.pdf\n", 74 | "(10/30) catch the filename 106_graduate_11.pdf\n", 75 | "(10/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_11.pdf\n", 76 | "(11/30) catch the filename 106_graduate_13.pdf\n", 77 | "(11/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_13.pdf\n", 78 | "(12/30) catch the filename 106_graduate_15.pdf\n", 79 | "(12/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_15.pdf\n", 80 | "(13/30) catch the filename 106_graduate_14.pdf\n", 81 | "(13/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_14.pdf\n", 82 | "(14/30) catch the filename 106_graduate_8.pdf\n", 83 | "(14/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n", 84 | "(15/30) catch the filename 106_graduate_5.pdf\n", 85 | "(15/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_5.pdf\n", 86 | "(16/30) catch the filename 106_graduate_16.pdf\n", 87 | "(16/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_16.pdf\n", 88 | "(17/30) catch the filename 106_graduate_17.pdf\n", 89 | "(17/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_17.pdf\n", 90 | "(18/30) catch the filename 106_graduate_18.pdf\n", 91 | "(18/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_18.pdf\n", 92 | "(19/30) catch the filename 106_graduate_19.pdf\n", 93 | "(19/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_19.pdf\n", 94 | "(20/30) catch the filename 106_graduate_17.pdf\n", 95 | "(20/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_17.pdf\n", 96 | "(21/30) catch the filename 106_graduate_20.pdf\n", 97 | "(21/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_20.pdf\n", 98 | "(22/30) catch the filename 106_graduate_22.pdf\n", 99 | "(22/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_22.pdf\n", 100 | "(23/30) catch the filename 106_graduate_21.pdf\n", 101 | "(23/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_21.pdf\n", 102 | "(24/30) catch the filename 106_graduate_8.pdf\n", 103 | "(24/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n", 104 | "(25/30) catch the filename 106_graduate_25.pdf\n", 105 | "(25/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_25.pdf\n", 106 | "(26/30) catch the filename 106_graduate_23.pdf\n", 107 | "(26/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_23.pdf\n", 108 | "(27/30) catch the filename 106_graduate_24.pdf\n", 109 | "(27/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_24.pdf\n", 110 | "(28/30) catch the filename 106_graduate_8.pdf\n", 111 | "(28/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n", 112 | "(29/30) catch the filename 
106_graduate_26.pdf\n", 113 | "(29/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_26.pdf\n", 114 | "(30/30) catch the filename 106_graduate_28.pdf\n", 115 | "(30/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_28.pdf\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "results = os.path.abspath('../results')\n", 121 | "if not os.path.exists(results):\n", 122 | " os.makedirs(results)\n", 123 | "\n", 124 | "pdfs = soup.find_all('img', class_=re.compile('.*field-icon-application-pdf$'))\n", 125 | "for i, pdf in enumerate(pdfs):\n", 126 | " href = pdf.parent['href']\n", 127 | " abs_href = urljoin(resp.url, href)\n", 128 | " file_resp = requests.get(abs_href, headers=headers, stream=True)\n", 129 | " \n", 130 | " filename = os.path.basename(abs_href)\n", 131 | " filename = filename.split('&')[0]\n", 132 | " print('({}/{}) catch the filename {}'.format(i+1, len(pdfs), filename))\n", 133 | " filename = os.path.join(results, filename)\n", 134 | "\n", 135 | " with open(filename, 'wb') as f:\n", 136 | " for chunk in file_resp.iter_content(2048):\n", 137 | " f.write(chunk)\n", 138 | " print('({}/{}) save file {}'.format(i+1, len(pdfs),filename))" 139 | ] 140 | } 141 | ], 142 | "metadata": { 143 | "kernelspec": { 144 | "display_name": "Python 3", 145 | "language": "python", 146 | "name": "python3" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 3 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | "nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython3", 158 | "version": "3.5.2" 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 2 163 | } 164 | -------------------------------------------------------------------------------- /01_files_website/03_website_crawling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html 並撰寫爬蟲程式\n", 10 | "- request 附上 User-Agent 資訊\n", 11 | "- 下載網站上每個網頁的標題" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import requests\n", 23 | "import re\n", 24 | "import os\n", 25 | "\n", 26 | "from PIL import Image\n", 27 | "from bs4 import BeautifulSoup\n", 28 | "from fake_useragent import UserAgent\n", 29 | "from urllib.parse import urljoin\n", 30 | "from pprint import pprint\n", 31 | "\n", 32 | "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html'" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "fu = UserAgent()\n", 44 | "headers = {'User-Agent': fu.random}\n", 45 | "resp = requests.get(url, headers=headers)\n", 46 | "soup = BeautifulSoup(resp.text, 'lxml')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "wait_list = []\n", 56 | "view_list = []\n", 57 | "links = soup.find_all('a')\n", 58 | "links = [link['href'] for link in links]\n", 59 | "links = [urljoin(resp.url, link) for link in links]\n", 60 | "links = list(set(links))\n", 61 | "wait_list += links" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 
4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html\n", 74 | "wait list:\n", 75 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html',\n", 76 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html',\n", 77 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html']\n", 78 | "view list:\n", 79 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html']\n", 80 | "all text:\n", 81 | "['Man must explore, and this is exploration at its greatest']\n", 82 | "=======================================================================================\n", 83 | "https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html\n", 84 | "wait list:\n", 85 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html',\n", 86 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html']\n", 87 | "view list:\n", 88 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n", 89 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html']\n", 90 | "all text:\n", 91 | "['Man must explore, and this is exploration at its greatest', 'About Me']\n", 92 | "=======================================================================================\n", 93 | "https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html\n", 94 | "wait list:\n", 95 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html']\n", 96 | "view list:\n", 97 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n", 98 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html',\n", 99 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html']\n", 100 | "all text:\n", 101 | "['Man must explore, and this is exploration at its greatest',\n", 102 | " 'About Me',\n", 103 | " 'Contact Me']\n", 104 | "=======================================================================================\n", 105 | "https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html\n", 106 | "wait list:\n", 107 | "[]\n", 108 | "view list:\n", 109 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n", 110 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html',\n", 111 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html',\n", 112 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html']\n", 113 | "all text:\n", 114 | "['Man must explore, and this is exploration at its greatest',\n", 115 | " 'About Me',\n", 116 | " 'Contact Me',\n", 117 | " 'Clean Blog']\n", 118 | "=======================================================================================\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "all_h1_text = []\n", 124 | "\n", 125 | "while wait_list:\n", 126 | "\n", 127 | " link = wait_list.pop()\n", 128 | " if link in view_list:\n", 129 | " continue\n", 130 | " \n", 131 | " print(link)\n", 132 | " view_list.append(link)\n", 133 | " \n", 134 | " page_resp = requests.get(link, headers=headers)\n", 135 | " page_soup = BeautifulSoup(page_resp.text, 'lxml')\n", 136 | " \n", 137 | " # get h1 tag on current page\n", 138 | " h1s = page_soup.find_all('h1')\n", 139 | " h1s = [h1.text for h1 in h1s]\n", 140 | " all_h1_text += h1s\n", 141 | " \n", 142 | " # search new links in current page\n", 143 | " links = page_soup.find_all('a')\n", 
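    "    # normalise each href to an absolute URL and queue only pages not already in view_list\n",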
144 | " links = [link['href'] for link in links]\n", 145 | " links = [urljoin(page_resp.url, link) for link in links]\n", 146 | " links = list(filter(lambda x: x not in view_list, links))\n", 147 | " wait_list += links\n", 148 | " wait_list = list(set(wait_list))\n", 149 | " print('wait list:')\n", 150 | " pprint(wait_list)\n", 151 | " print('view list:')\n", 152 | " pprint(view_list)\n", 153 | " print('all text:')\n", 154 | " pprint(all_h1_text)\n", 155 | " print('='*87)" 156 | ] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "Python 3", 162 | "language": "python", 163 | "name": "python3" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.5.2" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 2 180 | } 181 | -------------------------------------------------------------------------------- /01_files_website/04_image_crawling_check_last_modified.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html 並撰寫爬蟲程式\n", 10 | "- 下載 2018/01/29 14:39:10 之後修改過的圖片\n", 11 | "- 以正確的圖片格式存檔" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import requests\n", 23 | "import os\n", 24 | "\n", 25 | "from PIL import Image\n", 26 | "from bs4 import BeautifulSoup\n", 27 | "from datetime import datetime\n", 28 | "from time import ctime\n", 29 | "from pprint import pprint\n", 30 | "\n", 31 | "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html'\n", 32 | "last_modified = datetime(2018, 1, 29, 14, 39, 10)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "resp = requests.get(url)\n", 44 | "soup = BeautifulSoup(resp.text, 'lxml')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "imgs = soup.find_all('img')\n", 56 | "imgs = [i['src'] for i in imgs]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "catch the filename IDPxvSl.jpg and the real format is PNG\n", 69 | "catch the real filename IDPxvSl.png\n", 70 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/IDPxvSl.png\n", 71 | "catch the filename UKxK6FZ.gif and the real format is PNG\n", 72 | "catch the real filename UKxK6FZ.png\n", 73 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/UKxK6FZ.png\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "results = os.path.abspath('../results')\n", 79 | "if not os.path.exists(results):\n", 80 | " os.makedirs(results)\n", 81 | "\n", 82 | "for i in imgs:\n", 83 | " # check header only\n", 84 | " check_resp = requests.head(i)\n", 85 | " check_head = dict(check_resp.headers)\n", 86 | " if 'Last-Modified' in check_head:\n", 87 | " check_modified = 
check_head['Last-Modified']\n", 88 | " check_modified = datetime.strptime(check_modified, '%a, %d %b %Y %H:%M:%S GMT')\n", 89 | " check_not_modified = check_modified < last_modified\n", 90 | " if check_not_modified:\n", 91 | " continue\n", 92 | " \n", 93 | " img_resp = requests.get(i, stream=True)\n", 94 | " image = Image.open(img_resp.raw)\n", 95 | " filename = os.path.basename(i)\n", 96 | " print('catch the filename {} and the real format is {}'.format(filename, image.format))\n", 97 | " \n", 98 | " real_filename = '{}.{}'.format(\n", 99 | " filename.split('.')[0],\n", 100 | " image.format.lower()\n", 101 | " )\n", 102 | " save_filename = os.path.join(results, real_filename)\n", 103 | " print('catch the real filename {}'.format(real_filename))\n", 104 | " \n", 105 | " image.save(save_filename)\n", 106 | " print('save image at {}'.format(save_filename))" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.5.2" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 2 131 | } 132 | -------------------------------------------------------------------------------- /01_files_website/05_website_crawling_valid_URL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 http://aiacademy.tw/ 並撰寫爬蟲程式\n", 10 | "- 紀錄所有 URL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import requests\n", 22 | "import re\n", 23 | "import os\n", 24 | "\n", 25 | "from PIL import Image\n", 26 | "from bs4 import BeautifulSoup\n", 27 | "from fake_useragent import UserAgent\n", 28 | "from urllib.parse import urljoin\n", 29 | "from urllib.parse import urlparse\n", 30 | "from tldextract import extract\n", 31 | "from pprint import pprint\n", 32 | "\n", 33 | "url = 'http://aiacademy.tw/'" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "fu = UserAgent()\n", 45 | "headers = {'User-Agent': fu.random}\n", 46 | "resp = requests.get(url, headers=headers)\n", 47 | "soup = BeautifulSoup(resp.text, 'lxml')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "def invalid_href(url):\n", 59 | " check_anchor = re.match('.*#.*', url)\n", 60 | " check_protocol = re.match('[^https|http].*', urlparse(url).scheme)\n", 61 | " check_js = re.match('javascript.*', url)\n", 62 | " return any([check_anchor, check_protocol, check_js])\n", 63 | "\n", 64 | "def inner_href(url, domain):\n", 65 | " return extract(url).domain == domain" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "http://aiacademy.tw/\n", 78 | "http://aiacademy.tw/admission-mgr2/\n", 79 | "http://aiacademy.tw/opening/\n", 80 | 
"http://aiacademy.tw/category/general/\n", 81 | "http://aiacademy.tw/registration-tech1/\n", 82 | "http://aiacademy.tw/registration/\n", 83 | "http://aiacademy.tw/announcement-180129/\n", 84 | "http://aiacademy.tw/registration-mgr1/\n", 85 | "http://aiacademy.tw/admission-tech2/\n", 86 | "http://aiacademy.tw/curriculum-tech2/\n", 87 | "http://aiacademy.tw/registration-tech2/\n", 88 | "http://aiacademy.tw/wp-content/uploads/2018/01/registration-tech2.pdf\n", 89 | "http://aiacademy.tw/parking/\n", 90 | "http://aiacademy.tw/category/class-tech/\n", 91 | "http://aiacademy.tw/category/news/\n", 92 | "http://aiacademy.tw/category/news/page/3/\n", 93 | "http://aiacademy.tw/refund/\n", 94 | "http://aiacademy.tw/class-tech/\n", 95 | "http://aiacademy.tw/mgr-class-overview/\n", 96 | "http://aiacademy.tw/tech-leader-lecturers/\n", 97 | "http://aiacademy.tw/lecturer/swc/\n", 98 | "http://aiacademy.tw/lecturer/tjw/\n", 99 | "http://aiacademy.tw/lecturer/whm/\n", 100 | "http://aiacademy.tw/general\n", 101 | "http://aiacademy.tw/curriculum/\n", 102 | "http://aiacademy.tw/mgr-registration/\n", 103 | "http://aiacademy.tw/curriculum-tech1/\n", 104 | "http://aiacademy.tw/registration-mgr2/\n", 105 | "http://aiacademy.tw/category/curriculum/\n", 106 | "http://aiacademy.tw/organization/\n", 107 | "http://aiacademy.tw/wp-content/uploads/2018/01/孔校長致詞.pdf\n", 108 | "http://aiacademy.tw/mgr-lecturers/\n", 109 | "http://aiacademy.tw/lecturer/hjh/\n", 110 | "http://aiacademy.tw/mgr-admission-rule/\n", 111 | "http://aiacademy.tw/ta/\n", 112 | "http://aiacademy.tw/press-180127/\n", 113 | "http://aiacademy.tw/category/%e9%87%8d%e5%a4%a7%e6%b4%bb%e5%8b%95/\n", 114 | "http://aiacademy.tw/admission-mgr2\n", 115 | "http://aiacademy.tw/category/faq/\n", 116 | "http://aiacademy.tw/class-tech\n", 117 | "http://aiacademy.tw/lecturer/huk/\n", 118 | "http://aiacademy.tw/leave-of-absence-rule/\n", 119 | "http://aiacademy.tw/about\n", 120 | "http://aiacademy.tw/certificate-rule\n", 121 | "http://aiacademy.tw/lecturer/albert/\n", 122 | "http://aiacademy.tw/wp-content/uploads/2018/01/registration-mgr2.pdf\n", 123 | "http://aiacademy.tw/vision/\n", 124 | "http://aiacademy.tw/exam_notes/\n", 125 | "http://aiacademy.tw/category/news\n", 126 | "http://aiacademy.tw/press-180122/\n", 127 | "http://aiacademy.tw/history/\n", 128 | "http://aiacademy.tw/presence-rule-tech/\n", 129 | "http://aiacademy.tw/support-us/\n", 130 | "http://aiacademy.tw/admission-tech2\n", 131 | "http://aiacademy.tw/category/faq\n", 132 | "http://aiacademy.tw/lecturer/hungyilee/\n", 133 | "http://aiacademy.tw/%e5%ad%b8%e5%93%a1%e7%b7%a8%e8%99%9f%e5%8f%8a%e7%ac%ac%e4%b8%80%e6%9c%9f%e9%96%8b%e5%ad%b8%e5%85%b8%e7%a6%ae%e8%a1%8c%e5%89%8d%e9%80%9a%e7%9f%a5%e5%b7%b2%e5%af%84%e5%87%ba%e8%ab%8b%e6%b3%a8%e6%84%8f/\n", 134 | "http://aiacademy.tw/wp-content/uploads/2018/01/台灣人工智慧學校創校緣起及招生狀況1.pdf\n", 135 | "http://aiacademy.tw/class-1st-overview/\n", 136 | "http://aiacademy.tw/opening-presentation/\n", 137 | "http://aiacademy.tw/certificate-rule/\n", 138 | "http://aiacademy.tw/lecturer/ycw/\n", 139 | "http://aiacademy.tw/wp-content/uploads/2018/01/AI-vs-Startups.pdf\n", 140 | "http://aiacademy.tw/lecturer/ilt/\n", 141 | "http://aiacademy.tw/lecturer/lin/\n", 142 | "http://aiacademy.tw/class-overview/\n", 143 | "http://aiacademy.tw/calendar/\n", 144 | "http://aiacademy.tw/registration-mgr/\n", 145 | "http://aiacademy.tw/mgr-class-enrollment-notice-0110/\n", 146 | "http://aiacademy.tw/lecturer/swh/\n", 147 | "http://aiacademy.tw/admission-tech1/\n", 148 | 
"http://aiacademy.tw/enrollment_1st_term/\n", 149 | "http://aiacademy.tw/mgr-class-1st-overview\n", 150 | "http://aiacademy.tw/faq/\n", 151 | "http://aiacademy.tw/mgr-curriculum/\n", 152 | "http://aiacademy.tw/lecturer/sunmin/\n", 153 | "http://aiacademy.tw/class-1st-overview\n", 154 | "http://aiacademy.tw/lecturer/cph/\n", 155 | "http://aiacademy.tw/rent/\n", 156 | "http://aiacademy.tw/curriculum-mgr2/\n", 157 | "http://aiacademy.tw/lecturer/ysc/\n", 158 | "http://aiacademy.tw/lecturer/shw/\n", 159 | "http://aiacademy.tw/calendar-am071/\n", 160 | "http://aiacademy.tw/opening\n", 161 | "http://aiacademy.tw/admission-mgr1/\n", 162 | "http://aiacademy.tw/lecturers\n", 163 | "http://aiacademy.tw/category/class-mgr/\n", 164 | "http://aiacademy.tw/mgr-class-1st-overview/\n", 165 | "http://aiacademy.tw/wp-content/uploads/2017/12/台灣人工智慧學校經理人周末研修班第一期-報名表格.pdf\n", 166 | "http://aiacademy.tw/class-mgr/\n", 167 | "http://aiacademy.tw/curriculum-mgr/\n", 168 | "http://aiacademy.tw/admission-rule/\n", 169 | "http://aiacademy.tw/category/misc/\n", 170 | "http://aiacademy.tw/policy-tech/\n", 171 | "http://aiacademy.tw/class-mgr\n", 172 | "http://aiacademy.tw/absence-rule-manager/\n", 173 | "http://aiacademy.tw/mgr-class-overview\n", 174 | "http://aiacademy.tw/category/admission/\n", 175 | "http://aiacademy.tw/class-2018jan-written-examination-list/\n", 176 | "http://aiacademy.tw/presence-rule-mgr/\n", 177 | "http://aiacademy.tw/curriculum-mgr1/\n", 178 | "http://aiacademy.tw/lecturer/iac/\n", 179 | "http://aiacademy.tw/category/lecturers/\n", 180 | "http://aiacademy.tw/lecturer/weichao-chen/\n", 181 | "http://aiacademy.tw/wp-content/uploads/2017/10/aiacademy.tw-registration.pdf\n", 182 | "http://aiacademy.tw/job-fair/\n", 183 | "http://aiacademy.tw/corporate-partner\n", 184 | "http://aiacademy.tw/category/news/page/2/\n", 185 | "http://aiacademy.tw/class-enrollment-notice-1225/\n", 186 | "http://aiacademy.tw/aia-examination-notice-20171216/\n", 187 | "http://aiacademy.tw/about/\n", 188 | "http://aiacademy.tw/corporate-partner/\n", 189 | "view list:\n", 190 | "['http://aiacademy.tw/',\n", 191 | " 'http://aiacademy.tw/admission-mgr2/',\n", 192 | " 'http://aiacademy.tw/opening/',\n", 193 | " 'http://aiacademy.tw/category/general/',\n", 194 | " 'http://aiacademy.tw/registration-tech1/',\n", 195 | " 'http://aiacademy.tw/registration/',\n", 196 | " 'http://aiacademy.tw/announcement-180129/',\n", 197 | " 'http://aiacademy.tw/registration-mgr1/',\n", 198 | " 'http://aiacademy.tw/admission-tech2/',\n", 199 | " 'http://aiacademy.tw/curriculum-tech2/',\n", 200 | " 'http://aiacademy.tw/registration-tech2/',\n", 201 | " 'http://aiacademy.tw/wp-content/uploads/2018/01/registration-tech2.pdf',\n", 202 | " 'http://aiacademy.tw/parking/',\n", 203 | " 'http://aiacademy.tw/category/class-tech/',\n", 204 | " 'http://aiacademy.tw/category/news/',\n", 205 | " 'http://aiacademy.tw/category/news/page/3/',\n", 206 | " 'http://aiacademy.tw/refund/',\n", 207 | " 'http://aiacademy.tw/class-tech/',\n", 208 | " 'http://aiacademy.tw/mgr-class-overview/',\n", 209 | " 'http://aiacademy.tw/tech-leader-lecturers/',\n", 210 | " 'http://aiacademy.tw/lecturer/swc/',\n", 211 | " 'http://aiacademy.tw/lecturer/tjw/',\n", 212 | " 'http://aiacademy.tw/lecturer/whm/',\n", 213 | " 'http://aiacademy.tw/general',\n", 214 | " 'http://aiacademy.tw/curriculum/',\n", 215 | " 'http://aiacademy.tw/mgr-registration/',\n", 216 | " 'http://aiacademy.tw/curriculum-tech1/',\n", 217 | " 'http://aiacademy.tw/registration-mgr2/',\n", 218 | " 
'http://aiacademy.tw/category/curriculum/',\n", 219 | " 'http://aiacademy.tw/organization/',\n", 220 | " 'http://aiacademy.tw/wp-content/uploads/2018/01/孔校長致詞.pdf',\n", 221 | " 'http://aiacademy.tw/mgr-lecturers/',\n", 222 | " 'http://aiacademy.tw/lecturer/hjh/',\n", 223 | " 'http://aiacademy.tw/mgr-admission-rule/',\n", 224 | " 'http://aiacademy.tw/ta/',\n", 225 | " 'http://aiacademy.tw/press-180127/',\n", 226 | " 'http://aiacademy.tw/category/%e9%87%8d%e5%a4%a7%e6%b4%bb%e5%8b%95/',\n", 227 | " 'http://aiacademy.tw/admission-mgr2',\n", 228 | " 'http://aiacademy.tw/category/faq/',\n", 229 | " 'http://aiacademy.tw/class-tech',\n", 230 | " 'http://aiacademy.tw/lecturer/huk/',\n", 231 | " 'http://aiacademy.tw/leave-of-absence-rule/',\n", 232 | " 'http://aiacademy.tw/about',\n", 233 | " 'http://aiacademy.tw/certificate-rule',\n", 234 | " 'http://aiacademy.tw/lecturer/albert/',\n", 235 | " 'http://aiacademy.tw/wp-content/uploads/2018/01/registration-mgr2.pdf',\n", 236 | " 'http://aiacademy.tw/vision/',\n", 237 | " 'http://aiacademy.tw/exam_notes/',\n", 238 | " 'http://aiacademy.tw/category/news',\n", 239 | " 'http://aiacademy.tw/press-180122/',\n", 240 | " 'http://aiacademy.tw/history/',\n", 241 | " 'http://aiacademy.tw/presence-rule-tech/',\n", 242 | " 'http://aiacademy.tw/support-us/',\n", 243 | " 'http://aiacademy.tw/admission-tech2',\n", 244 | " 'http://aiacademy.tw/category/faq',\n", 245 | " 'http://aiacademy.tw/lecturer/hungyilee/',\n", 246 | " 'http://aiacademy.tw/%e5%ad%b8%e5%93%a1%e7%b7%a8%e8%99%9f%e5%8f%8a%e7%ac%ac%e4%b8%80%e6%9c%9f%e9%96%8b%e5%ad%b8%e5%85%b8%e7%a6%ae%e8%a1%8c%e5%89%8d%e9%80%9a%e7%9f%a5%e5%b7%b2%e5%af%84%e5%87%ba%e8%ab%8b%e6%b3%a8%e6%84%8f/',\n", 247 | " 'http://aiacademy.tw/wp-content/uploads/2018/01/台灣人工智慧學校創校緣起及招生狀況1.pdf',\n", 248 | " 'http://aiacademy.tw/class-1st-overview/',\n", 249 | " 'http://aiacademy.tw/opening-presentation/',\n", 250 | " 'http://aiacademy.tw/certificate-rule/',\n", 251 | " 'http://aiacademy.tw/lecturer/ycw/',\n", 252 | " 'http://aiacademy.tw/wp-content/uploads/2018/01/AI-vs-Startups.pdf',\n", 253 | " 'http://aiacademy.tw/lecturer/ilt/',\n", 254 | " 'http://aiacademy.tw/lecturer/lin/',\n", 255 | " 'http://aiacademy.tw/class-overview/',\n", 256 | " 'http://aiacademy.tw/calendar/',\n", 257 | " 'http://aiacademy.tw/registration-mgr/',\n", 258 | " 'http://aiacademy.tw/mgr-class-enrollment-notice-0110/',\n", 259 | " 'http://aiacademy.tw/lecturer/swh/',\n", 260 | " 'http://aiacademy.tw/admission-tech1/',\n", 261 | " 'http://aiacademy.tw/enrollment_1st_term/',\n", 262 | " 'http://aiacademy.tw/mgr-class-1st-overview',\n", 263 | " 'http://aiacademy.tw/faq/',\n", 264 | " 'http://aiacademy.tw/mgr-curriculum/',\n", 265 | " 'http://aiacademy.tw/lecturer/sunmin/',\n", 266 | " 'http://aiacademy.tw/class-1st-overview',\n", 267 | " 'http://aiacademy.tw/lecturer/cph/',\n", 268 | " 'http://aiacademy.tw/rent/',\n", 269 | " 'http://aiacademy.tw/curriculum-mgr2/',\n", 270 | " 'http://aiacademy.tw/lecturer/ysc/',\n", 271 | " 'http://aiacademy.tw/lecturer/shw/',\n", 272 | " 'http://aiacademy.tw/calendar-am071/',\n", 273 | " 'http://aiacademy.tw/opening',\n", 274 | " 'http://aiacademy.tw/admission-mgr1/',\n", 275 | " 'http://aiacademy.tw/lecturers',\n", 276 | " 'http://aiacademy.tw/category/class-mgr/',\n", 277 | " 'http://aiacademy.tw/mgr-class-1st-overview/',\n", 278 | " 'http://aiacademy.tw/wp-content/uploads/2017/12/台灣人工智慧學校經理人周末研修班第一期-報名表格.pdf',\n", 279 | " 'http://aiacademy.tw/class-mgr/',\n", 280 | " 'http://aiacademy.tw/curriculum-mgr/',\n", 281 | " 
'http://aiacademy.tw/admission-rule/',\n", 282 | " 'http://aiacademy.tw/category/misc/',\n", 283 | " 'http://aiacademy.tw/policy-tech/',\n", 284 | " 'http://aiacademy.tw/class-mgr',\n", 285 | " 'http://aiacademy.tw/absence-rule-manager/',\n", 286 | " 'http://aiacademy.tw/mgr-class-overview',\n", 287 | " 'http://aiacademy.tw/category/admission/',\n", 288 | " 'http://aiacademy.tw/class-2018jan-written-examination-list/',\n", 289 | " 'http://aiacademy.tw/presence-rule-mgr/',\n", 290 | " 'http://aiacademy.tw/curriculum-mgr1/',\n", 291 | " 'http://aiacademy.tw/lecturer/iac/',\n", 292 | " 'http://aiacademy.tw/category/lecturers/',\n", 293 | " 'http://aiacademy.tw/lecturer/weichao-chen/',\n", 294 | " 'http://aiacademy.tw/wp-content/uploads/2017/10/aiacademy.tw-registration.pdf',\n", 295 | " 'http://aiacademy.tw/job-fair/',\n", 296 | " 'http://aiacademy.tw/corporate-partner',\n", 297 | " 'http://aiacademy.tw/category/news/page/2/',\n", 298 | " 'http://aiacademy.tw/class-enrollment-notice-1225/',\n", 299 | " 'http://aiacademy.tw/aia-examination-notice-20171216/',\n", 300 | " 'http://aiacademy.tw/about/',\n", 301 | " 'http://aiacademy.tw/corporate-partner/']\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "wait_list = [url]\n", 307 | "view_list = []\n", 308 | "domain = extract(url).domain\n", 309 | "\n", 310 | "while wait_list:\n", 311 | "\n", 312 | " link = wait_list.pop()\n", 313 | " if link in view_list:\n", 314 | " continue\n", 315 | " \n", 316 | " if invalid_href(link):\n", 317 | " continue\n", 318 | " \n", 319 | " if not inner_href(link, domain):\n", 320 | " continue\n", 321 | " \n", 322 | " print(link)\n", 323 | " view_list.append(link)\n", 324 | " \n", 325 | " page_resp = requests.get(link, headers=headers)\n", 326 | " page_soup = BeautifulSoup(page_resp.text, 'lxml')\n", 327 | "\n", 328 | " # search new links in current page\n", 329 | " links = page_soup.find_all('a')\n", 330 | " links = [link['href'] for link in links if link.has_attr('href')]\n", 331 | " links = [urljoin(page_resp.url, link) for link in links]\n", 332 | " links = list(filter(lambda x: x not in view_list, links))\n", 333 | " wait_list += links\n", 334 | " wait_list = list(set(wait_list))\n", 335 | "\n", 336 | "print('view list:')\n", 337 | "pprint(view_list)" 338 | ] 339 | } 340 | ], 341 | "metadata": { 342 | "kernelspec": { 343 | "display_name": "Python 3", 344 | "language": "python", 345 | "name": "python3" 346 | }, 347 | "language_info": { 348 | "codemirror_mode": { 349 | "name": "ipython", 350 | "version": 3 351 | }, 352 | "file_extension": ".py", 353 | "mimetype": "text/x-python", 354 | "name": "python", 355 | "nbconvert_exporter": "python", 356 | "pygments_lexer": "ipython3", 357 | "version": "3.5.2" 358 | } 359 | }, 360 | "nbformat": 4, 361 | "nbformat_minor": 2 362 | } 363 | -------------------------------------------------------------------------------- /02_selenium/00_selenium_crawling_render_image.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://afuntw.github.io/Test-Crawling-Website/pages/gallery/index.html 並撰寫爬蟲程式\n", 10 | "- 判斷是否為 JavaScript rendered website\n", 11 | "- 下載網頁影片\n", 12 | "- 設定 Implicit Wait\n", 13 | "- 透過 XPath 定位圖片\n", 14 | "\n", 15 | "**透過靜態網站爬蟲會看到的圖片是**\n", 16 | "\n", 17 | "\n", 18 | "\n", 19 | "**透過動態網站爬蟲會看到的圖片是**\n", 20 | "\n", 21 | "" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | 
"metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import os\n", 33 | "import requests\n", 34 | "import re\n", 35 | "\n", 36 | "from bs4 import BeautifulSoup\n", 37 | "from selenium import webdriver\n", 38 | "from selenium.webdriver.common.by import By\n", 39 | "from fake_useragent import UserAgent\n", 40 | "from pprint import pprint\n", 41 | "\n", 42 | "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/gallery/index.html'\n", 43 | "fu = UserAgent()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# 使用 requests 做靜態爬蟲" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "['https://i.imgur.com/0s6Iiu3.png']\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "resp = requests.get(url)\n", 68 | "soup = BeautifulSoup(resp.text, 'lxml')\n", 69 | "imgs = soup.find_all('img', class_=re.compile('.*img-change'))\n", 70 | "imgs = [i['src'] for i in imgs]\n", 71 | "imgs = list(set(imgs))\n", 72 | "pprint(imgs)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "# 使用 Selenium 做動態爬蟲" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "['https://i.imgur.com/db3tGBG.png']\n", 92 | "catch - db3tGBG.png\n", 93 | "save - /home/dirl/github/Python-Crawling-Tutorial/results/db3tGBG.png\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "driver = webdriver.Chrome()\n", 99 | "results = os.path.abspath('../results')\n", 100 | "if not os.path.exists(results):\n", 101 | " os.makedirs(results)\n", 102 | "\n", 103 | "try:\n", 104 | " # webdriver setting\n", 105 | " driver.get(url)\n", 106 | " driver.maximize_window()\n", 107 | " driver.implicitly_wait(10)\n", 108 | " \n", 109 | " # xpath\n", 110 | " imgs = driver.find_elements(By.XPATH, '/html/body/div/div/div/a/img')\n", 111 | " imgs = [i.get_attribute('src') for i in imgs]\n", 112 | " imgs = list(set(imgs))\n", 113 | " print(imgs)\n", 114 | " \n", 115 | " # download\n", 116 | " for img in imgs:\n", 117 | " headers = {'User-Agent': fu.random}\n", 118 | " img_resp = requests.get(img, stream=True, headers=headers)\n", 119 | " \n", 120 | " filename = os.path.basename(img)\n", 121 | " print('catch - {}'.format(filename))\n", 122 | " filename = os.path.join(results, filename)\n", 123 | " \n", 124 | " with open(filename, 'wb') as f:\n", 125 | " for chunk in img_resp.iter_content(2048):\n", 126 | " f.write(chunk)\n", 127 | " print('save - {}'.format(filename))\n", 128 | " \n", 129 | "except Exception as e:\n", 130 | " print(e)\n", 131 | "finally:\n", 132 | " driver.quit()" 133 | ] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "Python 3", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.5.2" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 2 157 | } 158 | -------------------------------------------------------------------------------- /02_selenium/01_pchome_crawling_item.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 http://24h.pchome.com.tw/region/DHBE 並撰寫爬蟲程式\n", 10 | "- 判斷是否為 JavaScript rendered website\n", 11 | "- 設定 Implicit Wait\n", 12 | "- 透過 XPath 定位\n", 13 | "- 抓取商品的名稱與價格" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import os\n", 25 | "\n", 26 | "from selenium import webdriver\n", 27 | "from selenium.webdriver.common.by import By\n", 28 | "\n", 29 | "url = 'http://24h.pchome.com.tw/region/DHBE'" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "ASUS B9440UA-0251A7200U (i5-7200/8G/256G SSD/W10P) - 32900\n", 42 | "(商)Lenovo ThinkPad T470 20HDA00STW(i5-7200U/940MX-2G/1TB/W10P) - 39900\n", 43 | "(商)HP X360 1030 G2(i7-7600U/8G*2/512GB SSD/FHD/W10Pro) - 68900\n", 44 | "(商)HP 240 G6 (i5-7200U/14/4G/500GB/Win10) - 18900\n", 45 | "DELL M7520(i7-7820HQ/Nvidia Quadro M2200M-4G/1TB+256GB/W10P/FHD)繪圖工作站筆電 - 89990\n", 46 | "ASUS P2430UJ-0321A6200U (i5-6200U/4G/500G/920M-2G/Win10) - 22900\n", 47 | "(商)Lenovo ThinkPad X1c 20HRA010TW(i7-7500U/256G SSD/W10P/FHD) - 59900\n", 48 | "(商) HP Probook 430 G4(i5-7200U/4G DDR4/500GB/Win10) - 25900\n", 49 | "(商)HP 240 G6 (i5-7200U/14/4G/500GB/Win10pro) - 23900\n", 50 | "DELL M5520(i7-7820HQ/M1200M/512GB SSD/Win10 Pro/UHD)繪圖工作站筆電 - 98990\n", 51 | "ASUS P2530UJ-0461A6200U (i5-6200U/8G/1TB/GeForce 920M-2G/W10P) - 23900\n", 52 | "(商)Lenovo ThinkPad T470s 20HFA00ETW(i7-7600U/512G SSD/W10P) - 63900\n", 53 | "(商)HP Probook 650 G3(i7-7600U/512GB SSD/AMD Radeon R7 M465 2GB/W10P) - 50900\n", 54 | "(商)HP 240 G6 (i3-6006U/14/UMA/500G/W10DW7) - 17900\n", 55 | "ASUS B9440UA-0251A7200U (i5-7200/8G/256G SSD/W10P) - 27900\n", 56 | "ASUS P2530UJ-0271A6500U (i7-6500U/8G/1TB/920M-2G/FHD/Win10P) - 32900\n", 57 | "(商)Lenovo ThinkPad Edge15 E570 20H5A037TW(i7-7500U/GTX 950M-2G/1TB/W10/FHD) - 34900\n", 58 | "ACER TravelMate TMP238-M-77JQ (i7-6500U/8GB/256GB SSD/W7P+W10P) - 33800\n", 59 | "ACER TravelMate TMX349-G2-M-53L8.(i5-7200U/8GB/256GB SSD/W10P) - 30500\n", 60 | "DELL M7520(i7-7820HQ/Nvidia Quadro M1200M-4G/1TB/W10P/FHD)繪圖工作站筆電 - 76900\n", 61 | "ASUS A550V-0203J6700HQ (i7-6700HQ/500G/GTX950M 2G獨顯/W10P) - 28900\n", 62 | "(商)Lenovo ThinkPad X260 20F6A07QTW(i5-6200U/1TB/W10P) - 38900\n", 63 | "ACER TravelMate TMP249-M-C1DV.(CM3855U/4GB DDR4/500GB/W10P) - 14990\n", 64 | "ACER TravelMate TMP249-M-3142.(i3-6100U/4GB/500GB/W7P+W10P) - 17990\n", 65 | "DELL M7510(i7-6820HQ/Nvidia Quadro M1000M-2G/1TB/W7P/FHD)繪圖工作站筆電 - 69900\n", 66 | "ASUS B8230UA-0061A6500U (i7-6500U/512G SSD/W7P) - 44900\n", 67 | "(商)Lenovo ThinkPad X270 20HNA00RTW (i5-7200U/1TB/W10P) - 41900\n", 68 | "ACER TravelMate TMP446-M-54S0.(i5-5200U/4GB/500G/W7P+W10P) - 19900\n", 69 | "ACER TravelMate TMP259-M-5726(i5-6200U/4GB/128GB SSD/W7P+W10P) - 26900\n", 70 | "DELL Vostro 14 5000 (i5-7200U/4G/1TB/940MX-2G/W10/HD/Jingle Gold) - 23999\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "try:\n", 76 | " driver = webdriver.Chrome()\n", 77 | " driver.get(url)\n", 78 | " driver.maximize_window()\n", 79 | " driver.implicitly_wait(10)\n", 80 | " \n", 81 | " # get web elements\n", 82 | " items = driver.find_elements(By.XPATH, '//dl[@id=\"Block12Container50\"]/dd')\n", 83 | " \n", 
84 | " for item in items:\n", 85 | " title = item.find_element(By.XPATH, './div/h5/a').text\n", 86 | " price = item.find_element(By.XPATH, './div/ul/li/span/span').text\n", 87 | " if title and price:\n", 88 | " print('{} - {}'.format(title, price))\n", 89 | "\n", 90 | "except Exception as e:\n", 91 | " print(e)\n", 92 | "finally:\n", 93 | " driver.quit()" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.5.2" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /02_selenium/02_selenium_google_search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 模擬 google search 流程\n", 10 | "- https://www.google.com.tw/\n", 11 | "- 搜尋「人工智慧」\n", 12 | "- 紀錄前兩頁搜尋結果的連結" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import os\n", 22 | "\n", 23 | "from selenium import webdriver\n", 24 | "from selenium.webdriver.common.keys import Keys\n", 25 | "from selenium.webdriver.common.by import By\n", 26 | "from pprint import pprint\n", 27 | "\n", 28 | "url = 'https://google.com'" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "======================================================================================= Page 0\n", 41 | "title: 人工智能- 维基百科,自由的百科全书\n", 42 | "url: https://zh.wikipedia.org/zh-tw/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD\n", 43 | "---\n", 44 | "title: 人工智慧三大關鍵技術 - 數位時代\n", 45 | "url: https://www.bnext.com.tw/article/41534/3-key-techniques-of-ai\n", 46 | "---\n", 47 | "title: AI 人工智慧| TechNews 科技新報\n", 48 | "url: https://technews.tw/category/cutting-edge/ai/\n", 49 | "---\n", 50 | "title: 人工智慧不可能超越人類,原因居然是這樣的……(上) |智慧機器人網 ...\n", 51 | "url: https://www.limitlessiq.com/news/post/view/id/3596/\n", 52 | "---\n", 53 | "title: 人工智慧AI – CASE報科學 - 國立臺灣大學科學教育發展中心\n", 54 | "url: https://case.ntu.edu.tw/blog/?cat=3772\n", 55 | "---\n", 56 | "title: AI人工智慧來了! 
你的未來在哪裡?|深度專題|天下雜誌\n", 57 | "url: https://www.cw.com.tw/special/2073\n", 58 | "---\n", 59 | "title: 台灣人工智慧學校| Taiwan AI Academy\n", 60 | "url: http://aiacademy.tw/\n", 61 | "---\n", 62 | "title: 人工智慧- MBA智库百科\n", 63 | "url: https://wiki.mbalib.com/zh-tw/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD\n", 64 | "---\n", 65 | "======================================================================================= Page 1\n", 66 | "title: 什麼是人工智慧?︱《三分鐘財經教室》#01 - YouTube\n", 67 | "url: https://www.youtube.com/watch?v=nKcsu4JierI\n", 68 | "---\n", 69 | "title: 『AI人工智慧!機器學習& 突如其來的危機』芬特克FinTech EP3 - YouTube\n", 70 | "url: https://www.youtube.com/watch?v=i0UxYDqlX6o\n", 71 | "---\n", 72 | "title: 人工智慧:搜尋方法與邏輯推論(Artificial Intelligence - Search & Logic ...\n", 73 | "url: https://www.coursera.org/learn/rengong-zhineng\n", 74 | "---\n", 75 | "title: 博客來-中文書>電腦資訊>概論/科技趨勢>人工智慧/機器學習\n", 76 | "url: https://www.books.com.tw/web/sys_bbotm/books/190102\n", 77 | "---\n", 78 | "title: 人工智能| 大紀元\n", 79 | "url: http://www.epochtimes.com/b5/tag/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD.html\n", 80 | "---\n", 81 | "title: 如何對面AI時代的孩子?人工智慧博士教年輕父母強化個人實力- Yahoo ...\n", 82 | "url: https://tw.news.yahoo.com/%E5%A6%82%E4%BD%95%E5%B0%8D%E9%9D%A2ai%E6%99%82%E4%BB%A3%E7%9A%84%E5%AD%A9%E5%AD%90-%E4%BA%BA%E5%B7%A5%E6%99%BA%E6%85%A7%E5%8D%9A%E5%A3%AB%E6%95%99%E5%B9%B4%E8%BC%95%E7%88%B6%E6%AF%8D%E5%BC%B7%E5%8C%96%E5%80%8B%E4%BA%BA%E5%AF%A6%E5%8A%9B-010012301.html\n", 83 | "---\n", 84 | "title: 人工智慧對勞動就業的影響- STPI Research Portal - 科技政策觀點\n", 85 | "url: https://portal.stpi.narl.org.tw/index/article/10401\n", 86 | "---\n", 87 | "title: AI人工智慧時代來臨- 中時電子報\n", 88 | "url: https://www.chinatimes.com/newspapers/20180907000541-260204\n", 89 | "---\n", 90 | "title: 人工智慧應用新趨勢與展望—學生與機器人共同學習-臺北產經資訊網\n", 91 | "url: https://www.taipeiecon.taipei/article_cont.aspx?MmmID=1201&MSid=1001302007727155764\n", 92 | "---\n", 93 | "title: 人工智慧技術的下一波研發核心 - Digitimes\n", 94 | "url: https://www.digitimes.com.tw/col/article.asp?id=944\n", 95 | "---\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "try:\n", 101 | " driver = webdriver.Chrome('/home/afun/Downloads/chromedriver')\n", 102 | " driver.get(url)\n", 103 | " driver.maximize_window()\n", 104 | " driver.implicitly_wait(10)\n", 105 | " \n", 106 | " search_input = driver.find_element(By.ID, 'lst-ib')\n", 107 | " search_input.send_keys(u'人工智慧')\n", 108 | " search_input.send_keys(Keys.ENTER)\n", 109 | " \n", 110 | " for i in range(2):\n", 111 | " print('='*87, 'Page {}'.format(i))\n", 112 | "\n", 113 | " links = driver.find_elements(By.XPATH, '//div[@class=\"r\"]/a[@href]')\n", 114 | "\n", 115 | " for link in links:\n", 116 | " page_title = link.find_element(By.TAG_NAME, 'h3').text\n", 117 | " page_url = ''\n", 118 | "\n", 119 | " if link.get_attribute('href'):\n", 120 | " page_url = link.get_attribute('href')\n", 121 | "\n", 122 | " print('title: {}\\nurl: {}\\n---'.format(page_title, page_url))\n", 123 | "\n", 124 | " next_page = driver.find_element(By.XPATH, '//*[@id=\"pnnext\"]/span[2]').click()\n", 125 | "\n", 126 | "except Exception as e:\n", 127 | " print(e)\n", 128 | "finally:\n", 129 | " driver.quit()" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": 
"ipython3", 149 | "version": "3.6.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 2 154 | } 155 | -------------------------------------------------------------------------------- /02_selenium/03_crawling_reCAPTCHA_image.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- https://www.google.com/recaptcha/demo/recaptcha\n", 10 | "- 透過 google reCAPTCHA demo 生成圖片\n", 11 | "- 將 reCAPTCHA 的圖片抓下來" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import os\n", 23 | "import hashlib\n", 24 | "import requests\n", 25 | "import time\n", 26 | "\n", 27 | "from selenium import webdriver\n", 28 | "from selenium.webdriver.common.keys import Keys\n", 29 | "from selenium.webdriver.common.by import By\n", 30 | "\n", 31 | "from fake_useragent import UserAgent\n", 32 | "from PIL import Image\n", 33 | "\n", 34 | "url = 'https://www.google.com/recaptcha/demo/recaptcha'\n", 35 | "fu = UserAgent()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Save img - /home/dirl/github/Python-Crawling-Tutorial/results/ceecac6a5a9677750a69c80a87f26080.JPEG\n", 48 | "Save img - /home/dirl/github/Python-Crawling-Tutorial/results/ceecac6a5a9677750a69c80a87f26080.JPEG\n", 49 | "Save img - /home/dirl/github/Python-Crawling-Tutorial/results/4cce70c2cdde67af52e27920693da213.JPEG\n", 50 | "Save img - /home/dirl/github/Python-Crawling-Tutorial/results/1682c3490f1ec9df1da4a43407f890b7.JPEG\n", 51 | "Save img - /home/dirl/github/Python-Crawling-Tutorial/results/aa6a4d1bfa181fc53636a341562fb2ea.PNG\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "results = os.path.abspath('../results')\n", 57 | "if not os.path.exists(results):\n", 58 | " os.makedirs(results)\n", 59 | "\n", 60 | "try:\n", 61 | " driver = webdriver.Chrome()\n", 62 | " driver.get(url)\n", 63 | " driver.maximize_window()\n", 64 | " driver.implicitly_wait(10)\n", 65 | " compare_url = ''\n", 66 | " \n", 67 | " for i in range(5):\n", 68 | " # get image\n", 69 | " img_el = driver.find_element(By.XPATH, '//div[@id=\"recaptcha_image\"]/img')\n", 70 | " img_url = img_el.get_attribute('src')\n", 71 | " img_filename = hashlib.md5(img_url.encode('utf-8')).hexdigest()\n", 72 | " compare_url = img_url\n", 73 | "\n", 74 | " headers = {'User-Agent': fu.random}\n", 75 | " img_resp = requests.get(img_url, stream=True, headers=headers)\n", 76 | " img = Image.open(img_resp.raw)\n", 77 | " img_filename = '{}.{}'.format(img_filename, img.format)\n", 78 | " img_filename = os.path.join(results, img_filename)\n", 79 | " img.save(img_filename)\n", 80 | " print('Save img - {}'.format(img_filename))\n", 81 | " \n", 82 | " # re-generate image\n", 83 | " btn_refresh = driver.find_element(By.XPATH, '//*[@id=\"recaptcha_reload_btn\"]').click()\n", 84 | " time.sleep(2)\n", 85 | " \n", 86 | "\n", 87 | "except Exception as e:\n", 88 | " print(e)\n", 89 | "finally:\n", 90 | " driver.quit()" 91 | ] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 
| "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.5.2" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /03_graph_api/00_facebook_crawling_article_comments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 取得 FB 文章底下所有留言\n", 10 | "- 使用 [Graph API](https://developers.facebook.com/tools/explorer/)\n", 11 | "- https://www.facebook.com/DoctorKoWJ/videos/1213927345375910/\n", 12 | "- 輸出成 CSV" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import requests\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from datetime import datetime" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# 透過 Graph API 觀察文章 ID 與 token\n", 39 | "article_id = '1213927345375910'\n", 40 | "token = ''" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "pages 1\n", 53 | "pages 2\n", 54 | "pages 3\n", 55 | "pages 4\n", 56 | "EOF\n", 57 | "comment length = 431\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "comments = []\n", 63 | "pages = 0\n", 64 | "\n", 65 | "url = 'https://graph.facebook.com/v2.11/{}/comments?pretty=0&limit={}&access_token={}'.format(\n", 66 | " article_id, 100, token\n", 67 | ")\n", 68 | "\n", 69 | "while True:\n", 70 | " pages += 1\n", 71 | " resp = requests.get(url)\n", 72 | " data = resp.json()\n", 73 | " comments += data['data']\n", 74 | " \n", 75 | " if 'next' not in data['paging']:\n", 76 | " print('EOF')\n", 77 | " break\n", 78 | " else:\n", 79 | " url = data['paging']['next']\n", 80 | " print('pages {}'.format(pages))\n", 81 | " \n", 82 | "print('comment length = {}'.format(len(comments)))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
created_time | from | id | message
0 | 2018-01-09T11:02:42+0000 | NaN | 1213927345375910_1213982232037088 | 市長,謝謝您注意到這個議題。但是,不知道您是否同時有發現,比起醫療環境,更加威脅台灣幼兒的,...
1 | 2018-01-09T11:07:44+0000 | NaN | 1213927345375910_1213985318703446 | 我希望如果有天你有能力了,可以為被虐的兒童提出修法保護,更另闢一個無力撫養孩子的人一個出口,...
2 | 2018-01-09T11:21:33+0000 | NaN | 1213927345375910_1213993592035952 | 我也是重症兒童家屬\n感謝你的發言\n我第一次看到有政治人物願意大聲疾呼\n但不是說沒有其他...
3 | 2018-01-09T09:34:35+0000 | NaN | 1213927345375910_1213934828708495 | 每次看到你就覺得台灣還有希望\n不在乎選票在乎的是人
4 | 2018-01-09T11:28:25+0000 | NaN | 1213927345375910_1213997665368878 | 每當我覺得天下的烏鴉一般黑的時候 看到你的發文 又讓我覺得繼續奮鬥 台灣會被照亮的 柯文哲 ...
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " created_time from id \\\n", 159 | "0 2018-01-09T11:02:42+0000 NaN 1213927345375910_1213982232037088 \n", 160 | "1 2018-01-09T11:07:44+0000 NaN 1213927345375910_1213985318703446 \n", 161 | "2 2018-01-09T11:21:33+0000 NaN 1213927345375910_1213993592035952 \n", 162 | "3 2018-01-09T09:34:35+0000 NaN 1213927345375910_1213934828708495 \n", 163 | "4 2018-01-09T11:28:25+0000 NaN 1213927345375910_1213997665368878 \n", 164 | "\n", 165 | " message \n", 166 | "0 市長,謝謝您注意到這個議題。但是,不知道您是否同時有發現,比起醫療環境,更加威脅台灣幼兒的,... \n", 167 | "1 我希望如果有天你有能力了,可以為被虐的兒童提出修法保護,更另闢一個無力撫養孩子的人一個出口,... \n", 168 | "2 我也是重症兒童家屬\\n感謝你的發言\\n我第一次看到有政治人物願意大聲疾呼\\n但不是說沒有其他... \n", 169 | "3 每次看到你就覺得台灣還有希望\\n不在乎選票在乎的是人 \n", 170 | "4 每當我覺得天下的烏鴉一般黑的時候 看到你的發文 又讓我覺得繼續奮鬥 台灣會被照亮的 柯文哲 ... " 171 | ] 172 | }, 173 | "execution_count": 4, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "df = pd.DataFrame.from_records(comments)\n", 180 | "df.head()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 5, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "Save file - /home/dirl/github/Python-Crawling-Tutorial/results/1213927345375910.csv\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "results = os.path.abspath('../results')\n", 198 | "if not os.path.exists(results):\n", 199 | " os.makedirs(results)\n", 200 | "\n", 201 | "filename = os.path.join(results, '{}.csv'.format(article_id))\n", 202 | "df.to_csv(filename, index=False)\n", 203 | "print('Save file - {}'.format(filename))" 204 | ] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "Python 3", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.5.2" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 2 228 | } 229 | -------------------------------------------------------------------------------- /03_graph_api/01_facebook_crawling_fanpage_likes_shares.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 取得 FB 粉絲團中所有文章的按讚與分享數\n", 10 | "- 使用 [Graph API](https://developers.facebook.com/tools/explorer/)\n", 11 | "- https://www.facebook.com/DoctorKoWJ\n", 12 | "- 輸出成 CSV" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import requests\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from datetime import datetime" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# 透過 Graph API 觀察文章 ID 與 token\n", 39 | "fanpage_id = '136845026417486'\n", 40 | "token = ''" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "page 1\n", 53 | "page 2\n", 54 | "page 3\n", 55 | "page 4\n", 56 | "page 5\n", 57 | "page 6\n", 58 | "page 7\n", 59 | "page 
8\n", 60 | "page 9\n", 61 | "page 10\n", 62 | "page 11\n", 63 | "page 12\n", 64 | "page 13\n", 65 | "page 14\n", 66 | "page 15\n", 67 | "page 16\n", 68 | "page 17\n", 69 | "page 18\n", 70 | "page 19\n", 71 | "page 20\n", 72 | "page 21\n", 73 | "page 22\n", 74 | "page 23\n", 75 | "page 24\n", 76 | "page 25\n", 77 | "page 26\n", 78 | "page 27\n", 79 | "page 28\n", 80 | "page 29\n", 81 | "page 30\n", 82 | "page 31\n", 83 | "page 32\n", 84 | "page 33\n", 85 | "page 34\n", 86 | "page 35\n", 87 | "page 36\n", 88 | "page 37\n", 89 | "page 38\n", 90 | "page 39\n", 91 | "page 40\n", 92 | "page 41\n", 93 | "page 42\n", 94 | "page 43\n", 95 | "page 44\n", 96 | "page 45\n", 97 | "page 46\n", 98 | "page 47\n", 99 | "page 48\n", 100 | "page 49\n", 101 | "page 50\n", 102 | "page 51\n", 103 | "page 52\n", 104 | "page 53\n", 105 | "page 54\n", 106 | "page 55\n", 107 | "page 56\n", 108 | "page 57\n", 109 | "EOF\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "url = 'https://graph.facebook.com/v2.11/{}/posts?fields={}&access_token={}'.format(\n", 115 | " fanpage_id, 'id,created_time,name,likes.limit(0).summary(true),shares,message', token\n", 116 | ")\n", 117 | "\n", 118 | "posts = []\n", 119 | "pages = 0\n", 120 | "\n", 121 | "while True:\n", 122 | " resp = requests.get(url)\n", 123 | " data = resp.json()\n", 124 | " posts += data['data']\n", 125 | " pages += 1\n", 126 | " \n", 127 | " if 'next' not in data['paging']:\n", 128 | " print('EOF')\n", 129 | " break\n", 130 | " \n", 131 | " else:\n", 132 | " url = data['paging']['next']\n", 133 | " print('page {}'.format(pages))" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 4, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/html": [ 144 | "
\n", 145 | "\n", 158 | "\n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | "
created_time | id | message | name | total_likes | total_shares
0 | 2018-01-29T10:07:27+0000 | 136845026417486_1230167763751868 | 來荷蘭烏特勒支市走一走,看看他們如何創造一個友善的自行車通行環境。\n\n---\nPart... | 直播|考察荷蘭自行車設施(Part 2) | 9022 | 131.0
1 | 2018-01-29T09:40:44+0000 | 136845026417486_1230143707087607 | 來荷蘭烏特勒支市走一走,看看他們如何創造一座全世界最大的自行車停車場。\n\n---\nPa... | 直播|考察荷蘭自行車設施(Part 1) | 10470 | 181.0
2 | 2018-01-28T03:30:00+0000 | 136845026417486_1228976073871037 | 每次出訪,都是一次難得的學習機會,這一趟歐洲行也不例外。\n\n荷蘭一直是我想去好好研究的地... | Timeline Photos | 38594 | 400.0
3 | 2018-01-27T13:15:49+0000 | 136845026417486_1228569593911685 | 很多人都聽過「順手捐發票,救救老殘窮」,也看過在路上推著烤爐賣烤地瓜的「地瓜媽媽」,這些都是... | NaN | 12317 | 192.0
4 | 2018-01-26T09:29:05+0000 | 136845026417486_1227573790677932 | 政治就是落實在人民的每一天生活之中,讓人民有好的居住環境,應當是中央和地方一致認同的進步價值... | Photos from 柯文哲's post | 9145 | 86.0
\n", 218 | "
" 219 | ], 220 | "text/plain": [ 221 | " created_time id \\\n", 222 | "0 2018-01-29T10:07:27+0000 136845026417486_1230167763751868 \n", 223 | "1 2018-01-29T09:40:44+0000 136845026417486_1230143707087607 \n", 224 | "2 2018-01-28T03:30:00+0000 136845026417486_1228976073871037 \n", 225 | "3 2018-01-27T13:15:49+0000 136845026417486_1228569593911685 \n", 226 | "4 2018-01-26T09:29:05+0000 136845026417486_1227573790677932 \n", 227 | "\n", 228 | " message name \\\n", 229 | "0 來荷蘭烏特勒支市走一走,看看他們如何創造一個友善的自行車通行環境。\\n\\n---\\nPart... 直播|考察荷蘭自行車設施(Part 2) \n", 230 | "1 來荷蘭烏特勒支市走一走,看看他們如何創造一座全世界最大的自行車停車場。\\n\\n---\\nPa... 直播|考察荷蘭自行車設施(Part 1) \n", 231 | "2 每次出訪,都是一次難得的學習機會,這一趟歐洲行也不例外。\\n\\n荷蘭一直是我想去好好研究的地... Timeline Photos \n", 232 | "3 很多人都聽過「順手捐發票,救救老殘窮」,也看過在路上推著烤爐賣烤地瓜的「地瓜媽媽」,這些都是... NaN \n", 233 | "4 政治就是落實在人民的每一天生活之中,讓人民有好的居住環境,應當是中央和地方一致認同的進步價值... Photos from 柯文哲's post \n", 234 | "\n", 235 | " total_likes total_shares \n", 236 | "0 9022 131.0 \n", 237 | "1 10470 181.0 \n", 238 | "2 38594 400.0 \n", 239 | "3 12317 192.0 \n", 240 | "4 9145 86.0 " 241 | ] 242 | }, 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "posts_summary = []\n", 250 | "for post in posts:\n", 251 | " p = {}\n", 252 | " for k, v in post.items():\n", 253 | " if k == 'likes' and 'summary' in v and 'total_count' in v['summary']:\n", 254 | " p['total_likes'] = v['summary']['total_count']\n", 255 | " elif k == 'shares' and 'count' in v:\n", 256 | " p['total_shares'] = v['count']\n", 257 | " else:\n", 258 | " p[k] = v\n", 259 | " posts_summary.append(p)\n", 260 | "\n", 261 | "df = pd.DataFrame.from_records(posts_summary)\n", 262 | "df.head()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 5, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "Save file - /home/dirl/github/Python-Crawling-Tutorial/results/fanpage_136845026417486.csv\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "results = os.path.abspath('../results')\n", 280 | "if not os.path.exists(results):\n", 281 | " os.makedirs(results)\n", 282 | " \n", 283 | "filename = os.path.join(results, 'fanpage_{}.csv'.format(fanpage_id))\n", 284 | "df.to_csv(filename, index=False)\n", 285 | "print('Save file - {}'.format(filename))" 286 | ] 287 | } 288 | ], 289 | "metadata": { 290 | "kernelspec": { 291 | "display_name": "Python 3", 292 | "language": "python", 293 | "name": "python3" 294 | }, 295 | "language_info": { 296 | "codemirror_mode": { 297 | "name": "ipython", 298 | "version": 3 299 | }, 300 | "file_extension": ".py", 301 | "mimetype": "text/x-python", 302 | "name": "python", 303 | "nbconvert_exporter": "python", 304 | "pygments_lexer": "ipython3", 305 | "version": "3.5.2" 306 | } 307 | }, 308 | "nbformat": 4, 309 | "nbformat_minor": 2 310 | } 311 | -------------------------------------------------------------------------------- /03_graph_api/02_facebook_crawling_article_all.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 取得 FB 文章底下所有留言, 附檔連結, application\n", 10 | "- 使用 [Graph API](https://developers.facebook.com/tools/explorer/)\n", 11 | "- https://www.facebook.com/appledaily.tw/posts/10156769966527069\n", 12 | "- 輸出成 CSV" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": 
true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import requests\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from datetime import datetime" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# 透過 Graph API 觀察文章 ID 與 token\n", 39 | "article_id = '232633627068_10156769966527069'\n", 40 | "token = ''" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "pages 1\n", 53 | "pages 2\n", 54 | "pages 3\n", 55 | "pages 4\n", 56 | "pages 5\n", 57 | "pages 6\n", 58 | "pages 7\n", 59 | "comments length = 63\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "comments = []\n", 65 | "pages = 0\n", 66 | "\n", 67 | "\"\"\"\n", 68 | "nested query + 游標型分頁\n", 69 | "%7B => {\n", 70 | "%7D => }\n", 71 | "%2C => ,\n", 72 | "reference: https://www.w3schools.com/tags/ref_urlencode.asp\n", 73 | "\"\"\"\n", 74 | "\n", 75 | "base_url = 'https://graph.facebook.com/v2.11/{}'.format(article_id)\n", 76 | "query = '?fields=comments.limit({})%7Battachment%2Capplication%2Cmessage.limit({})%7D&access_token={}'.format(\n", 77 | " 10, 100, token\n", 78 | ")\n", 79 | "url = '{}/{}'.format(base_url, query)\n", 80 | "\n", 81 | "while True:\n", 82 | " pages += 1\n", 83 | " resp = requests.get(url)\n", 84 | " data = resp.json()\n", 85 | " if 'comments' not in data:\n", 86 | " break\n", 87 | "\n", 88 | " comments += data['comments']['data']\n", 89 | " \n", 90 | " if 'after' not in data['comments']['paging']['cursors']:\n", 91 | " print('EOF')\n", 92 | " break\n", 93 | " else:\n", 94 | " cursors_after = data['comments']['paging']['cursors']['after']\n", 95 | " query = '?fields=comments.limit({}).after({})%7Battachment%2Capplication%2Cmessage.limit({})%7D&access_token={}'.format(\n", 96 | " 10, cursors_after, 100, token\n", 97 | " )\n", 98 | " url = '{}/{}'.format(base_url, query)\n", 99 | " print('pages {}'.format(pages))\n", 100 | "\n", 101 | "print('comments length = {}'.format(len(comments)))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/html": [ 112 | "
\n", 113 | "\n", 126 | "\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | "
application_category | application_id | application_link | application_name | application_namespace | attachment_type | attachment_url | id | message
0 | Utilities | 350685531728 | /android | Facebook for Android | fbandroid | NaN | NaN | 10156769966527069_10156771068602069 | 又要連PO好幾天\n一天好幾篇\nPO到有人反感\n留言開始有人吵架鬥嘴\n最後一面倒開始噴這遊戲
1 | Utilities | 6628568379 | /iphone | Facebook for iPhone | fbiphone | photo | https://www.facebook.com/photo.php?fbid=164774... | 10156769966527069_10156771204372069 | 水溝是怎樣
2 | Utilities | 350685531728 | /android | Facebook for Android | fbandroid | photo | https://www.facebook.com/photo.php?fbid=201326... | 10156769966527069_10156771212477069 | 我的🐸兒子好久才回家本來很生氣(找不到罵兒子的選項XD\n\n結果看到他帶回來的名產\n以及...
3 | Utilities | 350685531728 | /android | Facebook for Android | fbandroid | NaN | NaN | 10156769966527069_10156771109777069 | 重複報導是不會膩喔
4 | Utilities | 350685531728 | /android | Facebook for Android | fbandroid | photo | https://www.facebook.com/photo.php?fbid=537644... | 10156769966527069_10156771833147069 | 我家的青蛙在我肚子裡跟我一起去旅行了
\n", 204 | "
" 205 | ], 206 | "text/plain": [ 207 | " application_category application_id application_link application_name \\\n", 208 | "0 Utilities 350685531728 /android Facebook for Android \n", 209 | "1 Utilities 6628568379 /iphone Facebook for iPhone \n", 210 | "2 Utilities 350685531728 /android Facebook for Android \n", 211 | "3 Utilities 350685531728 /android Facebook for Android \n", 212 | "4 Utilities 350685531728 /android Facebook for Android \n", 213 | "\n", 214 | " application_namespace attachment_type \\\n", 215 | "0 fbandroid NaN \n", 216 | "1 fbiphone photo \n", 217 | "2 fbandroid photo \n", 218 | "3 fbandroid NaN \n", 219 | "4 fbandroid photo \n", 220 | "\n", 221 | " attachment_url \\\n", 222 | "0 NaN \n", 223 | "1 https://www.facebook.com/photo.php?fbid=164774... \n", 224 | "2 https://www.facebook.com/photo.php?fbid=201326... \n", 225 | "3 NaN \n", 226 | "4 https://www.facebook.com/photo.php?fbid=537644... \n", 227 | "\n", 228 | " id \\\n", 229 | "0 10156769966527069_10156771068602069 \n", 230 | "1 10156769966527069_10156771204372069 \n", 231 | "2 10156769966527069_10156771212477069 \n", 232 | "3 10156769966527069_10156771109777069 \n", 233 | "4 10156769966527069_10156771833147069 \n", 234 | "\n", 235 | " message \n", 236 | "0 又要連PO好幾天\\n一天好幾篇\\nPO到有人反感\\n留言開始有人吵架鬥嘴\\n最後一面倒開始噴這遊戲 \n", 237 | "1 水溝是怎樣 \n", 238 | "2 我的🐸兒子好久才回家本來很生氣(找不到罵兒子的選項XD\\n\\n結果看到他帶回來的名產\\n以及... \n", 239 | "3 重複報導是不會膩喔 \n", 240 | "4 我家的青蛙在我肚子裡跟我一起去旅行了 " 241 | ] 242 | }, 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "for comment in comments:\n", 250 | " application, attachment, message = '', '', ''\n", 251 | " if 'application' in comment:\n", 252 | " app = {'application_{}'.format(k):v for k, v in comment['application'].items()}\n", 253 | " comment.update(app)\n", 254 | " del comment['application']\n", 255 | " if 'attachment' in comment:\n", 256 | " att = {\n", 257 | " 'attachment_type': comment['attachment']['type'],\n", 258 | " 'attachment_url': comment['attachment']['url']\n", 259 | " }\n", 260 | " comment.update(att)\n", 261 | " del comment['attachment']\n", 262 | "\n", 263 | "df = pd.DataFrame.from_records(comments)\n", 264 | "df.head()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 5, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "Save file - /home/dirl/github/Python-Crawling-Tutorial/results/232633627068_10156769966527069.csv\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "results = os.path.abspath('../results')\n", 282 | "if not os.path.exists(results):\n", 283 | " os.makedirs(results)\n", 284 | "\n", 285 | "filename = os.path.join(results, '{}.csv'.format(article_id))\n", 286 | "df.to_csv(filename, index=False)\n", 287 | "print('Save file - {}'.format(filename))" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.5.2" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | requests = "*" 8 | lxml = "*" 9 | jupyter = "*" 10 | "beautifulsoup4" = "*" 11 | browsercookie = "*" 12 | pandas = "*" 13 | fake-useragent = "*" 14 | pillow = "*" 15 | tldextract = "*" 16 | selenium = "*" 17 | 18 | [dev-packages] 19 | 20 | [requires] 21 | python_version = "3.6" 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-Crawling-Tutorial 基礎爬蟲實戰 2 | 3 | ## 相關資源 4 | 5 | 最新的投影片放在 [slideshare](https://www.slideshare.net/ChenMingYang/python-crawling-tutorial-87165481) 上, 會不定期更新, 程式碼可透過這個頁面右邊的 **Clone or download** 下載 6 | ![demo](https://user-images.githubusercontent.com/4820492/35319787-585ea0c4-011c-11e8-802a-02ae0dbc4044.png) 7 | 8 | > 2017 年以前的投影片教材放在 [release](https://github.com/afunTW/Python-Crawling-Tutorial/releases), 但是部份實戰練習網站會失效 9 | > 或是可透過 [link](https://goo.gl/CFR95x) 下載投影片 10 | 11 | ## 安裝環境 12 | 13 | ### Anaconda (建議) 14 | 15 | - 下載 Python 3.6 版本 https://www.continuum.io/downloads 16 | - 練習題會使用到瀏覽器 Chrome,麻煩各位選擇自己電腦的平台安裝 [Chrome](https://www.google.com.tw/chrome/browser/desktop/index.html) 17 | - 動態網站的爬蟲也需要下載 webdriver,需要額外下載 18 | - [Chrome](https://sites.google.com/a/chromium.org/chromedriver/downloads) 19 | - [Firefox](https://github.com/mozilla/geckodriver/releases) 20 | - 題目都是以 `jupyter notebook` 進行,安裝完 Anaconda 後即可用內建 `jupyter notebook` 打開 `.ipynb` 檔 21 | - 建議安裝 Anaconda,如有安裝 Anaconda 只需安裝以下套件 22 | 23 | ```sh 24 | $ pip install selenium tldextract Pillow 25 | ``` 26 | 27 | ### pip 28 | 29 | pip 是 Python 的套件管理系統,在部份系統裏面會用 `pip3` 代表 Python3 的版本,請各位依照自己的系統安裝 pip3 後,安裝以下 Python3 版本的套件 30 | 31 | ```sh 32 | # 視情況而定, 使用 pip 或是 pip3 33 | $ pip install requests beautifulsoup4 lxml Pillow selenium tldextract 34 | ``` 35 | 36 | #### Optional: 資料分析 37 | 38 | 沒有練習題但會有範例 code 可以執行,可自行選擇是否安裝 (如果安裝 wordcloud 時有問題,可能是沒有下載 visual studio,可以從 warining 中提供的網址下載安裝) 39 | 40 | ```sh 41 | # Anaconda 42 | $ pip install jieba wordcloud 43 | 44 | # pip 45 | $ pip3 install numpy pandas matplotlib scipy scikit-learn jieba 
wordcloud 46 | ``` 47 | 48 | ## 請遵守別人的規則 49 | 50 | 有些網站會在目錄底下加上 robots.txt, 基本上這就是對方定義的爬蟲規則,請大家在練習爬蟲的時候要尊重對方的規則 51 | 52 | > robots.txt 詳細的語法與用途請參考 [wiki](https://zh.wikipedia.org/zh-tw/Robots.txt) 與 [google 文件](https://support.google.com/webmasters/answer/6062608?hl=zh-Hant) 53 | 54 | --- 55 | 56 | ## Q&A 57 | 58 | **Q: 有哪些常用的 API** 59 | 60 | 課堂中有說到,爬蟲只是一種得到資料的手段,如果對方有提供 API 就可以直接使用 API, 61 | API 通常對方都會幫你整理好資料格式,或是根據權限決定你可以獲取的資料內容 62 | 63 | - [Facebook Graph API](https://developers.facebook.com/tools/explorer/) 64 | - [Youtube](https://www.youtube.com/yt/dev/zh-TW/api-resources.html) 65 | - [Yahoo YQL](https://developer.yahoo.com/yql/) 66 | - [Instagram](https://www.instagram.com/developer/) 67 | - [KKTIX](http://support.kktix.com/knowledgebase/articles/558918-%E6%B4%BB%E5%8B%95%E8%B3%87%E8%A8%8A-api) 68 | - [Google Maps API](https://developers.google.com/maps/?hl=zh-tw) 69 | - [Taipei Open Data API](http://data.taipei/opendata/developer) 70 | - [Imgur API](https://api.imgur.com/endpoints) 71 | -------------------------------------------------------------------------------- /appendix_ptt/00_parse_article.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 爬取單一文章資訊\n", 8 | "\n", 9 | "1. 你有可能會遇到「是否滿18歲」的詢問頁面\n", 10 | "2. 解析 ptt.cc/bbs 裏面文章的結構\n", 11 | "3. 爬取文章\n", 12 | "4. 爬取留言\n", 13 | "\n", 14 | "URL https://www.ptt.cc/bbs/Gossiping/M.1537847530.A.E12.html\n", 15 | "\n", 16 | "BACKUP https://afuntw.github.io/Test-Crawling-Website/pages/ptt/M.1537847530.A.E12.html" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import requests\n", 26 | "import re\n", 27 | "import json\n", 28 | "\n", 29 | "from bs4 import BeautifulSoup, NavigableString\n", 30 | "from pprint import pprint" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "ARTICLE_URL = 'https://www.ptt.cc/bbs/Gossiping/M.1537847530.A.E12.html'" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## 透過 cookies 繞過年齡檢查\n", 47 | "\n", 48 | "觀察開發者工具 > NetWork > requests header" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "scrolled": true 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "\n", 63 | "\n", 64 | "\t\n", 65 | "\t\t\n", 66 | "\t\t\n", 67 | "\n", 68 | "\n", 69 | "\n", 70 | "批踢踢實業坊\n", 71 | "\n", 72 | "\n", 73 | "\n", 74 | "\n", 75 | "\n", 76 | "\n", 77 | "\n", 78 | "\n", 79 | "\n", 80 | "\n", 81 | "\t\n", 82 | " \n", 83 | "\t\t\n", 84 | "
\n", 85 | "
\n", 86 | "

本網站已依網站內容分級規定處理

\n", 87 | "\n", 88 | "

警告︰您即將進入之看板內容需滿十八歲方可瀏覽。

\n", 89 | "\n", 90 | "

若您尚未年滿十八歲,請點選離開。若您已滿十八歲,亦不可將本區之內容派發、傳閱、出售、出租、交給或借予年齡未滿18歲的人士瀏覽,或將本網站內容向該人士出示、播放或放映。

\n", 91 | "
\n", 92 | "
\n", 93 | "\n", 94 | "
\n", 95 | "
\n", 96 | " \n", 97 | "
\n", 98 | " \n", 99 | "
\n", 100 | "
\n", 101 | " \n", 102 | "
\n", 103 | "
\n", 104 | "
\n", 105 | "\n", 106 | "\t\t\n", 107 | "\n", 108 | "\n", 120 | "\n", 121 | "\n", 122 | "\t\t\n", 123 | "\n", 124 | "\n", 125 | "\n", 126 | " \n", 127 | "\n", 128 | "\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "resp = requests.get(ARTICLE_URL)\n", 134 | "if resp.status_code == 200:\n", 135 | " print(resp.text)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 4, 141 | "metadata": { 142 | "scrolled": true 143 | }, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "\n", 150 | "\n", 151 | "\t\n", 152 | "\t\t\n", 153 | "\t\t\n", 154 | "\n", 155 | "\n", 156 | "\n", 157 | "[問卦] 中央與北大併校 - 看板 Gossiping - 批踢踢實業坊\n", 158 | "\n", 159 | "\n", 160 | "\n", 166 | "\n", 167 | "\n", 168 | "\n", 174 | "\n", 175 | "\n", 176 | "\n", 177 | "\n", 178 | "\n", 179 | "\n", 180 | "\n", 181 | "\n", 182 | "\n", 183 | "\n", 184 | "\n", 185 | "\t\n", 186 | " \n", 187 | "\t\t\n", 188 | "
\n", 189 | "\n", 196 | "\n", 197 | "
\n", 198 | "\t
\n", 199 | "\t\t批踢踢實業坊\n", 200 | "\t\t\n", 201 | "\t\t看板 Gossiping\n", 202 | "\t\t關於我們\n", 203 | "\t\t聯絡資訊\n", 204 | "\t
\n", 205 | "
\n", 206 | "
\n", 207 | "\t
\n", 208 | "\t\t返回看板\n", 209 | "\t\t
\n", 210 | "\t\t
\n", 211 | "\t\t\t分享\n", 212 | "\t\t\t
\n", 213 | "\n", 214 | "\t\t\t
\n", 215 | "\n", 223 | "\n", 224 | "\t\t
\n", 225 | "\t
\n", 226 | "
\n", 227 | "
\n", 228 | "
作者R101 (索尼大法好)
看板Gossiping
標題[問卦] 中央與北大併校
時間Tue Sep 25 11:52:08 2018
\n", 229 | "如題啊,最近陽明跟交大併校吵的很兇,中央都變成台聯大邊緣人了。\n", 230 | "為什麼不讓中央跟台北大學併校呢?\n", 231 | "中央缺法商剛好北大有,\n", 232 | "中央的理工北大沒有,兩校剛好互補,\n", 233 | "而且地理位置也不遠,有沒有人想過讓台北大學跟中央合併呢?\n", 234 | "有沒有八卦?\n", 235 | "\n", 236 | "--\n", 237 | "※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 140.115.197.252\n", 238 | "※ 文章網址: https://www.ptt.cc/bbs/Gossiping/M.1537847530.A.E12.html\n", 239 | "
bobobola: 中央不缺商阿 42.75.76.1 09/25 11:54\n", 240 | "
nikewang: 北中和中興合併不就好了121.157.204.247 09/25 11:54\n", 241 | "
nikewang: 北大121.157.204.247 09/25 11:54\n", 242 | "
qqq1234: 北大好不容易才脫離中興獨立 怎可能去併 117.56.55.46 09/25 11:59\n", 243 | "
Lakland: 北大跟北科合作一陣子了,中央找體大吧 114.24.29.42 09/25 11:59\n", 244 | "
atlaswhz: 中央找體大和警大組成桃聯大好了 1.34.181.133 09/25 12:07\n", 245 | "
sooppp: 體大的聽的懂中央上課在教什麼嗎?223.140.169.234 09/25 12:15\n", 246 | "
homepark: 中央缺醫學喇223.137.74.137 09/25 12:16\n", 247 | "
lee457088: 197.252是哪棟140.115.216.209 09/25 12:18\n", 248 | "
不知道\n", 249 | "
mecca: 當年有文法商理工醫農學院 現在洗洗睡吧210.64.134.103 09/25 12:40\n", 250 | "
※ 編輯: R101 (140.115.130.200), 09/25/2018 15:41:01\n", 251 | "
\n", 252 | " \n", 253 | "
\n", 254 | " \n", 255 | "\n", 256 | " \n", 257 | "
本網站已依台灣網站內容分級規定處理。此區域為限制級,未滿十八歲者不得瀏覽。
\n", 258 | "\n", 259 | "
\n", 260 | "\n", 261 | "\t\t\n", 262 | "\n", 263 | "\n", 275 | "\n", 276 | "\n", 277 | "\t\t\n", 278 | "\n", 279 | "\n", 280 | "\n", 281 | " \n", 282 | "\n", 283 | "\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "cookies = {'over18': '1'}\n", 289 | "resp = requests.get(ARTICLE_URL, cookies=cookies)\n", 290 | "if resp.status_code == 200:\n", 291 | " print(resp.text)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 5, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "soup = BeautifulSoup(resp.text, 'lxml')" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "## 爬取文章\n", 308 | "\n", 309 | "- 作者 id\n", 310 | "- 作者暱稱\n", 311 | "- 文章標題\n", 312 | "- 發佈時間\n", 313 | "- 文章內容\n", 314 | "- 發文 ip" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 6, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "{'author_id': 'R101',\n", 327 | " 'author_nickname': '索尼大法好',\n", 328 | " 'contents': '如題啊,最近陽明跟交大併校吵的很兇,中央都變成台聯大邊緣人了。為什麼不讓中央跟台北大學併校呢?中央缺法商剛好北大有,中央的理工北大沒有,兩校剛好互補,而且地理位置也不遠,有沒有人想過讓台北大學跟中央合併呢?有沒有八卦?--\\n'\n", 329 | " '不知道',\n", 330 | " 'ip': '140.115.197.252',\n", 331 | " 'timestamp': 'Tue Sep 25 11:52:08 2018',\n", 332 | " 'title': '[問卦] 中央與北大併校'}\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "article = {\n", 338 | " 'author_id': '',\n", 339 | " 'author_nickname': '',\n", 340 | " 'title': '',\n", 341 | " 'timestamp': '',\n", 342 | " 'contents': '',\n", 343 | " 'ip': ''\n", 344 | "}\n", 345 | "article_body = soup.find(id='main-content')\n", 346 | "\n", 347 | "# article header\n", 348 | "article_head = article_body.findAll('div', class_='article-metaline')\n", 349 | "for metaline in article_head:\n", 350 | " meta_tag = metaline.find(class_='article-meta-tag').text\n", 351 | " meta_value = metaline.find(class_='article-meta-value').text\n", 352 | " if meta_tag == '作者':\n", 353 | " compile_nickname = re.compile('\\((.*)\\)').search(meta_value)\n", 354 | " article['author_id'] = meta_value.split('(')[0].strip(' ')\n", 355 | " article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''\n", 356 | " elif meta_tag == '標題':\n", 357 | " article['title'] = meta_value\n", 358 | " elif meta_tag == '時間':\n", 359 | " article['timestamp'] = meta_value\n", 360 | "\n", 361 | "# article content\n", 362 | "contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]\n", 363 | "contents = [re.sub('\\n', '', expr) for expr in contents]\n", 364 | "contents = [i for i in contents if i]\n", 365 | "contents = '\\n'.join(contents)\n", 366 | "article['contents'] = contents\n", 367 | "\n", 368 | "# article publish ip\n", 369 | "article_ip = article_body.find(class_='f2').text\n", 370 | "compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(article_ip)\n", 371 | "article['ip'] = compile_ip.group(0) if compile_ip else ''\n", 372 | "\n", 373 | "pprint(article)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "## 爬取流言\n", 381 | "\n", 382 | "- 推噓\n", 383 | "- 推文 id\n", 384 | "- 推文內容\n", 385 | "- 推文 ip\n", 386 | "- 推文時間" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 7, 392 | "metadata": { 393 | "scrolled": true 394 | }, 395 | "outputs": [ 396 | { 397 | "name": "stdout", 398 | "output_type": "stream", 399 | "text": [ 400 | "[{'content': ': 中央不缺商阿',\n", 401 | " 'id': 
'bobobola',\n", 402 | " 'ip': '42.75.76.1',\n", 403 | " 'tag': '推 ',\n", 404 | " 'timestamp': '09/25 11:54'},\n", 405 | " {'content': ': 北中和中興合併不就好了',\n", 406 | " 'id': 'nikewang',\n", 407 | " 'ip': '121.157.204.247',\n", 408 | " 'tag': '→ ',\n", 409 | " 'timestamp': '09/25 11:54'},\n", 410 | " {'content': ': 北大',\n", 411 | " 'id': 'nikewang',\n", 412 | " 'ip': '121.157.204.247',\n", 413 | " 'tag': '→ ',\n", 414 | " 'timestamp': '09/25 11:54'},\n", 415 | " {'content': ': 北大好不容易才脫離中興獨立 怎可能去併',\n", 416 | " 'id': 'qqq1234',\n", 417 | " 'ip': '117.56.55.46',\n", 418 | " 'tag': '推 ',\n", 419 | " 'timestamp': '09/25 11:59'},\n", 420 | " {'content': ': 北大跟北科合作一陣子了,中央找體大吧',\n", 421 | " 'id': 'Lakland',\n", 422 | " 'ip': '114.24.29.42',\n", 423 | " 'tag': '→ ',\n", 424 | " 'timestamp': '09/25 11:59'},\n", 425 | " {'content': ': 中央找體大和警大組成桃聯大好了',\n", 426 | " 'id': 'atlaswhz',\n", 427 | " 'ip': '1.34.181.133',\n", 428 | " 'tag': '推 ',\n", 429 | " 'timestamp': '09/25 12:07'},\n", 430 | " {'content': ': 體大的聽的懂中央上課在教什麼嗎?',\n", 431 | " 'id': 'sooppp',\n", 432 | " 'ip': '223.140.169.234',\n", 433 | " 'tag': '推 ',\n", 434 | " 'timestamp': '09/25 12:15'},\n", 435 | " {'content': ': 中央缺醫學喇',\n", 436 | " 'id': 'homepark',\n", 437 | " 'ip': '223.137.74.137',\n", 438 | " 'tag': '推 ',\n", 439 | " 'timestamp': '09/25 12:16'},\n", 440 | " {'content': ': 197.252是哪棟',\n", 441 | " 'id': 'lee457088',\n", 442 | " 'ip': '140.115.216.209',\n", 443 | " 'tag': '→ ',\n", 444 | " 'timestamp': '09/25 12:18'},\n", 445 | " {'content': ': 當年有文法商理工醫農學院 現在洗洗睡吧',\n", 446 | " 'id': 'mecca',\n", 447 | " 'ip': '210.64.134.103',\n", 448 | " 'tag': '推 ',\n", 449 | " 'timestamp': '09/25 12:40'}]\n" 450 | ] 451 | } 452 | ], 453 | "source": [ 454 | "comments = []\n", 455 | "for comment in article_body.findAll('div', class_='push'):\n", 456 | " tag = comment.find(class_='push-tag').text\n", 457 | " guest_id = comment.find(class_='push-userid').text\n", 458 | " guest_content = comment.find(class_='push-content').text\n", 459 | " guest_ipdatetime = comment.find(class_='push-ipdatetime').text\n", 460 | " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(guest_ipdatetime)\n", 461 | " guest_ip = compile_ip.group(0) if compile_ip else ''\n", 462 | " guest_timestamp = re.sub(guest_ip, '', guest_ipdatetime).strip()\n", 463 | " comments.append({\n", 464 | " 'tag': tag,\n", 465 | " 'id': guest_id,\n", 466 | " 'content': guest_content,\n", 467 | " 'ip': guest_ip,\n", 468 | " 'timestamp': guest_timestamp\n", 469 | " })\n", 470 | "pprint(comments)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "## 將資料存成 json 檔" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 8, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "article['comments'] = comments\n", 487 | "data = [article]\n", 488 | "with open('M.1537847530.A.E12.json', 'w+', encoding='utf-8') as f:\n", 489 | " json.dump(data, f, indent=2, ensure_ascii=False)" 490 | ] 491 | } 492 | ], 493 | "metadata": { 494 | "kernelspec": { 495 | "display_name": "Python 3", 496 | "language": "python", 497 | "name": "python3" 498 | }, 499 | "language_info": { 500 | "codemirror_mode": { 501 | "name": "ipython", 502 | "version": 3 503 | }, 504 | "file_extension": ".py", 505 | "mimetype": "text/x-python", 506 | "name": "python", 507 | "nbconvert_exporter": "python", 508 | "pygments_lexer": "ipython3", 509 | "version": "3.6.6" 510 | } 511 | }, 512 | "nbformat": 4, 513 | "nbformat_minor": 2 514 
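
The parsing notebook above passes `cookies={'over18': '1'}` to every individual `requests.get` call. A minimal sketch of the same idea using a shared `requests.Session`, so the over-18 cookie and headers are configured once and reused across requests — note that the `fetch_soup` helper, the `User-Agent` string, and the one-second delay are illustrative assumptions, not part of the original notebook:

```python
import time

import requests
from bs4 import BeautifulSoup

# Illustrative sketch (not from the original notebook): configure the session once,
# then every request carries the over-18 cookie and custom headers automatically.
session = requests.Session()
session.cookies.set('over18', '1', domain='www.ptt.cc')
session.headers.update({'User-Agent': 'Mozilla/5.0 (tutorial crawler)'})


def fetch_soup(url, delay=1.0):
    """GET a PTT page through the shared session and return its BeautifulSoup tree."""
    resp = session.get(url)
    resp.raise_for_status()
    time.sleep(delay)  # small pause between requests, to stay polite to the site
    return BeautifulSoup(resp.text, 'lxml')


if __name__ == '__main__':
    soup = fetch_soup('https://www.ptt.cc/bbs/Gossiping/M.1537847530.A.E12.html')
    print(soup.find(id='main-content') is not None)
```

A shared session also reuses the underlying connection pool, which helps once the later appendix notebooks crawl many article pages in a row.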
| } 515 | -------------------------------------------------------------------------------- /appendix_ptt/01_search_api_by_title.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 給定文章標題,爬取 PTT 上的所有相關文章\n", 8 | "\n", 9 | "- title: [新聞] 2噸水晶球沿街滾 撞壞5輛汽機車和民宅\n", 10 | "- URL encoing (UTF-8)\n", 11 | "- combine URL path" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import requests\n", 21 | "import re\n", 22 | "import json\n", 23 | "\n", 24 | "from bs4 import BeautifulSoup, NavigableString\n", 25 | "from pprint import pprint\n", 26 | "from urllib.parse import urlencode, urljoin" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "QUERY_TITLE = '[新聞] 2噸水晶球沿街滾 撞壞5輛汽機車和民宅'\n", 36 | "cookies = {'over18': '1'}" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## URL encoding\n", 44 | "\n", 45 | "取得相同文章標題的列表" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "https://www.ptt.cc/bbs/Gossiping/search?q=%5B%E6%96%B0%E8%81%9E%5D+2%E5%99%B8%E6%B0%B4%E6%99%B6%E7%90%83%E6%B2%BF%E8%A1%97%E6%BB%BE+%E6%92%9E%E5%A3%9E5%E8%BC%9B%E6%B1%BD%E6%A9%9F%E8%BB%8A%E5%92%8C%E6%B0%91%E5%AE%85\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "encoding_title = urlencode({'q': QUERY_TITLE})\n", 63 | "query = 'https://www.ptt.cc/bbs/Gossiping/search?{}'.format(encoding_title)\n", 64 | "print(query)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "resp_article_list = requests.get(query, cookies=cookies)\n", 74 | "soup_article_list = BeautifulSoup(resp_article_list.text, 'lxml')" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## 列出所有文章並爬取" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "def crawl_article(url):\n", 91 | " resp = requests.get(url, cookies={'over18': '1'})\n", 92 | " if resp.status_code != 200:\n", 93 | " return\n", 94 | " soup = BeautifulSoup(resp.text, 'lxml')\n", 95 | " print('Start to Crawling', url)\n", 96 | "\n", 97 | " # ##############################\n", 98 | " # crawl article\n", 99 | " # ##############################\n", 100 | " article = {\n", 101 | " 'author_id': '',\n", 102 | " 'author_nickname': '',\n", 103 | " 'title': '',\n", 104 | " 'timestamp': '',\n", 105 | " 'contents': '',\n", 106 | " 'ip': ''\n", 107 | " }\n", 108 | " article_body = soup.find(id='main-content')\n", 109 | "\n", 110 | " # article header\n", 111 | " article_head = article_body.findAll('div', class_='article-metaline')\n", 112 | " for metaline in article_head:\n", 113 | " meta_tag = metaline.find(class_='article-meta-tag').text\n", 114 | " meta_value = metaline.find(class_='article-meta-value').text\n", 115 | " if meta_tag == '作者':\n", 116 | " compile_nickname = re.compile('\\((.*)\\)').search(meta_value)\n", 117 | " article['author_id'] = meta_value.split('(')[0].strip(' ')\n", 118 | " article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''\n", 119 | " elif meta_tag == '標題':\n", 
120 | " article['title'] = meta_value\n", 121 | " elif meta_tag == '時間':\n", 122 | " article['timestamp'] = meta_value\n", 123 | "\n", 124 | " # article content\n", 125 | " contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]\n", 126 | " contents = [re.sub('\\n', '', expr) for expr in contents]\n", 127 | " contents = [i for i in contents if i]\n", 128 | " contents = '\\n'.join(contents)\n", 129 | " article['contents'] = contents\n", 130 | "\n", 131 | " # article publish ip\n", 132 | " article_ip = article_body.find(class_='f2').text\n", 133 | " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(article_ip)\n", 134 | " article['ip'] = compile_ip.group(0) if compile_ip else ''\n", 135 | "\n", 136 | " # ##############################\n", 137 | " # crawl comments\n", 138 | " # ##############################\n", 139 | " comments = []\n", 140 | " for comment in article_body.findAll('div', class_='push'):\n", 141 | " tag = comment.find(class_='push-tag').text\n", 142 | " guest_id = comment.find(class_='push-userid').text\n", 143 | " guest_content = comment.find(class_='push-content').text\n", 144 | " guest_ipdatetime = comment.find(class_='push-ipdatetime').text\n", 145 | " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(guest_ipdatetime)\n", 146 | " guest_ip = compile_ip.group(0) if compile_ip else ''\n", 147 | " guest_timestamp = re.sub(guest_ip, '', guest_ipdatetime).strip()\n", 148 | " comments.append({\n", 149 | " 'tag': tag,\n", 150 | " 'id': guest_id,\n", 151 | " 'content': guest_content,\n", 152 | " 'ip': guest_ip,\n", 153 | " 'timestamp': guest_timestamp\n", 154 | " })\n", 155 | " \n", 156 | " article['comments'] = comments\n", 157 | " article['url'] = url\n", 158 | " return article" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 6, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537874850.A.20D.html\n", 171 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537868945.A.8A9.html\n", 172 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537861382.A.154.html\n", 173 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537859788.A.BE2.html\n", 174 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537859045.A.287.html\n", 175 | "Save - search_api_by_title.json\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "data = []\n", 181 | "for article_line in soup_article_list.findAll('div', class_='r-ent'):\n", 182 | " title_tag = article_line.find('div', class_='title')\n", 183 | " article_url = title_tag.find('a')['href']\n", 184 | " article_url = urljoin(resp_article_list.url, article_url)\n", 185 | " article_data = crawl_article(article_url)\n", 186 | " data.append(article_data)\n", 187 | "\n", 188 | "with open('search_api_by_title.json', 'w+', encoding='utf-8') as f:\n", 189 | " json.dump(data, f, indent=2, ensure_ascii=False)\n", 190 | " print('Save - search_api_by_title.json')" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 
| "version": "3.6.6" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 2 215 | } 216 | -------------------------------------------------------------------------------- /appendix_ptt/02_today_articles.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 爬取今天到目前為止的所有文章\n", 8 | "\n", 9 | "https://www.ptt.cc/bbs/Gossiping/index.html" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import requests\n", 19 | "import re\n", 20 | "import json\n", 21 | "\n", 22 | "from bs4 import BeautifulSoup, NavigableString\n", 23 | "from datetime import datetime\n", 24 | "from pprint import pprint\n", 25 | "from urllib.parse import urljoin" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "09/27\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "base_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'\n", 43 | "ptt_today = datetime.now()\n", 44 | "ptt_today_str = ptt_today.strftime('%m/%d')\n", 45 | "print(ptt_today_str)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 取得總頁碼\n", 53 | "\n", 54 | "從 html 上一頁的按鈕中取得 n-1 page 的頁碼,在將該頁碼加一就是總頁碼了" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "resp_base = requests.get(base_url, cookies={'over18': '1'})\n", 64 | "assert resp_base.status_code == 200\n", 65 | "soup_base = BeautifulSoup(resp_base.text, 'lxml') " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "total page = 39228\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "paging_tag = soup_base.find(class_='btn-group-paging')\n", 83 | "total_page = None\n", 84 | "for btn_tag in paging_tag.findAll('a'):\n", 85 | " if btn_tag.text == '‹ 上頁':\n", 86 | " compile_page = re.search('(\\d+)', btn_tag['href'])\n", 87 | " if compile_page:\n", 88 | " total_page = int(compile_page.group(0)) + 1\n", 89 | "print('total page =', total_page)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## 往回檢查日期並爬取文章\n", 97 | "\n", 98 | "最舊的文章頁面,頁碼為 1" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "def crawl_article(url):\n", 108 | " resp = requests.get(url, cookies={'over18': '1'})\n", 109 | " if resp.status_code != 200:\n", 110 | " return\n", 111 | " soup = BeautifulSoup(resp.text, 'lxml')\n", 112 | " print('Start to Crawling', url)\n", 113 | "\n", 114 | " # ##############################\n", 115 | " # crawl article\n", 116 | " # ##############################\n", 117 | " article = {\n", 118 | " 'author_id': '',\n", 119 | " 'author_nickname': '',\n", 120 | " 'title': '',\n", 121 | " 'timestamp': '',\n", 122 | " 'contents': '',\n", 123 | " 'ip': ''\n", 124 | " }\n", 125 | " article_body = soup.find(id='main-content')\n", 126 | "\n", 127 | " # article header\n", 128 | " article_head = article_body.findAll('div', class_='article-metaline')\n", 129 | " for metaline in article_head:\n", 130 | " meta_tag = 
metaline.find(class_='article-meta-tag').text\n", 131 | " meta_value = metaline.find(class_='article-meta-value').text\n", 132 | " if meta_tag == '作者':\n", 133 | " compile_nickname = re.compile('\\((.*)\\)').search(meta_value)\n", 134 | " article['author_id'] = meta_value.split('(')[0].strip(' ')\n", 135 | " article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''\n", 136 | " elif meta_tag == '標題':\n", 137 | " article['title'] = meta_value\n", 138 | " elif meta_tag == '時間':\n", 139 | " article['timestamp'] = meta_value\n", 140 | "\n", 141 | " # article content\n", 142 | " contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]\n", 143 | " contents = [re.sub('\\n', '', expr) for expr in contents]\n", 144 | " contents = [i for i in contents if i]\n", 145 | " contents = '\\n'.join(contents)\n", 146 | " article['contents'] = contents\n", 147 | "\n", 148 | " # article publish ip\n", 149 | " article_ip = article_body.find(class_='f2').text\n", 150 | " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(article_ip)\n", 151 | " article['ip'] = compile_ip.group(0) if compile_ip else ''\n", 152 | "\n", 153 | " # ##############################\n", 154 | " # crawl comments\n", 155 | " # ##############################\n", 156 | " comments = []\n", 157 | " for comment in article_body.findAll('div', class_='push'):\n", 158 | " tag = comment.find(class_='push-tag').text\n", 159 | " guest_id = comment.find(class_='push-userid').text\n", 160 | " guest_content = comment.find(class_='push-content').text\n", 161 | " guest_ipdatetime = comment.find(class_='push-ipdatetime').text\n", 162 | " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(guest_ipdatetime)\n", 163 | " guest_ip = compile_ip.group(0) if compile_ip else ''\n", 164 | " guest_timestamp = re.sub(guest_ip, '', guest_ipdatetime).strip()\n", 165 | " comments.append({\n", 166 | " 'tag': tag,\n", 167 | " 'id': guest_id,\n", 168 | " 'content': guest_content,\n", 169 | " 'ip': guest_ip,\n", 170 | " 'timestamp': guest_timestamp\n", 171 | " })\n", 172 | " \n", 173 | " article['comments'] = comments\n", 174 | " article['url'] = url\n", 175 | " return article" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "DATE_GRATER=1\n", 185 | "DATE_EQUAL=0\n", 186 | "DATE_LESS=-1\n", 187 | "\n", 188 | "def compare_timestamp_md(src, dest):\n", 189 | " \"\"\"\n", 190 | " greater: 1\n", 191 | " equal: 0\n", 192 | " less: -1\n", 193 | " \"\"\"\n", 194 | " date_src = datetime.strptime(src, '%m/%d')\n", 195 | " date_dest = datetime.strptime(dest, '%m/%d')\n", 196 | " if date_dest > date_src:\n", 197 | " return 1\n", 198 | " elif date_dest == date_src:\n", 199 | " return 0\n", 200 | " else:\n", 201 | " return -1" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "https://www.ptt.cc/bbs/Gossiping/index39228.html - date 9/27 result 0\n", 214 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978608.A.325.html\n", 215 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978662.A.45A.html\n", 216 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978695.A.9A7.html\n", 217 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978699.A.194.html\n", 218 | "Start to Crawling 
https://www.ptt.cc/bbs/Gossiping/M.1537978724.A.356.html\n", 219 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978750.A.39A.html\n", 220 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978768.A.08B.html\n", 221 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978815.A.5B2.html\n", 222 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978820.A.119.html\n", 223 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978934.A.F8E.html\n", 224 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978941.A.754.html\n", 225 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978960.A.779.html\n", 226 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978973.A.B90.html\n", 227 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978993.A.F88.html\n", 228 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537979013.A.67C.html\n", 229 | "https://www.ptt.cc/bbs/Gossiping/index39227.html - date 9/27 result 0\n", 230 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977913.A.4EE.html\n", 231 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977930.A.01B.html\n", 232 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977933.A.013.html\n", 233 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977952.A.904.html\n", 234 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977959.A.A7B.html\n", 235 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977966.A.77C.html\n", 236 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978043.A.03E.html\n", 237 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978060.A.9DF.html\n", 238 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978098.A.D36.html\n", 239 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978140.A.C44.html\n", 240 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978152.A.31C.html\n", 241 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978156.A.B1A.html\n", 242 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978179.A.844.html\n", 243 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978195.A.D33.html\n", 244 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978272.A.533.html\n", 245 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978295.A.B6A.html\n", 246 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978350.A.D02.html\n", 247 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978378.A.746.html\n", 248 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978494.A.B6B.html\n", 249 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978521.A.06B.html\n", 250 | "https://www.ptt.cc/bbs/Gossiping/index39226.html - date 9/26 result -1\n", 251 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977639.A.3F8.html\n", 252 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977693.A.A67.html\n", 253 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977700.A.FD6.html\n", 254 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977711.A.493.html\n", 255 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977729.A.BE4.html\n", 256 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977740.A.534.html\n", 257 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977827.A.B50.html\n", 258 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977851.A.17A.html\n", 259 | "Start to Crawling 
https://www.ptt.cc/bbs/Gossiping/M.1537977857.A.B1D.html\n", 260 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977877.A.292.html\n", 261 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977878.A.13E.html\n", 262 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977910.A.566.html\n", 263 | "Save - today_articles.json\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "data = []\n", 269 | "for page in range(total_page, 1, -1):\n", 270 | " current_url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'.format(page)\n", 271 | " resp_page = requests.get(current_url, cookies={'over18': '1'})\n", 272 | " if resp_page.status_code != 200:\n", 273 | " continue\n", 274 | " soup_page = BeautifulSoup(resp_page.text, 'lxml')\n", 275 | " \n", 276 | " # ##############################\n", 277 | " # check the first article date\n", 278 | " # ##############################\n", 279 | " container_tag = soup_page.find('div', class_='r-list-container')\n", 280 | " first_article = container_tag.find('div', class_='r-ent')\n", 281 | " first_article_date = first_article.find('div', class_='date').text.strip()\n", 282 | " compare_datetime = compare_timestamp_md(ptt_today_str, first_article_date)\n", 283 | " print('{} - date {} result {}'.format(current_url, first_article_date, compare_datetime))\n", 284 | " \n", 285 | " if compare_datetime == 1:\n", 286 | " continue\n", 287 | " else:\n", 288 | " # only crawling today's article before r-list-sep line\n", 289 | " for article_row_tag in container_tag.findChildren('div', recursive=False):\n", 290 | " if 'r-list-sep' in article_row_tag['class']:\n", 291 | " break\n", 292 | " if 'r-ent' in article_row_tag['class']:\n", 293 | " article_date = article_row_tag.find('div', class_='date').text.strip()\n", 294 | " article_date_compare = compare_timestamp_md(ptt_today_str, article_date)\n", 295 | " if article_date_compare != 0:\n", 296 | " continue\n", 297 | " article_tag = article_row_tag.find('a', href=True)\n", 298 | " article_url = urljoin(base_url, article_tag['href'])\n", 299 | " article_data = crawl_article(article_url)\n", 300 | " data.append(article_data)\n", 301 | "\n", 302 | " # if the first article date is earlier than current date, should break the iteration\n", 303 | " if compare_datetime == -1:\n", 304 | " break\n", 305 | "\n", 306 | "with open('today_articles.json', 'w+', encoding='utf-8') as f:\n", 307 | " json.dump(data, f, indent=2, ensure_ascii=False)\n", 308 | " print('Save - today_articles.json')" 309 | ] 310 | } 311 | ], 312 | "metadata": { 313 | "kernelspec": { 314 | "display_name": "Python 3", 315 | "language": "python", 316 | "name": "python3" 317 | }, 318 | "language_info": { 319 | "codemirror_mode": { 320 | "name": "ipython", 321 | "version": 3 322 | }, 323 | "file_extension": ".py", 324 | "mimetype": "text/x-python", 325 | "name": "python", 326 | "nbconvert_exporter": "python", 327 | "pygments_lexer": "ipython3", 328 | "version": "3.6.6" 329 | } 330 | }, 331 | "nbformat": 4, 332 | "nbformat_minor": 2 333 | } 334 | -------------------------------------------------------------------------------- /appendix_ptt/03_crawl_image.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 爬取文章上的內文的所有文章\n", 8 | "\n", 9 | "1. 你有可能會遇到「是否滿18歲」的詢問頁面\n", 10 | "2. 解析 ptt.cc/bbs 裏面文章的結構\n", 11 | "3. 爬取文章\n", 12 | "4. 解析並確認圖片格式\n", 13 | "5. 
下載圖片\n", 14 | "\n", 15 | "URL https://www.ptt.cc/bbs/Gossiping/M.1538373690.A.72D.html\n", 16 | "\n", 17 | "BACKUP https://afuntw.github.io/Test-Crawling-Website/pages/ptt/M.1538373690.A.72D.html" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import requests\n", 27 | "import re\n", 28 | "import json\n", 29 | "import os\n", 30 | "\n", 31 | "from PIL import Image\n", 32 | "from bs4 import BeautifulSoup, NavigableString\n", 33 | "from pprint import pprint" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "ARTICLE_URL = 'https://www.ptt.cc/bbs/Gossiping/M.1538373690.A.72D.html'" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## 爬取文章" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "resp = requests.get(ARTICLE_URL, cookies={'over18': '1'})\n", 59 | "assert resp.status_code == 200\n", 60 | "soup = BeautifulSoup(resp.text, 'lxml')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "scrolled": true 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "[https://i.imgur.com/HdI5e8G.jpg,\n", 75 | " https://i.imgur.com/6W5aQk2.jpg,\n", 76 | " https://i.imgur.com/PhhH8ga.jpg,\n", 77 | " https://i.imgur.com/zF6ZwFj.jpg,\n", 78 | " https://i.imgur.com/4CXovkJ.jpg,\n", 79 | " https://i.imgur.com/NFwopB9.jpg,\n", 80 | " https://i.imgur.com/BFlIDmf.jpg,\n", 81 | " https://i.imgur.com/ARewyx8.jpg,\n", 82 | " https://i.imgur.com/LK4fnZX.jpg,\n", 83 | " https://i.imgur.com/AjTWzRW.jpg,\n", 84 | " https://i.imgur.com/daJpBHQ.jpg,\n", 85 | " https://i.imgur.com/X2RqYU6.jpg,\n", 86 | " https://i.imgur.com/j8rj172.jpg,\n", 87 | " https://i.imgur.com/nNnAJFf.jpg,\n", 88 | " https://i.imgur.com/dwZAQu1.jpg,\n", 89 | " https://i.imgur.com/7ibAOi8.jpg,\n", 90 | " https://i.imgur.com/YTaD5bs.jpg,\n", 91 | " https://i.imgur.com/FzwkWYt.jpg,\n", 92 | " https://i.imgur.com/NflbWR5.jpg,\n", 93 | " https://i.imgur.com/6sqAzjT.jpg,\n", 94 | " https://i.imgur.com/KmEAkaP.jpg,\n", 95 | " https://i.imgur.com/73yb0Ao.jpg,\n", 96 | " https://i.imgur.com/K6ukMIf.jpg,\n", 97 | " https://i.imgur.com/3BFzLjv.jpg,\n", 98 | " https://i.imgur.com/72a2Bas.jpg,\n", 99 | " https://i.imgur.com/89GSqqx.jpg,\n", 100 | " https://i.imgur.com/9CSJ3M5.jpg,\n", 101 | " https://i.imgur.com/NgKEiFz.jpg,\n", 102 | " https://i.imgur.com/aN6aYyo.jpg,\n", 103 | " https://i.imgur.com/O2KNZJV.jpg,\n", 104 | " https://i.imgur.com/WvjeC9N.jpg,\n", 105 | " https://i.imgur.com/bG8O5he.jpg,\n", 106 | " https://i.imgur.com/aJ7Lt7l.jpg,\n", 107 | " https://i.imgur.com/bNVe7S2.jpg,\n", 108 | " https://i.imgur.com/LxOXwCC.jpg,\n", 109 | " https://i.imgur.com/wI5TKjP.jpg,\n", 110 | " https://i.imgur.com/TW8c7ei.jpg,\n", 111 | " https://i.imgur.com/xl4zx8N.jpg,\n", 112 | " https://i.imgur.com/kbY3glw.jpg,\n", 113 | " https://i.imgur.com/Aa3utxo.jpg,\n", 114 | " https://i.imgur.com/zPfERpw.jpg,\n", 115 | " https://i.imgur.com/vXAbWHR.jpg,\n", 116 | " https://i.imgur.com/I7hUgF4.jpg,\n", 117 | " https://i.imgur.com/KOu9YRR.jpg,\n", 118 | " https://i.imgur.com/WvjeC9N.jpg,\n", 119 | " https://i.imgur.com/PtXgokJ.jpg,\n", 120 | " https://i.imgur.com/2sF8O4u.jpg,\n", 121 | " https://i.imgur.com/ZnEC7Jf.jpg,\n", 122 | " https://i.imgur.com/zqEwg69.jpg,\n", 
123 | " https://i.imgur.com/I6QeEsc.jpg,\n", 124 | " https://i.imgur.com/XDLSNW4.jpg,\n", 125 | " https://i.imgur.com/4KZ6JOH.jpg,\n", 126 | " https://i.imgur.com/ixuwTe5.jpg,\n", 127 | " https://i.imgur.com/6wShMfE.jpg,\n", 128 | " https://i.imgur.com/6TK1rp5.jpg,\n", 129 | " https://i.imgur.com/Mtf5Hz5.jpg,\n", 130 | " https://i.imgur.com/XLB5kPg.jpg,\n", 131 | " https://i.imgur.com/xIyvraR.jpg,\n", 132 | " https://i.imgur.com/enTsU1Z.jpg,\n", 133 | " https://i.imgur.com/3YHKqwJ.jpg,\n", 134 | " https://i.imgur.com/mNGnRU7.jpg,\n", 135 | " https://i.imgur.com/5ughnWE.jpg,\n", 136 | " https://i.imgur.com/AA8U6Al.jpg,\n", 137 | " https://i.imgur.com/juPKVUR.jpg,\n", 138 | " https://i.imgur.com/M2mJx5N.jpg,\n", 139 | " https://i.imgur.com/8Kwd9Rc.jpg,\n", 140 | " https://i.imgur.com/KmRqaPE.jpg,\n", 141 | " https://i.imgur.com/FIjGDka.jpg,\n", 142 | " https://i.imgur.com/DB0Zu8Q.jpg,\n", 143 | " https://i.imgur.com/t8S3vno.png,\n", 144 | " https://i.imgur.com/MJxZfgi.jpg,\n", 145 | " https://i.imgur.com/G2dw8Cp.jpg,\n", 146 | " https://i.imgur.com/1CwI4YX.jpg,\n", 147 | " https://i.imgur.com/wSShBG7.jpg,\n", 148 | " https://i.imgur.com/kIS1BTe.jpg,\n", 149 | " https://i.imgur.com/3zG4M7q.jpg,\n", 150 | " https://i.imgur.com/xhIgdYH.jpg,\n", 151 | " https://i.imgur.com/Xaefcnj.jpg,\n", 152 | " https://i.imgur.com/VOfcZ6l.jpg,\n", 153 | " https://i.imgur.com/0MvMt9H.jpg,\n", 154 | " https://i.imgur.com/gTBGELL.jpg,\n", 155 | " https://i.imgur.com/mDkgG5m.jpg,\n", 156 | " https://i.imgur.com/6zItH1z.jpg,\n", 157 | " https://i.imgur.com/Ikp4oXG.jpg,\n", 158 | " https://i.imgur.com/ge0XrdB.jpg,\n", 159 | " https://i.imgur.com/qrIsZKP.jpg,\n", 160 | " https://i.imgur.com/4k9bFUi.jpg]\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "main_content = soup.find(id = 'main-content')\n", 166 | "img_link = main_content.findAll('a', recursive=False)\n", 167 | "pprint(img_link)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## 檢查並下載圖片" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 5, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "def check_and_download_img(url, savedir='download_img'):\n", 184 | " image_resp = requests.get(url, stream=True)\n", 185 | " image = Image.open(image_resp.raw)\n", 186 | " filename = os.path.basename(url)\n", 187 | " \n", 188 | " # check format\n", 189 | " real_filename = '{}.{}'.format(\n", 190 | " filename.split('.')[0],\n", 191 | " image.format.lower()\n", 192 | " )\n", 193 | " print('check and fixed filename {} -> {}'.format(filename, real_filename))\n", 194 | " \n", 195 | " # download\n", 196 | " if not os.path.exists(savedir):\n", 197 | " os.makedirs(savedir)\n", 198 | " savepath = os.path.join(savedir, real_filename)\n", 199 | " image.save(savepath)\n", 200 | " print('save imag - {}'.format(savepath))" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 6, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "check and fixed filename HdI5e8G.jpg -> HdI5e8G.jpeg\n", 213 | "save imag - download_img/HdI5e8G.jpeg\n", 214 | "check and fixed filename 6W5aQk2.jpg -> 6W5aQk2.jpeg\n", 215 | "save imag - download_img/6W5aQk2.jpeg\n", 216 | "check and fixed filename PhhH8ga.jpg -> PhhH8ga.jpeg\n", 217 | "save imag - download_img/PhhH8ga.jpeg\n", 218 | "check and fixed filename zF6ZwFj.jpg -> zF6ZwFj.jpeg\n", 219 | "save imag - download_img/zF6ZwFj.jpeg\n", 220 | "check and 
fixed filename 4CXovkJ.jpg -> 4CXovkJ.jpeg\n", 221 | "save imag - download_img/4CXovkJ.jpeg\n", 222 | "check and fixed filename NFwopB9.jpg -> NFwopB9.jpeg\n", 223 | "save imag - download_img/NFwopB9.jpeg\n", 224 | "check and fixed filename BFlIDmf.jpg -> BFlIDmf.jpeg\n", 225 | "save imag - download_img/BFlIDmf.jpeg\n", 226 | "check and fixed filename ARewyx8.jpg -> ARewyx8.jpeg\n", 227 | "save imag - download_img/ARewyx8.jpeg\n", 228 | "check and fixed filename LK4fnZX.jpg -> LK4fnZX.jpeg\n", 229 | "save imag - download_img/LK4fnZX.jpeg\n", 230 | "check and fixed filename AjTWzRW.jpg -> AjTWzRW.jpeg\n", 231 | "save imag - download_img/AjTWzRW.jpeg\n", 232 | "check and fixed filename daJpBHQ.jpg -> daJpBHQ.jpeg\n", 233 | "save imag - download_img/daJpBHQ.jpeg\n", 234 | "check and fixed filename X2RqYU6.jpg -> X2RqYU6.jpeg\n", 235 | "save imag - download_img/X2RqYU6.jpeg\n", 236 | "check and fixed filename j8rj172.jpg -> j8rj172.jpeg\n", 237 | "save imag - download_img/j8rj172.jpeg\n", 238 | "check and fixed filename nNnAJFf.jpg -> nNnAJFf.jpeg\n", 239 | "save imag - download_img/nNnAJFf.jpeg\n", 240 | "check and fixed filename dwZAQu1.jpg -> dwZAQu1.jpeg\n", 241 | "save imag - download_img/dwZAQu1.jpeg\n", 242 | "check and fixed filename 7ibAOi8.jpg -> 7ibAOi8.jpeg\n", 243 | "save imag - download_img/7ibAOi8.jpeg\n", 244 | "check and fixed filename YTaD5bs.jpg -> YTaD5bs.jpeg\n", 245 | "save imag - download_img/YTaD5bs.jpeg\n", 246 | "check and fixed filename FzwkWYt.jpg -> FzwkWYt.jpeg\n", 247 | "save imag - download_img/FzwkWYt.jpeg\n", 248 | "check and fixed filename NflbWR5.jpg -> NflbWR5.jpeg\n", 249 | "save imag - download_img/NflbWR5.jpeg\n", 250 | "check and fixed filename 6sqAzjT.jpg -> 6sqAzjT.jpeg\n", 251 | "save imag - download_img/6sqAzjT.jpeg\n", 252 | "check and fixed filename KmEAkaP.jpg -> KmEAkaP.jpeg\n", 253 | "save imag - download_img/KmEAkaP.jpeg\n", 254 | "check and fixed filename 73yb0Ao.jpg -> 73yb0Ao.jpeg\n", 255 | "save imag - download_img/73yb0Ao.jpeg\n", 256 | "check and fixed filename K6ukMIf.jpg -> K6ukMIf.jpeg\n", 257 | "save imag - download_img/K6ukMIf.jpeg\n", 258 | "check and fixed filename 3BFzLjv.jpg -> 3BFzLjv.jpeg\n", 259 | "save imag - download_img/3BFzLjv.jpeg\n", 260 | "check and fixed filename 72a2Bas.jpg -> 72a2Bas.jpeg\n", 261 | "save imag - download_img/72a2Bas.jpeg\n", 262 | "check and fixed filename 89GSqqx.jpg -> 89GSqqx.jpeg\n", 263 | "save imag - download_img/89GSqqx.jpeg\n", 264 | "check and fixed filename 9CSJ3M5.jpg -> 9CSJ3M5.jpeg\n", 265 | "save imag - download_img/9CSJ3M5.jpeg\n", 266 | "check and fixed filename NgKEiFz.jpg -> NgKEiFz.jpeg\n", 267 | "save imag - download_img/NgKEiFz.jpeg\n", 268 | "check and fixed filename aN6aYyo.jpg -> aN6aYyo.jpeg\n", 269 | "save imag - download_img/aN6aYyo.jpeg\n", 270 | "check and fixed filename O2KNZJV.jpg -> O2KNZJV.jpeg\n", 271 | "save imag - download_img/O2KNZJV.jpeg\n", 272 | "check and fixed filename WvjeC9N.jpg -> WvjeC9N.jpeg\n", 273 | "save imag - download_img/WvjeC9N.jpeg\n", 274 | "check and fixed filename bG8O5he.jpg -> bG8O5he.jpeg\n", 275 | "save imag - download_img/bG8O5he.jpeg\n", 276 | "check and fixed filename aJ7Lt7l.jpg -> aJ7Lt7l.jpeg\n", 277 | "save imag - download_img/aJ7Lt7l.jpeg\n", 278 | "check and fixed filename bNVe7S2.jpg -> bNVe7S2.jpeg\n", 279 | "save imag - download_img/bNVe7S2.jpeg\n", 280 | "check and fixed filename LxOXwCC.jpg -> LxOXwCC.jpeg\n", 281 | "save imag - download_img/LxOXwCC.jpeg\n", 282 | "check and fixed filename wI5TKjP.jpg -> wI5TKjP.jpeg\n", 283 
| "save imag - download_img/wI5TKjP.jpeg\n", 284 | "check and fixed filename TW8c7ei.jpg -> TW8c7ei.jpeg\n", 285 | "save imag - download_img/TW8c7ei.jpeg\n", 286 | "check and fixed filename xl4zx8N.jpg -> xl4zx8N.jpeg\n", 287 | "save imag - download_img/xl4zx8N.jpeg\n", 288 | "check and fixed filename kbY3glw.jpg -> kbY3glw.jpeg\n", 289 | "save imag - download_img/kbY3glw.jpeg\n", 290 | "check and fixed filename Aa3utxo.jpg -> Aa3utxo.jpeg\n", 291 | "save imag - download_img/Aa3utxo.jpeg\n", 292 | "check and fixed filename zPfERpw.jpg -> zPfERpw.jpeg\n", 293 | "save imag - download_img/zPfERpw.jpeg\n", 294 | "check and fixed filename vXAbWHR.jpg -> vXAbWHR.jpeg\n", 295 | "save imag - download_img/vXAbWHR.jpeg\n", 296 | "check and fixed filename I7hUgF4.jpg -> I7hUgF4.jpeg\n", 297 | "save imag - download_img/I7hUgF4.jpeg\n", 298 | "check and fixed filename KOu9YRR.jpg -> KOu9YRR.jpeg\n", 299 | "save imag - download_img/KOu9YRR.jpeg\n", 300 | "check and fixed filename WvjeC9N.jpg -> WvjeC9N.jpeg\n", 301 | "save imag - download_img/WvjeC9N.jpeg\n", 302 | "check and fixed filename PtXgokJ.jpg -> PtXgokJ.jpeg\n", 303 | "save imag - download_img/PtXgokJ.jpeg\n", 304 | "check and fixed filename 2sF8O4u.jpg -> 2sF8O4u.jpeg\n", 305 | "save imag - download_img/2sF8O4u.jpeg\n", 306 | "check and fixed filename ZnEC7Jf.jpg -> ZnEC7Jf.jpeg\n", 307 | "save imag - download_img/ZnEC7Jf.jpeg\n", 308 | "check and fixed filename zqEwg69.jpg -> zqEwg69.jpeg\n", 309 | "save imag - download_img/zqEwg69.jpeg\n", 310 | "check and fixed filename I6QeEsc.jpg -> I6QeEsc.jpeg\n", 311 | "save imag - download_img/I6QeEsc.jpeg\n", 312 | "check and fixed filename XDLSNW4.jpg -> XDLSNW4.jpeg\n", 313 | "save imag - download_img/XDLSNW4.jpeg\n", 314 | "check and fixed filename 4KZ6JOH.jpg -> 4KZ6JOH.jpeg\n", 315 | "save imag - download_img/4KZ6JOH.jpeg\n", 316 | "check and fixed filename ixuwTe5.jpg -> ixuwTe5.jpeg\n", 317 | "save imag - download_img/ixuwTe5.jpeg\n", 318 | "check and fixed filename 6wShMfE.jpg -> 6wShMfE.jpeg\n", 319 | "save imag - download_img/6wShMfE.jpeg\n", 320 | "check and fixed filename 6TK1rp5.jpg -> 6TK1rp5.jpeg\n", 321 | "save imag - download_img/6TK1rp5.jpeg\n", 322 | "check and fixed filename Mtf5Hz5.jpg -> Mtf5Hz5.jpeg\n", 323 | "save imag - download_img/Mtf5Hz5.jpeg\n", 324 | "check and fixed filename XLB5kPg.jpg -> XLB5kPg.jpeg\n", 325 | "save imag - download_img/XLB5kPg.jpeg\n", 326 | "check and fixed filename xIyvraR.jpg -> xIyvraR.jpeg\n", 327 | "save imag - download_img/xIyvraR.jpeg\n", 328 | "check and fixed filename enTsU1Z.jpg -> enTsU1Z.jpeg\n", 329 | "save imag - download_img/enTsU1Z.jpeg\n", 330 | "check and fixed filename 3YHKqwJ.jpg -> 3YHKqwJ.jpeg\n", 331 | "save imag - download_img/3YHKqwJ.jpeg\n", 332 | "check and fixed filename mNGnRU7.jpg -> mNGnRU7.jpeg\n", 333 | "save imag - download_img/mNGnRU7.jpeg\n", 334 | "check and fixed filename 5ughnWE.jpg -> 5ughnWE.jpeg\n", 335 | "save imag - download_img/5ughnWE.jpeg\n", 336 | "check and fixed filename AA8U6Al.jpg -> AA8U6Al.jpeg\n", 337 | "save imag - download_img/AA8U6Al.jpeg\n", 338 | "check and fixed filename juPKVUR.jpg -> juPKVUR.jpeg\n", 339 | "save imag - download_img/juPKVUR.jpeg\n", 340 | "check and fixed filename M2mJx5N.jpg -> M2mJx5N.jpeg\n", 341 | "save imag - download_img/M2mJx5N.jpeg\n", 342 | "check and fixed filename 8Kwd9Rc.jpg -> 8Kwd9Rc.jpeg\n", 343 | "save imag - download_img/8Kwd9Rc.jpeg\n", 344 | "check and fixed filename KmRqaPE.jpg -> KmRqaPE.jpeg\n", 345 | "save imag - download_img/KmRqaPE.jpeg\n", 346 | 
"check and fixed filename FIjGDka.jpg -> FIjGDka.jpeg\n", 347 | "save imag - download_img/FIjGDka.jpeg\n", 348 | "check and fixed filename DB0Zu8Q.jpg -> DB0Zu8Q.jpeg\n", 349 | "save imag - download_img/DB0Zu8Q.jpeg\n", 350 | "check and fixed filename t8S3vno.png -> t8S3vno.png\n", 351 | "save imag - download_img/t8S3vno.png\n", 352 | "check and fixed filename MJxZfgi.jpg -> MJxZfgi.jpeg\n", 353 | "save imag - download_img/MJxZfgi.jpeg\n", 354 | "check and fixed filename G2dw8Cp.jpg -> G2dw8Cp.jpeg\n", 355 | "save imag - download_img/G2dw8Cp.jpeg\n", 356 | "check and fixed filename 1CwI4YX.jpg -> 1CwI4YX.jpeg\n", 357 | "save imag - download_img/1CwI4YX.jpeg\n", 358 | "check and fixed filename wSShBG7.jpg -> wSShBG7.jpeg\n", 359 | "save imag - download_img/wSShBG7.jpeg\n", 360 | "check and fixed filename kIS1BTe.jpg -> kIS1BTe.jpeg\n", 361 | "save imag - download_img/kIS1BTe.jpeg\n", 362 | "check and fixed filename 3zG4M7q.jpg -> 3zG4M7q.jpeg\n", 363 | "save imag - download_img/3zG4M7q.jpeg\n", 364 | "check and fixed filename xhIgdYH.jpg -> xhIgdYH.jpeg\n", 365 | "save imag - download_img/xhIgdYH.jpeg\n", 366 | "check and fixed filename Xaefcnj.jpg -> Xaefcnj.jpeg\n", 367 | "save imag - download_img/Xaefcnj.jpeg\n", 368 | "check and fixed filename VOfcZ6l.jpg -> VOfcZ6l.jpeg\n", 369 | "save imag - download_img/VOfcZ6l.jpeg\n", 370 | "check and fixed filename 0MvMt9H.jpg -> 0MvMt9H.jpeg\n", 371 | "save imag - download_img/0MvMt9H.jpeg\n", 372 | "check and fixed filename gTBGELL.jpg -> gTBGELL.jpeg\n", 373 | "save imag - download_img/gTBGELL.jpeg\n", 374 | "check and fixed filename mDkgG5m.jpg -> mDkgG5m.jpeg\n", 375 | "save imag - download_img/mDkgG5m.jpeg\n", 376 | "check and fixed filename 6zItH1z.jpg -> 6zItH1z.jpeg\n", 377 | "save imag - download_img/6zItH1z.jpeg\n", 378 | "check and fixed filename Ikp4oXG.jpg -> Ikp4oXG.jpeg\n", 379 | "save imag - download_img/Ikp4oXG.jpeg\n", 380 | "check and fixed filename ge0XrdB.jpg -> ge0XrdB.jpeg\n", 381 | "save imag - download_img/ge0XrdB.jpeg\n", 382 | "check and fixed filename qrIsZKP.jpg -> qrIsZKP.jpeg\n", 383 | "save imag - download_img/qrIsZKP.jpeg\n", 384 | "check and fixed filename 4k9bFUi.jpg -> 4k9bFUi.jpeg\n", 385 | "save imag - download_img/4k9bFUi.jpeg\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "for tag in img_link:\n", 391 | " check_and_download_img(tag['href'])" 392 | ] 393 | } 394 | ], 395 | "metadata": { 396 | "kernelspec": { 397 | "display_name": "Python 3", 398 | "language": "python", 399 | "name": "python3" 400 | }, 401 | "language_info": { 402 | "codemirror_mode": { 403 | "name": "ipython", 404 | "version": 3 405 | }, 406 | "file_extension": ".py", 407 | "mimetype": "text/x-python", 408 | "name": "python", 409 | "nbconvert_exporter": "python", 410 | "pygments_lexer": "ipython3", 411 | "version": "3.6.6" 412 | } 413 | }, 414 | "nbformat": 4, 415 | "nbformat_minor": 2 416 | } 417 | -------------------------------------------------------------------------------- /appendix_ptt/README.md: -------------------------------------------------------------------------------- 1 | # Ptt Crawler 2 | 3 | > This crawler is basically with reference to [jwlin/ptt-web-crawler](https://github.com/jwlin/ptt-web-crawler) 4 | --------------------------------------------------------------------------------