├── .gitignore ├── 00_GET_POST ├── 00_GET_movie_board.ipynb ├── 01_POST_thsrc_time_table.ipynb ├── 02_google_search_result.ipynb └── get_post_diff.ipynb ├── 01_files_website ├── 00_image_crawling.ipynb ├── 01_image_crawling_and_check_format.ipynb ├── 02_file_crawling.ipynb ├── 03_website_crawling.ipynb ├── 04_image_crawling_check_last_modified.ipynb └── 05_website_crawling_valid_URL.ipynb ├── 02_selenium ├── 00_selenium_crawling_render_image.ipynb ├── 01_pchome_crawling_item.ipynb ├── 02_selenium_google_search.ipynb └── 03_crawling_reCAPTCHA_image.ipynb ├── 03_graph_api ├── 00_facebook_crawling_article_comments.ipynb ├── 01_facebook_crawling_fanpage_likes_shares.ipynb └── 02_facebook_crawling_article_all.ipynb ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── README.md └── appendix_ptt ├── 00_parse_article.ipynb ├── 01_search_api_by_title.ipynb ├── 02_today_articles.ipynb ├── 03_crawl_image.ipynb └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /00_GET_POST/00_GET_movie_board.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 http://www.boxofficemojo.com/yearly/ 並撰寫爬蟲程式\n", 10 | "- 抓取每年度冠軍排行榜\n", 11 | "- 使用 requests + BeautifulSoup 實作\n", 12 | "- 透過 pandas 輸出成 csv" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import requests\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from bs4 import BeautifulSoup\n", 28 | "\n", 29 | "url = 'http://www.boxofficemojo.com/yearly/'" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | 
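
Note on the exercise above: when the target is a single well-formed HTML table, as on this yearly chart page, pandas can parse it directly instead of walking the rows with BeautifulSoup. A minimal sketch, assuming the same URL as in the notebook (the site layout may have changed since this was written) and that lxml is installed for pandas.read_html; it is not a drop-in replacement for the notebook's cells:

import requests
import pandas as pd

resp = requests.get('http://www.boxofficemojo.com/yearly/', timeout=10)
resp.raise_for_status()

# parse only the table whose cellspacing attribute is "1" (the same table the
# notebook selects with BeautifulSoup) and treat its first row as the header
tables = pd.read_html(resp.text, attrs={'cellspacing': '1'}, header=0)
df = tables[0]
df.to_csv('boxofficemojo.csv', index=False)
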
"metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "resp = requests.get(url)\n", 41 | "resp.encoding = 'utf-8'\n", 42 | "soup = BeautifulSoup(resp.text, 'lxml')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "['Year',\n", 55 | " 'TotalGross*',\n", 56 | " 'Change',\n", 57 | " 'TicketsSold',\n", 58 | " 'Change',\n", 59 | " '# ofMovies',\n", 60 | " 'TotalScreens',\n", 61 | " 'Avg.TicketPrice',\n", 62 | " 'Avg.Cost^',\n", 63 | " '#1 Movie']\n" 64 | ] 65 | }, 66 | { 67 | "data": { 68 | "text/html": [ 69 | "
\n", 70 | "\n", 83 | "\n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " 
\n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | "
YearTotalGross*ChangeTicketsSoldChange# ofMoviesTotalScreensAvg.TicketPriceAvg.Cost^#1 Movie
02018$4,310.3-470.6-264-$9.16-Black Panther
12017$11,071.9-2.7%1,234.3-6.2%738-$8.97-Star Wars: The Last Jedi
22016$11,377.7+2.2%1,315.3-0.4%736-$8.65-Rogue One
32015$11,129.4+7.4%1,320.2+4.1%705-$8.43-Star Wars: The Force Awakens
42014$10,361.2-5.2%1,268.2-5.6%706-$8.17-American Sniper
52013$10,924.6+0.8%1,343.7-1.3%688-$8.13-Catching Fire
62012$10,837.6+6.5%1,361.5+6.1%669-$7.96-The Avengers
72011$10,174.2-3.7%1,283.0-4.2%602-$7.93-Harry Potter / Deathly Hallows (P2)
82010$10,565.6-0.3%1,339.1-5.2%537-$7.89-Toy Story 3
92009$10,595.5+10.0%1,412.7+5.3%521-$7.50-Avatar
102008$9,630.7-0.3%1,341.3-4.5%607-$7.18-The Dark Knight
112007$9,663.8+4.9%1,404.6-0.1%631-$6.88-Spider-Man 3
122006$9,209.5+4.2%1,406.0+2.0%608-$6.55-Dead Man's Chest
132005$8,840.5-5.8%1,379.2-8.7%547-$6.41-Revenge of the Sith
142004$9,380.5+1.5%1,510.5-1.4%551-$6.21-Shrek 2
152003$9,239.7+0.9%1,532.3-2.8%506-$6.03$63.8Return of the King
162002$9,155.1+8.8%1,575.7+6.0%48035,592$5.81$58.8Spider-Man
172001$8,412.5+9.8%1,487.3+4.7%48236,764$5.66$47.7Harry Potter / Sorcerer's Stone
182000$7,661.0+2.9%1,420.8-3.0%47837,396$5.39$54.8The Grinch
191999$7,448.0+7.2%1,465.2-1.1%46137,185$5.08$51.5The Phantom Menace
201998$6,949.0+9.2%1,480.7+6.7%50934,186$4.69$52.7Saving Private Ryan
211997$6,365.9+7.7%1,387.7+3.7%51031,640$4.59$53.4Titanic
221996$5,911.5+7.6%1,338.6+6.0%47129,690$4.42$39.8Independence Day
231995$5,493.5+1.8%1,262.6-2.3%41127,805$4.35$36.4Toy Story
241994$5,396.2+4.7%1,291.7+3.8%45326,586$4.18$34.3Forrest Gump
251993$5,154.2+5.8%1,244.0+6.0%46225,737$4.14$29.9Jurassic Park
261992$4,871.0+1.4%1,173.2+2.9%48025,105$4.15$28.9Aladdin
271991$4,803.2-4.4%1,140.6-4.0%45824,570$4.21$26.1Terminator 2
281990$5,021.8-0.2%1,188.6-5.9%41023,689$4.23$26.8Home Alone
291989$5,033.4+12.9%1,262.8+16.4%50223,132$3.97$23.5Batman
301988$4,458.4+4.8%1,084.8-0.3%51023,234$4.11$18.1Rain Man
311987$4,252.9+12.6%1,088.5+7.0%50923,555$3.91$20.1Three Men and a Baby
321986$3,778.0+0.8%1,017.2-3.7%45122,765$3.71$17.5Top Gun
331985$3,749.2-7.0%1,056.1-11.9%47021,147$3.55$16.8Back to the Future
341984$4,031.0+7.0%1,199.0+0.2%53620,200$3.36$14.4Beverly Hills Cop
351983$3,766.0+9.1%1,197.0+1.9%49518,884$3.15$11.9Return of the Jedi
361982$3,453.0+16.4%1,175.0+10.1%42818,020$2.94$11.8E.T.
371981$2,966.0+7.9%1,067.0+4.4%17318,040$2.78$11.3Raiders / Lost Ark
381980$2,749.0-1,022.0-16117,590$2.69$9.4The Empire Strikes Back
\n", 609 | "
" 610 | ], 611 | "text/plain": [ 612 | " Year TotalGross* Change TicketsSold Change # ofMovies TotalScreens \\\n", 613 | "0 2018 $4,310.3 - 470.6 - 264 - \n", 614 | "1 2017 $11,071.9 -2.7% 1,234.3 -6.2% 738 - \n", 615 | "2 2016 $11,377.7 +2.2% 1,315.3 -0.4% 736 - \n", 616 | "3 2015 $11,129.4 +7.4% 1,320.2 +4.1% 705 - \n", 617 | "4 2014 $10,361.2 -5.2% 1,268.2 -5.6% 706 - \n", 618 | "5 2013 $10,924.6 +0.8% 1,343.7 -1.3% 688 - \n", 619 | "6 2012 $10,837.6 +6.5% 1,361.5 +6.1% 669 - \n", 620 | "7 2011 $10,174.2 -3.7% 1,283.0 -4.2% 602 - \n", 621 | "8 2010 $10,565.6 -0.3% 1,339.1 -5.2% 537 - \n", 622 | "9 2009 $10,595.5 +10.0% 1,412.7 +5.3% 521 - \n", 623 | "10 2008 $9,630.7 -0.3% 1,341.3 -4.5% 607 - \n", 624 | "11 2007 $9,663.8 +4.9% 1,404.6 -0.1% 631 - \n", 625 | "12 2006 $9,209.5 +4.2% 1,406.0 +2.0% 608 - \n", 626 | "13 2005 $8,840.5 -5.8% 1,379.2 -8.7% 547 - \n", 627 | "14 2004 $9,380.5 +1.5% 1,510.5 -1.4% 551 - \n", 628 | "15 2003 $9,239.7 +0.9% 1,532.3 -2.8% 506 - \n", 629 | "16 2002 $9,155.1 +8.8% 1,575.7 +6.0% 480 35,592 \n", 630 | "17 2001 $8,412.5 +9.8% 1,487.3 +4.7% 482 36,764 \n", 631 | "18 2000 $7,661.0 +2.9% 1,420.8 -3.0% 478 37,396 \n", 632 | "19 1999 $7,448.0 +7.2% 1,465.2 -1.1% 461 37,185 \n", 633 | "20 1998 $6,949.0 +9.2% 1,480.7 +6.7% 509 34,186 \n", 634 | "21 1997 $6,365.9 +7.7% 1,387.7 +3.7% 510 31,640 \n", 635 | "22 1996 $5,911.5 +7.6% 1,338.6 +6.0% 471 29,690 \n", 636 | "23 1995 $5,493.5 +1.8% 1,262.6 -2.3% 411 27,805 \n", 637 | "24 1994 $5,396.2 +4.7% 1,291.7 +3.8% 453 26,586 \n", 638 | "25 1993 $5,154.2 +5.8% 1,244.0 +6.0% 462 25,737 \n", 639 | "26 1992 $4,871.0 +1.4% 1,173.2 +2.9% 480 25,105 \n", 640 | "27 1991 $4,803.2 -4.4% 1,140.6 -4.0% 458 24,570 \n", 641 | "28 1990 $5,021.8 -0.2% 1,188.6 -5.9% 410 23,689 \n", 642 | "29 1989 $5,033.4 +12.9% 1,262.8 +16.4% 502 23,132 \n", 643 | "30 1988 $4,458.4 +4.8% 1,084.8 -0.3% 510 23,234 \n", 644 | "31 1987 $4,252.9 +12.6% 1,088.5 +7.0% 509 23,555 \n", 645 | "32 1986 $3,778.0 +0.8% 1,017.2 -3.7% 451 22,765 \n", 646 | "33 1985 $3,749.2 -7.0% 1,056.1 -11.9% 470 21,147 \n", 647 | "34 1984 $4,031.0 +7.0% 1,199.0 +0.2% 536 20,200 \n", 648 | "35 1983 $3,766.0 +9.1% 1,197.0 +1.9% 495 18,884 \n", 649 | "36 1982 $3,453.0 +16.4% 1,175.0 +10.1% 428 18,020 \n", 650 | "37 1981 $2,966.0 +7.9% 1,067.0 +4.4% 173 18,040 \n", 651 | "38 1980 $2,749.0 - 1,022.0 - 161 17,590 \n", 652 | "\n", 653 | " Avg.TicketPrice Avg.Cost^ #1 Movie \n", 654 | "0 $9.16 - Black Panther \n", 655 | "1 $8.97 - Star Wars: The Last Jedi \n", 656 | "2 $8.65 - Rogue One \n", 657 | "3 $8.43 - Star Wars: The Force Awakens \n", 658 | "4 $8.17 - American Sniper \n", 659 | "5 $8.13 - Catching Fire \n", 660 | "6 $7.96 - The Avengers \n", 661 | "7 $7.93 - Harry Potter / Deathly Hallows (P2) \n", 662 | "8 $7.89 - Toy Story 3 \n", 663 | "9 $7.50 - Avatar \n", 664 | "10 $7.18 - The Dark Knight \n", 665 | "11 $6.88 - Spider-Man 3 \n", 666 | "12 $6.55 - Dead Man's Chest \n", 667 | "13 $6.41 - Revenge of the Sith \n", 668 | "14 $6.21 - Shrek 2 \n", 669 | "15 $6.03 $63.8 Return of the King \n", 670 | "16 $5.81 $58.8 Spider-Man \n", 671 | "17 $5.66 $47.7 Harry Potter / Sorcerer's Stone \n", 672 | "18 $5.39 $54.8 The Grinch \n", 673 | "19 $5.08 $51.5 The Phantom Menace \n", 674 | "20 $4.69 $52.7 Saving Private Ryan \n", 675 | "21 $4.59 $53.4 Titanic \n", 676 | "22 $4.42 $39.8 Independence Day \n", 677 | "23 $4.35 $36.4 Toy Story \n", 678 | "24 $4.18 $34.3 Forrest Gump \n", 679 | "25 $4.14 $29.9 Jurassic Park \n", 680 | "26 $4.15 $28.9 Aladdin \n", 681 | "27 $4.21 $26.1 Terminator 2 \n", 
682 | "28 $4.23 $26.8 Home Alone \n", 683 | "29 $3.97 $23.5 Batman \n", 684 | "30 $4.11 $18.1 Rain Man \n", 685 | "31 $3.91 $20.1 Three Men and a Baby \n", 686 | "32 $3.71 $17.5 Top Gun \n", 687 | "33 $3.55 $16.8 Back to the Future \n", 688 | "34 $3.36 $14.4 Beverly Hills Cop \n", 689 | "35 $3.15 $11.9 Return of the Jedi \n", 690 | "36 $2.94 $11.8 E.T. \n", 691 | "37 $2.78 $11.3 Raiders / Lost Ark \n", 692 | "38 $2.69 $9.4 The Empire Strikes Back " 693 | ] 694 | }, 695 | "execution_count": 3, 696 | "metadata": {}, 697 | "output_type": "execute_result" 698 | } 699 | ], 700 | "source": [ 701 | "table = soup.find('table', attrs={'cellspacing': '1'})\n", 702 | "rows = table.find_all('tr')\n", 703 | "from pprint import pprint\n", 704 | "\n", 705 | "colname = rows.pop(0)\n", 706 | "colname = [i.text for i in colname]\n", 707 | "rows = [list(row.stripped_strings) for row in rows]\n", 708 | "\n", 709 | "df = pd.DataFrame(rows, columns=colname)\n", 710 | "df" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 4, 716 | "metadata": {}, 717 | "outputs": [ 718 | { 719 | "name": "stdout", 720 | "output_type": "stream", 721 | "text": [ 722 | "Save csv to /home/dirl/github/Python-Crawling-Tutorial/results/boxofficemojo.csv\n" 723 | ] 724 | } 725 | ], 726 | "source": [ 727 | "results = os.path.abspath('../results')\n", 728 | "if not os.path.exists(results):\n", 729 | " os.makedirs(results)\n", 730 | "\n", 731 | "filename = os.path.join(results, 'boxofficemojo.csv')\n", 732 | "df.to_csv(filename, index=False)\n", 733 | "print('Save csv to {}'.format(filename))" 734 | ] 735 | } 736 | ], 737 | "metadata": { 738 | "kernelspec": { 739 | "display_name": "Python 3", 740 | "language": "python", 741 | "name": "python3" 742 | }, 743 | "language_info": { 744 | "codemirror_mode": { 745 | "name": "ipython", 746 | "version": 3 747 | }, 748 | "file_extension": ".py", 749 | "mimetype": "text/x-python", 750 | "name": "python", 751 | "nbconvert_exporter": "python", 752 | "pygments_lexer": "ipython3", 753 | "version": "3.5.2" 754 | } 755 | }, 756 | "nbformat": 4, 757 | "nbformat_minor": 2 758 | } 759 | -------------------------------------------------------------------------------- /00_GET_POST/01_POST_thsrc_time_table.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://www.thsrc.com.tw/tw/TimeTable/SearchResult 並撰寫爬蟲程式\n", 10 | "- 抓取一個禮拜後的高鐵時刻表\n", 11 | "- 台北到台南下午兩點的班次\n", 12 | "- 使用 requests + BeautifulSoup 實作\n", 13 | "- 透過 pandas 輸出成 csv" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import os\n", 23 | "import requests\n", 24 | "import pandas as pd\n", 25 | "import datetime\n", 26 | "\n", 27 | "from bs4 import BeautifulSoup\n", 28 | "\n", 29 | "url = 'https://www.thsrc.com.tw/tw/TimeTable/SearchResult'" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "The date after one week - 2018/02/28\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "after_one_week = datetime.datetime.now() + datetime.timedelta(weeks=1)\n", 47 | "after_one_week_format = after_one_week.strftime('%Y/%m/%d')\n", 48 | "print('The date after one week - {}'.format(after_one_week_format))\n", 49 | "\n", 50 | "form_data = {\n", 51 | " 
'StartStation': '977abb69-413a-4ccf-a109-0272c24fd490',\n", 52 | " 'EndStation': '9c5ac6ca-ec89-48f8-aab0-41b738cb1814',\n", 53 | " 'SearchDate': after_one_week_format,\n", 54 | " 'SearchTime': '14:00',\n", 55 | " 'SearchWay': 'DepartureInMandarin',\n", 56 | " 'RestTime': '',\n", 57 | " 'EarlyOrLater': ''\n", 58 | "}" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "resp = requests.post(url, data=form_data)\n", 68 | "resp.encoding = 'utf-8'\n", 69 | "soup = BeautifulSoup(resp.text, 'lxml')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/html": [ 80 | "
\n", 81 | "\n", 94 | "\n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | "
車次出發時間抵達時間行車時間早鳥
0083314:1116:1102:008折起
1065114:4616:3201:46
2083715:1117:1102:008折起
3065715:4617:3201:46
4084116:1118:1102:0065折起
5066116:2118:0601:458折起
6066316:4618:3201:46
7084517:1119:1102:0065折起
8066717:2119:0601:458折起
9066917:4619:3201:46
\n", 188 | "
" 189 | ], 190 | "text/plain": [ 191 | " 車次 出發時間 抵達時間 行車時間 早鳥\n", 192 | "0 0833 14:11 16:11 02:00 8折起\n", 193 | "1 0651 14:46 16:32 01:46 \n", 194 | "2 0837 15:11 17:11 02:00 8折起\n", 195 | "3 0657 15:46 17:32 01:46 \n", 196 | "4 0841 16:11 18:11 02:00 65折起\n", 197 | "5 0661 16:21 18:06 01:45 8折起\n", 198 | "6 0663 16:46 18:32 01:46 \n", 199 | "7 0845 17:11 19:11 02:00 65折起\n", 200 | "8 0667 17:21 19:06 01:45 8折起\n", 201 | "9 0669 17:46 19:32 01:46 " 202 | ] 203 | }, 204 | "execution_count": 4, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "rows = soup.table.find_all('tr', recursive=False)\n", 211 | "\n", 212 | "colname, rows = rows[1], rows[2:]\n", 213 | "colname = list(colname.stripped_strings)\n", 214 | "\n", 215 | "for i, row in enumerate(rows):\n", 216 | " trips = row.find('td', class_='column1')\n", 217 | " t_departure = row.find('td', class_='column3')\n", 218 | " t_arrive = row.find('td', class_='column4')\n", 219 | " duration = row.find('td', class_='column2')\n", 220 | " early_ticket = row.find('td', class_='Width1')\n", 221 | " \n", 222 | " trips = trips.text if trips else None\n", 223 | " t_departure = t_departure.text if t_departure else ''\n", 224 | " t_arrive = t_arrive.text if t_arrive else ''\n", 225 | " duration = duration.text if duration else ''\n", 226 | " early_ticket = list(early_ticket.stripped_strings) if early_ticket else ''\n", 227 | " early_ticket = early_ticket[0] if early_ticket else ''\n", 228 | " \n", 229 | " rows[i] = [trips, t_departure, t_arrive, duration, early_ticket]\n", 230 | "\n", 231 | "df = pd.DataFrame(rows, columns=colname)\n", 232 | "df" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 5, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "Save csv to /home/afun/github/Python-Crawling-Tutorial/results/thsrc_20180228.csv\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "results = os.path.abspath('../results')\n", 250 | "if not os.path.exists(results):\n", 251 | " os.makedirs(results)\n", 252 | "\n", 253 | "filename = os.path.join(results, 'thsrc_{}.csv'.format(after_one_week.strftime('%Y%m%d')))\n", 254 | "df.to_csv(filename, index=False)\n", 255 | "print('Save csv to {}'.format(filename))" 256 | ] 257 | } 258 | ], 259 | "metadata": { 260 | "kernelspec": { 261 | "display_name": "Python 3", 262 | "language": "python", 263 | "name": "python3" 264 | }, 265 | "language_info": { 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 3 269 | }, 270 | "file_extension": ".py", 271 | "mimetype": "text/x-python", 272 | "name": "python", 273 | "nbconvert_exporter": "python", 274 | "pygments_lexer": "ipython3", 275 | "version": "3.5.2" 276 | } 277 | }, 278 | "nbformat": 4, 279 | "nbformat_minor": 2 280 | } 281 | -------------------------------------------------------------------------------- /00_GET_POST/02_google_search_result.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 爬取 google 搜尋結果的第一個頁面標題\n", 8 | "\n", 9 | "- 練習使用 beautifulsoup css selector" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import requests\n", 19 | "\n", 20 | "from bs4 import BeautifulSoup\n", 21 | "from urllib3.exceptions import HTTPError\n", 22 | "from urllib.parse import urljoin" 23 | ] 
24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "base_url = 'https://www.google.com.tw/search'\n", 32 | "query = {'q': 'python'}" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "https://www.google.com.tw/search?q=python\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "try:\n", 50 | " resp = requests.get(base_url, params=query)\n", 51 | " soup = BeautifulSoup(resp.text, 'lxml')\n", 52 | " print(resp.url)\n", 53 | "except HTTPError as err:\n", 54 | " print(err)\n", 55 | "except AttributeError as err:\n", 56 | " print(err)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# print(soup.prettify())" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 5, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Welcome to Python.org\n", 78 | "https://www.google.com.tw/url?q=https://www.python.org/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFggoMAA&usg=AOvVaw348GGzSkqgB-FXPinUSErY\n", 79 | "=======================================================================================\n", 80 | "Download Python | Python.org\n", 81 | "https://www.google.com.tw/url?q=https://www.python.org/downloads/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFggzMAE&usg=AOvVaw2UHusa0FkZGKEoJRjlxYza\n", 82 | "=======================================================================================\n", 83 | "Python - 維基百科,自由的百科全書 - Wikipedia\n", 84 | "https://www.google.com.tw/url?q=https://zh.wikipedia.org/zh-tw/Python&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFgg5MAI&usg=AOvVaw1gsx_ugnMzjTP2nlH7zARm\n", 85 | "=======================================================================================\n", 86 | "一小時Python入門-part 1 - - 寫點科普\n", 87 | "https://www.google.com.tw/url?q=https://kopu.chat/2017/01/18/%25E4%25B8%2580%25E5%25B0%258F%25E6%2599%2582python%25E5%2585%25A5%25E9%2596%2580-part-1/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghEMAM&usg=AOvVaw1BLo112Hj6BBWauFDpnbQN\n", 88 | "=======================================================================================\n", 89 | "課程介紹- 成為python數據分析達人的第一課(自學課程) | 政治大學磨 ...\n", 90 | "https://www.google.com.tw/url?q=http://moocs.nccu.edu.tw/course/123&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghOMAQ&usg=AOvVaw3RXTAa5ochrAyo-2evVdhI\n", 91 | "=======================================================================================\n", 92 | "《經濟學人》專文探討:「為什麼Python 是世上最屌的程式語言 ...\n", 93 | "https://www.google.com.tw/url?q=https://buzzorange.com/techorange/2018/08/01/python-a-skr-language/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghTMAU&usg=AOvVaw2yA2hrrl61qBKnKoEeeTix\n", 94 | "=======================================================================================\n", 95 | "Python 入門| Django Girls Taipei\n", 96 | "https://www.google.com.tw/url?q=http://djangogirlstaipei.herokuapp.com/tutorials/python/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghZMAY&usg=AOvVaw0ha-itZMKnVgaSsRQlcutt\n", 97 | "=======================================================================================\n", 98 | "Python Tutorial: Learn Python For Free | Codecademy\n", 99 | 
"https://www.google.com.tw/url?q=https://www.codecademy.com/learn/learn-python&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghfMAc&usg=AOvVaw09DURYBaIbVzO6GSXKb0gH\n", 100 | "=======================================================================================\n", 101 | "Python Tutorial - W3Schools\n", 102 | "https://www.google.com.tw/url?q=https://www.w3schools.com/python/&sa=U&ved=0ahUKEwj-8PeJzfvdAhUEa7wKHfNzBOoQFghlMAg&usg=AOvVaw2kfYHx2obM5EhxwIrSMn-4\n", 103 | "=======================================================================================\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "search_results = soup.select('div.g > h3.r > a[href^=\"/url\"]')\n", 109 | "for search_item in search_results:\n", 110 | " print(search_item.text)\n", 111 | " print(urljoin(base_url, search_item['href']))\n", 112 | " print('='*87)" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.6.6" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 2 137 | } 138 | -------------------------------------------------------------------------------- /00_GET_POST/get_post_diff.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 觀察 GET / POST 的差別\n", 8 | "\n", 9 | "透過 postman 網站的測試觀察 GET 與 POST 之間的差別\n", 10 | "\n", 11 | "- https://docs.postman-echo.com/" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import requests\n", 21 | "from pprint import pformat" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## GET request\n", 29 | "\n", 30 | "- 觀察回傳的內容\n", 31 | "- 觀察 URL" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "get_url = 'https://postman-echo.com/get'\n", 41 | "query = {\n", 42 | " 'name': 'afun',\n", 43 | " 'msg': 'A Foolish Consistency is the Hobgoblin of Little Minds'\n", 44 | "}\n", 45 | "\n", 46 | "get_resp = requests.get(get_url, params=query)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "response text ('{\"args\":{\"name\":\"afun\",\"msg\":\"A Foolish Consistency is the Hobgoblin of '\n", 59 | " 'Little '\n", 60 | " 'Minds\"},\"headers\":{\"host\":\"postman-echo.com\",\"accept\":\"*/*\",\"accept-encoding\":\"gzip, '\n", 61 | " 'deflate\",\"user-agent\":\"python-requests/2.19.1\",\"x-forwarded-port\":\"443\",\"x-forwarded-proto\":\"https\"},\"url\":\"https://postman-echo.com/get?name=afun&msg=A+Foolish+Consistency+is+the+Hobgoblin+of+Little+Minds\"}')\n", 62 | "=======================================================================================\n", 63 | "original URL - https://postman-echo.com/get\n", 64 | "GET URL - https://postman-echo.com/get?name=afun&msg=A+Foolish+Consistency+is+the+Hobgoblin+of+Little+Minds\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "print('response text', 
pformat(get_resp.text))\n", 70 | "print('='*87)\n", 71 | "print('original URL -', get_url)\n", 72 | "print('GET URL -', get_resp.url)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## POST request\n", 80 | "\n", 81 | "- 觀察回傳的內容\n", 82 | "- 觀察 URL" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "post_url = 'https://postman-echo.com/post'\n", 92 | "payload = 'A Foolish Consistency is the Hobgoblin of Little Minds'\n", 93 | "\n", 94 | "post_resp = requests.post(post_url, data=payload)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "response text ('{\"args\":{},\"data\":{},\"files\":{},\"form\":{},\"headers\":{\"host\":\"postman-echo.com\",\"content-length\":\"54\",\"accept\":\"*/*\",\"accept-encoding\":\"gzip, '\n", 107 | " 'deflate\",\"user-agent\":\"python-requests/2.19.1\",\"x-forwarded-port\":\"443\",\"x-forwarded-proto\":\"https\"},\"json\":null,\"url\":\"https://postman-echo.com/post\"}')\n", 108 | "=======================================================================================\n", 109 | "original URL - https://postman-echo.com/post\n", 110 | "GET URL - https://postman-echo.com/post\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "print('response text', pformat(post_resp.text))\n", 116 | "print('='*87)\n", 117 | "print('original URL -', post_url)\n", 118 | "print('GET URL -', post_resp.url)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "query = {\n", 128 | " 'name': 'afun',\n", 129 | " 'msg': 'A Foolish Consistency is the Hobgoblin of Little Minds'\n", 130 | "}\n", 131 | "post_form_data_resp = requests.post(post_url, data=query)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "response text ('{\"args\":{},\"data\":\"\",\"files\":{},\"form\":{\"name\":\"afun\",\"msg\":\"A Foolish '\n", 144 | " 'Consistency is the Hobgoblin of Little '\n", 145 | " 'Minds\"},\"headers\":{\"host\":\"postman-echo.com\",\"content-length\":\"68\",\"accept\":\"*/*\",\"accept-encoding\":\"gzip, '\n", 146 | " 'deflate\",\"content-type\":\"application/x-www-form-urlencoded\",\"user-agent\":\"python-requests/2.19.1\",\"x-forwarded-port\":\"443\",\"x-forwarded-proto\":\"https\"},\"json\":{\"name\":\"afun\",\"msg\":\"A '\n", 147 | " 'Foolish Consistency is the Hobgoblin of Little '\n", 148 | " 'Minds\"},\"url\":\"https://postman-echo.com/post\"}')\n", 149 | "=======================================================================================\n", 150 | "original URL - https://postman-echo.com/post\n", 151 | "GET URL - https://postman-echo.com/post\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "print('response text', pformat(post_form_data_resp.text))\n", 157 | "print('='*87)\n", 158 | "print('original URL -', post_url)\n", 159 | "print('GET URL -', post_form_data_resp.url)" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | 
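
The cells above show the GET/POST contrast by echoing requests back from postman-echo.com. The same point can be seen without sending anything, by preparing both requests locally and inspecting where the parameters end up; a minimal sketch:

import requests

query = {'name': 'afun', 'msg': 'A Foolish Consistency is the Hobgoblin of Little Minds'}

get_req = requests.Request('GET', 'https://postman-echo.com/get', params=query).prepare()
post_req = requests.Request('POST', 'https://postman-echo.com/post', data=query).prepare()

print(get_req.url)    # params are URL-encoded into the query string: .../get?name=afun&msg=...
print(get_req.body)   # None - a plain GET carries no body
print(post_req.url)   # .../post - the URL is unchanged
print(post_req.body)  # name=afun&msg=... - form data travels in the request body
print(post_req.headers['Content-Type'])  # application/x-www-form-urlencoded
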
"file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.6.6" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /01_files_website/00_image_crawling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://www.pexels.com/ 並撰寫爬蟲程式\n", 10 | "- 下載 5 張桌布圖" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import requests\n", 22 | "import re\n", 23 | "import os\n", 24 | "\n", 25 | "from bs4 import BeautifulSoup\n", 26 | "from pprint import pprint\n", 27 | "\n", 28 | "url = 'https://www.pexels.com/'" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "resp = requests.get(url)\n", 40 | "soup = BeautifulSoup(resp.text, 'lxml')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "['https://images.pexels.com/photos/106606/pexels-photo-106606.jpeg?h=350&auto=compress&cs=tinysrgb',\n", 53 | " 'https://images.pexels.com/photos/405041/pexels-photo-405041.jpeg?h=350&auto=compress&cs=tinysrgb',\n", 54 | " 'https://images.pexels.com/photos/102170/pexels-photo-102170.jpeg?h=350&auto=compress&cs=tinysrgb',\n", 55 | " 'https://images.pexels.com/photos/583399/pexels-photo-583399.jpeg?h=350&auto=compress&cs=tinysrgb',\n", 56 | " 'https://images.pexels.com/photos/398533/pexels-photo-398533.jpeg?h=350&auto=compress&cs=tinysrgb']\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "article = soup.find('div', class_='photos').find_all('article', class_='photo-item')\n", 62 | "imgs = [a.find('a').find('img')['src'] for a in article]\n", 63 | "target = imgs[:5]\n", 64 | "\n", 65 | "pprint(target)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "regex catch the name pexels-photo-106606.jpeg\n", 78 | "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-106606.jpeg\n", 79 | "regex catch the name pexels-photo-405041.jpeg\n", 80 | "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-405041.jpeg\n", 81 | "regex catch the name pexels-photo-102170.jpeg\n", 82 | "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-102170.jpeg\n", 83 | "regex catch the name pexels-photo-583399.jpeg\n", 84 | "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-583399.jpeg\n", 85 | "regex catch the name pexels-photo-398533.jpeg\n", 86 | "Save the img at /home/dirl/github/Python-Crawling-Tutorial/results/pexels-photo-398533.jpeg\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "results = os.path.abspath('../results')\n", 92 | "\n", 93 | "if not os.path.exists(results):\n", 94 | " os.makedirs(results)\n", 95 | "\n", 96 | "for i in target:\n", 97 | " img_resp = requests.get(i, stream=True) \n", 98 | " filename = 
re.match(r\".*(pexels-photo-([0-9]{6})\\.jpeg).*\", i).group(1)\n", 99 | " print('regex catch the name {}'.format(filename))\n", 100 | " \n", 101 | " filename = os.path.join(results, filename)\n", 102 | "\n", 103 | " with open(filename, 'wb') as f:\n", 104 | " for chunk in img_resp.iter_content(2048):\n", 105 | " f.write(chunk)\n", 106 | " print('Save the img at {}'.format(filename))" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.5.2" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 2 131 | } 132 | -------------------------------------------------------------------------------- /01_files_website/01_image_crawling_and_check_format.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html 並撰寫爬蟲程式\n", 10 | "- 下載 5 張圖片\n", 11 | "- 以正確的圖片格式存檔" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import requests\n", 23 | "import os\n", 24 | "\n", 25 | "from PIL import Image\n", 26 | "from bs4 import BeautifulSoup\n", 27 | "from pprint import pprint\n", 28 | "\n", 29 | "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html'" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "resp = requests.get(url)\n", 41 | "soup = BeautifulSoup(resp.text, 'lxml')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "imgs = soup.find_all('img')\n", 51 | "imgs = [i['src'] for i in imgs]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "catch the filename XgXT3Va.png and the real format is JPEG\n", 64 | "catch the real filename XgXT3Va.jpeg\n", 65 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/XgXT3Va.jpeg\n", 66 | "catch the filename Q3bkStv.png and the real format is PNG\n", 67 | "catch the real filename Q3bkStv.png\n", 68 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/Q3bkStv.png\n", 69 | "catch the filename IDPxvSl.jpg and the real format is PNG\n", 70 | "catch the real filename IDPxvSl.png\n", 71 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/IDPxvSl.png\n", 72 | "catch the filename ZEhBDs6.png and the real format is PNG\n", 73 | "catch the real filename ZEhBDs6.png\n", 74 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/ZEhBDs6.png\n", 75 | "catch the filename UKxK6FZ.gif and the real format is PNG\n", 76 | "catch the real filename UKxK6FZ.png\n", 77 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/UKxK6FZ.png\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "results = 
os.path.abspath('../results')\n", 83 | "if not os.path.exists(results):\n", 84 | " os.makedirs(results)\n", 85 | "\n", 86 | "for i in imgs:\n", 87 | " img_resp = requests.get(i, stream=True)\n", 88 | " image = Image.open(img_resp.raw)\n", 89 | " filename = os.path.basename(i)\n", 90 | " print('catch the filename {} and the real format is {}'.format(filename, image.format))\n", 91 | " \n", 92 | " real_filename = '{}.{}'.format(\n", 93 | " filename.split('.')[0],\n", 94 | " image.format.lower()\n", 95 | " )\n", 96 | " save_filename = os.path.join(results, real_filename)\n", 97 | " print('catch the real filename {}'.format(real_filename))\n", 98 | " \n", 99 | " image.save(save_filename)\n", 100 | " print('save image at {}'.format(save_filename))" 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.5.2" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 2 125 | } 126 | -------------------------------------------------------------------------------- /01_files_website/02_file_crawling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 http://exam.lib.ntu.edu.tw/graduate 並撰寫爬蟲程式\n", 10 | "- request 附上 User-Agent 資訊\n", 11 | "- 下載頁面上所有 pdf 考古題檔案" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import requests\n", 23 | "import re\n", 24 | "import os\n", 25 | "\n", 26 | "from PIL import Image\n", 27 | "from bs4 import BeautifulSoup\n", 28 | "from fake_useragent import UserAgent\n", 29 | "from urllib.parse import urljoin\n", 30 | "from pprint import pprint\n", 31 | "\n", 32 | "url = 'http://exam.lib.ntu.edu.tw/graduate'" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "fu = UserAgent()\n", 42 | "headers = {'User-Agent': fu.random}\n", 43 | "resp = requests.get(url, headers=headers)\n", 44 | "soup = BeautifulSoup(resp.text, 'lxml')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "(1/30) catch the filename 106_graduate_4.pdf\n", 57 | "(1/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_4.pdf\n", 58 | "(2/30) catch the filename 106_graduate_6.pdf\n", 59 | "(2/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_6.pdf\n", 60 | "(3/30) catch the filename 106_graduate_3.pdf\n", 61 | "(3/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_3.pdf\n", 62 | "(4/30) catch the filename 106_graduate_1.pdf\n", 63 | "(4/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_1.pdf\n", 64 | "(5/30) catch the filename 106_graduate_2.pdf\n", 65 | "(5/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_2.pdf\n", 66 | "(6/30) catch the filename 106_graduate_8.pdf\n", 67 | "(6/30) 
save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n", 68 | "(7/30) catch the filename 106_graduate_5.pdf\n", 69 | "(7/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_5.pdf\n", 70 | "(8/30) catch the filename 106_graduate_10.pdf\n", 71 | "(8/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_10.pdf\n", 72 | "(9/30) catch the filename 106_graduate_7.pdf\n", 73 | "(9/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_7.pdf\n", 74 | "(10/30) catch the filename 106_graduate_11.pdf\n", 75 | "(10/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_11.pdf\n", 76 | "(11/30) catch the filename 106_graduate_13.pdf\n", 77 | "(11/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_13.pdf\n", 78 | "(12/30) catch the filename 106_graduate_15.pdf\n", 79 | "(12/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_15.pdf\n", 80 | "(13/30) catch the filename 106_graduate_14.pdf\n", 81 | "(13/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_14.pdf\n", 82 | "(14/30) catch the filename 106_graduate_8.pdf\n", 83 | "(14/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n", 84 | "(15/30) catch the filename 106_graduate_5.pdf\n", 85 | "(15/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_5.pdf\n", 86 | "(16/30) catch the filename 106_graduate_16.pdf\n", 87 | "(16/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_16.pdf\n", 88 | "(17/30) catch the filename 106_graduate_17.pdf\n", 89 | "(17/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_17.pdf\n", 90 | "(18/30) catch the filename 106_graduate_18.pdf\n", 91 | "(18/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_18.pdf\n", 92 | "(19/30) catch the filename 106_graduate_19.pdf\n", 93 | "(19/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_19.pdf\n", 94 | "(20/30) catch the filename 106_graduate_17.pdf\n", 95 | "(20/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_17.pdf\n", 96 | "(21/30) catch the filename 106_graduate_20.pdf\n", 97 | "(21/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_20.pdf\n", 98 | "(22/30) catch the filename 106_graduate_22.pdf\n", 99 | "(22/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_22.pdf\n", 100 | "(23/30) catch the filename 106_graduate_21.pdf\n", 101 | "(23/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_21.pdf\n", 102 | "(24/30) catch the filename 106_graduate_8.pdf\n", 103 | "(24/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n", 104 | "(25/30) catch the filename 106_graduate_25.pdf\n", 105 | "(25/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_25.pdf\n", 106 | "(26/30) catch the filename 106_graduate_23.pdf\n", 107 | "(26/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_23.pdf\n", 108 | "(27/30) catch the filename 106_graduate_24.pdf\n", 109 | "(27/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_24.pdf\n", 110 | "(28/30) catch the filename 106_graduate_8.pdf\n", 111 | "(28/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_8.pdf\n", 112 | "(29/30) catch the filename 
106_graduate_26.pdf\n", 113 | "(29/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_26.pdf\n", 114 | "(30/30) catch the filename 106_graduate_28.pdf\n", 115 | "(30/30) save file /home/dirl/github/Python-Crawling-Tutorial/results/106_graduate_28.pdf\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "results = os.path.abspath('../results')\n", 121 | "if not os.path.exists(results):\n", 122 | " os.makedirs(results)\n", 123 | "\n", 124 | "pdfs = soup.find_all('img', class_=re.compile('.*field-icon-application-pdf$'))\n", 125 | "for i, pdf in enumerate(pdfs):\n", 126 | " href = pdf.parent['href']\n", 127 | " abs_href = urljoin(resp.url, href)\n", 128 | " file_resp = requests.get(abs_href, headers=headers, stream=True)\n", 129 | " \n", 130 | " filename = os.path.basename(abs_href)\n", 131 | " filename = filename.split('&')[0]\n", 132 | " print('({}/{}) catch the filename {}'.format(i+1, len(pdfs), filename))\n", 133 | " filename = os.path.join(results, filename)\n", 134 | "\n", 135 | " with open(filename, 'wb') as f:\n", 136 | " for chunk in file_resp.iter_content(2048):\n", 137 | " f.write(chunk)\n", 138 | " print('({}/{}) save file {}'.format(i+1, len(pdfs),filename))" 139 | ] 140 | } 141 | ], 142 | "metadata": { 143 | "kernelspec": { 144 | "display_name": "Python 3", 145 | "language": "python", 146 | "name": "python3" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 3 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | "nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython3", 158 | "version": "3.5.2" 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 2 163 | } 164 | -------------------------------------------------------------------------------- /01_files_website/03_website_crawling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html 並撰寫爬蟲程式\n", 10 | "- request 附上 User-Agent 資訊\n", 11 | "- 下載網站上每個網頁的標題" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import requests\n", 23 | "import re\n", 24 | "import os\n", 25 | "\n", 26 | "from PIL import Image\n", 27 | "from bs4 import BeautifulSoup\n", 28 | "from fake_useragent import UserAgent\n", 29 | "from urllib.parse import urljoin\n", 30 | "from pprint import pprint\n", 31 | "\n", 32 | "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html'" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "fu = UserAgent()\n", 44 | "headers = {'User-Agent': fu.random}\n", 45 | "resp = requests.get(url, headers=headers)\n", 46 | "soup = BeautifulSoup(resp.text, 'lxml')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "wait_list = []\n", 56 | "view_list = []\n", 57 | "links = soup.find_all('a')\n", 58 | "links = [link['href'] for link in links]\n", 59 | "links = [urljoin(resp.url, link) for link in links]\n", 60 | "links = list(set(links))\n", 61 | "wait_list += links" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 
4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html\n", 74 | "wait list:\n", 75 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html',\n", 76 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html',\n", 77 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html']\n", 78 | "view list:\n", 79 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html']\n", 80 | "all text:\n", 81 | "['Man must explore, and this is exploration at its greatest']\n", 82 | "=======================================================================================\n", 83 | "https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html\n", 84 | "wait list:\n", 85 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html',\n", 86 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html']\n", 87 | "view list:\n", 88 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n", 89 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html']\n", 90 | "all text:\n", 91 | "['Man must explore, and this is exploration at its greatest', 'About Me']\n", 92 | "=======================================================================================\n", 93 | "https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html\n", 94 | "wait list:\n", 95 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html']\n", 96 | "view list:\n", 97 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n", 98 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html',\n", 99 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html']\n", 100 | "all text:\n", 101 | "['Man must explore, and this is exploration at its greatest',\n", 102 | " 'About Me',\n", 103 | " 'Contact Me']\n", 104 | "=======================================================================================\n", 105 | "https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html\n", 106 | "wait list:\n", 107 | "[]\n", 108 | "view list:\n", 109 | "['https://afuntw.github.io/Test-Crawling-Website/pages/blog/post.html',\n", 110 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/about.html',\n", 111 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/contact.html',\n", 112 | " 'https://afuntw.github.io/Test-Crawling-Website/pages/blog/index.html']\n", 113 | "all text:\n", 114 | "['Man must explore, and this is exploration at its greatest',\n", 115 | " 'About Me',\n", 116 | " 'Contact Me',\n", 117 | " 'Clean Blog']\n", 118 | "=======================================================================================\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "all_h1_text = []\n", 124 | "\n", 125 | "while wait_list:\n", 126 | "\n", 127 | " link = wait_list.pop()\n", 128 | " if link in view_list:\n", 129 | " continue\n", 130 | " \n", 131 | " print(link)\n", 132 | " view_list.append(link)\n", 133 | " \n", 134 | " page_resp = requests.get(link, headers=headers)\n", 135 | " page_soup = BeautifulSoup(page_resp.text, 'lxml')\n", 136 | " \n", 137 | " # get h1 tag on current page\n", 138 | " h1s = page_soup.find_all('h1')\n", 139 | " h1s = [h1.text for h1 in h1s]\n", 140 | " all_h1_text += h1s\n", 141 | " \n", 142 | " # search new links in current page\n", 143 | " links = page_soup.find_all('a')\n", 
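    "    # normalise each href to an absolute URL and queue only pages not already in view_list\n",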
144 | " links = [link['href'] for link in links]\n", 145 | " links = [urljoin(page_resp.url, link) for link in links]\n", 146 | " links = list(filter(lambda x: x not in view_list, links))\n", 147 | " wait_list += links\n", 148 | " wait_list = list(set(wait_list))\n", 149 | " print('wait list:')\n", 150 | " pprint(wait_list)\n", 151 | " print('view list:')\n", 152 | " pprint(view_list)\n", 153 | " print('all text:')\n", 154 | " pprint(all_h1_text)\n", 155 | " print('='*87)" 156 | ] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "Python 3", 162 | "language": "python", 163 | "name": "python3" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.5.2" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 2 180 | } 181 | -------------------------------------------------------------------------------- /01_files_website/04_image_crawling_check_last_modified.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html 並撰寫爬蟲程式\n", 10 | "- 下載 2018/01/29 14:39:10 之後修改過的圖片\n", 11 | "- 以正確的圖片格式存檔" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import requests\n", 23 | "import os\n", 24 | "\n", 25 | "from PIL import Image\n", 26 | "from bs4 import BeautifulSoup\n", 27 | "from datetime import datetime\n", 28 | "from time import ctime\n", 29 | "from pprint import pprint\n", 30 | "\n", 31 | "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/portfolio/index.html'\n", 32 | "last_modified = datetime(2018, 1, 29, 14, 39, 10)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "resp = requests.get(url)\n", 44 | "soup = BeautifulSoup(resp.text, 'lxml')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "imgs = soup.find_all('img')\n", 56 | "imgs = [i['src'] for i in imgs]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "catch the filename IDPxvSl.jpg and the real format is PNG\n", 69 | "catch the real filename IDPxvSl.png\n", 70 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/IDPxvSl.png\n", 71 | "catch the filename UKxK6FZ.gif and the real format is PNG\n", 72 | "catch the real filename UKxK6FZ.png\n", 73 | "save image at /home/dirl/github/Python-Crawling-Tutorial/results/UKxK6FZ.png\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "results = os.path.abspath('../results')\n", 79 | "if not os.path.exists(results):\n", 80 | " os.makedirs(results)\n", 81 | "\n", 82 | "for i in imgs:\n", 83 | " # check header only\n", 84 | " check_resp = requests.head(i)\n", 85 | " check_head = dict(check_resp.headers)\n", 86 | " if 'Last-Modified' in check_head:\n", 87 | " check_modified = 
check_head['Last-Modified']\n", 88 | " check_modified = datetime.strptime(check_modified, '%a, %d %b %Y %H:%M:%S GMT')\n", 89 | " check_not_modified = check_modified < last_modified\n", 90 | " if check_not_modified:\n", 91 | " continue\n", 92 | " \n", 93 | " img_resp = requests.get(i, stream=True)\n", 94 | " image = Image.open(img_resp.raw)\n", 95 | " filename = os.path.basename(i)\n", 96 | " print('catch the filename {} and the real format is {}'.format(filename, image.format))\n", 97 | " \n", 98 | " real_filename = '{}.{}'.format(\n", 99 | " filename.split('.')[0],\n", 100 | " image.format.lower()\n", 101 | " )\n", 102 | " save_filename = os.path.join(results, real_filename)\n", 103 | " print('catch the real filename {}'.format(real_filename))\n", 104 | " \n", 105 | " image.save(save_filename)\n", 106 | " print('save image at {}'.format(save_filename))" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.5.2" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 2 131 | } 132 | -------------------------------------------------------------------------------- /01_files_website/05_website_crawling_valid_URL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 http://aiacademy.tw/ 並撰寫爬蟲程式\n", 10 | "- 紀錄所有 URL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import requests\n", 22 | "import re\n", 23 | "import os\n", 24 | "\n", 25 | "from PIL import Image\n", 26 | "from bs4 import BeautifulSoup\n", 27 | "from fake_useragent import UserAgent\n", 28 | "from urllib.parse import urljoin\n", 29 | "from urllib.parse import urlparse\n", 30 | "from tldextract import extract\n", 31 | "from pprint import pprint\n", 32 | "\n", 33 | "url = 'http://aiacademy.tw/'" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "fu = UserAgent()\n", 45 | "headers = {'User-Agent': fu.random}\n", 46 | "resp = requests.get(url, headers=headers)\n", 47 | "soup = BeautifulSoup(resp.text, 'lxml')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "def invalid_href(url):\n", 59 | " check_anchor = re.match('.*#.*', url)\n", 60 | " check_protocol = re.match('[^https|http].*', urlparse(url).scheme)\n", 61 | " check_js = re.match('javascript.*', url)\n", 62 | " return any([check_anchor, check_protocol, check_js])\n", 63 | "\n", 64 | "def inner_href(url, domain):\n", 65 | " return extract(url).domain == domain" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "http://aiacademy.tw/\n", 78 | "http://aiacademy.tw/admission-mgr2/\n", 79 | "http://aiacademy.tw/opening/\n", 80 | 
"http://aiacademy.tw/category/general/\n", 81 | "http://aiacademy.tw/registration-tech1/\n", 82 | "http://aiacademy.tw/registration/\n", 83 | "http://aiacademy.tw/announcement-180129/\n", 84 | "http://aiacademy.tw/registration-mgr1/\n", 85 | "http://aiacademy.tw/admission-tech2/\n", 86 | "http://aiacademy.tw/curriculum-tech2/\n", 87 | "http://aiacademy.tw/registration-tech2/\n", 88 | "http://aiacademy.tw/wp-content/uploads/2018/01/registration-tech2.pdf\n", 89 | "http://aiacademy.tw/parking/\n", 90 | "http://aiacademy.tw/category/class-tech/\n", 91 | "http://aiacademy.tw/category/news/\n", 92 | "http://aiacademy.tw/category/news/page/3/\n", 93 | "http://aiacademy.tw/refund/\n", 94 | "http://aiacademy.tw/class-tech/\n", 95 | "http://aiacademy.tw/mgr-class-overview/\n", 96 | "http://aiacademy.tw/tech-leader-lecturers/\n", 97 | "http://aiacademy.tw/lecturer/swc/\n", 98 | "http://aiacademy.tw/lecturer/tjw/\n", 99 | "http://aiacademy.tw/lecturer/whm/\n", 100 | "http://aiacademy.tw/general\n", 101 | "http://aiacademy.tw/curriculum/\n", 102 | "http://aiacademy.tw/mgr-registration/\n", 103 | "http://aiacademy.tw/curriculum-tech1/\n", 104 | "http://aiacademy.tw/registration-mgr2/\n", 105 | "http://aiacademy.tw/category/curriculum/\n", 106 | "http://aiacademy.tw/organization/\n", 107 | "http://aiacademy.tw/wp-content/uploads/2018/01/孔校長致詞.pdf\n", 108 | "http://aiacademy.tw/mgr-lecturers/\n", 109 | "http://aiacademy.tw/lecturer/hjh/\n", 110 | "http://aiacademy.tw/mgr-admission-rule/\n", 111 | "http://aiacademy.tw/ta/\n", 112 | "http://aiacademy.tw/press-180127/\n", 113 | "http://aiacademy.tw/category/%e9%87%8d%e5%a4%a7%e6%b4%bb%e5%8b%95/\n", 114 | "http://aiacademy.tw/admission-mgr2\n", 115 | "http://aiacademy.tw/category/faq/\n", 116 | "http://aiacademy.tw/class-tech\n", 117 | "http://aiacademy.tw/lecturer/huk/\n", 118 | "http://aiacademy.tw/leave-of-absence-rule/\n", 119 | "http://aiacademy.tw/about\n", 120 | "http://aiacademy.tw/certificate-rule\n", 121 | "http://aiacademy.tw/lecturer/albert/\n", 122 | "http://aiacademy.tw/wp-content/uploads/2018/01/registration-mgr2.pdf\n", 123 | "http://aiacademy.tw/vision/\n", 124 | "http://aiacademy.tw/exam_notes/\n", 125 | "http://aiacademy.tw/category/news\n", 126 | "http://aiacademy.tw/press-180122/\n", 127 | "http://aiacademy.tw/history/\n", 128 | "http://aiacademy.tw/presence-rule-tech/\n", 129 | "http://aiacademy.tw/support-us/\n", 130 | "http://aiacademy.tw/admission-tech2\n", 131 | "http://aiacademy.tw/category/faq\n", 132 | "http://aiacademy.tw/lecturer/hungyilee/\n", 133 | "http://aiacademy.tw/%e5%ad%b8%e5%93%a1%e7%b7%a8%e8%99%9f%e5%8f%8a%e7%ac%ac%e4%b8%80%e6%9c%9f%e9%96%8b%e5%ad%b8%e5%85%b8%e7%a6%ae%e8%a1%8c%e5%89%8d%e9%80%9a%e7%9f%a5%e5%b7%b2%e5%af%84%e5%87%ba%e8%ab%8b%e6%b3%a8%e6%84%8f/\n", 134 | "http://aiacademy.tw/wp-content/uploads/2018/01/台灣人工智慧學校創校緣起及招生狀況1.pdf\n", 135 | "http://aiacademy.tw/class-1st-overview/\n", 136 | "http://aiacademy.tw/opening-presentation/\n", 137 | "http://aiacademy.tw/certificate-rule/\n", 138 | "http://aiacademy.tw/lecturer/ycw/\n", 139 | "http://aiacademy.tw/wp-content/uploads/2018/01/AI-vs-Startups.pdf\n", 140 | "http://aiacademy.tw/lecturer/ilt/\n", 141 | "http://aiacademy.tw/lecturer/lin/\n", 142 | "http://aiacademy.tw/class-overview/\n", 143 | "http://aiacademy.tw/calendar/\n", 144 | "http://aiacademy.tw/registration-mgr/\n", 145 | "http://aiacademy.tw/mgr-class-enrollment-notice-0110/\n", 146 | "http://aiacademy.tw/lecturer/swh/\n", 147 | "http://aiacademy.tw/admission-tech1/\n", 148 | 
"http://aiacademy.tw/enrollment_1st_term/\n", 149 | "http://aiacademy.tw/mgr-class-1st-overview\n", 150 | "http://aiacademy.tw/faq/\n", 151 | "http://aiacademy.tw/mgr-curriculum/\n", 152 | "http://aiacademy.tw/lecturer/sunmin/\n", 153 | "http://aiacademy.tw/class-1st-overview\n", 154 | "http://aiacademy.tw/lecturer/cph/\n", 155 | "http://aiacademy.tw/rent/\n", 156 | "http://aiacademy.tw/curriculum-mgr2/\n", 157 | "http://aiacademy.tw/lecturer/ysc/\n", 158 | "http://aiacademy.tw/lecturer/shw/\n", 159 | "http://aiacademy.tw/calendar-am071/\n", 160 | "http://aiacademy.tw/opening\n", 161 | "http://aiacademy.tw/admission-mgr1/\n", 162 | "http://aiacademy.tw/lecturers\n", 163 | "http://aiacademy.tw/category/class-mgr/\n", 164 | "http://aiacademy.tw/mgr-class-1st-overview/\n", 165 | "http://aiacademy.tw/wp-content/uploads/2017/12/台灣人工智慧學校經理人周末研修班第一期-報名表格.pdf\n", 166 | "http://aiacademy.tw/class-mgr/\n", 167 | "http://aiacademy.tw/curriculum-mgr/\n", 168 | "http://aiacademy.tw/admission-rule/\n", 169 | "http://aiacademy.tw/category/misc/\n", 170 | "http://aiacademy.tw/policy-tech/\n", 171 | "http://aiacademy.tw/class-mgr\n", 172 | "http://aiacademy.tw/absence-rule-manager/\n", 173 | "http://aiacademy.tw/mgr-class-overview\n", 174 | "http://aiacademy.tw/category/admission/\n", 175 | "http://aiacademy.tw/class-2018jan-written-examination-list/\n", 176 | "http://aiacademy.tw/presence-rule-mgr/\n", 177 | "http://aiacademy.tw/curriculum-mgr1/\n", 178 | "http://aiacademy.tw/lecturer/iac/\n", 179 | "http://aiacademy.tw/category/lecturers/\n", 180 | "http://aiacademy.tw/lecturer/weichao-chen/\n", 181 | "http://aiacademy.tw/wp-content/uploads/2017/10/aiacademy.tw-registration.pdf\n", 182 | "http://aiacademy.tw/job-fair/\n", 183 | "http://aiacademy.tw/corporate-partner\n", 184 | "http://aiacademy.tw/category/news/page/2/\n", 185 | "http://aiacademy.tw/class-enrollment-notice-1225/\n", 186 | "http://aiacademy.tw/aia-examination-notice-20171216/\n", 187 | "http://aiacademy.tw/about/\n", 188 | "http://aiacademy.tw/corporate-partner/\n", 189 | "view list:\n", 190 | "['http://aiacademy.tw/',\n", 191 | " 'http://aiacademy.tw/admission-mgr2/',\n", 192 | " 'http://aiacademy.tw/opening/',\n", 193 | " 'http://aiacademy.tw/category/general/',\n", 194 | " 'http://aiacademy.tw/registration-tech1/',\n", 195 | " 'http://aiacademy.tw/registration/',\n", 196 | " 'http://aiacademy.tw/announcement-180129/',\n", 197 | " 'http://aiacademy.tw/registration-mgr1/',\n", 198 | " 'http://aiacademy.tw/admission-tech2/',\n", 199 | " 'http://aiacademy.tw/curriculum-tech2/',\n", 200 | " 'http://aiacademy.tw/registration-tech2/',\n", 201 | " 'http://aiacademy.tw/wp-content/uploads/2018/01/registration-tech2.pdf',\n", 202 | " 'http://aiacademy.tw/parking/',\n", 203 | " 'http://aiacademy.tw/category/class-tech/',\n", 204 | " 'http://aiacademy.tw/category/news/',\n", 205 | " 'http://aiacademy.tw/category/news/page/3/',\n", 206 | " 'http://aiacademy.tw/refund/',\n", 207 | " 'http://aiacademy.tw/class-tech/',\n", 208 | " 'http://aiacademy.tw/mgr-class-overview/',\n", 209 | " 'http://aiacademy.tw/tech-leader-lecturers/',\n", 210 | " 'http://aiacademy.tw/lecturer/swc/',\n", 211 | " 'http://aiacademy.tw/lecturer/tjw/',\n", 212 | " 'http://aiacademy.tw/lecturer/whm/',\n", 213 | " 'http://aiacademy.tw/general',\n", 214 | " 'http://aiacademy.tw/curriculum/',\n", 215 | " 'http://aiacademy.tw/mgr-registration/',\n", 216 | " 'http://aiacademy.tw/curriculum-tech1/',\n", 217 | " 'http://aiacademy.tw/registration-mgr2/',\n", 218 | " 
'http://aiacademy.tw/category/curriculum/',\n", 219 | " 'http://aiacademy.tw/organization/',\n", 220 | " 'http://aiacademy.tw/wp-content/uploads/2018/01/孔校長致詞.pdf',\n", 221 | " 'http://aiacademy.tw/mgr-lecturers/',\n", 222 | " 'http://aiacademy.tw/lecturer/hjh/',\n", 223 | " 'http://aiacademy.tw/mgr-admission-rule/',\n", 224 | " 'http://aiacademy.tw/ta/',\n", 225 | " 'http://aiacademy.tw/press-180127/',\n", 226 | " 'http://aiacademy.tw/category/%e9%87%8d%e5%a4%a7%e6%b4%bb%e5%8b%95/',\n", 227 | " 'http://aiacademy.tw/admission-mgr2',\n", 228 | " 'http://aiacademy.tw/category/faq/',\n", 229 | " 'http://aiacademy.tw/class-tech',\n", 230 | " 'http://aiacademy.tw/lecturer/huk/',\n", 231 | " 'http://aiacademy.tw/leave-of-absence-rule/',\n", 232 | " 'http://aiacademy.tw/about',\n", 233 | " 'http://aiacademy.tw/certificate-rule',\n", 234 | " 'http://aiacademy.tw/lecturer/albert/',\n", 235 | " 'http://aiacademy.tw/wp-content/uploads/2018/01/registration-mgr2.pdf',\n", 236 | " 'http://aiacademy.tw/vision/',\n", 237 | " 'http://aiacademy.tw/exam_notes/',\n", 238 | " 'http://aiacademy.tw/category/news',\n", 239 | " 'http://aiacademy.tw/press-180122/',\n", 240 | " 'http://aiacademy.tw/history/',\n", 241 | " 'http://aiacademy.tw/presence-rule-tech/',\n", 242 | " 'http://aiacademy.tw/support-us/',\n", 243 | " 'http://aiacademy.tw/admission-tech2',\n", 244 | " 'http://aiacademy.tw/category/faq',\n", 245 | " 'http://aiacademy.tw/lecturer/hungyilee/',\n", 246 | " 'http://aiacademy.tw/%e5%ad%b8%e5%93%a1%e7%b7%a8%e8%99%9f%e5%8f%8a%e7%ac%ac%e4%b8%80%e6%9c%9f%e9%96%8b%e5%ad%b8%e5%85%b8%e7%a6%ae%e8%a1%8c%e5%89%8d%e9%80%9a%e7%9f%a5%e5%b7%b2%e5%af%84%e5%87%ba%e8%ab%8b%e6%b3%a8%e6%84%8f/',\n", 247 | " 'http://aiacademy.tw/wp-content/uploads/2018/01/台灣人工智慧學校創校緣起及招生狀況1.pdf',\n", 248 | " 'http://aiacademy.tw/class-1st-overview/',\n", 249 | " 'http://aiacademy.tw/opening-presentation/',\n", 250 | " 'http://aiacademy.tw/certificate-rule/',\n", 251 | " 'http://aiacademy.tw/lecturer/ycw/',\n", 252 | " 'http://aiacademy.tw/wp-content/uploads/2018/01/AI-vs-Startups.pdf',\n", 253 | " 'http://aiacademy.tw/lecturer/ilt/',\n", 254 | " 'http://aiacademy.tw/lecturer/lin/',\n", 255 | " 'http://aiacademy.tw/class-overview/',\n", 256 | " 'http://aiacademy.tw/calendar/',\n", 257 | " 'http://aiacademy.tw/registration-mgr/',\n", 258 | " 'http://aiacademy.tw/mgr-class-enrollment-notice-0110/',\n", 259 | " 'http://aiacademy.tw/lecturer/swh/',\n", 260 | " 'http://aiacademy.tw/admission-tech1/',\n", 261 | " 'http://aiacademy.tw/enrollment_1st_term/',\n", 262 | " 'http://aiacademy.tw/mgr-class-1st-overview',\n", 263 | " 'http://aiacademy.tw/faq/',\n", 264 | " 'http://aiacademy.tw/mgr-curriculum/',\n", 265 | " 'http://aiacademy.tw/lecturer/sunmin/',\n", 266 | " 'http://aiacademy.tw/class-1st-overview',\n", 267 | " 'http://aiacademy.tw/lecturer/cph/',\n", 268 | " 'http://aiacademy.tw/rent/',\n", 269 | " 'http://aiacademy.tw/curriculum-mgr2/',\n", 270 | " 'http://aiacademy.tw/lecturer/ysc/',\n", 271 | " 'http://aiacademy.tw/lecturer/shw/',\n", 272 | " 'http://aiacademy.tw/calendar-am071/',\n", 273 | " 'http://aiacademy.tw/opening',\n", 274 | " 'http://aiacademy.tw/admission-mgr1/',\n", 275 | " 'http://aiacademy.tw/lecturers',\n", 276 | " 'http://aiacademy.tw/category/class-mgr/',\n", 277 | " 'http://aiacademy.tw/mgr-class-1st-overview/',\n", 278 | " 'http://aiacademy.tw/wp-content/uploads/2017/12/台灣人工智慧學校經理人周末研修班第一期-報名表格.pdf',\n", 279 | " 'http://aiacademy.tw/class-mgr/',\n", 280 | " 'http://aiacademy.tw/curriculum-mgr/',\n", 281 | " 
'http://aiacademy.tw/admission-rule/',\n", 282 | " 'http://aiacademy.tw/category/misc/',\n", 283 | " 'http://aiacademy.tw/policy-tech/',\n", 284 | " 'http://aiacademy.tw/class-mgr',\n", 285 | " 'http://aiacademy.tw/absence-rule-manager/',\n", 286 | " 'http://aiacademy.tw/mgr-class-overview',\n", 287 | " 'http://aiacademy.tw/category/admission/',\n", 288 | " 'http://aiacademy.tw/class-2018jan-written-examination-list/',\n", 289 | " 'http://aiacademy.tw/presence-rule-mgr/',\n", 290 | " 'http://aiacademy.tw/curriculum-mgr1/',\n", 291 | " 'http://aiacademy.tw/lecturer/iac/',\n", 292 | " 'http://aiacademy.tw/category/lecturers/',\n", 293 | " 'http://aiacademy.tw/lecturer/weichao-chen/',\n", 294 | " 'http://aiacademy.tw/wp-content/uploads/2017/10/aiacademy.tw-registration.pdf',\n", 295 | " 'http://aiacademy.tw/job-fair/',\n", 296 | " 'http://aiacademy.tw/corporate-partner',\n", 297 | " 'http://aiacademy.tw/category/news/page/2/',\n", 298 | " 'http://aiacademy.tw/class-enrollment-notice-1225/',\n", 299 | " 'http://aiacademy.tw/aia-examination-notice-20171216/',\n", 300 | " 'http://aiacademy.tw/about/',\n", 301 | " 'http://aiacademy.tw/corporate-partner/']\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "wait_list = [url]\n", 307 | "view_list = []\n", 308 | "domain = extract(url).domain\n", 309 | "\n", 310 | "while wait_list:\n", 311 | "\n", 312 | " link = wait_list.pop()\n", 313 | " if link in view_list:\n", 314 | " continue\n", 315 | " \n", 316 | " if invalid_href(link):\n", 317 | " continue\n", 318 | " \n", 319 | " if not inner_href(link, domain):\n", 320 | " continue\n", 321 | " \n", 322 | " print(link)\n", 323 | " view_list.append(link)\n", 324 | " \n", 325 | " page_resp = requests.get(link, headers=headers)\n", 326 | " page_soup = BeautifulSoup(page_resp.text, 'lxml')\n", 327 | "\n", 328 | " # search new links in current page\n", 329 | " links = page_soup.find_all('a')\n", 330 | " links = [link['href'] for link in links if link.has_attr('href')]\n", 331 | " links = [urljoin(page_resp.url, link) for link in links]\n", 332 | " links = list(filter(lambda x: x not in view_list, links))\n", 333 | " wait_list += links\n", 334 | " wait_list = list(set(wait_list))\n", 335 | "\n", 336 | "print('view list:')\n", 337 | "pprint(view_list)" 338 | ] 339 | } 340 | ], 341 | "metadata": { 342 | "kernelspec": { 343 | "display_name": "Python 3", 344 | "language": "python", 345 | "name": "python3" 346 | }, 347 | "language_info": { 348 | "codemirror_mode": { 349 | "name": "ipython", 350 | "version": 3 351 | }, 352 | "file_extension": ".py", 353 | "mimetype": "text/x-python", 354 | "name": "python", 355 | "nbconvert_exporter": "python", 356 | "pygments_lexer": "ipython3", 357 | "version": "3.5.2" 358 | } 359 | }, 360 | "nbformat": 4, 361 | "nbformat_minor": 2 362 | } 363 | -------------------------------------------------------------------------------- /02_selenium/00_selenium_crawling_render_image.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 https://afuntw.github.io/Test-Crawling-Website/pages/gallery/index.html 並撰寫爬蟲程式\n", 10 | "- 判斷是否為 JavaScript rendered website\n", 11 | "- 下載網頁影片\n", 12 | "- 設定 Implicit Wait\n", 13 | "- 透過 XPath 定位圖片\n", 14 | "\n", 15 | "**透過靜態網站爬蟲會看到的圖片是**\n", 16 | "\n", 17 | "\n", 18 | "\n", 19 | "**透過動態網站爬蟲會看到的圖片是**\n", 20 | "\n", 21 | "" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | 
"metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import os\n", 33 | "import requests\n", 34 | "import re\n", 35 | "\n", 36 | "from bs4 import BeautifulSoup\n", 37 | "from selenium import webdriver\n", 38 | "from selenium.webdriver.common.by import By\n", 39 | "from fake_useragent import UserAgent\n", 40 | "from pprint import pprint\n", 41 | "\n", 42 | "url = 'https://afuntw.github.io/Test-Crawling-Website/pages/gallery/index.html'\n", 43 | "fu = UserAgent()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# 使用 requests 做靜態爬蟲" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "['https://i.imgur.com/0s6Iiu3.png']\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "resp = requests.get(url)\n", 68 | "soup = BeautifulSoup(resp.text, 'lxml')\n", 69 | "imgs = soup.find_all('img', class_=re.compile('.*img-change'))\n", 70 | "imgs = [i['src'] for i in imgs]\n", 71 | "imgs = list(set(imgs))\n", 72 | "pprint(imgs)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "# 使用 Selenium 做動態爬蟲" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "['https://i.imgur.com/db3tGBG.png']\n", 92 | "catch - db3tGBG.png\n", 93 | "save - /home/dirl/github/Python-Crawling-Tutorial/results/db3tGBG.png\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "driver = webdriver.Chrome()\n", 99 | "results = os.path.abspath('../results')\n", 100 | "if not os.path.exists(results):\n", 101 | " os.makedirs(results)\n", 102 | "\n", 103 | "try:\n", 104 | " # webdriver setting\n", 105 | " driver.get(url)\n", 106 | " driver.maximize_window()\n", 107 | " driver.implicitly_wait(10)\n", 108 | " \n", 109 | " # xpath\n", 110 | " imgs = driver.find_elements(By.XPATH, '/html/body/div/div/div/a/img')\n", 111 | " imgs = [i.get_attribute('src') for i in imgs]\n", 112 | " imgs = list(set(imgs))\n", 113 | " print(imgs)\n", 114 | " \n", 115 | " # download\n", 116 | " for img in imgs:\n", 117 | " headers = {'User-Agent': fu.random}\n", 118 | " img_resp = requests.get(img, stream=True, headers=headers)\n", 119 | " \n", 120 | " filename = os.path.basename(img)\n", 121 | " print('catch - {}'.format(filename))\n", 122 | " filename = os.path.join(results, filename)\n", 123 | " \n", 124 | " with open(filename, 'wb') as f:\n", 125 | " for chunk in img_resp.iter_content(2048):\n", 126 | " f.write(chunk)\n", 127 | " print('save - {}'.format(filename))\n", 128 | " \n", 129 | "except Exception as e:\n", 130 | " print(e)\n", 131 | "finally:\n", 132 | " driver.quit()" 133 | ] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "Python 3", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.5.2" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 2 157 | } 158 | -------------------------------------------------------------------------------- /02_selenium/01_pchome_crawling_item.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 觀察 http://24h.pchome.com.tw/region/DHBE 並撰寫爬蟲程式\n", 10 | "- 判斷是否為 JavaScript rendered website\n", 11 | "- 設定 Implicit Wait\n", 12 | "- 透過 XPath 定位\n", 13 | "- 抓取商品的名稱與價格" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import os\n", 25 | "\n", 26 | "from selenium import webdriver\n", 27 | "from selenium.webdriver.common.by import By\n", 28 | "\n", 29 | "url = 'http://24h.pchome.com.tw/region/DHBE'" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "ASUS B9440UA-0251A7200U (i5-7200/8G/256G SSD/W10P) - 32900\n", 42 | "(商)Lenovo ThinkPad T470 20HDA00STW(i5-7200U/940MX-2G/1TB/W10P) - 39900\n", 43 | "(商)HP X360 1030 G2(i7-7600U/8G*2/512GB SSD/FHD/W10Pro) - 68900\n", 44 | "(商)HP 240 G6 (i5-7200U/14/4G/500GB/Win10) - 18900\n", 45 | "DELL M7520(i7-7820HQ/Nvidia Quadro M2200M-4G/1TB+256GB/W10P/FHD)繪圖工作站筆電 - 89990\n", 46 | "ASUS P2430UJ-0321A6200U (i5-6200U/4G/500G/920M-2G/Win10) - 22900\n", 47 | "(商)Lenovo ThinkPad X1c 20HRA010TW(i7-7500U/256G SSD/W10P/FHD) - 59900\n", 48 | "(商) HP Probook 430 G4(i5-7200U/4G DDR4/500GB/Win10) - 25900\n", 49 | "(商)HP 240 G6 (i5-7200U/14/4G/500GB/Win10pro) - 23900\n", 50 | "DELL M5520(i7-7820HQ/M1200M/512GB SSD/Win10 Pro/UHD)繪圖工作站筆電 - 98990\n", 51 | "ASUS P2530UJ-0461A6200U (i5-6200U/8G/1TB/GeForce 920M-2G/W10P) - 23900\n", 52 | "(商)Lenovo ThinkPad T470s 20HFA00ETW(i7-7600U/512G SSD/W10P) - 63900\n", 53 | "(商)HP Probook 650 G3(i7-7600U/512GB SSD/AMD Radeon R7 M465 2GB/W10P) - 50900\n", 54 | "(商)HP 240 G6 (i3-6006U/14/UMA/500G/W10DW7) - 17900\n", 55 | "ASUS B9440UA-0251A7200U (i5-7200/8G/256G SSD/W10P) - 27900\n", 56 | "ASUS P2530UJ-0271A6500U (i7-6500U/8G/1TB/920M-2G/FHD/Win10P) - 32900\n", 57 | "(商)Lenovo ThinkPad Edge15 E570 20H5A037TW(i7-7500U/GTX 950M-2G/1TB/W10/FHD) - 34900\n", 58 | "ACER TravelMate TMP238-M-77JQ (i7-6500U/8GB/256GB SSD/W7P+W10P) - 33800\n", 59 | "ACER TravelMate TMX349-G2-M-53L8.(i5-7200U/8GB/256GB SSD/W10P) - 30500\n", 60 | "DELL M7520(i7-7820HQ/Nvidia Quadro M1200M-4G/1TB/W10P/FHD)繪圖工作站筆電 - 76900\n", 61 | "ASUS A550V-0203J6700HQ (i7-6700HQ/500G/GTX950M 2G獨顯/W10P) - 28900\n", 62 | "(商)Lenovo ThinkPad X260 20F6A07QTW(i5-6200U/1TB/W10P) - 38900\n", 63 | "ACER TravelMate TMP249-M-C1DV.(CM3855U/4GB DDR4/500GB/W10P) - 14990\n", 64 | "ACER TravelMate TMP249-M-3142.(i3-6100U/4GB/500GB/W7P+W10P) - 17990\n", 65 | "DELL M7510(i7-6820HQ/Nvidia Quadro M1000M-2G/1TB/W7P/FHD)繪圖工作站筆電 - 69900\n", 66 | "ASUS B8230UA-0061A6500U (i7-6500U/512G SSD/W7P) - 44900\n", 67 | "(商)Lenovo ThinkPad X270 20HNA00RTW (i5-7200U/1TB/W10P) - 41900\n", 68 | "ACER TravelMate TMP446-M-54S0.(i5-5200U/4GB/500G/W7P+W10P) - 19900\n", 69 | "ACER TravelMate TMP259-M-5726(i5-6200U/4GB/128GB SSD/W7P+W10P) - 26900\n", 70 | "DELL Vostro 14 5000 (i5-7200U/4G/1TB/940MX-2G/W10/HD/Jingle Gold) - 23999\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "try:\n", 76 | " driver = webdriver.Chrome()\n", 77 | " driver.get(url)\n", 78 | " driver.maximize_window()\n", 79 | " driver.implicitly_wait(10)\n", 80 | " \n", 81 | " # get web elements\n", 82 | " items = driver.find_elements(By.XPATH, '//dl[@id=\"Block12Container50\"]/dd')\n", 83 | " \n", 
84 | " for item in items:\n", 85 | " title = item.find_element(By.XPATH, './div/h5/a').text\n", 86 | " price = item.find_element(By.XPATH, './div/ul/li/span/span').text\n", 87 | " if title and price:\n", 88 | " print('{} - {}'.format(title, price))\n", 89 | "\n", 90 | "except Exception as e:\n", 91 | " print(e)\n", 92 | "finally:\n", 93 | " driver.quit()" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.5.2" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /02_selenium/02_selenium_google_search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 模擬 google search 流程\n", 10 | "- https://www.google.com.tw/\n", 11 | "- 搜尋「人工智慧」\n", 12 | "- 紀錄前兩頁搜尋結果的連結" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import os\n", 22 | "\n", 23 | "from selenium import webdriver\n", 24 | "from selenium.webdriver.common.keys import Keys\n", 25 | "from selenium.webdriver.common.by import By\n", 26 | "from pprint import pprint\n", 27 | "\n", 28 | "url = 'https://google.com'" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "======================================================================================= Page 0\n", 41 | "title: 人工智能- 维基百科,自由的百科全书\n", 42 | "url: https://zh.wikipedia.org/zh-tw/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD\n", 43 | "---\n", 44 | "title: 人工智慧三大關鍵技術 - 數位時代\n", 45 | "url: https://www.bnext.com.tw/article/41534/3-key-techniques-of-ai\n", 46 | "---\n", 47 | "title: AI 人工智慧| TechNews 科技新報\n", 48 | "url: https://technews.tw/category/cutting-edge/ai/\n", 49 | "---\n", 50 | "title: 人工智慧不可能超越人類,原因居然是這樣的……(上) |智慧機器人網 ...\n", 51 | "url: https://www.limitlessiq.com/news/post/view/id/3596/\n", 52 | "---\n", 53 | "title: 人工智慧AI – CASE報科學 - 國立臺灣大學科學教育發展中心\n", 54 | "url: https://case.ntu.edu.tw/blog/?cat=3772\n", 55 | "---\n", 56 | "title: AI人工智慧來了! 
你的未來在哪裡?|深度專題|天下雜誌\n", 57 | "url: https://www.cw.com.tw/special/2073\n", 58 | "---\n", 59 | "title: 台灣人工智慧學校| Taiwan AI Academy\n", 60 | "url: http://aiacademy.tw/\n", 61 | "---\n", 62 | "title: 人工智慧- MBA智库百科\n", 63 | "url: https://wiki.mbalib.com/zh-tw/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD\n", 64 | "---\n", 65 | "======================================================================================= Page 1\n", 66 | "title: 什麼是人工智慧?︱《三分鐘財經教室》#01 - YouTube\n", 67 | "url: https://www.youtube.com/watch?v=nKcsu4JierI\n", 68 | "---\n", 69 | "title: 『AI人工智慧!機器學習& 突如其來的危機』芬特克FinTech EP3 - YouTube\n", 70 | "url: https://www.youtube.com/watch?v=i0UxYDqlX6o\n", 71 | "---\n", 72 | "title: 人工智慧:搜尋方法與邏輯推論(Artificial Intelligence - Search & Logic ...\n", 73 | "url: https://www.coursera.org/learn/rengong-zhineng\n", 74 | "---\n", 75 | "title: 博客來-中文書>電腦資訊>概論/科技趨勢>人工智慧/機器學習\n", 76 | "url: https://www.books.com.tw/web/sys_bbotm/books/190102\n", 77 | "---\n", 78 | "title: 人工智能| 大紀元\n", 79 | "url: http://www.epochtimes.com/b5/tag/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD.html\n", 80 | "---\n", 81 | "title: 如何對面AI時代的孩子?人工智慧博士教年輕父母強化個人實力- Yahoo ...\n", 82 | "url: https://tw.news.yahoo.com/%E5%A6%82%E4%BD%95%E5%B0%8D%E9%9D%A2ai%E6%99%82%E4%BB%A3%E7%9A%84%E5%AD%A9%E5%AD%90-%E4%BA%BA%E5%B7%A5%E6%99%BA%E6%85%A7%E5%8D%9A%E5%A3%AB%E6%95%99%E5%B9%B4%E8%BC%95%E7%88%B6%E6%AF%8D%E5%BC%B7%E5%8C%96%E5%80%8B%E4%BA%BA%E5%AF%A6%E5%8A%9B-010012301.html\n", 83 | "---\n", 84 | "title: 人工智慧對勞動就業的影響- STPI Research Portal - 科技政策觀點\n", 85 | "url: https://portal.stpi.narl.org.tw/index/article/10401\n", 86 | "---\n", 87 | "title: AI人工智慧時代來臨- 中時電子報\n", 88 | "url: https://www.chinatimes.com/newspapers/20180907000541-260204\n", 89 | "---\n", 90 | "title: 人工智慧應用新趨勢與展望—學生與機器人共同學習-臺北產經資訊網\n", 91 | "url: https://www.taipeiecon.taipei/article_cont.aspx?MmmID=1201&MSid=1001302007727155764\n", 92 | "---\n", 93 | "title: 人工智慧技術的下一波研發核心 - Digitimes\n", 94 | "url: https://www.digitimes.com.tw/col/article.asp?id=944\n", 95 | "---\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "try:\n", 101 | " driver = webdriver.Chrome('/home/afun/Downloads/chromedriver')\n", 102 | " driver.get(url)\n", 103 | " driver.maximize_window()\n", 104 | " driver.implicitly_wait(10)\n", 105 | " \n", 106 | " search_input = driver.find_element(By.ID, 'lst-ib')\n", 107 | " search_input.send_keys(u'人工智慧')\n", 108 | " search_input.send_keys(Keys.ENTER)\n", 109 | " \n", 110 | " for i in range(2):\n", 111 | " print('='*87, 'Page {}'.format(i))\n", 112 | "\n", 113 | " links = driver.find_elements(By.XPATH, '//div[@class=\"r\"]/a[@href]')\n", 114 | "\n", 115 | " for link in links:\n", 116 | " page_title = link.find_element(By.TAG_NAME, 'h3').text\n", 117 | " page_url = ''\n", 118 | "\n", 119 | " if link.get_attribute('href'):\n", 120 | " page_url = link.get_attribute('href')\n", 121 | "\n", 122 | " print('title: {}\\nurl: {}\\n---'.format(page_title, page_url))\n", 123 | "\n", 124 | " next_page = driver.find_element(By.XPATH, '//*[@id=\"pnnext\"]/span[2]').click()\n", 125 | "\n", 126 | "except Exception as e:\n", 127 | " print(e)\n", 128 | "finally:\n", 129 | " driver.quit()" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": 
"ipython3", 149 | "version": "3.6.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 2 154 | } 155 | -------------------------------------------------------------------------------- /02_selenium/03_crawling_reCAPTCHA_image.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- https://www.google.com/recaptcha/demo/recaptcha\n", 10 | "- 透過 google reCAPTCHA demo 生成圖片\n", 11 | "- 將 reCAPTCHA 的圖片抓下來" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import os\n", 23 | "import hashlib\n", 24 | "import requests\n", 25 | "import time\n", 26 | "\n", 27 | "from selenium import webdriver\n", 28 | "from selenium.webdriver.common.keys import Keys\n", 29 | "from selenium.webdriver.common.by import By\n", 30 | "\n", 31 | "from fake_useragent import UserAgent\n", 32 | "from PIL import Image\n", 33 | "\n", 34 | "url = 'https://www.google.com/recaptcha/demo/recaptcha'\n", 35 | "fu = UserAgent()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Save img - /home/dirl/github/Python-Crawling-Tutorial/results/ceecac6a5a9677750a69c80a87f26080.JPEG\n", 48 | "Save img - /home/dirl/github/Python-Crawling-Tutorial/results/ceecac6a5a9677750a69c80a87f26080.JPEG\n", 49 | "Save img - /home/dirl/github/Python-Crawling-Tutorial/results/4cce70c2cdde67af52e27920693da213.JPEG\n", 50 | "Save img - /home/dirl/github/Python-Crawling-Tutorial/results/1682c3490f1ec9df1da4a43407f890b7.JPEG\n", 51 | "Save img - /home/dirl/github/Python-Crawling-Tutorial/results/aa6a4d1bfa181fc53636a341562fb2ea.PNG\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "results = os.path.abspath('../results')\n", 57 | "if not os.path.exists(results):\n", 58 | " os.makedirs(results)\n", 59 | "\n", 60 | "try:\n", 61 | " driver = webdriver.Chrome()\n", 62 | " driver.get(url)\n", 63 | " driver.maximize_window()\n", 64 | " driver.implicitly_wait(10)\n", 65 | " compare_url = ''\n", 66 | " \n", 67 | " for i in range(5):\n", 68 | " # get image\n", 69 | " img_el = driver.find_element(By.XPATH, '//div[@id=\"recaptcha_image\"]/img')\n", 70 | " img_url = img_el.get_attribute('src')\n", 71 | " img_filename = hashlib.md5(img_url.encode('utf-8')).hexdigest()\n", 72 | " compare_url = img_url\n", 73 | "\n", 74 | " headers = {'User-Agent': fu.random}\n", 75 | " img_resp = requests.get(img_url, stream=True, headers=headers)\n", 76 | " img = Image.open(img_resp.raw)\n", 77 | " img_filename = '{}.{}'.format(img_filename, img.format)\n", 78 | " img_filename = os.path.join(results, img_filename)\n", 79 | " img.save(img_filename)\n", 80 | " print('Save img - {}'.format(img_filename))\n", 81 | " \n", 82 | " # re-generate image\n", 83 | " btn_refresh = driver.find_element(By.XPATH, '//*[@id=\"recaptcha_reload_btn\"]').click()\n", 84 | " time.sleep(2)\n", 85 | " \n", 86 | "\n", 87 | "except Exception as e:\n", 88 | " print(e)\n", 89 | "finally:\n", 90 | " driver.quit()" 91 | ] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 
| "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.5.2" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /03_graph_api/00_facebook_crawling_article_comments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 取得 FB 文章底下所有留言\n", 10 | "- 使用 [Graph API](https://developers.facebook.com/tools/explorer/)\n", 11 | "- https://www.facebook.com/DoctorKoWJ/videos/1213927345375910/\n", 12 | "- 輸出成 CSV" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import requests\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from datetime import datetime" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# 透過 Graph API 觀察文章 ID 與 token\n", 39 | "article_id = '1213927345375910'\n", 40 | "token = ''" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "pages 1\n", 53 | "pages 2\n", 54 | "pages 3\n", 55 | "pages 4\n", 56 | "EOF\n", 57 | "comment length = 431\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "comments = []\n", 63 | "pages = 0\n", 64 | "\n", 65 | "url = 'https://graph.facebook.com/v2.11/{}/comments?pretty=0&limit={}&access_token={}'.format(\n", 66 | " article_id, 100, token\n", 67 | ")\n", 68 | "\n", 69 | "while True:\n", 70 | " pages += 1\n", 71 | " resp = requests.get(url)\n", 72 | " data = resp.json()\n", 73 | " comments += data['data']\n", 74 | " \n", 75 | " if 'next' not in data['paging']:\n", 76 | " print('EOF')\n", 77 | " break\n", 78 | " else:\n", 79 | " url = data['paging']['next']\n", 80 | " print('pages {}'.format(pages))\n", 81 | " \n", 82 | "print('comment length = {}'.format(len(comments)))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
created_time | from | id | message
0 | 2018-01-09T11:02:42+0000 | NaN | 1213927345375910_1213982232037088 | 市長,謝謝您注意到這個議題。但是,不知道您是否同時有發現,比起醫療環境,更加威脅台灣幼兒的,...
1 | 2018-01-09T11:07:44+0000 | NaN | 1213927345375910_1213985318703446 | 我希望如果有天你有能力了,可以為被虐的兒童提出修法保護,更另闢一個無力撫養孩子的人一個出口,...
2 | 2018-01-09T11:21:33+0000 | NaN | 1213927345375910_1213993592035952 | 我也是重症兒童家屬\n感謝你的發言\n我第一次看到有政治人物願意大聲疾呼\n但不是說沒有其他...
3 | 2018-01-09T09:34:35+0000 | NaN | 1213927345375910_1213934828708495 | 每次看到你就覺得台灣還有希望\n不在乎選票在乎的是人
4 | 2018-01-09T11:28:25+0000 | NaN | 1213927345375910_1213997665368878 | 每當我覺得天下的烏鴉一般黑的時候 看到你的發文 又讓我覺得繼續奮鬥 台灣會被照亮的 柯文哲 ...
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " created_time from id \\\n", 159 | "0 2018-01-09T11:02:42+0000 NaN 1213927345375910_1213982232037088 \n", 160 | "1 2018-01-09T11:07:44+0000 NaN 1213927345375910_1213985318703446 \n", 161 | "2 2018-01-09T11:21:33+0000 NaN 1213927345375910_1213993592035952 \n", 162 | "3 2018-01-09T09:34:35+0000 NaN 1213927345375910_1213934828708495 \n", 163 | "4 2018-01-09T11:28:25+0000 NaN 1213927345375910_1213997665368878 \n", 164 | "\n", 165 | " message \n", 166 | "0 市長,謝謝您注意到這個議題。但是,不知道您是否同時有發現,比起醫療環境,更加威脅台灣幼兒的,... \n", 167 | "1 我希望如果有天你有能力了,可以為被虐的兒童提出修法保護,更另闢一個無力撫養孩子的人一個出口,... \n", 168 | "2 我也是重症兒童家屬\\n感謝你的發言\\n我第一次看到有政治人物願意大聲疾呼\\n但不是說沒有其他... \n", 169 | "3 每次看到你就覺得台灣還有希望\\n不在乎選票在乎的是人 \n", 170 | "4 每當我覺得天下的烏鴉一般黑的時候 看到你的發文 又讓我覺得繼續奮鬥 台灣會被照亮的 柯文哲 ... " 171 | ] 172 | }, 173 | "execution_count": 4, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "df = pd.DataFrame.from_records(comments)\n", 180 | "df.head()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 5, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "Save file - /home/dirl/github/Python-Crawling-Tutorial/results/1213927345375910.csv\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "results = os.path.abspath('../results')\n", 198 | "if not os.path.exists(results):\n", 199 | " os.makedirs(results)\n", 200 | "\n", 201 | "filename = os.path.join(results, '{}.csv'.format(article_id))\n", 202 | "df.to_csv(filename, index=False)\n", 203 | "print('Save file - {}'.format(filename))" 204 | ] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "Python 3", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.5.2" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 2 228 | } 229 | -------------------------------------------------------------------------------- /03_graph_api/01_facebook_crawling_fanpage_likes_shares.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 取得 FB 粉絲團中所有文章的按讚與分享數\n", 10 | "- 使用 [Graph API](https://developers.facebook.com/tools/explorer/)\n", 11 | "- https://www.facebook.com/DoctorKoWJ\n", 12 | "- 輸出成 CSV" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import requests\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from datetime import datetime" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# 透過 Graph API 觀察文章 ID 與 token\n", 39 | "fanpage_id = '136845026417486'\n", 40 | "token = ''" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "page 1\n", 53 | "page 2\n", 54 | "page 3\n", 55 | "page 4\n", 56 | "page 5\n", 57 | "page 6\n", 58 | "page 7\n", 59 | "page 
8\n", 60 | "page 9\n", 61 | "page 10\n", 62 | "page 11\n", 63 | "page 12\n", 64 | "page 13\n", 65 | "page 14\n", 66 | "page 15\n", 67 | "page 16\n", 68 | "page 17\n", 69 | "page 18\n", 70 | "page 19\n", 71 | "page 20\n", 72 | "page 21\n", 73 | "page 22\n", 74 | "page 23\n", 75 | "page 24\n", 76 | "page 25\n", 77 | "page 26\n", 78 | "page 27\n", 79 | "page 28\n", 80 | "page 29\n", 81 | "page 30\n", 82 | "page 31\n", 83 | "page 32\n", 84 | "page 33\n", 85 | "page 34\n", 86 | "page 35\n", 87 | "page 36\n", 88 | "page 37\n", 89 | "page 38\n", 90 | "page 39\n", 91 | "page 40\n", 92 | "page 41\n", 93 | "page 42\n", 94 | "page 43\n", 95 | "page 44\n", 96 | "page 45\n", 97 | "page 46\n", 98 | "page 47\n", 99 | "page 48\n", 100 | "page 49\n", 101 | "page 50\n", 102 | "page 51\n", 103 | "page 52\n", 104 | "page 53\n", 105 | "page 54\n", 106 | "page 55\n", 107 | "page 56\n", 108 | "page 57\n", 109 | "EOF\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "url = 'https://graph.facebook.com/v2.11/{}/posts?fields={}&access_token={}'.format(\n", 115 | " fanpage_id, 'id,created_time,name,likes.limit(0).summary(true),shares,message', token\n", 116 | ")\n", 117 | "\n", 118 | "posts = []\n", 119 | "pages = 0\n", 120 | "\n", 121 | "while True:\n", 122 | " resp = requests.get(url)\n", 123 | " data = resp.json()\n", 124 | " posts += data['data']\n", 125 | " pages += 1\n", 126 | " \n", 127 | " if 'next' not in data['paging']:\n", 128 | " print('EOF')\n", 129 | " break\n", 130 | " \n", 131 | " else:\n", 132 | " url = data['paging']['next']\n", 133 | " print('page {}'.format(pages))" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 4, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/html": [ 144 | "
\n", 145 | "\n", 158 | "\n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | "
created_time | id | message | name | total_likes | total_shares
0 | 2018-01-29T10:07:27+0000 | 136845026417486_1230167763751868 | 來荷蘭烏特勒支市走一走,看看他們如何創造一個友善的自行車通行環境。\n\n---\nPart... | 直播|考察荷蘭自行車設施(Part 2) | 9022 | 131.0
1 | 2018-01-29T09:40:44+0000 | 136845026417486_1230143707087607 | 來荷蘭烏特勒支市走一走,看看他們如何創造一座全世界最大的自行車停車場。\n\n---\nPa... | 直播|考察荷蘭自行車設施(Part 1) | 10470 | 181.0
2 | 2018-01-28T03:30:00+0000 | 136845026417486_1228976073871037 | 每次出訪,都是一次難得的學習機會,這一趟歐洲行也不例外。\n\n荷蘭一直是我想去好好研究的地... | Timeline Photos | 38594 | 400.0
3 | 2018-01-27T13:15:49+0000 | 136845026417486_1228569593911685 | 很多人都聽過「順手捐發票,救救老殘窮」,也看過在路上推著烤爐賣烤地瓜的「地瓜媽媽」,這些都是... | NaN | 12317 | 192.0
4 | 2018-01-26T09:29:05+0000 | 136845026417486_1227573790677932 | 政治就是落實在人民的每一天生活之中,讓人民有好的居住環境,應當是中央和地方一致認同的進步價值... | Photos from 柯文哲's post | 9145 | 86.0
\n", 218 | "
" 219 | ], 220 | "text/plain": [ 221 | " created_time id \\\n", 222 | "0 2018-01-29T10:07:27+0000 136845026417486_1230167763751868 \n", 223 | "1 2018-01-29T09:40:44+0000 136845026417486_1230143707087607 \n", 224 | "2 2018-01-28T03:30:00+0000 136845026417486_1228976073871037 \n", 225 | "3 2018-01-27T13:15:49+0000 136845026417486_1228569593911685 \n", 226 | "4 2018-01-26T09:29:05+0000 136845026417486_1227573790677932 \n", 227 | "\n", 228 | " message name \\\n", 229 | "0 來荷蘭烏特勒支市走一走,看看他們如何創造一個友善的自行車通行環境。\\n\\n---\\nPart... 直播|考察荷蘭自行車設施(Part 2) \n", 230 | "1 來荷蘭烏特勒支市走一走,看看他們如何創造一座全世界最大的自行車停車場。\\n\\n---\\nPa... 直播|考察荷蘭自行車設施(Part 1) \n", 231 | "2 每次出訪,都是一次難得的學習機會,這一趟歐洲行也不例外。\\n\\n荷蘭一直是我想去好好研究的地... Timeline Photos \n", 232 | "3 很多人都聽過「順手捐發票,救救老殘窮」,也看過在路上推著烤爐賣烤地瓜的「地瓜媽媽」,這些都是... NaN \n", 233 | "4 政治就是落實在人民的每一天生活之中,讓人民有好的居住環境,應當是中央和地方一致認同的進步價值... Photos from 柯文哲's post \n", 234 | "\n", 235 | " total_likes total_shares \n", 236 | "0 9022 131.0 \n", 237 | "1 10470 181.0 \n", 238 | "2 38594 400.0 \n", 239 | "3 12317 192.0 \n", 240 | "4 9145 86.0 " 241 | ] 242 | }, 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "posts_summary = []\n", 250 | "for post in posts:\n", 251 | " p = {}\n", 252 | " for k, v in post.items():\n", 253 | " if k == 'likes' and 'summary' in v and 'total_count' in v['summary']:\n", 254 | " p['total_likes'] = v['summary']['total_count']\n", 255 | " elif k == 'shares' and 'count' in v:\n", 256 | " p['total_shares'] = v['count']\n", 257 | " else:\n", 258 | " p[k] = v\n", 259 | " posts_summary.append(p)\n", 260 | "\n", 261 | "df = pd.DataFrame.from_records(posts_summary)\n", 262 | "df.head()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 5, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "Save file - /home/dirl/github/Python-Crawling-Tutorial/results/fanpage_136845026417486.csv\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "results = os.path.abspath('../results')\n", 280 | "if not os.path.exists(results):\n", 281 | " os.makedirs(results)\n", 282 | " \n", 283 | "filename = os.path.join(results, 'fanpage_{}.csv'.format(fanpage_id))\n", 284 | "df.to_csv(filename, index=False)\n", 285 | "print('Save file - {}'.format(filename))" 286 | ] 287 | } 288 | ], 289 | "metadata": { 290 | "kernelspec": { 291 | "display_name": "Python 3", 292 | "language": "python", 293 | "name": "python3" 294 | }, 295 | "language_info": { 296 | "codemirror_mode": { 297 | "name": "ipython", 298 | "version": 3 299 | }, 300 | "file_extension": ".py", 301 | "mimetype": "text/x-python", 302 | "name": "python", 303 | "nbconvert_exporter": "python", 304 | "pygments_lexer": "ipython3", 305 | "version": "3.5.2" 306 | } 307 | }, 308 | "nbformat": 4, 309 | "nbformat_minor": 2 310 | } 311 | -------------------------------------------------------------------------------- /03_graph_api/02_facebook_crawling_article_all.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 練習\n", 8 | "\n", 9 | "- 取得 FB 文章底下所有留言, 附檔連結, application\n", 10 | "- 使用 [Graph API](https://developers.facebook.com/tools/explorer/)\n", 11 | "- https://www.facebook.com/appledaily.tw/posts/10156769966527069\n", 12 | "- 輸出成 CSV" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": 
true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import requests\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from datetime import datetime" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# 透過 Graph API 觀察文章 ID 與 token\n", 39 | "article_id = '232633627068_10156769966527069'\n", 40 | "token = ''" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "pages 1\n", 53 | "pages 2\n", 54 | "pages 3\n", 55 | "pages 4\n", 56 | "pages 5\n", 57 | "pages 6\n", 58 | "pages 7\n", 59 | "comments length = 63\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "comments = []\n", 65 | "pages = 0\n", 66 | "\n", 67 | "\"\"\"\n", 68 | "nested query + 游標型分頁\n", 69 | "%7B => {\n", 70 | "%7D => }\n", 71 | "%2C => ,\n", 72 | "reference: https://www.w3schools.com/tags/ref_urlencode.asp\n", 73 | "\"\"\"\n", 74 | "\n", 75 | "base_url = 'https://graph.facebook.com/v2.11/{}'.format(article_id)\n", 76 | "query = '?fields=comments.limit({})%7Battachment%2Capplication%2Cmessage.limit({})%7D&access_token={}'.format(\n", 77 | " 10, 100, token\n", 78 | ")\n", 79 | "url = '{}/{}'.format(base_url, query)\n", 80 | "\n", 81 | "while True:\n", 82 | " pages += 1\n", 83 | " resp = requests.get(url)\n", 84 | " data = resp.json()\n", 85 | " if 'comments' not in data:\n", 86 | " break\n", 87 | "\n", 88 | " comments += data['comments']['data']\n", 89 | " \n", 90 | " if 'after' not in data['comments']['paging']['cursors']:\n", 91 | " print('EOF')\n", 92 | " break\n", 93 | " else:\n", 94 | " cursors_after = data['comments']['paging']['cursors']['after']\n", 95 | " query = '?fields=comments.limit({}).after({})%7Battachment%2Capplication%2Cmessage.limit({})%7D&access_token={}'.format(\n", 96 | " 10, cursors_after, 100, token\n", 97 | " )\n", 98 | " url = '{}/{}'.format(base_url, query)\n", 99 | " print('pages {}'.format(pages))\n", 100 | "\n", 101 | "print('comments length = {}'.format(len(comments)))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/html": [ 112 | "
\n", 113 | "\n", 126 | "\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | "
application_category | application_id | application_link | application_name | application_namespace | attachment_type | attachment_url | id | message
0 | Utilities | 350685531728 | /android | Facebook for Android | fbandroid | NaN | NaN | 10156769966527069_10156771068602069 | 又要連PO好幾天\n一天好幾篇\nPO到有人反感\n留言開始有人吵架鬥嘴\n最後一面倒開始噴這遊戲
1 | Utilities | 6628568379 | /iphone | Facebook for iPhone | fbiphone | photo | https://www.facebook.com/photo.php?fbid=164774... | 10156769966527069_10156771204372069 | 水溝是怎樣
2 | Utilities | 350685531728 | /android | Facebook for Android | fbandroid | photo | https://www.facebook.com/photo.php?fbid=201326... | 10156769966527069_10156771212477069 | 我的🐸兒子好久才回家本來很生氣(找不到罵兒子的選項XD\n\n結果看到他帶回來的名產\n以及...
3 | Utilities | 350685531728 | /android | Facebook for Android | fbandroid | NaN | NaN | 10156769966527069_10156771109777069 | 重複報導是不會膩喔
4 | Utilities | 350685531728 | /android | Facebook for Android | fbandroid | photo | https://www.facebook.com/photo.php?fbid=537644... | 10156769966527069_10156771833147069 | 我家的青蛙在我肚子裡跟我一起去旅行了
\n", 204 | "
" 205 | ], 206 | "text/plain": [ 207 | " application_category application_id application_link application_name \\\n", 208 | "0 Utilities 350685531728 /android Facebook for Android \n", 209 | "1 Utilities 6628568379 /iphone Facebook for iPhone \n", 210 | "2 Utilities 350685531728 /android Facebook for Android \n", 211 | "3 Utilities 350685531728 /android Facebook for Android \n", 212 | "4 Utilities 350685531728 /android Facebook for Android \n", 213 | "\n", 214 | " application_namespace attachment_type \\\n", 215 | "0 fbandroid NaN \n", 216 | "1 fbiphone photo \n", 217 | "2 fbandroid photo \n", 218 | "3 fbandroid NaN \n", 219 | "4 fbandroid photo \n", 220 | "\n", 221 | " attachment_url \\\n", 222 | "0 NaN \n", 223 | "1 https://www.facebook.com/photo.php?fbid=164774... \n", 224 | "2 https://www.facebook.com/photo.php?fbid=201326... \n", 225 | "3 NaN \n", 226 | "4 https://www.facebook.com/photo.php?fbid=537644... \n", 227 | "\n", 228 | " id \\\n", 229 | "0 10156769966527069_10156771068602069 \n", 230 | "1 10156769966527069_10156771204372069 \n", 231 | "2 10156769966527069_10156771212477069 \n", 232 | "3 10156769966527069_10156771109777069 \n", 233 | "4 10156769966527069_10156771833147069 \n", 234 | "\n", 235 | " message \n", 236 | "0 又要連PO好幾天\\n一天好幾篇\\nPO到有人反感\\n留言開始有人吵架鬥嘴\\n最後一面倒開始噴這遊戲 \n", 237 | "1 水溝是怎樣 \n", 238 | "2 我的🐸兒子好久才回家本來很生氣(找不到罵兒子的選項XD\\n\\n結果看到他帶回來的名產\\n以及... \n", 239 | "3 重複報導是不會膩喔 \n", 240 | "4 我家的青蛙在我肚子裡跟我一起去旅行了 " 241 | ] 242 | }, 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "for comment in comments:\n", 250 | " application, attachment, message = '', '', ''\n", 251 | " if 'application' in comment:\n", 252 | " app = {'application_{}'.format(k):v for k, v in comment['application'].items()}\n", 253 | " comment.update(app)\n", 254 | " del comment['application']\n", 255 | " if 'attachment' in comment:\n", 256 | " att = {\n", 257 | " 'attachment_type': comment['attachment']['type'],\n", 258 | " 'attachment_url': comment['attachment']['url']\n", 259 | " }\n", 260 | " comment.update(att)\n", 261 | " del comment['attachment']\n", 262 | "\n", 263 | "df = pd.DataFrame.from_records(comments)\n", 264 | "df.head()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 5, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "Save file - /home/dirl/github/Python-Crawling-Tutorial/results/232633627068_10156769966527069.csv\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "results = os.path.abspath('../results')\n", 282 | "if not os.path.exists(results):\n", 283 | " os.makedirs(results)\n", 284 | "\n", 285 | "filename = os.path.join(results, '{}.csv'.format(article_id))\n", 286 | "df.to_csv(filename, index=False)\n", 287 | "print('Save file - {}'.format(filename))" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.5.2" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | requests = "*" 8 | lxml = "*" 9 | jupyter = "*" 10 | "beautifulsoup4" = "*" 11 | browsercookie = "*" 12 | pandas = "*" 13 | fake-useragent = "*" 14 | pillow = "*" 15 | tldextract = "*" 16 | selenium = "*" 17 | 18 | [dev-packages] 19 | 20 | [requires] 21 | python_version = "3.6" 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-Crawling-Tutorial 基礎爬蟲實戰 2 | 3 | ## 相關資源 4 | 5 | 最新的投影片放在 [slideshare](https://www.slideshare.net/ChenMingYang/python-crawling-tutorial-87165481) 上, 會不定期更新, 程式碼可透過這個頁面右邊的 **Clone or download** 下載 6 | ![demo](https://user-images.githubusercontent.com/4820492/35319787-585ea0c4-011c-11e8-802a-02ae0dbc4044.png) 7 | 8 | > 2017 年以前的投影片教材放在 [release](https://github.com/afunTW/Python-Crawling-Tutorial/releases), 但是部份實戰練習網站會失效 9 | > 或是可透過 [link](https://goo.gl/CFR95x) 下載投影片 10 | 11 | ## 安裝環境 12 | 13 | ### Anaconda (建議) 14 | 15 | - 下載 Python 3.6 版本 https://www.continuum.io/downloads 16 | - 練習題會使用到瀏覽器 Chrome,麻煩各位選擇自己電腦的平台安裝 [Chrome](https://www.google.com.tw/chrome/browser/desktop/index.html) 17 | - 動態網站的爬蟲也需要下載 webdriver,需要額外下載 18 | - [Chrome](https://sites.google.com/a/chromium.org/chromedriver/downloads) 19 | - [Firefox](https://github.com/mozilla/geckodriver/releases) 20 | - 題目都是以 `jupyter notebook` 進行,安裝完 Anaconda 後即可用內建 `jupyter notebook` 打開 `.ipynb` 檔 21 | - 建議安裝 Anaconda,如有安裝 Anaconda 只需安裝以下套件 22 | 23 | ```sh 24 | $ pip install selenium tldextract Pillow 25 | ``` 26 | 27 | ### pip 28 | 29 | pip 是 Python 的套件管理系統,在部份系統裏面會用 `pip3` 代表 Python3 的版本,請各位依照自己的系統安裝 pip3 後,安裝以下 Python3 版本的套件 30 | 31 | ```sh 32 | # 視情況而定, 使用 pip 或是 pip3 33 | $ pip install requests beautifulsoup4 lxml Pillow selenium tldextract 34 | ``` 35 | 36 | #### Optional: 資料分析 37 | 38 | 沒有練習題但會有範例 code 可以執行,可自行選擇是否安裝 (如果安裝 wordcloud 時有問題,可能是沒有下載 visual studio,可以從 warining 中提供的網址下載安裝) 39 | 40 | ```sh 41 | # Anaconda 42 | $ pip install jieba wordcloud 43 | 44 | # pip 45 | $ pip3 install numpy pandas matplotlib scipy scikit-learn jieba 
wordcloud 46 | ``` 47 | 48 | ## 請遵守別人的規則 49 | 50 | 有些網站會在目錄底下加上 robots.txt, 基本上這就是對方定義的爬蟲規則,請大家在練習爬蟲的時候要尊重對方的規則 51 | 52 | > robots.txt 詳細的語法與用途請參考 [wiki](https://zh.wikipedia.org/zh-tw/Robots.txt) 與 [google 文件](https://support.google.com/webmasters/answer/6062608?hl=zh-Hant) 53 | 54 | --- 55 | 56 | ## Q&A 57 | 58 | **Q: 有哪些常用的 API** 59 | 60 | 課堂中有說到,爬蟲只是一種得到資料的手段,如果對方有提供 API 就可以直接使用 API, 61 | API 通常對方都會幫你整理好資料格式,或是根據權限決定你可以獲取的資料內容 62 | 63 | - [Facebook Graph API](https://developers.facebook.com/tools/explorer/) 64 | - [Youtube](https://www.youtube.com/yt/dev/zh-TW/api-resources.html) 65 | - [Yahoo YQL](https://developer.yahoo.com/yql/) 66 | - [Instagram](https://www.instagram.com/developer/) 67 | - [KKTIX](http://support.kktix.com/knowledgebase/articles/558918-%E6%B4%BB%E5%8B%95%E8%B3%87%E8%A8%8A-api) 68 | - [Google Maps API](https://developers.google.com/maps/?hl=zh-tw) 69 | - [Taipei Open Data API](http://data.taipei/opendata/developer) 70 | - [Imgur API](https://api.imgur.com/endpoints) 71 | -------------------------------------------------------------------------------- /appendix_ptt/00_parse_article.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 爬取單一文章資訊\n", 8 | "\n", 9 | "1. 你有可能會遇到「是否滿18歲」的詢問頁面\n", 10 | "2. 解析 ptt.cc/bbs 裏面文章的結構\n", 11 | "3. 爬取文章\n", 12 | "4. 爬取留言\n", 13 | "\n", 14 | "URL https://www.ptt.cc/bbs/Gossiping/M.1537847530.A.E12.html\n", 15 | "\n", 16 | "BACKUP https://afuntw.github.io/Test-Crawling-Website/pages/ptt/M.1537847530.A.E12.html" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import requests\n", 26 | "import re\n", 27 | "import json\n", 28 | "\n", 29 | "from bs4 import BeautifulSoup, NavigableString\n", 30 | "from pprint import pprint" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "ARTICLE_URL = 'https://www.ptt.cc/bbs/Gossiping/M.1537847530.A.E12.html'" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## 透過 cookies 繞過年齡檢查\n", 47 | "\n", 48 | "觀察開發者工具 > NetWork > requests header" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "scrolled": true 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "\n", 63 | "\n", 64 | "\t\n", 65 | "\t\t\n", 66 | "\t\t\n", 67 | "\n", 68 | "\n", 69 | "\n", 70 | "批踢踢實業坊\n", 71 | "\n", 72 | "\n", 73 | "\n", 74 | "\n", 75 | "\n", 76 | "\n", 77 | "\n", 78 | "\n", 79 | "\n", 80 | "\n", 81 | "\t\n", 82 | " \n", 83 | "\t\t\n", 84 | "
\n", 85 | "
\n", 86 | "

本網站已依網站內容分級規定處理

\n", 87 | "\n", 88 | "

警告︰您即將進入之看板內容需滿十八歲方可瀏覽。

\n", 89 | "\n", 90 | "

若您尚未年滿十八歲,請點選離開。若您已滿十八歲,亦不可將本區之內容派發、傳閱、出售、出租、交給或借予年齡未滿18歲的人士瀏覽,或將本網站內容向該人士出示、播放或放映。

\n", 91 | "
\n", 92 | "
\n", 93 | "\n", 94 | "
\n", 95 | "
\n", 96 | " \n", 97 | "
\n", 98 | " \n", 99 | "
\n", 100 | "
\n", 101 | " \n", 102 | "
\n", 103 | "
\n", 104 | "
\n", 105 | "\n", 106 | "\t\t\n", 107 | "\n", 108 | "\n", 120 | "\n", 121 | "\n", 122 | "\t\t\n", 123 | "\n", 124 | "\n", 125 | "\n", 126 | " \n", 127 | "\n", 128 | "\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "resp = requests.get(ARTICLE_URL)\n", 134 | "if resp.status_code == 200:\n", 135 | " print(resp.text)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 4, 141 | "metadata": { 142 | "scrolled": true 143 | }, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "\n", 150 | "\n", 151 | "\t\n", 152 | "\t\t\n", 153 | "\t\t\n", 154 | "\n", 155 | "\n", 156 | "\n", 157 | "[問卦] 中央與北大併校 - 看板 Gossiping - 批踢踢實業坊\n", 158 | "\n", 159 | "\n", 160 | "\n", 166 | "\n", 167 | "\n", 168 | "\n", 174 | "\n", 175 | "\n", 176 | "\n", 177 | "\n", 178 | "\n", 179 | "\n", 180 | "\n", 181 | "\n", 182 | "\n", 183 | "\n", 184 | "\n", 185 | "\t\n", 186 | " \n", 187 | "\t\t\n", 188 | "
\n", 189 | "\n", 196 | "\n", 197 | "
\n", 198 | "\t
\n", 199 | "\t\t批踢踢實業坊\n", 200 | "\t\t\n", 201 | "\t\t看板 Gossiping\n", 202 | "\t\t關於我們\n", 203 | "\t\t聯絡資訊\n", 204 | "\t
\n", 205 | "
\n", 206 | "
\n", 207 | "\t
\n", 208 | "\t\t返回看板\n", 209 | "\t\t
\n", 210 | "\t\t
\n", 211 | "\t\t\t分享\n", 212 | "\t\t\t
\n", 213 | "\n", 214 | "\t\t\t
\n", 215 | "\n", 223 | "\n", 224 | "\t\t
\n", 225 | "\t
\n", 226 | "
\n", 227 | "
\n", 228 | "
作者R101 (索尼大法好)
看板Gossiping
標題[問卦] 中央與北大併校
時間Tue Sep 25 11:52:08 2018
\n", 229 | "如題啊,最近陽明跟交大併校吵的很兇,中央都變成台聯大邊緣人了。\n", 230 | "為什麼不讓中央跟台北大學併校呢?\n", 231 | "中央缺法商剛好北大有,\n", 232 | "中央的理工北大沒有,兩校剛好互補,\n", 233 | "而且地理位置也不遠,有沒有人想過讓台北大學跟中央合併呢?\n", 234 | "有沒有八卦?\n", 235 | "\n", 236 | "--\n", 237 | "※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 140.115.197.252\n", 238 | "※ 文章網址: https://www.ptt.cc/bbs/Gossiping/M.1537847530.A.E12.html\n", 239 | "
bobobola: 中央不缺商阿 42.75.76.1 09/25 11:54\n", 240 | "
nikewang: 北中和中興合併不就好了121.157.204.247 09/25 11:54\n", 241 | "
nikewang: 北大121.157.204.247 09/25 11:54\n", 242 | "
qqq1234: 北大好不容易才脫離中興獨立 怎可能去併 117.56.55.46 09/25 11:59\n", 243 | "
Lakland: 北大跟北科合作一陣子了,中央找體大吧 114.24.29.42 09/25 11:59\n", 244 | "
atlaswhz: 中央找體大和警大組成桃聯大好了 1.34.181.133 09/25 12:07\n", 245 | "
sooppp: 體大的聽的懂中央上課在教什麼嗎?223.140.169.234 09/25 12:15\n", 246 | "
homepark: 中央缺醫學喇223.137.74.137 09/25 12:16\n", 247 | "
lee457088: 197.252是哪棟140.115.216.209 09/25 12:18\n", 248 | "
不知道\n", 249 | "
mecca: 當年有文法商理工醫農學院 現在洗洗睡吧210.64.134.103 09/25 12:40\n", 250 | "
※ 編輯: R101 (140.115.130.200), 09/25/2018 15:41:01\n", 251 | "
\n", 252 | " \n", 253 | "
\n", 254 | " \n", 255 | "\n", 256 | " \n", 257 | "
本網站已依台灣網站內容分級規定處理。此區域為限制級,未滿十八歲者不得瀏覽。
\n", 258 | "\n", 259 | "
\n", 260 | "\n", 261 | "\t\t\n", 262 | "\n", 263 | "\n", 275 | "\n", 276 | "\n", 277 | "\t\t\n", 278 | "\n", 279 | "\n", 280 | "\n", 281 | " \n", 282 | "\n", 283 | "\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "cookies = {'over18': '1'}\n", 289 | "resp = requests.get(ARTICLE_URL, cookies=cookies)\n", 290 | "if resp.status_code == 200:\n", 291 | " print(resp.text)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 5, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "soup = BeautifulSoup(resp.text, 'lxml')" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "## 爬取文章\n", 308 | "\n", 309 | "- 作者 id\n", 310 | "- 作者暱稱\n", 311 | "- 文章標題\n", 312 | "- 發佈時間\n", 313 | "- 文章內容\n", 314 | "- 發文 ip" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 6, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "{'author_id': 'R101',\n", 327 | " 'author_nickname': '索尼大法好',\n", 328 | " 'contents': '如題啊,最近陽明跟交大併校吵的很兇,中央都變成台聯大邊緣人了。為什麼不讓中央跟台北大學併校呢?中央缺法商剛好北大有,中央的理工北大沒有,兩校剛好互補,而且地理位置也不遠,有沒有人想過讓台北大學跟中央合併呢?有沒有八卦?--\\n'\n", 329 | " '不知道',\n", 330 | " 'ip': '140.115.197.252',\n", 331 | " 'timestamp': 'Tue Sep 25 11:52:08 2018',\n", 332 | " 'title': '[問卦] 中央與北大併校'}\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "article = {\n", 338 | " 'author_id': '',\n", 339 | " 'author_nickname': '',\n", 340 | " 'title': '',\n", 341 | " 'timestamp': '',\n", 342 | " 'contents': '',\n", 343 | " 'ip': ''\n", 344 | "}\n", 345 | "article_body = soup.find(id='main-content')\n", 346 | "\n", 347 | "# article header\n", 348 | "article_head = article_body.findAll('div', class_='article-metaline')\n", 349 | "for metaline in article_head:\n", 350 | " meta_tag = metaline.find(class_='article-meta-tag').text\n", 351 | " meta_value = metaline.find(class_='article-meta-value').text\n", 352 | " if meta_tag == '作者':\n", 353 | " compile_nickname = re.compile('\\((.*)\\)').search(meta_value)\n", 354 | " article['author_id'] = meta_value.split('(')[0].strip(' ')\n", 355 | " article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''\n", 356 | " elif meta_tag == '標題':\n", 357 | " article['title'] = meta_value\n", 358 | " elif meta_tag == '時間':\n", 359 | " article['timestamp'] = meta_value\n", 360 | "\n", 361 | "# article content\n", 362 | "contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]\n", 363 | "contents = [re.sub('\\n', '', expr) for expr in contents]\n", 364 | "contents = [i for i in contents if i]\n", 365 | "contents = '\\n'.join(contents)\n", 366 | "article['contents'] = contents\n", 367 | "\n", 368 | "# article publish ip\n", 369 | "article_ip = article_body.find(class_='f2').text\n", 370 | "compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(article_ip)\n", 371 | "article['ip'] = compile_ip.group(0) if compile_ip else ''\n", 372 | "\n", 373 | "pprint(article)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "## 爬取流言\n", 381 | "\n", 382 | "- 推噓\n", 383 | "- 推文 id\n", 384 | "- 推文內容\n", 385 | "- 推文 ip\n", 386 | "- 推文時間" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 7, 392 | "metadata": { 393 | "scrolled": true 394 | }, 395 | "outputs": [ 396 | { 397 | "name": "stdout", 398 | "output_type": "stream", 399 | "text": [ 400 | "[{'content': ': 中央不缺商阿',\n", 401 | " 'id': 
'bobobola',\n", 402 | " 'ip': '42.75.76.1',\n", 403 | " 'tag': '推 ',\n", 404 | " 'timestamp': '09/25 11:54'},\n", 405 | " {'content': ': 北中和中興合併不就好了',\n", 406 | " 'id': 'nikewang',\n", 407 | " 'ip': '121.157.204.247',\n", 408 | " 'tag': '→ ',\n", 409 | " 'timestamp': '09/25 11:54'},\n", 410 | " {'content': ': 北大',\n", 411 | " 'id': 'nikewang',\n", 412 | " 'ip': '121.157.204.247',\n", 413 | " 'tag': '→ ',\n", 414 | " 'timestamp': '09/25 11:54'},\n", 415 | " {'content': ': 北大好不容易才脫離中興獨立 怎可能去併',\n", 416 | " 'id': 'qqq1234',\n", 417 | " 'ip': '117.56.55.46',\n", 418 | " 'tag': '推 ',\n", 419 | " 'timestamp': '09/25 11:59'},\n", 420 | " {'content': ': 北大跟北科合作一陣子了,中央找體大吧',\n", 421 | " 'id': 'Lakland',\n", 422 | " 'ip': '114.24.29.42',\n", 423 | " 'tag': '→ ',\n", 424 | " 'timestamp': '09/25 11:59'},\n", 425 | " {'content': ': 中央找體大和警大組成桃聯大好了',\n", 426 | " 'id': 'atlaswhz',\n", 427 | " 'ip': '1.34.181.133',\n", 428 | " 'tag': '推 ',\n", 429 | " 'timestamp': '09/25 12:07'},\n", 430 | " {'content': ': 體大的聽的懂中央上課在教什麼嗎?',\n", 431 | " 'id': 'sooppp',\n", 432 | " 'ip': '223.140.169.234',\n", 433 | " 'tag': '推 ',\n", 434 | " 'timestamp': '09/25 12:15'},\n", 435 | " {'content': ': 中央缺醫學喇',\n", 436 | " 'id': 'homepark',\n", 437 | " 'ip': '223.137.74.137',\n", 438 | " 'tag': '推 ',\n", 439 | " 'timestamp': '09/25 12:16'},\n", 440 | " {'content': ': 197.252是哪棟',\n", 441 | " 'id': 'lee457088',\n", 442 | " 'ip': '140.115.216.209',\n", 443 | " 'tag': '→ ',\n", 444 | " 'timestamp': '09/25 12:18'},\n", 445 | " {'content': ': 當年有文法商理工醫農學院 現在洗洗睡吧',\n", 446 | " 'id': 'mecca',\n", 447 | " 'ip': '210.64.134.103',\n", 448 | " 'tag': '推 ',\n", 449 | " 'timestamp': '09/25 12:40'}]\n" 450 | ] 451 | } 452 | ], 453 | "source": [ 454 | "comments = []\n", 455 | "for comment in article_body.findAll('div', class_='push'):\n", 456 | " tag = comment.find(class_='push-tag').text\n", 457 | " guest_id = comment.find(class_='push-userid').text\n", 458 | " guest_content = comment.find(class_='push-content').text\n", 459 | " guest_ipdatetime = comment.find(class_='push-ipdatetime').text\n", 460 | " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(guest_ipdatetime)\n", 461 | " guest_ip = compile_ip.group(0) if compile_ip else ''\n", 462 | " guest_timestamp = re.sub(guest_ip, '', guest_ipdatetime).strip()\n", 463 | " comments.append({\n", 464 | " 'tag': tag,\n", 465 | " 'id': guest_id,\n", 466 | " 'content': guest_content,\n", 467 | " 'ip': guest_ip,\n", 468 | " 'timestamp': guest_timestamp\n", 469 | " })\n", 470 | "pprint(comments)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "## 將資料存成 json 檔" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 8, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "article['comments'] = comments\n", 487 | "data = [article]\n", 488 | "with open('M.1537847530.A.E12.json', 'w+', encoding='utf-8') as f:\n", 489 | " json.dump(data, f, indent=2, ensure_ascii=False)" 490 | ] 491 | } 492 | ], 493 | "metadata": { 494 | "kernelspec": { 495 | "display_name": "Python 3", 496 | "language": "python", 497 | "name": "python3" 498 | }, 499 | "language_info": { 500 | "codemirror_mode": { 501 | "name": "ipython", 502 | "version": 3 503 | }, 504 | "file_extension": ".py", 505 | "mimetype": "text/x-python", 506 | "name": "python", 507 | "nbconvert_exporter": "python", 508 | "pygments_lexer": "ipython3", 509 | "version": "3.6.6" 510 | } 511 | }, 512 | "nbformat": 4, 513 | "nbformat_minor": 2 514 
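
The parsing notebook above passes `cookies={'over18': '1'}` to every individual `requests.get` call. A minimal sketch of the same idea using a shared `requests.Session`, so the over-18 cookie and headers are configured once and reused across requests — note that the `fetch_soup` helper, the `User-Agent` string, and the one-second delay are illustrative assumptions, not part of the original notebook:

```python
import time

import requests
from bs4 import BeautifulSoup

# Illustrative sketch (not from the original notebook): configure the session once,
# then every request carries the over-18 cookie and custom headers automatically.
session = requests.Session()
session.cookies.set('over18', '1', domain='www.ptt.cc')
session.headers.update({'User-Agent': 'Mozilla/5.0 (tutorial crawler)'})


def fetch_soup(url, delay=1.0):
    """GET a PTT page through the shared session and return its BeautifulSoup tree."""
    resp = session.get(url)
    resp.raise_for_status()
    time.sleep(delay)  # small pause between requests, to stay polite to the site
    return BeautifulSoup(resp.text, 'lxml')


if __name__ == '__main__':
    soup = fetch_soup('https://www.ptt.cc/bbs/Gossiping/M.1537847530.A.E12.html')
    print(soup.find(id='main-content') is not None)
```

A shared session also reuses the underlying connection pool, which helps once the later appendix notebooks crawl many article pages in a row.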
| } 515 | -------------------------------------------------------------------------------- /appendix_ptt/01_search_api_by_title.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 給定文章標題,爬取 PTT 上的所有相關文章\n", 8 | "\n", 9 | "- title: [新聞] 2噸水晶球沿街滾 撞壞5輛汽機車和民宅\n", 10 | "- URL encoing (UTF-8)\n", 11 | "- combine URL path" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import requests\n", 21 | "import re\n", 22 | "import json\n", 23 | "\n", 24 | "from bs4 import BeautifulSoup, NavigableString\n", 25 | "from pprint import pprint\n", 26 | "from urllib.parse import urlencode, urljoin" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "QUERY_TITLE = '[新聞] 2噸水晶球沿街滾 撞壞5輛汽機車和民宅'\n", 36 | "cookies = {'over18': '1'}" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## URL encoding\n", 44 | "\n", 45 | "取得相同文章標題的列表" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "https://www.ptt.cc/bbs/Gossiping/search?q=%5B%E6%96%B0%E8%81%9E%5D+2%E5%99%B8%E6%B0%B4%E6%99%B6%E7%90%83%E6%B2%BF%E8%A1%97%E6%BB%BE+%E6%92%9E%E5%A3%9E5%E8%BC%9B%E6%B1%BD%E6%A9%9F%E8%BB%8A%E5%92%8C%E6%B0%91%E5%AE%85\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "encoding_title = urlencode({'q': QUERY_TITLE})\n", 63 | "query = 'https://www.ptt.cc/bbs/Gossiping/search?{}'.format(encoding_title)\n", 64 | "print(query)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "resp_article_list = requests.get(query, cookies=cookies)\n", 74 | "soup_article_list = BeautifulSoup(resp_article_list.text, 'lxml')" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## 列出所有文章並爬取" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "def crawl_article(url):\n", 91 | " resp = requests.get(url, cookies={'over18': '1'})\n", 92 | " if resp.status_code != 200:\n", 93 | " return\n", 94 | " soup = BeautifulSoup(resp.text, 'lxml')\n", 95 | " print('Start to Crawling', url)\n", 96 | "\n", 97 | " # ##############################\n", 98 | " # crawl article\n", 99 | " # ##############################\n", 100 | " article = {\n", 101 | " 'author_id': '',\n", 102 | " 'author_nickname': '',\n", 103 | " 'title': '',\n", 104 | " 'timestamp': '',\n", 105 | " 'contents': '',\n", 106 | " 'ip': ''\n", 107 | " }\n", 108 | " article_body = soup.find(id='main-content')\n", 109 | "\n", 110 | " # article header\n", 111 | " article_head = article_body.findAll('div', class_='article-metaline')\n", 112 | " for metaline in article_head:\n", 113 | " meta_tag = metaline.find(class_='article-meta-tag').text\n", 114 | " meta_value = metaline.find(class_='article-meta-value').text\n", 115 | " if meta_tag == '作者':\n", 116 | " compile_nickname = re.compile('\\((.*)\\)').search(meta_value)\n", 117 | " article['author_id'] = meta_value.split('(')[0].strip(' ')\n", 118 | " article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''\n", 119 | " elif meta_tag == '標題':\n", 
120 | " article['title'] = meta_value\n", 121 | " elif meta_tag == '時間':\n", 122 | " article['timestamp'] = meta_value\n", 123 | "\n", 124 | " # article content\n", 125 | " contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]\n", 126 | " contents = [re.sub('\\n', '', expr) for expr in contents]\n", 127 | " contents = [i for i in contents if i]\n", 128 | " contents = '\\n'.join(contents)\n", 129 | " article['contents'] = contents\n", 130 | "\n", 131 | " # article publish ip\n", 132 | " article_ip = article_body.find(class_='f2').text\n", 133 | " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(article_ip)\n", 134 | " article['ip'] = compile_ip.group(0) if compile_ip else ''\n", 135 | "\n", 136 | " # ##############################\n", 137 | " # crawl comments\n", 138 | " # ##############################\n", 139 | " comments = []\n", 140 | " for comment in article_body.findAll('div', class_='push'):\n", 141 | " tag = comment.find(class_='push-tag').text\n", 142 | " guest_id = comment.find(class_='push-userid').text\n", 143 | " guest_content = comment.find(class_='push-content').text\n", 144 | " guest_ipdatetime = comment.find(class_='push-ipdatetime').text\n", 145 | " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(guest_ipdatetime)\n", 146 | " guest_ip = compile_ip.group(0) if compile_ip else ''\n", 147 | " guest_timestamp = re.sub(guest_ip, '', guest_ipdatetime).strip()\n", 148 | " comments.append({\n", 149 | " 'tag': tag,\n", 150 | " 'id': guest_id,\n", 151 | " 'content': guest_content,\n", 152 | " 'ip': guest_ip,\n", 153 | " 'timestamp': guest_timestamp\n", 154 | " })\n", 155 | " \n", 156 | " article['comments'] = comments\n", 157 | " article['url'] = url\n", 158 | " return article" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 6, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537874850.A.20D.html\n", 171 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537868945.A.8A9.html\n", 172 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537861382.A.154.html\n", 173 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537859788.A.BE2.html\n", 174 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537859045.A.287.html\n", 175 | "Save - search_api_by_title.json\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "data = []\n", 181 | "for article_line in soup_article_list.findAll('div', class_='r-ent'):\n", 182 | " title_tag = article_line.find('div', class_='title')\n", 183 | " article_url = title_tag.find('a')['href']\n", 184 | " article_url = urljoin(resp_article_list.url, article_url)\n", 185 | " article_data = crawl_article(article_url)\n", 186 | " data.append(article_data)\n", 187 | "\n", 188 | "with open('search_api_by_title.json', 'w+', encoding='utf-8') as f:\n", 189 | " json.dump(data, f, indent=2, ensure_ascii=False)\n", 190 | " print('Save - search_api_by_title.json')" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 
| "version": "3.6.6" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 2 215 | } 216 | -------------------------------------------------------------------------------- /appendix_ptt/02_today_articles.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 爬取今天到目前為止的所有文章\n", 8 | "\n", 9 | "https://www.ptt.cc/bbs/Gossiping/index.html" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import requests\n", 19 | "import re\n", 20 | "import json\n", 21 | "\n", 22 | "from bs4 import BeautifulSoup, NavigableString\n", 23 | "from datetime import datetime\n", 24 | "from pprint import pprint\n", 25 | "from urllib.parse import urljoin" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "09/27\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "base_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'\n", 43 | "ptt_today = datetime.now()\n", 44 | "ptt_today_str = ptt_today.strftime('%m/%d')\n", 45 | "print(ptt_today_str)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 取得總頁碼\n", 53 | "\n", 54 | "從 html 上一頁的按鈕中取得 n-1 page 的頁碼,在將該頁碼加一就是總頁碼了" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "resp_base = requests.get(base_url, cookies={'over18': '1'})\n", 64 | "assert resp_base.status_code == 200\n", 65 | "soup_base = BeautifulSoup(resp_base.text, 'lxml') " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "total page = 39228\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "paging_tag = soup_base.find(class_='btn-group-paging')\n", 83 | "total_page = None\n", 84 | "for btn_tag in paging_tag.findAll('a'):\n", 85 | " if btn_tag.text == '‹ 上頁':\n", 86 | " compile_page = re.search('(\\d+)', btn_tag['href'])\n", 87 | " if compile_page:\n", 88 | " total_page = int(compile_page.group(0)) + 1\n", 89 | "print('total page =', total_page)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## 往回檢查日期並爬取文章\n", 97 | "\n", 98 | "最舊的文章頁面,頁碼為 1" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "def crawl_article(url):\n", 108 | " resp = requests.get(url, cookies={'over18': '1'})\n", 109 | " if resp.status_code != 200:\n", 110 | " return\n", 111 | " soup = BeautifulSoup(resp.text, 'lxml')\n", 112 | " print('Start to Crawling', url)\n", 113 | "\n", 114 | " # ##############################\n", 115 | " # crawl article\n", 116 | " # ##############################\n", 117 | " article = {\n", 118 | " 'author_id': '',\n", 119 | " 'author_nickname': '',\n", 120 | " 'title': '',\n", 121 | " 'timestamp': '',\n", 122 | " 'contents': '',\n", 123 | " 'ip': ''\n", 124 | " }\n", 125 | " article_body = soup.find(id='main-content')\n", 126 | "\n", 127 | " # article header\n", 128 | " article_head = article_body.findAll('div', class_='article-metaline')\n", 129 | " for metaline in article_head:\n", 130 | " meta_tag = 
metaline.find(class_='article-meta-tag').text\n", 131 | " meta_value = metaline.find(class_='article-meta-value').text\n", 132 | " if meta_tag == '作者':\n", 133 | " compile_nickname = re.compile('\\((.*)\\)').search(meta_value)\n", 134 | " article['author_id'] = meta_value.split('(')[0].strip(' ')\n", 135 | " article['author_nickname'] = compile_nickname.group(1) if compile_nickname else ''\n", 136 | " elif meta_tag == '標題':\n", 137 | " article['title'] = meta_value\n", 138 | " elif meta_tag == '時間':\n", 139 | " article['timestamp'] = meta_value\n", 140 | "\n", 141 | " # article content\n", 142 | " contents = [expr for expr in article_body.contents if isinstance(expr, NavigableString)]\n", 143 | " contents = [re.sub('\\n', '', expr) for expr in contents]\n", 144 | " contents = [i for i in contents if i]\n", 145 | " contents = '\\n'.join(contents)\n", 146 | " article['contents'] = contents\n", 147 | "\n", 148 | " # article publish ip\n", 149 | " article_ip = article_body.find(class_='f2').text\n", 150 | " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(article_ip)\n", 151 | " article['ip'] = compile_ip.group(0) if compile_ip else ''\n", 152 | "\n", 153 | " # ##############################\n", 154 | " # crawl comments\n", 155 | " # ##############################\n", 156 | " comments = []\n", 157 | " for comment in article_body.findAll('div', class_='push'):\n", 158 | " tag = comment.find(class_='push-tag').text\n", 159 | " guest_id = comment.find(class_='push-userid').text\n", 160 | " guest_content = comment.find(class_='push-content').text\n", 161 | " guest_ipdatetime = comment.find(class_='push-ipdatetime').text\n", 162 | " compile_ip = re.compile('[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}').search(guest_ipdatetime)\n", 163 | " guest_ip = compile_ip.group(0) if compile_ip else ''\n", 164 | " guest_timestamp = re.sub(guest_ip, '', guest_ipdatetime).strip()\n", 165 | " comments.append({\n", 166 | " 'tag': tag,\n", 167 | " 'id': guest_id,\n", 168 | " 'content': guest_content,\n", 169 | " 'ip': guest_ip,\n", 170 | " 'timestamp': guest_timestamp\n", 171 | " })\n", 172 | " \n", 173 | " article['comments'] = comments\n", 174 | " article['url'] = url\n", 175 | " return article" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "DATE_GRATER=1\n", 185 | "DATE_EQUAL=0\n", 186 | "DATE_LESS=-1\n", 187 | "\n", 188 | "def compare_timestamp_md(src, dest):\n", 189 | " \"\"\"\n", 190 | " greater: 1\n", 191 | " equal: 0\n", 192 | " less: -1\n", 193 | " \"\"\"\n", 194 | " date_src = datetime.strptime(src, '%m/%d')\n", 195 | " date_dest = datetime.strptime(dest, '%m/%d')\n", 196 | " if date_dest > date_src:\n", 197 | " return 1\n", 198 | " elif date_dest == date_src:\n", 199 | " return 0\n", 200 | " else:\n", 201 | " return -1" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "https://www.ptt.cc/bbs/Gossiping/index39228.html - date 9/27 result 0\n", 214 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978608.A.325.html\n", 215 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978662.A.45A.html\n", 216 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978695.A.9A7.html\n", 217 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978699.A.194.html\n", 218 | "Start to Crawling 
https://www.ptt.cc/bbs/Gossiping/M.1537978724.A.356.html\n", 219 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978750.A.39A.html\n", 220 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978768.A.08B.html\n", 221 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978815.A.5B2.html\n", 222 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978820.A.119.html\n", 223 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978934.A.F8E.html\n", 224 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978941.A.754.html\n", 225 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978960.A.779.html\n", 226 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978973.A.B90.html\n", 227 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978993.A.F88.html\n", 228 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537979013.A.67C.html\n", 229 | "https://www.ptt.cc/bbs/Gossiping/index39227.html - date 9/27 result 0\n", 230 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977913.A.4EE.html\n", 231 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977930.A.01B.html\n", 232 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977933.A.013.html\n", 233 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977952.A.904.html\n", 234 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977959.A.A7B.html\n", 235 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977966.A.77C.html\n", 236 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978043.A.03E.html\n", 237 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978060.A.9DF.html\n", 238 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978098.A.D36.html\n", 239 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978140.A.C44.html\n", 240 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978152.A.31C.html\n", 241 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978156.A.B1A.html\n", 242 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978179.A.844.html\n", 243 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978195.A.D33.html\n", 244 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978272.A.533.html\n", 245 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978295.A.B6A.html\n", 246 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978350.A.D02.html\n", 247 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978378.A.746.html\n", 248 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978494.A.B6B.html\n", 249 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537978521.A.06B.html\n", 250 | "https://www.ptt.cc/bbs/Gossiping/index39226.html - date 9/26 result -1\n", 251 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977639.A.3F8.html\n", 252 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977693.A.A67.html\n", 253 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977700.A.FD6.html\n", 254 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977711.A.493.html\n", 255 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977729.A.BE4.html\n", 256 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977740.A.534.html\n", 257 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977827.A.B50.html\n", 258 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977851.A.17A.html\n", 259 | "Start to Crawling 
https://www.ptt.cc/bbs/Gossiping/M.1537977857.A.B1D.html\n", 260 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977877.A.292.html\n", 261 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977878.A.13E.html\n", 262 | "Start to Crawling https://www.ptt.cc/bbs/Gossiping/M.1537977910.A.566.html\n", 263 | "Save - today_articles.json\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "data = []\n", 269 | "for page in range(total_page, 1, -1):\n", 270 | " current_url = 'https://www.ptt.cc/bbs/Gossiping/index{}.html'.format(page)\n", 271 | " resp_page = requests.get(current_url, cookies={'over18': '1'})\n", 272 | " if resp_page.status_code != 200:\n", 273 | " continue\n", 274 | " soup_page = BeautifulSoup(resp_page.text, 'lxml')\n", 275 | " \n", 276 | " # ##############################\n", 277 | " # check the first article date\n", 278 | " # ##############################\n", 279 | " container_tag = soup_page.find('div', class_='r-list-container')\n", 280 | " first_article = container_tag.find('div', class_='r-ent')\n", 281 | " first_article_date = first_article.find('div', class_='date').text.strip()\n", 282 | " compare_datetime = compare_timestamp_md(ptt_today_str, first_article_date)\n", 283 | " print('{} - date {} result {}'.format(current_url, first_article_date, compare_datetime))\n", 284 | " \n", 285 | " if compare_datetime == 1:\n", 286 | " continue\n", 287 | " else:\n", 288 | " # only crawling today's article before r-list-sep line\n", 289 | " for article_row_tag in container_tag.findChildren('div', recursive=False):\n", 290 | " if 'r-list-sep' in article_row_tag['class']:\n", 291 | " break\n", 292 | " if 'r-ent' in article_row_tag['class']:\n", 293 | " article_date = article_row_tag.find('div', class_='date').text.strip()\n", 294 | " article_date_compare = compare_timestamp_md(ptt_today_str, article_date)\n", 295 | " if article_date_compare != 0:\n", 296 | " continue\n", 297 | " article_tag = article_row_tag.find('a', href=True)\n", 298 | " article_url = urljoin(base_url, article_tag['href'])\n", 299 | " article_data = crawl_article(article_url)\n", 300 | " data.append(article_data)\n", 301 | "\n", 302 | " # if the first article date is earlier than current date, should break the iteration\n", 303 | " if compare_datetime == -1:\n", 304 | " break\n", 305 | "\n", 306 | "with open('today_articles.json', 'w+', encoding='utf-8') as f:\n", 307 | " json.dump(data, f, indent=2, ensure_ascii=False)\n", 308 | " print('Save - today_articles.json')" 309 | ] 310 | } 311 | ], 312 | "metadata": { 313 | "kernelspec": { 314 | "display_name": "Python 3", 315 | "language": "python", 316 | "name": "python3" 317 | }, 318 | "language_info": { 319 | "codemirror_mode": { 320 | "name": "ipython", 321 | "version": 3 322 | }, 323 | "file_extension": ".py", 324 | "mimetype": "text/x-python", 325 | "name": "python", 326 | "nbconvert_exporter": "python", 327 | "pygments_lexer": "ipython3", 328 | "version": "3.6.6" 329 | } 330 | }, 331 | "nbformat": 4, 332 | "nbformat_minor": 2 333 | } 334 | -------------------------------------------------------------------------------- /appendix_ptt/03_crawl_image.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 爬取文章上的內文的所有文章\n", 8 | "\n", 9 | "1. 你有可能會遇到「是否滿18歲」的詢問頁面\n", 10 | "2. 解析 ptt.cc/bbs 裏面文章的結構\n", 11 | "3. 爬取文章\n", 12 | "4. 解析並確認圖片格式\n", 13 | "5. 
下載圖片\n", 14 | "\n", 15 | "URL https://www.ptt.cc/bbs/Gossiping/M.1538373690.A.72D.html\n", 16 | "\n", 17 | "BACKUP https://afuntw.github.io/Test-Crawling-Website/pages/ptt/M.1538373690.A.72D.html" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import requests\n", 27 | "import re\n", 28 | "import json\n", 29 | "import os\n", 30 | "\n", 31 | "from PIL import Image\n", 32 | "from bs4 import BeautifulSoup, NavigableString\n", 33 | "from pprint import pprint" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "ARTICLE_URL = 'https://www.ptt.cc/bbs/Gossiping/M.1538373690.A.72D.html'" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## 爬取文章" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "resp = requests.get(ARTICLE_URL, cookies={'over18': '1'})\n", 59 | "assert resp.status_code == 200\n", 60 | "soup = BeautifulSoup(resp.text, 'lxml')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "scrolled": true 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "[https://i.imgur.com/HdI5e8G.jpg,\n", 75 | " https://i.imgur.com/6W5aQk2.jpg,\n", 76 | " https://i.imgur.com/PhhH8ga.jpg,\n", 77 | " https://i.imgur.com/zF6ZwFj.jpg,\n", 78 | " https://i.imgur.com/4CXovkJ.jpg,\n", 79 | " https://i.imgur.com/NFwopB9.jpg,\n", 80 | " https://i.imgur.com/BFlIDmf.jpg,\n", 81 | " https://i.imgur.com/ARewyx8.jpg,\n", 82 | " https://i.imgur.com/LK4fnZX.jpg,\n", 83 | " https://i.imgur.com/AjTWzRW.jpg,\n", 84 | " https://i.imgur.com/daJpBHQ.jpg,\n", 85 | " https://i.imgur.com/X2RqYU6.jpg,\n", 86 | " https://i.imgur.com/j8rj172.jpg,\n", 87 | " https://i.imgur.com/nNnAJFf.jpg,\n", 88 | " https://i.imgur.com/dwZAQu1.jpg,\n", 89 | " https://i.imgur.com/7ibAOi8.jpg,\n", 90 | " https://i.imgur.com/YTaD5bs.jpg,\n", 91 | " https://i.imgur.com/FzwkWYt.jpg,\n", 92 | " https://i.imgur.com/NflbWR5.jpg,\n", 93 | " https://i.imgur.com/6sqAzjT.jpg,\n", 94 | " https://i.imgur.com/KmEAkaP.jpg,\n", 95 | " https://i.imgur.com/73yb0Ao.jpg,\n", 96 | " https://i.imgur.com/K6ukMIf.jpg,\n", 97 | " https://i.imgur.com/3BFzLjv.jpg,\n", 98 | " https://i.imgur.com/72a2Bas.jpg,\n", 99 | " https://i.imgur.com/89GSqqx.jpg,\n", 100 | " https://i.imgur.com/9CSJ3M5.jpg,\n", 101 | " https://i.imgur.com/NgKEiFz.jpg,\n", 102 | " https://i.imgur.com/aN6aYyo.jpg,\n", 103 | " https://i.imgur.com/O2KNZJV.jpg,\n", 104 | " https://i.imgur.com/WvjeC9N.jpg,\n", 105 | " https://i.imgur.com/bG8O5he.jpg,\n", 106 | " https://i.imgur.com/aJ7Lt7l.jpg,\n", 107 | " https://i.imgur.com/bNVe7S2.jpg,\n", 108 | " https://i.imgur.com/LxOXwCC.jpg,\n", 109 | " https://i.imgur.com/wI5TKjP.jpg,\n", 110 | " https://i.imgur.com/TW8c7ei.jpg,\n", 111 | " https://i.imgur.com/xl4zx8N.jpg,\n", 112 | " https://i.imgur.com/kbY3glw.jpg,\n", 113 | " https://i.imgur.com/Aa3utxo.jpg,\n", 114 | " https://i.imgur.com/zPfERpw.jpg,\n", 115 | " https://i.imgur.com/vXAbWHR.jpg,\n", 116 | " https://i.imgur.com/I7hUgF4.jpg,\n", 117 | " https://i.imgur.com/KOu9YRR.jpg,\n", 118 | " https://i.imgur.com/WvjeC9N.jpg,\n", 119 | " https://i.imgur.com/PtXgokJ.jpg,\n", 120 | " https://i.imgur.com/2sF8O4u.jpg,\n", 121 | " https://i.imgur.com/ZnEC7Jf.jpg,\n", 122 | " https://i.imgur.com/zqEwg69.jpg,\n", 
123 | " https://i.imgur.com/I6QeEsc.jpg,\n", 124 | " https://i.imgur.com/XDLSNW4.jpg,\n", 125 | " https://i.imgur.com/4KZ6JOH.jpg,\n", 126 | " https://i.imgur.com/ixuwTe5.jpg,\n", 127 | " https://i.imgur.com/6wShMfE.jpg,\n", 128 | " https://i.imgur.com/6TK1rp5.jpg,\n", 129 | " https://i.imgur.com/Mtf5Hz5.jpg,\n", 130 | " https://i.imgur.com/XLB5kPg.jpg,\n", 131 | " https://i.imgur.com/xIyvraR.jpg,\n", 132 | " https://i.imgur.com/enTsU1Z.jpg,\n", 133 | " https://i.imgur.com/3YHKqwJ.jpg,\n", 134 | " https://i.imgur.com/mNGnRU7.jpg,\n", 135 | " https://i.imgur.com/5ughnWE.jpg,\n", 136 | " https://i.imgur.com/AA8U6Al.jpg,\n", 137 | " https://i.imgur.com/juPKVUR.jpg,\n", 138 | " https://i.imgur.com/M2mJx5N.jpg,\n", 139 | " https://i.imgur.com/8Kwd9Rc.jpg,\n", 140 | " https://i.imgur.com/KmRqaPE.jpg,\n", 141 | " https://i.imgur.com/FIjGDka.jpg,\n", 142 | " https://i.imgur.com/DB0Zu8Q.jpg,\n", 143 | " https://i.imgur.com/t8S3vno.png,\n", 144 | " https://i.imgur.com/MJxZfgi.jpg,\n", 145 | " https://i.imgur.com/G2dw8Cp.jpg,\n", 146 | " https://i.imgur.com/1CwI4YX.jpg,\n", 147 | " https://i.imgur.com/wSShBG7.jpg,\n", 148 | " https://i.imgur.com/kIS1BTe.jpg,\n", 149 | " https://i.imgur.com/3zG4M7q.jpg,\n", 150 | " https://i.imgur.com/xhIgdYH.jpg,\n", 151 | " https://i.imgur.com/Xaefcnj.jpg,\n", 152 | " https://i.imgur.com/VOfcZ6l.jpg,\n", 153 | " https://i.imgur.com/0MvMt9H.jpg,\n", 154 | " https://i.imgur.com/gTBGELL.jpg,\n", 155 | " https://i.imgur.com/mDkgG5m.jpg,\n", 156 | " https://i.imgur.com/6zItH1z.jpg,\n", 157 | " https://i.imgur.com/Ikp4oXG.jpg,\n", 158 | " https://i.imgur.com/ge0XrdB.jpg,\n", 159 | " https://i.imgur.com/qrIsZKP.jpg,\n", 160 | " https://i.imgur.com/4k9bFUi.jpg]\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "main_content = soup.find(id = 'main-content')\n", 166 | "img_link = main_content.findAll('a', recursive=False)\n", 167 | "pprint(img_link)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## 檢查並下載圖片" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 5, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "def check_and_download_img(url, savedir='download_img'):\n", 184 | " image_resp = requests.get(url, stream=True)\n", 185 | " image = Image.open(image_resp.raw)\n", 186 | " filename = os.path.basename(url)\n", 187 | " \n", 188 | " # check format\n", 189 | " real_filename = '{}.{}'.format(\n", 190 | " filename.split('.')[0],\n", 191 | " image.format.lower()\n", 192 | " )\n", 193 | " print('check and fixed filename {} -> {}'.format(filename, real_filename))\n", 194 | " \n", 195 | " # download\n", 196 | " if not os.path.exists(savedir):\n", 197 | " os.makedirs(savedir)\n", 198 | " savepath = os.path.join(savedir, real_filename)\n", 199 | " image.save(savepath)\n", 200 | " print('save imag - {}'.format(savepath))" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 6, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "check and fixed filename HdI5e8G.jpg -> HdI5e8G.jpeg\n", 213 | "save imag - download_img/HdI5e8G.jpeg\n", 214 | "check and fixed filename 6W5aQk2.jpg -> 6W5aQk2.jpeg\n", 215 | "save imag - download_img/6W5aQk2.jpeg\n", 216 | "check and fixed filename PhhH8ga.jpg -> PhhH8ga.jpeg\n", 217 | "save imag - download_img/PhhH8ga.jpeg\n", 218 | "check and fixed filename zF6ZwFj.jpg -> zF6ZwFj.jpeg\n", 219 | "save imag - download_img/zF6ZwFj.jpeg\n", 220 | "check and 
fixed filename 4CXovkJ.jpg -> 4CXovkJ.jpeg\n", 221 | "save imag - download_img/4CXovkJ.jpeg\n", 222 | "check and fixed filename NFwopB9.jpg -> NFwopB9.jpeg\n", 223 | "save imag - download_img/NFwopB9.jpeg\n", 224 | "check and fixed filename BFlIDmf.jpg -> BFlIDmf.jpeg\n", 225 | "save imag - download_img/BFlIDmf.jpeg\n", 226 | "check and fixed filename ARewyx8.jpg -> ARewyx8.jpeg\n", 227 | "save imag - download_img/ARewyx8.jpeg\n", 228 | "check and fixed filename LK4fnZX.jpg -> LK4fnZX.jpeg\n", 229 | "save imag - download_img/LK4fnZX.jpeg\n", 230 | "check and fixed filename AjTWzRW.jpg -> AjTWzRW.jpeg\n", 231 | "save imag - download_img/AjTWzRW.jpeg\n", 232 | "check and fixed filename daJpBHQ.jpg -> daJpBHQ.jpeg\n", 233 | "save imag - download_img/daJpBHQ.jpeg\n", 234 | "check and fixed filename X2RqYU6.jpg -> X2RqYU6.jpeg\n", 235 | "save imag - download_img/X2RqYU6.jpeg\n", 236 | "check and fixed filename j8rj172.jpg -> j8rj172.jpeg\n", 237 | "save imag - download_img/j8rj172.jpeg\n", 238 | "check and fixed filename nNnAJFf.jpg -> nNnAJFf.jpeg\n", 239 | "save imag - download_img/nNnAJFf.jpeg\n", 240 | "check and fixed filename dwZAQu1.jpg -> dwZAQu1.jpeg\n", 241 | "save imag - download_img/dwZAQu1.jpeg\n", 242 | "check and fixed filename 7ibAOi8.jpg -> 7ibAOi8.jpeg\n", 243 | "save imag - download_img/7ibAOi8.jpeg\n", 244 | "check and fixed filename YTaD5bs.jpg -> YTaD5bs.jpeg\n", 245 | "save imag - download_img/YTaD5bs.jpeg\n", 246 | "check and fixed filename FzwkWYt.jpg -> FzwkWYt.jpeg\n", 247 | "save imag - download_img/FzwkWYt.jpeg\n", 248 | "check and fixed filename NflbWR5.jpg -> NflbWR5.jpeg\n", 249 | "save imag - download_img/NflbWR5.jpeg\n", 250 | "check and fixed filename 6sqAzjT.jpg -> 6sqAzjT.jpeg\n", 251 | "save imag - download_img/6sqAzjT.jpeg\n", 252 | "check and fixed filename KmEAkaP.jpg -> KmEAkaP.jpeg\n", 253 | "save imag - download_img/KmEAkaP.jpeg\n", 254 | "check and fixed filename 73yb0Ao.jpg -> 73yb0Ao.jpeg\n", 255 | "save imag - download_img/73yb0Ao.jpeg\n", 256 | "check and fixed filename K6ukMIf.jpg -> K6ukMIf.jpeg\n", 257 | "save imag - download_img/K6ukMIf.jpeg\n", 258 | "check and fixed filename 3BFzLjv.jpg -> 3BFzLjv.jpeg\n", 259 | "save imag - download_img/3BFzLjv.jpeg\n", 260 | "check and fixed filename 72a2Bas.jpg -> 72a2Bas.jpeg\n", 261 | "save imag - download_img/72a2Bas.jpeg\n", 262 | "check and fixed filename 89GSqqx.jpg -> 89GSqqx.jpeg\n", 263 | "save imag - download_img/89GSqqx.jpeg\n", 264 | "check and fixed filename 9CSJ3M5.jpg -> 9CSJ3M5.jpeg\n", 265 | "save imag - download_img/9CSJ3M5.jpeg\n", 266 | "check and fixed filename NgKEiFz.jpg -> NgKEiFz.jpeg\n", 267 | "save imag - download_img/NgKEiFz.jpeg\n", 268 | "check and fixed filename aN6aYyo.jpg -> aN6aYyo.jpeg\n", 269 | "save imag - download_img/aN6aYyo.jpeg\n", 270 | "check and fixed filename O2KNZJV.jpg -> O2KNZJV.jpeg\n", 271 | "save imag - download_img/O2KNZJV.jpeg\n", 272 | "check and fixed filename WvjeC9N.jpg -> WvjeC9N.jpeg\n", 273 | "save imag - download_img/WvjeC9N.jpeg\n", 274 | "check and fixed filename bG8O5he.jpg -> bG8O5he.jpeg\n", 275 | "save imag - download_img/bG8O5he.jpeg\n", 276 | "check and fixed filename aJ7Lt7l.jpg -> aJ7Lt7l.jpeg\n", 277 | "save imag - download_img/aJ7Lt7l.jpeg\n", 278 | "check and fixed filename bNVe7S2.jpg -> bNVe7S2.jpeg\n", 279 | "save imag - download_img/bNVe7S2.jpeg\n", 280 | "check and fixed filename LxOXwCC.jpg -> LxOXwCC.jpeg\n", 281 | "save imag - download_img/LxOXwCC.jpeg\n", 282 | "check and fixed filename wI5TKjP.jpg -> wI5TKjP.jpeg\n", 283 
| "save imag - download_img/wI5TKjP.jpeg\n", 284 | "check and fixed filename TW8c7ei.jpg -> TW8c7ei.jpeg\n", 285 | "save imag - download_img/TW8c7ei.jpeg\n", 286 | "check and fixed filename xl4zx8N.jpg -> xl4zx8N.jpeg\n", 287 | "save imag - download_img/xl4zx8N.jpeg\n", 288 | "check and fixed filename kbY3glw.jpg -> kbY3glw.jpeg\n", 289 | "save imag - download_img/kbY3glw.jpeg\n", 290 | "check and fixed filename Aa3utxo.jpg -> Aa3utxo.jpeg\n", 291 | "save imag - download_img/Aa3utxo.jpeg\n", 292 | "check and fixed filename zPfERpw.jpg -> zPfERpw.jpeg\n", 293 | "save imag - download_img/zPfERpw.jpeg\n", 294 | "check and fixed filename vXAbWHR.jpg -> vXAbWHR.jpeg\n", 295 | "save imag - download_img/vXAbWHR.jpeg\n", 296 | "check and fixed filename I7hUgF4.jpg -> I7hUgF4.jpeg\n", 297 | "save imag - download_img/I7hUgF4.jpeg\n", 298 | "check and fixed filename KOu9YRR.jpg -> KOu9YRR.jpeg\n", 299 | "save imag - download_img/KOu9YRR.jpeg\n", 300 | "check and fixed filename WvjeC9N.jpg -> WvjeC9N.jpeg\n", 301 | "save imag - download_img/WvjeC9N.jpeg\n", 302 | "check and fixed filename PtXgokJ.jpg -> PtXgokJ.jpeg\n", 303 | "save imag - download_img/PtXgokJ.jpeg\n", 304 | "check and fixed filename 2sF8O4u.jpg -> 2sF8O4u.jpeg\n", 305 | "save imag - download_img/2sF8O4u.jpeg\n", 306 | "check and fixed filename ZnEC7Jf.jpg -> ZnEC7Jf.jpeg\n", 307 | "save imag - download_img/ZnEC7Jf.jpeg\n", 308 | "check and fixed filename zqEwg69.jpg -> zqEwg69.jpeg\n", 309 | "save imag - download_img/zqEwg69.jpeg\n", 310 | "check and fixed filename I6QeEsc.jpg -> I6QeEsc.jpeg\n", 311 | "save imag - download_img/I6QeEsc.jpeg\n", 312 | "check and fixed filename XDLSNW4.jpg -> XDLSNW4.jpeg\n", 313 | "save imag - download_img/XDLSNW4.jpeg\n", 314 | "check and fixed filename 4KZ6JOH.jpg -> 4KZ6JOH.jpeg\n", 315 | "save imag - download_img/4KZ6JOH.jpeg\n", 316 | "check and fixed filename ixuwTe5.jpg -> ixuwTe5.jpeg\n", 317 | "save imag - download_img/ixuwTe5.jpeg\n", 318 | "check and fixed filename 6wShMfE.jpg -> 6wShMfE.jpeg\n", 319 | "save imag - download_img/6wShMfE.jpeg\n", 320 | "check and fixed filename 6TK1rp5.jpg -> 6TK1rp5.jpeg\n", 321 | "save imag - download_img/6TK1rp5.jpeg\n", 322 | "check and fixed filename Mtf5Hz5.jpg -> Mtf5Hz5.jpeg\n", 323 | "save imag - download_img/Mtf5Hz5.jpeg\n", 324 | "check and fixed filename XLB5kPg.jpg -> XLB5kPg.jpeg\n", 325 | "save imag - download_img/XLB5kPg.jpeg\n", 326 | "check and fixed filename xIyvraR.jpg -> xIyvraR.jpeg\n", 327 | "save imag - download_img/xIyvraR.jpeg\n", 328 | "check and fixed filename enTsU1Z.jpg -> enTsU1Z.jpeg\n", 329 | "save imag - download_img/enTsU1Z.jpeg\n", 330 | "check and fixed filename 3YHKqwJ.jpg -> 3YHKqwJ.jpeg\n", 331 | "save imag - download_img/3YHKqwJ.jpeg\n", 332 | "check and fixed filename mNGnRU7.jpg -> mNGnRU7.jpeg\n", 333 | "save imag - download_img/mNGnRU7.jpeg\n", 334 | "check and fixed filename 5ughnWE.jpg -> 5ughnWE.jpeg\n", 335 | "save imag - download_img/5ughnWE.jpeg\n", 336 | "check and fixed filename AA8U6Al.jpg -> AA8U6Al.jpeg\n", 337 | "save imag - download_img/AA8U6Al.jpeg\n", 338 | "check and fixed filename juPKVUR.jpg -> juPKVUR.jpeg\n", 339 | "save imag - download_img/juPKVUR.jpeg\n", 340 | "check and fixed filename M2mJx5N.jpg -> M2mJx5N.jpeg\n", 341 | "save imag - download_img/M2mJx5N.jpeg\n", 342 | "check and fixed filename 8Kwd9Rc.jpg -> 8Kwd9Rc.jpeg\n", 343 | "save imag - download_img/8Kwd9Rc.jpeg\n", 344 | "check and fixed filename KmRqaPE.jpg -> KmRqaPE.jpeg\n", 345 | "save imag - download_img/KmRqaPE.jpeg\n", 346 | 
"check and fixed filename FIjGDka.jpg -> FIjGDka.jpeg\n", 347 | "save imag - download_img/FIjGDka.jpeg\n", 348 | "check and fixed filename DB0Zu8Q.jpg -> DB0Zu8Q.jpeg\n", 349 | "save imag - download_img/DB0Zu8Q.jpeg\n", 350 | "check and fixed filename t8S3vno.png -> t8S3vno.png\n", 351 | "save imag - download_img/t8S3vno.png\n", 352 | "check and fixed filename MJxZfgi.jpg -> MJxZfgi.jpeg\n", 353 | "save imag - download_img/MJxZfgi.jpeg\n", 354 | "check and fixed filename G2dw8Cp.jpg -> G2dw8Cp.jpeg\n", 355 | "save imag - download_img/G2dw8Cp.jpeg\n", 356 | "check and fixed filename 1CwI4YX.jpg -> 1CwI4YX.jpeg\n", 357 | "save imag - download_img/1CwI4YX.jpeg\n", 358 | "check and fixed filename wSShBG7.jpg -> wSShBG7.jpeg\n", 359 | "save imag - download_img/wSShBG7.jpeg\n", 360 | "check and fixed filename kIS1BTe.jpg -> kIS1BTe.jpeg\n", 361 | "save imag - download_img/kIS1BTe.jpeg\n", 362 | "check and fixed filename 3zG4M7q.jpg -> 3zG4M7q.jpeg\n", 363 | "save imag - download_img/3zG4M7q.jpeg\n", 364 | "check and fixed filename xhIgdYH.jpg -> xhIgdYH.jpeg\n", 365 | "save imag - download_img/xhIgdYH.jpeg\n", 366 | "check and fixed filename Xaefcnj.jpg -> Xaefcnj.jpeg\n", 367 | "save imag - download_img/Xaefcnj.jpeg\n", 368 | "check and fixed filename VOfcZ6l.jpg -> VOfcZ6l.jpeg\n", 369 | "save imag - download_img/VOfcZ6l.jpeg\n", 370 | "check and fixed filename 0MvMt9H.jpg -> 0MvMt9H.jpeg\n", 371 | "save imag - download_img/0MvMt9H.jpeg\n", 372 | "check and fixed filename gTBGELL.jpg -> gTBGELL.jpeg\n", 373 | "save imag - download_img/gTBGELL.jpeg\n", 374 | "check and fixed filename mDkgG5m.jpg -> mDkgG5m.jpeg\n", 375 | "save imag - download_img/mDkgG5m.jpeg\n", 376 | "check and fixed filename 6zItH1z.jpg -> 6zItH1z.jpeg\n", 377 | "save imag - download_img/6zItH1z.jpeg\n", 378 | "check and fixed filename Ikp4oXG.jpg -> Ikp4oXG.jpeg\n", 379 | "save imag - download_img/Ikp4oXG.jpeg\n", 380 | "check and fixed filename ge0XrdB.jpg -> ge0XrdB.jpeg\n", 381 | "save imag - download_img/ge0XrdB.jpeg\n", 382 | "check and fixed filename qrIsZKP.jpg -> qrIsZKP.jpeg\n", 383 | "save imag - download_img/qrIsZKP.jpeg\n", 384 | "check and fixed filename 4k9bFUi.jpg -> 4k9bFUi.jpeg\n", 385 | "save imag - download_img/4k9bFUi.jpeg\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "for tag in img_link:\n", 391 | " check_and_download_img(tag['href'])" 392 | ] 393 | } 394 | ], 395 | "metadata": { 396 | "kernelspec": { 397 | "display_name": "Python 3", 398 | "language": "python", 399 | "name": "python3" 400 | }, 401 | "language_info": { 402 | "codemirror_mode": { 403 | "name": "ipython", 404 | "version": 3 405 | }, 406 | "file_extension": ".py", 407 | "mimetype": "text/x-python", 408 | "name": "python", 409 | "nbconvert_exporter": "python", 410 | "pygments_lexer": "ipython3", 411 | "version": "3.6.6" 412 | } 413 | }, 414 | "nbformat": 4, 415 | "nbformat_minor": 2 416 | } 417 | -------------------------------------------------------------------------------- /appendix_ptt/README.md: -------------------------------------------------------------------------------- 1 | # Ptt Crawler 2 | 3 | > This crawler is basically with reference to [jwlin/ptt-web-crawler](https://github.com/jwlin/ptt-web-crawler) 4 | --------------------------------------------------------------------------------