├── README.md ├── 第一版 ├── Cha 10 -登录与验证码处理 │ ├── Cha 10 -登录与验证码处理.ipynb │ ├── captcha.jpg │ ├── captcha_gray.jpg │ ├── captcha_thresholded.jpg │ └── cookies ├── Cha 11 -服务器采集 │ ├── Cha 11 -服务器采集.ipynb │ ├── tor1.py │ ├── tor2.py │ └── tor3.py ├── Cha 12 -分布式爬虫 │ ├── .ipynb_checkpoints │ │ └── Cha 12 - 分布式爬虫-checkpoint.ipynb │ ├── 1497099934.jpg │ ├── Cha 12 - 分布式爬虫.ipynb │ ├── alexa.txt │ ├── master.py │ └── slave.py ├── Cha 13 -爬虫实战一:维基百科 │ ├── Cha 12 -爬虫实战一:维基百科.ipynb │ └── link_12-3.txt ├── Cha 14 -爬虫实战二:知乎Live │ └── Cha 14 -爬虫实战二:知乎Live.ipynb ├── Cha 15 -爬虫实战三:百度地图API │ ├── Cha 14 -爬虫实战三:百度地图API.ipynb │ └── cities.txt ├── Cha 16 -爬虫实战四:餐厅评价 │ └── Cha 16 -爬虫实战四:餐厅评价.ipynb ├── Cha 2 - 编写你的第一个网络爬虫 │ ├── Cha 2 -编写你的第一个网络爬虫.ipynb │ └── Cha 2 _章末实战.ipynb ├── Cha 3 -静态网页抓取 │ ├── Cha 3 -静态网页抓取.ipynb │ └── Cha 3 _章末实战.ipynb ├── Cha 4 -动态网页抓取 │ ├── Cha 4 -动态网页抓取.ipynb │ └── Cha 4 _章末实战.ipynb ├── Cha 5 -解析网页 │ ├── Cha 5 -解析网页.ipynb │ └── Cha 5 _章末实战.ipynb ├── Cha 6 -数据储存 │ ├── Cha 6 -数据存储.ipynb │ └── Cha 6 _章末实战.ipynb ├── Cha 7 -提升爬虫的速度 │ ├── Cha 7 -提升爬虫的速度.ipynb │ ├── Get Alexa.ipynb │ ├── alexa.txt │ ├── cha7 │ │ ├── __pycache__ │ │ │ ├── multiprocess_test.cpython-35.pyc │ │ │ ├── multiprocess_test.cpython-36.pyc │ │ │ ├── thread_test.cpython-35.pyc │ │ │ └── thread_test.cpython-36.pyc │ │ ├── alexa.txt │ │ ├── gevent1.py │ │ ├── gevent_test.py │ │ ├── multiprocess_test.py │ │ ├── mutilprocess1.py │ │ ├── mutilprocess2.py │ │ ├── mutilprocess3.py │ │ ├── result.txt │ │ ├── result_gevent.txt │ │ ├── result_single_time.txt │ │ ├── thread1.py │ │ ├── thread2.py │ │ ├── thread_test.py │ │ ├── time_spend 2.py │ │ └── time_spend.py │ ├── 多协程.png │ ├── 多线程.png │ └── 多进程.png ├── Cha 8 -反爬虫问题 │ └── Cha 8 -反爬虫问题.ipynb └── Cha 9 -解决中文乱码 │ └── Cha 9 -解决中文乱码.ipynb └── 第二版 ├── Cha 10 -解决中文乱码 └── Cha 10 -解决中文乱码.ipynb ├── Cha 11 -登录与验证码处理 ├── Cha 11 -登录与验证码处理.ipynb ├── captcha.jpg ├── captcha_gray.jpg ├── captcha_thresholded.jpg └── cookies ├── Cha 12 -服务器采集 ├── Cha 12 -服务器采集.ipynb ├── tor1.py ├── tor2.py └── tor3.py ├── Cha 13 -分布式爬虫 ├── 1497099934.jpg ├── Cha 13 - 分布式爬虫.ipynb ├── alexa.txt ├── master.py └── slave.py ├── Cha 14 -爬虫实战一:维基百科 ├── Cha 14 -爬虫实战一:维基百科.ipynb └── link_12-3.txt ├── Cha 15 -爬虫实战二:知乎Live └── Cha 15 -爬虫实战二:知乎Live.ipynb ├── Cha 16 -爬虫实战三:百度地图API ├── Cha 16 -爬虫实战三:百度地图API.ipynb └── cities.txt ├── Cha 17 -爬虫实战四:图书信息 ├── Cha 17 -爬虫实战四:图书信息.ipynb ├── book_list.csv └── book_review.csv ├── Cha 2 - 编写你的第一个网络爬虫 ├── Cha 2 -编写你的第一个网络爬虫.ipynb ├── Cha 2 _章末实战.ipynb └── title_test.txt ├── Cha 3 -静态网页抓取 ├── Cha 3 -静态网页抓取.ipynb ├── Cha 3 _章末实战.ipynb └── Cha 3_自我实践(章末).ipynb ├── Cha 4 -动态网页抓取 ├── Cha 4 -动态网页抓取.ipynb ├── Cha 4 _章末实战.ipynb ├── Cha 4 _自我实践(章末).ipynb └── geckodriver.log ├── Cha 5 -解析网页 ├── Cha 5 -解析网页.ipynb ├── Cha 5 _章末实战.ipynb └── Cha 5 _自我实践(章末).ipynb ├── Cha 6 -数据储存 ├── Cha 6 -数据存储.ipynb ├── Cha 6 _章末实战.ipynb ├── test.csv └── test2.csv ├── Cha 7 -Scrapy爬虫框架 ├── Cha7 - 自我实践(章末)答案 │ └── financeSpider │ │ ├── financeSpider │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── finance.cpython-36.pyc │ │ │ └── finance.py │ │ ├── result.txt │ │ └── scrapy.cfg ├── blogSpider │ ├── article.csv │ ├── article.json │ ├── blogSpider │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── 
__init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── santostang - 副本.cpython-36.pyc │ │ │ └── santostang.cpython-36.pyc │ │ │ ├── santostang - 副本.py │ │ │ └── santostang.py │ ├── index.html │ ├── result.txt │ └── scrapy.cfg └── financeSpider │ ├── financeSpider │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ ├── pipelines.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── finance.cpython-36.pyc │ │ └── finance.py │ ├── result.txt │ └── scrapy.cfg ├── Cha 8 -提升爬虫的速度 ├── Cha 8 -提升爬虫的速度.ipynb ├── alexa.txt ├── cha8 │ ├── __pycache__ │ │ ├── multiprocess_test.cpython-35.pyc │ │ ├── multiprocess_test.cpython-36.pyc │ │ ├── thread_test.cpython-35.pyc │ │ └── thread_test.cpython-36.pyc │ ├── alexa.txt │ ├── gevent1.py │ ├── gevent_test.py │ ├── multiprocess_test.py │ ├── mutilprocess1.py │ ├── mutilprocess2.py │ ├── mutilprocess3.py │ ├── result.txt │ ├── result_gevent.txt │ ├── result_single_time.txt │ ├── thread1.py │ ├── thread2.py │ ├── thread_test.py │ ├── time_spend 2.py │ └── time_spend.py ├── 多协程.png ├── 多线程.png └── 多进程.png ├── Cha 9 -反爬虫问题 └── Cha 9 -反爬虫问题.ipynb └── geckodriver.exe /README.md: -------------------------------------------------------------------------------- 1 | # PythonScraping 2 | 此为《Python 网络爬虫:从入门到实践》的源代码,欢迎读者使用学习。 3 | 本人对代码拥有知识产权,如果读者需要使用其中的代码,请在注释中写明作者:唐松,来自《Python 网络爬虫:从入门到实践》。 4 | 5 | 本书分为两版, 6 | 7 | 第二版为2019年出版《Python 网络爬虫:从入门到实践》,相较第一版,本代码加入了每章课后练习的答案代码。 8 | 9 | 京东:[《Python网络爬虫从入门到实践 第2版》(唐松)- 京东图书](https://item.jd.com/12536063.html) 10 | 11 | 当当:[《Python网络爬虫从入门到实践 第2版》(唐松)- 当当图书](http://product.dangdang.com/27882003.html) 12 | 13 |
14 | 第一版为2017年出版《Python 网络爬虫:从入门到实践》 15 | 16 | 京东:[《Python网络爬虫从入门到实践》(唐松,陈智铨)- 京东图书](http://item.jd.com/12180379.html) 17 | 18 | 当当:[《Python网络爬虫从入门到实践》(唐松 陈智铨)- 当当图书](http://product.dangdang.com/25162123.html) 19 | 20 | 天猫:[Python网络爬虫从入门到实践 ](https://detail.tmall.com/item.htm?id=558781742115) 21 | 22 | Code for *Web Crawling with Python* Published by China Machine Press in 2019 and 2017 23 | 24 | 25 | -------------------------------------------------------------------------------- /第一版/Cha 10 -登录与验证码处理/captcha.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 10 -登录与验证码处理/captcha.jpg -------------------------------------------------------------------------------- /第一版/Cha 10 -登录与验证码处理/captcha_gray.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 10 -登录与验证码处理/captcha_gray.jpg -------------------------------------------------------------------------------- /第一版/Cha 10 -登录与验证码处理/captcha_thresholded.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 10 -登录与验证码处理/captcha_thresholded.jpg -------------------------------------------------------------------------------- /第一版/Cha 10 -登录与验证码处理/cookies: -------------------------------------------------------------------------------- 1 | #LWP-Cookies-2.0 2 | Set-Cookie3: wordpress_logged_in_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1495985122%7CEbrRwEjpLk1v6MIpVKS8KzpNCw3iZlzA0WBqM8JpovA%7Ce97d29b8b77573ce9983896968d8ea9acd5d8959465c15650e0b5a82f4a2c156"; path="/"; domain="www.santostang.com"; path_spec; expires="2017-05-29 03:25:00Z"; httponly=None; version=0 3 | Set-Cookie3: wordpress_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1495985122%7CEbrRwEjpLk1v6MIpVKS8KzpNCw3iZlzA0WBqM8JpovA%7Cd3c089e89e99da8d6b43228440e7c791ddce3e2ef172b03bfebc7692e49b8fb8"; path="/wp-admin"; domain="www.santostang.com"; path_spec; expires="2017-05-29 03:25:00Z"; httponly=None; version=0 4 | Set-Cookie3: wordpress_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1495985122%7CEbrRwEjpLk1v6MIpVKS8KzpNCw3iZlzA0WBqM8JpovA%7Cd3c089e89e99da8d6b43228440e7c791ddce3e2ef172b03bfebc7692e49b8fb8"; path="/wp-content/plugins"; domain="www.santostang.com"; path_spec; expires="2017-05-29 03:25:00Z"; httponly=None; version=0 5 | -------------------------------------------------------------------------------- /第一版/Cha 11 -服务器采集/tor1.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import socks 3 | import requests 4 | 5 | # Tor使用9150端口为默认的socks端口 6 | socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 7 | socket.socket = socks.socksocket 8 | # 获取这次抓取使用的IP地址 9 | a = requests.get("http://checkip.amazonaws.com").text 10 | 11 | print (a) -------------------------------------------------------------------------------- /第一版/Cha 11 -服务器采集/tor2.py: -------------------------------------------------------------------------------- 1 | from stem import Signal 2 | from stem.control import Controller 3 | import socket 4 | import socks 5 | import requests 6 | import time 7 | 8 | controller = Controller.from_port(port = 9151) 9 | controller.authenticate() 10 | socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 11 | 
socket.socket = socks.socksocket 12 | 13 | total_scrappy_time = 0 14 | total_changeIP_time = 0 15 | 16 | for x in range(0,10): 17 | a = requests.get("http://checkip.amazonaws.com").text 18 | print ("第", x+1, "次IP:", a) 19 | 20 | time1 = time.time() 21 | a = requests.get("http://www.santostang.com/").text 22 | #print (a) 23 | time2 = time.time() 24 | total_scrappy_time = total_scrappy_time + time2-time1 25 | print ("第", x+1, "次抓取花费时间:", time2-time1) 26 | 27 | time3 = time.time() 28 | controller.signal(Signal.NEWNYM) 29 | time.sleep(5) 30 | time4 = time.time() 31 | total_changeIP_time = total_changeIP_time + time4-time3-5 32 | print ("第", x+1, "次更换IP花费时间:", time4-time3-5) 33 | 34 | print ("平均抓取花费时间:", total_scrappy_time/10) 35 | print ("平均更换IP花费时间:", total_changeIP_time/10) 36 | -------------------------------------------------------------------------------- /第一版/Cha 11 -服务器采集/tor3.py: -------------------------------------------------------------------------------- 1 | from stem import Signal 2 | from stem.control import Controller 3 | import socket 4 | import socks 5 | import requests 6 | import time 7 | 8 | #controller = Controller.from_port(port = 9151) 9 | #controller.authenticate() 10 | #socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 11 | #socket.socket = socks.socksocket 12 | 13 | total_scrappy_time = 0 14 | total_changeIP_time = 0 15 | 16 | for x in range(0,10): 17 | #a = requests.get("http://checkip.amazonaws.com").text 18 | #print ("第", x+1, "次IP:", a) 19 | 20 | time1 = time.time() 21 | a = requests.get("http://www.santostang.com/").text 22 | #print (a) 23 | time2 = time.time() 24 | total_scrappy_time = total_scrappy_time + time2-time1 25 | print ("第", x+1, "次抓取花费时间:", time2-time1) 26 | 27 | time3 = time.time() 28 | #controller.signal(Signal.NEWNYM) 29 | time.sleep(5) 30 | time4 = time.time() 31 | total_changeIP_time = total_changeIP_time + time4-time3-5 32 | print ("第", x+1, "次更换IP花费时间:", time4-time3-5) 33 | 34 | print ("平均抓取花费时间:", total_scrappy_time/10) 35 | print ("平均更换IP花费时间:", total_changeIP_time/10) -------------------------------------------------------------------------------- /第一版/Cha 12 -分布式爬虫/.ipynb_checkpoints/Cha 12 - 分布式爬虫-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 12.3 Redis分布式爬虫实战" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 12.3.2 加入任务队列" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "def push_redis_list():\n", 26 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 27 | " print (r.keys('*'))\n", 28 | " \n", 29 | " link_list = []\n", 30 | " with open('alexa.txt', 'r') as file:\n", 31 | " file_list = file.readlines()\n", 32 | " for eachone in file_list:\n", 33 | " link = eachone.split('\\t')[1]\n", 34 | " link = link.replace('\\n','')\n", 35 | " link_list.append(link)\n", 36 | " if len(link_list) == 100:\n", 37 | " break\n", 38 | " \n", 39 | " for url in link_list:\n", 40 | " response = requests.get(url, headers=headers, timeout=20)\n", 41 | " soup = BeautifulSoup(response.text, 'lxml')\n", 42 | " img_list = soup.find_all('img')\n", 43 | " for img in img_list:\n", 44 | " img_url = img['src']\n", 45 | " if img_url != '':\n", 46 | " print (\"加入的图片url: \", img_url)\n", 47 | " r.lpush('img_url',img_url)\n", 48 | 
" print ('现在图片链接的个数为', r.llen('img_url'))\n", 49 | " return" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# 12.3.3 读取任务队列,下载图片" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "def get_img():\n", 68 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 69 | " while True:\n", 70 | " try:\n", 71 | " url = r.lpop('img_url')\n", 72 | " url = url.decode('ascii')\n", 73 | " try:\n", 74 | " response = requests.get(url, headers=headers,timeout = 20)\n", 75 | " name = int(time.time())\n", 76 | " f = open(str(name)+ url[-4:], 'wb')\n", 77 | " f.write(response.content)\n", 78 | " f.close()\n", 79 | " print ('已经获取图片', url)\n", 80 | " except Exception as e:\n", 81 | " print ('爬取图片过程出问题', e)\n", 82 | " time.sleep(3)\n", 83 | " except Exception as e:\n", 84 | " print (e)\n", 85 | " time.sleep(10)\n", 86 | " break\n", 87 | " return " 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# 12.3.4 分布式爬虫代码" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 13, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "import requests\n", 106 | "from bs4 import BeautifulSoup\n", 107 | "import re\n", 108 | "import time\n", 109 | "from redis import Redis\n", 110 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 111 | "\n", 112 | "def push_redis_list():\n", 113 | " #与上面此函数相同\n", 114 | "\n", 115 | "def get_img():\n", 116 | " #与上面此函数相同\n", 117 | "\n", 118 | "if __name__ == '__main__': \n", 119 | " this_machine = 'master' \n", 120 | " print ('开始分布式爬虫')\n", 121 | " if this_machine == 'master':\n", 122 | " push_redis_list()\n", 123 | " else:\n", 124 | " get_img()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 14, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "import requests\n", 136 | "from bs4 import BeautifulSoup\n", 137 | "import re\n", 138 | "import time\n", 139 | "from redis import Redis\n", 140 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 141 | "\n", 142 | "def push_redis_list():\n", 143 | " #与上面此函数相同\n", 144 | "\n", 145 | "def get_img():\n", 146 | " #与上面此函数相同\n", 147 | "\n", 148 | "if __name__ == '__main__': \n", 149 | " this_machine = 'slave' \n", 150 | " print ('开始分布式爬虫')\n", 151 | " if this_machine == 'master':\n", 152 | " push_redis_list()\n", 153 | " else:\n", 154 | " get_img()" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.6.0" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | -------------------------------------------------------------------------------- /第一版/Cha 12 -分布式爬虫/1497099934.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 12 -分布式爬虫/1497099934.jpg -------------------------------------------------------------------------------- /第一版/Cha 12 -分布式爬虫/Cha 12 - 分布式爬虫.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 12.3 Redis分布式爬虫实战" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 12.3.2 加入任务队列" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "def push_redis_list():\n", 26 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 27 | " print (r.keys('*'))\n", 28 | " \n", 29 | " link_list = []\n", 30 | " with open('alexa.txt', 'r') as file:\n", 31 | " file_list = file.readlines()\n", 32 | " for eachone in file_list:\n", 33 | " link = eachone.split('\\t')[1]\n", 34 | " link = link.replace('\\n','')\n", 35 | " link_list.append(link)\n", 36 | " if len(link_list) == 100:\n", 37 | " break\n", 38 | " \n", 39 | " for url in link_list:\n", 40 | " response = requests.get(url, headers=headers, timeout=20)\n", 41 | " soup = BeautifulSoup(response.text, 'lxml')\n", 42 | " img_list = soup.find_all('img')\n", 43 | " for img in img_list:\n", 44 | " img_url = img['src']\n", 45 | " if img_url != '':\n", 46 | " print (\"加入的图片url: \", img_url)\n", 47 | " r.lpush('img_url',img_url)\n", 48 | " print ('现在图片链接的个数为', r.llen('img_url'))\n", 49 | " return" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# 12.3.3 读取任务队列,下载图片" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "def get_img():\n", 68 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 69 | " while True:\n", 70 | " try:\n", 71 | " url = r.lpop('img_url')\n", 72 | " url = url.decode('ascii')\n", 73 | " try:\n", 74 | " response = requests.get(url, headers=headers,timeout = 20)\n", 75 | " name = int(time.time())\n", 76 | " f = open(str(name)+ url[-4:], 'wb')\n", 77 | " f.write(response.content)\n", 78 | " f.close()\n", 79 | " print ('已经获取图片', url)\n", 80 | " except Exception as e:\n", 81 | " print ('爬取图片过程出问题', e)\n", 82 | " time.sleep(3)\n", 83 | " except Exception as e:\n", 84 | " print (e)\n", 85 | " time.sleep(10)\n", 86 | " break\n", 87 | " return " 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# 12.3.4 分布式爬虫代码" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 13, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "import requests\n", 106 | "from bs4 import BeautifulSoup\n", 107 | "import re\n", 108 | "import time\n", 109 | "from redis import Redis\n", 110 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 111 | "\n", 112 | "def push_redis_list():\n", 113 | " #与上面此函数相同\n", 114 | "\n", 115 | "def get_img():\n", 116 | " #与上面此函数相同\n", 117 | "\n", 118 | "if __name__ == '__main__': \n", 119 | " this_machine = 'master' \n", 120 | " print ('开始分布式爬虫')\n", 121 | " if this_machine == 'master':\n", 122 | " push_redis_list()\n", 123 | " else:\n", 124 | " get_img()" 125 | 
] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 14, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "import requests\n", 136 | "from bs4 import BeautifulSoup\n", 137 | "import re\n", 138 | "import time\n", 139 | "from redis import Redis\n", 140 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 141 | "\n", 142 | "def push_redis_list():\n", 143 | " #与上面此函数相同\n", 144 | "\n", 145 | "def get_img():\n", 146 | " #与上面此函数相同\n", 147 | "\n", 148 | "if __name__ == '__main__': \n", 149 | " this_machine = 'slave' \n", 150 | " print ('开始分布式爬虫')\n", 151 | " if this_machine == 'master':\n", 152 | " push_redis_list()\n", 153 | " else:\n", 154 | " get_img()" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.6.0" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | -------------------------------------------------------------------------------- /第一版/Cha 12 -分布式爬虫/master.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import time 5 | from redis import Redis 6 | headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' } 7 | 8 | def push_redis_list(): 9 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 10 | print (r.keys('*')) 11 | 12 | link_list = [] 13 | with open('alexa.txt', 'r') as file: 14 | file_list = file.readlines() 15 | for eachone in file_list: 16 | link = eachone.split('\t')[1] 17 | link = link.replace('\n','') 18 | link_list.append(link) 19 | if len(link_list) == 100: 20 | break 21 | 22 | for url in link_list: 23 | response = requests.get(url, headers=headers, timeout=20) 24 | soup = BeautifulSoup(response.text, 'lxml') 25 | img_list = soup.find_all('img') 26 | for img in img_list: 27 | img_url = img['src'] 28 | if img_url != '': 29 | print ("加入的图片url: ", img_url) 30 | r.lpush('img_url',img_url) 31 | print ('现在图片链接的个数为', r.llen('img_url')) 32 | return 33 | 34 | def get_img(): 35 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 36 | while True: 37 | try: 38 | url = r.lpop('img_url') 39 | url = url.decode('ascii') 40 | try: 41 | response = requests.get(url, headers=headers,timeout = 20) 42 | name = int(time.time()) 43 | f = open(str(name)+ url[-4:], 'wb') 44 | f.write(response.content) 45 | f.close() 46 | print ('已经获取图片', url) 47 | except Exception as e: 48 | print ('爬取图片过程出问题', e) 49 | time.sleep(3) 50 | except Exception as e: 51 | print (e) 52 | time.sleep(10) 53 | break 54 | return 55 | 56 | if __name__ == '__main__': 57 | this_machine = 'master' 58 | print ('开始分布式爬虫') 59 | if this_machine == 'master': 60 | push_redis_list() 61 | else: 62 | get_img() -------------------------------------------------------------------------------- /第一版/Cha 12 -分布式爬虫/slave.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 
| import time 5 | from redis import Redis 6 | headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' } 7 | 8 | def push_redis_list(): 9 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 10 | print (r.keys('*')) 11 | 12 | link_list = [] 13 | with open('alexa.txt', 'r') as file: 14 | file_list = file.readlines() 15 | for eachone in file_list: 16 | link = eachone.split('\t')[1] 17 | link = link.replace('\n','') 18 | link_list.append(link) 19 | if len(link_list) == 100: 20 | break 21 | 22 | for url in link_list: 23 | response = requests.get(url, headers=headers, timeout=20) 24 | soup = BeautifulSoup(response.text, 'lxml') 25 | img_list = soup.find_all('img') 26 | for img in img_list: 27 | img_url = img['src'] 28 | if img_url != '': 29 | print ("加入的图片url: ", img_url) 30 | r.lpush('img_url',img_url) 31 | print ('现在图片链接的个数为', r.llen('img_url')) 32 | return 33 | 34 | def get_img(): 35 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 36 | while True: 37 | try: 38 | url = r.lpop('img_url') 39 | url = url.decode('ascii') 40 | if url[:2] == '//': 41 | url = 'http:' + url 42 | print (url) 43 | try: 44 | response = requests.get(url, headers=headers,timeout = 20) 45 | name = int(time.time()) 46 | f = open(str(name)+ url[-4:], 'wb') 47 | f.write(response.content) 48 | f.close() 49 | print ('已经获取图片', url) 50 | except Exception as e: 51 | print ('爬取图片过程出问题', e) 52 | time.sleep(3) 53 | except Exception as e: 54 | print (e) 55 | time.sleep(10) 56 | break 57 | return 58 | 59 | if __name__ == '__main__': 60 | this_machine = 'slave' 61 | print ('开始分布式爬虫') 62 | if this_machine == 'master': 63 | push_redis_list() 64 | else: 65 | get_img() -------------------------------------------------------------------------------- /第一版/Cha 15 -爬虫实战三:百度地图API/cities.txt: -------------------------------------------------------------------------------- 1 | 南京市 534 2 | 苏州市 698 3 | 无锡市 491 4 | 常州市 292 5 | 南通市 236 6 | 盐城市 227 7 | 徐州市 216 8 | 扬州市 200 9 | 淮安市 145 10 | 泰州市 140 11 | 宿迁市 114 12 | 镇江市 110 13 | 连云港市 78 14 | 杭州市 665 15 | 宁波市 549 16 | 温州市 514 17 | 嘉兴市 301 18 | 金华市 274 19 | 绍兴市 216 20 | 台州市 213 21 | 湖州市 123 22 | 丽水市 87 23 | 衢州市 72 24 | 舟山市 59 25 | 广州市 1357 26 | 深圳市 1022 27 | 东莞市 509 28 | 佛山市 799 29 | 江门市 281 30 | 中山市 200 31 | 惠州市 199 32 | 珠海市 147 33 | 梅州市 132 34 | 湛江市 104 35 | 汕头市 94 36 | 韶关市 83 37 | 揭阳市 82 38 | 肇庆市 72 39 | 清远市 66 40 | 茂名市 61 41 | 阳江市 59 42 | 河源市 51 43 | 汕尾市 48 44 | 潮州市 33 45 | 云浮市 22 46 | 福州市 557 47 | 厦门市 426 48 | 泉州市 411 49 | 漳州市 184 50 | 龙岩市 93 51 | 南平市 91 52 | 三明市 90 53 | 宁德市 78 54 | 莆田市 66 55 | 济南市 223 56 | 青岛市 417 57 | 潍坊市 179 58 | 烟台市 165 59 | 临沂市 133 60 | 威海市 118 61 | 淄博市 95 62 | 济宁市 92 63 | 东营市 73 64 | 泰安市 72 65 | 枣庄市 62 66 | 滨州市 61 67 | 菏泽市 60 68 | 德州市 55 69 | 莱芜市 43 70 | 日照市 43 71 | 聊城市 42 72 | 郑州市 409 73 | 洛阳市 171 74 | 南阳市 102 75 | 平顶山市 64 76 | 新乡市 63 77 | 驻马店市 61 78 | 开封市 58 79 | 焦作市 54 80 | 三门峡市 48 81 | 信阳市 41 82 | 许昌市 40 83 | 周口市 38 84 | 商丘市 37 85 | 安阳市 36 86 | 漯河市 29 87 | 鹤壁市 25 88 | 濮阳市 21 89 | 济源市 18 90 | 石家庄市 287 91 | 保定市 190 92 | 秦皇岛市 157 93 | 唐山市 155 94 | 邯郸市 133 95 | 邢台市 115 96 | 张家口市 96 97 | 沧州市 83 98 | 廊坊市 74 99 | 承德市 49 100 | 衡水市 49 101 | 成都市 590 102 | 南充市 80 103 | 绵阳市 76 104 | 遂宁市 62 105 | 德阳市 58 106 | 内江市 55 107 | 泸州市 54 108 | 宜宾市 43 109 | 乐山市 41 110 | 广安市 40 111 | 攀枝花市 38 112 | 达州市 36 113 | 凉山彝族自治州 36 114 | 广元市 34 115 | 资阳市 34 116 | 自贡市 33 117 | 眉山市 30 118 | 雅安市 30 119 | 甘孜藏族自治州 19 120 | 巴中市 14 121 | 阿坝藏族羌族自治州 11 122 | 沈阳市 311 123 | 大连市 406 124 | 抚顺市 82 125 | 锦州市 70 126 | 
鞍山市 59 127 | 营口市 54 128 | 盘锦市 53 129 | 本溪市 42 130 | 铁岭市 37 131 | 阜新市 34 132 | 葫芦岛市 34 133 | 辽阳市 29 134 | 丹东市 24 135 | 朝阳市 20 136 | 昆明市 402 137 | 西双版纳傣族自治州 140 138 | 玉溪市 71 139 | 大理白族自治州 71 140 | 曲靖市 48 141 | 丽江市 46 142 | 普洱市 39 143 | 红河哈尼族彝族自治州 36 144 | 保山市 33 145 | 临沧市 31 146 | 楚雄彝族自治州 28 147 | 文山壮族苗族自治州 23 148 | 德宏傣族景颇族自治州 20 149 | 迪庆藏族自治州 14 150 | 昭通市 14 151 | 怒江傈僳族自治州 5 152 | 长沙市 367 153 | 株洲市 134 154 | 郴州市 70 155 | 衡阳市 68 156 | 邵阳市 54 157 | 永州市 49 158 | 张家界市 43 159 | 娄底市 41 160 | 岳阳市 38 161 | 怀化市 36 162 | 湘西土家族苗族自治州 34 163 | 常德市 32 164 | 湘潭市 30 165 | 益阳市 22 166 | 武汉市 531 167 | 宜昌市 90 168 | 襄阳市 65 169 | 荆门市 62 170 | 十堰市 46 171 | 黄冈市 46 172 | 黄石市 43 173 | 荆州市 43 174 | 孝感市 42 175 | 咸宁市 38 176 | 随州市 35 177 | 恩施土家族苗族自治州 29 178 | 鄂州市 19 179 | 潜江市 19 180 | 仙桃市 16 181 | 神农架林区 10 182 | 天门市 10 183 | 赣州市 223 184 | 南昌市 145 185 | 九江市 111 186 | 上饶市 100 187 | 吉安市 93 188 | 宜春市 77 189 | 抚州市 75 190 | 萍乡市 53 191 | 新余市 51 192 | 景德镇市 46 193 | 鹰潭市 25 194 | 合肥市 274 195 | 芜湖市 70 196 | 黄山市 59 197 | 滁州市 57 198 | 阜阳市 56 199 | 六安市 55 200 | 安庆市 54 201 | 宿州市 48 202 | 马鞍山市 44 203 | 宣城市 44 204 | 巢湖市 38 205 | 淮南市 35 206 | 亳州市 31 207 | 池州市 27 208 | 淮北市 27 209 | 铜陵市 27 210 | 蚌埠市 25 211 | 太原市 209 212 | 晋中市 89 213 | 临汾市 78 214 | 长治市 70 215 | 大同市 58 216 | 晋城市 56 217 | 运城市 55 218 | 忻州市 47 219 | 阳泉市 45 220 | 吕梁市 33 221 | 朔州市 26 222 | 南宁市 180 223 | 柳州市 85 224 | 桂林市 77 225 | 玉林市 60 226 | 北海市 51 227 | 河池市 47 228 | 百色市 37 229 | 梧州市 35 230 | 贺州市 28 231 | 防城港市 26 232 | 贵港市 26 233 | 钦州市 26 234 | 来宾市 21 235 | 崇左市 18 236 | 西安市 378 237 | 宝鸡市 86 238 | 渭南市 61 239 | 咸阳市 56 240 | 汉中市 49 241 | 榆林市 34 242 | 延安市 31 243 | 铜川市 27 244 | 商洛市 23 245 | 安康市 22 246 | 哈尔滨市 205 247 | 齐齐哈尔市 65 248 | 大庆市 65 249 | 黑河市 57 250 | 牡丹江市 51 251 | 伊春市 47 252 | 佳木斯市 47 253 | 鹤岗市 36 254 | 鸡西市 34 255 | 绥化市 25 256 | 七台河市 22 257 | 双鸭山市 22 258 | 大兴安岭地区 14 259 | 包头市 139 260 | 鄂尔多斯市 101 261 | 呼和浩特市 96 262 | 呼伦贝尔市 55 263 | 赤峰市 45 264 | 乌海市 39 265 | 巴彦淖尔市 33 266 | 乌兰察布市 33 267 | 锡林郭勒盟 33 268 | 阿拉善盟 27 269 | 通辽市 26 270 | 兴安盟 21 271 | 贵阳市 156 272 | 遵义市 67 273 | 黔南布依族苗族自治州 51 274 | 毕节地区 40 275 | 铜仁地区 38 276 | 六盘水市 35 277 | 黔东南苗族侗族自治州 31 278 | 黔西南布依族苗族自治州 21 279 | 安顺市 12 280 | 长春市 201 281 | 延边朝鲜族自治州 90 282 | 吉林市 69 283 | 通化市 35 284 | 松原市 34 285 | 四平市 32 286 | 白山市 29 287 | 辽源市 17 288 | 白城市 15 289 | 兰州市 102 290 | 酒泉市 53 291 | 天水市 35 292 | 武威市 34 293 | 定西市 31 294 | 白银市 30 295 | 张掖市 23 296 | 平凉市 21 297 | 临夏回族自治州 20 298 | 金昌市 19 299 | 陇南市 18 300 | 甘南藏族自治州 11 301 | 嘉峪关市 11 302 | 庆阳市 11 303 | 乌鲁木齐市 110 304 | 昌吉回族自治州 44 305 | 克拉玛依市 31 306 | 伊犁哈萨克自治州 30 307 | 塔城地区 25 308 | 喀什地区 24 309 | 巴音郭楞蒙古自治州 19 310 | 阿勒泰地区 18 311 | 哈密地区 15 312 | 阿克苏地区 14 313 | 吐鲁番地区 13 314 | 和田地区 11 315 | 博尔塔拉蒙古自治州 9 316 | 石河子市 4 317 | 五家渠市 4 318 | 阿拉尔市 2 319 | 图木舒克市 2 320 | 海口市 97 321 | 三亚市 74 322 | 文昌市 13 323 | 儋州市 11 324 | 澄迈县 11 325 | 琼海市 10 326 | 万宁市 8 327 | 昌江黎族自治县 7 328 | 东方市 5 329 | 屯昌县 5 330 | 保亭黎族苗族自治县 4 331 | 定安县 4 332 | 琼中黎族苗族自治县 4 333 | 临高县 3 334 | 陵水黎族自治县 3 335 | 白沙黎族自治县 2 336 | 乐东黎族自治县 2 337 | 五指山市 2 338 | 银川市 105 339 | 吴忠市 36 340 | 石嘴山市 26 341 | 固原市 22 342 | 中卫市 10 343 | 西宁市 73 344 | 海西蒙古族藏族自治州 19 345 | 海南藏族自治州 14 346 | 海北藏族自治州 8 347 | 海东地区 7 348 | 黄南藏族自治州 7 349 | 果洛藏族自治州 3 350 | 玉树藏族自治州 2 351 | 拉萨市 18 352 | 日喀则地区 10 353 | 林芝地区 9 354 | 山南地区 6 355 | 阿里地区 2 356 | 昌都地区 1 357 | 北京市 1499 358 | 上海市 1147 359 | 重庆市 734 360 | 天津市 509 361 | 香港特别行政区 200 362 | 澳门特别行政区 43 363 | -------------------------------------------------------------------------------- /第一版/Cha 2 - 编写你的第一个网络爬虫/Cha 2 _章末实战.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 
| "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 题目1:请使用Python中的循环,打印输出从1到100的所有奇数。" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": { 14 | "scrolled": true 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "1\n", 22 | "3\n", 23 | "5\n", 24 | "7\n", 25 | "9\n", 26 | "11\n", 27 | "13\n", 28 | "15\n", 29 | "17\n", 30 | "19\n", 31 | "21\n", 32 | "23\n", 33 | "25\n", 34 | "27\n", 35 | "29\n", 36 | "31\n", 37 | "33\n", 38 | "35\n", 39 | "37\n", 40 | "39\n", 41 | "41\n", 42 | "43\n", 43 | "45\n", 44 | "47\n", 45 | "49\n", 46 | "51\n", 47 | "53\n", 48 | "55\n", 49 | "57\n", 50 | "59\n", 51 | "61\n", 52 | "63\n", 53 | "65\n", 54 | "67\n", 55 | "69\n", 56 | "71\n", 57 | "73\n", 58 | "75\n", 59 | "77\n", 60 | "79\n", 61 | "81\n", 62 | "83\n", 63 | "85\n", 64 | "87\n", 65 | "89\n", 66 | "91\n", 67 | "93\n", 68 | "95\n", 69 | "97\n", 70 | "99\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "for i in range(1,101):\n", 76 | " if i % 2 == 1:\n", 77 | " print (i)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# 题目2:请将字符串 ”你好$$$我正在学Python@#@#现在需要&%&%&修改字符串” 中的符号变成一个空格,需要输出的格式为:”你好 我正在学Python 现在需要 修改字符串”" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 17, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "你好 我正在学Python 现在需要 修改字符串\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "str1 = '你好$$$我正在学Python@#@#现在需要&%&%&修改字符串'\n", 102 | "str2 = str1.replace('$$$', ' ').replace('@#@#', ' ').replace('&%&%&', ' ')\n", 103 | "print (str2)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 18, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "你好 我正在学Python 现在需要 修改字符串\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "import re\n", 121 | "str1 = '你好$$$我正在学Python@#@#现在需要&%&%&修改字符串'\n", 122 | "str2 = re.sub('[$@#&%]+', ' ' ,str1)\n", 123 | "print (str2)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# 题目3:输出 9*9 乘法口诀表" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 50, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "1x1=1\t\n", 143 | "1x2=2\t2x2=4\t\n", 144 | "1x3=3\t2x3=6\t3x3=9\t\n", 145 | "1x4=4\t2x4=8\t3x4=12\t4x4=16\t\n", 146 | "1x5=5\t2x5=10\t3x5=15\t4x5=20\t5x5=25\t\n", 147 | "1x6=6\t2x6=12\t3x6=18\t4x6=24\t5x6=30\t6x6=36\t\n", 148 | "1x7=7\t2x7=14\t3x7=21\t4x7=28\t5x7=35\t6x7=42\t7x7=49\t\n", 149 | "1x8=8\t2x8=16\t3x8=24\t4x8=32\t5x8=40\t6x8=48\t7x8=56\t8x8=64\t\n", 150 | "1x9=9\t2x9=18\t3x9=27\t4x9=36\t5x9=45\t6x9=54\t7x9=63\t8x9=72\t9x9=81\t\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "for i in range(1, 10):\n", 156 | " for j in range(1, i+1):\n", 157 | " print (\"%dx%d=%d\\t\" % (j, i, i*j), end=\"\")\n", 158 | " print(\"\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# 题目4:请写出一个函数,当输入函数变量当月利润I,能返回应发放奖金总数,例如输出“利润100000元时,应发放奖金总数为10000元。”。\n", 166 | "其中,企业发放的奖金根据利润提成。利润(I)低于或等于10万元时,奖金可提10%;利润高于10万元,低于20万元时,低于10万元的部分按10%提成,高于10万元的部分,可提成7.5%;20万到40万之间时,高于20万元的部分,可提成5%;40万到60万之间时高于40万元的部分,可提成3%;60万到100万之间时,高于60万元的部分,可提成1.5%,高于100万元时,超过100万元的部分按1%提成" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | 
"execution_count": 61, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "净利润:210000\n", 179 | "利润为210000元时,应发奖金总数为18000元\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "def calcute_profit(I):\n", 185 | " I = I / 10000\n", 186 | " if I <= 10:\n", 187 | " a = I * 0.01\n", 188 | " return a * 10000\n", 189 | " elif I <= 20 and I > 10:\n", 190 | " b =0.25 + I * 0.075\n", 191 | " return b * 10000\n", 192 | " elif I <= 40 and I > 20:\n", 193 | " c = 0.75 + I * 0.05\n", 194 | " return c * 10000\n", 195 | " elif I <= 60 and I > 40:\n", 196 | " d = 0.95 + I * 0.03\n", 197 | " return d * 10000\n", 198 | " elif I <= 60 and I > 100:\n", 199 | " e = 2 + I * 0.015\n", 200 | " return e * 10000\n", 201 | " else:\n", 202 | " f = 2.95 + I * 0.01\n", 203 | " return f * 10000\n", 204 | " \n", 205 | "I = int(input('净利润:'))\n", 206 | "profit = calcute_profit(I)\n", 207 | "print ('利润为%d元时,应发奖金总数为%d元' % (I, profit))" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 57, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "净利润:210000\n", 220 | "利润为210000元时,应发奖金总数为18000元\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "def calcute_profit(I):\n", 226 | " arr = [1000000,600000,400000,200000,100000,0] #这应该就是各个分界值了,把它们放在列表里方便访问\n", 227 | " rat = [0.01,0.015,0.03,0.05,0.075,0.1] #这是各个分界值所对应的奖金比例值\n", 228 | " r = 0 #这是总奖金的初始值\n", 229 | " for idx in range(0,6): #有6个分界值当然要循环6次\n", 230 | " if I > arr[idx]:\n", 231 | " r = r + (I - arr[idx]) * rat[idx] \n", 232 | " I = arr[idx]\n", 233 | " return r\n", 234 | "\n", 235 | "I = int(input('净利润:'))\n", 236 | "profit = calcute_profit(I)\n", 237 | "print ('利润为%d元时,应发奖金总数为%d元' % (I, profit))" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "# 题目5:用字典的值对字典进行排序" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 71, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "[(0, 0), (2, 1), (1, 2), (4, 3), (3, 4)]\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "import operator\n", 262 | "x = {1: 2, 3: 4, 4:3, 2:1, 0:0}\n", 263 | "sorted_x = sorted(x.items(), key=operator.itemgetter(1))\n", 264 | "print (sorted_x)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "# 题目6:请问一下两段代码的输出分别是什么?" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 72, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "1\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "a = 1\n", 289 | "def fun(a):\n", 290 | " a = 2\n", 291 | "fun(a)\n", 292 | "print (a)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 74, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "[1]\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "a = []\n", 310 | "def fun(a):\n", 311 | " a.append(1)\n", 312 | "fun(a)\n", 313 | "print (a)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "# 题目7: 请问以下两段代码的输出分别是什么?" 
321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 76, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "bbb\n", 333 | "aaa\n", 334 | "aaa\n" 335 | ] 336 | } 337 | ], 338 | "source": [ 339 | "class Person:\n", 340 | " name=\"aaa\"\n", 341 | "\n", 342 | "p1=Person()\n", 343 | "p2=Person()\n", 344 | "p1.name=\"bbb\"\n", 345 | "print (p1.name)\n", 346 | "print (p2.name)\n", 347 | "print (Person.name)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 77, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "[1]\n", 360 | "[1]\n", 361 | "[1]\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "class Person:\n", 367 | " name=[]\n", 368 | "\n", 369 | "p1=Person()\n", 370 | "p2=Person()\n", 371 | "p1.name.append(1)\n", 372 | "print (p1.name)\n", 373 | "print (p2.name)\n", 374 | "print (Person.name)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "collapsed": true 382 | }, 383 | "outputs": [], 384 | "source": [] 385 | } 386 | ], 387 | "metadata": { 388 | "kernelspec": { 389 | "display_name": "Python 3", 390 | "language": "python", 391 | "name": "python3" 392 | }, 393 | "language_info": { 394 | "codemirror_mode": { 395 | "name": "ipython", 396 | "version": 3 397 | }, 398 | "file_extension": ".py", 399 | "mimetype": "text/x-python", 400 | "name": "python", 401 | "nbconvert_exporter": "python", 402 | "pygments_lexer": "ipython3", 403 | "version": "3.6.1" 404 | }, 405 | "toc": { 406 | "colors": { 407 | "hover_highlight": "#DAA520", 408 | "navigate_num": "#000000", 409 | "navigate_text": "#333333", 410 | "running_highlight": "#FF0000", 411 | "selected_highlight": "#FFD700", 412 | "sidebar_border": "#EEEEEE", 413 | "wrapper_background": "#FFFFFF" 414 | }, 415 | "moveMenuLeft": true, 416 | "nav_menu": { 417 | "height": "153px", 418 | "width": "252px" 419 | }, 420 | "navigate_menu": true, 421 | "number_sections": true, 422 | "sideBar": true, 423 | "threshold": 4, 424 | "toc_cell": false, 425 | "toc_section_display": "block", 426 | "toc_window_display": false, 427 | "widenNotebook": false 428 | } 429 | }, 430 | "nbformat": 4, 431 | "nbformat_minor": 2 432 | } 433 | -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/Get Alexa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import requests\n", 12 | "from bs4 import BeautifulSoup\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "20\n", 28 | "40\n", 29 | "60\n", 30 | "80\n", 31 | "100\n", 32 | "120\n", 33 | "140\n", 34 | "160\n", 35 | "180\n", 36 | "200\n", 37 | "220\n", 38 | "240\n", 39 | "260\n", 40 | "280\n", 41 | "300\n", 42 | "320\n", 43 | "340\n", 44 | "360\n", 45 | "380\n", 46 | "400\n", 47 | "420\n", 48 | "440\n", 49 | "460\n", 50 | "480\n", 51 | "500\n", 52 | "520\n", 53 | "540\n", 54 | "560\n", 55 | "580\n", 56 | "600\n", 57 | "620\n", 58 | "640\n", 59 | "660\n", 60 | "680\n", 61 | "700\n", 62 | "720\n", 63 | "740\n", 64 | "760\n", 65 | 
"780\n", 66 | "800\n", 67 | "820\n", 68 | "840\n", 69 | "860\n", 70 | "880\n", 71 | "900\n", 72 | "920\n", 73 | "940\n", 74 | "960\n", 75 | "980\n", 76 | "1000\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "num = 0\n", 82 | "for i in range(1,51):\n", 83 | " r = requests.get('http://www.alexa.cn/siterank/' + str(i))\n", 84 | " soup = BeautifulSoup(r.text, \"lxml\")\n", 85 | " span_list = soup.find_all('span', class_ = 'domain-link') \n", 86 | " link_list = [(str(j + num), span_list[j].a['href']) for j in range(len(span_list))]\n", 87 | " num = num + len(link_list)\n", 88 | " \n", 89 | " output = \"\\n\".join(\"%s\\t%s\" % tup for tup in link_list) + \"\\n\"\n", 90 | " print (num)\n", 91 | " with open('C:\\\\Users\\\\Administrator\\\\Desktop\\\\alexa.txt', 'a+', encoding = 'utf-8') as f:\n", 92 | " f.write(output)\n", 93 | " f.close\n", 94 | " time.sleep(3)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 3", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.6.0" 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 2 128 | } 129 | -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/multiprocess_test.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/multiprocess_test.cpython-35.pyc -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/multiprocess_test.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/multiprocess_test.cpython-36.pyc -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/thread_test.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/thread_test.cpython-35.pyc -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/thread_test.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/thread_test.cpython-36.pyc -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/gevent1.py: -------------------------------------------------------------------------------- 1 | import gevent 2 | from gevent.queue import Queue, Empty 3 | import time 4 | import requests 5 | 6 | from gevent import monkey#把下面有可能有IO操作的单独做上标记 7 | monkey.patch_all() # 将IO转为异步执行的函数 8 | 9 | 
link_list = [] 10 | with open('alexa.txt', 'r') as file: 11 | file_list = file.readlines() 12 | for eachone in file_list: 13 | link = eachone.split('\t')[1] 14 | link = link.replace('\n','') 15 | link_list.append(link) 16 | 17 | start = time.time() 18 | def crawler(index): 19 | Process_id = 'Process-' + str(index) 20 | while not workQueue.empty(): 21 | url = workQueue.get(timeout=2) 22 | try: 23 | r = requests.get(url, timeout=20) 24 | print (Process_id, workQueue.qsize(), r.status_code, url) 25 | except Exception as e: 26 | print (Process_id, workQueue.qsize(), url, 'Error: ', e) 27 | 28 | def boss(): 29 | for url in link_list: 30 | workQueue.put_nowait(url) 31 | 32 | if __name__ == '__main__': 33 | workQueue = Queue(1000) 34 | 35 | gevent.spawn(boss).join() 36 | jobs = [] 37 | for i in range(10): 38 | jobs.append(gevent.spawn(crawler, i)) 39 | gevent.joinall(jobs) 40 | 41 | end = time.time() 42 | print ('gevent + Queue多协程爬虫的总时间为:', end-start) 43 | print ('Main Ended!') 44 | -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/gevent_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import gevent 5 | from gevent.queue import Queue, Empty 6 | import time 7 | import requests 8 | 9 | from gevent import monkey#把下面有可能有IO操作的单独做上标记 10 | monkey.patch_all() # 将IO转为异步执行的函数 11 | 12 | start = time.time() 13 | workQueue = Queue(1000) 14 | def crawler(index): 15 | Process_id = 'Process-' + str(index) 16 | while not workQueue.empty(): 17 | url = workQueue.get(timeout=2) 18 | try: 19 | r = requests.get(url, timeout=20) 20 | print (Process_id, workQueue.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, workQueue.qsize(), url, 'Error: ', e) 23 | 24 | def boss(link_list): 25 | for url in link_list: 26 | workQueue.put_nowait(url) 27 | 28 | def gevent_main(link_list, g_num): 29 | gevent.spawn(boss,link_list).join() 30 | jobs = [] 31 | for i in range(g_num): 32 | jobs.append(gevent.spawn(crawler, i)) 33 | gevent.joinall(jobs) 34 | 35 | end = time.time() 36 | time_spend = end-start 37 | print ('gevent + Queue多协程爬虫的总时间为:', time_spend) 38 | print ('Main Ended!') 39 | return time_spend 40 | 41 | if __name__ == '__main__': 42 | link_list = [] 43 | with open('alexa.txt', 'r') as file: 44 | file_list = file.readlines() 45 | for eachone in file_list: 46 | link = eachone.split('\t')[1] 47 | link = link.replace('\n','') 48 | link_list.append(link) 49 | 50 | 51 | 52 | gevent_time10 = gevent_main(link_list, 15) 53 | print ('gevent + Queue多协程爬虫的总时间为:', gevent_time10) 54 | 55 | gevent_time3 = gevent_main(link_list, 20) 56 | print ('gevent + Queue多协程爬虫的总时间为:', gevent_time3) 57 | 58 | with open('result_gevent.txt','a+',encoding='utf-8') as f: 59 | f.write('\t' + str(gevent_time10) + '\t' + str(gevent_time3)) -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/multiprocess_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from multiprocessing import Pool, Manager 5 | import time 6 | import requests 7 | 8 | def crawler(q, index): 9 | Process_id = 'Process-' + str(index) 10 | while not q.empty(): 11 | url = q.get(timeout=2) 12 | try: 13 | r = requests.get(url, timeout=20) 14 | print (Process_id, q.qsize(), r.status_code, url) 15 | except Exception as e: 16 | print (Process_id, q.qsize(), url, 'Error: 
', e) 17 | 18 | 19 | def multiprocess_main(link_list, p_num): 20 | start = time.time() 21 | manager = Manager() 22 | workQueue = manager.Queue(1000) 23 | 24 | # 填充队列 25 | for url in link_list: 26 | workQueue.put(url) 27 | 28 | print ("Started processes") 29 | pool = Pool(processes=p_num) 30 | for i in range(p_num): 31 | pool.apply_async(crawler, args=(workQueue, i)) 32 | 33 | 34 | pool.close() 35 | pool.join() 36 | 37 | end = time.time() 38 | time_spend = end-start 39 | print ('Pool + Queue多进程爬虫的总时间为:', time_spend) 40 | print ('Main process Ended!') 41 | return time_spend 42 | 43 | if __name__ == '__main__': 44 | link_list = [] 45 | with open('alexa.txt', 'r') as file: 46 | file_list = file.readlines() 47 | for eachone in file_list: 48 | link = eachone.split('\t')[1] 49 | link = link.replace('\n','') 50 | link_list.append(link) 51 | 52 | multiprocess_main(link_list, 3) -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/mutilprocess1.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | class MyProcess(Process): 15 | def __init__(self, q): 16 | Process.__init__(self) 17 | self.q = q 18 | 19 | def run(self): 20 | print ("Starting " , self.pid) 21 | while not self.q.empty(): 22 | crawler(self.q) 23 | print ("Exiting " , self.pid) 24 | 25 | def crawler(q): 26 | url = q.get(timeout=2) 27 | try: 28 | r = requests.get(url, timeout=20) 29 | print (q.qsize(), r.status_code, url) 30 | except Exception as e: 31 | print (q.qsize(), url, 'Error: ', e) 32 | 33 | if __name__ == '__main__': 34 | ProcessNames = ["Process-1", "Process-2", "Process-3"] 35 | workQueue = Queue(1000) 36 | 37 | # 填充队列 38 | for url in link_list: 39 | workQueue.put(url) 40 | 41 | for i in range(0, 3): 42 | p = MyProcess(workQueue) 43 | p.daemon = True 44 | p.start() 45 | p.join() 46 | 47 | end = time.time() 48 | print ('Process + Queue多进程爬虫的总时间为:', end-start) 49 | print ('Main process Ended!') -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/mutilprocess2.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool, Manager 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | def crawler(q, index): 15 | Process_id = 'Process-' + str(index) 16 | while not q.empty(): 17 | url = q.get(timeout=2) 18 | try: 19 | r = requests.get(url, timeout=20) 20 | print (Process_id, q.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, q.qsize(), url, 'Error: ', e) 23 | 24 | 25 | if __name__ == '__main__': 26 | manager = Manager() 27 | workQueue = manager.Queue(1000) 28 | 29 | # 填充队列 30 | for url in link_list: 31 | workQueue.put(url) 32 | 33 | pool = Pool(processes=3) 34 | for i in range(4): 35 | pool.apply_async(crawler, args=(workQueue, i)) 36 | 37 | print ("Started processes") 38 | pool.close() 39 | pool.join() 40 | 41 | end = time.time() 42 | 
print ('Pool + Queue多进程爬虫的总时间为:', end-start) 43 | print ('Main process Ended!') 44 | -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/mutilprocess3.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool, Manager 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | def crawler(q, index): 15 | Process_id = 'Process-' + str(index) 16 | while not q.empty(): 17 | url = q.get(timeout=2) 18 | try: 19 | r = requests.get(url, timeout=20) 20 | print (Process_id, q.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, q.qsize(), url, 'Error: ', e) 23 | 24 | 25 | if __name__ == '__main__': 26 | manager = Manager() 27 | workQueue = manager.Queue(1000) 28 | 29 | # 填充队列 30 | for url in link_list: 31 | workQueue.put(url) 32 | 33 | pool = Pool(processes=3) 34 | for i in range(4): 35 | pool.apply(crawler, args=(workQueue, i)) 36 | 37 | print ("Started processes") 38 | pool.close() 39 | pool.join() 40 | 41 | end = time.time() 42 | print ('Pool + Queue多进程爬虫的总时间为:', end-start) 43 | print ('Main process Ended!') 44 | -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/result.txt: -------------------------------------------------------------------------------- 1 | 312.7718894481659 143.37620067596436 549.7254424095154 549.978456735611 -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/result_gevent.txt: -------------------------------------------------------------------------------- 1 | 338.3443522453308 922.8117818832397 312.1618547439575 484.05668663978577 -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/result_single_time.txt: -------------------------------------------------------------------------------- 1 | 1721.3604562282562 -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/thread1.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import requests 3 | import time 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | class myThread (threading.Thread): 15 | def __init__(self, name, link_range): 16 | threading.Thread.__init__(self) 17 | self.name = name 18 | self.link_range = link_range 19 | def run(self): 20 | print ("Starting " + self.name) 21 | crawler(self.name, self.link_range) 22 | print ("Exiting " + self.name) 23 | 24 | def crawler(threadName, link_range): 25 | for i in range(link_range[0],link_range[1]+1): 26 | try: 27 | r = requests.get(link_list[i], timeout=20) 28 | print (threadName, r.status_code, link_list[i]) 29 | except Exception as e: 30 | print(threadName, 'Error: ', e) 31 | 32 | thread_list = [] 33 | link_range_list = [(0,200),(201,400),(401,600),(601,800),(801,1000)] 34 | 35 | # 创建新线程 36 | for i in range(1,6): 37 | thread = myThread("Thread-" + str(i), link_range_list[i-1]) 38 | thread.start() 39 | thread_list.append(thread) 40 
| 41 | # 等待所有线程完成 42 | for thread in thread_list: 43 | thread.join() 44 | 45 | end = time.time() 46 | print ('简单多线程爬虫的总时间为:', end-start) 47 | print ("Exiting Main Thread") -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/thread2.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import requests 3 | import time 4 | import queue as Queue 5 | 6 | link_list = [] 7 | with open('alexa.txt', 'r') as file: 8 | file_list = file.readlines() 9 | for eachone in file_list: 10 | link = eachone.split('\t')[1] 11 | link = link.replace('\n','') 12 | link_list.append(link) 13 | 14 | start = time.time() 15 | class myThread (threading.Thread): 16 | def __init__(self, name, q): 17 | threading.Thread.__init__(self) 18 | self.name = name 19 | self.q = q 20 | def run(self): 21 | print ("Starting " + self.name) 22 | while True: 23 | try: 24 | crawler(self.name, self.q) 25 | except: 26 | break 27 | print ("Exiting " + self.name) 28 | 29 | def crawler(threadName, q): 30 | url = q.get(timeout=2) 31 | try: 32 | r = requests.get(url, timeout=20) 33 | print (q.qsize(), threadName, r.status_code, url) 34 | except Exception as e: 35 | print (q.qsize(), threadName, url, 'Error: ', e) 36 | 37 | threadList = ["Thread-1", "Thread-2", "Thread-3","Thread-4", "Thread-5"] 38 | workQueue = Queue.Queue(1000) 39 | threads = [] 40 | 41 | # 创建新线程 42 | for tName in threadList: 43 | thread = myThread(tName, workQueue) 44 | thread.start() 45 | threads.append(thread) 46 | 47 | # 填充队列 48 | for url in link_list: 49 | workQueue.put(url) 50 | 51 | # 等待所有线程完成 52 | for t in threads: 53 | t.join() 54 | 55 | end = time.time() 56 | print ('Queue多线程爬虫的总时间为:', end-start) 57 | print ("Exiting Main Thread") -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/thread_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import threading 5 | import requests 6 | import time 7 | import queue as Queue 8 | 9 | 10 | class myThread (threading.Thread): 11 | def __init__(self, name, q): 12 | threading.Thread.__init__(self) 13 | self.name = name 14 | self.q = q 15 | def run(self): 16 | print ("Starting " + self.name) 17 | while True: 18 | try: 19 | crawler(self.name, self.q) 20 | except: 21 | break 22 | print ("Exiting " + self.name) 23 | 24 | def crawler(threadName, q): 25 | url = q.get(timeout=2) 26 | try: 27 | r = requests.get(url, timeout=20) 28 | print (q.qsize(), threadName, r.status_code, url) 29 | except Exception as e: 30 | print (q.qsize(), threadName, url, 'Error: ', e) 31 | 32 | def thread_main(link_list, t_num): 33 | start = time.time() 34 | workQueue = Queue.Queue(1000) 35 | threads = [] 36 | 37 | # 创建新线程 38 | for tName in range(t_num): 39 | thread = myThread('Thread' + str(tName), workQueue) 40 | thread.start() 41 | threads.append(thread) 42 | 43 | # 填充队列 44 | for url in link_list: 45 | workQueue.put(url) 46 | 47 | # 等待所有线程完成 48 | for t in threads: 49 | t.join() 50 | 51 | end = time.time() 52 | print ('Queue多线程爬虫的总时间为:', end-start) 53 | print ("Exiting Main Thread") 54 | return end-start 55 | 56 | if __name__ == '__main__': 57 | link_list = [] 58 | with open('alexa.txt', 'r') as file: 59 | file_list = file.readlines() 60 | for eachone in file_list: 61 | link = eachone.split('\t')[1] 62 | link = link.replace('\n','') 63 | link_list.append(link) 64 | 65 | thread_main(link_list, 5) 
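# Usage sketch for thread_main above (grounded in this file: alexa.txt sits next to the
# script and each line is "rank<TAB>url", as the __main__ block already shows):
#
#     from thread_test import thread_main
#     elapsed = thread_main(link_list, 10)   # ten worker threads share one Queue of URLs
#
# Each worker exits once queue.get() times out (2 s) on an empty queue, so thread_main
# returns the elapsed seconds only after every queued URL has been attempted.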
-------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/time_spend 2.py: -------------------------------------------------------------------------------- 1 | from multiprocess_test import multiprocess_main 2 | from thread_test import thread_main 3 | 4 | if __name__ == '__main__': 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | #single = single() 14 | #print ('串行的总时间为:', single) 15 | 16 | #thread_time = thread_main(link_list, 5) 17 | #print ('Queue多线程爬虫的总时间为:', thread_time) 18 | 19 | multiprocess_time = multiprocess_main(link_list, 3) 20 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time) 21 | 22 | #gevent_time = gevent_main(link_list, 10) 23 | #print ('gevent + Queue多协程爬虫的总时间为:', gevent_time) 24 | 25 | #with open('result.txt','a+',encoding='utf-8') as f: 26 | # f.write(single + '\t' + thread_time + '\t' + multiprocess_time + '\t' + gevent_time) -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/time_spend.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | import time 6 | #from multiprocess_test import multiprocess_main 7 | #from thread_test import thread_main 8 | 9 | def single(): 10 | start = time.time() 11 | for eachone in link_list: 12 | try: 13 | r = requests.get(eachone) 14 | print (r.status_code, eachone) 15 | except Exception as e: 16 | print('Error: ', e) 17 | end = time.time() 18 | time_spend = end-start 19 | print ('串行的总时间为:', time_spend) 20 | return time_spend 21 | 22 | if __name__ == '__main__': 23 | link_list = [] 24 | with open('alexa.txt', 'r') as file: 25 | file_list = file.readlines() 26 | for eachone in file_list: 27 | link = eachone.split('\t')[1] 28 | link = link.replace('\n','') 29 | link_list.append(link) 30 | 31 | #thread_time10 = thread_main(link_list, 10) 32 | #print ('Queue多线程爬虫的总时间为:', thread_time10) 33 | 34 | #multiprocess_time10 = multiprocess_main(link_list, 10) 35 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time10) 36 | 37 | #thread_time3 = thread_main(link_list, 3) 38 | #print ('Queue多线程爬虫的总时间为:', thread_time3) 39 | 40 | #multiprocess_time3 = multiprocess_main(link_list, 3) 41 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time3) 42 | 43 | single_time = single() 44 | print ('串行的总时间为:', single_time) 45 | 46 | with open('result_single_time.txt','a+',encoding='utf-8') as f: 47 | f.write(str(single_time)) 48 | #f.write(str(thread_time10) + '\t' + str(multiprocess_time10) + '\t' + str(thread_time3) + '\t' + str(multiprocess_time3)) -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/多协程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/多协程.png -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/多线程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/多线程.png -------------------------------------------------------------------------------- /第一版/Cha 7 
-提升爬虫的速度/多进程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/多进程.png -------------------------------------------------------------------------------- /第一版/Cha 8 -反爬虫问题/Cha 8 -反爬虫问题.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 8.3如何“反反爬虫”?" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 8.3.1修改请求 header" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 6, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "{'User-Agent': 'python-requests/2.12.4', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "import requests\n", 34 | "r = requests.get('http://www.santostang.com')\n", 35 | "print (r.request.headers)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 5, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import requests\n", 55 | "\n", 56 | "link = 'http://www.santostang.com'\n", 57 | "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n", 58 | "r = requests.get(link, headers= headers)\n", 59 | "print (r.request.headers)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## 8.3.2 修改爬虫的间隔时间" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 10, 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "2.0001144409179688\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "import time\n", 86 | "t1 = time.time()\n", 87 | "time.sleep(2)\n", 88 | "t2 = time.time()\n", 89 | "total_time = t2-t1\n", 90 | "print (total_time)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 17, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "0.3481693303048349\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "import time\n", 110 | "import random\n", 111 | "\n", 112 | "sleep_time = random.randint(0,2) + random.random()\n", 113 | "print (sleep_time)\n", 114 | "time.sleep(sleep_time)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 19, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "开始爬取这篇博客: http://www.santostang.com/2017/03/08/hello-python/\n", 129 | "这篇博客的标题为: Hello Python!\n", 130 | "开始休息: 0.16292490492777212 秒\n", 131 | "开始爬取这篇博客: http://www.santostang.com/2017/03/07/echarts%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b02-%e5%8d%95%e9%a1%b5%e9%9d%a2%e5%a4%9a%e5%bc%a0%e5%9b%be%e8%a1%a8/\n", 132 | "这篇博客的标题为: echarts学习笔记(2) — 同一页面多图表\n", 133 | "开始休息: 
1.912631031656519 秒\n", 134 | "开始爬取这篇博客: http://www.santostang.com/2017/03/07/echarts%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b01-%e4%bd%bf%e7%94%a8%e6%a8%a1%e5%9d%97%e5%8c%96%e5%8d%95%e6%96%87%e4%bb%b6%e5%bc%95%e5%85%a5/\n", 135 | "这篇博客的标题为: echarts学习笔记(1) — 模块化单文件引入\n", 136 | "开始休息: 1.3634313119416182 秒\n", 137 | "开始爬取这篇博客: http://www.santostang.com/2017/03/06/%e3%80%90%e7%88%ac%e8%99%ab%e4%ba%8c%e3%80%91%e7%88%ac%e8%99%ab%e7%9a%84%e6%a1%86%e6%9e%b6%e5%92%8c%e5%9f%ba%e6%9c%ac%e8%ae%ae%e9%a2%98/\n", 138 | "这篇博客的标题为: 【爬虫二】爬虫的框架和基本议题\n", 139 | "开始休息: 2.0205314818737516 秒\n", 140 | "开始爬取这篇博客: http://www.santostang.com/2017/03/06/%e3%80%90%e7%88%ac%e8%99%ab%e4%b8%80%e3%80%91%e6%9c%80%e7%ae%80%e5%8d%95%e7%9a%84%e7%88%ac%e8%99%ab%ef%bc%8c%e9%9b%b6%e5%9f%ba%e7%a1%80%e6%95%99%e5%ad%a6/\n", 141 | "这篇博客的标题为: 【爬虫一】最简单的爬虫,零基础教学\n", 142 | "开始休息: 2.446761436097069 秒\n", 143 | "开始爬取这篇博客: http://www.santostang.com/2017/03/02/hello-world/\n", 144 | "这篇博客的标题为: Hello world!\n", 145 | "开始休息: 0.8005131789714476 秒\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "import requests\n", 151 | "from bs4 import BeautifulSoup\n", 152 | "import time\n", 153 | "import random\n", 154 | "\n", 155 | "link = \"http://www.santostang.com/\"\n", 156 | "\n", 157 | "def scrap(link):\n", 158 | " headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n", 159 | " r = requests.get(link, headers= headers)\n", 160 | " html = r.text\n", 161 | " soup = BeautifulSoup(html, \"lxml\")\n", 162 | " return soup\n", 163 | "\n", 164 | "soup = scrap(link)\n", 165 | "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n", 166 | "for eachone in title_list:\n", 167 | " url = eachone.a['href']\n", 168 | " print ('开始爬取这篇博客: ', url)\n", 169 | " soup_article = scrap(url)\n", 170 | " title = soup_article.find(\"h1\", class_=\"view-title\").text.strip()\n", 171 | " print ('这篇博客的标题为: ', title)\n", 172 | " sleep_time = random.randint(0,2) + random.random()\n", 173 | " print ('开始休息: ', sleep_time, '秒')\n", 174 | " time.sleep(sleep_time)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "scrap_times = 0\n", 186 | "for eachone in title_list:\n", 187 | " url = eachone.a['href']\n", 188 | " print ('开始爬取这篇博客: ', url)\n", 189 | " soup_article = scrap(url)\n", 190 | " title = soup_article.find(\"h1\", class_=\"view-title\").text.strip()\n", 191 | " print ('这篇博客的标题为: ', title)\n", 192 | " \n", 193 | " scrap_times += 1\n", 194 | " if scrap_times % 5 == 0:\n", 195 | " sleep_time = 10 + random.random()\n", 196 | " else:\n", 197 | " sleep_time = random.randint(0,2) + random.random()\n", 198 | " time.sleep(sleep_time)\n", 199 | " print ('开始休息: ', sleep_time, '秒')" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## 8.3.3 使用代理" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "import requests\n", 218 | "\n", 219 | "link = \"http://www.santostang.com/\"\n", 220 | "proxies = {'http':'http://xxx.xxx.xxx.xxx:xxxx'}\n", 221 | "response = requests.get(link, proxies=proxies)" 222 | ] 223 | } 224 | ], 225 | "metadata": { 226 | "kernelspec": { 227 | "display_name": "Python 3", 228 | "language": "python", 229 | "name": "python3" 230 | }, 231 | "language_info": { 232 | "codemirror_mode": { 233 | "name": 
"ipython", 234 | "version": 3 235 | }, 236 | "file_extension": ".py", 237 | "mimetype": "text/x-python", 238 | "name": "python", 239 | "nbconvert_exporter": "python", 240 | "pygments_lexer": "ipython3", 241 | "version": "3.6.0" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 2 246 | } 247 | -------------------------------------------------------------------------------- /第二版/Cha 11 -登录与验证码处理/captcha.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 11 -登录与验证码处理/captcha.jpg -------------------------------------------------------------------------------- /第二版/Cha 11 -登录与验证码处理/captcha_gray.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 11 -登录与验证码处理/captcha_gray.jpg -------------------------------------------------------------------------------- /第二版/Cha 11 -登录与验证码处理/captcha_thresholded.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 11 -登录与验证码处理/captcha_thresholded.jpg -------------------------------------------------------------------------------- /第二版/Cha 11 -登录与验证码处理/cookies: -------------------------------------------------------------------------------- 1 | #LWP-Cookies-2.0 2 | Set-Cookie3: wordpress_logged_in_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1544889241%7CzMUKtgScAb81VCyAMpqMQ8fQ7jryOCjFrooHMWk72xF%7C449659c6398642d69e61044e14efbb77b42028c4063536da2725f55e8aa5af26"; path="/"; domain="www.santostang.com"; path_spec; expires="2018-12-16 03:54:01Z"; httponly=None; version=0 3 | Set-Cookie3: wordpress_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1544889241%7CzMUKtgScAb81VCyAMpqMQ8fQ7jryOCjFrooHMWk72xF%7C823caf1f78a9836d8921bb977cb81ae822c6020cd554a0658d77d5475a4b0303"; path="/wp-admin"; domain="www.santostang.com"; path_spec; expires="2018-12-16 03:54:01Z"; httponly=None; version=0 4 | Set-Cookie3: wordpress_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1544889241%7CzMUKtgScAb81VCyAMpqMQ8fQ7jryOCjFrooHMWk72xF%7C823caf1f78a9836d8921bb977cb81ae822c6020cd554a0658d77d5475a4b0303"; path="/wp-content/plugins"; domain="www.santostang.com"; path_spec; expires="2018-12-16 03:54:01Z"; httponly=None; version=0 5 | -------------------------------------------------------------------------------- /第二版/Cha 12 -服务器采集/tor1.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import socks 3 | import requests 4 | 5 | # Tor使用9150端口为默认的socks端口 6 | socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 7 | socket.socket = socks.socksocket 8 | # 获取这次抓取使用的IP地址 9 | a = requests.get("http://checkip.amazonaws.com").text 10 | 11 | print (a) -------------------------------------------------------------------------------- /第二版/Cha 12 -服务器采集/tor2.py: -------------------------------------------------------------------------------- 1 | from stem import Signal 2 | from stem.control import Controller 3 | import socket 4 | import socks 5 | import requests 6 | import time 7 | 8 | controller = Controller.from_port(port = 9151) 9 | controller.authenticate() 10 | socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 11 | socket.socket = socks.socksocket 12 | 13 | total_scrappy_time = 0 14 | total_changeIP_time = 0 15 | 16 | for 
x in range(0,10): 17 | a = requests.get("http://checkip.amazonaws.com").text 18 | print ("第", x+1, "次IP:", a) 19 | 20 | time1 = time.time() 21 | a = requests.get("http://www.santostang.com/").text 22 | #print (a) 23 | time2 = time.time() 24 | total_scrappy_time = total_scrappy_time + time2-time1 25 | print ("第", x+1, "次抓取花费时间:", time2-time1) 26 | 27 | time3 = time.time() 28 | controller.signal(Signal.NEWNYM) 29 | time.sleep(5) 30 | time4 = time.time() 31 | total_changeIP_time = total_changeIP_time + time4-time3-5 32 | print ("第", x+1, "次更换IP花费时间:", time4-time3-5) 33 | 34 | print ("平均抓取花费时间:", total_scrappy_time/10) 35 | print ("平均更换IP花费时间:", total_changeIP_time/10) 36 | -------------------------------------------------------------------------------- /第二版/Cha 12 -服务器采集/tor3.py: -------------------------------------------------------------------------------- 1 | from stem import Signal 2 | from stem.control import Controller 3 | import socket 4 | import socks 5 | import requests 6 | import time 7 | 8 | #controller = Controller.from_port(port = 9151) 9 | #controller.authenticate() 10 | #socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 11 | #socket.socket = socks.socksocket 12 | 13 | total_scrappy_time = 0 14 | total_changeIP_time = 0 15 | 16 | for x in range(0,10): 17 | #a = requests.get("http://checkip.amazonaws.com").text 18 | #print ("第", x+1, "次IP:", a) 19 | 20 | time1 = time.time() 21 | a = requests.get("http://www.santostang.com/").text 22 | #print (a) 23 | time2 = time.time() 24 | total_scrappy_time = total_scrappy_time + time2-time1 25 | print ("第", x+1, "次抓取花费时间:", time2-time1) 26 | 27 | time3 = time.time() 28 | #controller.signal(Signal.NEWNYM) 29 | time.sleep(5) 30 | time4 = time.time() 31 | total_changeIP_time = total_changeIP_time + time4-time3-5 32 | print ("第", x+1, "次更换IP花费时间:", time4-time3-5) 33 | 34 | print ("平均抓取花费时间:", total_scrappy_time/10) 35 | print ("平均更换IP花费时间:", total_changeIP_time/10) -------------------------------------------------------------------------------- /第二版/Cha 13 -分布式爬虫/1497099934.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 13 -分布式爬虫/1497099934.jpg -------------------------------------------------------------------------------- /第二版/Cha 13 -分布式爬虫/Cha 13 - 分布式爬虫.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 12.3 Redis分布式爬虫实战" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 12.3.2 加入任务队列" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "def push_redis_list():\n", 26 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 27 | " print (r.keys('*'))\n", 28 | " \n", 29 | " link_list = []\n", 30 | " with open('alexa.txt', 'r') as file:\n", 31 | " file_list = file.readlines()\n", 32 | " for eachone in file_list:\n", 33 | " link = eachone.split('\\t')[1]\n", 34 | " link = link.replace('\\n','')\n", 35 | " link_list.append(link)\n", 36 | " if len(link_list) == 100:\n", 37 | " break\n", 38 | " \n", 39 | " for url in link_list:\n", 40 | " response = requests.get(url, headers=headers, timeout=20)\n", 41 | " soup = BeautifulSoup(response.text, 'lxml')\n", 42 | " img_list = 
soup.find_all('img')\n", 43 | " for img in img_list:\n", 44 | " img_url = img['src']\n", 45 | " if img_url != '':\n", 46 | " print (\"加入的图片url: \", img_url)\n", 47 | " r.lpush('img_url',img_url)\n", 48 | " print ('现在图片链接的个数为', r.llen('img_url'))\n", 49 | " return" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# 12.3.3 读取任务队列,下载图片" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "def get_img():\n", 68 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 69 | " while True:\n", 70 | " try:\n", 71 | " url = r.lpop('img_url')\n", 72 | " url = url.decode('ascii')\n", 73 | " try:\n", 74 | " response = requests.get(url, headers=headers,timeout = 20)\n", 75 | " name = int(time.time())\n", 76 | " f = open(str(name)+ url[-4:], 'wb')\n", 77 | " f.write(response.content)\n", 78 | " f.close()\n", 79 | " print ('已经获取图片', url)\n", 80 | " except Exception as e:\n", 81 | " print ('爬取图片过程出问题', e)\n", 82 | " time.sleep(3)\n", 83 | " except Exception as e:\n", 84 | " print (e)\n", 85 | " time.sleep(10)\n", 86 | " break\n", 87 | " return " 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# 12.3.4 分布式爬虫代码" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 13, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "import requests\n", 106 | "from bs4 import BeautifulSoup\n", 107 | "import re\n", 108 | "import time\n", 109 | "from redis import Redis\n", 110 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 111 | "\n", 112 | "def push_redis_list():\n", 113 | " #与上面此函数相同\n", 114 | "\n", 115 | "def get_img():\n", 116 | " #与上面此函数相同\n", 117 | "\n", 118 | "if __name__ == '__main__': \n", 119 | " this_machine = 'master' \n", 120 | " print ('开始分布式爬虫')\n", 121 | " if this_machine == 'master':\n", 122 | " push_redis_list()\n", 123 | " else:\n", 124 | " get_img()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 14, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "import requests\n", 136 | "from bs4 import BeautifulSoup\n", 137 | "import re\n", 138 | "import time\n", 139 | "from redis import Redis\n", 140 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 141 | "\n", 142 | "def push_redis_list():\n", 143 | " #与上面此函数相同\n", 144 | "\n", 145 | "def get_img():\n", 146 | " #与上面此函数相同\n", 147 | "\n", 148 | "if __name__ == '__main__': \n", 149 | " this_machine = 'slave' \n", 150 | " print ('开始分布式爬虫')\n", 151 | " if this_machine == 'master':\n", 152 | " push_redis_list()\n", 153 | " else:\n", 154 | " get_img()" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.6.0" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | 
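The master.py and slave.py scripts that follow both talk to the same Redis list named img_url, so when debugging it helps to peek at that list from a separate session. This is a minimal sketch, assuming redis-py is installed and that you substitute your own host, port and password (the server hard-coded in the book's examples is the author's test machine and will not be reachable):

```python
from redis import Redis

# Replace these connection details with your own Redis server.
r = Redis(host='127.0.0.1', port=6379, password='yourpassword')

print('img_url length:', r.llen('img_url'))           # how many image URLs are queued
print('next 5 entries:', r.lrange('img_url', 0, 4))   # peek without removing anything
# r.delete('img_url')                                  # uncomment to reset the queue between runs
```

Note that lpush on the master combined with lpop on the slave treats img_url as a stack (last in, first out); pairing lpush with rpop instead would make it a FIFO queue.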
-------------------------------------------------------------------------------- /第二版/Cha 13 -分布式爬虫/master.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import time 5 | from redis import Redis 6 | headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' } 7 | 8 | def push_redis_list(): 9 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 10 | print (r.keys('*')) 11 | 12 | link_list = [] 13 | with open('alexa.txt', 'r') as file: 14 | file_list = file.readlines() 15 | for eachone in file_list: 16 | link = eachone.split('\t')[1] 17 | link = link.replace('\n','') 18 | link_list.append(link) 19 | if len(link_list) == 100: 20 | break 21 | 22 | for url in link_list: 23 | response = requests.get(url, headers=headers, timeout=20) 24 | soup = BeautifulSoup(response.text, 'lxml') 25 | img_list = soup.find_all('img') 26 | for img in img_list: 27 | img_url = img['src'] 28 | if img_url != '': 29 | print ("加入的图片url: ", img_url) 30 | r.lpush('img_url',img_url) 31 | print ('现在图片链接的个数为', r.llen('img_url')) 32 | return 33 | 34 | def get_img(): 35 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 36 | while True: 37 | try: 38 | url = r.lpop('img_url') 39 | url = url.decode('ascii') 40 | try: 41 | response = requests.get(url, headers=headers,timeout = 20) 42 | name = int(time.time()) 43 | f = open(str(name)+ url[-4:], 'wb') 44 | f.write(response.content) 45 | f.close() 46 | print ('已经获取图片', url) 47 | except Exception as e: 48 | print ('爬取图片过程出问题', e) 49 | time.sleep(3) 50 | except Exception as e: 51 | print (e) 52 | time.sleep(10) 53 | break 54 | return 55 | 56 | if __name__ == '__main__': 57 | this_machine = 'master' 58 | print ('开始分布式爬虫') 59 | if this_machine == 'master': 60 | push_redis_list() 61 | else: 62 | get_img() -------------------------------------------------------------------------------- /第二版/Cha 13 -分布式爬虫/slave.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import time 5 | from redis import Redis 6 | headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' } 7 | 8 | def push_redis_list(): 9 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 10 | print (r.keys('*')) 11 | 12 | link_list = [] 13 | with open('alexa.txt', 'r') as file: 14 | file_list = file.readlines() 15 | for eachone in file_list: 16 | link = eachone.split('\t')[1] 17 | link = link.replace('\n','') 18 | link_list.append(link) 19 | if len(link_list) == 100: 20 | break 21 | 22 | for url in link_list: 23 | response = requests.get(url, headers=headers, timeout=20) 24 | soup = BeautifulSoup(response.text, 'lxml') 25 | img_list = soup.find_all('img') 26 | for img in img_list: 27 | img_url = img['src'] 28 | if img_url != '': 29 | print ("加入的图片url: ", img_url) 30 | r.lpush('img_url',img_url) 31 | print ('现在图片链接的个数为', r.llen('img_url')) 32 | return 33 | 34 | def get_img(): 35 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 36 | while True: 37 | try: 38 | url = r.lpop('img_url') 39 | url = url.decode('ascii') 40 | if url[:2] == '//': 41 | url = 'http:' + url 42 | print (url) 43 | try: 44 | response = requests.get(url, headers=headers,timeout = 20) 45 | name = int(time.time()) 46 | f = open(str(name)+ url[-4:], 
'wb') 47 | f.write(response.content) 48 | f.close() 49 | print ('已经获取图片', url) 50 | except Exception as e: 51 | print ('爬取图片过程出问题', e) 52 | time.sleep(3) 53 | except Exception as e: 54 | print (e) 55 | time.sleep(10) 56 | break 57 | return 58 | 59 | if __name__ == '__main__': 60 | this_machine = 'slave' 61 | print ('开始分布式爬虫') 62 | if this_machine == 'master': 63 | push_redis_list() 64 | else: 65 | get_img() -------------------------------------------------------------------------------- /第二版/Cha 16 -爬虫实战三:百度地图API/cities.txt: -------------------------------------------------------------------------------- 1 | 苏州市 3067 2 | 南京市 2547 3 | 无锡市 2324 4 | 常州市 1432 5 | 南通市 1201 6 | 徐州市 1057 7 | 盐城市 850 8 | 宿迁市 818 9 | 扬州市 609 10 | 镇江市 568 11 | 泰州市 558 12 | 淮安市 520 13 | 连云港市 427 14 | 杭州市 2495 15 | 温州市 2114 16 | 宁波市 1764 17 | 嘉兴市 869 18 | 金华市 776 19 | 台州市 661 20 | 绍兴市 632 21 | 湖州市 479 22 | 丽水市 379 23 | 衢州市 192 24 | 舟山市 131 25 | 广州市 4366 26 | 深圳市 4850 27 | 东莞市 2493 28 | 佛山市 2652 29 | 惠州市 1386 30 | 江门市 776 31 | 中山市 729 32 | 珠海市 478 33 | 汕头市 452 34 | 梅州市 397 35 | 湛江市 379 36 | 韶关市 367 37 | 河源市 350 38 | 肇庆市 342 39 | 清远市 303 40 | 茂名市 295 41 | 揭阳市 226 42 | 阳江市 189 43 | 汕尾市 177 44 | 潮州市 155 45 | 云浮市 87 46 | 福州市 2217 47 | 厦门市 1364 48 | 泉州市 1104 49 | 漳州市 621 50 | 莆田市 353 51 | 宁德市 312 52 | 龙岩市 304 53 | 南平市 253 54 | 三明市 245 55 | 青岛市 1750 56 | 济南市 976 57 | 潍坊市 841 58 | 烟台市 735 59 | 威海市 680 60 | 淄博市 528 61 | 临沂市 525 62 | 济宁市 447 63 | 日照市 401 64 | 菏泽市 361 65 | 聊城市 330 66 | 德州市 283 67 | 滨州市 268 68 | 枣庄市 264 69 | 泰安市 251 70 | 东营市 211 71 | 莱芜市 116 72 | 郑州市 2061 73 | 洛阳市 784 74 | 开封市 506 75 | 新乡市 447 76 | 南阳市 411 77 | 商丘市 337 78 | 焦作市 312 79 | 平顶山市 269 80 | 安阳市 263 81 | 周口市 230 82 | 三门峡市 215 83 | 信阳市 211 84 | 许昌市 194 85 | 驻马店市 183 86 | 濮阳市 134 87 | 漯河市 121 88 | 鹤壁市 103 89 | 济源市 30 90 | 石家庄市 1342 91 | 保定市 777 92 | 邢台市 633 93 | 秦皇岛市 630 94 | 唐山市 606 95 | 邯郸市 546 96 | 沧州市 366 97 | 廊坊市 357 98 | 张家口市 345 99 | 承德市 269 100 | 衡水市 171 101 | 成都市 3436 102 | 绵阳市 462 103 | 南充市 447 104 | 德阳市 315 105 | 内江市 270 106 | 遂宁市 242 107 | 宜宾市 202 108 | 乐山市 190 109 | 泸州市 182 110 | 眉山市 169 111 | 资阳市 167 112 | 达州市 154 113 | 自贡市 142 114 | 广元市 140 115 | 攀枝花市 128 116 | 广安市 113 117 | 凉山彝族自治州 98 118 | 巴中市 78 119 | 雅安市 78 120 | 阿坝藏族羌族自治州 58 121 | 甘孜藏族自治州 55 122 | 沈阳市 1410 123 | 大连市 1625 124 | 抚顺市 380 125 | 鞍山市 274 126 | 营口市 211 127 | 盘锦市 203 128 | 葫芦岛市 171 129 | 本溪市 159 130 | 锦州市 156 131 | 丹东市 153 132 | 阜新市 109 133 | 朝阳市 96 134 | 铁岭市 92 135 | 辽阳市 89 136 | 昆明市 1756 137 | 大理白族自治州 356 138 | 玉溪市 285 139 | 西双版纳傣族自治州 275 140 | 曲靖市 226 141 | 丽江市 164 142 | 红河哈尼族彝族自治州 142 143 | 普洱市 115 144 | 迪庆藏族自治州 110 145 | 楚雄彝族自治州 109 146 | 昭通市 106 147 | 保山市 102 148 | 临沧市 92 149 | 文山壮族苗族自治州 67 150 | 德宏傣族景颇族自治州 51 151 | 怒江傈僳族自治州 14 152 | 长沙市 1831 153 | 张家界市 463 154 | 郴州市 445 155 | 株洲市 421 156 | 常德市 411 157 | 衡阳市 350 158 | 岳阳市 247 159 | 邵阳市 219 160 | 永州市 196 161 | 娄底市 181 162 | 湘潭市 144 163 | 怀化市 138 164 | 益阳市 133 165 | 湘西土家族苗族自治州 70 166 | 武汉市 2856 167 | 黄冈市 922 168 | 宜昌市 395 169 | 荆州市 293 170 | 十堰市 283 171 | 荆门市 281 172 | 襄阳市 262 173 | 黄石市 236 174 | 孝感市 202 175 | 咸宁市 184 176 | 随州市 134 177 | 恩施土家族苗族自治州 117 178 | 神农架林区 78 179 | 仙桃市 57 180 | 鄂州市 50 181 | 潜江市 46 182 | 天门市 41 183 | 南昌市 858 184 | 赣州市 717 185 | 九江市 469 186 | 新余市 462 187 | 上饶市 365 188 | 萍乡市 301 189 | 宜春市 284 190 | 吉安市 262 191 | 抚州市 239 192 | 景德镇市 137 193 | 鹰潭市 78 194 | 合肥市 1349 195 | 安庆市 431 196 | 宿州市 323 197 | 阜阳市 312 198 | 芜湖市 302 199 | 六安市 283 200 | 滁州市 206 201 | 亳州市 195 202 | 淮南市 180 203 | 宣城市 170 204 | 马鞍山市 164 205 | 蚌埠市 146 206 | 黄山市 138 207 | 巢湖市 132 208 | 淮北市 113 209 | 池州市 83 210 | 铜陵市 77 211 | 太原市 811 212 | 
临汾市 345 213 | 长治市 294 214 | 运城市 289 215 | 晋中市 267 216 | 大同市 249 217 | 晋城市 189 218 | 吕梁市 166 219 | 忻州市 159 220 | 阳泉市 105 221 | 朔州市 79 222 | 南宁市 1102 223 | 桂林市 666 224 | 柳州市 588 225 | 玉林市 410 226 | 北海市 288 227 | 百色市 191 228 | 河池市 183 229 | 梧州市 150 230 | 贵港市 142 231 | 钦州市 91 232 | 防城港市 78 233 | 来宾市 72 234 | 贺州市 68 235 | 崇左市 46 236 | 西安市 3222 237 | 宝鸡市 469 238 | 渭南市 330 239 | 咸阳市 251 240 | 汉中市 183 241 | 榆林市 173 242 | 商洛市 156 243 | 安康市 125 244 | 延安市 95 245 | 铜川市 83 246 | 哈尔滨市 1228 247 | 齐齐哈尔市 178 248 | 牡丹江市 178 249 | 佳木斯市 169 250 | 大庆市 161 251 | 黑河市 137 252 | 伊春市 134 253 | 鹤岗市 97 254 | 绥化市 78 255 | 鸡西市 74 256 | 双鸭山市 56 257 | 七台河市 37 258 | 大兴安岭地区 17 259 | 呼和浩特市 518 260 | 包头市 435 261 | 鄂尔多斯市 392 262 | 赤峰市 203 263 | 呼伦贝尔市 168 264 | 乌海市 163 265 | 乌兰察布市 125 266 | 通辽市 122 267 | 巴彦淖尔市 117 268 | 阿拉善盟 101 269 | 兴安盟 101 270 | 锡林郭勒盟 82 271 | 贵阳市 1491 272 | 遵义市 384 273 | 毕节地区 202 274 | 黔南布依族苗族自治州 193 275 | 六盘水市 191 276 | 黔东南苗族侗族自治州 147 277 | 铜仁地区 141 278 | 黔西南布依族苗族自治州 106 279 | 安顺市 104 280 | 长春市 1404 281 | 延边朝鲜族自治州 431 282 | 吉林市 269 283 | 四平市 186 284 | 通化市 102 285 | 松原市 98 286 | 白山市 95 287 | 白城市 78 288 | 辽源市 44 289 | 兰州市 484 290 | 白银市 252 291 | 金昌市 181 292 | 平凉市 167 293 | 酒泉市 160 294 | 张掖市 128 295 | 武威市 128 296 | 定西市 111 297 | 天水市 98 298 | 临夏回族自治州 61 299 | 陇南市 56 300 | 庆阳市 48 301 | 甘南藏族自治州 42 302 | 嘉峪关市 22 303 | 乌鲁木齐市 520 304 | 昌吉回族自治州 165 305 | 伊犁哈萨克自治州 160 306 | 喀什地区 91 307 | 克拉玛依市 79 308 | 阿勒泰地区 75 309 | 塔城地区 64 310 | 阿克苏地区 58 311 | 巴音郭楞蒙古自治州 50 312 | 哈密地区 38 313 | 吐鲁番地区 25 314 | 石河子市 23 315 | 和田地区 22 316 | 博尔塔拉蒙古自治州 20 317 | 克孜勒苏柯尔克孜自治州 9 318 | 图木舒克市 6 319 | 五家渠市 5 320 | 阿拉尔市 3 321 | 海口市 430 322 | 三亚市 257 323 | 文昌市 49 324 | 儋州市 36 325 | 临高县 33 326 | 定安县 25 327 | 澄迈县 24 328 | 昌江黎族自治县 20 329 | 万宁市 19 330 | 东方市 14 331 | 琼海市 14 332 | 保亭黎族苗族自治县 12 333 | 陵水黎族自治县 12 334 | 乐东黎族自治县 11 335 | 琼中黎族苗族自治县 9 336 | 屯昌县 8 337 | 五指山市 5 338 | 白沙黎族自治县 3 339 | 银川市 673 340 | 固原市 75 341 | 石嘴山市 71 342 | 吴忠市 64 343 | 中卫市 34 344 | 西宁市 274 345 | 海西蒙古族藏族自治州 60 346 | 海南藏族自治州 35 347 | 海东地区 34 348 | 海北藏族自治州 16 349 | 黄南藏族自治州 10 350 | 玉树藏族自治州 7 351 | 果洛藏族自治州 5 352 | 拉萨市 56 353 | 阿里地区 19 354 | 林芝地区 15 355 | 日喀则地区 14 356 | 山南地区 5 357 | 昌都地区 4 358 | 那曲地区 2 359 | 北京市 5607 360 | 上海市 4027 361 | 重庆市 2753 362 | 天津市 1605 363 | 香港特别行政区 214 364 | 澳门特别行政区 28 365 | -------------------------------------------------------------------------------- /第二版/Cha 2 - 编写你的第一个网络爬虫/Cha 2 _章末实战.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 题目1:请使用Python中的循环,打印输出从1到100的所有奇数。" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "scrolled": true 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "1\n", 22 | "3\n", 23 | "5\n", 24 | "7\n", 25 | "9\n", 26 | "11\n", 27 | "13\n", 28 | "15\n", 29 | "17\n", 30 | "19\n", 31 | "21\n", 32 | "23\n", 33 | "25\n", 34 | "27\n", 35 | "29\n", 36 | "31\n", 37 | "33\n", 38 | "35\n", 39 | "37\n", 40 | "39\n", 41 | "41\n", 42 | "43\n", 43 | "45\n", 44 | "47\n", 45 | "49\n", 46 | "51\n", 47 | "53\n", 48 | "55\n", 49 | "57\n", 50 | "59\n", 51 | "61\n", 52 | "63\n", 53 | "65\n", 54 | "67\n", 55 | "69\n", 56 | "71\n", 57 | "73\n", 58 | "75\n", 59 | "77\n", 60 | "79\n", 61 | "81\n", 62 | "83\n", 63 | "85\n", 64 | "87\n", 65 | "89\n", 66 | "91\n", 67 | "93\n", 68 | "95\n", 69 | "97\n", 70 | "99\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "for i in range(1,101):\n", 76 | " if i % 2 == 1:\n", 77 | " 
print (i)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# 题目2:请将字符串 ”你好$$$我正在学Python@#@#现在需要&%&%&修改字符串” 中的符号变成一个空格,需要输出的格式为:”你好 我正在学Python 现在需要 修改字符串”" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "你好 我正在学Python 现在需要 修改字符串\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "str1 = '你好$$$我正在学Python@#@#现在需要&%&%&修改字符串'\n", 102 | "str2 = str1.replace('$$$', ' ').replace('@#@#', ' ').replace('&%&%&', ' ')\n", 103 | "print (str2)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 3, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "你好 我正在学Python 现在需要 修改字符串\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "import re\n", 121 | "str1 = '你好$$$我正在学Python@#@#现在需要&%&%&修改字符串'\n", 122 | "str2 = re.sub('[$@#&%]+', ' ' ,str1)\n", 123 | "print (str2)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# 题目3:输出 9*9 乘法口诀表" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 1, 136 | "metadata": { 137 | "ExecuteTime": { 138 | "end_time": "2019-01-04T12:18:17.953381Z", 139 | "start_time": "2019-01-04T12:18:17.938842Z" 140 | } 141 | }, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "1x1=1\t\n", 148 | "1x2=2\t2x2=4\t\n", 149 | "1x3=3\t2x3=6\t3x3=9\t\n", 150 | "1x4=4\t2x4=8\t3x4=12\t4x4=16\t\n", 151 | "1x5=5\t2x5=10\t3x5=15\t4x5=20\t5x5=25\t\n", 152 | "1x6=6\t2x6=12\t3x6=18\t4x6=24\t5x6=30\t6x6=36\t\n", 153 | "1x7=7\t2x7=14\t3x7=21\t4x7=28\t5x7=35\t6x7=42\t7x7=49\t\n", 154 | "1x8=8\t2x8=16\t3x8=24\t4x8=32\t5x8=40\t6x8=48\t7x8=56\t8x8=64\t\n", 155 | "1x9=9\t2x9=18\t3x9=27\t4x9=36\t5x9=45\t6x9=54\t7x9=63\t8x9=72\t9x9=81\t\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "for i in range(1, 10):\n", 161 | " for j in range(1, i+1):\n", 162 | " print (\"%dx%d=%d\\t\" % (j, i, i*j), end=\"\")\n", 163 | " print(\"\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "# 题目4:请写出一个函数,当输入函数变量当月利润I,能返回应发放奖金总数,例如输出“利润100000元时,应发放奖金总数为10000元。”。\n", 171 | "其中,企业发放的奖金根据利润提成。利润(I)低于或等于10万元时,奖金可提10%;利润高于10万元,低于20万元时,低于10万元的部分按10%提成,高于10万元的部分,可提成7.5%;20万到40万之间时,高于20万元的部分,可提成5%;40万到60万之间时高于40万元的部分,可提成3%;60万到100万之间时,高于60万元的部分,可提成1.5%,高于100万元时,超过100万元的部分按1%提成" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 3, 177 | "metadata": { 178 | "ExecuteTime": { 179 | "end_time": "2019-01-04T12:18:43.452862Z", 180 | "start_time": "2019-01-04T12:18:42.226135Z" 181 | } 182 | }, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "净利润:2100000\n", 189 | "利润为2100000元时,应发奖金总数为50500元\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "def calcute_profit(I):\n", 195 | " I = I / 10000\n", 196 | " if I <= 10:\n", 197 | " a = I * 0.1\n", 198 | " return a * 10000\n", 199 | " elif I <= 20 and I > 10:\n", 200 | " b =0.25 + I * 0.075\n", 201 | " return b * 10000\n", 202 | " elif I <= 40 and I > 20:\n", 203 | " c = 0.75 + I * 0.05\n", 204 | " return c * 10000\n", 205 | " elif I <= 60 and I > 40:\n", 206 | " d = 1.55 + I * 0.03\n", 207 | " return d * 10000\n", 208 | " elif I <= 100 and I > 60:\n", 209 | " e = 2.45 + I * 0.015\n", 210 | " return e * 10000\n", 211 | " else:\n", 212 | " f = 2.95 + I 
* 0.01\n", 213 | " return f * 10000\n", 214 | " \n", 215 | "I = int(input('净利润:'))\n", 216 | "profit = calcute_profit(I)\n", 217 | "print ('利润为%d元时,应发奖金总数为%d元' % (I, profit))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 4, 223 | "metadata": { 224 | "ExecuteTime": { 225 | "end_time": "2019-01-04T12:18:48.041910Z", 226 | "start_time": "2019-01-04T12:18:46.176321Z" 227 | } 228 | }, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "净利润:2100000\n", 235 | "利润为2100000元时,应发奖金总数为50500元\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "def calcute_profit(I):\n", 241 | " arr = [1000000,600000,400000,200000,100000,0] #这应该就是各个分界值了,把它们放在列表里方便访问\n", 242 | " rat = [0.01,0.015,0.03,0.05,0.075,0.1] #这是各个分界值所对应的奖金比例值\n", 243 | " r = 0 #这是总奖金的初始值\n", 244 | " for idx in range(0,6): #有6个分界值当然要循环6次\n", 245 | " if I > arr[idx]:\n", 246 | " r = r + (I - arr[idx]) * rat[idx] \n", 247 | " I = arr[idx]\n", 248 | " return r\n", 249 | "\n", 250 | "I = int(input('净利润:'))\n", 251 | "profit = calcute_profit(I)\n", 252 | "print ('利润为%d元时,应发奖金总数为%d元' % (I, profit))" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "# 题目5:用字典的值对字典进行排序" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 7, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | "[(0, 0), (2, 1), (1, 2), (4, 3), (3, 4)]\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "import operator\n", 277 | "x = {1: 2, 3: 4, 4:3, 2:1, 0:0}\n", 278 | "sorted_x = sorted(x.items(), key=operator.itemgetter(1))\n", 279 | "print (sorted_x)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "# 题目6:请问一下两段代码的输出分别是什么?" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 8, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "1\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "a = 1\n", 304 | "def fun(a):\n", 305 | " a = 2\n", 306 | "fun(a)\n", 307 | "print (a)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 9, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "[1]\n" 320 | ] 321 | } 322 | ], 323 | "source": [ 324 | "a = []\n", 325 | "def fun(a):\n", 326 | " a.append(1)\n", 327 | "fun(a)\n", 328 | "print (a)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "# 题目7: 请问以下两段代码的输出分别是什么?" 
336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 10, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "bbb\n", 348 | "aaa\n", 349 | "aaa\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "class Person:\n", 355 | " name=\"aaa\"\n", 356 | "\n", 357 | "p1=Person()\n", 358 | "p2=Person()\n", 359 | "p1.name=\"bbb\"\n", 360 | "print (p1.name)\n", 361 | "print (p2.name)\n", 362 | "print (Person.name)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 11, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "[1]\n", 375 | "[1]\n", 376 | "[1]\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "class Person:\n", 382 | " name=[]\n", 383 | "\n", 384 | "p1=Person()\n", 385 | "p2=Person()\n", 386 | "p1.name.append(1)\n", 387 | "print (p1.name)\n", 388 | "print (p2.name)\n", 389 | "print (Person.name)" 390 | ] 391 | } 392 | ], 393 | "metadata": { 394 | "kernelspec": { 395 | "display_name": "Python 3", 396 | "language": "python", 397 | "name": "python3" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 3 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython3", 409 | "version": "3.6.5" 410 | }, 411 | "toc": { 412 | "base_numbering": 1, 413 | "nav_menu": { 414 | "height": "153px", 415 | "width": "252px" 416 | }, 417 | "number_sections": true, 418 | "sideBar": true, 419 | "skip_h1_title": false, 420 | "title_cell": "Table of Contents", 421 | "title_sidebar": "Contents", 422 | "toc_cell": false, 423 | "toc_position": {}, 424 | "toc_section_display": "block", 425 | "toc_window_display": false 426 | } 427 | }, 428 | "nbformat": 4, 429 | "nbformat_minor": 2 430 | } 431 | -------------------------------------------------------------------------------- /第二版/Cha 2 - 编写你的第一个网络爬虫/title_test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 2 - 编写你的第一个网络爬虫/title_test.txt -------------------------------------------------------------------------------- /第二版/Cha 4 -动态网页抓取/Cha 4 _章末实战.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2019-01-04T13:01:25.237670Z", 9 | "start_time": "2019-01-04T13:01:12.178305Z" 10 | }, 11 | "scrolled": true 12 | }, 13 | "outputs": [ 14 | { 15 | "name": "stdout", 16 | "output_type": "stream", 17 | "text": [ 18 | "0 ¥288 【宫遇】17-KKmall楼上一房一厅--【Loft时代】 整套公寓 1室1卫1床\n", 19 | "0 ¥369 LADYMA |原宿 摩洛哥风格 福田CBD会展中心#家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 20 | "0 ¥439 小屋子和它的大陽台 整套公寓 单间1卫1床\n", 21 | "0 ¥318 【十二微邸】14J-KKmall楼上的城市微魔方 整套公寓 1室1卫1床\n", 22 | "0 ¥318 【十二微邸】32Q-KKmall楼上的天空微城堡 整套公寓 1室1卫1床\n", 23 | "0 ¥435 近市中心舒适温馨的GAO's HOME 整套公寓 1室1卫1床\n", 24 | "0 ¥266 【暖居·小小】 深圳福田区 迷你新居 北欧简约风 下梅林地铁A出口步行1500米左右 整套公寓 1室1卫1床\n", 25 | "0 ¥89 花叶家青年旅舍【世界之窗店】-粉色女生4人房(一张床位) 合住房间 1室2卫4床\n", 26 | "0 ¥218 推荐:香蜜湖温馨公寓(双地铁)ShenZhen FuTian 整套酒店式公寓 1室1卫1床\n", 27 | "0 ¥350 【3D空间】 超大落地窗|按摩浴缸|巨幕投影|城市夜景 罗湖#地王#京基#老街 整套公寓 单间1卫1床\n", 28 | "0 ¥172 Loire花房公寓 合住房间 1室1卫1床\n", 29 | "0 ¥580 #寒舍Room1#福田CBD下沙地铁口极简风格,无敌景观房,独享大浴缸,私人房间,拍摄请另咨询 独立房间 1室1卫1床\n", 30 | 
"0 ¥349 [Misa’s house] ‘想‘老街地铁口/kkmall万象城罗湖口岸/ins风商务房/直达香港 整套公寓 1室1卫2床\n", 31 | "0 ¥138 (世界之窗欢乐谷华侨城)超棒阳光房 独立房间 1室1卫1床\n", 32 | "0 ¥210 福田CBD会展中心莲花山福田口岸9号线孖岭地铁口温馨一房一厅 小屋 1室1卫1床\n", 33 | "0 ¥258 覔舍·A#紧邻南科大和深大西丽校区#塘朗地铁站#大窗户#24小时入住#一居室 整套公寓 1室1卫1床\n", 34 | "0 ¥450 福田 八卦岭 园岭地铁口 美食街旁 复式公寓整租--居心地设计民宿 整间阁楼 1室1卫1床\n", 35 | "0 ¥358 深圳福田中心区與香港口岸附近的五星小居 整套公寓 1室1卫1床\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "from selenium import webdriver\n", 41 | "import time\n", 42 | "\n", 43 | "driver = webdriver.Firefox(executable_path = r'C:\\Users\\santostang\\Desktop\\geckodriver.exe')\n", 44 | "#把上述地址改成你电脑中geckodriver.exe程序的地址\n", 45 | "#在虚拟浏览器中打开 Airbnb 页面\n", 46 | "driver.get(\"https://zh.airbnb.com/s/Shenzhen--China/homes\")\n", 47 | "\n", 48 | "#找到页面中所有的出租房\n", 49 | "rent_list = driver.find_elements_by_css_selector('div._gig1e7')\n", 50 | "\n", 51 | "#对于每一个出租房\n", 52 | "for eachhouse in rent_list:\n", 53 | " #找到评论数量\n", 54 | " try:\n", 55 | " comment = eachhouse.find_element_by_css_selector('span._1cy09umr')\n", 56 | " comment = comment.text\n", 57 | " except:\n", 58 | " comment = 0\n", 59 | " \n", 60 | " #找到价格\n", 61 | " price = eachhouse.find_element_by_css_selector('div._1yarz4r')\n", 62 | " price = price.text.replace(\"每晚\", \"\").replace(\"价格\", \"\").replace(\"\\n\", \"\")\n", 63 | " \n", 64 | " #找到名称\n", 65 | " name = eachhouse.find_element_by_css_selector('div._vbshb6')\n", 66 | " name = name.text\n", 67 | " \n", 68 | " #找到房屋类型,大小\n", 69 | " details = eachhouse.find_element_by_css_selector('span._14ksqu3j')\n", 70 | " details = details.text\n", 71 | " house_type = details.split(\" · \")[0]\n", 72 | " bed_number = details.split(\" · \")[1]\n", 73 | " print (comment, price, name, house_type, bed_number)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 19, 79 | "metadata": { 80 | "ExecuteTime": { 81 | "end_time": "2018-11-18T08:48:16.793188Z", 82 | "start_time": "2018-11-18T08:47:21.685569Z" 83 | } 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "180 ¥291 【宫遇】17-KKmall楼上一房一厅--【Loft时代】 整套公寓 1室1卫1床\n", 91 | "99 ¥215 推荐:香蜜湖温馨公寓(双地铁)ShenZhen FuTian 整套酒店式公寓 1室1卫1床\n", 92 | "87 ¥167 Loire花房公寓 合住房间 1室1卫1床\n", 93 | "82 ¥368 LADYMA |原宿 摩洛哥风格 福田CBD会展中心#家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 94 | "150 ¥319 【十二微邸】14J-KKmall楼上的城市微魔方 整套公寓 1室1卫1床\n", 95 | "153 ¥319 【十二微邸】32Q-KKmall楼上的天空微城堡 整套公寓 1室1卫1床\n", 96 | "69 ¥319 【猫薄荷】罗湖口岸|双地铁口|美食一街|巨幕投影|泡泡吊椅|Loft| 整间阁楼 1室1卫1床\n", 97 | "152 ¥437 近市中心舒适温馨的GAO's HOME 整套公寓 1室1卫1床\n", 98 | "153 ¥160 深圳北站 大床房 直达香港口岸/出差首选 温馨舒适北欧风公寓 独立房间 1室2卫1床\n", 99 | "68 ¥264 【暖居·小小】 深圳福田区 迷你新居 北欧简约风 下梅林地铁A出口步行1500米左右 整套公寓 1室1卫1床\n", 100 | "298 ¥444 小屋子和它的大陽台 整套公寓 单间1卫1床\n", 101 | "52 ¥583 无间-海上世界别墅#BLACK#地铁站边#超大天台#独立卫浴阳台#超大浴缸#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 102 | "20 ¥347 [Misa’s house] 老街地铁口/kkmall万象城罗湖口岸/ins风商务房/直达香港!北欧 整套公寓 1室1卫2床\n", 103 | "64 ¥236 福原居-Luck House 榻榻米高层观景套间-福田口岸 独立房间 3室2卫1床\n", 104 | "31 ¥298 樱空 日式loft公寓 近罗湖口岸国贸金光华万象城东门 好住好逛又好吃 整间阁楼 1室1卫1床\n", 105 | "135 ¥340 【白日梦蓝】美食街|loft|罗湖口岸|双地铁|巨幕影院 | 游泳池1-3人公寓 整套公寓 1室1卫1床\n", 106 | "48 ¥319 【双子座】ins少女风|罗湖口岸|东门老街|泡泡池|巨幕投影 整套公寓 单间1卫1床\n", 107 | "44 ¥319 【克洛偌斯】新房特惠上线 | 万象城 | kkmall | 老街美食中心 | 罗湖口岸 | 巨幕投影 整套公寓 1室1卫1床\n", 108 | "68 ¥333 LADYMA |念念 现代公寓福田CBD会展中心#私人家庭影院#CocoPark福田皇岗口岸岗厦地铁 整套公寓 1室1卫1床\n", 109 | "148 ¥701 【水泥盒子】の【白露】【有狗】【有共享空间】 独立房间 1室1卫1床\n", 110 | "36 ¥187 【粉粉少女心+网红打卡】少女系单人房-会展中心CBD皇岗口岸福田口岸购物公园福田高铁站 独立房间 1室0.5卫1床\n", 111 | "61 ¥333 【理想的家】福田皇岗口岸/地铁口/近会展中心华强北,大社区整租 整套公寓 单间1卫1床\n", 112 | "121 ¥278 【宫遇】9E-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 
整套公寓 1室1卫1床\n", 113 | "40 ¥326 【Yuri Dream Hut 】京基100KKmall楼上/大剧院地铁站/罗湖口岸ins风温馨公寓 整套公寓 单间1卫1床\n", 114 | "233 ¥500 燈塔·時光 Clean warm and cozy place 整套公寓 1室1卫1床\n", 115 | "294 ¥742 Cozy Studio near Mongkok 整间阁楼 单间1卫1床\n", 116 | "105 ¥423 #寒舍Room 2#福田CBD下沙地铁口极简风,看海景观房,独立卫浴,私人房间,拍摄请另咨询 独立房间 1室1卫1床\n", 117 | "70 ¥271 抢手房源:香蜜湖复式美寓(双地铁)FuTian 整间阁楼 1室1卫1床\n", 118 | "70 ¥215 【白日梦蓝】罗湖口岸|巨幕投影|双地铁美食街上的白色幻想空间 整套公寓 1室1卫1床\n", 119 | "75 ¥451 福田 八卦岭 园岭地铁口 美食街旁 复式公寓整租--居心地设计民宿 整间阁楼 1室1卫2床\n", 120 | "194 ¥291 【宫遇】06-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 整套公寓 1室1卫1床\n", 121 | "73 ¥319 福田CBD岗夏地铁旁会展中心现代公寓 整套公寓 1室1卫1床\n", 122 | "86 ¥395 大梅沙180度海景工业风大床房,海滩旁,近东部华侨城 整套公寓 1室1卫1床\n", 123 | "104 ¥389 New & cosy modern 1BR 4pax 1min walk MTR 整套公寓 1室1卫2床\n", 124 | "49 ¥250 【艺术之家】福田中心区石厦地铁站时尚的画家居室 The painter's room 整套公寓 1室1卫1床\n", 125 | "38 ¥291 【宫遇】19-KKmall楼上一房一厅--【家庭影院】 整套公寓 1室1卫1床\n", 126 | "148 ¥291 【宫遇】31-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 整套公寓 1室1卫1床\n", 127 | "21 ¥291 【Ebnb】佛系日式网红一居!近会展中心购物公园近地铁福田皇岗口岸 整套公寓 1室1卫1床\n", 128 | "38 ¥326 机场地铁直达固戍站,Ins大空间品牌公寓 整套酒店式公寓 单间1卫1床\n", 129 | "32 ¥215 水贝珠宝园7号线地铁出口高端居家公寓 整套酒店式公寓 1室1卫1床\n", 130 | "18 ¥347 |卷儿· room1| 北欧现代混搭风 直达香港/罗湖口岸&火车站/老街双地铁口/万象城kkmall 整套公寓 1室1卫1床\n", 131 | "16 ¥347 [Studio Q-Air]深圳罗湖商圈/东门老街/地铁口/万象城/商务套房/直达香港/罗湖口岸 整套公寓 1室1卫1床\n", 132 | "37 ¥222 [Ein] 出差居家温馨大床房/近深圳东/东门/深圳北 整套公寓 1室1卫1床\n", 133 | "92 ¥319 【十二微邸】22Q-kkmall楼上的梦唤微天使 整套酒店式公寓 1室1卫1床\n", 134 | "44 ¥354 LADYMA |遇见 北欧混搭风 福田CBD会展中心」家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 135 | "75 ¥257 复式:福田区香蜜小筑(双地铁)ShenZhen FuTian 整间阁楼 1室1卫1床\n", 136 | "230 ¥201 SmileHouse超讚房東_陽光大床房115米平方~香港口岸5分鐘、石厦站2分鐘,如朋友热情招待 独立房间 1室1卫1床\n", 137 | "138 ¥333 (初见)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城大剧院D出口 整套公寓 1室1卫1床\n", 138 | "43 ¥215 溜达家法式风格 浴缸房单房整租 3号线地铁口 龙岗中心城大运中心万科广场深圳东 整套公寓 单间1卫1床\n", 139 | "20 ¥201 【NiteNite奈奈】7号地铁口 水贝独立阳光创意公寓 1min Sta.stylish room 整套公寓 单间1卫1床\n", 140 | "108 ¥347 【十二微邸】28H-KKmall楼上的心所微山水 整套公寓 1室1卫1床\n", 141 | "21 ¥326 深圳福田中心区與香港口岸附近的五星小居 整套公寓 1室1卫1床\n", 142 | "103 ¥284 「木夕」地铁口/九方购物中心楼上/深圳北站 /直达福田口岸 全新日式风公寓 整套公寓 1室1卫1床\n", 143 | "112 ¥319 (设计师Room)一分钟到地铁站/近世界之窗/万象天地/科技园北欧精致套房 整套公寓 1室1卫1床\n", 144 | "113 ¥291 【宫遇】18-KKmall楼上一房一厅--【黑胶之夜】 整套公寓 1室1卫1床\n", 145 | "53 ¥382 无间-海上世界别墅#BLUE#地铁站边#超大天台#独立卫浴阳台#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 146 | "54 ¥423 猫筑Aircat#岗厦/会展中心福田中心粉色北欧风配大投影高层公寓 整套公寓 1室1卫1床\n", 147 | "23 ¥215 【YOME空间】/ROOM1.木槿/九方购物中心/深圳北站,地铁直达福田口岸 独立房间 1室1卫1床\n", 148 | "155 ¥326 (云意)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城KKMALL大剧院D出口 整套公寓 1室1卫1床\n", 149 | "122 ¥229 【植意间·梦境D1】主卧带独立洗手间,1.8米大床,深圳北站,高铁19分钟直达香港,走路5分钟到地铁 独立房间 1室1卫1床\n", 150 | "72 ¥423 猫筑AirCat#高新园万象天地北欧风智能公寓配大投影及开放式厨房 整套公寓 1室1卫1床\n", 151 | "90 ¥278 福田中心温馨小家Q大街 整套公寓 1室1卫1床\n", 152 | "33 ¥430 【Sen’sHome】福田中心区/FuTianCBD/双地铁/阳光公寓/购物公园/会展中心/潮人首选 整套公寓 1室1卫1床\n", 153 | "110 ¥180 可拍照~ 小红书网红摆拍「Summerの民宿」西丽•366大街【有猫】 独立房间 1室1卫1床\n", 154 | "145 ¥333 【胖鸟公舍】南山、桃园双地铁口公寓!让你心安、庸懒、自在!倦鸟变胖鸟! 
整套公寓 1室1卫1床\n", 155 | "8 ¥319 【伊利昂】新房特惠上线 | 万象城 | kkmall | 老街美食中心 | 罗湖口岸 | 巨幕投影 整套公寓 1室1卫1床\n", 156 | "41 ¥291 Futian CBD nice apartment 福田CBD中心区双地铁口精致公寓 整套公寓 单间1卫1床\n", 157 | "98 ¥347 New【云邸】设计师风格现代高层公寓,靠近罗湖口岸万象城kkmall地王,去香港方便。 整套公寓 1室1卫1床\n", 158 | "111 ¥201 近机场北欧风格超大主卧(独立卫生间) 独立房间 1室2卫1床\n", 159 | "37 ¥215 【新故 深圳】双地铁口|美食街|巨幕投影|北欧空间|1-3人公寓 整套公寓 1室1卫2床\n", 160 | "100 ¥326 (流年)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城KKMALL大剧院D出口 整套公寓 1室1卫1床\n", 161 | "60 ¥361 明式新中式风格现代公寓 整套公寓 1室1卫1床\n", 162 | "94 ¥236 【10】★ ★ ★官方推荐★ ★ ★地铁站旁独立Loft套房,简单纯粹,清新方便,温馨浪漫 整间阁楼 1室1卫1床\n", 163 | "59 ¥451 猫筑AirCat#高新园万象天地粉色配大投影北欧风公寓 整套公寓 1室1卫1床\n", 164 | "37 ¥250 【粉粉少女心+网红打卡】大床房-会展中心CBD皇岗口岸福田口岸福田购物公园福田高铁站 独立房间 1室1卫1床\n", 165 | "66 ¥298 #栖息地•CityNest#罗湖东门老街/地铁口/简约北欧一居【simple n romantic】 整套房子 1室1卫1床\n", 166 | "57 ¥347 无间-海上世界别墅#PINK#地铁站边#超大天台#独立卫浴阳台#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 167 | "29 ¥305 AA购物公园旁石厦地铁口的私人工作室 整套公寓 1室1卫1床\n", 168 | "67 ¥368 Muji风格园景公寓,CBD中心区 整套公寓 1室1卫1床\n", 169 | "36 ¥215 会展中心新洲石厦沙尾北欧清新风格清爽小屋 整套公寓 1室1卫1床\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "from selenium import webdriver\n", 175 | "import time\n", 176 | "\n", 177 | "driver = webdriver.Firefox(executable_path = r'C:\\Users\\santostang\\Desktop\\geckodriver.exe')\n", 178 | "#把上述地址改成你电脑中geckodriver.exe程序的地址\n", 179 | "for i in range(0,5):\n", 180 | " link = \"https://zh.airbnb.com/s/Shenzhen--China/homes?items_offset=\" + str(i *18)\n", 181 | " driver.get(link)\n", 182 | " rent_list = driver.find_elements_by_css_selector('div._gig1e7')\n", 183 | "\n", 184 | " for eachhouse in rent_list:\n", 185 | " try:\n", 186 | " comment = eachhouse.find_element_by_css_selector('span._1cy09umr').text\n", 187 | " except:\n", 188 | " comment = 0\n", 189 | " price = eachhouse.find_element_by_css_selector('div._1yarz4r')\n", 190 | " price = price.text.replace(\"每晚\", \"\").replace(\"价格\", \"\").replace(\"\\n\", \"\")\n", 191 | " name = eachhouse.find_element_by_css_selector('div._vbshb6')\n", 192 | " name = name.text\n", 193 | " details = eachhouse.find_element_by_css_selector('span._14ksqu3j')\n", 194 | " details = details.text\n", 195 | " house_type = details.split(\" · \")[0]\n", 196 | " bed_number = details.split(\" · \")[1]\n", 197 | " print (comment, price, name, house_type, bed_number)\n", 198 | " time.sleep(5)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.6.5" 226 | }, 227 | "toc": { 228 | "base_numbering": 1, 229 | "nav_menu": { 230 | "height": "12px", 231 | "width": "252px" 232 | }, 233 | "number_sections": true, 234 | "sideBar": true, 235 | "skip_h1_title": false, 236 | "title_cell": "Table of Contents", 237 | "title_sidebar": "Contents", 238 | "toc_cell": false, 239 | "toc_position": {}, 240 | "toc_section_display": "block", 241 | "toc_window_display": false 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 2 246 | } 247 | -------------------------------------------------------------------------------- /第二版/Cha 4 -动态网页抓取/Cha 4 _自我实践(章末).ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2018-11-18T08:53:29.428181Z", 9 | "start_time": "2018-11-18T08:53:15.439607Z" 10 | } 11 | }, 12 | "outputs": [ 13 | { 14 | "name": "stdout", 15 | "output_type": "stream", 16 | "text": [ 17 | "180 ¥288 【宫遇】17-KKmall楼上一房一厅--【Loft时代】 整套公寓 1室1卫1床\n", 18 | "99 ¥218 推荐:香蜜湖温馨公寓(双地铁)ShenZhen FuTian 整套酒店式公寓 1室1卫1床\n", 19 | "87 ¥167 Loire花房公寓 合住房间 1室1卫1床\n", 20 | "82 ¥371 LADYMA |原宿 摩洛哥风格 福田CBD会展中心#家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 21 | "150 ¥318 【十二微邸】14J-KKmall楼上的城市微魔方 整套公寓 1室1卫1床\n", 22 | "153 ¥318 【十二微邸】32Q-KKmall楼上的天空微城堡 整套公寓 1室1卫1床\n", 23 | "69 ¥318 【猫薄荷】罗湖口岸|双地铁口|美食一街|巨幕投影|泡泡吊椅|Loft| 整间阁楼 1室1卫1床\n", 24 | "152 ¥435 近市中心舒适温馨的GAO's HOME 整套公寓 1室1卫1床\n", 25 | "153 ¥162 深圳北站 大床房 直达香港口岸/出差首选 温馨舒适北欧风公寓 独立房间 1室2卫1床\n", 26 | "298 ¥443 小屋子和它的大陽台 整套公寓 单间1卫1床\n", 27 | "68 ¥266 【暖居·小小】 深圳福田区 迷你新居 北欧简约风 下梅林地铁A出口步行1500米左右 整套公寓 1室1卫1床\n", 28 | "52 ¥580 无间-海上世界别墅#BLACK#地铁站边#超大天台#独立卫浴阳台#超大浴缸#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 29 | "20 ¥349 [Misa’s house] 老街地铁口/kkmall万象城罗湖口岸/ins风商务房/直达香港!北欧 整套公寓 1室1卫2床\n", 30 | "64 ¥239 福原居-Luck House 榻榻米高层观景套间-福田口岸 独立房间 3室2卫1床\n", 31 | "31 ¥298 樱空 日式loft公寓 近罗湖口岸国贸金光华万象城东门 好住好逛又好吃 整间阁楼 1室1卫1床\n", 32 | "135 ¥338 【白日梦蓝】美食街|loft|罗湖口岸|双地铁|巨幕影院 | 游泳池1-3人公寓 整套公寓 1室1卫1床\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "from selenium import webdriver\n", 38 | "\n", 39 | "fp = webdriver.FirefoxProfile()\n", 40 | "fp.set_preference(\"permissions.default.stylesheet\",2)\n", 41 | "fp.set_preference(\"permissions.default.image\",2)\n", 42 | "fp.set_preference(\"javascript.enabled\", False)\n", 43 | "\n", 44 | "driver = webdriver.Firefox(firefox_profile=fp, executable_path = r'C:\\Users\\santostang\\Desktop\\geckodriver.exe')\n", 45 | "#把上述地址改成你电脑中geckodriver.exe程序的地址\n", 46 | "#在虚拟浏览器中打开 Airbnb 页面\n", 47 | "driver.get(\"https://zh.airbnb.com/s/Shenzhen--China/homes\")\n", 48 | "\n", 49 | "#找到页面中所有的出租房\n", 50 | "rent_list = driver.find_elements_by_css_selector('div._gig1e7')\n", 51 | "\n", 52 | "#对于每一个出租房\n", 53 | "for eachhouse in rent_list:\n", 54 | " #找到评论数量\n", 55 | " try:\n", 56 | " comment = eachhouse.find_element_by_css_selector('span._1cy09umr')\n", 57 | " comment = comment.text\n", 58 | " except:\n", 59 | " comment = 0\n", 60 | " \n", 61 | " #找到价格\n", 62 | " price = eachhouse.find_element_by_css_selector('div._1yarz4r')\n", 63 | " price = price.text.replace(\"每晚\", \"\").replace(\"价格\", \"\").replace(\"\\n\", \"\")\n", 64 | " \n", 65 | " #找到名称\n", 66 | " name = eachhouse.find_element_by_css_selector('div._vbshb6')\n", 67 | " name = name.text\n", 68 | " \n", 69 | " #找到房屋类型,大小\n", 70 | " details = eachhouse.find_element_by_css_selector('span._14ksqu3j')\n", 71 | " details = details.text\n", 72 | " house_type = details.split(\" · \")[0]\n", 73 | " bed_number = details.split(\" · \")[1]\n", 74 | " print (comment, price, name, house_type, bed_number)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": { 81 | "ExecuteTime": { 82 | "end_time": "2018-11-18T09:09:30.981400Z", 83 | "start_time": "2018-11-18T09:08:36.398046Z" 84 | } 85 | }, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "180 $42 【宫遇】17-KKmall楼上一房一厅--【Loft时代】 整套公寓 1室1卫1床\n", 92 | "99 $31 推荐:香蜜湖温馨公寓(双地铁)ShenZhen FuTian 整套酒店式公寓 1室1卫1床\n", 93 | "87 $24 Loire花房公寓 合住房间 1室1卫1床\n", 94 | "82 $53 LADYMA |原宿 摩洛哥风格 
福田CBD会展中心#家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 95 | "150 $46 【十二微邸】14J-KKmall楼上的城市微魔方 整套公寓 1室1卫1床\n", 96 | "153 $46 【十二微邸】32Q-KKmall楼上的天空微城堡 整套公寓 1室1卫1床\n", 97 | "69 $46 【猫薄荷】罗湖口岸|双地铁口|美食一街|巨幕投影|泡泡吊椅|Loft| 整间阁楼 1室1卫1床\n", 98 | "152 $63 近市中心舒适温馨的GAO's HOME 整套公寓 1室1卫1床\n", 99 | "153 $23 深圳北站 大床房 直达香港口岸/出差首选 温馨舒适北欧风公寓 独立房间 1室2卫1床\n", 100 | "68 $38 【暖居·小小】 深圳福田区 迷你新居 北欧简约风 下梅林地铁A出口步行1500米左右 整套公寓 1室1卫1床\n", 101 | "298 $64 小屋子和它的大陽台 整套公寓 单间1卫1床\n", 102 | "52 $84 无间-海上世界别墅#BLACK#地铁站边#超大天台#独立卫浴阳台#超大浴缸#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 103 | "20 $50 [Misa’s house] 老街地铁口/kkmall万象城罗湖口岸/ins风商务房/直达香港!北欧 整套公寓 1室1卫2床\n", 104 | "64 $34 福原居-Luck House 榻榻米高层观景套间-福田口岸 独立房间 3室2卫1床\n", 105 | "31 $43 樱空 日式loft公寓 近罗湖口岸国贸金光华万象城东门 好住好逛又好吃 整间阁楼 1室1卫1床\n", 106 | "135 $49 【白日梦蓝】美食街|loft|罗湖口岸|双地铁|巨幕影院 | 游泳池1-3人公寓 整套公寓 1室1卫1床\n", 107 | "48 ¥322 【双子座】ins少女风|罗湖口岸|东门老街|泡泡池|巨幕投影 整套公寓 单间1卫1床\n", 108 | "44 ¥318 【克洛偌斯】新房特惠上线 | 万象城 | kkmall | 老街美食中心 | 罗湖口岸 | 巨幕投影 整套公寓 1室1卫1床\n", 109 | "68 ¥331 LADYMA |念念 现代公寓福田CBD会展中心#私人家庭影院#CocoPark福田皇岗口岸岗厦地铁 整套公寓 1室1卫1床\n", 110 | "148 ¥698 【水泥盒子】の【白露】【有狗】【有共享空间】 独立房间 1室1卫1床\n", 111 | "36 ¥188 【粉粉少女心+网红打卡】少女系单人房-会展中心CBD皇岗口岸福田口岸购物公园福田高铁站 独立房间 1室0.5卫1床\n", 112 | "61 ¥336 【理想的家】福田皇岗口岸/地铁口/近会展中心华强北,大社区整租 整套公寓 单间1卫1床\n", 113 | "121 ¥280 【宫遇】9E-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 整套公寓 1室1卫1床\n", 114 | "40 ¥328 【Yuri Dream Hut 】京基100KKmall楼上/大剧院地铁站/罗湖口岸ins风温馨公寓 整套公寓 单间1卫1床\n", 115 | "233 ¥498 燈塔·時光 Clean warm and cozy place 整套公寓 1室1卫1床\n", 116 | "294 ¥744 Cozy Studio near Mongkok 整间阁楼 单间1卫1床\n", 117 | "105 ¥420 #寒舍Room 2#福田CBD下沙地铁口极简风,看海景观房,独立卫浴,私人房间,拍摄请另咨询 独立房间 1室1卫1床\n", 118 | "70 ¥268 抢手房源:香蜜湖复式美寓(双地铁)FuTian 整间阁楼 1室1卫1床\n", 119 | "70 ¥214 【白日梦蓝】罗湖口岸|巨幕投影|双地铁美食街上的白色幻想空间 整套公寓 1室1卫1床\n", 120 | "75 ¥449 福田 八卦岭 园岭地铁口 美食街旁 复式公寓整租--居心地设计民宿 整间阁楼 1室1卫2床\n", 121 | "73 ¥320 福田CBD岗夏地铁旁会展中心现代公寓 整套公寓 1室1卫1床\n", 122 | "148 ¥288 【宫遇】31-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 整套公寓 1室1卫1床\n", 123 | "86 ¥398 大梅沙180度海景工业风大床房,海滩旁,近东部华侨城 整套公寓 1室1卫1床\n", 124 | "104 ¥390 New & cosy modern 1BR 4pax 1min walk MTR 整套公寓 1室1卫2床\n", 125 | "49 ¥250 【艺术之家】福田中心区石厦地铁站时尚的画家居室 The painter's room 整套公寓 1室1卫1床\n", 126 | "38 ¥288 【宫遇】19-KKmall楼上一房一厅--【家庭影院】 整套公寓 1室1卫1床\n", 127 | "194 ¥288 【宫遇】06-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 整套公寓 1室1卫1床\n", 128 | "21 ¥288 【Ebnb】佛系日式网红一居!近会展中心购物公园近地铁福田皇岗口岸 整套公寓 1室1卫1床\n", 129 | "38 ¥328 机场地铁直达固戍站,Ins大空间品牌公寓 整套酒店式公寓 单间1卫1床\n", 130 | "32 ¥218 水贝珠宝园7号线地铁出口高端居家公寓 整套酒店式公寓 1室1卫1床\n", 131 | "18 ¥347 |卷儿· room1| 北欧现代混搭风 直达香港/罗湖口岸&火车站/老街双地铁口/万象城kkmall 整套公寓 1室1卫1床\n", 132 | "16 ¥350 [Studio Q-Air]深圳罗湖商圈/东门老街/地铁口/万象城/商务套房/直达香港/罗湖口岸 整套公寓 1室1卫1床\n", 133 | "37 ¥220 [Ein] 出差居家温馨大床房/近深圳东/东门/深圳北 整套公寓 1室1卫1床\n", 134 | "108 ¥348 【十二微邸】28H-KKmall楼上的心所微山水 整套公寓 1室1卫1床\n", 135 | "44 ¥351 LADYMA |遇见 北欧混搭风 福田CBD会展中心」家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 136 | "75 ¥258 复式:福田区香蜜小筑(双地铁)ShenZhen FuTian 整间阁楼 1室1卫1床\n", 137 | "230 ¥199 SmileHouse超讚房東_陽光大床房115米平方~香港口岸5分鐘、石厦站2分鐘,如朋友热情招待 独立房间 1室1卫1床\n", 138 | "138 ¥336 (初见)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城大剧院D出口 整套公寓 1室1卫1床\n", 139 | "43 ¥218 溜达家法式风格 浴缸房单房整租 3号线地铁口 龙岗中心城大运中心万科广场深圳东 整套公寓 单间1卫1床\n", 140 | "20 ¥198 【NiteNite奈奈】7号地铁口 水贝独立阳光创意公寓 1min Sta.stylish room 整套公寓 单间1卫1床\n", 141 | "92 ¥318 【十二微邸】22Q-kkmall楼上的梦唤微天使 整套酒店式公寓 1室1卫1床\n", 142 | "21 ¥328 深圳福田中心区與香港口岸附近的五星小居 整套公寓 1室1卫1床\n", 143 | "103 ¥282 「木夕」地铁口/九方购物中心楼上/深圳北站 /直达福田口岸 全新日式风公寓 整套公寓 1室1卫1床\n", 144 | "112 ¥318 (设计师Room)一分钟到地铁站/近世界之窗/万象天地/科技园北欧精致套房 整套公寓 1室1卫1床\n", 145 | "113 ¥288 【宫遇】18-KKmall楼上一房一厅--【黑胶之夜】 整套公寓 1室1卫1床\n", 146 | "23 ¥218 【YOME空间】/ROOM1.木槿/九方购物中心/深圳北站,地铁直达福田口岸 独立房间 1室1卫1床\n", 147 | "53 ¥380 
无间-海上世界别墅#BLUE#地铁站边#超大天台#独立卫浴阳台#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 148 | "54 ¥420 猫筑Aircat#岗厦/会展中心福田中心粉色北欧风配大投影高层公寓 整套公寓 1室1卫1床\n", 149 | "155 ¥328 (云意)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城KKMALL大剧院D出口 整套公寓 1室1卫1床\n", 150 | "122 ¥228 【植意间·梦境D1】主卧带独立洗手间,1.8米大床,深圳北站,高铁19分钟直达香港,走路5分钟到地铁 独立房间 1室1卫1床\n", 151 | "72 ¥420 猫筑AirCat#高新园万象天地北欧风智能公寓配大投影及开放式厨房 整套公寓 1室1卫1床\n", 152 | "90 ¥275 福田中心温馨小家Q大街 整套公寓 1室1卫1床\n", 153 | "110 ¥178 可拍照~ 小红书网红摆拍「Summerの民宿」西丽•366大街【有猫】 独立房间 1室1卫1床\n", 154 | "33 ¥429 【Sen’sHome】福田中心区/FuTianCBD/双地铁/阳光公寓/购物公园/会展中心/潮人首选 整套公寓 1室1卫1床\n", 155 | "111 ¥199 近机场北欧风格超大主卧(独立卫生间) 独立房间 1室2卫1床\n", 156 | "8 ¥318 【伊利昂】新房特惠上线 | 万象城 | kkmall | 老街美食中心 | 罗湖口岸 | 巨幕投影 整套公寓 1室1卫1床\n", 157 | "41 ¥289 Futian CBD nice apartment 福田CBD中心区双地铁口精致公寓 整套公寓 单间1卫1床\n", 158 | "98 ¥350 New【云邸】设计师风格现代高层公寓,靠近罗湖口岸万象城kkmall地王,去香港方便。 整套公寓 1室1卫1床\n", 159 | "145 ¥330 【胖鸟公舍】南山、桃园双地铁口公寓!让你心安、庸懒、自在!倦鸟变胖鸟! 整套公寓 1室1卫1床\n", 160 | "100 ¥328 (流年)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城KKMALL大剧院D出口 整套公寓 1室1卫1床\n", 161 | "60 ¥359 明式新中式风格现代公寓 整套公寓 1室1卫1床\n", 162 | "37 ¥218 【新故 深圳】双地铁口|美食街|巨幕投影|北欧空间|1-3人公寓 整套公寓 1室1卫2床\n", 163 | "94 ¥233 【10】★ ★ ★官方推荐★ ★ ★地铁站旁独立Loft套房,简单纯粹,清新方便,温馨浪漫 整间阁楼 1室1卫1床\n", 164 | "59 ¥450 猫筑AirCat#高新园万象天地粉色配大投影北欧风公寓 整套公寓 1室1卫1床\n", 165 | "37 ¥248 【粉粉少女心+网红打卡】大床房-会展中心CBD皇岗口岸福田口岸福田购物公园福田高铁站 独立房间 1室1卫1床\n", 166 | "66 ¥299 #栖息地•CityNest#罗湖东门老街/地铁口/简约北欧一居【simple n romantic】 整套房子 1室1卫1床\n", 167 | "57 ¥348 无间-海上世界别墅#PINK#地铁站边#超大天台#独立卫浴阳台#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 168 | "29 ¥308 AA购物公园旁石厦地铁口的私人工作室 整套公寓 1室1卫1床\n", 169 | "67 ¥368 Muji风格园景公寓,CBD中心区 整套公寓 1室1卫1床\n", 170 | "22 ¥318 #限时特价# 福田中心整租北欧风公寓 双口岸房源 近会展中心 交通便利 整套公寓 1室1卫1床\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "from selenium import webdriver\n", 176 | "import time\n", 177 | "\n", 178 | "fp = webdriver.FirefoxProfile()\n", 179 | "fp.set_preference(\"permissions.default.stylesheet\",2)\n", 180 | "fp.set_preference(\"permissions.default.image\",2)\n", 181 | "fp.set_preference(\"javascript.enabled\", False)\n", 182 | "\n", 183 | "driver = webdriver.Firefox(firefox_profile=fp, executable_path = r'C:\\Users\\santostang\\Desktop\\geckodriver.exe')\n", 184 | "#把上述地址改成你电脑中geckodriver.exe程序的地址\n", 185 | "for i in range(0,5):\n", 186 | " link = \"https://zh.airbnb.com/s/Shenzhen--China/homes?items_offset=\" + str(i *18)\n", 187 | " #在虚拟浏览器中打开 Airbnb 页面\n", 188 | " driver.get(link)\n", 189 | "\n", 190 | " #找到页面中所有的出租房\n", 191 | " rent_list = driver.find_elements_by_css_selector('div._gig1e7')\n", 192 | "\n", 193 | " #对于每一个出租房\n", 194 | " for eachhouse in rent_list:\n", 195 | " #找到评论数量\n", 196 | " try:\n", 197 | " comment = eachhouse.find_element_by_css_selector('span._1cy09umr')\n", 198 | " comment = comment.text\n", 199 | " except:\n", 200 | " comment = 0\n", 201 | "\n", 202 | " #找到价格\n", 203 | " price = eachhouse.find_element_by_css_selector('div._1yarz4r')\n", 204 | " price = price.text.replace(\"每晚\", \"\").replace(\"价格\", \"\").replace(\"\\n\", \"\")\n", 205 | " #找到名称\n", 206 | " name = eachhouse.find_element_by_css_selector('div._vbshb6')\n", 207 | " name = name.text\n", 208 | "\n", 209 | " #找到房屋类型,大小\n", 210 | " details = eachhouse.find_element_by_css_selector('span._14ksqu3j')\n", 211 | " details = details.text\n", 212 | " house_type = details.split(\" · \")[0]\n", 213 | " bed_number = details.split(\" · \")[1]\n", 214 | " print (comment, price, name, house_type, bed_number)\n", 215 | " time.sleep(5)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [] 224 | } 225 | ], 226 | "metadata": { 227 | 
"kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 3 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython3", 242 | "version": "3.6.5" 243 | }, 244 | "toc": { 245 | "base_numbering": 1, 246 | "nav_menu": { 247 | "height": "12px", 248 | "width": "252px" 249 | }, 250 | "number_sections": true, 251 | "sideBar": true, 252 | "skip_h1_title": false, 253 | "title_cell": "Table of Contents", 254 | "title_sidebar": "Contents", 255 | "toc_cell": false, 256 | "toc_position": {}, 257 | "toc_section_display": "block", 258 | "toc_window_display": false 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 2 263 | } 264 | -------------------------------------------------------------------------------- /第二版/Cha 4 -动态网页抓取/geckodriver.log: -------------------------------------------------------------------------------- 1 | 1546605469345 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.LX5w1iAQRrji" 2 | 1546605470591 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 3 | 1546605470592 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 4 | 1546605472624 Marionette INFO Listening on port 50416 5 | 1546605472634 Marionette WARN TLS certificate errors will be ignored for this session 6 | [Parent 10876, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 7 | [Child 21820, Chrome_ChildThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 8 | [Chi[Parent 10876, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 9 | 1546605476628 Marionette INFO Stopped listening on port 50416 10 | 11 | ###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost 12 | 13 | [GPU 16888, Chrome_Chil 14 | ###!!! 
[Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv 15 | 16 | 1546605697589 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.VodZOZ4kKVAJ" 17 | 1546605698039 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 18 | 1546605698039 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 19 | 1546605699208 Marionette INFO Listening on port 51255 20 | 1546605699254 Marionette WARN TLS certificate errors will be ignored for this session 21 | [Parent 8420, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 22 | [Child 19772, Chrome_ChildThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 23 | [Child 1[Parent 8420, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 24 | [Child 21788, Chrome_ChildThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 25 | [Child 21546605721892 Marionette INFO Stopped listening on port 51255 26 | [GPU 19152, Chrom 27 | ###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv 28 | 29 | 1546606494576 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.BmewmWo0dctd" 30 | 1546606495625 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 31 | 1546606495625 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 32 | 1546606497276 Marionette INFO Listening on port 54937 33 | 1546606497317 Marionette WARN TLS certificate errors will be ignored for this session 34 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 35 | [Parent 21360, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 36 | 1546606667118 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.Gr2Jrk94qOPT" 37 | 1546606667538 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 38 | 1546606667538 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 39 | 1546606668932 Marionette INFO Listening on port 55954 40 | 1546606669244 Marionette WARN TLS certificate errors will be ignored for this session 41 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 42 | [Parent 
8088, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 43 | [Parent 8088, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 44 | [Parent 8088, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 45 | [Child 9196, Chrome_ChildThread] WARNING: pipe error: 109: [Child 2800, Chrome_Cfile z:/build/build/src/ipc/chromium/src/chrome/commhildThread] WARNING: pipe error: 109: file z:/build/buon/ipc_channelild/src_/ipc/chromium/src/chrome/commowin.cc, line 346 46 | n/ipc_channel_win.cc, line 346 47 | [Child 2800, C[Parent 8088, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 48 | [Child 19576, Chrome_ChildThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 49 | [Child 19576,1546606870264 Marionette INFO Stopped listening on port 55954 50 | 51 | ###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost 52 | 53 | [GPU 8032, Ch 54 | ###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv 55 | 56 | 1546606878275 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.7cBNPdUaoF0k" 57 | 1546606878633 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 58 | 1546606878633 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 59 | 1546606879751 Marionette INFO Listening on port 57143 60 | 1546606879906 Marionette WARN TLS certificate errors will be ignored for this session 61 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 62 | [Parent 14016, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 63 | 1546606964288 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.8fVzEunbmF0C" 64 | 1546606964670 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 65 | 1546606964670 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 66 | 1546606966358 Marionette INFO Listening on port 57918 67 | 1546606966430 Marionette WARN TLS certificate errors will be ignored for this session 68 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 69 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 70 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file 
z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 71 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 72 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 73 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 74 | console.error: BroadcastService: 75 | receivedBroadcastMessage: handler for 76 | remote-settings/monitor_changes 77 | threw error: 78 | Message: Error: Polling for changes failed: NetworkError when attempting to fetch resource.. 79 | Stack: 80 | remoteSettingsFunction/remoteSettings.pollChanges@resource://services-settings/remote-settings.js:750:13 81 | 82 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 83 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 84 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 85 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 86 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 87 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 88 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 89 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 90 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 91 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 92 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 93 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 94 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 95 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 96 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 97 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 98 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 99 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 100 | 
JavaScript error: resource://gre/modules/WebProgressChild.jsm, line 58: TypeError: this.mm.content is null 101 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 102 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 103 | [Child 5104, Chrome_ChildThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/[cChild 16864, Chrome_ChildThread] ommon/ipc_channel_win.cc, line 346 104 | WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 105 | [Child 16861546608956897 Marionette INFO Stopped listening on port 57918 106 | 107 | ###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost 108 | 109 | [GPU 9868, C 110 | ###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv 111 | 112 | -------------------------------------------------------------------------------- /第二版/Cha 6 -数据储存/Cha 6 -数据存储.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 6.1\t基本存储:存储至txt或csv" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 6.1.1把数据存储至txt" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 14, 20 | "metadata": { 21 | "ExecuteTime": { 22 | "end_time": "2018-11-18T14:30:19.089018Z", 23 | "start_time": "2018-11-18T14:30:19.086009Z" 24 | } 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "title = \"This is a test sentence.\"\n", 29 | "with open(r'C:\\Users\\santostang\\Desktop\\title.txt', \"a+\") as f:\n", 30 | " f.write(title)\n", 31 | " f.close()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 15, 37 | "metadata": { 38 | "ExecuteTime": { 39 | "end_time": "2018-11-18T14:30:19.751164Z", 40 | "start_time": "2018-11-18T14:30:19.747656Z" 41 | } 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "output = '\\t'.join(['name','title','age','gender'])\n", 46 | "with open('C:\\\\Users\\\\santostang\\\\desktop\\\\test.txt', \"a+\") as f:\n", 47 | " f.write(output)\n", 48 | " f.close()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 17, 54 | "metadata": { 55 | "ExecuteTime": { 56 | "end_time": "2018-11-18T14:30:38.210437Z", 57 | "start_time": "2018-11-18T14:30:38.198405Z" 58 | } 59 | }, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "This is a test sentence.\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "with open(r'C:\\Users\\santostang\\Desktop\\title.txt', \"r\", encoding ='utf-8') as f:\n", 71 | " result = f.read()\n", 72 | " print (result)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 20, 78 | "metadata": { 79 | "ExecuteTime": { 80 | "end_time": "2018-11-18T14:32:10.779825Z", 81 | "start_time": "2018-11-18T14:32:10.776817Z" 82 | } 83 | }, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "['This is a test sentence.', 'This is the second test sentence.', 'This is the third test sentence.']\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "with open(r'C:\\Users\\santostang\\Desktop\\title.txt', \"r\", encoding ='utf-8') as f:\n", 95 | " result = f.read().splitlines()\n", 96 | " print (result)" 97 | ] 98 | }, 99 | { 100 | 
"cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "## 6.1.2把数据存储至csv" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 3, 109 | "metadata": { 110 | "ExecuteTime": { 111 | "end_time": "2018-11-18T14:43:09.018560Z", 112 | "start_time": "2018-11-18T14:43:09.014551Z" 113 | } 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "['\\ufeffA1', 'B1', 'C1', 'D1']\n", 121 | "A1\n", 122 | "['A2', 'B2', 'C2', 'D2']\n", 123 | "A2\n", 124 | "['A3', 'B3', 'C3', 'D3']\n", 125 | "A3\n", 126 | "['A4', 'B4', 'C4', 'D4']\n", 127 | "A4\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "import csv\n", 133 | "with open('test.csv', 'r',encoding='utf-8') as csvfile:\n", 134 | " csv_reader = csv.reader(csvfile)\n", 135 | " for row in csv_reader:\n", 136 | " print(row)\n", 137 | " print(row[0])" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 4, 143 | "metadata": { 144 | "ExecuteTime": { 145 | "end_time": "2018-11-18T14:43:35.680232Z", 146 | "start_time": "2018-11-18T14:43:35.676723Z" 147 | } 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "import csv\n", 152 | "output_list = ['1', '2','3','4']\n", 153 | "with open('test2.csv', 'a+', encoding='utf-8', newline='') as csvfile:\n", 154 | " w = csv.writer(csvfile)\n", 155 | " w.writerow(output_list)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## 6.2 储存至MySQL数据库" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## 6.2.3 Python操作MySQL数据库" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 11, 175 | "metadata": { 176 | "ExecuteTime": { 177 | "end_time": "2018-11-25T15:24:14.804439Z", 178 | "start_time": "2018-11-25T15:24:14.764978Z" 179 | } 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "import pymysql\n", 184 | " \n", 185 | "# 打开数据库连接\n", 186 | "db = pymysql.connect(\"localhost\",\"root\",\"password\",\"scraping\" )\n", 187 | " \n", 188 | "# 使用cursor()方法获取操作游标 \n", 189 | "cursor = db.cursor()\n", 190 | " \n", 191 | "# SQL 插入语句\n", 192 | "sql = \"\"\"INSERT INTO urls (url, content) VALUES ('www.baidu.com', 'This is content.')\"\"\"\n", 193 | "try:\n", 194 | " # 执行sql语句\n", 195 | " cursor.execute(sql)\n", 196 | " # 提交到数据库执行\n", 197 | " db.commit()\n", 198 | "except:\n", 199 | " # 如果发生错误则回滚\n", 200 | " db.rollback()\n", 201 | "# 关闭数据库连接\n", 202 | "db.close()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 17, 208 | "metadata": { 209 | "ExecuteTime": { 210 | "end_time": "2018-11-28T15:14:41.464834Z", 211 | "start_time": "2018-11-28T15:14:40.915500Z" 212 | } 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "import requests\n", 217 | "from bs4 import BeautifulSoup\n", 218 | "import pymysql\n", 219 | "\n", 220 | "db = pymysql.connect(\"localhost\",\"root\",\"password\",\"scraping\" )\n", 221 | "cursor = db.cursor()\n", 222 | "\n", 223 | "link = \"http://www.santostang.com/\"\n", 224 | "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}\n", 225 | "r = requests.get(link, headers= headers)\n", 226 | "\n", 227 | "soup = BeautifulSoup(r.text, \"lxml\")\n", 228 | "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n", 229 | "for eachone in title_list:\n", 230 | " url = eachone.a['href']\n", 231 | " title = eachone.a.text.strip()\n", 232 | " cursor.execute(\"INSERT INTO urls (url, content) 
VALUES (%s, %s)\", (url, title))\n", 233 | " \n", 234 | "db.commit()\n", 235 | "db.close()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "# 6.3 储存至MongoDB数据库" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## 6.3.3 Python操作MongoDB数据库" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 18, 255 | "metadata": { 256 | "ExecuteTime": { 257 | "end_time": "2018-11-28T16:22:17.163745Z", 258 | "start_time": "2018-11-28T16:22:17.046880Z" 259 | } 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "from pymongo import MongoClient\n", 264 | "client = MongoClient('localhost',27017)\n", 265 | "db = client.blog_database\n", 266 | "collection = db.blog" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 19, 272 | "metadata": { 273 | "ExecuteTime": { 274 | "end_time": "2018-11-28T16:23:15.311100Z", 275 | "start_time": "2018-11-28T16:23:15.018597Z" 276 | } 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "import requests\n", 281 | "import datetime\n", 282 | "from bs4 import BeautifulSoup\n", 283 | "from pymongo import MongoClient\n", 284 | "\n", 285 | "client = MongoClient('localhost',27017)\n", 286 | "db = client.blog_database\n", 287 | "collection = db.blog\n", 288 | "\n", 289 | "link = \"http://www.santostang.com/\"\n", 290 | "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n", 291 | "r = requests.get(link, headers= headers)\n", 292 | "\n", 293 | "soup = BeautifulSoup(r.text, \"lxml\")\n", 294 | "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n", 295 | "for eachone in title_list:\n", 296 | " url = eachone.a['href']\n", 297 | " title = eachone.a.text.strip()\n", 298 | " post = {\"url\": url,\n", 299 | " \"title\": title,\n", 300 | " \"date\": datetime.datetime.utcnow()}\n", 301 | " collection.insert_one(post)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "collapsed": true 309 | }, 310 | "outputs": [], 311 | "source": [] 312 | } 313 | ], 314 | "metadata": { 315 | "kernelspec": { 316 | "display_name": "Python 3", 317 | "language": "python", 318 | "name": "python3" 319 | }, 320 | "language_info": { 321 | "codemirror_mode": { 322 | "name": "ipython", 323 | "version": 3 324 | }, 325 | "file_extension": ".py", 326 | "mimetype": "text/x-python", 327 | "name": "python", 328 | "nbconvert_exporter": "python", 329 | "pygments_lexer": "ipython3", 330 | "version": "3.6.5" 331 | }, 332 | "toc": { 333 | "base_numbering": 1, 334 | "nav_menu": {}, 335 | "number_sections": true, 336 | "sideBar": true, 337 | "skip_h1_title": false, 338 | "title_cell": "Table of Contents", 339 | "title_sidebar": "Contents", 340 | "toc_cell": false, 341 | "toc_position": {}, 342 | "toc_section_display": true, 343 | "toc_window_display": false 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 2 348 | } 349 | -------------------------------------------------------------------------------- /第二版/Cha 6 -数据储存/test.csv: -------------------------------------------------------------------------------- 1 | A1,B1,C1,D1 2 | A2,B2,C2,D2 3 | A3,B3,C3,D3 4 | A4,B4,C4,D4 5 | -------------------------------------------------------------------------------- /第二版/Cha 6 -数据储存/test2.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4 2 | 
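The Cha 6 notebook above only inserts into MySQL and MongoDB; a quick way to confirm those writes is to read the rows and documents back. The following is a minimal sketch (not part of the book's code), assuming the same connection values used in the notebook: "localhost", "root", "password", the "scraping" database with its "urls" table, and the "blog_database.blog" collection.

```python
# Minimal read-back check for the Cha 6 storage examples (assumed, not from the book).
import pymysql
from pymongo import MongoClient

# MySQL: select the rows written by the notebook's INSERT statements
db = pymysql.connect(host="localhost", user="root",
                     password="password", database="scraping")
cursor = db.cursor()
cursor.execute("SELECT url, content FROM urls")
for url, content in cursor.fetchall():
    print(url, content)
db.close()

# MongoDB: list a few of the documents written by collection.insert_one()
client = MongoClient('localhost', 27017)
for post in client.blog_database.blog.find().limit(5):
    print(post['url'], post['title'])
```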
-------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__init__.py -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class FinancespiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | link = scrapy.Field() 16 | content = scrapy.Field() 17 | time = scrapy.Field() 18 | comment = scrapy.Field() 19 | discuss = scrapy.Field() 20 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class 
FinancespiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class FinancespiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class FinancespiderPipeline(object): 10 | #填入你的地址 11 | file_path = "C:/Users/santostang/Desktop/financeSpider/result.txt" 12 | 13 | def __init__(self): 14 | self.article = open(self.file_path, "a+", encoding="utf-8") 15 | 16 | #定义管道的处理方法 17 | def process_item(self, item, spider): 18 | title = item["title"] 19 | link = item["link"] 20 | content = item["content"] 21 | time = item["time"] 22 | comment = item["comment"] 23 | discuss = item["discuss"] 24 | output = title + '\t' + link + '\t' + time + '\t' + comment + '\t' + discuss + '\t' + content + '\n\n' 25 | self.article.write(output) 26 | return item 27 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for financeSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'financeSpider' 13 | 14 | SPIDER_MODULES = ['financeSpider.spiders'] 15 | NEWSPIDER_MODULE = 'financeSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'financeSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'financeSpider.middlewares.FinancespiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'financeSpider.middlewares.FinancespiderDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'financeSpider.pipelines.FinancespiderPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/__pycache__/finance.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/__pycache__/finance.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/finance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from financeSpider.items import FinancespiderItem 5 | 6 | class FinanceSpider(scrapy.Spider): 7 | name = 'finance' 8 | allowed_domains = ['finance.eastmoney.com'] 9 | start_urls = ['http://finance.eastmoney.com/news/cywjh_1.html'] 10 | url_head = 'http://finance.eastmoney.com/news/cywjh_' 11 | url_end = '.html' 12 | 13 | # Scrapy自带功能,从start_requests开始发送请求 14 | def start_requests(self): 15 | #获取前三页的url地址 16 | for i in range(1,4): 17 | url = self.url_head + str(i) + self.url_end 18 | print ("当前的页面是:", url) 19 | # 对新闻列表页发送Request请求 20 | yield scrapy.Request(url=url, callback = self.parse) 21 | 22 | def parse(self, response): 23 | soup = BeautifulSoup(response.text, "lxml") 24 | article_list = soup.find_all("div", class_="text") 25 | for i in range(len(article_list)): 26 | # 将数据封装到FinancespiderItem对象,字典类型数据 27 | item = FinancespiderItem() 28 | title = article_list[i].find("p", class_="title").a.text.strip() 29 | link = article_list[i].find("p", class_="title").a["href"] 30 | time = article_list[i].find("p", class_="time").text.strip() 31 | # 变成字典 32 | item["title"] = title 33 | item["link"] = link 34 | item["time"] = time 35 | # 根据文章链接,发送Request请求,并传递item参数 36 | yield scrapy.Request(url=link, meta = {'item':item}, callback = self.parse2) 37 | 38 | def parse2(self, response): 39 | #接收传递的item 40 | item = response.meta['item'] 41 | #解析提取文章内容 42 | soup = BeautifulSoup(response.text, "lxml") 43 | content = soup.find("div", id="ContentBody").text.strip() 44 | content = content.replace("\n", " ") 45 | comment = soup.find("span", class_="cNumShow num").text.strip() 46 | discuss = soup.find("span", class_="num ml5").text.strip() 47 | item["content"] = content 48 | item["comment"] = comment 49 | item["discuss"] = discuss 50 | #返回item,交给item pipeline 51 | yield item 52 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = financeSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = financeSpider 12 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/article.csv: -------------------------------------------------------------------------------- 1 | content,link,title 2 | ,http://www.santostang.com/2018/07/15/4-3-%e9%80%9a%e8%bf%87selenium-%e6%a8%a1%e6%8b%9f%e6%b5%8f%e8%a7%88%e5%99%a8%e6%8a%93%e5%8f%96/,4.3 通过selenium 模拟浏览器抓取 3 | ,http://www.santostang.com/2018/07/14/4-2-%e8%a7%a3%e6%9e%90%e7%9c%9f%e5%ae%9e%e5%9c%b0%e5%9d%80%e6%8a%93%e5%8f%96/,4.2 解析真实地址抓取 4 | ,http://www.santostang.com/2018/07/14/%e7%ac%ac%e5%9b%9b%e7%ab%a0%ef%bc%9a%e5%8a%a8%e6%80%81%e7%bd%91%e9%a1%b5%e6%8a%93%e5%8f%96-%e8%a7%a3%e6%9e%90%e7%9c%9f%e5%ae%9e%e5%9c%b0%e5%9d%80-selenium/,第四章- 动态网页抓取 (解析真实地址 + selenium) 5 | ,http://www.santostang.com/2018/07/11/%e3%80%8a%e7%bd%91%e7%bb%9c%e7%88%ac%e8%99%ab%ef%bc%9a%e4%bb%8e%e5%85%a5%e9%97%a8%e5%88%b0%e5%ae%9e%e8%b7%b5%e3%80%8b%e4%b8%80%e4%b9%a6%e5%8b%98%e8%af%af/,《网络爬虫:从入门到实践》一书勘误 6 | ,http://www.santostang.com/2018/07/04/hello-world/,Hello world! 7 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/article.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"title": "4.3 \u901a\u8fc7selenium \u6a21\u62df\u6d4f\u89c8\u5668\u6293\u53d6", "link": "http://www.santostang.com/2018/07/15/4-3-%e9%80%9a%e8%bf%87selenium-%e6%a8%a1%e6%8b%9f%e6%b5%8f%e8%a7%88%e5%99%a8%e6%8a%93%e5%8f%96/"}, 3 | {"title": "4.2 \u89e3\u6790\u771f\u5b9e\u5730\u5740\u6293\u53d6", "link": "http://www.santostang.com/2018/07/14/4-2-%e8%a7%a3%e6%9e%90%e7%9c%9f%e5%ae%9e%e5%9c%b0%e5%9d%80%e6%8a%93%e5%8f%96/"}, 4 | {"title": "\u7b2c\u56db\u7ae0- \u52a8\u6001\u7f51\u9875\u6293\u53d6 (\u89e3\u6790\u771f\u5b9e\u5730\u5740 + selenium)", "link": "http://www.santostang.com/2018/07/14/%e7%ac%ac%e5%9b%9b%e7%ab%a0%ef%bc%9a%e5%8a%a8%e6%80%81%e7%bd%91%e9%a1%b5%e6%8a%93%e5%8f%96-%e8%a7%a3%e6%9e%90%e7%9c%9f%e5%ae%9e%e5%9c%b0%e5%9d%80-selenium/"}, 5 | {"title": "\u300a\u7f51\u7edc\u722c\u866b\uff1a\u4ece\u5165\u95e8\u5230\u5b9e\u8df5\u300b\u4e00\u4e66\u52d8\u8bef", "link": "http://www.santostang.com/2018/07/11/%e3%80%8a%e7%bd%91%e7%bb%9c%e7%88%ac%e8%99%ab%ef%bc%9a%e4%bb%8e%e5%85%a5%e9%97%a8%e5%88%b0%e5%ae%9e%e8%b7%b5%e3%80%8b%e4%b8%80%e4%b9%a6%e5%8b%98%e8%af%af/"}, 6 | {"title": "Hello world!", "link": "http://www.santostang.com/2018/07/04/hello-world/"} 7 | ] -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__init__.py -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BlogspiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | link = scrapy.Field() 16 | content = scrapy.Field() 17 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BlogspiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 
35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class BlogspiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BlogspiderPipeline(object): 10 | #填入你的地址 11 | file_path = "C:/Users/santostang/Desktop/blogSpider/result.txt" 12 | 13 | def __init__(self): 14 | self.article = open(self.file_path, "a+", encoding="utf-8") 15 | 16 | #定义管道的处理方法 17 | def process_item(self, item, spider): 18 | title = item["title"] 19 | link = item["link"] 20 | content = item["content"] 21 | output = title + '\t' + link + '\t' + content + '\n\n' 22 | self.article.write(output) 23 | return item 24 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for blogSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'blogSpider' 13 | 14 | SPIDER_MODULES = ['blogSpider.spiders'] 15 | NEWSPIDER_MODULE = 'blogSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'blogSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'blogSpider.middlewares.BlogspiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 
'blogSpider.middlewares.BlogspiderDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'blogSpider.pipelines.BlogspiderPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
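The article.json and article.csv files at the root of blogSpider look like Scrapy feed-export output (for example, a run such as `scrapy crawl santostang -o article.json`), while result.txt is written by the custom BlogspiderPipeline enabled in settings.py above. A minimal sketch for loading the exported JSON back into Python, assuming the file sits in the current working directory:

```python
# Load the feed-exported items back (assumed file location: current directory).
import json

with open('article.json', encoding='utf-8') as f:
    articles = json.load(f)  # the export is a JSON array of item dicts

print(len(articles), "items")
print(articles[0]['title'], articles[0]['link'])  # fields defined in BlogspiderItem
```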
5 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/santostang - 副本.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/santostang - 副本.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/santostang.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/santostang.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/santostang - 副本.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import os 3 | from bs4 import BeautifulSoup 4 | from blogSpider.items import BlogspiderItem 5 | # 设置相应的代理用户名密码,主机和端口号 6 | os.environ['HTTP_PROXY'] = 'http://web-proxy.oa.com:8080' 7 | os.environ['HTTPS_PROXY'] = 'https://web-proxy.oa.com:8080' 8 | 9 | 10 | class SantostangSpider(scrapy.Spider): 11 | name = 'santostang' 12 | allowed_domains = ['www.santostang.com'] 13 | start_urls = ['http://www.santostang.com/'] 14 | 15 | def parse(self, response): 16 | # 第一部分代码:将html保存到本地 17 | # print (response.text) 18 | # filename = "index.html" 19 | # with open(filename, 'w', encoding="utf-8") as f: 20 | # f.write(response.text) 21 | 22 | # 第二部分代码:打印文章标题 23 | # soup = BeautifulSoup(response.text, "lxml") 24 | # first_title = soup.find("h1", class_= "post-title").a.text.strip() 25 | # print ("第一篇文章的标题是:", first_title) 26 | # for i in range(len(title_list)): 27 | # title = title_list[i].a.text.strip() 28 | # print('第 %s 篇文章的标题是:%s' %(i+1, title)) 29 | 30 | #存放文章信息的列表 31 | items = [] 32 | 33 | soup = BeautifulSoup(response.text, "lxml") 34 | title_list = soup.find_all("h1", class_="post-title") 35 | for i in range(len(title_list)): 36 | # 将数据封装到BlogspiderItem对象,字典类型数据 37 | item = BlogspiderItem() 38 | title = title_list[i].a.text.strip() 39 | link = title_list[i].a["href"] 40 | # 变成字典 41 | item["title"] = title 42 | item["link"] = link 43 | items.append(item) 44 | 45 | # 返回数据 46 | return items 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/santostang.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from bs4 import BeautifulSoup 3 | from blogSpider.items import BlogspiderItem 4 | 5 | class SantostangSpider(scrapy.Spider): 6 | name = 'santostang' 7 | allowed_domains = ['www.santostang.com'] 8 | start_urls = ['http://www.santostang.com/'] 9 | 10 | def parse(self, response): 
11 | # 第一部分代码:将html保存到本地 12 | # print (response.text) 13 | # filename = "index.html" 14 | # with open(filename, 'w', encoding="utf-8") as f: 15 | # f.write(response.text) 16 | 17 | # 第二部分代码:打印文章标题 18 | # soup = BeautifulSoup(response.text, "lxml") 19 | # first_title = soup.find("h1", class_= "post-title").a.text.strip() 20 | # print ("第一篇文章的标题是:", first_title) 21 | # for i in range(len(title_list)): 22 | # title = title_list[i].a.text.strip() 23 | # print('第 %s 篇文章的标题是:%s' %(i+1, title)) 24 | 25 | #第三部分代码: 26 | # soup = BeautifulSoup(response.text, "lxml") 27 | # first_title = soup.find("h1", class_= "post-title").a.text.strip() 28 | # print ("第一篇文章的标题是:", first_title) 29 | 30 | # for i in range(len(title_list)): 31 | # title = title_list[i].a.text.strip() 32 | # print('第 %s 篇文章的标题是:%s' %(i+1, title)) 33 | 34 | #第四部分代码:储存文章内容 35 | soup = BeautifulSoup(response.text, "lxml") 36 | title_list = soup.find_all("h1", class_="post-title") 37 | for i in range(len(title_list)): 38 | # 将数据封装到BlogspiderItem对象,字典类型数据 39 | item = BlogspiderItem() 40 | title = title_list[i].a.text.strip() 41 | link = title_list[i].a["href"] 42 | # 变成字典 43 | item["title"] = title 44 | item["link"] = link 45 | # 根据文章链接,发送Request请求,并传递item参数 46 | yield scrapy.Request(url =link, meta = {'item':item}, callback = self.parse2) 47 | 48 | def parse2(self, response): 49 | #接收传递的item 50 | item = response.meta['item'] 51 | #解析提取文章内容 52 | soup = BeautifulSoup(response.text, "lxml") 53 | content = soup.find("div", class_="view-content").text.strip() 54 | content = content.replace("\n", " ") 55 | item["content"] = content 56 | #返回item,交给item pipeline 57 | yield item -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = blogSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = blogSpider 12 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__init__.py -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 
-Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class FinancespiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | link = scrapy.Field() 16 | content = scrapy.Field() 17 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class FinancespiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class FinancespiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class FinancespiderPipeline(object): 10 | #填入你的地址 11 | file_path = "C:/Users/santostang/Desktop/financeSpider/result.txt" 12 | 13 | def __init__(self): 14 | self.article = open(self.file_path, "a+", encoding="utf-8") 15 | 16 | #定义管道的处理方法 17 | def process_item(self, item, spider): 18 | title = item["title"] 19 | link = item["link"] 20 | content = item["content"] 21 | output = title + '\t' + link + '\t' + content + '\n\n' 22 | self.article.write(output) 23 | return item 24 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for financeSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'financeSpider' 13 | 14 | SPIDER_MODULES = ['financeSpider.spiders'] 15 | NEWSPIDER_MODULE = 'financeSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'financeSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'financeSpider.middlewares.FinancespiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'financeSpider.middlewares.FinancespiderDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'financeSpider.pipelines.FinancespiderPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/__pycache__/finance.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/__pycache__/finance.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/finance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from financeSpider.items import FinancespiderItem 5 | 6 | class FinanceSpider(scrapy.Spider): 7 | name = 'finance' 8 | allowed_domains = ['finance.eastmoney.com'] 9 | start_urls = ['http://finance.eastmoney.com/news/cywjh_1.html'] 10 | url_head = 'http://finance.eastmoney.com/news/cywjh_' 11 | url_end = '.html' 12 | 13 | # Scrapy自带功能,从start_requests开始发送请求 14 | def start_requests(self): 15 | #获取前三页的url地址 16 | for i in range(1,4): 17 | url = self.url_head + str(i) + self.url_end 18 | print ("当前的页面是:", url) 19 | # 对新闻列表页发送Request请求 20 | yield scrapy.Request(url=url, callback = self.parse) 21 | 22 | def parse(self, response): 23 | soup = BeautifulSoup(response.text, "lxml") 24 | title_list = soup.find_all("p", class_="title") 25 | for i in range(len(title_list)): 26 | # 将数据封装到FinancespiderItem对象,字典类型数据 27 | item = FinancespiderItem() 28 | title = title_list[i].a.text.strip() 29 | link = title_list[i].a["href"] 30 | # 变成字典 31 | item["title"] = title 32 | item["link"] = link 33 | # 根据文章链接,发送Request请求,并传递item参数 34 | yield scrapy.Request(url=link, meta = {'item':item}, callback = self.parse2) 35 | 36 | def parse2(self, response): 37 | #接收传递的item 38 | item = response.meta['item'] 39 | #解析提取文章内容 40 | soup = BeautifulSoup(response.text, "lxml") 41 | content = soup.find("div", id="ContentBody").text.strip() 42 | content = content.replace("\n", " ") 43 | item["content"] = content 44 | #返回item,交给item pipeline 45 | yield item 46 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = financeSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = financeSpider 12 | -------------------------------------------------------------------------------- /第二版/Cha 8 
-提升爬虫的速度/cha8/__pycache__/multiprocess_test.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/multiprocess_test.cpython-35.pyc -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/multiprocess_test.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/multiprocess_test.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/thread_test.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/thread_test.cpython-35.pyc -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/thread_test.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/thread_test.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/gevent1.py: -------------------------------------------------------------------------------- 1 | import gevent 2 | from gevent.queue import Queue, Empty 3 | import time 4 | import requests 5 | 6 | from gevent import monkey#把下面有可能有IO操作的单独做上标记 7 | monkey.patch_all() # 将IO转为异步执行的函数 8 | 9 | link_list = [] 10 | with open('alexa.txt', 'r') as file: 11 | file_list = file.readlines() 12 | for eachone in file_list: 13 | link = eachone.split('\t')[1] 14 | link = link.replace('\n','') 15 | link_list.append(link) 16 | 17 | start = time.time() 18 | def crawler(index): 19 | Process_id = 'Process-' + str(index) 20 | while not workQueue.empty(): 21 | url = workQueue.get(timeout=2) 22 | try: 23 | r = requests.get(url, timeout=20) 24 | print (Process_id, workQueue.qsize(), r.status_code, url) 25 | except Exception as e: 26 | print (Process_id, workQueue.qsize(), url, 'Error: ', e) 27 | 28 | def boss(): 29 | for url in link_list: 30 | workQueue.put_nowait(url) 31 | 32 | if __name__ == '__main__': 33 | workQueue = Queue(1000) 34 | 35 | gevent.spawn(boss).join() 36 | jobs = [] 37 | for i in range(10): 38 | jobs.append(gevent.spawn(crawler, i)) 39 | gevent.joinall(jobs) 40 | 41 | end = time.time() 42 | print ('gevent + Queue多协程爬虫的总时间为:', end-start) 43 | print ('Main Ended!') 44 | -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/gevent_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import gevent 5 | from gevent.queue import Queue, Empty 6 | import time 7 | import requests 8 | 9 | from gevent import monkey#把下面有可能有IO操作的单独做上标记 10 | monkey.patch_all() # 将IO转为异步执行的函数 11 | 12 | start = time.time() 13 | workQueue = Queue(1000) 14 | def crawler(index): 15 | Process_id = 'Process-' + str(index) 16 | while not workQueue.empty(): 17 | url = workQueue.get(timeout=2) 18 | try: 19 | r = 
requests.get(url, timeout=20) 20 | print (Process_id, workQueue.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, workQueue.qsize(), url, 'Error: ', e) 23 | 24 | def boss(link_list): 25 | for url in link_list: 26 | workQueue.put_nowait(url) 27 | 28 | def gevent_main(link_list, g_num): 29 | gevent.spawn(boss,link_list).join() 30 | jobs = [] 31 | for i in range(g_num): 32 | jobs.append(gevent.spawn(crawler, i)) 33 | gevent.joinall(jobs) 34 | 35 | end = time.time() 36 | time_spend = end-start 37 | print ('gevent + Queue多协程爬虫的总时间为:', time_spend) 38 | print ('Main Ended!') 39 | return time_spend 40 | 41 | if __name__ == '__main__': 42 | link_list = [] 43 | with open('alexa.txt', 'r') as file: 44 | file_list = file.readlines() 45 | for eachone in file_list: 46 | link = eachone.split('\t')[1] 47 | link = link.replace('\n','') 48 | link_list.append(link) 49 | 50 | 51 | 52 | gevent_time10 = gevent_main(link_list, 15) 53 | print ('gevent + Queue多协程爬虫的总时间为:', gevent_time10) 54 | 55 | gevent_time3 = gevent_main(link_list, 20) 56 | print ('gevent + Queue多协程爬虫的总时间为:', gevent_time3) 57 | 58 | with open('result_gevent.txt','a+',encoding='utf-8') as f: 59 | f.write('\t' + str(gevent_time10) + '\t' + str(gevent_time3)) -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/multiprocess_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from multiprocessing import Pool, Manager 5 | import time 6 | import requests 7 | 8 | def crawler(q, index): 9 | Process_id = 'Process-' + str(index) 10 | while not q.empty(): 11 | url = q.get(timeout=2) 12 | try: 13 | r = requests.get(url, timeout=20) 14 | print (Process_id, q.qsize(), r.status_code, url) 15 | except Exception as e: 16 | print (Process_id, q.qsize(), url, 'Error: ', e) 17 | 18 | 19 | def multiprocess_main(link_list, p_num): 20 | start = time.time() 21 | manager = Manager() 22 | workQueue = manager.Queue(1000) 23 | 24 | # 填充队列 25 | for url in link_list: 26 | workQueue.put(url) 27 | 28 | print ("Started processes") 29 | pool = Pool(processes=p_num) 30 | for i in range(p_num): 31 | pool.apply_async(crawler, args=(workQueue, i)) 32 | 33 | 34 | pool.close() 35 | pool.join() 36 | 37 | end = time.time() 38 | time_spend = end-start 39 | print ('Pool + Queue多进程爬虫的总时间为:', time_spend) 40 | print ('Main process Ended!') 41 | return time_spend 42 | 43 | if __name__ == '__main__': 44 | link_list = [] 45 | with open('alexa.txt', 'r') as file: 46 | file_list = file.readlines() 47 | for eachone in file_list: 48 | link = eachone.split('\t')[1] 49 | link = link.replace('\n','') 50 | link_list.append(link) 51 | 52 | multiprocess_main(link_list, 3) -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/mutilprocess1.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | class MyProcess(Process): 15 | def __init__(self, q): 16 | Process.__init__(self) 17 | self.q = q 18 | 19 | def run(self): 20 | print ("Starting " , self.pid) 21 | while not self.q.empty(): 22 | crawler(self.q) 23 | print 
("Exiting " , self.pid) 24 | 25 | def crawler(q): 26 | url = q.get(timeout=2) 27 | try: 28 | r = requests.get(url, timeout=20) 29 | print (q.qsize(), r.status_code, url) 30 | except Exception as e: 31 | print (q.qsize(), url, 'Error: ', e) 32 | 33 | if __name__ == '__main__': 34 | ProcessNames = ["Process-1", "Process-2", "Process-3"] 35 | workQueue = Queue(1000) 36 | 37 | # 填充队列 38 | for url in link_list: 39 | workQueue.put(url) 40 | 41 | for i in range(0, 3): 42 | p = MyProcess(workQueue) 43 | p.daemon = True 44 | p.start() 45 | p.join() 46 | 47 | end = time.time() 48 | print ('Process + Queue多进程爬虫的总时间为:', end-start) 49 | print ('Main process Ended!') -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/mutilprocess2.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool, Manager 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | def crawler(q, index): 15 | Process_id = 'Process-' + str(index) 16 | while not q.empty(): 17 | url = q.get(timeout=2) 18 | try: 19 | r = requests.get(url, timeout=20) 20 | print (Process_id, q.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, q.qsize(), url, 'Error: ', e) 23 | 24 | 25 | if __name__ == '__main__': 26 | manager = Manager() 27 | workQueue = manager.Queue(1000) 28 | 29 | # 填充队列 30 | for url in link_list: 31 | workQueue.put(url) 32 | 33 | pool = Pool(processes=3) 34 | for i in range(4): 35 | pool.apply_async(crawler, args=(workQueue, i)) 36 | 37 | print ("Started processes") 38 | pool.close() 39 | pool.join() 40 | 41 | end = time.time() 42 | print ('Pool + Queue多进程爬虫的总时间为:', end-start) 43 | print ('Main process Ended!') 44 | -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/mutilprocess3.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool, Manager 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | def crawler(q, index): 15 | Process_id = 'Process-' + str(index) 16 | while not q.empty(): 17 | url = q.get(timeout=2) 18 | try: 19 | r = requests.get(url, timeout=20) 20 | print (Process_id, q.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, q.qsize(), url, 'Error: ', e) 23 | 24 | 25 | if __name__ == '__main__': 26 | manager = Manager() 27 | workQueue = manager.Queue(1000) 28 | 29 | # 填充队列 30 | for url in link_list: 31 | workQueue.put(url) 32 | 33 | pool = Pool(processes=3) 34 | for i in range(4): 35 | pool.apply(crawler, args=(workQueue, i)) 36 | 37 | print ("Started processes") 38 | pool.close() 39 | pool.join() 40 | 41 | end = time.time() 42 | print ('Pool + Queue多进程爬虫的总时间为:', end-start) 43 | print ('Main process Ended!') 44 | -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/result.txt: -------------------------------------------------------------------------------- 1 | 312.7718894481659 143.37620067596436 
549.7254424095154 549.978456735611 -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/result_gevent.txt: -------------------------------------------------------------------------------- 1 | 338.3443522453308 922.8117818832397 312.1618547439575 484.05668663978577 -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/result_single_time.txt: -------------------------------------------------------------------------------- 1 | 1721.3604562282562 -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/thread1.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import requests 3 | import time 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | class myThread (threading.Thread): 15 | def __init__(self, name, link_range): 16 | threading.Thread.__init__(self) 17 | self.name = name 18 | self.link_range = link_range 19 | def run(self): 20 | print ("Starting " + self.name) 21 | crawler(self.name, self.link_range) 22 | print ("Exiting " + self.name) 23 | 24 | def crawler(threadName, link_range): 25 | for i in range(link_range[0],link_range[1]+1): 26 | try: 27 | r = requests.get(link_list[i], timeout=20) 28 | print (threadName, r.status_code, link_list[i]) 29 | except Exception as e: 30 | print(threadName, 'Error: ', e) 31 | 32 | thread_list = [] 33 | link_range_list = [(0,200),(201,400),(401,600),(601,800),(801,1000)] 34 | 35 | # 创建新线程 36 | for i in range(1,6): 37 | thread = myThread("Thread-" + str(i), link_range_list[i-1]) 38 | thread.start() 39 | thread_list.append(thread) 40 | 41 | # 等待所有线程完成 42 | for thread in thread_list: 43 | thread.join() 44 | 45 | end = time.time() 46 | print ('简单多线程爬虫的总时间为:', end-start) 47 | print ("Exiting Main Thread") -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/thread2.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import requests 3 | import time 4 | import queue as Queue 5 | 6 | link_list = [] 7 | with open('alexa.txt', 'r') as file: 8 | file_list = file.readlines() 9 | for eachone in file_list: 10 | link = eachone.split('\t')[1] 11 | link = link.replace('\n','') 12 | link_list.append(link) 13 | 14 | start = time.time() 15 | class myThread (threading.Thread): 16 | def __init__(self, name, q): 17 | threading.Thread.__init__(self) 18 | self.name = name 19 | self.q = q 20 | def run(self): 21 | print ("Starting " + self.name) 22 | while True: 23 | try: 24 | crawler(self.name, self.q) 25 | except: 26 | break 27 | print ("Exiting " + self.name) 28 | 29 | def crawler(threadName, q): 30 | url = q.get(timeout=2) 31 | try: 32 | r = requests.get(url, timeout=20) 33 | print (q.qsize(), threadName, r.status_code, url) 34 | except Exception as e: 35 | print (q.qsize(), threadName, url, 'Error: ', e) 36 | 37 | threadList = ["Thread-1", "Thread-2", "Thread-3","Thread-4", "Thread-5"] 38 | workQueue = Queue.Queue(1000) 39 | threads = [] 40 | 41 | # 创建新线程 42 | for tName in threadList: 43 | thread = myThread(tName, workQueue) 44 | thread.start() 45 | threads.append(thread) 46 | 47 | # 填充队列 48 | for url in link_list: 49 | workQueue.put(url) 50 | 51 | # 
等待所有线程完成 52 | for t in threads: 53 | t.join() 54 | 55 | end = time.time() 56 | print ('Queue多线程爬虫的总时间为:', end-start) 57 | print ("Exiting Main Thread") -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/thread_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import threading 5 | import requests 6 | import time 7 | import queue as Queue 8 | 9 | 10 | class myThread (threading.Thread): 11 | def __init__(self, name, q): 12 | threading.Thread.__init__(self) 13 | self.name = name 14 | self.q = q 15 | def run(self): 16 | print ("Starting " + self.name) 17 | while True: 18 | try: 19 | crawler(self.name, self.q) 20 | except: 21 | break 22 | print ("Exiting " + self.name) 23 | 24 | def crawler(threadName, q): 25 | url = q.get(timeout=2) 26 | try: 27 | r = requests.get(url, timeout=20) 28 | print (q.qsize(), threadName, r.status_code, url) 29 | except Exception as e: 30 | print (q.qsize(), threadName, url, 'Error: ', e) 31 | 32 | def thread_main(link_list, t_num): 33 | start = time.time() 34 | workQueue = Queue.Queue(1000) 35 | threads = [] 36 | 37 | # 创建新线程 38 | for tName in range(t_num): 39 | thread = myThread('Thread' + str(tName), workQueue) 40 | thread.start() 41 | threads.append(thread) 42 | 43 | # 填充队列 44 | for url in link_list: 45 | workQueue.put(url) 46 | 47 | # 等待所有线程完成 48 | for t in threads: 49 | t.join() 50 | 51 | end = time.time() 52 | print ('Queue多线程爬虫的总时间为:', end-start) 53 | print ("Exiting Main Thread") 54 | return end-start 55 | 56 | if __name__ == '__main__': 57 | link_list = [] 58 | with open('alexa.txt', 'r') as file: 59 | file_list = file.readlines() 60 | for eachone in file_list: 61 | link = eachone.split('\t')[1] 62 | link = link.replace('\n','') 63 | link_list.append(link) 64 | 65 | thread_main(link_list, 5) -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/time_spend 2.py: -------------------------------------------------------------------------------- 1 | from multiprocess_test import multiprocess_main 2 | from thread_test import thread_main 3 | 4 | if __name__ == '__main__': 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | #single = single() 14 | #print ('串行的总时间为:', single) 15 | 16 | #thread_time = thread_main(link_list, 5) 17 | #print ('Queue多线程爬虫的总时间为:', thread_time) 18 | 19 | multiprocess_time = multiprocess_main(link_list, 3) 20 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time) 21 | 22 | #gevent_time = gevent_main(link_list, 10) 23 | #print ('gevent + Queue多协程爬虫的总时间为:', gevent_time) 24 | 25 | #with open('result.txt','a+',encoding='utf-8') as f: 26 | # f.write(single + '\t' + thread_time + '\t' + multiprocess_time + '\t' + gevent_time) -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/time_spend.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | import time 6 | #from multiprocess_test import multiprocess_main 7 | #from thread_test import thread_main 8 | 9 | def single(): 10 | start = time.time() 11 | for eachone in link_list: 12 | try: 13 | r = requests.get(eachone) 14 | print (r.status_code, eachone) 15 | except 
Exception as e: 16 | print('Error: ', e) 17 | end = time.time() 18 | time_spend = end-start 19 | print ('串行的总时间为:', time_spend) 20 | return time_spend 21 | 22 | if __name__ == '__main__': 23 | link_list = [] 24 | with open('alexa.txt', 'r') as file: 25 | file_list = file.readlines() 26 | for eachone in file_list: 27 | link = eachone.split('\t')[1] 28 | link = link.replace('\n','') 29 | link_list.append(link) 30 | 31 | #thread_time10 = thread_main(link_list, 10) 32 | #print ('Queue多线程爬虫的总时间为:', thread_time10) 33 | 34 | #multiprocess_time10 = multiprocess_main(link_list, 10) 35 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time10) 36 | 37 | #thread_time3 = thread_main(link_list, 3) 38 | #print ('Queue多线程爬虫的总时间为:', thread_time3) 39 | 40 | #multiprocess_time3 = multiprocess_main(link_list, 3) 41 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time3) 42 | 43 | single_time = single() 44 | print ('串行的总时间为:', single_time) 45 | 46 | with open('result_single_time.txt','a+',encoding='utf-8') as f: 47 | f.write(str(single_time)) 48 | #f.write(str(thread_time10) + '\t' + str(multiprocess_time10) + '\t' + str(thread_time3) + '\t' + str(multiprocess_time3)) -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/多协程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/多协程.png -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/多线程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/多线程.png -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/多进程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/多进程.png -------------------------------------------------------------------------------- /第二版/Cha 9 -反爬虫问题/Cha 9 -反爬虫问题.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 8.3如何“反反爬虫”?" 
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 8.3.1修改请求 header" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 6, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "{'User-Agent': 'python-requests/2.12.4', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import requests\n", 32 | "r = requests.get('http://www.santostang.com')\n", 33 | "print (r.request.headers)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 5, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "import requests\n", 51 | "\n", 52 | "link = 'http://www.santostang.com'\n", 53 | "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n", 54 | "r = requests.get(link, headers= headers)\n", 55 | "print (r.request.headers)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "from fake_useragent import UserAgent\n", 65 | "import requests\n", 66 | "\n", 67 | "link = 'http://www.santostang.com'\n", 68 | "ua=UserAgent()\n", 69 | "headers={\"User-Agent\":ua.random}\n", 70 | "response=requests.get(url=url,headers=headers)\n", 71 | "\n", 72 | "#响应状态信息\n", 73 | "print(response.status_code)\n", 74 | "print (r.request.headers)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## 8.3.2 修改爬虫的间隔时间" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 10, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "2.0001144409179688\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "import time\n", 99 | "t1 = time.time()\n", 100 | "time.sleep(2)\n", 101 | "t2 = time.time()\n", 102 | "total_time = t2-t1\n", 103 | "print (total_time)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 17, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "0.3481693303048349\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "import time\n", 121 | "import random\n", 122 | "\n", 123 | "sleep_time = random.randint(0,2) + random.random()\n", 124 | "print (sleep_time)\n", 125 | "time.sleep(sleep_time)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 19, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "开始爬取这篇博客: http://www.santostang.com/2017/03/08/hello-python/\n", 138 | "这篇博客的标题为: Hello Python!\n", 139 | "开始休息: 0.16292490492777212 秒\n", 140 | "开始爬取这篇博客: http://www.santostang.com/2017/03/07/echarts%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b02-%e5%8d%95%e9%a1%b5%e9%9d%a2%e5%a4%9a%e5%bc%a0%e5%9b%be%e8%a1%a8/\n", 141 | "这篇博客的标题为: echarts学习笔记(2) — 同一页面多图表\n", 142 | "开始休息: 1.912631031656519 秒\n", 143 | "开始爬取这篇博客: 
http://www.santostang.com/2017/03/07/echarts%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b01-%e4%bd%bf%e7%94%a8%e6%a8%a1%e5%9d%97%e5%8c%96%e5%8d%95%e6%96%87%e4%bb%b6%e5%bc%95%e5%85%a5/\n", 144 | "这篇博客的标题为: echarts学习笔记(1) — 模块化单文件引入\n", 145 | "开始休息: 1.3634313119416182 秒\n", 146 | "开始爬取这篇博客: http://www.santostang.com/2017/03/06/%e3%80%90%e7%88%ac%e8%99%ab%e4%ba%8c%e3%80%91%e7%88%ac%e8%99%ab%e7%9a%84%e6%a1%86%e6%9e%b6%e5%92%8c%e5%9f%ba%e6%9c%ac%e8%ae%ae%e9%a2%98/\n", 147 | "这篇博客的标题为: 【爬虫二】爬虫的框架和基本议题\n", 148 | "开始休息: 2.0205314818737516 秒\n", 149 | "开始爬取这篇博客: http://www.santostang.com/2017/03/06/%e3%80%90%e7%88%ac%e8%99%ab%e4%b8%80%e3%80%91%e6%9c%80%e7%ae%80%e5%8d%95%e7%9a%84%e7%88%ac%e8%99%ab%ef%bc%8c%e9%9b%b6%e5%9f%ba%e7%a1%80%e6%95%99%e5%ad%a6/\n", 150 | "这篇博客的标题为: 【爬虫一】最简单的爬虫,零基础教学\n", 151 | "开始休息: 2.446761436097069 秒\n", 152 | "开始爬取这篇博客: http://www.santostang.com/2017/03/02/hello-world/\n", 153 | "这篇博客的标题为: Hello world!\n", 154 | "开始休息: 0.8005131789714476 秒\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "import requests\n", 160 | "from bs4 import BeautifulSoup\n", 161 | "import time\n", 162 | "import random\n", 163 | "\n", 164 | "link = \"http://www.santostang.com/\"\n", 165 | "\n", 166 | "def scrap(link):\n", 167 | " headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n", 168 | " r = requests.get(link, headers= headers)\n", 169 | " html = r.text\n", 170 | " soup = BeautifulSoup(html, \"lxml\")\n", 171 | " return soup\n", 172 | "\n", 173 | "soup = scrap(link)\n", 174 | "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n", 175 | "for eachone in title_list:\n", 176 | " url = eachone.a['href']\n", 177 | " print ('开始爬取这篇博客: ', url)\n", 178 | " soup_article = scrap(url)\n", 179 | " title = soup_article.find(\"h1\", class_=\"view-title\").text.strip()\n", 180 | " print ('这篇博客的标题为: ', title)\n", 181 | " sleep_time = random.randint(0,2) + random.random()\n", 182 | " print ('开始休息: ', sleep_time, '秒')\n", 183 | " time.sleep(sleep_time)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": true 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "scrap_times = 0\n", 195 | "for eachone in title_list:\n", 196 | " url = eachone.a['href']\n", 197 | " print ('开始爬取这篇博客: ', url)\n", 198 | " soup_article = scrap(url)\n", 199 | " title = soup_article.find(\"h1\", class_=\"view-title\").text.strip()\n", 200 | " print ('这篇博客的标题为: ', title)\n", 201 | " \n", 202 | " scrap_times += 1\n", 203 | " if scrap_times % 5 == 0:\n", 204 | " sleep_time = 10 + random.random()\n", 205 | " else:\n", 206 | " sleep_time = random.randint(0,2) + random.random()\n", 207 | " time.sleep(sleep_time)\n", 208 | " print ('开始休息: ', sleep_time, '秒')" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## 8.3.3 使用代理" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "import requests\n", 227 | "\n", 228 | "link = \"http://www.santostang.com/\"\n", 229 | "proxies = {'http':'http://xxx.xxx.xxx.xxx:xxxx'}\n", 230 | "response = requests.get(link, proxies=proxies)" 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 3", 237 | "language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 
245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.6.5" 251 | }, 252 | "toc": { 253 | "base_numbering": 1, 254 | "nav_menu": {}, 255 | "number_sections": true, 256 | "sideBar": true, 257 | "skip_h1_title": false, 258 | "title_cell": "Table of Contents", 259 | "title_sidebar": "Contents", 260 | "toc_cell": false, 261 | "toc_position": {}, 262 | "toc_section_display": true, 263 | "toc_window_display": false 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 2 268 | } 269 | -------------------------------------------------------------------------------- /第二版/geckodriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/geckodriver.exe --------------------------------------------------------------------------------
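
A short wrap-up example (not part of the original repository). The Cha 9 notebook above demonstrates three "反反爬虫" techniques one at a time: sending a browser-like `User-Agent` header, sleeping a random interval between requests, and routing traffic through a proxy. The minimal sketch below simply combines the first two against the same test site used throughout the book; the `proxies` dictionary is the placeholder copied from the notebook and is left commented out, since it must be replaced with a real proxy address before it can work.

```python
import time
import random
import requests

link = 'http://www.santostang.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) '
                         'Gecko/20091201 Firefox/3.5.6'}
# Placeholder from the notebook; fill in a working proxy before enabling the proxy line below.
proxies = {'http': 'http://xxx.xxx.xxx.xxx:xxxx'}

for attempt in range(3):
    # Send the request with a browser-like User-Agent header.
    r = requests.get(link, headers=headers, timeout=20)
    # r = requests.get(link, headers=headers, proxies=proxies, timeout=20)  # variant with a proxy
    print(attempt, r.status_code)
    # Pause for a random 0-3 second interval between requests, as in the notebook's delay example.
    sleep_time = random.randint(0, 2) + random.random()
    time.sleep(sleep_time)
```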