├── README.md ├── 第一版 ├── Cha 10 -登录与验证码处理 │ ├── Cha 10 -登录与验证码处理.ipynb │ ├── captcha.jpg │ ├── captcha_gray.jpg │ ├── captcha_thresholded.jpg │ └── cookies ├── Cha 11 -服务器采集 │ ├── Cha 11 -服务器采集.ipynb │ ├── tor1.py │ ├── tor2.py │ └── tor3.py ├── Cha 12 -分布式爬虫 │ ├── .ipynb_checkpoints │ │ └── Cha 12 - 分布式爬虫-checkpoint.ipynb │ ├── 1497099934.jpg │ ├── Cha 12 - 分布式爬虫.ipynb │ ├── alexa.txt │ ├── master.py │ └── slave.py ├── Cha 13 -爬虫实战一:维基百科 │ ├── Cha 12 -爬虫实战一:维基百科.ipynb │ └── link_12-3.txt ├── Cha 14 -爬虫实战二:知乎Live │ └── Cha 14 -爬虫实战二:知乎Live.ipynb ├── Cha 15 -爬虫实战三:百度地图API │ ├── Cha 14 -爬虫实战三:百度地图API.ipynb │ └── cities.txt ├── Cha 16 -爬虫实战四:餐厅评价 │ └── Cha 16 -爬虫实战四:餐厅评价.ipynb ├── Cha 2 - 编写你的第一个网络爬虫 │ ├── Cha 2 -编写你的第一个网络爬虫.ipynb │ └── Cha 2 _章末实战.ipynb ├── Cha 3 -静态网页抓取 │ ├── Cha 3 -静态网页抓取.ipynb │ └── Cha 3 _章末实战.ipynb ├── Cha 4 -动态网页抓取 │ ├── Cha 4 -动态网页抓取.ipynb │ └── Cha 4 _章末实战.ipynb ├── Cha 5 -解析网页 │ ├── Cha 5 -解析网页.ipynb │ └── Cha 5 _章末实战.ipynb ├── Cha 6 -数据储存 │ ├── Cha 6 -数据存储.ipynb │ └── Cha 6 _章末实战.ipynb ├── Cha 7 -提升爬虫的速度 │ ├── Cha 7 -提升爬虫的速度.ipynb │ ├── Get Alexa.ipynb │ ├── alexa.txt │ ├── cha7 │ │ ├── __pycache__ │ │ │ ├── multiprocess_test.cpython-35.pyc │ │ │ ├── multiprocess_test.cpython-36.pyc │ │ │ ├── thread_test.cpython-35.pyc │ │ │ └── thread_test.cpython-36.pyc │ │ ├── alexa.txt │ │ ├── gevent1.py │ │ ├── gevent_test.py │ │ ├── multiprocess_test.py │ │ ├── mutilprocess1.py │ │ ├── mutilprocess2.py │ │ ├── mutilprocess3.py │ │ ├── result.txt │ │ ├── result_gevent.txt │ │ ├── result_single_time.txt │ │ ├── thread1.py │ │ ├── thread2.py │ │ ├── thread_test.py │ │ ├── time_spend 2.py │ │ └── time_spend.py │ ├── 多协程.png │ ├── 多线程.png │ └── 多进程.png ├── Cha 8 -反爬虫问题 │ └── Cha 8 -反爬虫问题.ipynb └── Cha 9 -解决中文乱码 │ └── Cha 9 -解决中文乱码.ipynb └── 第二版 ├── Cha 10 -解决中文乱码 └── Cha 10 -解决中文乱码.ipynb ├── Cha 11 -登录与验证码处理 ├── Cha 11 -登录与验证码处理.ipynb ├── captcha.jpg ├── captcha_gray.jpg ├── captcha_thresholded.jpg └── cookies ├── Cha 12 -服务器采集 ├── Cha 12 -服务器采集.ipynb ├── tor1.py ├── tor2.py └── tor3.py ├── Cha 13 -分布式爬虫 ├── 1497099934.jpg ├── Cha 13 - 分布式爬虫.ipynb ├── alexa.txt ├── master.py └── slave.py ├── Cha 14 -爬虫实战一:维基百科 ├── Cha 14 -爬虫实战一:维基百科.ipynb └── link_12-3.txt ├── Cha 15 -爬虫实战二:知乎Live └── Cha 15 -爬虫实战二:知乎Live.ipynb ├── Cha 16 -爬虫实战三:百度地图API ├── Cha 16 -爬虫实战三:百度地图API.ipynb └── cities.txt ├── Cha 17 -爬虫实战四:图书信息 ├── Cha 17 -爬虫实战四:图书信息.ipynb ├── book_list.csv └── book_review.csv ├── Cha 2 - 编写你的第一个网络爬虫 ├── Cha 2 -编写你的第一个网络爬虫.ipynb ├── Cha 2 _章末实战.ipynb └── title_test.txt ├── Cha 3 -静态网页抓取 ├── Cha 3 -静态网页抓取.ipynb ├── Cha 3 _章末实战.ipynb └── Cha 3_自我实践(章末).ipynb ├── Cha 4 -动态网页抓取 ├── Cha 4 -动态网页抓取.ipynb ├── Cha 4 _章末实战.ipynb ├── Cha 4 _自我实践(章末).ipynb └── geckodriver.log ├── Cha 5 -解析网页 ├── Cha 5 -解析网页.ipynb ├── Cha 5 _章末实战.ipynb └── Cha 5 _自我实践(章末).ipynb ├── Cha 6 -数据储存 ├── Cha 6 -数据存储.ipynb ├── Cha 6 _章末实战.ipynb ├── test.csv └── test2.csv ├── Cha 7 -Scrapy爬虫框架 ├── Cha7 - 自我实践(章末)答案 │ └── financeSpider │ │ ├── financeSpider │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── finance.cpython-36.pyc │ │ │ └── finance.py │ │ ├── result.txt │ │ └── scrapy.cfg ├── blogSpider │ ├── article.csv │ ├── article.json │ ├── blogSpider │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── 
__init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── santostang - 副本.cpython-36.pyc │ │ │ └── santostang.cpython-36.pyc │ │ │ ├── santostang - 副本.py │ │ │ └── santostang.py │ ├── index.html │ ├── result.txt │ └── scrapy.cfg └── financeSpider │ ├── financeSpider │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ ├── pipelines.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── finance.cpython-36.pyc │ │ └── finance.py │ ├── result.txt │ └── scrapy.cfg ├── Cha 8 -提升爬虫的速度 ├── Cha 8 -提升爬虫的速度.ipynb ├── alexa.txt ├── cha8 │ ├── __pycache__ │ │ ├── multiprocess_test.cpython-35.pyc │ │ ├── multiprocess_test.cpython-36.pyc │ │ ├── thread_test.cpython-35.pyc │ │ └── thread_test.cpython-36.pyc │ ├── alexa.txt │ ├── gevent1.py │ ├── gevent_test.py │ ├── multiprocess_test.py │ ├── mutilprocess1.py │ ├── mutilprocess2.py │ ├── mutilprocess3.py │ ├── result.txt │ ├── result_gevent.txt │ ├── result_single_time.txt │ ├── thread1.py │ ├── thread2.py │ ├── thread_test.py │ ├── time_spend 2.py │ └── time_spend.py ├── 多协程.png ├── 多线程.png └── 多进程.png ├── Cha 9 -反爬虫问题 └── Cha 9 -反爬虫问题.ipynb └── geckodriver.exe /README.md: -------------------------------------------------------------------------------- 1 | # PythonScraping 2 | 此为《Python 网络爬虫:从入门到实践》的源代码,欢迎读者使用学习。 3 | 本人对代码拥有知识产权,如果读者需要使用其中的代码,请在注释中写明作者:唐松,来自《Python 网络爬虫:从入门到实践》。 4 | 5 | 本书分为两版, 6 | 7 | 第二版为2019年出版《Python 网络爬虫:从入门到实践》,相较第一版,本代码加入了每章课后练习的答案代码。 8 | 9 | 京东:[《Python网络爬虫从入门到实践 第2版》(唐松)- 京东图书](https://item.jd.com/12536063.html) 10 | 11 | 当当:[《Python网络爬虫从入门到实践 第2版》(唐松)- 当当图书](http://product.dangdang.com/27882003.html) 12 | 13 |
14 | 第一版为2017年出版《Python 网络爬虫:从入门到实践》 15 | 16 | 京东:[《Python网络爬虫从入门到实践》(唐松,陈智铨)- 京东图书](http://item.jd.com/12180379.html) 17 | 18 | 当当:[《Python网络爬虫从入门到实践》(唐松 陈智铨)- 当当图书](http://product.dangdang.com/25162123.html) 19 | 20 | 天猫:[Python网络爬虫从入门到实践 ](https://detail.tmall.com/item.htm?id=558781742115) 21 | 22 | Code for *Web Crawling with Python* Published by China Machine Press in 2019 and 2017 23 | 24 | 25 | -------------------------------------------------------------------------------- /第一版/Cha 10 -登录与验证码处理/captcha.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 10 -登录与验证码处理/captcha.jpg -------------------------------------------------------------------------------- /第一版/Cha 10 -登录与验证码处理/captcha_gray.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 10 -登录与验证码处理/captcha_gray.jpg -------------------------------------------------------------------------------- /第一版/Cha 10 -登录与验证码处理/captcha_thresholded.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 10 -登录与验证码处理/captcha_thresholded.jpg -------------------------------------------------------------------------------- /第一版/Cha 10 -登录与验证码处理/cookies: -------------------------------------------------------------------------------- 1 | #LWP-Cookies-2.0 2 | Set-Cookie3: wordpress_logged_in_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1495985122%7CEbrRwEjpLk1v6MIpVKS8KzpNCw3iZlzA0WBqM8JpovA%7Ce97d29b8b77573ce9983896968d8ea9acd5d8959465c15650e0b5a82f4a2c156"; path="/"; domain="www.santostang.com"; path_spec; expires="2017-05-29 03:25:00Z"; httponly=None; version=0 3 | Set-Cookie3: wordpress_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1495985122%7CEbrRwEjpLk1v6MIpVKS8KzpNCw3iZlzA0WBqM8JpovA%7Cd3c089e89e99da8d6b43228440e7c791ddce3e2ef172b03bfebc7692e49b8fb8"; path="/wp-admin"; domain="www.santostang.com"; path_spec; expires="2017-05-29 03:25:00Z"; httponly=None; version=0 4 | Set-Cookie3: wordpress_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1495985122%7CEbrRwEjpLk1v6MIpVKS8KzpNCw3iZlzA0WBqM8JpovA%7Cd3c089e89e99da8d6b43228440e7c791ddce3e2ef172b03bfebc7692e49b8fb8"; path="/wp-content/plugins"; domain="www.santostang.com"; path_spec; expires="2017-05-29 03:25:00Z"; httponly=None; version=0 5 | -------------------------------------------------------------------------------- /第一版/Cha 11 -服务器采集/tor1.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import socks 3 | import requests 4 | 5 | # Tor使用9150端口为默认的socks端口 6 | socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 7 | socket.socket = socks.socksocket 8 | # 获取这次抓取使用的IP地址 9 | a = requests.get("http://checkip.amazonaws.com").text 10 | 11 | print (a) -------------------------------------------------------------------------------- /第一版/Cha 11 -服务器采集/tor2.py: -------------------------------------------------------------------------------- 1 | from stem import Signal 2 | from stem.control import Controller 3 | import socket 4 | import socks 5 | import requests 6 | import time 7 | 8 | controller = Controller.from_port(port = 9151) 9 | controller.authenticate() 10 | socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 11 | 
socket.socket = socks.socksocket 12 | 13 | total_scrappy_time = 0 14 | total_changeIP_time = 0 15 | 16 | for x in range(0,10): 17 | a = requests.get("http://checkip.amazonaws.com").text 18 | print ("第", x+1, "次IP:", a) 19 | 20 | time1 = time.time() 21 | a = requests.get("http://www.santostang.com/").text 22 | #print (a) 23 | time2 = time.time() 24 | total_scrappy_time = total_scrappy_time + time2-time1 25 | print ("第", x+1, "次抓取花费时间:", time2-time1) 26 | 27 | time3 = time.time() 28 | controller.signal(Signal.NEWNYM) 29 | time.sleep(5) 30 | time4 = time.time() 31 | total_changeIP_time = total_changeIP_time + time4-time3-5 32 | print ("第", x+1, "次更换IP花费时间:", time4-time3-5) 33 | 34 | print ("平均抓取花费时间:", total_scrappy_time/10) 35 | print ("平均更换IP花费时间:", total_changeIP_time/10) 36 | -------------------------------------------------------------------------------- /第一版/Cha 11 -服务器采集/tor3.py: -------------------------------------------------------------------------------- 1 | from stem import Signal 2 | from stem.control import Controller 3 | import socket 4 | import socks 5 | import requests 6 | import time 7 | 8 | #controller = Controller.from_port(port = 9151) 9 | #controller.authenticate() 10 | #socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 11 | #socket.socket = socks.socksocket 12 | 13 | total_scrappy_time = 0 14 | total_changeIP_time = 0 15 | 16 | for x in range(0,10): 17 | #a = requests.get("http://checkip.amazonaws.com").text 18 | #print ("第", x+1, "次IP:", a) 19 | 20 | time1 = time.time() 21 | a = requests.get("http://www.santostang.com/").text 22 | #print (a) 23 | time2 = time.time() 24 | total_scrappy_time = total_scrappy_time + time2-time1 25 | print ("第", x+1, "次抓取花费时间:", time2-time1) 26 | 27 | time3 = time.time() 28 | #controller.signal(Signal.NEWNYM) 29 | time.sleep(5) 30 | time4 = time.time() 31 | total_changeIP_time = total_changeIP_time + time4-time3-5 32 | print ("第", x+1, "次更换IP花费时间:", time4-time3-5) 33 | 34 | print ("平均抓取花费时间:", total_scrappy_time/10) 35 | print ("平均更换IP花费时间:", total_changeIP_time/10) -------------------------------------------------------------------------------- /第一版/Cha 12 -分布式爬虫/.ipynb_checkpoints/Cha 12 - 分布式爬虫-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 12.3 Redis分布式爬虫实战" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 12.3.2 加入任务队列" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "def push_redis_list():\n", 26 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 27 | " print (r.keys('*'))\n", 28 | " \n", 29 | " link_list = []\n", 30 | " with open('alexa.txt', 'r') as file:\n", 31 | " file_list = file.readlines()\n", 32 | " for eachone in file_list:\n", 33 | " link = eachone.split('\\t')[1]\n", 34 | " link = link.replace('\\n','')\n", 35 | " link_list.append(link)\n", 36 | " if len(link_list) == 100:\n", 37 | " break\n", 38 | " \n", 39 | " for url in link_list:\n", 40 | " response = requests.get(url, headers=headers, timeout=20)\n", 41 | " soup = BeautifulSoup(response.text, 'lxml')\n", 42 | " img_list = soup.find_all('img')\n", 43 | " for img in img_list:\n", 44 | " img_url = img['src']\n", 45 | " if img_url != '':\n", 46 | " print (\"加入的图片url: \", img_url)\n", 47 | " r.lpush('img_url',img_url)\n", 48 | 
" print ('现在图片链接的个数为', r.llen('img_url'))\n", 49 | " return" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# 12.3.3 读取任务队列,下载图片" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "def get_img():\n", 68 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 69 | " while True:\n", 70 | " try:\n", 71 | " url = r.lpop('img_url')\n", 72 | " url = url.decode('ascii')\n", 73 | " try:\n", 74 | " response = requests.get(url, headers=headers,timeout = 20)\n", 75 | " name = int(time.time())\n", 76 | " f = open(str(name)+ url[-4:], 'wb')\n", 77 | " f.write(response.content)\n", 78 | " f.close()\n", 79 | " print ('已经获取图片', url)\n", 80 | " except Exception as e:\n", 81 | " print ('爬取图片过程出问题', e)\n", 82 | " time.sleep(3)\n", 83 | " except Exception as e:\n", 84 | " print (e)\n", 85 | " time.sleep(10)\n", 86 | " break\n", 87 | " return " 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# 12.3.4 分布式爬虫代码" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 13, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "import requests\n", 106 | "from bs4 import BeautifulSoup\n", 107 | "import re\n", 108 | "import time\n", 109 | "from redis import Redis\n", 110 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 111 | "\n", 112 | "def push_redis_list():\n", 113 | " #与上面此函数相同\n", 114 | "\n", 115 | "def get_img():\n", 116 | " #与上面此函数相同\n", 117 | "\n", 118 | "if __name__ == '__main__': \n", 119 | " this_machine = 'master' \n", 120 | " print ('开始分布式爬虫')\n", 121 | " if this_machine == 'master':\n", 122 | " push_redis_list()\n", 123 | " else:\n", 124 | " get_img()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 14, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "import requests\n", 136 | "from bs4 import BeautifulSoup\n", 137 | "import re\n", 138 | "import time\n", 139 | "from redis import Redis\n", 140 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 141 | "\n", 142 | "def push_redis_list():\n", 143 | " #与上面此函数相同\n", 144 | "\n", 145 | "def get_img():\n", 146 | " #与上面此函数相同\n", 147 | "\n", 148 | "if __name__ == '__main__': \n", 149 | " this_machine = 'slave' \n", 150 | " print ('开始分布式爬虫')\n", 151 | " if this_machine == 'master':\n", 152 | " push_redis_list()\n", 153 | " else:\n", 154 | " get_img()" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.6.0" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | -------------------------------------------------------------------------------- /第一版/Cha 12 -分布式爬虫/1497099934.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 12 -分布式爬虫/1497099934.jpg -------------------------------------------------------------------------------- /第一版/Cha 12 -分布式爬虫/Cha 12 - 分布式爬虫.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 12.3 Redis分布式爬虫实战" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 12.3.2 加入任务队列" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "def push_redis_list():\n", 26 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 27 | " print (r.keys('*'))\n", 28 | " \n", 29 | " link_list = []\n", 30 | " with open('alexa.txt', 'r') as file:\n", 31 | " file_list = file.readlines()\n", 32 | " for eachone in file_list:\n", 33 | " link = eachone.split('\\t')[1]\n", 34 | " link = link.replace('\\n','')\n", 35 | " link_list.append(link)\n", 36 | " if len(link_list) == 100:\n", 37 | " break\n", 38 | " \n", 39 | " for url in link_list:\n", 40 | " response = requests.get(url, headers=headers, timeout=20)\n", 41 | " soup = BeautifulSoup(response.text, 'lxml')\n", 42 | " img_list = soup.find_all('img')\n", 43 | " for img in img_list:\n", 44 | " img_url = img['src']\n", 45 | " if img_url != '':\n", 46 | " print (\"加入的图片url: \", img_url)\n", 47 | " r.lpush('img_url',img_url)\n", 48 | " print ('现在图片链接的个数为', r.llen('img_url'))\n", 49 | " return" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# 12.3.3 读取任务队列,下载图片" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "def get_img():\n", 68 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 69 | " while True:\n", 70 | " try:\n", 71 | " url = r.lpop('img_url')\n", 72 | " url = url.decode('ascii')\n", 73 | " try:\n", 74 | " response = requests.get(url, headers=headers,timeout = 20)\n", 75 | " name = int(time.time())\n", 76 | " f = open(str(name)+ url[-4:], 'wb')\n", 77 | " f.write(response.content)\n", 78 | " f.close()\n", 79 | " print ('已经获取图片', url)\n", 80 | " except Exception as e:\n", 81 | " print ('爬取图片过程出问题', e)\n", 82 | " time.sleep(3)\n", 83 | " except Exception as e:\n", 84 | " print (e)\n", 85 | " time.sleep(10)\n", 86 | " break\n", 87 | " return " 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# 12.3.4 分布式爬虫代码" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 13, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "import requests\n", 106 | "from bs4 import BeautifulSoup\n", 107 | "import re\n", 108 | "import time\n", 109 | "from redis import Redis\n", 110 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 111 | "\n", 112 | "def push_redis_list():\n", 113 | " #与上面此函数相同\n", 114 | "\n", 115 | "def get_img():\n", 116 | " #与上面此函数相同\n", 117 | "\n", 118 | "if __name__ == '__main__': \n", 119 | " this_machine = 'master' \n", 120 | " print ('开始分布式爬虫')\n", 121 | " if this_machine == 'master':\n", 122 | " push_redis_list()\n", 123 | " else:\n", 124 | " get_img()" 125 | 
] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 14, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "import requests\n", 136 | "from bs4 import BeautifulSoup\n", 137 | "import re\n", 138 | "import time\n", 139 | "from redis import Redis\n", 140 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 141 | "\n", 142 | "def push_redis_list():\n", 143 | " #与上面此函数相同\n", 144 | "\n", 145 | "def get_img():\n", 146 | " #与上面此函数相同\n", 147 | "\n", 148 | "if __name__ == '__main__': \n", 149 | " this_machine = 'slave' \n", 150 | " print ('开始分布式爬虫')\n", 151 | " if this_machine == 'master':\n", 152 | " push_redis_list()\n", 153 | " else:\n", 154 | " get_img()" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.6.0" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | -------------------------------------------------------------------------------- /第一版/Cha 12 -分布式爬虫/master.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import time 5 | from redis import Redis 6 | headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' } 7 | 8 | def push_redis_list(): 9 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 10 | print (r.keys('*')) 11 | 12 | link_list = [] 13 | with open('alexa.txt', 'r') as file: 14 | file_list = file.readlines() 15 | for eachone in file_list: 16 | link = eachone.split('\t')[1] 17 | link = link.replace('\n','') 18 | link_list.append(link) 19 | if len(link_list) == 100: 20 | break 21 | 22 | for url in link_list: 23 | response = requests.get(url, headers=headers, timeout=20) 24 | soup = BeautifulSoup(response.text, 'lxml') 25 | img_list = soup.find_all('img') 26 | for img in img_list: 27 | img_url = img['src'] 28 | if img_url != '': 29 | print ("加入的图片url: ", img_url) 30 | r.lpush('img_url',img_url) 31 | print ('现在图片链接的个数为', r.llen('img_url')) 32 | return 33 | 34 | def get_img(): 35 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 36 | while True: 37 | try: 38 | url = r.lpop('img_url') 39 | url = url.decode('ascii') 40 | try: 41 | response = requests.get(url, headers=headers,timeout = 20) 42 | name = int(time.time()) 43 | f = open(str(name)+ url[-4:], 'wb') 44 | f.write(response.content) 45 | f.close() 46 | print ('已经获取图片', url) 47 | except Exception as e: 48 | print ('爬取图片过程出问题', e) 49 | time.sleep(3) 50 | except Exception as e: 51 | print (e) 52 | time.sleep(10) 53 | break 54 | return 55 | 56 | if __name__ == '__main__': 57 | this_machine = 'master' 58 | print ('开始分布式爬虫') 59 | if this_machine == 'master': 60 | push_redis_list() 61 | else: 62 | get_img() -------------------------------------------------------------------------------- /第一版/Cha 12 -分布式爬虫/slave.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 
| import time 5 | from redis import Redis 6 | headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' } 7 | 8 | def push_redis_list(): 9 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 10 | print (r.keys('*')) 11 | 12 | link_list = [] 13 | with open('alexa.txt', 'r') as file: 14 | file_list = file.readlines() 15 | for eachone in file_list: 16 | link = eachone.split('\t')[1] 17 | link = link.replace('\n','') 18 | link_list.append(link) 19 | if len(link_list) == 100: 20 | break 21 | 22 | for url in link_list: 23 | response = requests.get(url, headers=headers, timeout=20) 24 | soup = BeautifulSoup(response.text, 'lxml') 25 | img_list = soup.find_all('img') 26 | for img in img_list: 27 | img_url = img['src'] 28 | if img_url != '': 29 | print ("加入的图片url: ", img_url) 30 | r.lpush('img_url',img_url) 31 | print ('现在图片链接的个数为', r.llen('img_url')) 32 | return 33 | 34 | def get_img(): 35 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 36 | while True: 37 | try: 38 | url = r.lpop('img_url') 39 | url = url.decode('ascii') 40 | if url[:2] == '//': 41 | url = 'http:' + url 42 | print (url) 43 | try: 44 | response = requests.get(url, headers=headers,timeout = 20) 45 | name = int(time.time()) 46 | f = open(str(name)+ url[-4:], 'wb') 47 | f.write(response.content) 48 | f.close() 49 | print ('已经获取图片', url) 50 | except Exception as e: 51 | print ('爬取图片过程出问题', e) 52 | time.sleep(3) 53 | except Exception as e: 54 | print (e) 55 | time.sleep(10) 56 | break 57 | return 58 | 59 | if __name__ == '__main__': 60 | this_machine = 'slave' 61 | print ('开始分布式爬虫') 62 | if this_machine == 'master': 63 | push_redis_list() 64 | else: 65 | get_img() -------------------------------------------------------------------------------- /第一版/Cha 15 -爬虫实战三:百度地图API/cities.txt: -------------------------------------------------------------------------------- 1 | 南京市 534 2 | 苏州市 698 3 | 无锡市 491 4 | 常州市 292 5 | 南通市 236 6 | 盐城市 227 7 | 徐州市 216 8 | 扬州市 200 9 | 淮安市 145 10 | 泰州市 140 11 | 宿迁市 114 12 | 镇江市 110 13 | 连云港市 78 14 | 杭州市 665 15 | 宁波市 549 16 | 温州市 514 17 | 嘉兴市 301 18 | 金华市 274 19 | 绍兴市 216 20 | 台州市 213 21 | 湖州市 123 22 | 丽水市 87 23 | 衢州市 72 24 | 舟山市 59 25 | 广州市 1357 26 | 深圳市 1022 27 | 东莞市 509 28 | 佛山市 799 29 | 江门市 281 30 | 中山市 200 31 | 惠州市 199 32 | 珠海市 147 33 | 梅州市 132 34 | 湛江市 104 35 | 汕头市 94 36 | 韶关市 83 37 | 揭阳市 82 38 | 肇庆市 72 39 | 清远市 66 40 | 茂名市 61 41 | 阳江市 59 42 | 河源市 51 43 | 汕尾市 48 44 | 潮州市 33 45 | 云浮市 22 46 | 福州市 557 47 | 厦门市 426 48 | 泉州市 411 49 | 漳州市 184 50 | 龙岩市 93 51 | 南平市 91 52 | 三明市 90 53 | 宁德市 78 54 | 莆田市 66 55 | 济南市 223 56 | 青岛市 417 57 | 潍坊市 179 58 | 烟台市 165 59 | 临沂市 133 60 | 威海市 118 61 | 淄博市 95 62 | 济宁市 92 63 | 东营市 73 64 | 泰安市 72 65 | 枣庄市 62 66 | 滨州市 61 67 | 菏泽市 60 68 | 德州市 55 69 | 莱芜市 43 70 | 日照市 43 71 | 聊城市 42 72 | 郑州市 409 73 | 洛阳市 171 74 | 南阳市 102 75 | 平顶山市 64 76 | 新乡市 63 77 | 驻马店市 61 78 | 开封市 58 79 | 焦作市 54 80 | 三门峡市 48 81 | 信阳市 41 82 | 许昌市 40 83 | 周口市 38 84 | 商丘市 37 85 | 安阳市 36 86 | 漯河市 29 87 | 鹤壁市 25 88 | 濮阳市 21 89 | 济源市 18 90 | 石家庄市 287 91 | 保定市 190 92 | 秦皇岛市 157 93 | 唐山市 155 94 | 邯郸市 133 95 | 邢台市 115 96 | 张家口市 96 97 | 沧州市 83 98 | 廊坊市 74 99 | 承德市 49 100 | 衡水市 49 101 | 成都市 590 102 | 南充市 80 103 | 绵阳市 76 104 | 遂宁市 62 105 | 德阳市 58 106 | 内江市 55 107 | 泸州市 54 108 | 宜宾市 43 109 | 乐山市 41 110 | 广安市 40 111 | 攀枝花市 38 112 | 达州市 36 113 | 凉山彝族自治州 36 114 | 广元市 34 115 | 资阳市 34 116 | 自贡市 33 117 | 眉山市 30 118 | 雅安市 30 119 | 甘孜藏族自治州 19 120 | 巴中市 14 121 | 阿坝藏族羌族自治州 11 122 | 沈阳市 311 123 | 大连市 406 124 | 抚顺市 82 125 | 锦州市 70 126 | 
鞍山市 59 127 | 营口市 54 128 | 盘锦市 53 129 | 本溪市 42 130 | 铁岭市 37 131 | 阜新市 34 132 | 葫芦岛市 34 133 | 辽阳市 29 134 | 丹东市 24 135 | 朝阳市 20 136 | 昆明市 402 137 | 西双版纳傣族自治州 140 138 | 玉溪市 71 139 | 大理白族自治州 71 140 | 曲靖市 48 141 | 丽江市 46 142 | 普洱市 39 143 | 红河哈尼族彝族自治州 36 144 | 保山市 33 145 | 临沧市 31 146 | 楚雄彝族自治州 28 147 | 文山壮族苗族自治州 23 148 | 德宏傣族景颇族自治州 20 149 | 迪庆藏族自治州 14 150 | 昭通市 14 151 | 怒江傈僳族自治州 5 152 | 长沙市 367 153 | 株洲市 134 154 | 郴州市 70 155 | 衡阳市 68 156 | 邵阳市 54 157 | 永州市 49 158 | 张家界市 43 159 | 娄底市 41 160 | 岳阳市 38 161 | 怀化市 36 162 | 湘西土家族苗族自治州 34 163 | 常德市 32 164 | 湘潭市 30 165 | 益阳市 22 166 | 武汉市 531 167 | 宜昌市 90 168 | 襄阳市 65 169 | 荆门市 62 170 | 十堰市 46 171 | 黄冈市 46 172 | 黄石市 43 173 | 荆州市 43 174 | 孝感市 42 175 | 咸宁市 38 176 | 随州市 35 177 | 恩施土家族苗族自治州 29 178 | 鄂州市 19 179 | 潜江市 19 180 | 仙桃市 16 181 | 神农架林区 10 182 | 天门市 10 183 | 赣州市 223 184 | 南昌市 145 185 | 九江市 111 186 | 上饶市 100 187 | 吉安市 93 188 | 宜春市 77 189 | 抚州市 75 190 | 萍乡市 53 191 | 新余市 51 192 | 景德镇市 46 193 | 鹰潭市 25 194 | 合肥市 274 195 | 芜湖市 70 196 | 黄山市 59 197 | 滁州市 57 198 | 阜阳市 56 199 | 六安市 55 200 | 安庆市 54 201 | 宿州市 48 202 | 马鞍山市 44 203 | 宣城市 44 204 | 巢湖市 38 205 | 淮南市 35 206 | 亳州市 31 207 | 池州市 27 208 | 淮北市 27 209 | 铜陵市 27 210 | 蚌埠市 25 211 | 太原市 209 212 | 晋中市 89 213 | 临汾市 78 214 | 长治市 70 215 | 大同市 58 216 | 晋城市 56 217 | 运城市 55 218 | 忻州市 47 219 | 阳泉市 45 220 | 吕梁市 33 221 | 朔州市 26 222 | 南宁市 180 223 | 柳州市 85 224 | 桂林市 77 225 | 玉林市 60 226 | 北海市 51 227 | 河池市 47 228 | 百色市 37 229 | 梧州市 35 230 | 贺州市 28 231 | 防城港市 26 232 | 贵港市 26 233 | 钦州市 26 234 | 来宾市 21 235 | 崇左市 18 236 | 西安市 378 237 | 宝鸡市 86 238 | 渭南市 61 239 | 咸阳市 56 240 | 汉中市 49 241 | 榆林市 34 242 | 延安市 31 243 | 铜川市 27 244 | 商洛市 23 245 | 安康市 22 246 | 哈尔滨市 205 247 | 齐齐哈尔市 65 248 | 大庆市 65 249 | 黑河市 57 250 | 牡丹江市 51 251 | 伊春市 47 252 | 佳木斯市 47 253 | 鹤岗市 36 254 | 鸡西市 34 255 | 绥化市 25 256 | 七台河市 22 257 | 双鸭山市 22 258 | 大兴安岭地区 14 259 | 包头市 139 260 | 鄂尔多斯市 101 261 | 呼和浩特市 96 262 | 呼伦贝尔市 55 263 | 赤峰市 45 264 | 乌海市 39 265 | 巴彦淖尔市 33 266 | 乌兰察布市 33 267 | 锡林郭勒盟 33 268 | 阿拉善盟 27 269 | 通辽市 26 270 | 兴安盟 21 271 | 贵阳市 156 272 | 遵义市 67 273 | 黔南布依族苗族自治州 51 274 | 毕节地区 40 275 | 铜仁地区 38 276 | 六盘水市 35 277 | 黔东南苗族侗族自治州 31 278 | 黔西南布依族苗族自治州 21 279 | 安顺市 12 280 | 长春市 201 281 | 延边朝鲜族自治州 90 282 | 吉林市 69 283 | 通化市 35 284 | 松原市 34 285 | 四平市 32 286 | 白山市 29 287 | 辽源市 17 288 | 白城市 15 289 | 兰州市 102 290 | 酒泉市 53 291 | 天水市 35 292 | 武威市 34 293 | 定西市 31 294 | 白银市 30 295 | 张掖市 23 296 | 平凉市 21 297 | 临夏回族自治州 20 298 | 金昌市 19 299 | 陇南市 18 300 | 甘南藏族自治州 11 301 | 嘉峪关市 11 302 | 庆阳市 11 303 | 乌鲁木齐市 110 304 | 昌吉回族自治州 44 305 | 克拉玛依市 31 306 | 伊犁哈萨克自治州 30 307 | 塔城地区 25 308 | 喀什地区 24 309 | 巴音郭楞蒙古自治州 19 310 | 阿勒泰地区 18 311 | 哈密地区 15 312 | 阿克苏地区 14 313 | 吐鲁番地区 13 314 | 和田地区 11 315 | 博尔塔拉蒙古自治州 9 316 | 石河子市 4 317 | 五家渠市 4 318 | 阿拉尔市 2 319 | 图木舒克市 2 320 | 海口市 97 321 | 三亚市 74 322 | 文昌市 13 323 | 儋州市 11 324 | 澄迈县 11 325 | 琼海市 10 326 | 万宁市 8 327 | 昌江黎族自治县 7 328 | 东方市 5 329 | 屯昌县 5 330 | 保亭黎族苗族自治县 4 331 | 定安县 4 332 | 琼中黎族苗族自治县 4 333 | 临高县 3 334 | 陵水黎族自治县 3 335 | 白沙黎族自治县 2 336 | 乐东黎族自治县 2 337 | 五指山市 2 338 | 银川市 105 339 | 吴忠市 36 340 | 石嘴山市 26 341 | 固原市 22 342 | 中卫市 10 343 | 西宁市 73 344 | 海西蒙古族藏族自治州 19 345 | 海南藏族自治州 14 346 | 海北藏族自治州 8 347 | 海东地区 7 348 | 黄南藏族自治州 7 349 | 果洛藏族自治州 3 350 | 玉树藏族自治州 2 351 | 拉萨市 18 352 | 日喀则地区 10 353 | 林芝地区 9 354 | 山南地区 6 355 | 阿里地区 2 356 | 昌都地区 1 357 | 北京市 1499 358 | 上海市 1147 359 | 重庆市 734 360 | 天津市 509 361 | 香港特别行政区 200 362 | 澳门特别行政区 43 363 | -------------------------------------------------------------------------------- /第一版/Cha 2 - 编写你的第一个网络爬虫/Cha 2 _章末实战.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 
| "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 题目1:请使用Python中的循环,打印输出从1到100的所有奇数。" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": { 14 | "scrolled": true 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "1\n", 22 | "3\n", 23 | "5\n", 24 | "7\n", 25 | "9\n", 26 | "11\n", 27 | "13\n", 28 | "15\n", 29 | "17\n", 30 | "19\n", 31 | "21\n", 32 | "23\n", 33 | "25\n", 34 | "27\n", 35 | "29\n", 36 | "31\n", 37 | "33\n", 38 | "35\n", 39 | "37\n", 40 | "39\n", 41 | "41\n", 42 | "43\n", 43 | "45\n", 44 | "47\n", 45 | "49\n", 46 | "51\n", 47 | "53\n", 48 | "55\n", 49 | "57\n", 50 | "59\n", 51 | "61\n", 52 | "63\n", 53 | "65\n", 54 | "67\n", 55 | "69\n", 56 | "71\n", 57 | "73\n", 58 | "75\n", 59 | "77\n", 60 | "79\n", 61 | "81\n", 62 | "83\n", 63 | "85\n", 64 | "87\n", 65 | "89\n", 66 | "91\n", 67 | "93\n", 68 | "95\n", 69 | "97\n", 70 | "99\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "for i in range(1,101):\n", 76 | " if i % 2 == 1:\n", 77 | " print (i)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# 题目2:请将字符串 ”你好$$$我正在学Python@#@#现在需要&%&%&修改字符串” 中的符号变成一个空格,需要输出的格式为:”你好 我正在学Python 现在需要 修改字符串”" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 17, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "你好 我正在学Python 现在需要 修改字符串\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "str1 = '你好$$$我正在学Python@#@#现在需要&%&%&修改字符串'\n", 102 | "str2 = str1.replace('$$$', ' ').replace('@#@#', ' ').replace('&%&%&', ' ')\n", 103 | "print (str2)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 18, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "你好 我正在学Python 现在需要 修改字符串\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "import re\n", 121 | "str1 = '你好$$$我正在学Python@#@#现在需要&%&%&修改字符串'\n", 122 | "str2 = re.sub('[$@#&%]+', ' ' ,str1)\n", 123 | "print (str2)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# 题目3:输出 9*9 乘法口诀表" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 50, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "1x1=1\t\n", 143 | "1x2=2\t2x2=4\t\n", 144 | "1x3=3\t2x3=6\t3x3=9\t\n", 145 | "1x4=4\t2x4=8\t3x4=12\t4x4=16\t\n", 146 | "1x5=5\t2x5=10\t3x5=15\t4x5=20\t5x5=25\t\n", 147 | "1x6=6\t2x6=12\t3x6=18\t4x6=24\t5x6=30\t6x6=36\t\n", 148 | "1x7=7\t2x7=14\t3x7=21\t4x7=28\t5x7=35\t6x7=42\t7x7=49\t\n", 149 | "1x8=8\t2x8=16\t3x8=24\t4x8=32\t5x8=40\t6x8=48\t7x8=56\t8x8=64\t\n", 150 | "1x9=9\t2x9=18\t3x9=27\t4x9=36\t5x9=45\t6x9=54\t7x9=63\t8x9=72\t9x9=81\t\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "for i in range(1, 10):\n", 156 | " for j in range(1, i+1):\n", 157 | " print (\"%dx%d=%d\\t\" % (j, i, i*j), end=\"\")\n", 158 | " print(\"\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# 题目4:请写出一个函数,当输入函数变量当月利润I,能返回应发放奖金总数,例如输出“利润100000元时,应发放奖金总数为10000元。”。\n", 166 | "其中,企业发放的奖金根据利润提成。利润(I)低于或等于10万元时,奖金可提10%;利润高于10万元,低于20万元时,低于10万元的部分按10%提成,高于10万元的部分,可提成7.5%;20万到40万之间时,高于20万元的部分,可提成5%;40万到60万之间时高于40万元的部分,可提成3%;60万到100万之间时,高于60万元的部分,可提成1.5%,高于100万元时,超过100万元的部分按1%提成" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | 
"execution_count": 61, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "净利润:210000\n", 179 | "利润为210000元时,应发奖金总数为18000元\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "def calcute_profit(I):\n", 185 | " I = I / 10000\n", 186 | " if I <= 10:\n", 187 | " a = I * 0.01\n", 188 | " return a * 10000\n", 189 | " elif I <= 20 and I > 10:\n", 190 | " b =0.25 + I * 0.075\n", 191 | " return b * 10000\n", 192 | " elif I <= 40 and I > 20:\n", 193 | " c = 0.75 + I * 0.05\n", 194 | " return c * 10000\n", 195 | " elif I <= 60 and I > 40:\n", 196 | " d = 0.95 + I * 0.03\n", 197 | " return d * 10000\n", 198 | " elif I <= 60 and I > 100:\n", 199 | " e = 2 + I * 0.015\n", 200 | " return e * 10000\n", 201 | " else:\n", 202 | " f = 2.95 + I * 0.01\n", 203 | " return f * 10000\n", 204 | " \n", 205 | "I = int(input('净利润:'))\n", 206 | "profit = calcute_profit(I)\n", 207 | "print ('利润为%d元时,应发奖金总数为%d元' % (I, profit))" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 57, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "净利润:210000\n", 220 | "利润为210000元时,应发奖金总数为18000元\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "def calcute_profit(I):\n", 226 | " arr = [1000000,600000,400000,200000,100000,0] #这应该就是各个分界值了,把它们放在列表里方便访问\n", 227 | " rat = [0.01,0.015,0.03,0.05,0.075,0.1] #这是各个分界值所对应的奖金比例值\n", 228 | " r = 0 #这是总奖金的初始值\n", 229 | " for idx in range(0,6): #有6个分界值当然要循环6次\n", 230 | " if I > arr[idx]:\n", 231 | " r = r + (I - arr[idx]) * rat[idx] \n", 232 | " I = arr[idx]\n", 233 | " return r\n", 234 | "\n", 235 | "I = int(input('净利润:'))\n", 236 | "profit = calcute_profit(I)\n", 237 | "print ('利润为%d元时,应发奖金总数为%d元' % (I, profit))" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "# 题目5:用字典的值对字典进行排序" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 71, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "[(0, 0), (2, 1), (1, 2), (4, 3), (3, 4)]\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "import operator\n", 262 | "x = {1: 2, 3: 4, 4:3, 2:1, 0:0}\n", 263 | "sorted_x = sorted(x.items(), key=operator.itemgetter(1))\n", 264 | "print (sorted_x)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "# 题目6:请问一下两段代码的输出分别是什么?" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 72, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "1\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "a = 1\n", 289 | "def fun(a):\n", 290 | " a = 2\n", 291 | "fun(a)\n", 292 | "print (a)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 74, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "[1]\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "a = []\n", 310 | "def fun(a):\n", 311 | " a.append(1)\n", 312 | "fun(a)\n", 313 | "print (a)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "# 题目7: 请问以下两段代码的输出分别是什么?" 
321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 76, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "bbb\n", 333 | "aaa\n", 334 | "aaa\n" 335 | ] 336 | } 337 | ], 338 | "source": [ 339 | "class Person:\n", 340 | " name=\"aaa\"\n", 341 | "\n", 342 | "p1=Person()\n", 343 | "p2=Person()\n", 344 | "p1.name=\"bbb\"\n", 345 | "print (p1.name)\n", 346 | "print (p2.name)\n", 347 | "print (Person.name)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 77, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "[1]\n", 360 | "[1]\n", 361 | "[1]\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "class Person:\n", 367 | " name=[]\n", 368 | "\n", 369 | "p1=Person()\n", 370 | "p2=Person()\n", 371 | "p1.name.append(1)\n", 372 | "print (p1.name)\n", 373 | "print (p2.name)\n", 374 | "print (Person.name)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "collapsed": true 382 | }, 383 | "outputs": [], 384 | "source": [] 385 | } 386 | ], 387 | "metadata": { 388 | "kernelspec": { 389 | "display_name": "Python 3", 390 | "language": "python", 391 | "name": "python3" 392 | }, 393 | "language_info": { 394 | "codemirror_mode": { 395 | "name": "ipython", 396 | "version": 3 397 | }, 398 | "file_extension": ".py", 399 | "mimetype": "text/x-python", 400 | "name": "python", 401 | "nbconvert_exporter": "python", 402 | "pygments_lexer": "ipython3", 403 | "version": "3.6.1" 404 | }, 405 | "toc": { 406 | "colors": { 407 | "hover_highlight": "#DAA520", 408 | "navigate_num": "#000000", 409 | "navigate_text": "#333333", 410 | "running_highlight": "#FF0000", 411 | "selected_highlight": "#FFD700", 412 | "sidebar_border": "#EEEEEE", 413 | "wrapper_background": "#FFFFFF" 414 | }, 415 | "moveMenuLeft": true, 416 | "nav_menu": { 417 | "height": "153px", 418 | "width": "252px" 419 | }, 420 | "navigate_menu": true, 421 | "number_sections": true, 422 | "sideBar": true, 423 | "threshold": 4, 424 | "toc_cell": false, 425 | "toc_section_display": "block", 426 | "toc_window_display": false, 427 | "widenNotebook": false 428 | } 429 | }, 430 | "nbformat": 4, 431 | "nbformat_minor": 2 432 | } 433 | -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/Get Alexa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import requests\n", 12 | "from bs4 import BeautifulSoup\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "20\n", 28 | "40\n", 29 | "60\n", 30 | "80\n", 31 | "100\n", 32 | "120\n", 33 | "140\n", 34 | "160\n", 35 | "180\n", 36 | "200\n", 37 | "220\n", 38 | "240\n", 39 | "260\n", 40 | "280\n", 41 | "300\n", 42 | "320\n", 43 | "340\n", 44 | "360\n", 45 | "380\n", 46 | "400\n", 47 | "420\n", 48 | "440\n", 49 | "460\n", 50 | "480\n", 51 | "500\n", 52 | "520\n", 53 | "540\n", 54 | "560\n", 55 | "580\n", 56 | "600\n", 57 | "620\n", 58 | "640\n", 59 | "660\n", 60 | "680\n", 61 | "700\n", 62 | "720\n", 63 | "740\n", 64 | "760\n", 65 | 
"780\n", 66 | "800\n", 67 | "820\n", 68 | "840\n", 69 | "860\n", 70 | "880\n", 71 | "900\n", 72 | "920\n", 73 | "940\n", 74 | "960\n", 75 | "980\n", 76 | "1000\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "num = 0\n", 82 | "for i in range(1,51):\n", 83 | " r = requests.get('http://www.alexa.cn/siterank/' + str(i))\n", 84 | " soup = BeautifulSoup(r.text, \"lxml\")\n", 85 | " span_list = soup.find_all('span', class_ = 'domain-link') \n", 86 | " link_list = [(str(j + num), span_list[j].a['href']) for j in range(len(span_list))]\n", 87 | " num = num + len(link_list)\n", 88 | " \n", 89 | " output = \"\\n\".join(\"%s\\t%s\" % tup for tup in link_list) + \"\\n\"\n", 90 | " print (num)\n", 91 | " with open('C:\\\\Users\\\\Administrator\\\\Desktop\\\\alexa.txt', 'a+', encoding = 'utf-8') as f:\n", 92 | " f.write(output)\n", 93 | " f.close\n", 94 | " time.sleep(3)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 3", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.6.0" 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 2 128 | } 129 | -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/multiprocess_test.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/multiprocess_test.cpython-35.pyc -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/multiprocess_test.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/multiprocess_test.cpython-36.pyc -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/thread_test.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/thread_test.cpython-35.pyc -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/thread_test.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/cha7/__pycache__/thread_test.cpython-36.pyc -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/gevent1.py: -------------------------------------------------------------------------------- 1 | import gevent 2 | from gevent.queue import Queue, Empty 3 | import time 4 | import requests 5 | 6 | from gevent import monkey#把下面有可能有IO操作的单独做上标记 7 | monkey.patch_all() # 将IO转为异步执行的函数 8 | 9 | 
link_list = [] 10 | with open('alexa.txt', 'r') as file: 11 | file_list = file.readlines() 12 | for eachone in file_list: 13 | link = eachone.split('\t')[1] 14 | link = link.replace('\n','') 15 | link_list.append(link) 16 | 17 | start = time.time() 18 | def crawler(index): 19 | Process_id = 'Process-' + str(index) 20 | while not workQueue.empty(): 21 | url = workQueue.get(timeout=2) 22 | try: 23 | r = requests.get(url, timeout=20) 24 | print (Process_id, workQueue.qsize(), r.status_code, url) 25 | except Exception as e: 26 | print (Process_id, workQueue.qsize(), url, 'Error: ', e) 27 | 28 | def boss(): 29 | for url in link_list: 30 | workQueue.put_nowait(url) 31 | 32 | if __name__ == '__main__': 33 | workQueue = Queue(1000) 34 | 35 | gevent.spawn(boss).join() 36 | jobs = [] 37 | for i in range(10): 38 | jobs.append(gevent.spawn(crawler, i)) 39 | gevent.joinall(jobs) 40 | 41 | end = time.time() 42 | print ('gevent + Queue多协程爬虫的总时间为:', end-start) 43 | print ('Main Ended!') 44 | -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/gevent_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import gevent 5 | from gevent.queue import Queue, Empty 6 | import time 7 | import requests 8 | 9 | from gevent import monkey#把下面有可能有IO操作的单独做上标记 10 | monkey.patch_all() # 将IO转为异步执行的函数 11 | 12 | start = time.time() 13 | workQueue = Queue(1000) 14 | def crawler(index): 15 | Process_id = 'Process-' + str(index) 16 | while not workQueue.empty(): 17 | url = workQueue.get(timeout=2) 18 | try: 19 | r = requests.get(url, timeout=20) 20 | print (Process_id, workQueue.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, workQueue.qsize(), url, 'Error: ', e) 23 | 24 | def boss(link_list): 25 | for url in link_list: 26 | workQueue.put_nowait(url) 27 | 28 | def gevent_main(link_list, g_num): 29 | gevent.spawn(boss,link_list).join() 30 | jobs = [] 31 | for i in range(g_num): 32 | jobs.append(gevent.spawn(crawler, i)) 33 | gevent.joinall(jobs) 34 | 35 | end = time.time() 36 | time_spend = end-start 37 | print ('gevent + Queue多协程爬虫的总时间为:', time_spend) 38 | print ('Main Ended!') 39 | return time_spend 40 | 41 | if __name__ == '__main__': 42 | link_list = [] 43 | with open('alexa.txt', 'r') as file: 44 | file_list = file.readlines() 45 | for eachone in file_list: 46 | link = eachone.split('\t')[1] 47 | link = link.replace('\n','') 48 | link_list.append(link) 49 | 50 | 51 | 52 | gevent_time10 = gevent_main(link_list, 15) 53 | print ('gevent + Queue多协程爬虫的总时间为:', gevent_time10) 54 | 55 | gevent_time3 = gevent_main(link_list, 20) 56 | print ('gevent + Queue多协程爬虫的总时间为:', gevent_time3) 57 | 58 | with open('result_gevent.txt','a+',encoding='utf-8') as f: 59 | f.write('\t' + str(gevent_time10) + '\t' + str(gevent_time3)) -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/multiprocess_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from multiprocessing import Pool, Manager 5 | import time 6 | import requests 7 | 8 | def crawler(q, index): 9 | Process_id = 'Process-' + str(index) 10 | while not q.empty(): 11 | url = q.get(timeout=2) 12 | try: 13 | r = requests.get(url, timeout=20) 14 | print (Process_id, q.qsize(), r.status_code, url) 15 | except Exception as e: 16 | print (Process_id, q.qsize(), url, 'Error: 
', e) 17 | 18 | 19 | def multiprocess_main(link_list, p_num): 20 | start = time.time() 21 | manager = Manager() 22 | workQueue = manager.Queue(1000) 23 | 24 | # 填充队列 25 | for url in link_list: 26 | workQueue.put(url) 27 | 28 | print ("Started processes") 29 | pool = Pool(processes=p_num) 30 | for i in range(p_num): 31 | pool.apply_async(crawler, args=(workQueue, i)) 32 | 33 | 34 | pool.close() 35 | pool.join() 36 | 37 | end = time.time() 38 | time_spend = end-start 39 | print ('Pool + Queue多进程爬虫的总时间为:', time_spend) 40 | print ('Main process Ended!') 41 | return time_spend 42 | 43 | if __name__ == '__main__': 44 | link_list = [] 45 | with open('alexa.txt', 'r') as file: 46 | file_list = file.readlines() 47 | for eachone in file_list: 48 | link = eachone.split('\t')[1] 49 | link = link.replace('\n','') 50 | link_list.append(link) 51 | 52 | multiprocess_main(link_list, 3) -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/mutilprocess1.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | class MyProcess(Process): 15 | def __init__(self, q): 16 | Process.__init__(self) 17 | self.q = q 18 | 19 | def run(self): 20 | print ("Starting " , self.pid) 21 | while not self.q.empty(): 22 | crawler(self.q) 23 | print ("Exiting " , self.pid) 24 | 25 | def crawler(q): 26 | url = q.get(timeout=2) 27 | try: 28 | r = requests.get(url, timeout=20) 29 | print (q.qsize(), r.status_code, url) 30 | except Exception as e: 31 | print (q.qsize(), url, 'Error: ', e) 32 | 33 | if __name__ == '__main__': 34 | ProcessNames = ["Process-1", "Process-2", "Process-3"] 35 | workQueue = Queue(1000) 36 | 37 | # 填充队列 38 | for url in link_list: 39 | workQueue.put(url) 40 | 41 | for i in range(0, 3): 42 | p = MyProcess(workQueue) 43 | p.daemon = True 44 | p.start() 45 | p.join() 46 | 47 | end = time.time() 48 | print ('Process + Queue多进程爬虫的总时间为:', end-start) 49 | print ('Main process Ended!') -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/mutilprocess2.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool, Manager 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | def crawler(q, index): 15 | Process_id = 'Process-' + str(index) 16 | while not q.empty(): 17 | url = q.get(timeout=2) 18 | try: 19 | r = requests.get(url, timeout=20) 20 | print (Process_id, q.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, q.qsize(), url, 'Error: ', e) 23 | 24 | 25 | if __name__ == '__main__': 26 | manager = Manager() 27 | workQueue = manager.Queue(1000) 28 | 29 | # 填充队列 30 | for url in link_list: 31 | workQueue.put(url) 32 | 33 | pool = Pool(processes=3) 34 | for i in range(4): 35 | pool.apply_async(crawler, args=(workQueue, i)) 36 | 37 | print ("Started processes") 38 | pool.close() 39 | pool.join() 40 | 41 | end = time.time() 42 | 
print ('Pool + Queue多进程爬虫的总时间为:', end-start) 43 | print ('Main process Ended!') 44 | -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/mutilprocess3.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool, Manager 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | def crawler(q, index): 15 | Process_id = 'Process-' + str(index) 16 | while not q.empty(): 17 | url = q.get(timeout=2) 18 | try: 19 | r = requests.get(url, timeout=20) 20 | print (Process_id, q.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, q.qsize(), url, 'Error: ', e) 23 | 24 | 25 | if __name__ == '__main__': 26 | manager = Manager() 27 | workQueue = manager.Queue(1000) 28 | 29 | # 填充队列 30 | for url in link_list: 31 | workQueue.put(url) 32 | 33 | pool = Pool(processes=3) 34 | for i in range(4): 35 | pool.apply(crawler, args=(workQueue, i)) 36 | 37 | print ("Started processes") 38 | pool.close() 39 | pool.join() 40 | 41 | end = time.time() 42 | print ('Pool + Queue多进程爬虫的总时间为:', end-start) 43 | print ('Main process Ended!') 44 | -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/result.txt: -------------------------------------------------------------------------------- 1 | 312.7718894481659 143.37620067596436 549.7254424095154 549.978456735611 -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/result_gevent.txt: -------------------------------------------------------------------------------- 1 | 338.3443522453308 922.8117818832397 312.1618547439575 484.05668663978577 -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/result_single_time.txt: -------------------------------------------------------------------------------- 1 | 1721.3604562282562 -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/thread1.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import requests 3 | import time 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | class myThread (threading.Thread): 15 | def __init__(self, name, link_range): 16 | threading.Thread.__init__(self) 17 | self.name = name 18 | self.link_range = link_range 19 | def run(self): 20 | print ("Starting " + self.name) 21 | crawler(self.name, self.link_range) 22 | print ("Exiting " + self.name) 23 | 24 | def crawler(threadName, link_range): 25 | for i in range(link_range[0],link_range[1]+1): 26 | try: 27 | r = requests.get(link_list[i], timeout=20) 28 | print (threadName, r.status_code, link_list[i]) 29 | except Exception as e: 30 | print(threadName, 'Error: ', e) 31 | 32 | thread_list = [] 33 | link_range_list = [(0,200),(201,400),(401,600),(601,800),(801,1000)] 34 | 35 | # 创建新线程 36 | for i in range(1,6): 37 | thread = myThread("Thread-" + str(i), link_range_list[i-1]) 38 | thread.start() 39 | thread_list.append(thread) 40 
| 41 | # 等待所有线程完成 42 | for thread in thread_list: 43 | thread.join() 44 | 45 | end = time.time() 46 | print ('简单多线程爬虫的总时间为:', end-start) 47 | print ("Exiting Main Thread") -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/thread2.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import requests 3 | import time 4 | import queue as Queue 5 | 6 | link_list = [] 7 | with open('alexa.txt', 'r') as file: 8 | file_list = file.readlines() 9 | for eachone in file_list: 10 | link = eachone.split('\t')[1] 11 | link = link.replace('\n','') 12 | link_list.append(link) 13 | 14 | start = time.time() 15 | class myThread (threading.Thread): 16 | def __init__(self, name, q): 17 | threading.Thread.__init__(self) 18 | self.name = name 19 | self.q = q 20 | def run(self): 21 | print ("Starting " + self.name) 22 | while True: 23 | try: 24 | crawler(self.name, self.q) 25 | except: 26 | break 27 | print ("Exiting " + self.name) 28 | 29 | def crawler(threadName, q): 30 | url = q.get(timeout=2) 31 | try: 32 | r = requests.get(url, timeout=20) 33 | print (q.qsize(), threadName, r.status_code, url) 34 | except Exception as e: 35 | print (q.qsize(), threadName, url, 'Error: ', e) 36 | 37 | threadList = ["Thread-1", "Thread-2", "Thread-3","Thread-4", "Thread-5"] 38 | workQueue = Queue.Queue(1000) 39 | threads = [] 40 | 41 | # 创建新线程 42 | for tName in threadList: 43 | thread = myThread(tName, workQueue) 44 | thread.start() 45 | threads.append(thread) 46 | 47 | # 填充队列 48 | for url in link_list: 49 | workQueue.put(url) 50 | 51 | # 等待所有线程完成 52 | for t in threads: 53 | t.join() 54 | 55 | end = time.time() 56 | print ('Queue多线程爬虫的总时间为:', end-start) 57 | print ("Exiting Main Thread") -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/thread_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import threading 5 | import requests 6 | import time 7 | import queue as Queue 8 | 9 | 10 | class myThread (threading.Thread): 11 | def __init__(self, name, q): 12 | threading.Thread.__init__(self) 13 | self.name = name 14 | self.q = q 15 | def run(self): 16 | print ("Starting " + self.name) 17 | while True: 18 | try: 19 | crawler(self.name, self.q) 20 | except: 21 | break 22 | print ("Exiting " + self.name) 23 | 24 | def crawler(threadName, q): 25 | url = q.get(timeout=2) 26 | try: 27 | r = requests.get(url, timeout=20) 28 | print (q.qsize(), threadName, r.status_code, url) 29 | except Exception as e: 30 | print (q.qsize(), threadName, url, 'Error: ', e) 31 | 32 | def thread_main(link_list, t_num): 33 | start = time.time() 34 | workQueue = Queue.Queue(1000) 35 | threads = [] 36 | 37 | # 创建新线程 38 | for tName in range(t_num): 39 | thread = myThread('Thread' + str(tName), workQueue) 40 | thread.start() 41 | threads.append(thread) 42 | 43 | # 填充队列 44 | for url in link_list: 45 | workQueue.put(url) 46 | 47 | # 等待所有线程完成 48 | for t in threads: 49 | t.join() 50 | 51 | end = time.time() 52 | print ('Queue多线程爬虫的总时间为:', end-start) 53 | print ("Exiting Main Thread") 54 | return end-start 55 | 56 | if __name__ == '__main__': 57 | link_list = [] 58 | with open('alexa.txt', 'r') as file: 59 | file_list = file.readlines() 60 | for eachone in file_list: 61 | link = eachone.split('\t')[1] 62 | link = link.replace('\n','') 63 | link_list.append(link) 64 | 65 | thread_main(link_list, 5) 
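# Usage sketch for thread_main above (grounded in this file: alexa.txt sits next to the
# script and each line is "rank<TAB>url", as the __main__ block already shows):
#
#     from thread_test import thread_main
#     elapsed = thread_main(link_list, 10)   # ten worker threads share one Queue of URLs
#
# Each worker exits once queue.get() times out (2 s) on an empty queue, so thread_main
# returns the elapsed seconds only after every queued URL has been attempted.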
-------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/time_spend 2.py: -------------------------------------------------------------------------------- 1 | from multiprocess_test import multiprocess_main 2 | from thread_test import thread_main 3 | 4 | if __name__ == '__main__': 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | #single = single() 14 | #print ('串行的总时间为:', single) 15 | 16 | #thread_time = thread_main(link_list, 5) 17 | #print ('Queue多线程爬虫的总时间为:', thread_time) 18 | 19 | multiprocess_time = multiprocess_main(link_list, 3) 20 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time) 21 | 22 | #gevent_time = gevent_main(link_list, 10) 23 | #print ('gevent + Queue多协程爬虫的总时间为:', gevent_time) 24 | 25 | #with open('result.txt','a+',encoding='utf-8') as f: 26 | # f.write(single + '\t' + thread_time + '\t' + multiprocess_time + '\t' + gevent_time) -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/cha7/time_spend.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | import time 6 | #from multiprocess_test import multiprocess_main 7 | #from thread_test import thread_main 8 | 9 | def single(): 10 | start = time.time() 11 | for eachone in link_list: 12 | try: 13 | r = requests.get(eachone) 14 | print (r.status_code, eachone) 15 | except Exception as e: 16 | print('Error: ', e) 17 | end = time.time() 18 | time_spend = end-start 19 | print ('串行的总时间为:', time_spend) 20 | return time_spend 21 | 22 | if __name__ == '__main__': 23 | link_list = [] 24 | with open('alexa.txt', 'r') as file: 25 | file_list = file.readlines() 26 | for eachone in file_list: 27 | link = eachone.split('\t')[1] 28 | link = link.replace('\n','') 29 | link_list.append(link) 30 | 31 | #thread_time10 = thread_main(link_list, 10) 32 | #print ('Queue多线程爬虫的总时间为:', thread_time10) 33 | 34 | #multiprocess_time10 = multiprocess_main(link_list, 10) 35 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time10) 36 | 37 | #thread_time3 = thread_main(link_list, 3) 38 | #print ('Queue多线程爬虫的总时间为:', thread_time3) 39 | 40 | #multiprocess_time3 = multiprocess_main(link_list, 3) 41 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time3) 42 | 43 | single_time = single() 44 | print ('串行的总时间为:', single_time) 45 | 46 | with open('result_single_time.txt','a+',encoding='utf-8') as f: 47 | f.write(str(single_time)) 48 | #f.write(str(thread_time10) + '\t' + str(multiprocess_time10) + '\t' + str(thread_time3) + '\t' + str(multiprocess_time3)) -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/多协程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/多协程.png -------------------------------------------------------------------------------- /第一版/Cha 7 -提升爬虫的速度/多线程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/多线程.png -------------------------------------------------------------------------------- /第一版/Cha 7 
-提升爬虫的速度/多进程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第一版/Cha 7 -提升爬虫的速度/多进程.png -------------------------------------------------------------------------------- /第一版/Cha 8 -反爬虫问题/Cha 8 -反爬虫问题.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 8.3如何“反反爬虫”?" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 8.3.1修改请求 header" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 6, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "{'User-Agent': 'python-requests/2.12.4', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "import requests\n", 34 | "r = requests.get('http://www.santostang.com')\n", 35 | "print (r.request.headers)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 5, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import requests\n", 55 | "\n", 56 | "link = 'http://www.santostang.com'\n", 57 | "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n", 58 | "r = requests.get(link, headers= headers)\n", 59 | "print (r.request.headers)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## 8.3.2 修改爬虫的间隔时间" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 10, 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "2.0001144409179688\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "import time\n", 86 | "t1 = time.time()\n", 87 | "time.sleep(2)\n", 88 | "t2 = time.time()\n", 89 | "total_time = t2-t1\n", 90 | "print (total_time)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 17, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "0.3481693303048349\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "import time\n", 110 | "import random\n", 111 | "\n", 112 | "sleep_time = random.randint(0,2) + random.random()\n", 113 | "print (sleep_time)\n", 114 | "time.sleep(sleep_time)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 19, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "开始爬取这篇博客: http://www.santostang.com/2017/03/08/hello-python/\n", 129 | "这篇博客的标题为: Hello Python!\n", 130 | "开始休息: 0.16292490492777212 秒\n", 131 | "开始爬取这篇博客: http://www.santostang.com/2017/03/07/echarts%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b02-%e5%8d%95%e9%a1%b5%e9%9d%a2%e5%a4%9a%e5%bc%a0%e5%9b%be%e8%a1%a8/\n", 132 | "这篇博客的标题为: echarts学习笔记(2) — 同一页面多图表\n", 133 | "开始休息: 
1.912631031656519 秒\n", 134 | "开始爬取这篇博客: http://www.santostang.com/2017/03/07/echarts%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b01-%e4%bd%bf%e7%94%a8%e6%a8%a1%e5%9d%97%e5%8c%96%e5%8d%95%e6%96%87%e4%bb%b6%e5%bc%95%e5%85%a5/\n", 135 | "这篇博客的标题为: echarts学习笔记(1) — 模块化单文件引入\n", 136 | "开始休息: 1.3634313119416182 秒\n", 137 | "开始爬取这篇博客: http://www.santostang.com/2017/03/06/%e3%80%90%e7%88%ac%e8%99%ab%e4%ba%8c%e3%80%91%e7%88%ac%e8%99%ab%e7%9a%84%e6%a1%86%e6%9e%b6%e5%92%8c%e5%9f%ba%e6%9c%ac%e8%ae%ae%e9%a2%98/\n", 138 | "这篇博客的标题为: 【爬虫二】爬虫的框架和基本议题\n", 139 | "开始休息: 2.0205314818737516 秒\n", 140 | "开始爬取这篇博客: http://www.santostang.com/2017/03/06/%e3%80%90%e7%88%ac%e8%99%ab%e4%b8%80%e3%80%91%e6%9c%80%e7%ae%80%e5%8d%95%e7%9a%84%e7%88%ac%e8%99%ab%ef%bc%8c%e9%9b%b6%e5%9f%ba%e7%a1%80%e6%95%99%e5%ad%a6/\n", 141 | "这篇博客的标题为: 【爬虫一】最简单的爬虫,零基础教学\n", 142 | "开始休息: 2.446761436097069 秒\n", 143 | "开始爬取这篇博客: http://www.santostang.com/2017/03/02/hello-world/\n", 144 | "这篇博客的标题为: Hello world!\n", 145 | "开始休息: 0.8005131789714476 秒\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "import requests\n", 151 | "from bs4 import BeautifulSoup\n", 152 | "import time\n", 153 | "import random\n", 154 | "\n", 155 | "link = \"http://www.santostang.com/\"\n", 156 | "\n", 157 | "def scrap(link):\n", 158 | " headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n", 159 | " r = requests.get(link, headers= headers)\n", 160 | " html = r.text\n", 161 | " soup = BeautifulSoup(html, \"lxml\")\n", 162 | " return soup\n", 163 | "\n", 164 | "soup = scrap(link)\n", 165 | "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n", 166 | "for eachone in title_list:\n", 167 | " url = eachone.a['href']\n", 168 | " print ('开始爬取这篇博客: ', url)\n", 169 | " soup_article = scrap(url)\n", 170 | " title = soup_article.find(\"h1\", class_=\"view-title\").text.strip()\n", 171 | " print ('这篇博客的标题为: ', title)\n", 172 | " sleep_time = random.randint(0,2) + random.random()\n", 173 | " print ('开始休息: ', sleep_time, '秒')\n", 174 | " time.sleep(sleep_time)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "scrap_times = 0\n", 186 | "for eachone in title_list:\n", 187 | " url = eachone.a['href']\n", 188 | " print ('开始爬取这篇博客: ', url)\n", 189 | " soup_article = scrap(url)\n", 190 | " title = soup_article.find(\"h1\", class_=\"view-title\").text.strip()\n", 191 | " print ('这篇博客的标题为: ', title)\n", 192 | " \n", 193 | " scrap_times += 1\n", 194 | " if scrap_times % 5 == 0:\n", 195 | " sleep_time = 10 + random.random()\n", 196 | " else:\n", 197 | " sleep_time = random.randint(0,2) + random.random()\n", 198 | " time.sleep(sleep_time)\n", 199 | " print ('开始休息: ', sleep_time, '秒')" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## 8.3.3 使用代理" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "import requests\n", 218 | "\n", 219 | "link = \"http://www.santostang.com/\"\n", 220 | "proxies = {'http':'http://xxx.xxx.xxx.xxx:xxxx'}\n", 221 | "response = requests.get(link, proxies=proxies)" 222 | ] 223 | } 224 | ], 225 | "metadata": { 226 | "kernelspec": { 227 | "display_name": "Python 3", 228 | "language": "python", 229 | "name": "python3" 230 | }, 231 | "language_info": { 232 | "codemirror_mode": { 233 | "name": 
"ipython", 234 | "version": 3 235 | }, 236 | "file_extension": ".py", 237 | "mimetype": "text/x-python", 238 | "name": "python", 239 | "nbconvert_exporter": "python", 240 | "pygments_lexer": "ipython3", 241 | "version": "3.6.0" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 2 246 | } 247 | -------------------------------------------------------------------------------- /第二版/Cha 11 -登录与验证码处理/captcha.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 11 -登录与验证码处理/captcha.jpg -------------------------------------------------------------------------------- /第二版/Cha 11 -登录与验证码处理/captcha_gray.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 11 -登录与验证码处理/captcha_gray.jpg -------------------------------------------------------------------------------- /第二版/Cha 11 -登录与验证码处理/captcha_thresholded.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 11 -登录与验证码处理/captcha_thresholded.jpg -------------------------------------------------------------------------------- /第二版/Cha 11 -登录与验证码处理/cookies: -------------------------------------------------------------------------------- 1 | #LWP-Cookies-2.0 2 | Set-Cookie3: wordpress_logged_in_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1544889241%7CzMUKtgScAb81VCyAMpqMQ8fQ7jryOCjFrooHMWk72xF%7C449659c6398642d69e61044e14efbb77b42028c4063536da2725f55e8aa5af26"; path="/"; domain="www.santostang.com"; path_spec; expires="2018-12-16 03:54:01Z"; httponly=None; version=0 3 | Set-Cookie3: wordpress_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1544889241%7CzMUKtgScAb81VCyAMpqMQ8fQ7jryOCjFrooHMWk72xF%7C823caf1f78a9836d8921bb977cb81ae822c6020cd554a0658d77d5475a4b0303"; path="/wp-admin"; domain="www.santostang.com"; path_spec; expires="2018-12-16 03:54:01Z"; httponly=None; version=0 4 | Set-Cookie3: wordpress_d7bcd9a2844a13f8dc3e5a4eb2cdfa70="test%7C1544889241%7CzMUKtgScAb81VCyAMpqMQ8fQ7jryOCjFrooHMWk72xF%7C823caf1f78a9836d8921bb977cb81ae822c6020cd554a0658d77d5475a4b0303"; path="/wp-content/plugins"; domain="www.santostang.com"; path_spec; expires="2018-12-16 03:54:01Z"; httponly=None; version=0 5 | -------------------------------------------------------------------------------- /第二版/Cha 12 -服务器采集/tor1.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import socks 3 | import requests 4 | 5 | # Tor使用9150端口为默认的socks端口 6 | socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 7 | socket.socket = socks.socksocket 8 | # 获取这次抓取使用的IP地址 9 | a = requests.get("http://checkip.amazonaws.com").text 10 | 11 | print (a) -------------------------------------------------------------------------------- /第二版/Cha 12 -服务器采集/tor2.py: -------------------------------------------------------------------------------- 1 | from stem import Signal 2 | from stem.control import Controller 3 | import socket 4 | import socks 5 | import requests 6 | import time 7 | 8 | controller = Controller.from_port(port = 9151) 9 | controller.authenticate() 10 | socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 11 | socket.socket = socks.socksocket 12 | 13 | total_scrappy_time = 0 14 | total_changeIP_time = 0 15 | 16 | for 
x in range(0,10): 17 | a = requests.get("http://checkip.amazonaws.com").text 18 | print ("第", x+1, "次IP:", a) 19 | 20 | time1 = time.time() 21 | a = requests.get("http://www.santostang.com/").text 22 | #print (a) 23 | time2 = time.time() 24 | total_scrappy_time = total_scrappy_time + time2-time1 25 | print ("第", x+1, "次抓取花费时间:", time2-time1) 26 | 27 | time3 = time.time() 28 | controller.signal(Signal.NEWNYM) 29 | time.sleep(5) 30 | time4 = time.time() 31 | total_changeIP_time = total_changeIP_time + time4-time3-5 32 | print ("第", x+1, "次更换IP花费时间:", time4-time3-5) 33 | 34 | print ("平均抓取花费时间:", total_scrappy_time/10) 35 | print ("平均更换IP花费时间:", total_changeIP_time/10) 36 | -------------------------------------------------------------------------------- /第二版/Cha 12 -服务器采集/tor3.py: -------------------------------------------------------------------------------- 1 | from stem import Signal 2 | from stem.control import Controller 3 | import socket 4 | import socks 5 | import requests 6 | import time 7 | 8 | #controller = Controller.from_port(port = 9151) 9 | #controller.authenticate() 10 | #socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9150) 11 | #socket.socket = socks.socksocket 12 | 13 | total_scrappy_time = 0 14 | total_changeIP_time = 0 15 | 16 | for x in range(0,10): 17 | #a = requests.get("http://checkip.amazonaws.com").text 18 | #print ("第", x+1, "次IP:", a) 19 | 20 | time1 = time.time() 21 | a = requests.get("http://www.santostang.com/").text 22 | #print (a) 23 | time2 = time.time() 24 | total_scrappy_time = total_scrappy_time + time2-time1 25 | print ("第", x+1, "次抓取花费时间:", time2-time1) 26 | 27 | time3 = time.time() 28 | #controller.signal(Signal.NEWNYM) 29 | time.sleep(5) 30 | time4 = time.time() 31 | total_changeIP_time = total_changeIP_time + time4-time3-5 32 | print ("第", x+1, "次更换IP花费时间:", time4-time3-5) 33 | 34 | print ("平均抓取花费时间:", total_scrappy_time/10) 35 | print ("平均更换IP花费时间:", total_changeIP_time/10) -------------------------------------------------------------------------------- /第二版/Cha 13 -分布式爬虫/1497099934.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 13 -分布式爬虫/1497099934.jpg -------------------------------------------------------------------------------- /第二版/Cha 13 -分布式爬虫/Cha 13 - 分布式爬虫.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 12.3 Redis分布式爬虫实战" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 12.3.2 加入任务队列" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "def push_redis_list():\n", 26 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 27 | " print (r.keys('*'))\n", 28 | " \n", 29 | " link_list = []\n", 30 | " with open('alexa.txt', 'r') as file:\n", 31 | " file_list = file.readlines()\n", 32 | " for eachone in file_list:\n", 33 | " link = eachone.split('\\t')[1]\n", 34 | " link = link.replace('\\n','')\n", 35 | " link_list.append(link)\n", 36 | " if len(link_list) == 100:\n", 37 | " break\n", 38 | " \n", 39 | " for url in link_list:\n", 40 | " response = requests.get(url, headers=headers, timeout=20)\n", 41 | " soup = BeautifulSoup(response.text, 'lxml')\n", 42 | " img_list = 
soup.find_all('img')\n", 43 | " for img in img_list:\n", 44 | " img_url = img['src']\n", 45 | " if img_url != '':\n", 46 | " print (\"加入的图片url: \", img_url)\n", 47 | " r.lpush('img_url',img_url)\n", 48 | " print ('现在图片链接的个数为', r.llen('img_url'))\n", 49 | " return" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# 12.3.3 读取任务队列,下载图片" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "def get_img():\n", 68 | " r = Redis(host='137.189.204.65', port=6379 ,password='redisredis')\n", 69 | " while True:\n", 70 | " try:\n", 71 | " url = r.lpop('img_url')\n", 72 | " url = url.decode('ascii')\n", 73 | " try:\n", 74 | " response = requests.get(url, headers=headers,timeout = 20)\n", 75 | " name = int(time.time())\n", 76 | " f = open(str(name)+ url[-4:], 'wb')\n", 77 | " f.write(response.content)\n", 78 | " f.close()\n", 79 | " print ('已经获取图片', url)\n", 80 | " except Exception as e:\n", 81 | " print ('爬取图片过程出问题', e)\n", 82 | " time.sleep(3)\n", 83 | " except Exception as e:\n", 84 | " print (e)\n", 85 | " time.sleep(10)\n", 86 | " break\n", 87 | " return " 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# 12.3.4 分布式爬虫代码" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 13, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "import requests\n", 106 | "from bs4 import BeautifulSoup\n", 107 | "import re\n", 108 | "import time\n", 109 | "from redis import Redis\n", 110 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 111 | "\n", 112 | "def push_redis_list():\n", 113 | " #与上面此函数相同\n", 114 | "\n", 115 | "def get_img():\n", 116 | " #与上面此函数相同\n", 117 | "\n", 118 | "if __name__ == '__main__': \n", 119 | " this_machine = 'master' \n", 120 | " print ('开始分布式爬虫')\n", 121 | " if this_machine == 'master':\n", 122 | " push_redis_list()\n", 123 | " else:\n", 124 | " get_img()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 14, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "import requests\n", 136 | "from bs4 import BeautifulSoup\n", 137 | "import re\n", 138 | "import time\n", 139 | "from redis import Redis\n", 140 | "headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' }\n", 141 | "\n", 142 | "def push_redis_list():\n", 143 | " #与上面此函数相同\n", 144 | "\n", 145 | "def get_img():\n", 146 | " #与上面此函数相同\n", 147 | "\n", 148 | "if __name__ == '__main__': \n", 149 | " this_machine = 'slave' \n", 150 | " print ('开始分布式爬虫')\n", 151 | " if this_machine == 'master':\n", 152 | " push_redis_list()\n", 153 | " else:\n", 154 | " get_img()" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.6.0" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | 
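The master.py and slave.py scripts that follow both talk to the same Redis list named img_url, so when debugging it helps to peek at that list from a separate session. This is a minimal sketch, assuming redis-py is installed and that you substitute your own host, port and password (the server hard-coded in the book's examples is the author's test machine and will not be reachable):

```python
from redis import Redis

# Replace these connection details with your own Redis server.
r = Redis(host='127.0.0.1', port=6379, password='yourpassword')

print('img_url length:', r.llen('img_url'))           # how many image URLs are queued
print('next 5 entries:', r.lrange('img_url', 0, 4))   # peek without removing anything
# r.delete('img_url')                                  # uncomment to reset the queue between runs
```

Note that lpush on the master combined with lpop on the slave treats img_url as a stack (last in, first out); pairing lpush with rpop instead would make it a FIFO queue.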
-------------------------------------------------------------------------------- /第二版/Cha 13 -分布式爬虫/master.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import time 5 | from redis import Redis 6 | headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' } 7 | 8 | def push_redis_list(): 9 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 10 | print (r.keys('*')) 11 | 12 | link_list = [] 13 | with open('alexa.txt', 'r') as file: 14 | file_list = file.readlines() 15 | for eachone in file_list: 16 | link = eachone.split('\t')[1] 17 | link = link.replace('\n','') 18 | link_list.append(link) 19 | if len(link_list) == 100: 20 | break 21 | 22 | for url in link_list: 23 | response = requests.get(url, headers=headers, timeout=20) 24 | soup = BeautifulSoup(response.text, 'lxml') 25 | img_list = soup.find_all('img') 26 | for img in img_list: 27 | img_url = img['src'] 28 | if img_url != '': 29 | print ("加入的图片url: ", img_url) 30 | r.lpush('img_url',img_url) 31 | print ('现在图片链接的个数为', r.llen('img_url')) 32 | return 33 | 34 | def get_img(): 35 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 36 | while True: 37 | try: 38 | url = r.lpop('img_url') 39 | url = url.decode('ascii') 40 | try: 41 | response = requests.get(url, headers=headers,timeout = 20) 42 | name = int(time.time()) 43 | f = open(str(name)+ url[-4:], 'wb') 44 | f.write(response.content) 45 | f.close() 46 | print ('已经获取图片', url) 47 | except Exception as e: 48 | print ('爬取图片过程出问题', e) 49 | time.sleep(3) 50 | except Exception as e: 51 | print (e) 52 | time.sleep(10) 53 | break 54 | return 55 | 56 | if __name__ == '__main__': 57 | this_machine = 'master' 58 | print ('开始分布式爬虫') 59 | if this_machine == 'master': 60 | push_redis_list() 61 | else: 62 | get_img() -------------------------------------------------------------------------------- /第二版/Cha 13 -分布式爬虫/slave.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import time 5 | from redis import Redis 6 | headers={ 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' } 7 | 8 | def push_redis_list(): 9 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 10 | print (r.keys('*')) 11 | 12 | link_list = [] 13 | with open('alexa.txt', 'r') as file: 14 | file_list = file.readlines() 15 | for eachone in file_list: 16 | link = eachone.split('\t')[1] 17 | link = link.replace('\n','') 18 | link_list.append(link) 19 | if len(link_list) == 100: 20 | break 21 | 22 | for url in link_list: 23 | response = requests.get(url, headers=headers, timeout=20) 24 | soup = BeautifulSoup(response.text, 'lxml') 25 | img_list = soup.find_all('img') 26 | for img in img_list: 27 | img_url = img['src'] 28 | if img_url != '': 29 | print ("加入的图片url: ", img_url) 30 | r.lpush('img_url',img_url) 31 | print ('现在图片链接的个数为', r.llen('img_url')) 32 | return 33 | 34 | def get_img(): 35 | r = Redis(host='137.189.204.65', port=6379 ,password='redisredis') 36 | while True: 37 | try: 38 | url = r.lpop('img_url') 39 | url = url.decode('ascii') 40 | if url[:2] == '//': 41 | url = 'http:' + url 42 | print (url) 43 | try: 44 | response = requests.get(url, headers=headers,timeout = 20) 45 | name = int(time.time()) 46 | f = open(str(name)+ url[-4:], 
'wb') 47 | f.write(response.content) 48 | f.close() 49 | print ('已经获取图片', url) 50 | except Exception as e: 51 | print ('爬取图片过程出问题', e) 52 | time.sleep(3) 53 | except Exception as e: 54 | print (e) 55 | time.sleep(10) 56 | break 57 | return 58 | 59 | if __name__ == '__main__': 60 | this_machine = 'slave' 61 | print ('开始分布式爬虫') 62 | if this_machine == 'master': 63 | push_redis_list() 64 | else: 65 | get_img() -------------------------------------------------------------------------------- /第二版/Cha 16 -爬虫实战三:百度地图API/cities.txt: -------------------------------------------------------------------------------- 1 | 苏州市 3067 2 | 南京市 2547 3 | 无锡市 2324 4 | 常州市 1432 5 | 南通市 1201 6 | 徐州市 1057 7 | 盐城市 850 8 | 宿迁市 818 9 | 扬州市 609 10 | 镇江市 568 11 | 泰州市 558 12 | 淮安市 520 13 | 连云港市 427 14 | 杭州市 2495 15 | 温州市 2114 16 | 宁波市 1764 17 | 嘉兴市 869 18 | 金华市 776 19 | 台州市 661 20 | 绍兴市 632 21 | 湖州市 479 22 | 丽水市 379 23 | 衢州市 192 24 | 舟山市 131 25 | 广州市 4366 26 | 深圳市 4850 27 | 东莞市 2493 28 | 佛山市 2652 29 | 惠州市 1386 30 | 江门市 776 31 | 中山市 729 32 | 珠海市 478 33 | 汕头市 452 34 | 梅州市 397 35 | 湛江市 379 36 | 韶关市 367 37 | 河源市 350 38 | 肇庆市 342 39 | 清远市 303 40 | 茂名市 295 41 | 揭阳市 226 42 | 阳江市 189 43 | 汕尾市 177 44 | 潮州市 155 45 | 云浮市 87 46 | 福州市 2217 47 | 厦门市 1364 48 | 泉州市 1104 49 | 漳州市 621 50 | 莆田市 353 51 | 宁德市 312 52 | 龙岩市 304 53 | 南平市 253 54 | 三明市 245 55 | 青岛市 1750 56 | 济南市 976 57 | 潍坊市 841 58 | 烟台市 735 59 | 威海市 680 60 | 淄博市 528 61 | 临沂市 525 62 | 济宁市 447 63 | 日照市 401 64 | 菏泽市 361 65 | 聊城市 330 66 | 德州市 283 67 | 滨州市 268 68 | 枣庄市 264 69 | 泰安市 251 70 | 东营市 211 71 | 莱芜市 116 72 | 郑州市 2061 73 | 洛阳市 784 74 | 开封市 506 75 | 新乡市 447 76 | 南阳市 411 77 | 商丘市 337 78 | 焦作市 312 79 | 平顶山市 269 80 | 安阳市 263 81 | 周口市 230 82 | 三门峡市 215 83 | 信阳市 211 84 | 许昌市 194 85 | 驻马店市 183 86 | 濮阳市 134 87 | 漯河市 121 88 | 鹤壁市 103 89 | 济源市 30 90 | 石家庄市 1342 91 | 保定市 777 92 | 邢台市 633 93 | 秦皇岛市 630 94 | 唐山市 606 95 | 邯郸市 546 96 | 沧州市 366 97 | 廊坊市 357 98 | 张家口市 345 99 | 承德市 269 100 | 衡水市 171 101 | 成都市 3436 102 | 绵阳市 462 103 | 南充市 447 104 | 德阳市 315 105 | 内江市 270 106 | 遂宁市 242 107 | 宜宾市 202 108 | 乐山市 190 109 | 泸州市 182 110 | 眉山市 169 111 | 资阳市 167 112 | 达州市 154 113 | 自贡市 142 114 | 广元市 140 115 | 攀枝花市 128 116 | 广安市 113 117 | 凉山彝族自治州 98 118 | 巴中市 78 119 | 雅安市 78 120 | 阿坝藏族羌族自治州 58 121 | 甘孜藏族自治州 55 122 | 沈阳市 1410 123 | 大连市 1625 124 | 抚顺市 380 125 | 鞍山市 274 126 | 营口市 211 127 | 盘锦市 203 128 | 葫芦岛市 171 129 | 本溪市 159 130 | 锦州市 156 131 | 丹东市 153 132 | 阜新市 109 133 | 朝阳市 96 134 | 铁岭市 92 135 | 辽阳市 89 136 | 昆明市 1756 137 | 大理白族自治州 356 138 | 玉溪市 285 139 | 西双版纳傣族自治州 275 140 | 曲靖市 226 141 | 丽江市 164 142 | 红河哈尼族彝族自治州 142 143 | 普洱市 115 144 | 迪庆藏族自治州 110 145 | 楚雄彝族自治州 109 146 | 昭通市 106 147 | 保山市 102 148 | 临沧市 92 149 | 文山壮族苗族自治州 67 150 | 德宏傣族景颇族自治州 51 151 | 怒江傈僳族自治州 14 152 | 长沙市 1831 153 | 张家界市 463 154 | 郴州市 445 155 | 株洲市 421 156 | 常德市 411 157 | 衡阳市 350 158 | 岳阳市 247 159 | 邵阳市 219 160 | 永州市 196 161 | 娄底市 181 162 | 湘潭市 144 163 | 怀化市 138 164 | 益阳市 133 165 | 湘西土家族苗族自治州 70 166 | 武汉市 2856 167 | 黄冈市 922 168 | 宜昌市 395 169 | 荆州市 293 170 | 十堰市 283 171 | 荆门市 281 172 | 襄阳市 262 173 | 黄石市 236 174 | 孝感市 202 175 | 咸宁市 184 176 | 随州市 134 177 | 恩施土家族苗族自治州 117 178 | 神农架林区 78 179 | 仙桃市 57 180 | 鄂州市 50 181 | 潜江市 46 182 | 天门市 41 183 | 南昌市 858 184 | 赣州市 717 185 | 九江市 469 186 | 新余市 462 187 | 上饶市 365 188 | 萍乡市 301 189 | 宜春市 284 190 | 吉安市 262 191 | 抚州市 239 192 | 景德镇市 137 193 | 鹰潭市 78 194 | 合肥市 1349 195 | 安庆市 431 196 | 宿州市 323 197 | 阜阳市 312 198 | 芜湖市 302 199 | 六安市 283 200 | 滁州市 206 201 | 亳州市 195 202 | 淮南市 180 203 | 宣城市 170 204 | 马鞍山市 164 205 | 蚌埠市 146 206 | 黄山市 138 207 | 巢湖市 132 208 | 淮北市 113 209 | 池州市 83 210 | 铜陵市 77 211 | 太原市 811 212 | 
临汾市 345 213 | 长治市 294 214 | 运城市 289 215 | 晋中市 267 216 | 大同市 249 217 | 晋城市 189 218 | 吕梁市 166 219 | 忻州市 159 220 | 阳泉市 105 221 | 朔州市 79 222 | 南宁市 1102 223 | 桂林市 666 224 | 柳州市 588 225 | 玉林市 410 226 | 北海市 288 227 | 百色市 191 228 | 河池市 183 229 | 梧州市 150 230 | 贵港市 142 231 | 钦州市 91 232 | 防城港市 78 233 | 来宾市 72 234 | 贺州市 68 235 | 崇左市 46 236 | 西安市 3222 237 | 宝鸡市 469 238 | 渭南市 330 239 | 咸阳市 251 240 | 汉中市 183 241 | 榆林市 173 242 | 商洛市 156 243 | 安康市 125 244 | 延安市 95 245 | 铜川市 83 246 | 哈尔滨市 1228 247 | 齐齐哈尔市 178 248 | 牡丹江市 178 249 | 佳木斯市 169 250 | 大庆市 161 251 | 黑河市 137 252 | 伊春市 134 253 | 鹤岗市 97 254 | 绥化市 78 255 | 鸡西市 74 256 | 双鸭山市 56 257 | 七台河市 37 258 | 大兴安岭地区 17 259 | 呼和浩特市 518 260 | 包头市 435 261 | 鄂尔多斯市 392 262 | 赤峰市 203 263 | 呼伦贝尔市 168 264 | 乌海市 163 265 | 乌兰察布市 125 266 | 通辽市 122 267 | 巴彦淖尔市 117 268 | 阿拉善盟 101 269 | 兴安盟 101 270 | 锡林郭勒盟 82 271 | 贵阳市 1491 272 | 遵义市 384 273 | 毕节地区 202 274 | 黔南布依族苗族自治州 193 275 | 六盘水市 191 276 | 黔东南苗族侗族自治州 147 277 | 铜仁地区 141 278 | 黔西南布依族苗族自治州 106 279 | 安顺市 104 280 | 长春市 1404 281 | 延边朝鲜族自治州 431 282 | 吉林市 269 283 | 四平市 186 284 | 通化市 102 285 | 松原市 98 286 | 白山市 95 287 | 白城市 78 288 | 辽源市 44 289 | 兰州市 484 290 | 白银市 252 291 | 金昌市 181 292 | 平凉市 167 293 | 酒泉市 160 294 | 张掖市 128 295 | 武威市 128 296 | 定西市 111 297 | 天水市 98 298 | 临夏回族自治州 61 299 | 陇南市 56 300 | 庆阳市 48 301 | 甘南藏族自治州 42 302 | 嘉峪关市 22 303 | 乌鲁木齐市 520 304 | 昌吉回族自治州 165 305 | 伊犁哈萨克自治州 160 306 | 喀什地区 91 307 | 克拉玛依市 79 308 | 阿勒泰地区 75 309 | 塔城地区 64 310 | 阿克苏地区 58 311 | 巴音郭楞蒙古自治州 50 312 | 哈密地区 38 313 | 吐鲁番地区 25 314 | 石河子市 23 315 | 和田地区 22 316 | 博尔塔拉蒙古自治州 20 317 | 克孜勒苏柯尔克孜自治州 9 318 | 图木舒克市 6 319 | 五家渠市 5 320 | 阿拉尔市 3 321 | 海口市 430 322 | 三亚市 257 323 | 文昌市 49 324 | 儋州市 36 325 | 临高县 33 326 | 定安县 25 327 | 澄迈县 24 328 | 昌江黎族自治县 20 329 | 万宁市 19 330 | 东方市 14 331 | 琼海市 14 332 | 保亭黎族苗族自治县 12 333 | 陵水黎族自治县 12 334 | 乐东黎族自治县 11 335 | 琼中黎族苗族自治县 9 336 | 屯昌县 8 337 | 五指山市 5 338 | 白沙黎族自治县 3 339 | 银川市 673 340 | 固原市 75 341 | 石嘴山市 71 342 | 吴忠市 64 343 | 中卫市 34 344 | 西宁市 274 345 | 海西蒙古族藏族自治州 60 346 | 海南藏族自治州 35 347 | 海东地区 34 348 | 海北藏族自治州 16 349 | 黄南藏族自治州 10 350 | 玉树藏族自治州 7 351 | 果洛藏族自治州 5 352 | 拉萨市 56 353 | 阿里地区 19 354 | 林芝地区 15 355 | 日喀则地区 14 356 | 山南地区 5 357 | 昌都地区 4 358 | 那曲地区 2 359 | 北京市 5607 360 | 上海市 4027 361 | 重庆市 2753 362 | 天津市 1605 363 | 香港特别行政区 214 364 | 澳门特别行政区 28 365 | -------------------------------------------------------------------------------- /第二版/Cha 2 - 编写你的第一个网络爬虫/Cha 2 _章末实战.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 题目1:请使用Python中的循环,打印输出从1到100的所有奇数。" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "scrolled": true 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "1\n", 22 | "3\n", 23 | "5\n", 24 | "7\n", 25 | "9\n", 26 | "11\n", 27 | "13\n", 28 | "15\n", 29 | "17\n", 30 | "19\n", 31 | "21\n", 32 | "23\n", 33 | "25\n", 34 | "27\n", 35 | "29\n", 36 | "31\n", 37 | "33\n", 38 | "35\n", 39 | "37\n", 40 | "39\n", 41 | "41\n", 42 | "43\n", 43 | "45\n", 44 | "47\n", 45 | "49\n", 46 | "51\n", 47 | "53\n", 48 | "55\n", 49 | "57\n", 50 | "59\n", 51 | "61\n", 52 | "63\n", 53 | "65\n", 54 | "67\n", 55 | "69\n", 56 | "71\n", 57 | "73\n", 58 | "75\n", 59 | "77\n", 60 | "79\n", 61 | "81\n", 62 | "83\n", 63 | "85\n", 64 | "87\n", 65 | "89\n", 66 | "91\n", 67 | "93\n", 68 | "95\n", 69 | "97\n", 70 | "99\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "for i in range(1,101):\n", 76 | " if i % 2 == 1:\n", 77 | " 
print (i)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# 题目2:请将字符串 ”你好$$$我正在学Python@#@#现在需要&%&%&修改字符串” 中的符号变成一个空格,需要输出的格式为:”你好 我正在学Python 现在需要 修改字符串”" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "你好 我正在学Python 现在需要 修改字符串\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "str1 = '你好$$$我正在学Python@#@#现在需要&%&%&修改字符串'\n", 102 | "str2 = str1.replace('$$$', ' ').replace('@#@#', ' ').replace('&%&%&', ' ')\n", 103 | "print (str2)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 3, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "你好 我正在学Python 现在需要 修改字符串\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "import re\n", 121 | "str1 = '你好$$$我正在学Python@#@#现在需要&%&%&修改字符串'\n", 122 | "str2 = re.sub('[$@#&%]+', ' ' ,str1)\n", 123 | "print (str2)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# 题目3:输出 9*9 乘法口诀表" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 1, 136 | "metadata": { 137 | "ExecuteTime": { 138 | "end_time": "2019-01-04T12:18:17.953381Z", 139 | "start_time": "2019-01-04T12:18:17.938842Z" 140 | } 141 | }, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "1x1=1\t\n", 148 | "1x2=2\t2x2=4\t\n", 149 | "1x3=3\t2x3=6\t3x3=9\t\n", 150 | "1x4=4\t2x4=8\t3x4=12\t4x4=16\t\n", 151 | "1x5=5\t2x5=10\t3x5=15\t4x5=20\t5x5=25\t\n", 152 | "1x6=6\t2x6=12\t3x6=18\t4x6=24\t5x6=30\t6x6=36\t\n", 153 | "1x7=7\t2x7=14\t3x7=21\t4x7=28\t5x7=35\t6x7=42\t7x7=49\t\n", 154 | "1x8=8\t2x8=16\t3x8=24\t4x8=32\t5x8=40\t6x8=48\t7x8=56\t8x8=64\t\n", 155 | "1x9=9\t2x9=18\t3x9=27\t4x9=36\t5x9=45\t6x9=54\t7x9=63\t8x9=72\t9x9=81\t\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "for i in range(1, 10):\n", 161 | " for j in range(1, i+1):\n", 162 | " print (\"%dx%d=%d\\t\" % (j, i, i*j), end=\"\")\n", 163 | " print(\"\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "# 题目4:请写出一个函数,当输入函数变量当月利润I,能返回应发放奖金总数,例如输出“利润100000元时,应发放奖金总数为10000元。”。\n", 171 | "其中,企业发放的奖金根据利润提成。利润(I)低于或等于10万元时,奖金可提10%;利润高于10万元,低于20万元时,低于10万元的部分按10%提成,高于10万元的部分,可提成7.5%;20万到40万之间时,高于20万元的部分,可提成5%;40万到60万之间时高于40万元的部分,可提成3%;60万到100万之间时,高于60万元的部分,可提成1.5%,高于100万元时,超过100万元的部分按1%提成" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 3, 177 | "metadata": { 178 | "ExecuteTime": { 179 | "end_time": "2019-01-04T12:18:43.452862Z", 180 | "start_time": "2019-01-04T12:18:42.226135Z" 181 | } 182 | }, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "净利润:2100000\n", 189 | "利润为2100000元时,应发奖金总数为50500元\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "def calcute_profit(I):\n", 195 | " I = I / 10000\n", 196 | " if I <= 10:\n", 197 | " a = I * 0.1\n", 198 | " return a * 10000\n", 199 | " elif I <= 20 and I > 10:\n", 200 | " b =0.25 + I * 0.075\n", 201 | " return b * 10000\n", 202 | " elif I <= 40 and I > 20:\n", 203 | " c = 0.75 + I * 0.05\n", 204 | " return c * 10000\n", 205 | " elif I <= 60 and I > 40:\n", 206 | " d = 1.55 + I * 0.03\n", 207 | " return d * 10000\n", 208 | " elif I <= 100 and I > 60:\n", 209 | " e = 2.45 + I * 0.015\n", 210 | " return e * 10000\n", 211 | " else:\n", 212 | " f = 2.95 + I 
* 0.01\n", 213 | " return f * 10000\n", 214 | " \n", 215 | "I = int(input('净利润:'))\n", 216 | "profit = calcute_profit(I)\n", 217 | "print ('利润为%d元时,应发奖金总数为%d元' % (I, profit))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 4, 223 | "metadata": { 224 | "ExecuteTime": { 225 | "end_time": "2019-01-04T12:18:48.041910Z", 226 | "start_time": "2019-01-04T12:18:46.176321Z" 227 | } 228 | }, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "净利润:2100000\n", 235 | "利润为2100000元时,应发奖金总数为50500元\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "def calcute_profit(I):\n", 241 | " arr = [1000000,600000,400000,200000,100000,0] #这应该就是各个分界值了,把它们放在列表里方便访问\n", 242 | " rat = [0.01,0.015,0.03,0.05,0.075,0.1] #这是各个分界值所对应的奖金比例值\n", 243 | " r = 0 #这是总奖金的初始值\n", 244 | " for idx in range(0,6): #有6个分界值当然要循环6次\n", 245 | " if I > arr[idx]:\n", 246 | " r = r + (I - arr[idx]) * rat[idx] \n", 247 | " I = arr[idx]\n", 248 | " return r\n", 249 | "\n", 250 | "I = int(input('净利润:'))\n", 251 | "profit = calcute_profit(I)\n", 252 | "print ('利润为%d元时,应发奖金总数为%d元' % (I, profit))" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "# 题目5:用字典的值对字典进行排序" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 7, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | "[(0, 0), (2, 1), (1, 2), (4, 3), (3, 4)]\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "import operator\n", 277 | "x = {1: 2, 3: 4, 4:3, 2:1, 0:0}\n", 278 | "sorted_x = sorted(x.items(), key=operator.itemgetter(1))\n", 279 | "print (sorted_x)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "# 题目6:请问一下两段代码的输出分别是什么?" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 8, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "1\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "a = 1\n", 304 | "def fun(a):\n", 305 | " a = 2\n", 306 | "fun(a)\n", 307 | "print (a)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 9, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "[1]\n" 320 | ] 321 | } 322 | ], 323 | "source": [ 324 | "a = []\n", 325 | "def fun(a):\n", 326 | " a.append(1)\n", 327 | "fun(a)\n", 328 | "print (a)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "# 题目7: 请问以下两段代码的输出分别是什么?" 
336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 10, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "bbb\n", 348 | "aaa\n", 349 | "aaa\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "class Person:\n", 355 | " name=\"aaa\"\n", 356 | "\n", 357 | "p1=Person()\n", 358 | "p2=Person()\n", 359 | "p1.name=\"bbb\"\n", 360 | "print (p1.name)\n", 361 | "print (p2.name)\n", 362 | "print (Person.name)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 11, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "[1]\n", 375 | "[1]\n", 376 | "[1]\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "class Person:\n", 382 | " name=[]\n", 383 | "\n", 384 | "p1=Person()\n", 385 | "p2=Person()\n", 386 | "p1.name.append(1)\n", 387 | "print (p1.name)\n", 388 | "print (p2.name)\n", 389 | "print (Person.name)" 390 | ] 391 | } 392 | ], 393 | "metadata": { 394 | "kernelspec": { 395 | "display_name": "Python 3", 396 | "language": "python", 397 | "name": "python3" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 3 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython3", 409 | "version": "3.6.5" 410 | }, 411 | "toc": { 412 | "base_numbering": 1, 413 | "nav_menu": { 414 | "height": "153px", 415 | "width": "252px" 416 | }, 417 | "number_sections": true, 418 | "sideBar": true, 419 | "skip_h1_title": false, 420 | "title_cell": "Table of Contents", 421 | "title_sidebar": "Contents", 422 | "toc_cell": false, 423 | "toc_position": {}, 424 | "toc_section_display": "block", 425 | "toc_window_display": false 426 | } 427 | }, 428 | "nbformat": 4, 429 | "nbformat_minor": 2 430 | } 431 | -------------------------------------------------------------------------------- /第二版/Cha 2 - 编写你的第一个网络爬虫/title_test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 2 - 编写你的第一个网络爬虫/title_test.txt -------------------------------------------------------------------------------- /第二版/Cha 4 -动态网页抓取/Cha 4 _章末实战.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2019-01-04T13:01:25.237670Z", 9 | "start_time": "2019-01-04T13:01:12.178305Z" 10 | }, 11 | "scrolled": true 12 | }, 13 | "outputs": [ 14 | { 15 | "name": "stdout", 16 | "output_type": "stream", 17 | "text": [ 18 | "0 ¥288 【宫遇】17-KKmall楼上一房一厅--【Loft时代】 整套公寓 1室1卫1床\n", 19 | "0 ¥369 LADYMA |原宿 摩洛哥风格 福田CBD会展中心#家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 20 | "0 ¥439 小屋子和它的大陽台 整套公寓 单间1卫1床\n", 21 | "0 ¥318 【十二微邸】14J-KKmall楼上的城市微魔方 整套公寓 1室1卫1床\n", 22 | "0 ¥318 【十二微邸】32Q-KKmall楼上的天空微城堡 整套公寓 1室1卫1床\n", 23 | "0 ¥435 近市中心舒适温馨的GAO's HOME 整套公寓 1室1卫1床\n", 24 | "0 ¥266 【暖居·小小】 深圳福田区 迷你新居 北欧简约风 下梅林地铁A出口步行1500米左右 整套公寓 1室1卫1床\n", 25 | "0 ¥89 花叶家青年旅舍【世界之窗店】-粉色女生4人房(一张床位) 合住房间 1室2卫4床\n", 26 | "0 ¥218 推荐:香蜜湖温馨公寓(双地铁)ShenZhen FuTian 整套酒店式公寓 1室1卫1床\n", 27 | "0 ¥350 【3D空间】 超大落地窗|按摩浴缸|巨幕投影|城市夜景 罗湖#地王#京基#老街 整套公寓 单间1卫1床\n", 28 | "0 ¥172 Loire花房公寓 合住房间 1室1卫1床\n", 29 | "0 ¥580 #寒舍Room1#福田CBD下沙地铁口极简风格,无敌景观房,独享大浴缸,私人房间,拍摄请另咨询 独立房间 1室1卫1床\n", 30 | 
"0 ¥349 [Misa’s house] ‘想‘老街地铁口/kkmall万象城罗湖口岸/ins风商务房/直达香港 整套公寓 1室1卫2床\n", 31 | "0 ¥138 (世界之窗欢乐谷华侨城)超棒阳光房 独立房间 1室1卫1床\n", 32 | "0 ¥210 福田CBD会展中心莲花山福田口岸9号线孖岭地铁口温馨一房一厅 小屋 1室1卫1床\n", 33 | "0 ¥258 覔舍·A#紧邻南科大和深大西丽校区#塘朗地铁站#大窗户#24小时入住#一居室 整套公寓 1室1卫1床\n", 34 | "0 ¥450 福田 八卦岭 园岭地铁口 美食街旁 复式公寓整租--居心地设计民宿 整间阁楼 1室1卫1床\n", 35 | "0 ¥358 深圳福田中心区與香港口岸附近的五星小居 整套公寓 1室1卫1床\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "from selenium import webdriver\n", 41 | "import time\n", 42 | "\n", 43 | "driver = webdriver.Firefox(executable_path = r'C:\\Users\\santostang\\Desktop\\geckodriver.exe')\n", 44 | "#把上述地址改成你电脑中geckodriver.exe程序的地址\n", 45 | "#在虚拟浏览器中打开 Airbnb 页面\n", 46 | "driver.get(\"https://zh.airbnb.com/s/Shenzhen--China/homes\")\n", 47 | "\n", 48 | "#找到页面中所有的出租房\n", 49 | "rent_list = driver.find_elements_by_css_selector('div._gig1e7')\n", 50 | "\n", 51 | "#对于每一个出租房\n", 52 | "for eachhouse in rent_list:\n", 53 | " #找到评论数量\n", 54 | " try:\n", 55 | " comment = eachhouse.find_element_by_css_selector('span._1cy09umr')\n", 56 | " comment = comment.text\n", 57 | " except:\n", 58 | " comment = 0\n", 59 | " \n", 60 | " #找到价格\n", 61 | " price = eachhouse.find_element_by_css_selector('div._1yarz4r')\n", 62 | " price = price.text.replace(\"每晚\", \"\").replace(\"价格\", \"\").replace(\"\\n\", \"\")\n", 63 | " \n", 64 | " #找到名称\n", 65 | " name = eachhouse.find_element_by_css_selector('div._vbshb6')\n", 66 | " name = name.text\n", 67 | " \n", 68 | " #找到房屋类型,大小\n", 69 | " details = eachhouse.find_element_by_css_selector('span._14ksqu3j')\n", 70 | " details = details.text\n", 71 | " house_type = details.split(\" · \")[0]\n", 72 | " bed_number = details.split(\" · \")[1]\n", 73 | " print (comment, price, name, house_type, bed_number)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 19, 79 | "metadata": { 80 | "ExecuteTime": { 81 | "end_time": "2018-11-18T08:48:16.793188Z", 82 | "start_time": "2018-11-18T08:47:21.685569Z" 83 | } 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "180 ¥291 【宫遇】17-KKmall楼上一房一厅--【Loft时代】 整套公寓 1室1卫1床\n", 91 | "99 ¥215 推荐:香蜜湖温馨公寓(双地铁)ShenZhen FuTian 整套酒店式公寓 1室1卫1床\n", 92 | "87 ¥167 Loire花房公寓 合住房间 1室1卫1床\n", 93 | "82 ¥368 LADYMA |原宿 摩洛哥风格 福田CBD会展中心#家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 94 | "150 ¥319 【十二微邸】14J-KKmall楼上的城市微魔方 整套公寓 1室1卫1床\n", 95 | "153 ¥319 【十二微邸】32Q-KKmall楼上的天空微城堡 整套公寓 1室1卫1床\n", 96 | "69 ¥319 【猫薄荷】罗湖口岸|双地铁口|美食一街|巨幕投影|泡泡吊椅|Loft| 整间阁楼 1室1卫1床\n", 97 | "152 ¥437 近市中心舒适温馨的GAO's HOME 整套公寓 1室1卫1床\n", 98 | "153 ¥160 深圳北站 大床房 直达香港口岸/出差首选 温馨舒适北欧风公寓 独立房间 1室2卫1床\n", 99 | "68 ¥264 【暖居·小小】 深圳福田区 迷你新居 北欧简约风 下梅林地铁A出口步行1500米左右 整套公寓 1室1卫1床\n", 100 | "298 ¥444 小屋子和它的大陽台 整套公寓 单间1卫1床\n", 101 | "52 ¥583 无间-海上世界别墅#BLACK#地铁站边#超大天台#独立卫浴阳台#超大浴缸#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 102 | "20 ¥347 [Misa’s house] 老街地铁口/kkmall万象城罗湖口岸/ins风商务房/直达香港!北欧 整套公寓 1室1卫2床\n", 103 | "64 ¥236 福原居-Luck House 榻榻米高层观景套间-福田口岸 独立房间 3室2卫1床\n", 104 | "31 ¥298 樱空 日式loft公寓 近罗湖口岸国贸金光华万象城东门 好住好逛又好吃 整间阁楼 1室1卫1床\n", 105 | "135 ¥340 【白日梦蓝】美食街|loft|罗湖口岸|双地铁|巨幕影院 | 游泳池1-3人公寓 整套公寓 1室1卫1床\n", 106 | "48 ¥319 【双子座】ins少女风|罗湖口岸|东门老街|泡泡池|巨幕投影 整套公寓 单间1卫1床\n", 107 | "44 ¥319 【克洛偌斯】新房特惠上线 | 万象城 | kkmall | 老街美食中心 | 罗湖口岸 | 巨幕投影 整套公寓 1室1卫1床\n", 108 | "68 ¥333 LADYMA |念念 现代公寓福田CBD会展中心#私人家庭影院#CocoPark福田皇岗口岸岗厦地铁 整套公寓 1室1卫1床\n", 109 | "148 ¥701 【水泥盒子】の【白露】【有狗】【有共享空间】 独立房间 1室1卫1床\n", 110 | "36 ¥187 【粉粉少女心+网红打卡】少女系单人房-会展中心CBD皇岗口岸福田口岸购物公园福田高铁站 独立房间 1室0.5卫1床\n", 111 | "61 ¥333 【理想的家】福田皇岗口岸/地铁口/近会展中心华强北,大社区整租 整套公寓 单间1卫1床\n", 112 | "121 ¥278 【宫遇】9E-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 
整套公寓 1室1卫1床\n", 113 | "40 ¥326 【Yuri Dream Hut 】京基100KKmall楼上/大剧院地铁站/罗湖口岸ins风温馨公寓 整套公寓 单间1卫1床\n", 114 | "233 ¥500 燈塔·時光 Clean warm and cozy place 整套公寓 1室1卫1床\n", 115 | "294 ¥742 Cozy Studio near Mongkok 整间阁楼 单间1卫1床\n", 116 | "105 ¥423 #寒舍Room 2#福田CBD下沙地铁口极简风,看海景观房,独立卫浴,私人房间,拍摄请另咨询 独立房间 1室1卫1床\n", 117 | "70 ¥271 抢手房源:香蜜湖复式美寓(双地铁)FuTian 整间阁楼 1室1卫1床\n", 118 | "70 ¥215 【白日梦蓝】罗湖口岸|巨幕投影|双地铁美食街上的白色幻想空间 整套公寓 1室1卫1床\n", 119 | "75 ¥451 福田 八卦岭 园岭地铁口 美食街旁 复式公寓整租--居心地设计民宿 整间阁楼 1室1卫2床\n", 120 | "194 ¥291 【宫遇】06-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 整套公寓 1室1卫1床\n", 121 | "73 ¥319 福田CBD岗夏地铁旁会展中心现代公寓 整套公寓 1室1卫1床\n", 122 | "86 ¥395 大梅沙180度海景工业风大床房,海滩旁,近东部华侨城 整套公寓 1室1卫1床\n", 123 | "104 ¥389 New & cosy modern 1BR 4pax 1min walk MTR 整套公寓 1室1卫2床\n", 124 | "49 ¥250 【艺术之家】福田中心区石厦地铁站时尚的画家居室 The painter's room 整套公寓 1室1卫1床\n", 125 | "38 ¥291 【宫遇】19-KKmall楼上一房一厅--【家庭影院】 整套公寓 1室1卫1床\n", 126 | "148 ¥291 【宫遇】31-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 整套公寓 1室1卫1床\n", 127 | "21 ¥291 【Ebnb】佛系日式网红一居!近会展中心购物公园近地铁福田皇岗口岸 整套公寓 1室1卫1床\n", 128 | "38 ¥326 机场地铁直达固戍站,Ins大空间品牌公寓 整套酒店式公寓 单间1卫1床\n", 129 | "32 ¥215 水贝珠宝园7号线地铁出口高端居家公寓 整套酒店式公寓 1室1卫1床\n", 130 | "18 ¥347 |卷儿· room1| 北欧现代混搭风 直达香港/罗湖口岸&火车站/老街双地铁口/万象城kkmall 整套公寓 1室1卫1床\n", 131 | "16 ¥347 [Studio Q-Air]深圳罗湖商圈/东门老街/地铁口/万象城/商务套房/直达香港/罗湖口岸 整套公寓 1室1卫1床\n", 132 | "37 ¥222 [Ein] 出差居家温馨大床房/近深圳东/东门/深圳北 整套公寓 1室1卫1床\n", 133 | "92 ¥319 【十二微邸】22Q-kkmall楼上的梦唤微天使 整套酒店式公寓 1室1卫1床\n", 134 | "44 ¥354 LADYMA |遇见 北欧混搭风 福田CBD会展中心」家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 135 | "75 ¥257 复式:福田区香蜜小筑(双地铁)ShenZhen FuTian 整间阁楼 1室1卫1床\n", 136 | "230 ¥201 SmileHouse超讚房東_陽光大床房115米平方~香港口岸5分鐘、石厦站2分鐘,如朋友热情招待 独立房间 1室1卫1床\n", 137 | "138 ¥333 (初见)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城大剧院D出口 整套公寓 1室1卫1床\n", 138 | "43 ¥215 溜达家法式风格 浴缸房单房整租 3号线地铁口 龙岗中心城大运中心万科广场深圳东 整套公寓 单间1卫1床\n", 139 | "20 ¥201 【NiteNite奈奈】7号地铁口 水贝独立阳光创意公寓 1min Sta.stylish room 整套公寓 单间1卫1床\n", 140 | "108 ¥347 【十二微邸】28H-KKmall楼上的心所微山水 整套公寓 1室1卫1床\n", 141 | "21 ¥326 深圳福田中心区與香港口岸附近的五星小居 整套公寓 1室1卫1床\n", 142 | "103 ¥284 「木夕」地铁口/九方购物中心楼上/深圳北站 /直达福田口岸 全新日式风公寓 整套公寓 1室1卫1床\n", 143 | "112 ¥319 (设计师Room)一分钟到地铁站/近世界之窗/万象天地/科技园北欧精致套房 整套公寓 1室1卫1床\n", 144 | "113 ¥291 【宫遇】18-KKmall楼上一房一厅--【黑胶之夜】 整套公寓 1室1卫1床\n", 145 | "53 ¥382 无间-海上世界别墅#BLUE#地铁站边#超大天台#独立卫浴阳台#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 146 | "54 ¥423 猫筑Aircat#岗厦/会展中心福田中心粉色北欧风配大投影高层公寓 整套公寓 1室1卫1床\n", 147 | "23 ¥215 【YOME空间】/ROOM1.木槿/九方购物中心/深圳北站,地铁直达福田口岸 独立房间 1室1卫1床\n", 148 | "155 ¥326 (云意)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城KKMALL大剧院D出口 整套公寓 1室1卫1床\n", 149 | "122 ¥229 【植意间·梦境D1】主卧带独立洗手间,1.8米大床,深圳北站,高铁19分钟直达香港,走路5分钟到地铁 独立房间 1室1卫1床\n", 150 | "72 ¥423 猫筑AirCat#高新园万象天地北欧风智能公寓配大投影及开放式厨房 整套公寓 1室1卫1床\n", 151 | "90 ¥278 福田中心温馨小家Q大街 整套公寓 1室1卫1床\n", 152 | "33 ¥430 【Sen’sHome】福田中心区/FuTianCBD/双地铁/阳光公寓/购物公园/会展中心/潮人首选 整套公寓 1室1卫1床\n", 153 | "110 ¥180 可拍照~ 小红书网红摆拍「Summerの民宿」西丽•366大街【有猫】 独立房间 1室1卫1床\n", 154 | "145 ¥333 【胖鸟公舍】南山、桃园双地铁口公寓!让你心安、庸懒、自在!倦鸟变胖鸟! 
整套公寓 1室1卫1床\n", 155 | "8 ¥319 【伊利昂】新房特惠上线 | 万象城 | kkmall | 老街美食中心 | 罗湖口岸 | 巨幕投影 整套公寓 1室1卫1床\n", 156 | "41 ¥291 Futian CBD nice apartment 福田CBD中心区双地铁口精致公寓 整套公寓 单间1卫1床\n", 157 | "98 ¥347 New【云邸】设计师风格现代高层公寓,靠近罗湖口岸万象城kkmall地王,去香港方便。 整套公寓 1室1卫1床\n", 158 | "111 ¥201 近机场北欧风格超大主卧(独立卫生间) 独立房间 1室2卫1床\n", 159 | "37 ¥215 【新故 深圳】双地铁口|美食街|巨幕投影|北欧空间|1-3人公寓 整套公寓 1室1卫2床\n", 160 | "100 ¥326 (流年)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城KKMALL大剧院D出口 整套公寓 1室1卫1床\n", 161 | "60 ¥361 明式新中式风格现代公寓 整套公寓 1室1卫1床\n", 162 | "94 ¥236 【10】★ ★ ★官方推荐★ ★ ★地铁站旁独立Loft套房,简单纯粹,清新方便,温馨浪漫 整间阁楼 1室1卫1床\n", 163 | "59 ¥451 猫筑AirCat#高新园万象天地粉色配大投影北欧风公寓 整套公寓 1室1卫1床\n", 164 | "37 ¥250 【粉粉少女心+网红打卡】大床房-会展中心CBD皇岗口岸福田口岸福田购物公园福田高铁站 独立房间 1室1卫1床\n", 165 | "66 ¥298 #栖息地•CityNest#罗湖东门老街/地铁口/简约北欧一居【simple n romantic】 整套房子 1室1卫1床\n", 166 | "57 ¥347 无间-海上世界别墅#PINK#地铁站边#超大天台#独立卫浴阳台#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 167 | "29 ¥305 AA购物公园旁石厦地铁口的私人工作室 整套公寓 1室1卫1床\n", 168 | "67 ¥368 Muji风格园景公寓,CBD中心区 整套公寓 1室1卫1床\n", 169 | "36 ¥215 会展中心新洲石厦沙尾北欧清新风格清爽小屋 整套公寓 1室1卫1床\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "from selenium import webdriver\n", 175 | "import time\n", 176 | "\n", 177 | "driver = webdriver.Firefox(executable_path = r'C:\\Users\\santostang\\Desktop\\geckodriver.exe')\n", 178 | "#把上述地址改成你电脑中geckodriver.exe程序的地址\n", 179 | "for i in range(0,5):\n", 180 | " link = \"https://zh.airbnb.com/s/Shenzhen--China/homes?items_offset=\" + str(i *18)\n", 181 | " driver.get(link)\n", 182 | " rent_list = driver.find_elements_by_css_selector('div._gig1e7')\n", 183 | "\n", 184 | " for eachhouse in rent_list:\n", 185 | " try:\n", 186 | " comment = eachhouse.find_element_by_css_selector('span._1cy09umr').text\n", 187 | " except:\n", 188 | " comment = 0\n", 189 | " price = eachhouse.find_element_by_css_selector('div._1yarz4r')\n", 190 | " price = price.text.replace(\"每晚\", \"\").replace(\"价格\", \"\").replace(\"\\n\", \"\")\n", 191 | " name = eachhouse.find_element_by_css_selector('div._vbshb6')\n", 192 | " name = name.text\n", 193 | " details = eachhouse.find_element_by_css_selector('span._14ksqu3j')\n", 194 | " details = details.text\n", 195 | " house_type = details.split(\" · \")[0]\n", 196 | " bed_number = details.split(\" · \")[1]\n", 197 | " print (comment, price, name, house_type, bed_number)\n", 198 | " time.sleep(5)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.6.5" 226 | }, 227 | "toc": { 228 | "base_numbering": 1, 229 | "nav_menu": { 230 | "height": "12px", 231 | "width": "252px" 232 | }, 233 | "number_sections": true, 234 | "sideBar": true, 235 | "skip_h1_title": false, 236 | "title_cell": "Table of Contents", 237 | "title_sidebar": "Contents", 238 | "toc_cell": false, 239 | "toc_position": {}, 240 | "toc_section_display": "block", 241 | "toc_window_display": false 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 2 246 | } 247 | -------------------------------------------------------------------------------- /第二版/Cha 4 -动态网页抓取/Cha 4 _自我实践(章末).ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2018-11-18T08:53:29.428181Z", 9 | "start_time": "2018-11-18T08:53:15.439607Z" 10 | } 11 | }, 12 | "outputs": [ 13 | { 14 | "name": "stdout", 15 | "output_type": "stream", 16 | "text": [ 17 | "180 ¥288 【宫遇】17-KKmall楼上一房一厅--【Loft时代】 整套公寓 1室1卫1床\n", 18 | "99 ¥218 推荐:香蜜湖温馨公寓(双地铁)ShenZhen FuTian 整套酒店式公寓 1室1卫1床\n", 19 | "87 ¥167 Loire花房公寓 合住房间 1室1卫1床\n", 20 | "82 ¥371 LADYMA |原宿 摩洛哥风格 福田CBD会展中心#家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 21 | "150 ¥318 【十二微邸】14J-KKmall楼上的城市微魔方 整套公寓 1室1卫1床\n", 22 | "153 ¥318 【十二微邸】32Q-KKmall楼上的天空微城堡 整套公寓 1室1卫1床\n", 23 | "69 ¥318 【猫薄荷】罗湖口岸|双地铁口|美食一街|巨幕投影|泡泡吊椅|Loft| 整间阁楼 1室1卫1床\n", 24 | "152 ¥435 近市中心舒适温馨的GAO's HOME 整套公寓 1室1卫1床\n", 25 | "153 ¥162 深圳北站 大床房 直达香港口岸/出差首选 温馨舒适北欧风公寓 独立房间 1室2卫1床\n", 26 | "298 ¥443 小屋子和它的大陽台 整套公寓 单间1卫1床\n", 27 | "68 ¥266 【暖居·小小】 深圳福田区 迷你新居 北欧简约风 下梅林地铁A出口步行1500米左右 整套公寓 1室1卫1床\n", 28 | "52 ¥580 无间-海上世界别墅#BLACK#地铁站边#超大天台#独立卫浴阳台#超大浴缸#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 29 | "20 ¥349 [Misa’s house] 老街地铁口/kkmall万象城罗湖口岸/ins风商务房/直达香港!北欧 整套公寓 1室1卫2床\n", 30 | "64 ¥239 福原居-Luck House 榻榻米高层观景套间-福田口岸 独立房间 3室2卫1床\n", 31 | "31 ¥298 樱空 日式loft公寓 近罗湖口岸国贸金光华万象城东门 好住好逛又好吃 整间阁楼 1室1卫1床\n", 32 | "135 ¥338 【白日梦蓝】美食街|loft|罗湖口岸|双地铁|巨幕影院 | 游泳池1-3人公寓 整套公寓 1室1卫1床\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "from selenium import webdriver\n", 38 | "\n", 39 | "fp = webdriver.FirefoxProfile()\n", 40 | "fp.set_preference(\"permissions.default.stylesheet\",2)\n", 41 | "fp.set_preference(\"permissions.default.image\",2)\n", 42 | "fp.set_preference(\"javascript.enabled\", False)\n", 43 | "\n", 44 | "driver = webdriver.Firefox(firefox_profile=fp, executable_path = r'C:\\Users\\santostang\\Desktop\\geckodriver.exe')\n", 45 | "#把上述地址改成你电脑中geckodriver.exe程序的地址\n", 46 | "#在虚拟浏览器中打开 Airbnb 页面\n", 47 | "driver.get(\"https://zh.airbnb.com/s/Shenzhen--China/homes\")\n", 48 | "\n", 49 | "#找到页面中所有的出租房\n", 50 | "rent_list = driver.find_elements_by_css_selector('div._gig1e7')\n", 51 | "\n", 52 | "#对于每一个出租房\n", 53 | "for eachhouse in rent_list:\n", 54 | " #找到评论数量\n", 55 | " try:\n", 56 | " comment = eachhouse.find_element_by_css_selector('span._1cy09umr')\n", 57 | " comment = comment.text\n", 58 | " except:\n", 59 | " comment = 0\n", 60 | " \n", 61 | " #找到价格\n", 62 | " price = eachhouse.find_element_by_css_selector('div._1yarz4r')\n", 63 | " price = price.text.replace(\"每晚\", \"\").replace(\"价格\", \"\").replace(\"\\n\", \"\")\n", 64 | " \n", 65 | " #找到名称\n", 66 | " name = eachhouse.find_element_by_css_selector('div._vbshb6')\n", 67 | " name = name.text\n", 68 | " \n", 69 | " #找到房屋类型,大小\n", 70 | " details = eachhouse.find_element_by_css_selector('span._14ksqu3j')\n", 71 | " details = details.text\n", 72 | " house_type = details.split(\" · \")[0]\n", 73 | " bed_number = details.split(\" · \")[1]\n", 74 | " print (comment, price, name, house_type, bed_number)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": { 81 | "ExecuteTime": { 82 | "end_time": "2018-11-18T09:09:30.981400Z", 83 | "start_time": "2018-11-18T09:08:36.398046Z" 84 | } 85 | }, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "180 $42 【宫遇】17-KKmall楼上一房一厅--【Loft时代】 整套公寓 1室1卫1床\n", 92 | "99 $31 推荐:香蜜湖温馨公寓(双地铁)ShenZhen FuTian 整套酒店式公寓 1室1卫1床\n", 93 | "87 $24 Loire花房公寓 合住房间 1室1卫1床\n", 94 | "82 $53 LADYMA |原宿 摩洛哥风格 
福田CBD会展中心#家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 95 | "150 $46 【十二微邸】14J-KKmall楼上的城市微魔方 整套公寓 1室1卫1床\n", 96 | "153 $46 【十二微邸】32Q-KKmall楼上的天空微城堡 整套公寓 1室1卫1床\n", 97 | "69 $46 【猫薄荷】罗湖口岸|双地铁口|美食一街|巨幕投影|泡泡吊椅|Loft| 整间阁楼 1室1卫1床\n", 98 | "152 $63 近市中心舒适温馨的GAO's HOME 整套公寓 1室1卫1床\n", 99 | "153 $23 深圳北站 大床房 直达香港口岸/出差首选 温馨舒适北欧风公寓 独立房间 1室2卫1床\n", 100 | "68 $38 【暖居·小小】 深圳福田区 迷你新居 北欧简约风 下梅林地铁A出口步行1500米左右 整套公寓 1室1卫1床\n", 101 | "298 $64 小屋子和它的大陽台 整套公寓 单间1卫1床\n", 102 | "52 $84 无间-海上世界别墅#BLACK#地铁站边#超大天台#独立卫浴阳台#超大浴缸#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 103 | "20 $50 [Misa’s house] 老街地铁口/kkmall万象城罗湖口岸/ins风商务房/直达香港!北欧 整套公寓 1室1卫2床\n", 104 | "64 $34 福原居-Luck House 榻榻米高层观景套间-福田口岸 独立房间 3室2卫1床\n", 105 | "31 $43 樱空 日式loft公寓 近罗湖口岸国贸金光华万象城东门 好住好逛又好吃 整间阁楼 1室1卫1床\n", 106 | "135 $49 【白日梦蓝】美食街|loft|罗湖口岸|双地铁|巨幕影院 | 游泳池1-3人公寓 整套公寓 1室1卫1床\n", 107 | "48 ¥322 【双子座】ins少女风|罗湖口岸|东门老街|泡泡池|巨幕投影 整套公寓 单间1卫1床\n", 108 | "44 ¥318 【克洛偌斯】新房特惠上线 | 万象城 | kkmall | 老街美食中心 | 罗湖口岸 | 巨幕投影 整套公寓 1室1卫1床\n", 109 | "68 ¥331 LADYMA |念念 现代公寓福田CBD会展中心#私人家庭影院#CocoPark福田皇岗口岸岗厦地铁 整套公寓 1室1卫1床\n", 110 | "148 ¥698 【水泥盒子】の【白露】【有狗】【有共享空间】 独立房间 1室1卫1床\n", 111 | "36 ¥188 【粉粉少女心+网红打卡】少女系单人房-会展中心CBD皇岗口岸福田口岸购物公园福田高铁站 独立房间 1室0.5卫1床\n", 112 | "61 ¥336 【理想的家】福田皇岗口岸/地铁口/近会展中心华强北,大社区整租 整套公寓 单间1卫1床\n", 113 | "121 ¥280 【宫遇】9E-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 整套公寓 1室1卫1床\n", 114 | "40 ¥328 【Yuri Dream Hut 】京基100KKmall楼上/大剧院地铁站/罗湖口岸ins风温馨公寓 整套公寓 单间1卫1床\n", 115 | "233 ¥498 燈塔·時光 Clean warm and cozy place 整套公寓 1室1卫1床\n", 116 | "294 ¥744 Cozy Studio near Mongkok 整间阁楼 单间1卫1床\n", 117 | "105 ¥420 #寒舍Room 2#福田CBD下沙地铁口极简风,看海景观房,独立卫浴,私人房间,拍摄请另咨询 独立房间 1室1卫1床\n", 118 | "70 ¥268 抢手房源:香蜜湖复式美寓(双地铁)FuTian 整间阁楼 1室1卫1床\n", 119 | "70 ¥214 【白日梦蓝】罗湖口岸|巨幕投影|双地铁美食街上的白色幻想空间 整套公寓 1室1卫1床\n", 120 | "75 ¥449 福田 八卦岭 园岭地铁口 美食街旁 复式公寓整租--居心地设计民宿 整间阁楼 1室1卫2床\n", 121 | "73 ¥320 福田CBD岗夏地铁旁会展中心现代公寓 整套公寓 1室1卫1床\n", 122 | "148 ¥288 【宫遇】31-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 整套公寓 1室1卫1床\n", 123 | "86 ¥398 大梅沙180度海景工业风大床房,海滩旁,近东部华侨城 整套公寓 1室1卫1床\n", 124 | "104 ¥390 New & cosy modern 1BR 4pax 1min walk MTR 整套公寓 1室1卫2床\n", 125 | "49 ¥250 【艺术之家】福田中心区石厦地铁站时尚的画家居室 The painter's room 整套公寓 1室1卫1床\n", 126 | "38 ¥288 【宫遇】19-KKmall楼上一房一厅--【家庭影院】 整套公寓 1室1卫1床\n", 127 | "194 ¥288 【宫遇】06-KKmall-小米智能家居体验-小爱同学-软床垫-京基100/万象城/地王/荔枝公园 整套公寓 1室1卫1床\n", 128 | "21 ¥288 【Ebnb】佛系日式网红一居!近会展中心购物公园近地铁福田皇岗口岸 整套公寓 1室1卫1床\n", 129 | "38 ¥328 机场地铁直达固戍站,Ins大空间品牌公寓 整套酒店式公寓 单间1卫1床\n", 130 | "32 ¥218 水贝珠宝园7号线地铁出口高端居家公寓 整套酒店式公寓 1室1卫1床\n", 131 | "18 ¥347 |卷儿· room1| 北欧现代混搭风 直达香港/罗湖口岸&火车站/老街双地铁口/万象城kkmall 整套公寓 1室1卫1床\n", 132 | "16 ¥350 [Studio Q-Air]深圳罗湖商圈/东门老街/地铁口/万象城/商务套房/直达香港/罗湖口岸 整套公寓 1室1卫1床\n", 133 | "37 ¥220 [Ein] 出差居家温馨大床房/近深圳东/东门/深圳北 整套公寓 1室1卫1床\n", 134 | "108 ¥348 【十二微邸】28H-KKmall楼上的心所微山水 整套公寓 1室1卫1床\n", 135 | "44 ¥351 LADYMA |遇见 北欧混搭风 福田CBD会展中心」家庭影院CocoPark福田皇岗口岸岗厦地铁口 整套公寓 1室1卫1床\n", 136 | "75 ¥258 复式:福田区香蜜小筑(双地铁)ShenZhen FuTian 整间阁楼 1室1卫1床\n", 137 | "230 ¥199 SmileHouse超讚房東_陽光大床房115米平方~香港口岸5分鐘、石厦站2分鐘,如朋友热情招待 独立房间 1室1卫1床\n", 138 | "138 ¥336 (初见)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城大剧院D出口 整套公寓 1室1卫1床\n", 139 | "43 ¥218 溜达家法式风格 浴缸房单房整租 3号线地铁口 龙岗中心城大运中心万科广场深圳东 整套公寓 单间1卫1床\n", 140 | "20 ¥198 【NiteNite奈奈】7号地铁口 水贝独立阳光创意公寓 1min Sta.stylish room 整套公寓 单间1卫1床\n", 141 | "92 ¥318 【十二微邸】22Q-kkmall楼上的梦唤微天使 整套酒店式公寓 1室1卫1床\n", 142 | "21 ¥328 深圳福田中心区與香港口岸附近的五星小居 整套公寓 1室1卫1床\n", 143 | "103 ¥282 「木夕」地铁口/九方购物中心楼上/深圳北站 /直达福田口岸 全新日式风公寓 整套公寓 1室1卫1床\n", 144 | "112 ¥318 (设计师Room)一分钟到地铁站/近世界之窗/万象天地/科技园北欧精致套房 整套公寓 1室1卫1床\n", 145 | "113 ¥288 【宫遇】18-KKmall楼上一房一厅--【黑胶之夜】 整套公寓 1室1卫1床\n", 146 | "23 ¥218 【YOME空间】/ROOM1.木槿/九方购物中心/深圳北站,地铁直达福田口岸 独立房间 1室1卫1床\n", 147 | "53 ¥380 
无间-海上世界别墅#BLUE#地铁站边#超大天台#独立卫浴阳台#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 148 | "54 ¥420 猫筑Aircat#岗厦/会展中心福田中心粉色北欧风配大投影高层公寓 整套公寓 1室1卫1床\n", 149 | "155 ¥328 (云意)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城KKMALL大剧院D出口 整套公寓 1室1卫1床\n", 150 | "122 ¥228 【植意间·梦境D1】主卧带独立洗手间,1.8米大床,深圳北站,高铁19分钟直达香港,走路5分钟到地铁 独立房间 1室1卫1床\n", 151 | "72 ¥420 猫筑AirCat#高新园万象天地北欧风智能公寓配大投影及开放式厨房 整套公寓 1室1卫1床\n", 152 | "90 ¥275 福田中心温馨小家Q大街 整套公寓 1室1卫1床\n", 153 | "110 ¥178 可拍照~ 小红书网红摆拍「Summerの民宿」西丽•366大街【有猫】 独立房间 1室1卫1床\n", 154 | "33 ¥429 【Sen’sHome】福田中心区/FuTianCBD/双地铁/阳光公寓/购物公园/会展中心/潮人首选 整套公寓 1室1卫1床\n", 155 | "111 ¥199 近机场北欧风格超大主卧(独立卫生间) 独立房间 1室2卫1床\n", 156 | "8 ¥318 【伊利昂】新房特惠上线 | 万象城 | kkmall | 老街美食中心 | 罗湖口岸 | 巨幕投影 整套公寓 1室1卫1床\n", 157 | "41 ¥289 Futian CBD nice apartment 福田CBD中心区双地铁口精致公寓 整套公寓 单间1卫1床\n", 158 | "98 ¥350 New【云邸】设计师风格现代高层公寓,靠近罗湖口岸万象城kkmall地王,去香港方便。 整套公寓 1室1卫1床\n", 159 | "145 ¥330 【胖鸟公舍】南山、桃园双地铁口公寓!让你心安、庸懒、自在!倦鸟变胖鸟! 整套公寓 1室1卫1床\n", 160 | "100 ¥328 (流年)罗湖阅见·生活美学公寓城景大床房近地王大厦万象城KKMALL大剧院D出口 整套公寓 1室1卫1床\n", 161 | "60 ¥359 明式新中式风格现代公寓 整套公寓 1室1卫1床\n", 162 | "37 ¥218 【新故 深圳】双地铁口|美食街|巨幕投影|北欧空间|1-3人公寓 整套公寓 1室1卫2床\n", 163 | "94 ¥233 【10】★ ★ ★官方推荐★ ★ ★地铁站旁独立Loft套房,简单纯粹,清新方便,温馨浪漫 整间阁楼 1室1卫1床\n", 164 | "59 ¥450 猫筑AirCat#高新园万象天地粉色配大投影北欧风公寓 整套公寓 1室1卫1床\n", 165 | "37 ¥248 【粉粉少女心+网红打卡】大床房-会展中心CBD皇岗口岸福田口岸福田购物公园福田高铁站 独立房间 1室1卫1床\n", 166 | "66 ¥299 #栖息地•CityNest#罗湖东门老街/地铁口/简约北欧一居【simple n romantic】 整套房子 1室1卫1床\n", 167 | "57 ¥348 无间-海上世界别墅#PINK#地铁站边#超大天台#独立卫浴阳台#蛇口港#近深圳湾口岸 独立房间 1室1卫1床\n", 168 | "29 ¥308 AA购物公园旁石厦地铁口的私人工作室 整套公寓 1室1卫1床\n", 169 | "67 ¥368 Muji风格园景公寓,CBD中心区 整套公寓 1室1卫1床\n", 170 | "22 ¥318 #限时特价# 福田中心整租北欧风公寓 双口岸房源 近会展中心 交通便利 整套公寓 1室1卫1床\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "from selenium import webdriver\n", 176 | "import time\n", 177 | "\n", 178 | "fp = webdriver.FirefoxProfile()\n", 179 | "fp.set_preference(\"permissions.default.stylesheet\",2)\n", 180 | "fp.set_preference(\"permissions.default.image\",2)\n", 181 | "fp.set_preference(\"javascript.enabled\", False)\n", 182 | "\n", 183 | "driver = webdriver.Firefox(firefox_profile=fp, executable_path = r'C:\\Users\\santostang\\Desktop\\geckodriver.exe')\n", 184 | "#把上述地址改成你电脑中geckodriver.exe程序的地址\n", 185 | "for i in range(0,5):\n", 186 | " link = \"https://zh.airbnb.com/s/Shenzhen--China/homes?items_offset=\" + str(i *18)\n", 187 | " #在虚拟浏览器中打开 Airbnb 页面\n", 188 | " driver.get(link)\n", 189 | "\n", 190 | " #找到页面中所有的出租房\n", 191 | " rent_list = driver.find_elements_by_css_selector('div._gig1e7')\n", 192 | "\n", 193 | " #对于每一个出租房\n", 194 | " for eachhouse in rent_list:\n", 195 | " #找到评论数量\n", 196 | " try:\n", 197 | " comment = eachhouse.find_element_by_css_selector('span._1cy09umr')\n", 198 | " comment = comment.text\n", 199 | " except:\n", 200 | " comment = 0\n", 201 | "\n", 202 | " #找到价格\n", 203 | " price = eachhouse.find_element_by_css_selector('div._1yarz4r')\n", 204 | " price = price.text.replace(\"每晚\", \"\").replace(\"价格\", \"\").replace(\"\\n\", \"\")\n", 205 | " #找到名称\n", 206 | " name = eachhouse.find_element_by_css_selector('div._vbshb6')\n", 207 | " name = name.text\n", 208 | "\n", 209 | " #找到房屋类型,大小\n", 210 | " details = eachhouse.find_element_by_css_selector('span._14ksqu3j')\n", 211 | " details = details.text\n", 212 | " house_type = details.split(\" · \")[0]\n", 213 | " bed_number = details.split(\" · \")[1]\n", 214 | " print (comment, price, name, house_type, bed_number)\n", 215 | " time.sleep(5)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [] 224 | } 225 | ], 226 | "metadata": { 227 | 
"kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 3 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython3", 242 | "version": "3.6.5" 243 | }, 244 | "toc": { 245 | "base_numbering": 1, 246 | "nav_menu": { 247 | "height": "12px", 248 | "width": "252px" 249 | }, 250 | "number_sections": true, 251 | "sideBar": true, 252 | "skip_h1_title": false, 253 | "title_cell": "Table of Contents", 254 | "title_sidebar": "Contents", 255 | "toc_cell": false, 256 | "toc_position": {}, 257 | "toc_section_display": "block", 258 | "toc_window_display": false 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 2 263 | } 264 | -------------------------------------------------------------------------------- /第二版/Cha 4 -动态网页抓取/geckodriver.log: -------------------------------------------------------------------------------- 1 | 1546605469345 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.LX5w1iAQRrji" 2 | 1546605470591 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 3 | 1546605470592 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 4 | 1546605472624 Marionette INFO Listening on port 50416 5 | 1546605472634 Marionette WARN TLS certificate errors will be ignored for this session 6 | [Parent 10876, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 7 | [Child 21820, Chrome_ChildThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 8 | [Chi[Parent 10876, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 9 | 1546605476628 Marionette INFO Stopped listening on port 50416 10 | 11 | ###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost 12 | 13 | [GPU 16888, Chrome_Chil 14 | ###!!! 
[Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv 15 | 16 | 1546605697589 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.VodZOZ4kKVAJ" 17 | 1546605698039 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 18 | 1546605698039 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 19 | 1546605699208 Marionette INFO Listening on port 51255 20 | 1546605699254 Marionette WARN TLS certificate errors will be ignored for this session 21 | [Parent 8420, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 22 | [Child 19772, Chrome_ChildThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 23 | [Child 1[Parent 8420, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 24 | [Child 21788, Chrome_ChildThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 25 | [Child 21546605721892 Marionette INFO Stopped listening on port 51255 26 | [GPU 19152, Chrom 27 | ###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv 28 | 29 | 1546606494576 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.BmewmWo0dctd" 30 | 1546606495625 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 31 | 1546606495625 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 32 | 1546606497276 Marionette INFO Listening on port 54937 33 | 1546606497317 Marionette WARN TLS certificate errors will be ignored for this session 34 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 35 | [Parent 21360, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 36 | 1546606667118 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.Gr2Jrk94qOPT" 37 | 1546606667538 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 38 | 1546606667538 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 39 | 1546606668932 Marionette INFO Listening on port 55954 40 | 1546606669244 Marionette WARN TLS certificate errors will be ignored for this session 41 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 42 | [Parent 
8088, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 43 | [Parent 8088, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 44 | [Parent 8088, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 45 | [Child 9196, Chrome_ChildThread] WARNING: pipe error: 109: [Child 2800, Chrome_Cfile z:/build/build/src/ipc/chromium/src/chrome/commhildThread] WARNING: pipe error: 109: file z:/build/buon/ipc_channelild/src_/ipc/chromium/src/chrome/commowin.cc, line 346 46 | n/ipc_channel_win.cc, line 346 47 | [Child 2800, C[Parent 8088, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 48 | [Child 19576, Chrome_ChildThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 49 | [Child 19576,1546606870264 Marionette INFO Stopped listening on port 55954 50 | 51 | ###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost 52 | 53 | [GPU 8032, Ch 54 | ###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv 55 | 56 | 1546606878275 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.7cBNPdUaoF0k" 57 | 1546606878633 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 58 | 1546606878633 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 59 | 1546606879751 Marionette INFO Listening on port 57143 60 | 1546606879906 Marionette WARN TLS certificate errors will be ignored for this session 61 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 62 | [Parent 14016, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 63 | 1546606964288 mozrunner::runner INFO Running command: "C:\\Program Files\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\SANTOS~1\\AppData\\Local\\Temp\\rust_mozprofile.8fVzEunbmF0C" 64 | 1546606964670 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: resource://pdf.js/ 65 | 1546606964670 addons.webextension.screenshots@mozilla.org WARN Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid host permission: about:reader* 66 | 1546606966358 Marionette INFO Listening on port 57918 67 | 1546606966430 Marionette WARN TLS certificate errors will be ignored for this session 68 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 69 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 70 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file 
z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 71 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 72 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 73 | JavaScript warning: https://z1.muscache.cn/airbnb/static/client/packages/hypernova/spa/spa_cn.bundle-f27cc504.js, line 1: unreachable code after return statement 74 | console.error: BroadcastService: 75 | receivedBroadcastMessage: handler for 76 | remote-settings/monitor_changes 77 | threw error: 78 | Message: Error: Polling for changes failed: NetworkError when attempting to fetch resource.. 79 | Stack: 80 | remoteSettingsFunction/remoteSettings.pollChanges@resource://services-settings/remote-settings.js:750:13 81 | 82 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 83 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 84 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 85 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 86 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 87 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 88 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 89 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 90 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 91 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 92 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 93 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 94 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 95 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 96 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 97 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 98 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 99 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 100 | 
JavaScript error: resource://gre/modules/WebProgressChild.jsm, line 58: TypeError: this.mm.content is null 101 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 102 | [Parent 20704, Gecko_IOThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 103 | [Child 5104, Chrome_ChildThread] WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/[cChild 16864, Chrome_ChildThread] ommon/ipc_channel_win.cc, line 346 104 | WARNING: pipe error: 109: file z:/build/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 346 105 | [Child 16861546608956897 Marionette INFO Stopped listening on port 57918 106 | 107 | ###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost 108 | 109 | [GPU 9868, C 110 | ###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv 111 | 112 | -------------------------------------------------------------------------------- /第二版/Cha 6 -数据储存/Cha 6 -数据存储.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 6.1\t基本存储:存储至txt或csv" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 6.1.1把数据存储至txt" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 14, 20 | "metadata": { 21 | "ExecuteTime": { 22 | "end_time": "2018-11-18T14:30:19.089018Z", 23 | "start_time": "2018-11-18T14:30:19.086009Z" 24 | } 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "title = \"This is a test sentence.\"\n", 29 | "with open(r'C:\\Users\\santostang\\Desktop\\title.txt', \"a+\") as f:\n", 30 | " f.write(title)\n", 31 | " f.close()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 15, 37 | "metadata": { 38 | "ExecuteTime": { 39 | "end_time": "2018-11-18T14:30:19.751164Z", 40 | "start_time": "2018-11-18T14:30:19.747656Z" 41 | } 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "output = '\\t'.join(['name','title','age','gender'])\n", 46 | "with open('C:\\\\Users\\\\santostang\\\\desktop\\\\test.txt', \"a+\") as f:\n", 47 | " f.write(output)\n", 48 | " f.close()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 17, 54 | "metadata": { 55 | "ExecuteTime": { 56 | "end_time": "2018-11-18T14:30:38.210437Z", 57 | "start_time": "2018-11-18T14:30:38.198405Z" 58 | } 59 | }, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "This is a test sentence.\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "with open(r'C:\\Users\\santostang\\Desktop\\title.txt', \"r\", encoding ='utf-8') as f:\n", 71 | " result = f.read()\n", 72 | " print (result)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 20, 78 | "metadata": { 79 | "ExecuteTime": { 80 | "end_time": "2018-11-18T14:32:10.779825Z", 81 | "start_time": "2018-11-18T14:32:10.776817Z" 82 | } 83 | }, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "['This is a test sentence.', 'This is the second test sentence.', 'This is the third test sentence.']\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "with open(r'C:\\Users\\santostang\\Desktop\\title.txt', \"r\", encoding ='utf-8') as f:\n", 95 | " result = f.read().splitlines()\n", 96 | " print (result)" 97 | ] 98 | }, 99 | { 100 | 
"cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "## 6.1.2把数据存储至csv" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 3, 109 | "metadata": { 110 | "ExecuteTime": { 111 | "end_time": "2018-11-18T14:43:09.018560Z", 112 | "start_time": "2018-11-18T14:43:09.014551Z" 113 | } 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "['\\ufeffA1', 'B1', 'C1', 'D1']\n", 121 | "A1\n", 122 | "['A2', 'B2', 'C2', 'D2']\n", 123 | "A2\n", 124 | "['A3', 'B3', 'C3', 'D3']\n", 125 | "A3\n", 126 | "['A4', 'B4', 'C4', 'D4']\n", 127 | "A4\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "import csv\n", 133 | "with open('test.csv', 'r',encoding='utf-8') as csvfile:\n", 134 | " csv_reader = csv.reader(csvfile)\n", 135 | " for row in csv_reader:\n", 136 | " print(row)\n", 137 | " print(row[0])" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 4, 143 | "metadata": { 144 | "ExecuteTime": { 145 | "end_time": "2018-11-18T14:43:35.680232Z", 146 | "start_time": "2018-11-18T14:43:35.676723Z" 147 | } 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "import csv\n", 152 | "output_list = ['1', '2','3','4']\n", 153 | "with open('test2.csv', 'a+', encoding='utf-8', newline='') as csvfile:\n", 154 | " w = csv.writer(csvfile)\n", 155 | " w.writerow(output_list)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## 6.2 储存至MySQL数据库" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## 6.2.3 Python操作MySQL数据库" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 11, 175 | "metadata": { 176 | "ExecuteTime": { 177 | "end_time": "2018-11-25T15:24:14.804439Z", 178 | "start_time": "2018-11-25T15:24:14.764978Z" 179 | } 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "import pymysql\n", 184 | " \n", 185 | "# 打开数据库连接\n", 186 | "db = pymysql.connect(\"localhost\",\"root\",\"password\",\"scraping\" )\n", 187 | " \n", 188 | "# 使用cursor()方法获取操作游标 \n", 189 | "cursor = db.cursor()\n", 190 | " \n", 191 | "# SQL 插入语句\n", 192 | "sql = \"\"\"INSERT INTO urls (url, content) VALUES ('www.baidu.com', 'This is content.')\"\"\"\n", 193 | "try:\n", 194 | " # 执行sql语句\n", 195 | " cursor.execute(sql)\n", 196 | " # 提交到数据库执行\n", 197 | " db.commit()\n", 198 | "except:\n", 199 | " # 如果发生错误则回滚\n", 200 | " db.rollback()\n", 201 | "# 关闭数据库连接\n", 202 | "db.close()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 17, 208 | "metadata": { 209 | "ExecuteTime": { 210 | "end_time": "2018-11-28T15:14:41.464834Z", 211 | "start_time": "2018-11-28T15:14:40.915500Z" 212 | } 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "import requests\n", 217 | "from bs4 import BeautifulSoup\n", 218 | "import pymysql\n", 219 | "\n", 220 | "db = pymysql.connect(\"localhost\",\"root\",\"password\",\"scraping\" )\n", 221 | "cursor = db.cursor()\n", 222 | "\n", 223 | "link = \"http://www.santostang.com/\"\n", 224 | "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}\n", 225 | "r = requests.get(link, headers= headers)\n", 226 | "\n", 227 | "soup = BeautifulSoup(r.text, \"lxml\")\n", 228 | "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n", 229 | "for eachone in title_list:\n", 230 | " url = eachone.a['href']\n", 231 | " title = eachone.a.text.strip()\n", 232 | " cursor.execute(\"INSERT INTO urls (url, content) 
VALUES (%s, %s)\", (url, title))\n", 233 | " \n", 234 | "db.commit()\n", 235 | "db.close()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "# 6.3 储存至MongoDB数据库" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## 6.3.3 Python操作MongoDB数据库" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 18, 255 | "metadata": { 256 | "ExecuteTime": { 257 | "end_time": "2018-11-28T16:22:17.163745Z", 258 | "start_time": "2018-11-28T16:22:17.046880Z" 259 | } 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "from pymongo import MongoClient\n", 264 | "client = MongoClient('localhost',27017)\n", 265 | "db = client.blog_database\n", 266 | "collection = db.blog" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 19, 272 | "metadata": { 273 | "ExecuteTime": { 274 | "end_time": "2018-11-28T16:23:15.311100Z", 275 | "start_time": "2018-11-28T16:23:15.018597Z" 276 | } 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "import requests\n", 281 | "import datetime\n", 282 | "from bs4 import BeautifulSoup\n", 283 | "from pymongo import MongoClient\n", 284 | "\n", 285 | "client = MongoClient('localhost',27017)\n", 286 | "db = client.blog_database\n", 287 | "collection = db.blog\n", 288 | "\n", 289 | "link = \"http://www.santostang.com/\"\n", 290 | "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n", 291 | "r = requests.get(link, headers= headers)\n", 292 | "\n", 293 | "soup = BeautifulSoup(r.text, \"lxml\")\n", 294 | "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n", 295 | "for eachone in title_list:\n", 296 | " url = eachone.a['href']\n", 297 | " title = eachone.a.text.strip()\n", 298 | " post = {\"url\": url,\n", 299 | " \"title\": title,\n", 300 | " \"date\": datetime.datetime.utcnow()}\n", 301 | " collection.insert_one(post)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "collapsed": true 309 | }, 310 | "outputs": [], 311 | "source": [] 312 | } 313 | ], 314 | "metadata": { 315 | "kernelspec": { 316 | "display_name": "Python 3", 317 | "language": "python", 318 | "name": "python3" 319 | }, 320 | "language_info": { 321 | "codemirror_mode": { 322 | "name": "ipython", 323 | "version": 3 324 | }, 325 | "file_extension": ".py", 326 | "mimetype": "text/x-python", 327 | "name": "python", 328 | "nbconvert_exporter": "python", 329 | "pygments_lexer": "ipython3", 330 | "version": "3.6.5" 331 | }, 332 | "toc": { 333 | "base_numbering": 1, 334 | "nav_menu": {}, 335 | "number_sections": true, 336 | "sideBar": true, 337 | "skip_h1_title": false, 338 | "title_cell": "Table of Contents", 339 | "title_sidebar": "Contents", 340 | "toc_cell": false, 341 | "toc_position": {}, 342 | "toc_section_display": true, 343 | "toc_window_display": false 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 2 348 | } 349 | -------------------------------------------------------------------------------- /第二版/Cha 6 -数据储存/test.csv: -------------------------------------------------------------------------------- 1 | A1,B1,C1,D1 2 | A2,B2,C2,D2 3 | A3,B3,C3,D3 4 | A4,B4,C4,D4 5 | -------------------------------------------------------------------------------- /第二版/Cha 6 -数据储存/test2.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4 2 | 
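The Cha 6 notebook above only inserts into MySQL and MongoDB; a quick way to confirm those writes is to read the rows and documents back. The following is a minimal sketch (not part of the book's code), assuming the same connection values used in the notebook: "localhost", "root", "password", the "scraping" database with its "urls" table, and the "blog_database.blog" collection.

```python
# Minimal read-back check for the Cha 6 storage examples (assumed, not from the book).
import pymysql
from pymongo import MongoClient

# MySQL: select the rows written by the notebook's INSERT statements
db = pymysql.connect(host="localhost", user="root",
                     password="password", database="scraping")
cursor = db.cursor()
cursor.execute("SELECT url, content FROM urls")
for url, content in cursor.fetchall():
    print(url, content)
db.close()

# MongoDB: list a few of the documents written by collection.insert_one()
client = MongoClient('localhost', 27017)
for post in client.blog_database.blog.find().limit(5):
    print(post['url'], post['title'])
```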
-------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__init__.py -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class FinancespiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | link = scrapy.Field() 16 | content = scrapy.Field() 17 | time = scrapy.Field() 18 | comment = scrapy.Field() 19 | discuss = scrapy.Field() 20 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class 
FinancespiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class FinancespiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class FinancespiderPipeline(object): 10 | #填入你的地址 11 | file_path = "C:/Users/santostang/Desktop/financeSpider/result.txt" 12 | 13 | def __init__(self): 14 | self.article = open(self.file_path, "a+", encoding="utf-8") 15 | 16 | #定义管道的处理方法 17 | def process_item(self, item, spider): 18 | title = item["title"] 19 | link = item["link"] 20 | content = item["content"] 21 | time = item["time"] 22 | comment = item["comment"] 23 | discuss = item["discuss"] 24 | output = title + '\t' + link + '\t' + time + '\t' + comment + '\t' + discuss + '\t' + content + '\n\n' 25 | self.article.write(output) 26 | return item 27 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for financeSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'financeSpider' 13 | 14 | SPIDER_MODULES = ['financeSpider.spiders'] 15 | NEWSPIDER_MODULE = 'financeSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'financeSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'financeSpider.middlewares.FinancespiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'financeSpider.middlewares.FinancespiderDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'financeSpider.pipelines.FinancespiderPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/__pycache__/finance.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/__pycache__/finance.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/financeSpider/spiders/finance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from financeSpider.items import FinancespiderItem 5 | 6 | class FinanceSpider(scrapy.Spider): 7 | name = 'finance' 8 | allowed_domains = ['finance.eastmoney.com'] 9 | start_urls = ['http://finance.eastmoney.com/news/cywjh_1.html'] 10 | url_head = 'http://finance.eastmoney.com/news/cywjh_' 11 | url_end = '.html' 12 | 13 | # Scrapy自带功能,从start_requests开始发送请求 14 | def start_requests(self): 15 | #获取前三页的url地址 16 | for i in range(1,4): 17 | url = self.url_head + str(i) + self.url_end 18 | print ("当前的页面是:", url) 19 | # 对新闻列表页发送Request请求 20 | yield scrapy.Request(url=url, callback = self.parse) 21 | 22 | def parse(self, response): 23 | soup = BeautifulSoup(response.text, "lxml") 24 | article_list = soup.find_all("div", class_="text") 25 | for i in range(len(article_list)): 26 | # 将数据封装到FinancespiderItem对象,字典类型数据 27 | item = FinancespiderItem() 28 | title = article_list[i].find("p", class_="title").a.text.strip() 29 | link = article_list[i].find("p", class_="title").a["href"] 30 | time = article_list[i].find("p", class_="time").text.strip() 31 | # 变成字典 32 | item["title"] = title 33 | item["link"] = link 34 | item["time"] = time 35 | # 根据文章链接,发送Request请求,并传递item参数 36 | yield scrapy.Request(url=link, meta = {'item':item}, callback = self.parse2) 37 | 38 | def parse2(self, response): 39 | #接收传递的item 40 | item = response.meta['item'] 41 | #解析提取文章内容 42 | soup = BeautifulSoup(response.text, "lxml") 43 | content = soup.find("div", id="ContentBody").text.strip() 44 | content = content.replace("\n", " ") 45 | comment = soup.find("span", class_="cNumShow num").text.strip() 46 | discuss = soup.find("span", class_="num ml5").text.strip() 47 | item["content"] = content 48 | item["comment"] = comment 49 | item["discuss"] = discuss 50 | #返回item,交给item pipeline 51 | yield item 52 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/Cha7 - 自我实践(章末)答案/financeSpider/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = financeSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = financeSpider 12 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/article.csv: -------------------------------------------------------------------------------- 1 | content,link,title 2 | ,http://www.santostang.com/2018/07/15/4-3-%e9%80%9a%e8%bf%87selenium-%e6%a8%a1%e6%8b%9f%e6%b5%8f%e8%a7%88%e5%99%a8%e6%8a%93%e5%8f%96/,4.3 通过selenium 模拟浏览器抓取 3 | ,http://www.santostang.com/2018/07/14/4-2-%e8%a7%a3%e6%9e%90%e7%9c%9f%e5%ae%9e%e5%9c%b0%e5%9d%80%e6%8a%93%e5%8f%96/,4.2 解析真实地址抓取 4 | ,http://www.santostang.com/2018/07/14/%e7%ac%ac%e5%9b%9b%e7%ab%a0%ef%bc%9a%e5%8a%a8%e6%80%81%e7%bd%91%e9%a1%b5%e6%8a%93%e5%8f%96-%e8%a7%a3%e6%9e%90%e7%9c%9f%e5%ae%9e%e5%9c%b0%e5%9d%80-selenium/,第四章- 动态网页抓取 (解析真实地址 + selenium) 5 | ,http://www.santostang.com/2018/07/11/%e3%80%8a%e7%bd%91%e7%bb%9c%e7%88%ac%e8%99%ab%ef%bc%9a%e4%bb%8e%e5%85%a5%e9%97%a8%e5%88%b0%e5%ae%9e%e8%b7%b5%e3%80%8b%e4%b8%80%e4%b9%a6%e5%8b%98%e8%af%af/,《网络爬虫:从入门到实践》一书勘误 6 | ,http://www.santostang.com/2018/07/04/hello-world/,Hello world! 7 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/article.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"title": "4.3 \u901a\u8fc7selenium \u6a21\u62df\u6d4f\u89c8\u5668\u6293\u53d6", "link": "http://www.santostang.com/2018/07/15/4-3-%e9%80%9a%e8%bf%87selenium-%e6%a8%a1%e6%8b%9f%e6%b5%8f%e8%a7%88%e5%99%a8%e6%8a%93%e5%8f%96/"}, 3 | {"title": "4.2 \u89e3\u6790\u771f\u5b9e\u5730\u5740\u6293\u53d6", "link": "http://www.santostang.com/2018/07/14/4-2-%e8%a7%a3%e6%9e%90%e7%9c%9f%e5%ae%9e%e5%9c%b0%e5%9d%80%e6%8a%93%e5%8f%96/"}, 4 | {"title": "\u7b2c\u56db\u7ae0- \u52a8\u6001\u7f51\u9875\u6293\u53d6 (\u89e3\u6790\u771f\u5b9e\u5730\u5740 + selenium)", "link": "http://www.santostang.com/2018/07/14/%e7%ac%ac%e5%9b%9b%e7%ab%a0%ef%bc%9a%e5%8a%a8%e6%80%81%e7%bd%91%e9%a1%b5%e6%8a%93%e5%8f%96-%e8%a7%a3%e6%9e%90%e7%9c%9f%e5%ae%9e%e5%9c%b0%e5%9d%80-selenium/"}, 5 | {"title": "\u300a\u7f51\u7edc\u722c\u866b\uff1a\u4ece\u5165\u95e8\u5230\u5b9e\u8df5\u300b\u4e00\u4e66\u52d8\u8bef", "link": "http://www.santostang.com/2018/07/11/%e3%80%8a%e7%bd%91%e7%bb%9c%e7%88%ac%e8%99%ab%ef%bc%9a%e4%bb%8e%e5%85%a5%e9%97%a8%e5%88%b0%e5%ae%9e%e8%b7%b5%e3%80%8b%e4%b8%80%e4%b9%a6%e5%8b%98%e8%af%af/"}, 6 | {"title": "Hello world!", "link": "http://www.santostang.com/2018/07/04/hello-world/"} 7 | ] -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__init__.py -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BlogspiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | link = scrapy.Field() 16 | content = scrapy.Field() 17 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BlogspiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 
35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class BlogspiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BlogspiderPipeline(object): 10 | #填入你的地址 11 | file_path = "C:/Users/santostang/Desktop/blogSpider/result.txt" 12 | 13 | def __init__(self): 14 | self.article = open(self.file_path, "a+", encoding="utf-8") 15 | 16 | #定义管道的处理方法 17 | def process_item(self, item, spider): 18 | title = item["title"] 19 | link = item["link"] 20 | content = item["content"] 21 | output = title + '\t' + link + '\t' + content + '\n\n' 22 | self.article.write(output) 23 | return item 24 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for blogSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'blogSpider' 13 | 14 | SPIDER_MODULES = ['blogSpider.spiders'] 15 | NEWSPIDER_MODULE = 'blogSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'blogSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'blogSpider.middlewares.BlogspiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 
'blogSpider.middlewares.BlogspiderDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'blogSpider.pipelines.BlogspiderPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
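The article.json and article.csv files at the root of blogSpider look like Scrapy feed-export output (for example, a run such as `scrapy crawl santostang -o article.json`), while result.txt is written by the custom BlogspiderPipeline enabled in settings.py above. A minimal sketch for loading the exported JSON back into Python, assuming the file sits in the current working directory:

```python
# Load the feed-exported items back (assumed file location: current directory).
import json

with open('article.json', encoding='utf-8') as f:
    articles = json.load(f)  # the export is a JSON array of item dicts

print(len(articles), "items")
print(articles[0]['title'], articles[0]['link'])  # fields defined in BlogspiderItem
```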
5 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/santostang - 副本.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/santostang - 副本.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/santostang.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/__pycache__/santostang.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/santostang - 副本.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import os 3 | from bs4 import BeautifulSoup 4 | from blogSpider.items import BlogspiderItem 5 | # 设置相应的代理用户名密码,主机和端口号 6 | os.environ['HTTP_PROXY'] = 'http://web-proxy.oa.com:8080' 7 | os.environ['HTTPS_PROXY'] = 'https://web-proxy.oa.com:8080' 8 | 9 | 10 | class SantostangSpider(scrapy.Spider): 11 | name = 'santostang' 12 | allowed_domains = ['www.santostang.com'] 13 | start_urls = ['http://www.santostang.com/'] 14 | 15 | def parse(self, response): 16 | # 第一部分代码:将html保存到本地 17 | # print (response.text) 18 | # filename = "index.html" 19 | # with open(filename, 'w', encoding="utf-8") as f: 20 | # f.write(response.text) 21 | 22 | # 第二部分代码:打印文章标题 23 | # soup = BeautifulSoup(response.text, "lxml") 24 | # first_title = soup.find("h1", class_= "post-title").a.text.strip() 25 | # print ("第一篇文章的标题是:", first_title) 26 | # for i in range(len(title_list)): 27 | # title = title_list[i].a.text.strip() 28 | # print('第 %s 篇文章的标题是:%s' %(i+1, title)) 29 | 30 | #存放文章信息的列表 31 | items = [] 32 | 33 | soup = BeautifulSoup(response.text, "lxml") 34 | title_list = soup.find_all("h1", class_="post-title") 35 | for i in range(len(title_list)): 36 | # 将数据封装到BlogspiderItem对象,字典类型数据 37 | item = BlogspiderItem() 38 | title = title_list[i].a.text.strip() 39 | link = title_list[i].a["href"] 40 | # 变成字典 41 | item["title"] = title 42 | item["link"] = link 43 | items.append(item) 44 | 45 | # 返回数据 46 | return items 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/blogSpider/spiders/santostang.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from bs4 import BeautifulSoup 3 | from blogSpider.items import BlogspiderItem 4 | 5 | class SantostangSpider(scrapy.Spider): 6 | name = 'santostang' 7 | allowed_domains = ['www.santostang.com'] 8 | start_urls = ['http://www.santostang.com/'] 9 | 10 | def parse(self, response): 
11 | # 第一部分代码:将html保存到本地 12 | # print (response.text) 13 | # filename = "index.html" 14 | # with open(filename, 'w', encoding="utf-8") as f: 15 | # f.write(response.text) 16 | 17 | # 第二部分代码:打印文章标题 18 | # soup = BeautifulSoup(response.text, "lxml") 19 | # first_title = soup.find("h1", class_= "post-title").a.text.strip() 20 | # print ("第一篇文章的标题是:", first_title) 21 | # for i in range(len(title_list)): 22 | # title = title_list[i].a.text.strip() 23 | # print('第 %s 篇文章的标题是:%s' %(i+1, title)) 24 | 25 | #第三部分代码: 26 | # soup = BeautifulSoup(response.text, "lxml") 27 | # first_title = soup.find("h1", class_= "post-title").a.text.strip() 28 | # print ("第一篇文章的标题是:", first_title) 29 | 30 | # for i in range(len(title_list)): 31 | # title = title_list[i].a.text.strip() 32 | # print('第 %s 篇文章的标题是:%s' %(i+1, title)) 33 | 34 | #第四部分代码:储存文章内容 35 | soup = BeautifulSoup(response.text, "lxml") 36 | title_list = soup.find_all("h1", class_="post-title") 37 | for i in range(len(title_list)): 38 | # 将数据封装到BlogspiderItem对象,字典类型数据 39 | item = BlogspiderItem() 40 | title = title_list[i].a.text.strip() 41 | link = title_list[i].a["href"] 42 | # 变成字典 43 | item["title"] = title 44 | item["link"] = link 45 | # 根据文章链接,发送Request请求,并传递item参数 46 | yield scrapy.Request(url =link, meta = {'item':item}, callback = self.parse2) 47 | 48 | def parse2(self, response): 49 | #接收传递的item 50 | item = response.meta['item'] 51 | #解析提取文章内容 52 | soup = BeautifulSoup(response.text, "lxml") 53 | content = soup.find("div", class_="view-content").text.strip() 54 | content = content.replace("\n", " ") 55 | item["content"] = content 56 | #返回item,交给item pipeline 57 | yield item -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/blogSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = blogSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = blogSpider 12 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__init__.py -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 
-Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class FinancespiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | link = scrapy.Field() 16 | content = scrapy.Field() 17 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class FinancespiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class FinancespiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class FinancespiderPipeline(object): 10 | #填入你的地址 11 | file_path = "C:/Users/santostang/Desktop/financeSpider/result.txt" 12 | 13 | def __init__(self): 14 | self.article = open(self.file_path, "a+", encoding="utf-8") 15 | 16 | #定义管道的处理方法 17 | def process_item(self, item, spider): 18 | title = item["title"] 19 | link = item["link"] 20 | content = item["content"] 21 | output = title + '\t' + link + '\t' + content + '\n\n' 22 | self.article.write(output) 23 | return item 24 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for financeSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'financeSpider' 13 | 14 | SPIDER_MODULES = ['financeSpider.spiders'] 15 | NEWSPIDER_MODULE = 'financeSpider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'financeSpider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'financeSpider.middlewares.FinancespiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'financeSpider.middlewares.FinancespiderDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'financeSpider.pipelines.FinancespiderPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/__pycache__/finance.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/__pycache__/finance.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/financeSpider/spiders/finance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | from financeSpider.items import FinancespiderItem 5 | 6 | class FinanceSpider(scrapy.Spider): 7 | name = 'finance' 8 | allowed_domains = ['finance.eastmoney.com'] 9 | start_urls = ['http://finance.eastmoney.com/news/cywjh_1.html'] 10 | url_head = 'http://finance.eastmoney.com/news/cywjh_' 11 | url_end = '.html' 12 | 13 | # Scrapy自带功能,从start_requests开始发送请求 14 | def start_requests(self): 15 | #获取前三页的url地址 16 | for i in range(1,4): 17 | url = self.url_head + str(i) + self.url_end 18 | print ("当前的页面是:", url) 19 | # 对新闻列表页发送Request请求 20 | yield scrapy.Request(url=url, callback = self.parse) 21 | 22 | def parse(self, response): 23 | soup = BeautifulSoup(response.text, "lxml") 24 | title_list = soup.find_all("p", class_="title") 25 | for i in range(len(title_list)): 26 | # 将数据封装到FinancespiderItem对象,字典类型数据 27 | item = FinancespiderItem() 28 | title = title_list[i].a.text.strip() 29 | link = title_list[i].a["href"] 30 | # 变成字典 31 | item["title"] = title 32 | item["link"] = link 33 | # 根据文章链接,发送Request请求,并传递item参数 34 | yield scrapy.Request(url=link, meta = {'item':item}, callback = self.parse2) 35 | 36 | def parse2(self, response): 37 | #接收传递的item 38 | item = response.meta['item'] 39 | #解析提取文章内容 40 | soup = BeautifulSoup(response.text, "lxml") 41 | content = soup.find("div", id="ContentBody").text.strip() 42 | content = content.replace("\n", " ") 43 | item["content"] = content 44 | #返回item,交给item pipeline 45 | yield item 46 | -------------------------------------------------------------------------------- /第二版/Cha 7 -Scrapy爬虫框架/financeSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = financeSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = financeSpider 12 | -------------------------------------------------------------------------------- /第二版/Cha 8 
-提升爬虫的速度/cha8/__pycache__/multiprocess_test.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/multiprocess_test.cpython-35.pyc -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/multiprocess_test.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/multiprocess_test.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/thread_test.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/thread_test.cpython-35.pyc -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/thread_test.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/cha8/__pycache__/thread_test.cpython-36.pyc -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/gevent1.py: -------------------------------------------------------------------------------- 1 | import gevent 2 | from gevent.queue import Queue, Empty 3 | import time 4 | import requests 5 | 6 | from gevent import monkey#把下面有可能有IO操作的单独做上标记 7 | monkey.patch_all() # 将IO转为异步执行的函数 8 | 9 | link_list = [] 10 | with open('alexa.txt', 'r') as file: 11 | file_list = file.readlines() 12 | for eachone in file_list: 13 | link = eachone.split('\t')[1] 14 | link = link.replace('\n','') 15 | link_list.append(link) 16 | 17 | start = time.time() 18 | def crawler(index): 19 | Process_id = 'Process-' + str(index) 20 | while not workQueue.empty(): 21 | url = workQueue.get(timeout=2) 22 | try: 23 | r = requests.get(url, timeout=20) 24 | print (Process_id, workQueue.qsize(), r.status_code, url) 25 | except Exception as e: 26 | print (Process_id, workQueue.qsize(), url, 'Error: ', e) 27 | 28 | def boss(): 29 | for url in link_list: 30 | workQueue.put_nowait(url) 31 | 32 | if __name__ == '__main__': 33 | workQueue = Queue(1000) 34 | 35 | gevent.spawn(boss).join() 36 | jobs = [] 37 | for i in range(10): 38 | jobs.append(gevent.spawn(crawler, i)) 39 | gevent.joinall(jobs) 40 | 41 | end = time.time() 42 | print ('gevent + Queue多协程爬虫的总时间为:', end-start) 43 | print ('Main Ended!') 44 | -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/gevent_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import gevent 5 | from gevent.queue import Queue, Empty 6 | import time 7 | import requests 8 | 9 | from gevent import monkey#把下面有可能有IO操作的单独做上标记 10 | monkey.patch_all() # 将IO转为异步执行的函数 11 | 12 | start = time.time() 13 | workQueue = Queue(1000) 14 | def crawler(index): 15 | Process_id = 'Process-' + str(index) 16 | while not workQueue.empty(): 17 | url = workQueue.get(timeout=2) 18 | try: 19 | r = 
requests.get(url, timeout=20) 20 | print (Process_id, workQueue.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, workQueue.qsize(), url, 'Error: ', e) 23 | 24 | def boss(link_list): 25 | for url in link_list: 26 | workQueue.put_nowait(url) 27 | 28 | def gevent_main(link_list, g_num): 29 | gevent.spawn(boss,link_list).join() 30 | jobs = [] 31 | for i in range(g_num): 32 | jobs.append(gevent.spawn(crawler, i)) 33 | gevent.joinall(jobs) 34 | 35 | end = time.time() 36 | time_spend = end-start 37 | print ('gevent + Queue多协程爬虫的总时间为:', time_spend) 38 | print ('Main Ended!') 39 | return time_spend 40 | 41 | if __name__ == '__main__': 42 | link_list = [] 43 | with open('alexa.txt', 'r') as file: 44 | file_list = file.readlines() 45 | for eachone in file_list: 46 | link = eachone.split('\t')[1] 47 | link = link.replace('\n','') 48 | link_list.append(link) 49 | 50 | 51 | 52 | gevent_time10 = gevent_main(link_list, 15) 53 | print ('gevent + Queue多协程爬虫的总时间为:', gevent_time10) 54 | 55 | gevent_time3 = gevent_main(link_list, 20) 56 | print ('gevent + Queue多协程爬虫的总时间为:', gevent_time3) 57 | 58 | with open('result_gevent.txt','a+',encoding='utf-8') as f: 59 | f.write('\t' + str(gevent_time10) + '\t' + str(gevent_time3)) -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/multiprocess_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from multiprocessing import Pool, Manager 5 | import time 6 | import requests 7 | 8 | def crawler(q, index): 9 | Process_id = 'Process-' + str(index) 10 | while not q.empty(): 11 | url = q.get(timeout=2) 12 | try: 13 | r = requests.get(url, timeout=20) 14 | print (Process_id, q.qsize(), r.status_code, url) 15 | except Exception as e: 16 | print (Process_id, q.qsize(), url, 'Error: ', e) 17 | 18 | 19 | def multiprocess_main(link_list, p_num): 20 | start = time.time() 21 | manager = Manager() 22 | workQueue = manager.Queue(1000) 23 | 24 | # 填充队列 25 | for url in link_list: 26 | workQueue.put(url) 27 | 28 | print ("Started processes") 29 | pool = Pool(processes=p_num) 30 | for i in range(p_num): 31 | pool.apply_async(crawler, args=(workQueue, i)) 32 | 33 | 34 | pool.close() 35 | pool.join() 36 | 37 | end = time.time() 38 | time_spend = end-start 39 | print ('Pool + Queue多进程爬虫的总时间为:', time_spend) 40 | print ('Main process Ended!') 41 | return time_spend 42 | 43 | if __name__ == '__main__': 44 | link_list = [] 45 | with open('alexa.txt', 'r') as file: 46 | file_list = file.readlines() 47 | for eachone in file_list: 48 | link = eachone.split('\t')[1] 49 | link = link.replace('\n','') 50 | link_list.append(link) 51 | 52 | multiprocess_main(link_list, 3) -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/mutilprocess1.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | class MyProcess(Process): 15 | def __init__(self, q): 16 | Process.__init__(self) 17 | self.q = q 18 | 19 | def run(self): 20 | print ("Starting " , self.pid) 21 | while not self.q.empty(): 22 | crawler(self.q) 23 | print 
("Exiting " , self.pid) 24 | 25 | def crawler(q): 26 | url = q.get(timeout=2) 27 | try: 28 | r = requests.get(url, timeout=20) 29 | print (q.qsize(), r.status_code, url) 30 | except Exception as e: 31 | print (q.qsize(), url, 'Error: ', e) 32 | 33 | if __name__ == '__main__': 34 | ProcessNames = ["Process-1", "Process-2", "Process-3"] 35 | workQueue = Queue(1000) 36 | 37 | # 填充队列 38 | for url in link_list: 39 | workQueue.put(url) 40 | 41 | for i in range(0, 3): 42 | p = MyProcess(workQueue) 43 | p.daemon = True 44 | p.start() 45 | p.join() 46 | 47 | end = time.time() 48 | print ('Process + Queue多进程爬虫的总时间为:', end-start) 49 | print ('Main process Ended!') -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/mutilprocess2.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool, Manager 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | def crawler(q, index): 15 | Process_id = 'Process-' + str(index) 16 | while not q.empty(): 17 | url = q.get(timeout=2) 18 | try: 19 | r = requests.get(url, timeout=20) 20 | print (Process_id, q.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, q.qsize(), url, 'Error: ', e) 23 | 24 | 25 | if __name__ == '__main__': 26 | manager = Manager() 27 | workQueue = manager.Queue(1000) 28 | 29 | # 填充队列 30 | for url in link_list: 31 | workQueue.put(url) 32 | 33 | pool = Pool(processes=3) 34 | for i in range(4): 35 | pool.apply_async(crawler, args=(workQueue, i)) 36 | 37 | print ("Started processes") 38 | pool.close() 39 | pool.join() 40 | 41 | end = time.time() 42 | print ('Pool + Queue多进程爬虫的总时间为:', end-start) 43 | print ('Main process Ended!') 44 | -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/mutilprocess3.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool, Manager 2 | import time 3 | import requests 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | def crawler(q, index): 15 | Process_id = 'Process-' + str(index) 16 | while not q.empty(): 17 | url = q.get(timeout=2) 18 | try: 19 | r = requests.get(url, timeout=20) 20 | print (Process_id, q.qsize(), r.status_code, url) 21 | except Exception as e: 22 | print (Process_id, q.qsize(), url, 'Error: ', e) 23 | 24 | 25 | if __name__ == '__main__': 26 | manager = Manager() 27 | workQueue = manager.Queue(1000) 28 | 29 | # 填充队列 30 | for url in link_list: 31 | workQueue.put(url) 32 | 33 | pool = Pool(processes=3) 34 | for i in range(4): 35 | pool.apply(crawler, args=(workQueue, i)) 36 | 37 | print ("Started processes") 38 | pool.close() 39 | pool.join() 40 | 41 | end = time.time() 42 | print ('Pool + Queue多进程爬虫的总时间为:', end-start) 43 | print ('Main process Ended!') 44 | -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/result.txt: -------------------------------------------------------------------------------- 1 | 312.7718894481659 143.37620067596436 
549.7254424095154 549.978456735611 -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/result_gevent.txt: -------------------------------------------------------------------------------- 1 | 338.3443522453308 922.8117818832397 312.1618547439575 484.05668663978577 -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/result_single_time.txt: -------------------------------------------------------------------------------- 1 | 1721.3604562282562 -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/thread1.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import requests 3 | import time 4 | 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | start = time.time() 14 | class myThread (threading.Thread): 15 | def __init__(self, name, link_range): 16 | threading.Thread.__init__(self) 17 | self.name = name 18 | self.link_range = link_range 19 | def run(self): 20 | print ("Starting " + self.name) 21 | crawler(self.name, self.link_range) 22 | print ("Exiting " + self.name) 23 | 24 | def crawler(threadName, link_range): 25 | for i in range(link_range[0],link_range[1]+1): 26 | try: 27 | r = requests.get(link_list[i], timeout=20) 28 | print (threadName, r.status_code, link_list[i]) 29 | except Exception as e: 30 | print(threadName, 'Error: ', e) 31 | 32 | thread_list = [] 33 | link_range_list = [(0,200),(201,400),(401,600),(601,800),(801,1000)] 34 | 35 | # 创建新线程 36 | for i in range(1,6): 37 | thread = myThread("Thread-" + str(i), link_range_list[i-1]) 38 | thread.start() 39 | thread_list.append(thread) 40 | 41 | # 等待所有线程完成 42 | for thread in thread_list: 43 | thread.join() 44 | 45 | end = time.time() 46 | print ('简单多线程爬虫的总时间为:', end-start) 47 | print ("Exiting Main Thread") -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/thread2.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import requests 3 | import time 4 | import queue as Queue 5 | 6 | link_list = [] 7 | with open('alexa.txt', 'r') as file: 8 | file_list = file.readlines() 9 | for eachone in file_list: 10 | link = eachone.split('\t')[1] 11 | link = link.replace('\n','') 12 | link_list.append(link) 13 | 14 | start = time.time() 15 | class myThread (threading.Thread): 16 | def __init__(self, name, q): 17 | threading.Thread.__init__(self) 18 | self.name = name 19 | self.q = q 20 | def run(self): 21 | print ("Starting " + self.name) 22 | while True: 23 | try: 24 | crawler(self.name, self.q) 25 | except: 26 | break 27 | print ("Exiting " + self.name) 28 | 29 | def crawler(threadName, q): 30 | url = q.get(timeout=2) 31 | try: 32 | r = requests.get(url, timeout=20) 33 | print (q.qsize(), threadName, r.status_code, url) 34 | except Exception as e: 35 | print (q.qsize(), threadName, url, 'Error: ', e) 36 | 37 | threadList = ["Thread-1", "Thread-2", "Thread-3","Thread-4", "Thread-5"] 38 | workQueue = Queue.Queue(1000) 39 | threads = [] 40 | 41 | # 创建新线程 42 | for tName in threadList: 43 | thread = myThread(tName, workQueue) 44 | thread.start() 45 | threads.append(thread) 46 | 47 | # 填充队列 48 | for url in link_list: 49 | workQueue.put(url) 50 | 51 | # 
等待所有线程完成 52 | for t in threads: 53 | t.join() 54 | 55 | end = time.time() 56 | print ('Queue多线程爬虫的总时间为:', end-start) 57 | print ("Exiting Main Thread") -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/thread_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import threading 5 | import requests 6 | import time 7 | import queue as Queue 8 | 9 | 10 | class myThread (threading.Thread): 11 | def __init__(self, name, q): 12 | threading.Thread.__init__(self) 13 | self.name = name 14 | self.q = q 15 | def run(self): 16 | print ("Starting " + self.name) 17 | while True: 18 | try: 19 | crawler(self.name, self.q) 20 | except: 21 | break 22 | print ("Exiting " + self.name) 23 | 24 | def crawler(threadName, q): 25 | url = q.get(timeout=2) 26 | try: 27 | r = requests.get(url, timeout=20) 28 | print (q.qsize(), threadName, r.status_code, url) 29 | except Exception as e: 30 | print (q.qsize(), threadName, url, 'Error: ', e) 31 | 32 | def thread_main(link_list, t_num): 33 | start = time.time() 34 | workQueue = Queue.Queue(1000) 35 | threads = [] 36 | 37 | # 创建新线程 38 | for tName in range(t_num): 39 | thread = myThread('Thread' + str(tName), workQueue) 40 | thread.start() 41 | threads.append(thread) 42 | 43 | # 填充队列 44 | for url in link_list: 45 | workQueue.put(url) 46 | 47 | # 等待所有线程完成 48 | for t in threads: 49 | t.join() 50 | 51 | end = time.time() 52 | print ('Queue多线程爬虫的总时间为:', end-start) 53 | print ("Exiting Main Thread") 54 | return end-start 55 | 56 | if __name__ == '__main__': 57 | link_list = [] 58 | with open('alexa.txt', 'r') as file: 59 | file_list = file.readlines() 60 | for eachone in file_list: 61 | link = eachone.split('\t')[1] 62 | link = link.replace('\n','') 63 | link_list.append(link) 64 | 65 | thread_main(link_list, 5) -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/time_spend 2.py: -------------------------------------------------------------------------------- 1 | from multiprocess_test import multiprocess_main 2 | from thread_test import thread_main 3 | 4 | if __name__ == '__main__': 5 | link_list = [] 6 | with open('alexa.txt', 'r') as file: 7 | file_list = file.readlines() 8 | for eachone in file_list: 9 | link = eachone.split('\t')[1] 10 | link = link.replace('\n','') 11 | link_list.append(link) 12 | 13 | #single = single() 14 | #print ('串行的总时间为:', single) 15 | 16 | #thread_time = thread_main(link_list, 5) 17 | #print ('Queue多线程爬虫的总时间为:', thread_time) 18 | 19 | multiprocess_time = multiprocess_main(link_list, 3) 20 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time) 21 | 22 | #gevent_time = gevent_main(link_list, 10) 23 | #print ('gevent + Queue多协程爬虫的总时间为:', gevent_time) 24 | 25 | #with open('result.txt','a+',encoding='utf-8') as f: 26 | # f.write(single + '\t' + thread_time + '\t' + multiprocess_time + '\t' + gevent_time) -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/cha8/time_spend.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | import time 6 | #from multiprocess_test import multiprocess_main 7 | #from thread_test import thread_main 8 | 9 | def single(): 10 | start = time.time() 11 | for eachone in link_list: 12 | try: 13 | r = requests.get(eachone) 14 | print (r.status_code, eachone) 15 | except 
Exception as e: 16 | print('Error: ', e) 17 | end = time.time() 18 | time_spend = end-start 19 | print ('串行的总时间为:', time_spend) 20 | return time_spend 21 | 22 | if __name__ == '__main__': 23 | link_list = [] 24 | with open('alexa.txt', 'r') as file: 25 | file_list = file.readlines() 26 | for eachone in file_list: 27 | link = eachone.split('\t')[1] 28 | link = link.replace('\n','') 29 | link_list.append(link) 30 | 31 | #thread_time10 = thread_main(link_list, 10) 32 | #print ('Queue多线程爬虫的总时间为:', thread_time10) 33 | 34 | #multiprocess_time10 = multiprocess_main(link_list, 10) 35 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time10) 36 | 37 | #thread_time3 = thread_main(link_list, 3) 38 | #print ('Queue多线程爬虫的总时间为:', thread_time3) 39 | 40 | #multiprocess_time3 = multiprocess_main(link_list, 3) 41 | #print ('Pool + Queue多进程爬虫的总时间为:', multiprocess_time3) 42 | 43 | single_time = single() 44 | print ('串行的总时间为:', single_time) 45 | 46 | with open('result_single_time.txt','a+',encoding='utf-8') as f: 47 | f.write(str(single_time)) 48 | #f.write(str(thread_time10) + '\t' + str(multiprocess_time10) + '\t' + str(thread_time3) + '\t' + str(multiprocess_time3)) -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/多协程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/多协程.png -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/多线程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/多线程.png -------------------------------------------------------------------------------- /第二版/Cha 8 -提升爬虫的速度/多进程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/Cha 8 -提升爬虫的速度/多进程.png -------------------------------------------------------------------------------- /第二版/Cha 9 -反爬虫问题/Cha 9 -反爬虫问题.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 8.3如何“反反爬虫”?" 
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 8.3.1修改请求 header" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 6, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "{'User-Agent': 'python-requests/2.12.4', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import requests\n", 32 | "r = requests.get('http://www.santostang.com')\n", 33 | "print (r.request.headers)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 5, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "import requests\n", 51 | "\n", 52 | "link = 'http://www.santostang.com'\n", 53 | "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n", 54 | "r = requests.get(link, headers= headers)\n", 55 | "print (r.request.headers)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "from fake_useragent import UserAgent\n", 65 | "import requests\n", 66 | "\n", 67 | "link = 'http://www.santostang.com'\n", 68 | "ua=UserAgent()\n", 69 | "headers={\"User-Agent\":ua.random}\n", 70 | "response=requests.get(url=url,headers=headers)\n", 71 | "\n", 72 | "#响应状态信息\n", 73 | "print(response.status_code)\n", 74 | "print (r.request.headers)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## 8.3.2 修改爬虫的间隔时间" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 10, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "2.0001144409179688\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "import time\n", 99 | "t1 = time.time()\n", 100 | "time.sleep(2)\n", 101 | "t2 = time.time()\n", 102 | "total_time = t2-t1\n", 103 | "print (total_time)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 17, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "0.3481693303048349\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "import time\n", 121 | "import random\n", 122 | "\n", 123 | "sleep_time = random.randint(0,2) + random.random()\n", 124 | "print (sleep_time)\n", 125 | "time.sleep(sleep_time)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 19, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "开始爬取这篇博客: http://www.santostang.com/2017/03/08/hello-python/\n", 138 | "这篇博客的标题为: Hello Python!\n", 139 | "开始休息: 0.16292490492777212 秒\n", 140 | "开始爬取这篇博客: http://www.santostang.com/2017/03/07/echarts%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b02-%e5%8d%95%e9%a1%b5%e9%9d%a2%e5%a4%9a%e5%bc%a0%e5%9b%be%e8%a1%a8/\n", 141 | "这篇博客的标题为: echarts学习笔记(2) — 同一页面多图表\n", 142 | "开始休息: 1.912631031656519 秒\n", 143 | "开始爬取这篇博客: 
http://www.santostang.com/2017/03/07/echarts%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b01-%e4%bd%bf%e7%94%a8%e6%a8%a1%e5%9d%97%e5%8c%96%e5%8d%95%e6%96%87%e4%bb%b6%e5%bc%95%e5%85%a5/\n", 144 | "这篇博客的标题为: echarts学习笔记(1) — 模块化单文件引入\n", 145 | "开始休息: 1.3634313119416182 秒\n", 146 | "开始爬取这篇博客: http://www.santostang.com/2017/03/06/%e3%80%90%e7%88%ac%e8%99%ab%e4%ba%8c%e3%80%91%e7%88%ac%e8%99%ab%e7%9a%84%e6%a1%86%e6%9e%b6%e5%92%8c%e5%9f%ba%e6%9c%ac%e8%ae%ae%e9%a2%98/\n", 147 | "这篇博客的标题为: 【爬虫二】爬虫的框架和基本议题\n", 148 | "开始休息: 2.0205314818737516 秒\n", 149 | "开始爬取这篇博客: http://www.santostang.com/2017/03/06/%e3%80%90%e7%88%ac%e8%99%ab%e4%b8%80%e3%80%91%e6%9c%80%e7%ae%80%e5%8d%95%e7%9a%84%e7%88%ac%e8%99%ab%ef%bc%8c%e9%9b%b6%e5%9f%ba%e7%a1%80%e6%95%99%e5%ad%a6/\n", 150 | "这篇博客的标题为: 【爬虫一】最简单的爬虫,零基础教学\n", 151 | "开始休息: 2.446761436097069 秒\n", 152 | "开始爬取这篇博客: http://www.santostang.com/2017/03/02/hello-world/\n", 153 | "这篇博客的标题为: Hello world!\n", 154 | "开始休息: 0.8005131789714476 秒\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "import requests\n", 160 | "from bs4 import BeautifulSoup\n", 161 | "import time\n", 162 | "import random\n", 163 | "\n", 164 | "link = \"http://www.santostang.com/\"\n", 165 | "\n", 166 | "def scrap(link):\n", 167 | " headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n", 168 | " r = requests.get(link, headers= headers)\n", 169 | " html = r.text\n", 170 | " soup = BeautifulSoup(html, \"lxml\")\n", 171 | " return soup\n", 172 | "\n", 173 | "soup = scrap(link)\n", 174 | "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n", 175 | "for eachone in title_list:\n", 176 | " url = eachone.a['href']\n", 177 | " print ('开始爬取这篇博客: ', url)\n", 178 | " soup_article = scrap(url)\n", 179 | " title = soup_article.find(\"h1\", class_=\"view-title\").text.strip()\n", 180 | " print ('这篇博客的标题为: ', title)\n", 181 | " sleep_time = random.randint(0,2) + random.random()\n", 182 | " print ('开始休息: ', sleep_time, '秒')\n", 183 | " time.sleep(sleep_time)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": true 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "scrap_times = 0\n", 195 | "for eachone in title_list:\n", 196 | " url = eachone.a['href']\n", 197 | " print ('开始爬取这篇博客: ', url)\n", 198 | " soup_article = scrap(url)\n", 199 | " title = soup_article.find(\"h1\", class_=\"view-title\").text.strip()\n", 200 | " print ('这篇博客的标题为: ', title)\n", 201 | " \n", 202 | " scrap_times += 1\n", 203 | " if scrap_times % 5 == 0:\n", 204 | " sleep_time = 10 + random.random()\n", 205 | " else:\n", 206 | " sleep_time = random.randint(0,2) + random.random()\n", 207 | " time.sleep(sleep_time)\n", 208 | " print ('开始休息: ', sleep_time, '秒')" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## 8.3.3 使用代理" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "import requests\n", 227 | "\n", 228 | "link = \"http://www.santostang.com/\"\n", 229 | "proxies = {'http':'http://xxx.xxx.xxx.xxx:xxxx'}\n", 230 | "response = requests.get(link, proxies=proxies)" 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 3", 237 | "language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 
245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.6.5" 251 | }, 252 | "toc": { 253 | "base_numbering": 1, 254 | "nav_menu": {}, 255 | "number_sections": true, 256 | "sideBar": true, 257 | "skip_h1_title": false, 258 | "title_cell": "Table of Contents", 259 | "title_sidebar": "Contents", 260 | "toc_cell": false, 261 | "toc_position": {}, 262 | "toc_section_display": true, 263 | "toc_window_display": false 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 2 268 | } 269 | -------------------------------------------------------------------------------- /第二版/geckodriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Santostang/PythonScraping/96c2fcfc1ebcc6c6957e05af3de800f9705be1db/第二版/geckodriver.exe --------------------------------------------------------------------------------
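
A short wrap-up example (not part of the original repository). The Cha 9 notebook above demonstrates three "反反爬虫" techniques one at a time: sending a browser-like `User-Agent` header, sleeping a random interval between requests, and routing traffic through a proxy. The minimal sketch below simply combines the first two against the same test site used throughout the book; the `proxies` dictionary is the placeholder copied from the notebook and is left commented out, since it must be replaced with a real proxy address before it can work.

```python
import time
import random
import requests

link = 'http://www.santostang.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) '
                         'Gecko/20091201 Firefox/3.5.6'}
# Placeholder from the notebook; fill in a working proxy before enabling the proxy line below.
proxies = {'http': 'http://xxx.xxx.xxx.xxx:xxxx'}

for attempt in range(3):
    # Send the request with a browser-like User-Agent header.
    r = requests.get(link, headers=headers, timeout=20)
    # r = requests.get(link, headers=headers, proxies=proxies, timeout=20)  # variant with a proxy
    print(attempt, r.status_code)
    # Pause for a random 0-3 second interval between requests, as in the notebook's delay example.
    sleep_time = random.randint(0, 2) + random.random()
    time.sleep(sleep_time)
```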