├── .gitignore ├── docs ├── db.sql ├── linux-bash-shortcut.md ├── pycharm配置scrapy运行.pdf ├── scrapy.sh └── 复制headers信息.pdf ├── qianmu ├── __init__.py ├── items.py ├── middlewares │ ├── __init__.py │ ├── proxy.py │ └── useragent.py ├── pipelines.py ├── process_item.py ├── settings.py └── spiders │ ├── __init__.py │ ├── u2.py │ └── university.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # code IDE 10 | .idea/ 11 | settings.json 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | .venv 91 | venv/ 92 | ENV/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | -------------------------------------------------------------------------------- /docs/db.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS `qianmu`; 2 | USE `qianmu`; 3 | 4 | 5 | CREATE TABLE IF NOT EXISTS `universities` ( 6 | `id` BIGINT UNSIGNED AUTO_INCREMENT COMMENT '主键', 7 | `name` VARCHAR(256) NOT NULL COMMENT '学校名称', 8 | `rank` INT(8) NOT NULL DEFAULT 0 COMMENT '学校排名', 9 | `country` VARCHAR(128) COMMENT '国家', 10 | `state` VARCHAR(128) COMMENT '州省', 11 | `city` VARCHAR(128) COMMENT '城市', 12 | `undergraduate_num` VARCHAR(128) COMMENT '本科生人数', 13 | `postgraduate_num` VARCHAR(128) COMMENT '研究生人数', 14 | `website` text COMMENT '网站地址', 15 | PRIMARY KEY (`id`) 16 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT '大学信息表'; -------------------------------------------------------------------------------- /docs/linux-bash-shortcut.md: -------------------------------------------------------------------------------- 1 | #Linux Bash 快捷键 2 | > 本文适用于Mac OS以及大部分Linux发行版本 3 | > 注意本文内所说的前、后分别是指行首、行尾方向 4 | 5 | Linux、Mac OS操作系统,在其默认的命令行模式下(也就是Bash),有许多快捷键。熟悉常用的快捷键可以大大提高我们的操作速度,提升工作效率。按照命令的作用,大概可以分下面几种类型: 6 | 7 | ## 进程控制 8 | * `Ctrl + c` 向当前进程发送一个`SIGINT`信号,通知进程退出。具体效果要看进程的程序如何处理SIGINT信号,有可能会有延迟,有可能甚至会被忽略。比如scrapy程序,按下`Ctrl + c`需要等当前的请求处理完毕后才会结束进程,如果想要强制立即退出,需要按下两次`Ctrl + c` 9 | * `Ctrl + z` 
向当前进程发送一个`SIGTSTP`信号,使前台进程暂停执行并转为一个已停止的后台作业。想让它在后台继续运行可以使用`bg`,想恢复到前台执行可以使用`fg`(也可以指定作业号,如`fg %1`) 10 | * `Ctrl + d` 退出命令行 11 | 12 | ## 屏幕输出 13 | * `Ctrl + l` 清除屏幕输出 14 | * `Ctrl + s` 停止屏幕输出 15 | * `Ctrl + q` 恢复屏幕输出 16 | 17 | ``` 18 | 有时候我们在输入命令的时候,不小心按到了什么键,控制台就"卡死"了,不管怎么操作都没有反应。其实多半是误按下了`Ctrl + s`,我们的输入仍然有效,仍然会被执行,只是屏幕上没有回显而已,按`Ctrl + q`即可恢复。 19 | ``` 20 | 21 | ## 移动光标 22 | * `Ctrl + a` 移动到命令行首 23 | * `Ctrl + e` 移动到命令行尾 24 | * `Ctrl + f` 向行尾方向移动一个字符 25 | * `Ctrl + b` 向行首方向移动一个字符 26 | * `Esc + f` 向行尾方向移动一个单词(不包含符号) 27 | * `Esc + b` 向行首方向移动一个单词(不包含符号) 28 | * `Ctrl + xx` 在光标当前所处的位置和行首之间切换。 29 | 30 | ## 删除 31 | * `Ctrl + d` 删除光标当前位置的字符 32 | * `Ctrl + h` 删除光标前一个字符,相当于Windows键盘的Backspace或者Mac键盘的delete键 33 | 34 | ## 剪切与粘贴 35 | * `Ctrl + k` 从光标当前位置剪切到行尾 36 | * `Ctrl + u` 从光标当前位置剪切到行首 37 | * `Ctrl + w` 从光标当前位置向前剪切整个单词(包含符号) 38 | * `Esc + Backspace` 从当前位置向前剪切一个单词(不包含符号,Mac键盘为Esc + delete键) 39 | ```bash 40 | scrapy crawl university -a max_num=500 -t csv -o u.csv 41 | # 注意:假设此时光标在500后面,按下 Ctrl + w 后会将"max_num=500"整个删除;如果只想删除到"="符号之后,则按 Esc + Backspace 42 | ``` 43 | 44 | * `Esc + d` 从光标当前位置向后剪切一个单词(不包含符号) 45 | * `Ctrl + y` 将剪切板中的文本粘贴到当前光标之前 46 | 47 | ## 编辑 48 | * `Ctrl + -` 撤销上一步操作(注意没有反撤销操作,至少目前为止还没发现) 49 | * `Ctrl + t` 交换当前光标所处的字符与前一个字符 50 | * `Esc + t` 交换当前光标所处的单词与前一个单词(不包含符号) 51 | ```bash 52 | scrapy crawl university -a max_num=500 -t csv -o u.csv 53 | # 还是以scrapy命令为例,假设现在光标处于max_num中的"u"处,按下"Esc + t"后,max_num就会变成num_max 54 | 55 | ``` 56 | 57 | ## 修改大小写 58 | * `Esc + u` 将光标所处位置往后一个单词变为大写 59 | * `Esc + l` 将光标所处位置往后一个单词变为小写 60 | * `Esc + c` 将光标所处位置的字符变为大写,并将往后一个单词变为小写 61 | 62 | 63 | ## 历史记录 64 | * `history` 可以查看所有命令的历史记录 65 | ``` 66 | 该命令显示的内容与用户目录下的.bash_history文件大体对应,里面记录了每条命令的序号、完整的命令以及所有参数。 67 | ``` 68 | 69 | * `echo $HISTSIZE` 显示历史记录最大记录数量 70 | ``` 71 | HISTSIZE这个环境变量决定了内存中保存的历史记录的最大条数;.bash_history文件的最大行数则由HISTFILESIZE控制。 72 | ``` 73 | * `history -c` 清除所有的历史命令 74 | * `Ctrl + p` 上一条命令 75 | * `Ctrl + n` 下一条命令 76 | * `Ctrl + r` 进入历史记录逆向搜索模式 77 | * `Esc + r` 撤消所有对当前历史记录命令的修改 78 | * `Esc + .` 使用上一条命令的最后一个参数 79 | 80 | # 命令缩写 81 | 除了以上列出的快捷键,bash还支持下面这些快捷命令 82 | * `!!` 执行上一条命令 83 | * `!command` 执行上一条以"command"开头的命令 84 | * `^command^` 删除上一条命令中第一个"command"并执行(即把它替换为空) 85 | * `^command1^command2` 将上一条命令中第一个"command1"替换为"command2"并执行 86 | * `^command1^command2^` 作用与上一条相同(结尾的`^`可以省略);如果想替换所有的"command1",可以使用`!!:gs/command1/command2/` 87 | * `!$:p` 打印出上一条命令的最后一个参数,类似于上面介绍的"Esc + ."
88 | * `!*:p` 打印出上一条命令的所有参数 89 | ```bash 90 | :p 可以用在很多地方,表示将前面的命令只打印出来,不执行。比如: 91 | !!:p 打印出上一条命令 92 | !scrapy:p 打印出上一条以scrapy开头的命令 93 | ``` 94 | 95 | 96 | -------------------------------------------------------------------------------- /docs/pycharm配置scrapy运行.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guyecode/qianmu/92d7352bbe8d15a969267fe2c1e9cd49aee65707/docs/pycharm配置scrapy运行.pdf -------------------------------------------------------------------------------- /docs/scrapy.sh: -------------------------------------------------------------------------------- 1 | # 创建项目目录结构 2 | scrapy startproject qianmu 3 | 4 | cd qianmu 5 | 6 | # 生成spider文件(注意,当前目录要在项目的根目录下) 7 | # scrapy genspider 8 | scrapy genspider university qianmu.iguye.com 9 | 10 | # 使用scrapy配置下载目标页面,下载完成后在浏览器内打开 11 | scrapy view http://qianmu.iguye.com/2018USNEWS%E4%B8%96%E7%95%8C%E5%A4%A7%E5%AD%A6%E6%8E%92%E5%90%8D 12 | 13 | # 打印出settings文件指定配置的值,如果settings.py内没有指定,则打印出系统默认的值 14 | scrapy settings --get=BOT_NAME 15 | 16 | # 执行爬虫程序(参数为spider的名字) 17 | scrapy crawl university 18 | 19 | # 使用srapy下载器访问链接,并在控制台内输出页面源码 20 | scrapy fetch http://qianmu.iguye.com/2018USNEWS%E4%B8%96%E7%95%8C%E5%A4%A7%E5%AD%A6%E6%8E%92%E5%90%8D 21 | 22 | # 列出所有spider的名字 23 | scrapy list 24 | 25 | # 进入命令行模式 26 | scrapy shell http://qianmu.iguye.com/2018USNEWS%E4%B8%96%E7%95%8C%E5%A4%A7%E5%AD%A6%E6%8E%92%E5%90%8D 27 | 28 | # 使用spider的parse方法去处理链接的返回内容 29 | scrapy parse http://qianmu.iguye.com/2018USNEWS%E4%B8%96%E7%95%8C%E5%A4%A7%E5%AD%A6%E6%8E%92%E5%90%8D 30 | 31 | 32 | 33 | [s] scrapy scrapy模块 34 | [s] request request对象,有headers,url等等属性 35 | [s] response response对象,有xpath,方法,url、meta属性 36 | [s] settings settings.py文件内的内容,以字幕形式保存 37 | [s] spider spider对象,比如:UniversitySpider 38 | [s] Useful shortcuts: 39 | [s] fetch(url[, redirect=True]) 抓取某个链接,并将response,request对象重置为当前爬取的结果 40 | [s] fetch(req) 同上,但是以一个request对象作为参数 41 | [s] shelp() 打印可用的内置方法和变量。 42 | [s] view(response) 将当前的response的页面在浏览器内打开 43 | 44 | 45 | # 运行爬虫程序,最多抓取5个页面,并将结果存储到university.csv,文件格式设置为csv 46 | scrapy crawl university -L INFO -a max_num=5 -o university.csv -t csv 47 | # 打开excel,新建一个空白excel文件,然后选择文件>>导入,选择csv文件, 48 | # 点击导入,选择刚才的university.csv文件。 49 | # 在弹出的窗口中,选择文件来源为utf-8,然后顺着点下一步后点击完成。 -------------------------------------------------------------------------------- /docs/复制headers信息.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guyecode/qianmu/92d7352bbe8d15a969267fe2c1e9cd49aee65707/docs/复制headers信息.pdf -------------------------------------------------------------------------------- /qianmu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guyecode/qianmu/92d7352bbe8d15a969267fe2c1e9cd49aee65707/qianmu/__init__.py -------------------------------------------------------------------------------- /qianmu/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | def convert_int(s): 12 | if isinstance(s, int): 13 | return s 14 | if not s: 15 | return 0 16 | return int(s.strip().replace(',', '')) 17 | 18 | 19 | class UniversityItem(scrapy.Item): 20 | 21 | name = scrapy.Field() 22 | rank = scrapy.Field(serializer=convert_int) 23 | 
country = scrapy.Field() 24 | state = scrapy.Field() 25 | city = scrapy.Field() 26 | undergraduate_num = scrapy.Field() 27 | postgraduate_num = scrapy.Field() 28 | website = scrapy.Field() 29 | 30 | 31 | if __name__ == '__main__': 32 | u = UniversityItem(name='哈佛大学', rank=1) 33 | u['country'] = '美国' 34 | u['state'] = '马萨诸塞州' 35 | print(u) 36 | print(u['name']) 37 | 38 | # 将会打印出['country', 'state', 'name'],不包含未设置值的字段 39 | print(u.keys()) 40 | # 打印出所有定义过的字段名称 41 | print(u.fields.keys()) 42 | # 打印出所有的fields及其序列化函数 43 | print(u.fields) 44 | # 判断某个item对象是否包含指定字段 45 | print('undergraduate_num' in u.fields) 46 | # 判断某个字段是否设置了值 47 | print('name' in u) 48 | print('undergraduate_num' in u) 49 | 50 | # 复制另外一个Item对象的值 51 | u2 = UniversityItem(u) 52 | u2['undergraduate_num'] = 2345 53 | print(u2) 54 | print(u) 55 | 56 | # 将Item对象转换为字典对象 57 | u_dict = dict(u) 58 | print(type(u_dict)) 59 | # 从一个字典对象中创建item对象 60 | u3 = UniversityItem(u_dict) 61 | print(u3) 62 | 63 | # 如果设置一个未定义的字段,则会抛出KeyError异常 64 | u4 = UniversityItem({'unknow': 123}) 65 | -------------------------------------------------------------------------------- /qianmu/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware 10 | 11 | 12 | class QianmuSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, dict or Item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Response, dict 44 | # or Item objects. 45 | pass 46 | 47 | def process_start_requests(self, start_requests, spider): 48 | # Called with the start requests of the spider, and works 49 | # similarly to the process_spider_output() method, except 50 | # that it doesn’t have a response associated. 51 | 52 | # Must return only requests (not items). 
53 | for r in start_requests: 54 | yield r 55 | 56 | def spider_opened(self, spider): 57 | spider.logger.info('Spider opened: %s' % spider.name) 58 | -------------------------------------------------------------------------------- /qianmu/middlewares/proxy.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import random 3 | import logging 4 | from urllib.request import _parse_proxy 5 | from scrapy.exceptions import NotConfigured 6 | 7 | 8 | logger = logging.getLogger() 9 | 10 | def reform_url(url): 11 | # 将url解开成不同的部分 12 | proxy_type, *_, hostport = _parse_proxy(url) 13 | # 将代理URL重新组合,去掉用户名密码 14 | return '%s://%s' % (proxy_type, hostport) 15 | 16 | class RandomProxyMiddleware(object): 17 | 18 | def __init__(self, settings): 19 | # 从配置中读取配置到代理池中 20 | self.proxies = settings.getlist('PROXIES') 21 | # 从配置中读取最大失败次数配置 22 | self.max_failed = settings.getint('PROXY_MAX_FAILED', 3) 23 | # 创建一个字典,用来保存所有代理的失败次数,并将初始值设为0 24 | self.stats = {}.fromkeys(map(reform_url, self.proxies), 0) 25 | 26 | def random_proxy(self): 27 | # 从代理池中随机选择一个 28 | return random.choice(self.proxies) 29 | 30 | @classmethod 31 | def from_crawler(cls, crawler): 32 | # 判断配置中是否打开了代理 33 | if not crawler.settings.getbool('HTTPPROXY_ENABLED'): 34 | raise NotConfigured 35 | # 判断是否有PROXIES这项配置 36 | if not crawler.settings.getlist('PROXIES'): 37 | raise NotConfigured 38 | # 创建并返回一个中间件对象 39 | return cls(crawler.settings) 40 | 41 | def process_request(self, request, spider): 42 | # 如果request.meta中没有设置proxy,则在proxies中随机设置一个proxy 43 | if 'proxy' not in request.meta: 44 | request.meta['proxy'] = self.random_proxy() 45 | 46 | def process_response(self, request, response, spider): 47 | # 本次请求使用的代理 48 | cur_proxy = request.meta['proxy'] 49 | # 判断如果本次请求的status是400以上,则将本次使用的代理失败次数+1 50 | if response.status >= 400: 51 | self.stats[cur_proxy] += 1 52 | # 如果失败次数超过了最大失败次数,则将该代理从代理池中删除 53 | if self.stats[cur_proxy] > self.max_failed: 54 | for proxy in self.proxies: 55 | if reform_url(proxy) == cur_proxy: 56 | self.proxies.remove(proxy) 57 | break 58 | logger.warn('proxy %s remove from proxies list' % cur_proxy) 59 | # 返回response对象以便后续的中间件继续执行 60 | return response 61 | 62 | 63 | -------------------------------------------------------------------------------- /qianmu/middlewares/useragent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import faker 3 | 4 | class RandomUserAgentMiddleware(object): 5 | """该中间件负责给每个请求随机分配user agent""" 6 | 7 | def __init__(self, settings): 8 | self.faker = faker.Faker() 9 | 10 | @classmethod 11 | def from_crawler(cls, crawler): 12 | return cls(crawler.settings) 13 | 14 | def process_request(self, request, spider): 15 | request.headers['User-Agent'] = self.faker.user_agent() -------------------------------------------------------------------------------- /qianmu/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import pymysql 5 | import pymysql.cursors 6 | import logging 7 | import redis 8 | from twisted.enterprise import adbapi 9 | from scrapy.exceptions import DropItem 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | logger.setLevel('DEBUG') 14 | 15 | 16 | class CheckPipeline(object): 17 | """CheckPipeline验证每个item的关键属性是否为空""" 18 | 19 | def open_spider(self, spider): 20 | """当spider被开启时,这个方法被调用""" 21 | logger.info('spider %s opened' % spider.name) 22 | 23 | def close_spider(self, spider): 24 | 
"""当spider被关闭时,这个方法被调用""" 25 | logger.info('spider %s closed' % spider.name) 26 | 27 | def process_item(self, item, spider): 28 | """每个item pipeline组件都需要调用该方法,这个方法必须返回一个 Item (或任何继承类)对象, 29 | 或是抛出 DropItem 异常,被丢弃的item将不会被之后的pipeline组件所处理。 30 | @param item (Item 对象) – 被爬取的item 31 | @param spider (Spider 对象) – 爬取该item的spider 32 | """ 33 | if not item.get('undergraduate_num'): 34 | # 如果缺失undergraduate_num属性,丢弃该item 35 | raise DropItem("Missing undergraduate in %s" % item['name']) 36 | if not item.get('postgraduate_num'): 37 | # 如果缺失undergraduate_num属性,丢弃该item 38 | raise DropItem("Missing postgraduate_num in %s" % item['name']) 39 | # 如果数据完整,返回item对象供之后的pipeline进行处理 40 | return item 41 | 42 | 43 | class RedisPipeline(object): 44 | 45 | def __init__(self): 46 | self.r = redis.Redis() 47 | 48 | def process_item(self, item, spider): 49 | # 将爬取到的大学名字添加到redis的一个set中 50 | self.r.sadd(spider.name, item['name']) 51 | logger.info('redis: add %s to list %s' % (item['name'], spider.name)) 52 | return item 53 | 54 | 55 | class MysqlPipeline(object): 56 | 57 | def __init__(self): 58 | self.conn = None 59 | self.cur = None 60 | 61 | def open_spider(self, spider): 62 | # 初始化mysql连接 63 | self.conn = pymysql.connect( 64 | host='localhost', 65 | port=3306, 66 | user='root', 67 | passwd='', 68 | db='qianmu', 69 | charset='utf8') 70 | # 初始化游标对象 71 | self.cur = self.conn.cursor() 72 | 73 | def process_item(self, item, spider): 74 | # 将item重组成keys,values的形式,并一一对应 75 | cols, values = zip(*item.items()) 76 | # 拼装SQL语句 77 | sql = "INSERT INTO `universities`(%s) VALUES (%s)" % \ 78 | (','.join(cols), ','.join(['%s'] * len(cols))) 79 | # 执行并commit 80 | self.cur.execute(sql, values) 81 | self.conn.commit() 82 | # 打印出刚才执行的SQL语句 83 | logger.info(self.cur._last_executed) 84 | logger.info('mysql: add %s to universities' % item['name']) 85 | # 返回item以便后续的pipeline处理 86 | return item 87 | 88 | def close_spider(self, spider): 89 | """当spider被关闭时,这个方法被调用""" 90 | self.cur.close() 91 | self.conn.close() -------------------------------------------------------------------------------- /qianmu/process_item.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import sys 3 | import json 4 | import logging 5 | import redis 6 | from pipelines import MysqlPipeline 7 | 8 | r = redis.Redis() 9 | logging.basicConfig() 10 | logger = logging.getLogger('pipelines') 11 | logger.info('begin to process item...') 12 | 13 | def get_item(spider): 14 | key = '%s:items' % spider 15 | item = r.blpop(key) 16 | if item: 17 | return json.loads(item[1]) 18 | 19 | 20 | if __name__ == '__main__': 21 | if len(sys.argv) < 2: 22 | logger.info('need spider name') 23 | exit(1) 24 | spider = sys.argv[1] 25 | 26 | db = MysqlPipeline() 27 | db.open_spider(spider) 28 | item = get_item(spider) 29 | while item: 30 | db.process_item(item, spider) 31 | item = get_item(spider) 32 | db.close_spider(spider) 33 | -------------------------------------------------------------------------------- /qianmu/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for qianmu project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'qianmu' 13 | 14 | SPIDER_MODULES = ['qianmu.spiders'] 15 | NEWSPIDER_MODULE = 'qianmu.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; PPC Mac OS X 10_6_0) AppleWebKit/5352 (KHTML, like Gecko) Chrome/14.0.881.0 Safari/5352' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | CONCURRENT_REQUESTS = 32 24 | DOWNLOAD_DELAY = 0 25 | COOKIES_ENABLED = False 26 | 27 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 28 | #CONCURRENT_REQUESTS = 32 29 | 30 | # Configure a delay for requests for the same website (default: 0) 31 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 32 | # See also autothrottle settings and docs 33 | #DOWNLOAD_DELAY = 3 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | #COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | DEFAULT_REQUEST_HEADERS = { 46 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 48 | 'Content-Encoding': 'gzip, deflate', 49 | 'Content-Type': 'text/html; charset=UTF-8', 50 | } 51 | 52 | # Enable or disable spider middlewares 53 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 54 | #SPIDER_MIDDLEWARES = { 55 | # 'qianmu.middlewares.QianmuSpiderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable downloader middlewares 59 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 60 | # Enable or disable downloader middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | DOWNLOADER_MIDDLEWARES = { 63 | # 打开下载器中间件 64 | 'qianmu.middlewares.useragent.RandomUserAgentMiddleware': 500, 65 | #关闭下载器中间件 66 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 67 | # 添加自定义的proxy中间件,注意优先级要恰好在系统的proxy中间件之前,使用settings['DOWNLOADER_MIDDLEWARES_BASE']查看 68 | 'qianmu.middlewares.proxy.RandomProxyMiddleware': 749, 69 | } 70 | 71 | # 使用代理 72 | HTTPPROXY_ENABLED = True 73 | PROXIES = [ 74 | 'http://ms0108:ms0108@182.84.98.201:888', 75 | 'http://ms0108:ms0108@117.41.187.112:888', 76 | 'http://ms0108:ms0108@210.16.189.75:888', 77 | 'http://ms0108:ms0108@1.82.230.108:888', 78 | 'http://ms0108:ms0108@117.41.184.182:888', 79 | 'http://ms0108:ms0108@222.73.48.188:888', 80 | 'http://ms0108:ms0108@103.21.142.201:888', 81 | ] 82 | 83 | # Enable or disable extensions 84 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 85 | #EXTENSIONS = { 86 | # 'scrapy.extensions.telnet.TelnetConsole': None, 87 | #} 88 | 89 | # Configure item pipelines 90 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 91 | ITEM_PIPELINES = { 92 | 'qianmu.pipelines.CheckPipeline': 300, 93 | 'qianmu.pipelines.RedisPipeline': 301, 94 | 'qianmu.pipelines.MysqlPipeline': 302, 95 | } 96 | 97 | # Enable and configure the AutoThrottle extension (disabled by 
default) 98 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 99 | #AUTOTHROTTLE_ENABLED = True 100 | # The initial download delay 101 | #AUTOTHROTTLE_START_DELAY = 5 102 | # The maximum download delay to be set in case of high latencies 103 | #AUTOTHROTTLE_MAX_DELAY = 60 104 | # The average number of requests Scrapy should be sending in parallel to 105 | # each remote server 106 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 107 | # Enable showing throttling stats for every response received: 108 | #AUTOTHROTTLE_DEBUG = False 109 | 110 | # Enable and configure HTTP caching (disabled by default) 111 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 112 | #HTTPCACHE_ENABLED = True 113 | #HTTPCACHE_EXPIRATION_SECS = 0 114 | #HTTPCACHE_DIR = 'httpcache' 115 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 116 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 117 | -------------------------------------------------------------------------------- /qianmu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /qianmu/spiders/u2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy import Request 4 | from scrapy_redis.spiders import RedisSpider 5 | from w3lib.html import remove_tags 6 | from qianmu.items import UniversityItem 7 | 8 | def filter(html): 9 | """过滤网页源码中的特殊符号和sup标签""" 10 | return remove_tags(html, which_ones=('sup',)).replace('\n', '')\ 11 | .replace('\r', '').replace('\t', '') 12 | 13 | 14 | class U2Spider(RedisSpider): 15 | name = 'u2' 16 | # allowed_domains = ['qianmu.iguye.com'] 17 | # start_urls = ['http://qianmu.iguye.com/2018USNEWS%E4%B8%96%E7%95%8C%E5%A4%A7%E5%AD%A6%E6%8E%92%E5%90%8D'] 18 | 19 | def __init__(self, max_num=0, *args, **kwargs): 20 | # 重载父类的构造函数,先调用父类的构造函数 21 | super(U2Spider, self).__init__(*args, **kwargs) 22 | # 然后执行自己的操作, 设置自定义传入的参数 23 | self.max_num = int(max_num) 24 | 25 | def parse(self, response): 26 | # 选择出排名表格里第2行开始的所有行的第2列里的超链接,也就是所有大学的链接 27 | links = response.xpath("//div[@id='content']//tr[position()>1]/td[2]/a/@href").extract() 28 | # 循环这些链接,同时使用enumerate函数,列出每一个链接在列表中的索引 29 | for i, link in enumerate(links): 30 | # 根据自定义的参数判断,如果抓取的链接数量超过max_num,则不再抓取 31 | if self.max_num and i >= self.max_num: 32 | break 33 | # 有个别坏掉的链接,加一下判断 34 | if not link.startswith('http://'): 35 | link = 'http://qianmu.iguye.com/%s' % link 36 | # 创建一个request对象,并将parse_university设置为它的回调函数 37 | request = Request(link, callback=self.parse_university) 38 | # 使用meta属性在函数之间传递参数 39 | request.meta['rank'] = i + 1 40 | yield request 41 | 42 | def parse_university(self, response): 43 | # 使用filter函数过滤网页中的特殊符号,因为response的属性是只读的,所以我们使用replace方法来重新生成一个response对象 44 | response = response.replace(body=filter(response.body)) 45 | # 选择出父节点,缩减xpath表达式的重复编码 46 | wiki_content = response.xpath('//div[@id="wikiContent"]')[0] 47 | # 定义一个Item对象,并设置name,rank的值 48 | item = UniversityItem( 49 | name=wiki_content.xpath('./h1[@class="wikiTitle"]/text()').get(), 50 | rank=response.meta['rank']) 51 | # 取出表格中左列的文本 52 | keys = wiki_content.xpath('./div[@class="infobox"]/table//tr/td[1]/p/text()').extract() 53 | # 取出表格中右列单元格节点list 54 
| cols = wiki_content.xpath('./div[@class="infobox"]/table//tr/td[2]') 55 | # 循环上步得到的单元格节点list,并取出每个单元格中的文本,这样做是为了解决有些单元格存在多个p标签的问题 56 | values = [','.join(col.xpath('.//text()').extract()) for col in cols] 57 | # 将左、右单元格组成一个字典 58 | data = dict(zip(keys, values)) 59 | # 从字典中的值设置到item相应的属性中 60 | item['country'] = data.get(u'国家', '') 61 | item['state'] = data.get(u'州省', '') 62 | item['city'] = data.get(u'城市', '') 63 | item['undergraduate_num'] = data.get(u'本科生人数', '') 64 | item['postgraduate_num'] = data.get(u'研究生人数', '') 65 | item['website'] = data.get(u'网址', '') 66 | # 使用内置logger对象记录日志 67 | self.logger.info(u'%s scraped' % item['name']) 68 | yield item 69 | -------------------------------------------------------------------------------- /qianmu/spiders/university.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy import Request 4 | from w3lib.html import remove_tags 5 | from qianmu.items import UniversityItem 6 | 7 | def filter(html): 8 | """过滤网页源码中的特殊符号和sup标签""" 9 | return remove_tags(html, which_ones=('sup',)).replace('\n', '')\ 10 | .replace('\r', '').replace('\t', '') 11 | 12 | 13 | class UniversitySpider(scrapy.Spider): 14 | name = 'university' 15 | # 定义允许访问的域名列表,任何不符合以下域名的链接都不会被下载 16 | allowed_domains = ['qianmu.iguye.com'] 17 | # 入口页面的URL 18 | start_urls = ['http://qianmu.iguye.com/2018USNEWS世界大学排名'] 19 | 20 | def __init__(self, max_num=0, *args, **kwargs): 21 | # 重载父类的构造函数,先调用父类的构造函数 22 | super(UniversitySpider, self).__init__(*args, **kwargs) 23 | # 然后执行自己的操作, 设置自定义传入的参数 24 | self.max_num = int(max_num) 25 | 26 | def parse(self, response): 27 | # 选择出排名表格里第2行开始的所有行的第2列里的超链接,也就是所有大学的链接 28 | links = response.xpath("//div[@id='content']//tr[position()>1]/td[2]/a/@href").extract() 29 | # 循环这些链接,同时使用enumerate函数,列出每一个链接在列表中的索引 30 | for i, link in enumerate(links): 31 | # 根据自定义的参数判断,如果抓取的链接数量超过max_num,则不再抓取 32 | if self.max_num and i >= self.max_num: 33 | break 34 | # 有个别坏掉的链接,加一下判断 35 | if not link.startswith('http://'): 36 | link = 'http://qianmu.iguye.com/%s' % link 37 | # 创建一个request对象,并将parse_university设置为它的回调函数 38 | request = Request(link, callback=self.parse_university) 39 | # 使用meta属性在函数之间传递参数 40 | request.meta['rank'] = i + 1 41 | yield request 42 | 43 | def parse_university(self, response): 44 | # 使用filter函数过滤网页中的特殊符号,因为response的属性是只读的,所以我们使用replace方法来重新生成一个response对象 45 | response = response.replace(body=filter(response.body)) 46 | # 选择出父节点,缩减xpath表达式的重复编码 47 | wiki_content = response.xpath('//div[@id="wikiContent"]')[0] 48 | # 定义一个Item对象,并设置name,rank的值 49 | item = UniversityItem( 50 | name=wiki_content.xpath('./h1[@class="wikiTitle"]/text()').get(), 51 | rank=response.meta['rank']) 52 | # 取出表格中左列的文本 53 | keys = wiki_content.xpath('./div[@class="infobox"]/table//tr/td[1]/p/text()').extract() 54 | # 取出表格中右列单元格节点list 55 | cols = wiki_content.xpath('./div[@class="infobox"]/table//tr/td[2]') 56 | # 循环上步得到的单元格节点list,并取出每个单元格中的文本,这样做是为了解决有些单元格存在多个p标签的问题 57 | values = [','.join(col.xpath('.//text()').extract()) for col in cols] 58 | # 将左、右单元格组成一个字典 59 | data = dict(zip(keys, values)) 60 | # 从字典中的值设置到item相应的属性中 61 | item['country'] = data.get(u'国家', '') 62 | item['state'] = data.get(u'州省', '') 63 | item['city'] = data.get(u'城市', '') 64 | item['undergraduate_num'] = data.get(u'本科生人数', '') 65 | item['postgraduate_num'] = data.get(u'研究生人数', '') 66 | item['website'] = data.get(u'网址', '') 67 | # 使用内置logger对象记录日志 68 | self.logger.info(u'%s scraped' % item['name']) 69 | yield item 70 | 
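The infobox parsing in `parse_university()` above can be exercised without crawling the live site. Below is a minimal sketch (not a file in this repository) that runs the project's own `filter()` helper and the same XPath expressions against a hand-written HTML fragment; the fragment and its values are invented for illustration, and the snippet assumes Scrapy is installed and that it is run from the project root so the `qianmu` package is importable.

```python
# Sketch: check the filter() helper and the infobox XPath logic used in
# parse_university() against an inline HTML fragment (illustrative data only).
from scrapy import Selector

from qianmu.spiders.university import filter

SAMPLE_HTML = """
<div id="wikiContent">
  <h1 class="wikiTitle">哈佛大学</h1>
  <div class="infobox"><table>
    <tr><td><p>国家</p></td><td><p>美国</p></td></tr>
    <tr><td><p>州省</p></td><td><p>马萨诸塞州</p></td></tr>
    <tr><td><p>本科生人数</p></td><td><p>6,700</p><p>(2017)</p></td></tr>
  </table></div>
</div>
"""

# filter() strips <sup> tags and newline characters, just as the spider does
# before building its selectors.
sel = Selector(text=filter(SAMPLE_HTML))
wiki = sel.xpath('//div[@id="wikiContent"]')[0]

# Same expressions as parse_university(): the left column gives the keys,
# the right column cells give the values (all text nodes per cell are joined).
keys = wiki.xpath('./div[@class="infobox"]/table//tr/td[1]/p/text()').extract()
cols = wiki.xpath('./div[@class="infobox"]/table//tr/td[2]')
values = [','.join(col.xpath('.//text()').extract()) for col in cols]

print(dict(zip(keys, values)))
# expected (roughly): {'国家': '美国', '州省': '马萨诸塞州', '本科生人数': '6,700,(2017)'}
```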
-------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = qianmu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = qianmu 12 | --------------------------------------------------------------------------------
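Besides the `scrapy crawl` invocations listed in docs/scrapy.sh, the project can also be started from a plain Python script; the `[settings]` entry in scrapy.cfg is what lets `get_project_settings()` locate `qianmu/settings.py`. The snippet below is a minimal sketch, not a file shipped with the repository: the `run.py` name is made up, and it assumes Scrapy is installed and the script sits in the project root next to scrapy.cfg. A real run would also need the local Redis and MySQL services expected by the pipelines enabled in settings.py.

```python
# run.py (hypothetical): programmatic equivalent of
#   scrapy crawl university -a max_num=5
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from qianmu.spiders.university import UniversitySpider

if __name__ == '__main__':
    # get_project_settings() reads qianmu/settings.py (found via scrapy.cfg),
    # so the same middlewares and pipelines are active as with `scrapy crawl`.
    process = CrawlerProcess(get_project_settings())
    # Keyword arguments are forwarded to UniversitySpider.__init__,
    # just like `-a max_num=5` on the command line.
    process.crawl(UniversitySpider, max_num=5)
    process.start()  # blocks until the crawl finishes
```

For deployment to a long-running Scrapyd service, the commented-out `url` in the `[deploy]` section of scrapy.cfg is where the Scrapyd instance would be configured instead.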