├── README.md
├── __init__.py
├── __pycache__
│   ├── __init__.cpython-38.pyc
│   ├── items.cpython-38.pyc
│   ├── loaders.cpython-38.pyc
│   ├── middlewares.cpython-38.pyc
│   ├── pipelines.cpython-38.pyc
│   └── settings.cpython-38.pyc
├── items.py
├── loaders.py
├── middlewares.py
├── pipelines.py
├── settings.py
└── spiders
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-38.pyc
    │   ├── move.cpython-38.pyc
    │   └── top250.cpython-38.pyc
    ├── move.py
    └── top250.py

/README.md:
--------------------------------------------------------------------------------
# Project Name

Python-based crawling and analysis of the Douban Top 250 movie chart: graduation thesis, proposal, defense PPT, video walkthrough, and project source code with run results

# System Introduction

With the rapid development of the internet in the "Internet Plus" era, big-data mining and analysis have become research hotspots in both industry and academia. Data mining can uncover previously unknown, potentially useful patterns or rules and turn them into valuable information and knowledge that helps decision makers act quickly and appropriately. No analysis is possible without data, however: before any mining can begin the data has to be collected, and Python crawlers are currently one of the most widely used ways of obtaining large-scale data from the web. To help users choose films, this project uses Python's Scrapy framework to design and implement the collection, cleaning, and local storage of movie data from the Douban movie site. The scraped reviews are then processed with Pandas and NumPy, rendered as word clouds with WordCloud to give users a feel for each film, and Matplotlib and Pygal are used to chart films by rating and popularity.

# Environment

1. Runtime: Python 3.8 (the `.pyc` files in this repository were compiled by CPython 3.8; nearby 3.x versions should also work).
2. Libraries: Scrapy for the crawler; Pandas, NumPy, WordCloud, Matplotlib, and Pygal for the analysis described above.
3. Hardware/OS: Windows 7/8/10 with 1 GB of RAM or more, or macOS.

# Tech Stack

1. Crawler: Scrapy (CrawlSpider + ItemLoader + item pipelines + downloader middleware)
2. Analysis and visualization: Pandas + NumPy + WordCloud + Matplotlib + Pygal

# Usage

1. Install Python 3.8 and the libraries listed above.
2. From the project root, run the ranking spider first: `scrapy crawl top250`. It writes `DouBanTop250/top250.json` and a folder for each film (poster image plus a details text file) under `DouBanTop250/Result/`.
3. Then run the review spider: `scrapy crawl move`. It reads `top250.json` and appends each film's short reviews to `DouBanTop250/Result/<film>/<film>影评.txt`.
4. Run the analysis on the collected data; a word-cloud sketch is shown right after this README.

# HD Video Demo

https://www.bilibili.com/video/BV1ea411Q7iM/
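The word-cloud step described in the System Introduction is not included in the Scrapy project itself. The sketch below shows one way it could be done, assuming the review files written by `pipelines.py`, a font with CJK glyphs, and the `wordcloud` and `jieba` packages; the film name is only an example.

```python
# word_cloud_sketch.py: hypothetical helper, not part of this repository.
import jieba                     # assumed dependency: Chinese word segmentation
from wordcloud import WordCloud  # assumed dependency: word-cloud rendering

film = '肖申克的救赎'  # example: any folder name under DouBanTop250/Result/
review_path = 'DouBanTop250/Result/{0}/{0}影评.txt'.format(film)  # path written by pipelines.py

with open(review_path, encoding='utf-8') as f:
    text = f.read()

# WordCloud tokenizes on whitespace, so segment the Chinese text first.
words = ' '.join(jieba.cut(text))

# font_path must point to a font that contains CJK glyphs (e.g. simhei.ttf on Windows).
cloud = WordCloud(font_path='simhei.ttf', width=800, height=600,
                  background_color='white').generate(words)
cloud.to_file('DouBanTop250/Result/{0}/{0}_wordcloud.png'.format(film))
```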
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__init__.py
--------------------------------------------------------------------------------
/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/items.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/items.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/loaders.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/loaders.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/middlewares.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/middlewares.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/pipelines.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/pipelines.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/settings.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/settings.cpython-38.pyc
--------------------------------------------------------------------------------
/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MoveItem(scrapy.Item):
    # Film name
    move_name = scrapy.Field()
    # Short reviews of the film
    move_inq = scrapy.Field()


class Doubantop250Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Ranking number
    number = scrapy.Field()
    # Film name
    name = scrapy.Field()
    # Alternative title
    name_two = scrapy.Field()
    # Rating
    score = scrapy.Field()
    # Number of ratings
    comment = scrapy.Field()
    # Poster image (URL)
    image_urls = scrapy.Field()
    # Director
    director = scrapy.Field()
    # Screenwriter
    scenarist = scrapy.Field()
    # Lead actors
    lead = scrapy.Field()
    # Release date
    time = scrapy.Field()
    # Country/region
    place = scrapy.Field()
    # Genres
    classify = scrapy.Field()
    # Main genre
    maintypes = scrapy.Field()
    # Language
    language = scrapy.Field()
    # Runtime
    length = scrapy.Field()
    # Synopsis
    synopsis = scrapy.Field()
    # Link to the short-review page
    inq_url = scrapy.Field()

--------------------------------------------------------------------------------
/loaders.py:
--------------------------------------------------------------------------------
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, Compose


class NewsLoader(ItemLoader):
    # Join every extracted list into one comma-separated string, then strip
    # surrounding spaces/newlines. (MapCompose, by contrast, would process
    # each list element individually.)
    default_output_processor = Compose(Join(','), lambda s: s.strip(' \n'))
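This loader is what flattens every multi-valued XPath extraction into a single string before it is stored on the item. A minimal stand-alone illustration of that behaviour (the sample values are made up) could look like:

```python
# Illustration only: how NewsLoader's default output processor behaves.
from scrapy.loader.processors import Compose, Join

processor = Compose(Join(','), lambda s: s.strip(' \n'))

# A multi-valued extraction such as the genre field...
print(processor(['剧情', '犯罪']))        # -> '剧情,犯罪'
# ...and a single-valued one with stray whitespace.
print(processor([' 肖申克的救赎\n']))     # -> '肖申克的救赎'
```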
--------------------------------------------------------------------------------
/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random

from scrapy import signals


class Doubantop250SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class Doubantop250DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


# Attach a randomly chosen User-Agent to every outgoing request.
# Enabled via DOWNLOADER_MIDDLEWARES in settings.py.
class RandomUserAgentMiddleware():
    def __init__(self):
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
        ]

    def process_request(self, request, spider):
        # Pick a different User-Agent string for each request.
        item_ug = random.choice(self.user_agents)
        try:
            request.headers['User-Agent'] = item_ug
        except Exception as e:
            print(e)
            pass
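RandomUserAgentMiddleware only rewrites the User-Agent header, so it can be exercised outside a crawl. A small sketch, assuming the project package is importable as `DouBanTop250` (e.g. when run from the project root):

```python
# Sketch: exercise RandomUserAgentMiddleware in isolation.
from scrapy.http import Request
from DouBanTop250.middlewares import RandomUserAgentMiddleware

middleware = RandomUserAgentMiddleware()
request = Request('https://movie.douban.com/top250')
middleware.process_request(request, spider=None)

# The request now carries one of the user-agent strings from the pool.
print(request.headers['User-Agent'])
```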
--------------------------------------------------------------------------------
/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class Doubantop250Pipeline(object):
    def open_spider(self, spider):
        # Only the ranking spider writes top250.json: (re)create the file and
        # write the opening bracket of the JSON array.
        if spider.name == 'top250':
            self.ranking = open('DouBanTop250/top250.json', 'wb+')
            self.ranking.write('[\n'.encode('utf-8'))

    def process_item(self, item, spider):
        if spider.name == 'top250':
            # Write one <film>/<film>.txt file per movie with all scraped fields.
            # The per-film folder is created by ImagePipeline, which runs first.
            move_name = item['name']
            if len(move_name.split(' ')) > 1:
                move_name = move_name.split(' ')[0]
            with open(r'DouBanTop250/Result/' + move_name + '/' + move_name + '.txt', 'a', encoding='utf-8') as f:
                r_list = ['电影排名', '电影名字', '电影又名', '电影评分', '评论人数', '导演', '编剧', '主演', '上映时间',
                          '地区', '分类', '主要类型', '语言', '片长', '简介', '短评链接']
                s_list = ['number', 'name', 'name_two', 'score', 'comment', 'director', 'scenarist', 'lead', 'time',
                          'place', 'classify', 'maintypes', 'language', 'length', 'synopsis', 'inq_url']
                for i in range(len(r_list)):
                    f.write(r_list[i] + ':' + str(item[s_list[i]]) + '\n')
                f.write('\n')
            # Append the item to top250.json as one element of the JSON array.
            text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
            self.ranking.write(text.encode('utf-8'))
            return item
        else:
            # Review spider: append each short review to <film>影评.txt.
            with open('DouBanTop250/Result/' + item['move_name'] + '/' + item['move_name'] + '影评.txt', 'a+',
                      encoding='utf-8') as f:
                for i in item['move_inq']:
                    f.write(i + '\n')
            return item

    def close_spider(self, spider):
        if spider.name == 'top250':
            # Step back over the trailing ",\n" so the closing bracket yields valid JSON.
            self.ranking.seek(-2, 1)
            self.ranking.write('\n]'.encode('utf-8'))
            self.ranking.close()


class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['image_urls'], meta={'item': item})

    def file_path(self, request, response=None, info=None):
        '''Path (relative to IMAGES_STORE) where the poster image is saved.'''
        item = request.meta['item']
        img_name = item["name"]
        if len(img_name.split(' ')) > 1:
            img_name = img_name.split(' ')[0]
        path = '/' + img_name + '/' + img_name + '.jpg'
        return path

    def item_completed(self, results, item, info):
        '''Called once the download result is in; return the item unchanged.'''
        return item
--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for DouBanTop250 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'DouBanTop250'

SPIDER_MODULES = ['DouBanTop250.spiders']
NEWSPIDER_MODULE = 'DouBanTop250.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Raise the concurrency ceiling for throughput
CONCURRENT_REQUESTS = 300
# Lower the log level to reduce CPU usage
LOG_LEVEL = 'INFO'
# Disable retries to speed up the crawl
RETRY_ENABLED = False
# Shorten the download timeout so stalled requests are abandoned quickly
# DOWNLOAD_TIMEOUT = 30

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Add a download delay to avoid getting the IP banned
DOWNLOAD_DELAY = 1.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Per-IP concurrency limit
CONCURRENT_REQUESTS_PER_IP = 500

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'DouBanTop250.middlewares.Doubantop250SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'DouBanTop250.middlewares.RandomUserAgentMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'DouBanTop250.pipelines.Doubantop250Pipeline': 400,
    'DouBanTop250.pipelines.ImagePipeline': 300,
}
# Where downloaded poster images are stored
IMAGES_STORE = './DouBanTop250/Result'
# Item field that holds the image URL
IMAGES_URLS_FIELD = 'image_urls'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/spiders/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/spiders/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/spiders/__pycache__/move.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/spiders/__pycache__/move.cpython-38.pyc
--------------------------------------------------------------------------------
/spiders/__pycache__/top250.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/spiders/__pycache__/top250.cpython-38.pyc
--------------------------------------------------------------------------------
/spiders/move.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request

from ..items import MoveItem


class MoveSpider(scrapy.Spider):
    name = 'move'
    allowed_domains = ['movie.douban.com']
    start_urls = []
    custom_settings = {
        'ITEM_PIPELINES': {
            'DouBanTop250.pipelines.Doubantop250Pipeline': 300,
        }
    }

    def start_requests(self):
        # Read the ranking written by the top250 spider and request each
        # film's short-review page.
        with open('DouBanTop250/top250.json', 'r', encoding='utf-8') as f:
            for i in json.loads(f.read()):
                move_name = i['name']
                if len(move_name.split(' ')) > 1:
                    move_name = move_name.split(' ')[0]
                yield Request(meta={'name': move_name}, url=i['inq_url'], callback=self.parse)

    def parse(self, response):
        # Link to the next review page, with the percent_type filter stripped off.
        next_url = str(response.xpath('//*[@id="paginator"]/a[@class="next"]/@href').extract_first()).split('&percent_type=')[0]
        reviews = MoveItem()
        reviews['move_inq'] = response.xpath('//*[@id="comments"]/div[@class="comment-item"]/div/p/span/text()').extract()
        move_name = response.meta['name']
        reviews['move_name'] = str(move_name)
        yield reviews
        if next_url == 'None':
            pass
        else:
            # Follow only the first pages of reviews (start offset below 201).
            if float(next_url.split('&limit=20')[0].split('?start=')[-1]) < 201:
                yield Request(url=response.urljoin(next_url), callback=self.parse, dont_filter=True,
                              meta={'name': move_name})
--------------------------------------------------------------------------------
/spiders/top250.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader.processors import TakeFirst

from ..items import Doubantop250Item
from ..loaders import NewsLoader


class Top250Spider(CrawlSpider):
    name = 'top250'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250?start=0&filter=']

    rules = (
        # Follow every film detail page linked from the list page.
        Rule(LinkExtractor(allow='subject\/\d*\/', restrict_xpaths='//div[@class="info"]//div'),
             callback='top250_parse_item'),
        # Pagination: follow the "后页>" (next page) link.
        Rule(LinkExtractor(restrict_xpaths='//span[@class="next"]//a[contains(.,"后页>")]')),
    )

    def top250_parse_item(self, response):
        loader = NewsLoader(item=Doubantop250Item(), response=response)
        loader.add_xpath('name', './/*[@id="content"]/h1/span[@property="v:itemreviewed"]/text()', TakeFirst())  # film name
        loader.add_xpath('image_urls', './/*[@id="mainpic"]/a/img/@src', TakeFirst())  # poster URL
        loader.add_xpath('number', './/*[@id="content"]/div[1]/span[@class="top250-no"]/text()')  # ranking
        loader.add_xpath('score', './/*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')  # rating
        loader.add_xpath('comment', './/*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')  # number of ratings
        loader.add_xpath('director', './/*[@id="info"]/span[1]/span[2]/a[@rel="v:directedBy"]/text()')  # director
        loader.add_xpath('scenarist', './/*[@id="info"]/span[2]/span[2]/a/text()')  # screenwriter
        # If add_xpath finds nothing, the field is missing from the item and the
        # pipeline's item[...] lookup raises KeyError; uncommenting the add_value
        # lines below fills in 'Null' instead to avoid that.
        # loader.add_value('scenarist', 'Null')
        loader.add_xpath('lead', './/*[@id="info"]/span[@class="actor"]//a/text()')  # lead actors
        # loader.add_value('lead', 'Null')
        loader.add_xpath('classify', './/*[@id="info"]/span[@property="v:genre"]/text()')  # genres
        # The "better than xx% of <genre> films" links; after sorting, take the
        # largest share as the film's main genre (used later in the analysis).
        movetypes = response.xpath('//*[@id="interest_sectl"]/div[2]/a/text()').extract()
        movetypes.sort()
        move_maintypes = movetypes[-1].split('% ')[-1].strip('片')
        loader.add_value('maintypes', move_maintypes)
        onelist = ['place', 'language', 'name_two']
        twolist = ["制片国家/地区:", "语言:", "又名:"]
        for x, y in zip(onelist, twolist):
            loader.add_xpath(x, './/*[@id="info"]/span[contains(text(),"' + y + '")]/following-sibling::text()',
                             TakeFirst())
        # loader.add_value('name_two', 'Null')
        loader.add_xpath('time', './/*[@id="info"]/span[@property="v:initialReleaseDate"]/text()')  # release date
        loader.add_xpath('length', './/*[@id="info"]/span[@property="v:runtime"]/text()')  # runtime
        loader.add_xpath('synopsis', './/*[@id="link-report"]//span[@property="v:summary"]',
                         re='(?<=\u3000\u3000).*?(?=\n)')  # synopsis
        loader.add_xpath('inq_url', './/*[@id="comments-section"]//h2/span[@class="pl"]/a/@href', TakeFirst())  # short-review page link
        yield loader.load_item()

--------------------------------------------------------------------------------
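The README also mentions charting films by rating and popularity with Matplotlib and Pygal. A small sketch of that step, assuming `top250.json` has already been produced by the `top250` spider and that `pandas` and `matplotlib` are installed (the column handling below is an assumption about the scraped strings), might look like:

```python
# chart_sketch.py: hypothetical analysis helper, not part of this repository.
import json

import pandas as pd
import matplotlib.pyplot as plt

# Load the ranking written by Doubantop250Pipeline.
with open('DouBanTop250/top250.json', encoding='utf-8') as f:
    films = pd.DataFrame(json.load(f))

# score and comment are scraped as strings; coerce them to numbers.
films['score'] = pd.to_numeric(films['score'], errors='coerce')
films['comment'] = pd.to_numeric(films['comment'], errors='coerce')

# Top 10 by rating and top 10 by number of ratings.
top_rated = films.nlargest(10, 'score')
most_rated = films.nlargest(10, 'comment')

# Note: rendering Chinese titles may require configuring a CJK font in matplotlib.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].barh(top_rated['name'], top_rated['score'])
axes[0].set_title('Top 10 by rating')
axes[1].barh(most_rated['name'], most_rated['comment'])
axes[1].set_title('Top 10 by number of ratings')
plt.tight_layout()
plt.savefig('top250_charts.png')
```

Pygal could be swapped in for Matplotlib in the same way (for example, a `pygal.Bar()` chart rendered with `render_to_file`).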