├── README.md
├── __init__.py
├── __pycache__
│   ├── __init__.cpython-38.pyc
│   ├── items.cpython-38.pyc
│   ├── loaders.cpython-38.pyc
│   ├── middlewares.cpython-38.pyc
│   ├── pipelines.cpython-38.pyc
│   └── settings.cpython-38.pyc
├── items.py
├── loaders.py
├── middlewares.py
├── pipelines.py
├── settings.py
└── spiders
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-38.pyc
    │   ├── move.cpython-38.pyc
    │   └── top250.cpython-38.pyc
    ├── move.py
    └── top250.py

/README.md:
--------------------------------------------------------------------------------
# Project Name

Python-based crawling and analysis of the Douban Top 250 movie chart: graduation thesis, proposal, defense PPT, video walkthrough, and project source code with run results

# System Introduction

With the rapid development of the internet in the "Internet Plus" era, big-data mining and analysis have become research hotspots in both industry and academia. Data mining can uncover previously unknown, potentially useful patterns or rules and turn them into valuable information and knowledge that helps decision makers act quickly and appropriately. No analysis is possible without data, however: before any mining can begin the data has to be collected, and Python crawlers are currently one of the most widely used ways of obtaining large-scale data from the web. To help users choose films, this project uses Python's Scrapy framework to design and implement the collection, cleaning, and local storage of movie data from the Douban movie site. The scraped reviews are then processed with Pandas and NumPy, rendered as word clouds with WordCloud to give users a feel for each film, and Matplotlib and Pygal are used to chart films by rating and popularity.

# Environment

1. Runtime: Python 3.8 (the `.pyc` files in this repository were compiled by CPython 3.8; nearby 3.x versions should also work).
2. Libraries: Scrapy for the crawler; Pandas, NumPy, WordCloud, Matplotlib, and Pygal for the analysis described above.
3. Hardware/OS: Windows 7/8/10 with 1 GB of RAM or more, or macOS.

# Tech Stack

1. Crawler: Scrapy (CrawlSpider + ItemLoader + item pipelines + downloader middleware)
2. Analysis and visualization: Pandas + NumPy + WordCloud + Matplotlib + Pygal

# Usage

1. Install Python 3.8 and the libraries listed above.
2. From the project root, run the ranking spider first: `scrapy crawl top250`. It writes `DouBanTop250/top250.json` and a folder for each film (poster image plus a details text file) under `DouBanTop250/Result/`.
3. Then run the review spider: `scrapy crawl move`. It reads `top250.json` and appends each film's short reviews to `DouBanTop250/Result/<film>/<film>影评.txt`.
4. Run the analysis on the collected data; a word-cloud sketch is shown right after this README.

# HD Video Demo

https://www.bilibili.com/video/BV1ea411Q7iM/
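The word-cloud step described in the System Introduction is not included in the Scrapy project itself. The sketch below shows one way it could be done, assuming the review files written by `pipelines.py`, a font with CJK glyphs, and the `wordcloud` and `jieba` packages; the film name is only an example.

```python
# word_cloud_sketch.py: hypothetical helper, not part of this repository.
import jieba                     # assumed dependency: Chinese word segmentation
from wordcloud import WordCloud  # assumed dependency: word-cloud rendering

film = '肖申克的救赎'  # example: any folder name under DouBanTop250/Result/
review_path = 'DouBanTop250/Result/{0}/{0}影评.txt'.format(film)  # path written by pipelines.py

with open(review_path, encoding='utf-8') as f:
    text = f.read()

# WordCloud tokenizes on whitespace, so segment the Chinese text first.
words = ' '.join(jieba.cut(text))

# font_path must point to a font that contains CJK glyphs (e.g. simhei.ttf on Windows).
cloud = WordCloud(font_path='simhei.ttf', width=800, height=600,
                  background_color='white').generate(words)
cloud.to_file('DouBanTop250/Result/{0}/{0}_wordcloud.png'.format(film))
```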
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__init__.py
--------------------------------------------------------------------------------
/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/items.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/items.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/loaders.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/loaders.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/middlewares.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/middlewares.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/pipelines.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/pipelines.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/settings.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/__pycache__/settings.cpython-38.pyc
--------------------------------------------------------------------------------
/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class MoveItem(scrapy.Item):
    # Film name
    move_name = scrapy.Field()
    # Short reviews of the film
    move_inq = scrapy.Field()


class Doubantop250Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Ranking number
    number = scrapy.Field()
    # Film name
    name = scrapy.Field()
    # Alternative title
    name_two = scrapy.Field()
    # Rating
    score = scrapy.Field()
    # Number of ratings
    comment = scrapy.Field()
    # Poster image (URL)
    image_urls = scrapy.Field()
    # Director
    director = scrapy.Field()
    # Screenwriter
    scenarist = scrapy.Field()
    # Lead actors
    lead = scrapy.Field()
    # Release date
    time = scrapy.Field()
    # Country/region
    place = scrapy.Field()
    # Genres
    classify = scrapy.Field()
    # Main genre
    maintypes = scrapy.Field()
    # Language
    language = scrapy.Field()
    # Runtime
    length = scrapy.Field()
    # Synopsis
    synopsis = scrapy.Field()
    # Link to the short-review page
    inq_url = scrapy.Field()

--------------------------------------------------------------------------------
/loaders.py:
--------------------------------------------------------------------------------
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, Compose


class NewsLoader(ItemLoader):
    # Join every extracted list into one comma-separated string, then strip
    # surrounding spaces/newlines. (MapCompose, by contrast, would process
    # each list element individually.)
    default_output_processor = Compose(Join(','), lambda s: s.strip(' \n'))
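This loader is what flattens every multi-valued XPath extraction into a single string before it is stored on the item. A minimal stand-alone illustration of that behaviour (the sample values are made up) could look like:

```python
# Illustration only: how NewsLoader's default output processor behaves.
from scrapy.loader.processors import Compose, Join

processor = Compose(Join(','), lambda s: s.strip(' \n'))

# A multi-valued extraction such as the genre field...
print(processor(['剧情', '犯罪']))        # -> '剧情,犯罪'
# ...and a single-valued one with stray whitespace.
print(processor([' 肖申克的救赎\n']))     # -> '肖申克的救赎'
```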
--------------------------------------------------------------------------------
/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random

from scrapy import signals


class Doubantop250SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

class Doubantop250DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


# Attach a randomly chosen User-Agent to every outgoing request.
# Enabled via DOWNLOADER_MIDDLEWARES in settings.py.
class RandomUserAgentMiddleware():
    def __init__(self):
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
        ]

    def process_request(self, request, spider):
        # Pick a different User-Agent string for each request.
        item_ug = random.choice(self.user_agents)
        try:
            request.headers['User-Agent'] = item_ug
        except Exception as e:
            print(e)
            pass
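RandomUserAgentMiddleware only rewrites the User-Agent header, so it can be exercised outside a crawl. A small sketch, assuming the project package is importable as `DouBanTop250` (e.g. when run from the project root):

```python
# Sketch: exercise RandomUserAgentMiddleware in isolation.
from scrapy.http import Request
from DouBanTop250.middlewares import RandomUserAgentMiddleware

middleware = RandomUserAgentMiddleware()
request = Request('https://movie.douban.com/top250')
middleware.process_request(request, spider=None)

# The request now carries one of the user-agent strings from the pool.
print(request.headers['User-Agent'])
```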
--------------------------------------------------------------------------------
/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class Doubantop250Pipeline(object):
    def open_spider(self, spider):
        # Only the ranking spider writes top250.json: (re)create the file and
        # write the opening bracket of the JSON array.
        if spider.name == 'top250':
            self.ranking = open('DouBanTop250/top250.json', 'wb+')
            self.ranking.write('[\n'.encode('utf-8'))

    def process_item(self, item, spider):
        if spider.name == 'top250':
            # Write one <film>/<film>.txt file per movie with all scraped fields.
            # The per-film folder is created by ImagePipeline, which runs first.
            move_name = item['name']
            if len(move_name.split(' ')) > 1:
                move_name = move_name.split(' ')[0]
            with open(r'DouBanTop250/Result/' + move_name + '/' + move_name + '.txt', 'a', encoding='utf-8') as f:
                r_list = ['电影排名', '电影名字', '电影又名', '电影评分', '评论人数', '导演', '编剧', '主演', '上映时间',
                          '地区', '分类', '主要类型', '语言', '片长', '简介', '短评链接']
                s_list = ['number', 'name', 'name_two', 'score', 'comment', 'director', 'scenarist', 'lead', 'time',
                          'place', 'classify', 'maintypes', 'language', 'length', 'synopsis', 'inq_url']
                for i in range(len(r_list)):
                    f.write(r_list[i] + ':' + str(item[s_list[i]]) + '\n')
                f.write('\n')
            # Append the item to top250.json as one element of the JSON array.
            text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
            self.ranking.write(text.encode('utf-8'))
            return item
        else:
            # Review spider: append each short review to <film>影评.txt.
            with open('DouBanTop250/Result/' + item['move_name'] + '/' + item['move_name'] + '影评.txt', 'a+',
                      encoding='utf-8') as f:
                for i in item['move_inq']:
                    f.write(i + '\n')
            return item

    def close_spider(self, spider):
        if spider.name == 'top250':
            # Step back over the trailing ",\n" so the closing bracket yields valid JSON.
            self.ranking.seek(-2, 1)
            self.ranking.write('\n]'.encode('utf-8'))
            self.ranking.close()


class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['image_urls'], meta={'item': item})

    def file_path(self, request, response=None, info=None):
        '''Path (relative to IMAGES_STORE) where the poster image is saved.'''
        item = request.meta['item']
        img_name = item["name"]
        if len(img_name.split(' ')) > 1:
            img_name = img_name.split(' ')[0]
        path = '/' + img_name + '/' + img_name + '.jpg'
        return path

    def item_completed(self, results, item, info):
        '''Called once the download result is in; return the item unchanged.'''
        return item
--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for DouBanTop250 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'DouBanTop250'

SPIDER_MODULES = ['DouBanTop250.spiders']
NEWSPIDER_MODULE = 'DouBanTop250.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Raise the concurrency ceiling for throughput
CONCURRENT_REQUESTS = 300
# Lower the log level to reduce CPU usage
LOG_LEVEL = 'INFO'
# Disable retries to speed up the crawl
RETRY_ENABLED = False
# Shorten the download timeout so stalled requests are abandoned quickly
# DOWNLOAD_TIMEOUT = 30

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Add a download delay to avoid getting the IP banned
DOWNLOAD_DELAY = 1.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Per-IP concurrency limit
CONCURRENT_REQUESTS_PER_IP = 500

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'DouBanTop250.middlewares.Doubantop250SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'DouBanTop250.middlewares.RandomUserAgentMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'DouBanTop250.pipelines.Doubantop250Pipeline': 400,
    'DouBanTop250.pipelines.ImagePipeline': 300,
}
# Where downloaded poster images are stored
IMAGES_STORE = './DouBanTop250/Result'
# Item field that holds the image URL
IMAGES_URLS_FIELD = 'image_urls'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/spiders/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/spiders/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/spiders/__pycache__/move.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/spiders/__pycache__/move.cpython-38.pyc
--------------------------------------------------------------------------------
/spiders/__pycache__/top250.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouyangxiaobai/-Python-Top250-PPT-/cf32fd912704c69938a1caea6a4ebc42fc3ff449/spiders/__pycache__/top250.cpython-38.pyc
--------------------------------------------------------------------------------
/spiders/move.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request

from ..items import MoveItem


class MoveSpider(scrapy.Spider):
    name = 'move'
    allowed_domains = ['movie.douban.com']
    start_urls = []
    custom_settings = {
        'ITEM_PIPELINES': {
            'DouBanTop250.pipelines.Doubantop250Pipeline': 300,
        }
    }

    def start_requests(self):
        # Read the ranking written by the top250 spider and request each
        # film's short-review page.
        with open('DouBanTop250/top250.json', 'r', encoding='utf-8') as f:
            for i in json.loads(f.read()):
                move_name = i['name']
                if len(move_name.split(' ')) > 1:
                    move_name = move_name.split(' ')[0]
                yield Request(meta={'name': move_name}, url=i['inq_url'], callback=self.parse)

    def parse(self, response):
        # Link to the next review page, with the percent_type filter stripped off.
        next_url = str(response.xpath('//*[@id="paginator"]/a[@class="next"]/@href').extract_first()).split('&percent_type=')[0]
        reviews = MoveItem()
        reviews['move_inq'] = response.xpath('//*[@id="comments"]/div[@class="comment-item"]/div/p/span/text()').extract()
        move_name = response.meta['name']
        reviews['move_name'] = str(move_name)
        yield reviews
        if next_url == 'None':
            pass
        else:
            # Follow only the first pages of reviews (start offset below 201).
            if float(next_url.split('&limit=20')[0].split('?start=')[-1]) < 201:
                yield Request(url=response.urljoin(next_url), callback=self.parse, dont_filter=True,
                              meta={'name': move_name})
--------------------------------------------------------------------------------
/spiders/top250.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader.processors import TakeFirst

from ..items import Doubantop250Item
from ..loaders import NewsLoader


class Top250Spider(CrawlSpider):
    name = 'top250'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250?start=0&filter=']

    rules = (
        # Follow every film detail page linked from the list page.
        Rule(LinkExtractor(allow='subject\/\d*\/', restrict_xpaths='//div[@class="info"]//div'),
             callback='top250_parse_item'),
        # Pagination: follow the "后页>" (next page) link.
        Rule(LinkExtractor(restrict_xpaths='//span[@class="next"]//a[contains(.,"后页>")]')),
    )

    def top250_parse_item(self, response):
        loader = NewsLoader(item=Doubantop250Item(), response=response)
        loader.add_xpath('name', './/*[@id="content"]/h1/span[@property="v:itemreviewed"]/text()', TakeFirst())  # film name
        loader.add_xpath('image_urls', './/*[@id="mainpic"]/a/img/@src', TakeFirst())  # poster URL
        loader.add_xpath('number', './/*[@id="content"]/div[1]/span[@class="top250-no"]/text()')  # ranking
        loader.add_xpath('score', './/*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')  # rating
        loader.add_xpath('comment', './/*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')  # number of ratings
        loader.add_xpath('director', './/*[@id="info"]/span[1]/span[2]/a[@rel="v:directedBy"]/text()')  # director
        loader.add_xpath('scenarist', './/*[@id="info"]/span[2]/span[2]/a/text()')  # screenwriter
        # If add_xpath finds nothing, the field is missing from the item and the
        # pipeline's item[...] lookup raises KeyError; uncommenting the add_value
        # lines below fills in 'Null' instead to avoid that.
        # loader.add_value('scenarist', 'Null')
        loader.add_xpath('lead', './/*[@id="info"]/span[@class="actor"]//a/text()')  # lead actors
        # loader.add_value('lead', 'Null')
        loader.add_xpath('classify', './/*[@id="info"]/span[@property="v:genre"]/text()')  # genres
        # The "better than xx% of <genre> films" links; after sorting, take the
        # largest share as the film's main genre (used later in the analysis).
        movetypes = response.xpath('//*[@id="interest_sectl"]/div[2]/a/text()').extract()
        movetypes.sort()
        move_maintypes = movetypes[-1].split('% ')[-1].strip('片')
        loader.add_value('maintypes', move_maintypes)
        onelist = ['place', 'language', 'name_two']
        twolist = ["制片国家/地区:", "语言:", "又名:"]
        for x, y in zip(onelist, twolist):
            loader.add_xpath(x, './/*[@id="info"]/span[contains(text(),"' + y + '")]/following-sibling::text()',
                             TakeFirst())
        # loader.add_value('name_two', 'Null')
        loader.add_xpath('time', './/*[@id="info"]/span[@property="v:initialReleaseDate"]/text()')  # release date
        loader.add_xpath('length', './/*[@id="info"]/span[@property="v:runtime"]/text()')  # runtime
        loader.add_xpath('synopsis', './/*[@id="link-report"]//span[@property="v:summary"]',
                         re='(?<=\u3000\u3000).*?(?=\n)')  # synopsis
        loader.add_xpath('inq_url', './/*[@id="comments-section"]//h2/span[@class="pl"]/a/@href', TakeFirst())  # short-review page link
        yield loader.load_item()

--------------------------------------------------------------------------------
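The README also mentions charting films by rating and popularity with Matplotlib and Pygal. A small sketch of that step, assuming `top250.json` has already been produced by the `top250` spider and that `pandas` and `matplotlib` are installed (the column handling below is an assumption about the scraped strings), might look like:

```python
# chart_sketch.py: hypothetical analysis helper, not part of this repository.
import json

import pandas as pd
import matplotlib.pyplot as plt

# Load the ranking written by Doubantop250Pipeline.
with open('DouBanTop250/top250.json', encoding='utf-8') as f:
    films = pd.DataFrame(json.load(f))

# score and comment are scraped as strings; coerce them to numbers.
films['score'] = pd.to_numeric(films['score'], errors='coerce')
films['comment'] = pd.to_numeric(films['comment'], errors='coerce')

# Top 10 by rating and top 10 by number of ratings.
top_rated = films.nlargest(10, 'score')
most_rated = films.nlargest(10, 'comment')

# Note: rendering Chinese titles may require configuring a CJK font in matplotlib.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].barh(top_rated['name'], top_rated['score'])
axes[0].set_title('Top 10 by rating')
axes[1].barh(most_rated['name'], most_rated['comment'])
axes[1].set_title('Top 10 by number of ratings')
plt.tight_layout()
plt.savefig('top250_charts.png')
```

Pygal could be swapped in for Matplotlib in the same way (for example, a `pygal.Bar()` chart rendered with `render_to_file`).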