├── .gitignore
├── README.md
├── dytt8
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── dytt8_spider.py
├── main.py
└── scrapy.cfg

/.gitignore:
--------------------------------------------------------------------------------
*.csv
*.log
*.pyc
__pycache__/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Crawling all of 电影天堂's movies with Python Scrapy

### Hello and good afternoon! This time I'm bringing you a crawler for the movie site 电影天堂 (dytt8), and I hope you like it. The project crawls with the Scrapy framework and writes the results straight to CSV, with no database involved. There is no distributed crawling this time either; the logic is fairly simple, and the main difficulty is parsing the data on the detail pages.


## Key techniques
- Combining Scrapy's XPath selectors with `.re()` (see the sketch right below)
- Regular expressions
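
Below is a minimal, self-contained sketch of that XPath + `re` combination. The HTML snippet and the movie values in it are invented for illustration; real dytt8 detail pages use the same `◎` field lines padded with full-width spaces (`\u3000`) inside `<p>` tags.

```python
# Illustration only: how the spider's detail() callback pulls values out of
# "◎field name value" lines with xpath + re. The HTML below is made up.
from scrapy.selector import Selector

html = (
    "<p>◎译\u3000\u3000名\u3000流浪地球<br/>"
    "◎产\u3000\u3000地\u3000中国<br/>"
    "◎豆瓣评分\u30007.9/10</p>"
)

sel = Selector(text=html)
# .xpath() narrows the page down to the <p> text nodes, .re_first() then
# extracts the value that follows the field label.
name = sel.xpath('//p/text()').re_first('◎译\u3000\u3000名\u3000(.*)')
douban = sel.xpath('//p/text()').re_first('◎豆瓣评分\u3000(.*)')
print(name, douban)  # -> 流浪地球 7.9/10
```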

---

## Screenshots
- ![Running](https://i.loli.net/2019/01/31/5c52a80c27ad1.png)
- ![Result](https://i.loli.net/2019/01/31/5c52a8f30bafe.png)

---

## Core source code :beers:
```python
# -*- coding: utf-8 -*-
import scrapy

from dytt8.items import Dytt8Item


class Dytt8SpiderSpider(scrapy.Spider):
    name = 'dytt8_spider'
    allowed_domains = ['www.dytt8.net']
    start_urls = ['http://www.dytt8.net/']

    headers = {
        'connection': "keep-alive",
        'pragma': "no-cache",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/71.0.3578.98 Safari/537.36",
        'dnt': "1",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
        'cookie': "XLA_CI=97928deaf2eec58555c78b1518df772a",
    }

    def start_requests(self):
        base_url = 'https://www.dytt8.net/html/gndy/{}/index.html'
        categories = ['china', 'rihan', 'oumei', 'dyzz']
        for category in categories:
            yield scrapy.Request(base_url.format(category), headers=self.headers, callback=self.parse)

    def parse(self, response):
        # xpath('//div[contains(@class,"a") and contains(@class,"b")]') selects
        # elements whose class attribute contains both "a" and "b".
        detail_urls = response.xpath('//a[@class="ulink"]/@href').extract()
        detail_urls = [url for url in detail_urls if 'index' not in url]
        print(detail_urls)

        for url in detail_urls:
            yield scrapy.Request(response.urljoin(url), headers=self.headers, callback=self.detail)

    def detail(self, response):
        item = Dytt8Item()

        name = response.xpath('//p/text()').re('◎译\u3000\u3000名\u3000(.*)')
        category = response.xpath('//p/text()').re('◎类\u3000\u3000别\u3000(.*)')
        country = response.xpath('//p/text()').re('◎产\u3000\u3000地\u3000(.*)')
        douban_rate = response.xpath('//p/text()').re('◎豆瓣评分\u3000(.*)')
        language = response.xpath('//p/text()').re('◎语\u3000\u3000言\u3000(.*)')
        publish_date = response.xpath('//p/text()').re('◎上映日期\u3000(.*)')
        IMDb_rate = response.xpath('//p/text()').re('◎IMDb评分\u3000(.*)')
        movie_time = response.xpath('//p/text()').re('◎片\u3000\u3000长\u3000(.*)')
        director = response.xpath('//p/text()').re('◎导\u3000\u3000演\u3000(.*)')
        main_actor = response.xpath('//p/text()').re('◎主\u3000\u3000演\u3000(.*)')
        introduce = response.xpath('//p/text()').re('\u3000\u3000(.*)')
        download_url = response.xpath('//a/text()').re('ftp.*')

        if name:
            item['name'] = name[0]
        if category:
            item['category'] = category[0]
        if country:
            item['country'] = country[0]
        if douban_rate:
            item['douban_rate'] = douban_rate[0]
        if language:
            item['language'] = language[0]
        if publish_date:
            item['publish_date'] = publish_date[0]
        if IMDb_rate:
            item['IMDb_rate'] = IMDb_rate[0]
        if movie_time:
            item['movie_time'] = movie_time[0]
        if director:
            item['director'] = director[0]
        if main_actor:
            item['main_actor'] = main_actor[0]
        if download_url:
            item['download_url'] = ''.join(download_url)
        if introduce:
            item['introduce'] = introduce[-1]
        yield item
```
---
## How to use

- pip install scrapy

- git clone https://github.com/guapier/dytt8.git

- cd dytt8

- python3 main.py (a quick way to inspect the resulting movies.csv is sketched below)

- and leave a star while you're at it
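
After `python3 main.py` finishes, the scraped movies land in `movies.csv` in the project root (the file name comes from `main.py`, shown further down). A quick, purely illustrative way to inspect the output:

```python
# Illustration only: print a few columns of the exported CSV. The column
# names match the Dytt8Item fields; .get() is used because not every movie
# page contains every field.
import csv

with open('movies.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        print(row.get('name'), row.get('douban_rate'), row.get('download_url'))
```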

--------------------------------------------------------------------------------
/dytt8/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guapier/dytt8/cfcddfc9b46c2f995171f6e5c19f4e93a4fb683a/dytt8/__init__.py
--------------------------------------------------------------------------------
/dytt8/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Dytt8Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()           # movie title (translated name)
    category = scrapy.Field()       # genre
    country = scrapy.Field()        # country of origin
    douban_rate = scrapy.Field()    # Douban rating
    language = scrapy.Field()       # language
    publish_date = scrapy.Field()   # release date
    IMDb_rate = scrapy.Field()      # IMDb rating
    movie_time = scrapy.Field()     # running time
    director = scrapy.Field()       # director
    main_actor = scrapy.Field()     # leading actors
    introduce = scrapy.Field()      # synopsis
    download_url = scrapy.Field()   # download link(s)
--------------------------------------------------------------------------------
/dytt8/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class Dytt8SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Dytt8DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/dytt8/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class Dytt8Pipeline(object):
    def process_item(self, item, spider):
        return item
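
`pipelines.py` above is the untouched project template, and no pipeline is enabled in `settings.py` (`ITEM_PIPELINES` is commented out), so items are exported exactly as scraped. If you wanted to tidy the fields first, a pipeline along the following lines could be added; the class name and behaviour are an illustration, not part of the repository:

```python
# Hypothetical cleaning pipeline, not part of this project. To activate it,
# register it in settings.py:
#   ITEM_PIPELINES = {'dytt8.pipelines.StripWhitespacePipeline': 300}
class StripWhitespacePipeline(object):
    def process_item(self, item, spider):
        # Trim stray whitespace from every string field before export.
        for field, value in list(item.items()):
            if isinstance(value, str):
                item[field] = value.strip()
        return item
```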
--------------------------------------------------------------------------------
/dytt8/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for dytt8 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dytt8'

SPIDER_MODULES = ['dytt8.spiders']
NEWSPIDER_MODULE = 'dytt8.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dytt8 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dytt8.middlewares.Dytt8SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'dytt8.middlewares.Dytt8DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'dytt8.pipelines.Dytt8Pipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
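
`settings.py` is essentially the generated template with `ROBOTSTXT_OBEY` switched off and a one-second `DOWNLOAD_DELAY`. If a fixed CSV column order and explicit UTF-8 output are wanted, the standard feed-export settings below could be appended; they are a suggestion, not part of the original file.

```python
# Optional additions to settings.py (not in the original project):
# keep CSV columns in a predictable order and export text as UTF-8.
FEED_EXPORT_ENCODING = 'utf-8'
FEED_EXPORT_FIELDS = [
    'name', 'category', 'country', 'douban_rate', 'language', 'publish_date',
    'IMDb_rate', 'movie_time', 'director', 'main_actor', 'introduce',
    'download_url',
]
```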
--------------------------------------------------------------------------------
/dytt8/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/dytt8/spiders/dytt8_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy

from dytt8.items import Dytt8Item


class Dytt8SpiderSpider(scrapy.Spider):
    name = 'dytt8_spider'
    allowed_domains = ['www.dytt8.net']
    start_urls = ['http://www.dytt8.net/']

    headers = {
        'connection': "keep-alive",
        'pragma': "no-cache",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/71.0.3578.98 Safari/537.36",
        'dnt': "1",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
        'cookie': "XLA_CI=97928deaf2eec58555c78b1518df772a",
    }

    def start_requests(self):
        # One request per category listing page of the site.
        base_url = 'https://www.dytt8.net/html/gndy/{}/index.html'
        categories = ['china', 'rihan', 'oumei', 'dyzz']
        for category in categories:
            yield scrapy.Request(base_url.format(category), headers=self.headers, callback=self.parse)

    def parse(self, response):
        # xpath('//div[contains(@class,"a") and contains(@class,"b")]') selects
        # elements whose class attribute contains both "a" and "b".
        detail_urls = response.xpath('//a[@class="ulink"]/@href').extract()
        detail_urls = [url for url in detail_urls if 'index' not in url]
        print(detail_urls)

        for url in detail_urls:
            yield scrapy.Request(response.urljoin(url), headers=self.headers, callback=self.detail)

        # Follow the "下一页" (next page) link so every listing page is crawled.
        next_page = response.xpath('.//a[contains(text(),"下一页")]/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, headers=self.headers, callback=self.parse)

    def detail(self, response):
        item = Dytt8Item()

        # Each field on a detail page sits on its own line inside <p> tags,
        # prefixed with "◎" and padded with full-width spaces (\u3000), so the
        # values are pulled out with xpath + re.
        name = response.xpath('//p/text()').re('◎译\u3000\u3000名\u3000(.*)')
        category = response.xpath('//p/text()').re('◎类\u3000\u3000别\u3000(.*)')
        country = response.xpath('//p/text()').re('◎产\u3000\u3000地\u3000(.*)')
        douban_rate = response.xpath('//p/text()').re('◎豆瓣评分\u3000(.*)')
        language = response.xpath('//p/text()').re('◎语\u3000\u3000言\u3000(.*)')
        publish_date = response.xpath('//p/text()').re('◎上映日期\u3000(.*)')
        IMDb_rate = response.xpath('//p/text()').re('◎IMDb评分\u3000(.*)')
        movie_time = response.xpath('//p/text()').re('◎片\u3000\u3000长\u3000(.*)')
        director = response.xpath('//p/text()').re('◎导\u3000\u3000演\u3000(.*)')
        main_actor = response.xpath('//p/text()').re('◎主\u3000\u3000演\u3000(.*)')
        introduce = response.xpath('//p/text()').re('\u3000\u3000(.*)')
        # Download links are plain ftp:// URLs shown as anchor text.
        download_url = response.xpath('//a/text()').re('ftp.*')

        if name:
            item['name'] = name[0]
        if category:
            item['category'] = category[0]
        if country:
            item['country'] = country[0]
        if douban_rate:
            item['douban_rate'] = douban_rate[0]
        if language:
            item['language'] = language[0]
        if publish_date:
            item['publish_date'] = publish_date[0]
        if IMDb_rate:
            item['IMDb_rate'] = IMDb_rate[0]
        if movie_time:
            item['movie_time'] = movie_time[0]
        if director:
            item['director'] = director[0]
        if main_actor:
            item['main_actor'] = main_actor[0]
        if download_url:
            item['download_url'] = ''.join(download_url)
        if introduce:
            # The last line indented with two full-width spaces is the synopsis.
            item['introduce'] = introduce[-1]
        yield item
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# Runs the crawl and exports the items to movies.csv, exactly as if
# "scrapy crawl dytt8_spider -o movies.csv" had been typed in a shell.
from scrapy.cmdline import execute

spider_cmd = 'scrapy crawl dytt8_spider -o movies.csv'

execute(spider_cmd.split(' '))
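
`main.py` simply hands the `scrapy crawl` command line to `scrapy.cmdline.execute`. As an alternative (an illustration, not the project's actual entry point), the same crawl can be launched in-process with Scrapy's `CrawlerProcess`:

```python
# Alternative launcher sketch; the project itself uses scrapy.cmdline.execute.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
# Equivalent of the "-o movies.csv" flag; on Scrapy 2.1+ the FEEDS setting
# replaces the older FEED_URI / FEED_FORMAT pair.
settings.set('FEEDS', {'movies.csv': {'format': 'csv'}})

process = CrawlerProcess(settings)
process.crawl('dytt8_spider')  # the spider can be referenced by its name
process.start()                # blocks until the crawl is finished
```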
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = dytt8.settings

[deploy]
#url = http://localhost:6800/
project = dytt8
--------------------------------------------------------------------------------