├── .gitignore
├── README.md
├── dytt8
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── dytt8_spider.py
├── main.py
└── scrapy.cfg

/.gitignore:
--------------------------------------------------------------------------------
*.csv
*.log
*.pyc
__pycache__/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Crawling all of 电影天堂's movies with Python Scrapy

### Hello and good afternoon! This time I'm bringing you a crawler for the movie site 电影天堂 (dytt8), and I hope you like it. The project crawls with the Scrapy framework and writes the results straight to CSV, with no database involved. There is no distributed crawling this time either; the logic is fairly simple, and the main difficulty is parsing the data on the detail pages.


## Key techniques
- Combining Scrapy's XPath selectors with `.re()` (see the sketch right below)
- Regular expressions
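
Below is a minimal, self-contained sketch of that XPath + `re` combination. The HTML snippet and the movie values in it are invented for illustration; real dytt8 detail pages use the same `◎` field lines padded with full-width spaces (`\u3000`) inside `<p>` tags.

```python
# Illustration only: how the spider's detail() callback pulls values out of
# "◎field name value" lines with xpath + re. The HTML below is made up.
from scrapy.selector import Selector

html = (
    "<p>◎译\u3000\u3000名\u3000流浪地球<br/>"
    "◎产\u3000\u3000地\u3000中国<br/>"
    "◎豆瓣评分\u30007.9/10</p>"
)

sel = Selector(text=html)
# .xpath() narrows the page down to the <p> text nodes, .re_first() then
# extracts the value that follows the field label.
name = sel.xpath('//p/text()').re_first('◎译\u3000\u3000名\u3000(.*)')
douban = sel.xpath('//p/text()').re_first('◎豆瓣评分\u3000(.*)')
print(name, douban)  # -> 流浪地球 7.9/10
```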

---

## Screenshots
- ![Running](https://i.loli.net/2019/01/31/5c52a80c27ad1.png)
- ![Result](https://i.loli.net/2019/01/31/5c52a8f30bafe.png)

---

## Core source code :beers:
```python
# -*- coding: utf-8 -*-
import scrapy

from dytt8.items import Dytt8Item


class Dytt8SpiderSpider(scrapy.Spider):
    name = 'dytt8_spider'
    allowed_domains = ['www.dytt8.net']
    start_urls = ['http://www.dytt8.net/']

    headers = {
        'connection': "keep-alive",
        'pragma': "no-cache",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/71.0.3578.98 Safari/537.36",
        'dnt': "1",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
        'cookie': "XLA_CI=97928deaf2eec58555c78b1518df772a",
    }

    def start_requests(self):
        base_url = 'https://www.dytt8.net/html/gndy/{}/index.html'
        categories = ['china', 'rihan', 'oumei', 'dyzz']
        for category in categories:
            yield scrapy.Request(base_url.format(category), headers=self.headers, callback=self.parse)

    def parse(self, response):
        # xpath('//div[contains(@class,"a") and contains(@class,"b")]') selects
        # elements whose class attribute contains both "a" and "b".
        detail_urls = response.xpath('//a[@class="ulink"]/@href').extract()
        detail_urls = [url for url in detail_urls if 'index' not in url]
        print(detail_urls)

        for url in detail_urls:
            yield scrapy.Request(response.urljoin(url), headers=self.headers, callback=self.detail)

    def detail(self, response):
        item = Dytt8Item()

        name = response.xpath('//p/text()').re('◎译\u3000\u3000名\u3000(.*)')
        category = response.xpath('//p/text()').re('◎类\u3000\u3000别\u3000(.*)')
        country = response.xpath('//p/text()').re('◎产\u3000\u3000地\u3000(.*)')
        douban_rate = response.xpath('//p/text()').re('◎豆瓣评分\u3000(.*)')
        language = response.xpath('//p/text()').re('◎语\u3000\u3000言\u3000(.*)')
        publish_date = response.xpath('//p/text()').re('◎上映日期\u3000(.*)')
        IMDb_rate = response.xpath('//p/text()').re('◎IMDb评分\u3000(.*)')
        movie_time = response.xpath('//p/text()').re('◎片\u3000\u3000长\u3000(.*)')
        director = response.xpath('//p/text()').re('◎导\u3000\u3000演\u3000(.*)')
        main_actor = response.xpath('//p/text()').re('◎主\u3000\u3000演\u3000(.*)')
        introduce = response.xpath('//p/text()').re('\u3000\u3000(.*)')
        download_url = response.xpath('//a/text()').re('ftp.*')

        if name:
            item['name'] = name[0]
        if category:
            item['category'] = category[0]
        if country:
            item['country'] = country[0]
        if douban_rate:
            item['douban_rate'] = douban_rate[0]
        if language:
            item['language'] = language[0]
        if publish_date:
            item['publish_date'] = publish_date[0]
        if IMDb_rate:
            item['IMDb_rate'] = IMDb_rate[0]
        if movie_time:
            item['movie_time'] = movie_time[0]
        if director:
            item['director'] = director[0]
        if main_actor:
            item['main_actor'] = main_actor[0]
        if download_url:
            item['download_url'] = ''.join(download_url)
        if introduce:
            item['introduce'] = introduce[-1]
        yield item
```
---
## How to use

- pip install scrapy

- git clone https://github.com/guapier/dytt8.git

- cd dytt8

- python3 main.py (a quick way to inspect the resulting movies.csv is sketched below)

- and leave a star while you're at it
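
After `python3 main.py` finishes, the scraped movies land in `movies.csv` in the project root (the file name comes from `main.py`, shown further down). A quick, purely illustrative way to inspect the output:

```python
# Illustration only: print a few columns of the exported CSV. The column
# names match the Dytt8Item fields; .get() is used because not every movie
# page contains every field.
import csv

with open('movies.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        print(row.get('name'), row.get('douban_rate'), row.get('download_url'))
```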

--------------------------------------------------------------------------------
/dytt8/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guapier/dytt8/cfcddfc9b46c2f995171f6e5c19f4e93a4fb683a/dytt8/__init__.py
--------------------------------------------------------------------------------
/dytt8/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Dytt8Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()           # movie title (translated name)
    category = scrapy.Field()       # genre
    country = scrapy.Field()        # country of origin
    douban_rate = scrapy.Field()    # Douban rating
    language = scrapy.Field()       # language
    publish_date = scrapy.Field()   # release date
    IMDb_rate = scrapy.Field()      # IMDb rating
    movie_time = scrapy.Field()     # running time
    director = scrapy.Field()       # director
    main_actor = scrapy.Field()     # leading actors
    introduce = scrapy.Field()      # synopsis
    download_url = scrapy.Field()   # download link(s)
--------------------------------------------------------------------------------
/dytt8/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class Dytt8SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Dytt8DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/dytt8/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class Dytt8Pipeline(object):
    def process_item(self, item, spider):
        return item
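
`pipelines.py` above is the untouched project template, and no pipeline is enabled in `settings.py` (`ITEM_PIPELINES` is commented out), so items are exported exactly as scraped. If you wanted to tidy the fields first, a pipeline along the following lines could be added; the class name and behaviour are an illustration, not part of the repository:

```python
# Hypothetical cleaning pipeline, not part of this project. To activate it,
# register it in settings.py:
#   ITEM_PIPELINES = {'dytt8.pipelines.StripWhitespacePipeline': 300}
class StripWhitespacePipeline(object):
    def process_item(self, item, spider):
        # Trim stray whitespace from every string field before export.
        for field, value in list(item.items()):
            if isinstance(value, str):
                item[field] = value.strip()
        return item
```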
--------------------------------------------------------------------------------
/dytt8/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for dytt8 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dytt8'

SPIDER_MODULES = ['dytt8.spiders']
NEWSPIDER_MODULE = 'dytt8.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dytt8 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dytt8.middlewares.Dytt8SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'dytt8.middlewares.Dytt8DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'dytt8.pipelines.Dytt8Pipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
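
`settings.py` is essentially the generated template with `ROBOTSTXT_OBEY` switched off and a one-second `DOWNLOAD_DELAY`. If a fixed CSV column order and explicit UTF-8 output are wanted, the standard feed-export settings below could be appended; they are a suggestion, not part of the original file.

```python
# Optional additions to settings.py (not in the original project):
# keep CSV columns in a predictable order and export text as UTF-8.
FEED_EXPORT_ENCODING = 'utf-8'
FEED_EXPORT_FIELDS = [
    'name', 'category', 'country', 'douban_rate', 'language', 'publish_date',
    'IMDb_rate', 'movie_time', 'director', 'main_actor', 'introduce',
    'download_url',
]
```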
--------------------------------------------------------------------------------
/dytt8/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/dytt8/spiders/dytt8_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy

from dytt8.items import Dytt8Item


class Dytt8SpiderSpider(scrapy.Spider):
    name = 'dytt8_spider'
    allowed_domains = ['www.dytt8.net']
    start_urls = ['http://www.dytt8.net/']

    headers = {
        'connection': "keep-alive",
        'pragma': "no-cache",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/71.0.3578.98 Safari/537.36",
        'dnt': "1",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
        'cookie': "XLA_CI=97928deaf2eec58555c78b1518df772a",
    }

    def start_requests(self):
        # One request per category listing page of the site.
        base_url = 'https://www.dytt8.net/html/gndy/{}/index.html'
        categories = ['china', 'rihan', 'oumei', 'dyzz']
        for category in categories:
            yield scrapy.Request(base_url.format(category), headers=self.headers, callback=self.parse)

    def parse(self, response):
        # xpath('//div[contains(@class,"a") and contains(@class,"b")]') selects
        # elements whose class attribute contains both "a" and "b".
        detail_urls = response.xpath('//a[@class="ulink"]/@href').extract()
        detail_urls = [url for url in detail_urls if 'index' not in url]
        print(detail_urls)

        for url in detail_urls:
            yield scrapy.Request(response.urljoin(url), headers=self.headers, callback=self.detail)

        # Follow the "下一页" (next page) link so every listing page is crawled.
        next_page = response.xpath('.//a[contains(text(),"下一页")]/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, headers=self.headers, callback=self.parse)

    def detail(self, response):
        item = Dytt8Item()

        # Each field on a detail page sits on its own line inside <p> tags,
        # prefixed with "◎" and padded with full-width spaces (\u3000), so the
        # values are pulled out with xpath + re.
        name = response.xpath('//p/text()').re('◎译\u3000\u3000名\u3000(.*)')
        category = response.xpath('//p/text()').re('◎类\u3000\u3000别\u3000(.*)')
        country = response.xpath('//p/text()').re('◎产\u3000\u3000地\u3000(.*)')
        douban_rate = response.xpath('//p/text()').re('◎豆瓣评分\u3000(.*)')
        language = response.xpath('//p/text()').re('◎语\u3000\u3000言\u3000(.*)')
        publish_date = response.xpath('//p/text()').re('◎上映日期\u3000(.*)')
        IMDb_rate = response.xpath('//p/text()').re('◎IMDb评分\u3000(.*)')
        movie_time = response.xpath('//p/text()').re('◎片\u3000\u3000长\u3000(.*)')
        director = response.xpath('//p/text()').re('◎导\u3000\u3000演\u3000(.*)')
        main_actor = response.xpath('//p/text()').re('◎主\u3000\u3000演\u3000(.*)')
        introduce = response.xpath('//p/text()').re('\u3000\u3000(.*)')
        # Download links are plain ftp:// URLs shown as anchor text.
        download_url = response.xpath('//a/text()').re('ftp.*')

        if name:
            item['name'] = name[0]
        if category:
            item['category'] = category[0]
        if country:
            item['country'] = country[0]
        if douban_rate:
            item['douban_rate'] = douban_rate[0]
        if language:
            item['language'] = language[0]
        if publish_date:
            item['publish_date'] = publish_date[0]
        if IMDb_rate:
            item['IMDb_rate'] = IMDb_rate[0]
        if movie_time:
            item['movie_time'] = movie_time[0]
        if director:
            item['director'] = director[0]
        if main_actor:
            item['main_actor'] = main_actor[0]
        if download_url:
            item['download_url'] = ''.join(download_url)
        if introduce:
            # The last line indented with two full-width spaces is the synopsis.
            item['introduce'] = introduce[-1]
        yield item
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# Runs the crawl and exports the items to movies.csv, exactly as if
# "scrapy crawl dytt8_spider -o movies.csv" had been typed in a shell.
from scrapy.cmdline import execute

spider_cmd = 'scrapy crawl dytt8_spider -o movies.csv'

execute(spider_cmd.split(' '))
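
`main.py` simply hands the `scrapy crawl` command line to `scrapy.cmdline.execute`. As an alternative (an illustration, not the project's actual entry point), the same crawl can be launched in-process with Scrapy's `CrawlerProcess`:

```python
# Alternative launcher sketch; the project itself uses scrapy.cmdline.execute.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
# Equivalent of the "-o movies.csv" flag; on Scrapy 2.1+ the FEEDS setting
# replaces the older FEED_URI / FEED_FORMAT pair.
settings.set('FEEDS', {'movies.csv': {'format': 'csv'}})

process = CrawlerProcess(settings)
process.crawl('dytt8_spider')  # the spider can be referenced by its name
process.start()                # blocks until the crawl is finished
```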
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = dytt8.settings

[deploy]
#url = http://localhost:6800/
project = dytt8
--------------------------------------------------------------------------------