├── .gitignore ├── README.md ├── docker ├── docker-compose.yml ├── mysql │ ├── Dockerfile │ └── douban.sql └── scrapyd │ ├── Dockerfile │ └── scrapyd.conf ├── requirements.txt └── scrapy ├── douban ├── __init__.py ├── database.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py ├── spiders │ ├── __init__.py │ ├── book_comment.py │ ├── book_meta.py │ ├── book_subject.py │ ├── movie_comment.py │ ├── movie_meta.py │ └── movie_subject.py ├── util.py └── validator.py └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | storage/ 4 | __pycache__/ 5 | .idea/ 6 | .vscode/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## ScrapyDouban 2 | 3 | [Demo video](https://www.youtube.com/watch?v=Fyrvrb0rqvE) 4 | 5 | A Douban Movie / Douban Book Scrapy crawler built on Python 3 that downloads covers, scrapes metadata, and stores comments in the database. 6 | 7 | I maintain this project to share some of the practices I picked up while using Scrapy; it covers roughly 80% of the Scrapy knowledge I rely on, and I hope it helps people who are learning Scrapy. It is best read after the [official Scrapy documentation](https://scrapy.readthedocs.io/en/stable/index.html). Please note that the project currently uses Scrapy 2.5.0. 8 | 9 | 10 | 11 | 12 | 13 | ### Docker 14 | ------- 15 | The project consists of three containers: douban_scrapyd, douban_mysql, and douban_adminer. 16 | 17 | The douban_scrapyd container is based on [python:3.9-slim-buster](https://pythonspeed.com/articles/base-image-python-docker-images/); the preinstalled Python 3 libraries are scrapy, scrapyd, pymysql, pillow, and arrow. Port 6800:6800 is mapped by default, so you can reach the scrapyd web UI at the host's IP:6800 and log in with username scrapyd and password public. 18 | 19 | The douban_mysql container is based on mysql:8; the root password is public, and docker/mysql/douban.sql is imported into the douban database on first initialization. 20 | 21 | The douban_adminer container is based on adminer:4. Port 8080:8080 is mapped by default, so you can reach the database admin UI at the host's IP:8080 and log in with server mysql, username root, and password public. 22 | 23 | 24 | ### Project SQL 25 | ------ 26 | 27 | The SQL file used by the project is located at docker/mysql/douban.sql. 28 | 29 | ### Crawling workflow 30 | ------- 31 | 32 | First collect Subject IDs --> then fetch each detail page by its Subject ID and collect the metadata --> finally collect comments by Subject ID 33 | 34 | ### Usage 35 | ------- 36 | $ git clone https://github.com/baabaaox/ScrapyDouban.git 37 | # Build and start the containers 38 | $ cd ./ScrapyDouban/docker 39 | $ sudo docker-compose up --build -d 40 | # Enter the douban_scrapyd container 41 | $ sudo docker exec -it douban_scrapyd bash 42 | # Enter the scrapy directory 43 | $ cd /srv/ScrapyDouban/scrapy 44 | $ scrapy list 45 | # Scrape movie data 46 | $ scrapy crawl movie_subject # collect movie Subject IDs 47 | $ scrapy crawl movie_meta # collect movie metadata 48 | $ scrapy crawl movie_comment # collect movie comments 49 | # Scrape book data 50 | $ scrapy crawl book_subject # collect book Subject IDs 51 | $ scrapy crawl book_meta # collect book metadata 52 | $ scrapy crawl book_comment # collect book comments 53 | 54 | If you want to modify the code conveniently while testing, you can mount the project's scrapy directory into the douban_scrapyd container. 55 | If you are used to working through scrapyd, you can deploy the project to the douban_scrapyd container directly with scrapyd-client. 56 | 57 | ### Proxy IPs 58 | -------- 59 | 60 | Because of Douban's anti-crawler measures, the only practical way around them now is to use proxy IPs. The douban.middlewares.ProxyMiddleware middleware is not enabled in the default settings.py; if you really need Douban's data for research, consider renting a paid proxy pool. 61 | 62 | 63 | ### Image download 64 | -------- 65 | 66 | douban.pipelines.CoverPipeline handles cover downloads by filtering on spider.name; downloaded image files are saved under the /srv/ScrapyDouban/storage directory of the douban_scrapyd container. -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | mysql: 3 | build: ./mysql 4 | container_name: douban_mysql 5 | environment: 6 | - MYSQL_ROOT_PASSWORD=public 7 | - MYSQL_DATABASE=douban 8 | command: mysqld 
--default-authentication-plugin=mysql_native_password 9 | adminer: 10 | image: adminer:4 11 | container_name: douban_adminer 12 | ports: 13 | - 8080:8080 14 | links: 15 | - mysql 16 | scrapyd: 17 | build: ./scrapyd 18 | container_name: douban_scrapyd 19 | ports: 20 | - 6800:6800 21 | environment: 22 | - TZ=Asia/Chongqing 23 | - MYSQL_HOST=mysql 24 | - MYSQL_USER=root 25 | - MYSQL_PASS=public 26 | - MYSQL_DB=douban 27 | links: 28 | - mysql 29 | -------------------------------------------------------------------------------- /docker/mysql/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mysql:8 2 | 3 | ADD douban.sql /docker-entrypoint-initdb.d 4 | -------------------------------------------------------------------------------- /docker/mysql/douban.sql: -------------------------------------------------------------------------------- 1 | -- Adminer 4.6.3 MySQL dump 2 | 3 | SET NAMES utf8; 4 | SET time_zone = '+00:00'; 5 | SET foreign_key_checks = 0; 6 | SET sql_mode = 'NO_AUTO_VALUE_ON_ZERO'; 7 | 8 | SET NAMES utf8mb4; 9 | 10 | CREATE DATABASE IF NOT EXISTS `douban` /*!40100 DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci */; 11 | USE `douban`; 12 | 13 | DROP TABLE IF EXISTS `books`; 14 | CREATE TABLE `books` ( 15 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT, 16 | `slug` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 17 | `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 18 | `sub_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 19 | `alt_name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 20 | `cover` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 21 | `summary` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 22 | `authors` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 23 | `author_intro` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 24 | `translators` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 25 | `series` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 26 | `publisher` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 27 | `publish_date` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 28 | `pages` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 29 | `price` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 30 | `binding` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 31 | `isbn` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 32 | `tags` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 33 | `douban_id` int(10) unsigned NOT NULL DEFAULT '0', 34 | `douban_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0', 35 | `douban_votes` int(10) unsigned NOT NULL DEFAULT '0', 36 | `created_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', 37 | `updated_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', 38 | PRIMARY KEY (`id`), 39 | KEY `books_slug_index` (`slug`), 40 | KEY `books_name_index` (`name`), 41 | KEY `books_douban_id_index` (`douban_id`) 42 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; 43 | 44 | 45 | DROP TABLE IF EXISTS `comments`; 46 | CREATE TABLE 
`comments` ( 47 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT, 48 | `douban_id` int(10) unsigned NOT NULL DEFAULT '0', 49 | `douban_comment_id` int(10) unsigned NOT NULL DEFAULT '0', 50 | `douban_user_nickname` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 51 | `douban_user_avatar` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 52 | `douban_user_url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 53 | `content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL, 54 | `votes` int(10) unsigned NOT NULL DEFAULT '0', 55 | `created_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', 56 | `updated_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', 57 | PRIMARY KEY (`id`), 58 | KEY `comments_douban_id_index` (`douban_id`), 59 | KEY `comments_douban_comment_id_index` (`douban_comment_id`) 60 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; 61 | 62 | 63 | DROP TABLE IF EXISTS `movies`; 64 | CREATE TABLE `movies` ( 65 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT, 66 | `type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 67 | `slug` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 68 | `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 69 | `alias` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 70 | `cover` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 71 | `year` smallint(5) unsigned NOT NULL DEFAULT '0', 72 | `regions` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 73 | `genres` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 74 | `languages` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 75 | `release_date` date DEFAULT NULL, 76 | `official_site` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 77 | `directors` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 78 | `writers` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 79 | `actors` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 80 | `storyline` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, 81 | `mins` smallint(5) unsigned NOT NULL DEFAULT '0', 82 | `recommend_tip` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 83 | `tags` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 84 | `avg_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0', 85 | `imdb_id` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 86 | `imdb_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0', 87 | `imdb_votes` int(10) unsigned NOT NULL DEFAULT '0', 88 | `douban_id` int(10) unsigned NOT NULL DEFAULT '0', 89 | `douban_score` decimal(3,1) unsigned NOT NULL DEFAULT '0.0', 90 | `douban_votes` int(10) unsigned NOT NULL DEFAULT '0', 91 | `created_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', 92 | `updated_at` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', 93 | PRIMARY KEY (`id`), 94 | KEY `movies_slug_index` (`slug`), 95 | KEY `movies_name_index` (`name`), 96 | KEY `movies_imdb_id_index` (`imdb_id`), 97 | KEY `movies_douban_id_index` (`douban_id`) 98 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; 99 
| 100 | 101 | DROP TABLE IF EXISTS `subjects`; 102 | CREATE TABLE `subjects` ( 103 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT, 104 | `douban_id` int(10) unsigned NOT NULL DEFAULT '0', 105 | `type` enum('movie','book') CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT 'movie', 106 | PRIMARY KEY (`id`), 107 | UNIQUE KEY `subjects_douban_id_unique` (`douban_id`) 108 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; 109 | 110 | 111 | -- 2018-11-27 16:52:54 112 | -------------------------------------------------------------------------------- /docker/scrapyd/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-buster 2 | 3 | ARG SCRAPY_VERSION=2.5.0 4 | 5 | RUN apt-get update \ 6 | && apt-get install -y --no-install-recommends git \ 7 | && pip install -i https://mirrors.aliyun.com/pypi/simple/ --upgrade pip \ 8 | && pip install -i https://mirrors.aliyun.com/pypi/simple/ scrapy==$SCRAPY_VERSION pymysql==1.0.2 pillow==8.2.0 arrow==1.0.3 \ 9 | && pip install -U git+https://github.com/scrapy/scrapyd.git \ 10 | && git clone https://github.com/baabaaox/ScrapyDouban.git /srv/ScrapyDouban 11 | 12 | COPY scrapyd.conf /etc/scrapyd/ 13 | 14 | EXPOSE 6800 15 | 16 | CMD ["scrapyd"] 17 | -------------------------------------------------------------------------------- /docker/scrapyd/scrapyd.conf: -------------------------------------------------------------------------------- 1 | [scrapyd] 2 | bind_address = 0.0.0.0 3 | username = scrapyd 4 | password = public 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | arrow==1.2.1 2 | Pillow==9.0.0 3 | PyMySQL==1.0.2 4 | Scrapy==2.5.1 5 | -------------------------------------------------------------------------------- /scrapy/douban/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baabaaox/ScrapyDouban/b6d2cced7fc163ede4df560dac1f8dfa218a59b5/scrapy/douban/__init__.py -------------------------------------------------------------------------------- /scrapy/douban/database.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pymysql 4 | 5 | MYSQL_HOST = os.environ.get("MYSQL_HOST", "localhost") 6 | MYSQL_USER = os.environ.get("MYSQL_USER", "root") 7 | MYSQL_PASS = os.environ.get("MYSQL_PASS", "public") 8 | MYSQL_DB = os.environ.get("MYSQL_DB", "douban") 9 | 10 | connection = pymysql.connect( 11 | host=MYSQL_HOST, 12 | user=MYSQL_USER, 13 | password=MYSQL_PASS, 14 | db=MYSQL_DB, 15 | charset="utf8mb4", 16 | cursorclass=pymysql.cursors.DictCursor, 17 | ) 18 | -------------------------------------------------------------------------------- /scrapy/douban/items.py: -------------------------------------------------------------------------------- 1 | from scrapy import Field, Item 2 | 3 | 4 | class Subject(Item): 5 | douban_id = Field() 6 | type = Field() 7 | 8 | 9 | class MovieMeta(Item): 10 | douban_id = Field() 11 | type = Field() 12 | cover = Field() 13 | name = Field() 14 | slug = Field() 15 | year = Field() 16 | directors = Field() 17 | writers = Field() 18 | actors = Field() 19 | genres = Field() 20 | official_site = Field() 21 | regions = Field() 22 | languages = Field() 23 | release_date = Field() 24 | mins = Field() 25 | alias = Field() 26 | imdb_id = Field() 27 | douban_id = Field() 28 
| douban_score = Field() 29 | douban_votes = Field() 30 | tags = Field() 31 | storyline = Field() 32 | 33 | 34 | class BookMeta(Item): 35 | douban_id = Field() 36 | slug = Field() 37 | name = Field() 38 | sub_name = Field() 39 | alt_name = Field() 40 | cover = Field() 41 | summary = Field() 42 | authors = Field() 43 | author_intro = Field() 44 | translators = Field() 45 | series = Field() 46 | publisher = Field() 47 | publish_date = Field() 48 | pages = Field() 49 | price = Field() 50 | binding = Field() 51 | isbn = Field() 52 | douban_id = Field() 53 | douban_score = Field() 54 | douban_votes = Field() 55 | tags = Field() 56 | 57 | 58 | class Comment(Item): 59 | douban_id = Field() 60 | douban_comment_id = Field() 61 | douban_user_nickname = Field() 62 | douban_user_avatar = Field() 63 | douban_user_url = Field() 64 | content = Field() 65 | votes = Field() 66 | -------------------------------------------------------------------------------- /scrapy/douban/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | # useful for handling different item types with a single interface 7 | from itemadapter import ItemAdapter, is_item 8 | from scrapy import signals 9 | 10 | 11 | class ProxyMiddleware(object): 12 | def process_request(self, request, spider): 13 | # curl https://m.douban.com/book/subject/26628811/ -x http://127.0.0.1:8081 14 | request.meta["proxy"] = "http://127.0.0.1:8081" 15 | 16 | 17 | class DoubanSpiderMiddleware: 18 | # Not all methods need to be defined. If a method is not defined, 19 | # scrapy acts as if the spider middleware does not modify the 20 | # passed objects. 21 | 22 | @classmethod 23 | def from_crawler(cls, crawler): 24 | # This method is used by Scrapy to create your spiders. 25 | s = cls() 26 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 27 | return s 28 | 29 | def process_spider_input(self, response, spider): 30 | # Called for each response that goes through the spider 31 | # middleware and into the spider. 32 | 33 | # Should return None or raise an exception. 34 | return None 35 | 36 | def process_spider_output(self, response, result, spider): 37 | # Called with the results returned from the Spider, after 38 | # it has processed the response. 39 | 40 | # Must return an iterable of Request, or item objects. 41 | for i in result: 42 | yield i 43 | 44 | def process_spider_exception(self, response, exception, spider): 45 | # Called when a spider or process_spider_input() method 46 | # (from other spider middleware) raises an exception. 47 | 48 | # Should return either None or an iterable of Request or item objects. 49 | pass 50 | 51 | def process_start_requests(self, start_requests, spider): 52 | # Called with the start requests of the spider, and works 53 | # similarly to the process_spider_output() method, except 54 | # that it doesn’t have a response associated. 55 | 56 | # Must return only requests (not items). 57 | for r in start_requests: 58 | yield r 59 | 60 | def spider_opened(self, spider): 61 | spider.logger.info("Spider opened: %s" % spider.name) 62 | 63 | 64 | class DoubanDownloaderMiddleware: 65 | # Not all methods need to be defined. If a method is not defined, 66 | # scrapy acts as if the downloader middleware does not modify the 67 | # passed objects. 
68 | 69 | @classmethod 70 | def from_crawler(cls, crawler): 71 | # This method is used by Scrapy to create your spiders. 72 | s = cls() 73 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 74 | return s 75 | 76 | def process_request(self, request, spider): 77 | # Called for each request that goes through the downloader 78 | # middleware. 79 | 80 | # Must either: 81 | # - return None: continue processing this request 82 | # - or return a Response object 83 | # - or return a Request object 84 | # - or raise IgnoreRequest: process_exception() methods of 85 | # installed downloader middleware will be called 86 | return None 87 | 88 | def process_response(self, request, response, spider): 89 | # Called with the response returned from the downloader. 90 | 91 | # Must either; 92 | # - return a Response object 93 | # - return a Request object 94 | # - or raise IgnoreRequest 95 | return response 96 | 97 | def process_exception(self, request, exception, spider): 98 | # Called when a download handler or a process_request() 99 | # (from other downloader middleware) raises an exception. 100 | 101 | # Must either: 102 | # - return None: continue processing this exception 103 | # - return a Response object: stops process_exception() chain 104 | # - return a Request object: stops process_exception() chain 105 | pass 106 | 107 | def spider_opened(self, spider): 108 | spider.logger.info("Spider opened: %s" % spider.name) 109 | -------------------------------------------------------------------------------- /scrapy/douban/pipelines.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | 4 | from scrapy import Request 5 | from scrapy.pipelines.images import ImagesPipeline 6 | from scrapy.utils.misc import arg_to_iter 7 | from scrapy.utils.python import to_bytes 8 | from twisted.internet.defer import DeferredList 9 | 10 | import douban.database as db 11 | from douban.items import BookMeta, Comment, MovieMeta, Subject 12 | 13 | cursor = db.connection.cursor() 14 | 15 | 16 | class DoubanPipeline(object): 17 | def get_subject(self, item): 18 | sql = "SELECT id FROM subjects WHERE douban_id=%s" % item["douban_id"] 19 | cursor.execute(sql) 20 | return cursor.fetchone() 21 | 22 | def save_subject(self, item): 23 | keys = item.keys() 24 | values = tuple(item.values()) 25 | fields = ",".join(keys) 26 | temp = ",".join(["%s"] * len(keys)) 27 | sql = "INSERT INTO subjects (%s) VALUES (%s)" % (fields, temp) 28 | cursor.execute(sql, values) 29 | return db.connection.commit() 30 | 31 | def get_movie_meta(self, item): 32 | sql = "SELECT id FROM movies WHERE douban_id=%s" % item["douban_id"] 33 | cursor.execute(sql) 34 | return cursor.fetchone() 35 | 36 | def save_movie_meta(self, item): 37 | keys = item.keys() 38 | values = tuple(item.values()) 39 | fields = ",".join(keys) 40 | temp = ",".join(["%s"] * len(keys)) 41 | sql = "INSERT INTO movies (%s) VALUES (%s)" % (fields, temp) 42 | cursor.execute(sql, tuple(i.strip() for i in values)) 43 | return db.connection.commit() 44 | 45 | def update_movie_meta(self, item): 46 | douban_id = item.pop("douban_id") 47 | keys = item.keys() 48 | values = list(item.values())  # use a list so douban_id can be appended for the WHERE clause 49 | values.append(douban_id) 50 | fields = ["%s=" % i + "%s" for i in keys] 51 | sql = "UPDATE movies SET %s WHERE douban_id=%s" % (",".join(fields), "%s") 52 | cursor.execute(sql, tuple(i.strip() for i in values)) 53 | return db.connection.commit() 54 | 55 | def get_book_meta(self, item): 56 | sql = "SELECT id FROM books 
WHERE douban_id=%s" % item["douban_id"] 57 | cursor.execute(sql) 58 | return cursor.fetchone() 59 | 60 | def save_book_meta(self, item): 61 | keys = item.keys() 62 | values = tuple(item.values()) 63 | fields = ",".join(keys) 64 | temp = ",".join(["%s"] * len(keys)) 65 | sql = "INSERT INTO books (%s) VALUES (%s)" % (fields, temp) 66 | cursor.execute(sql, tuple(i.strip() for i in values)) 67 | return db.connection.commit() 68 | 69 | def update_book_meta(self, item): 70 | douban_id = item.pop("douban_id") 71 | keys = item.keys() 72 | values = list(item.values())  # use a list so douban_id can be appended for the WHERE clause 73 | values.append(douban_id) 74 | fields = ["%s=" % i + "%s" for i in keys] 75 | sql = "UPDATE books SET %s WHERE douban_id=%s" % (",".join(fields), "%s") 76 | cursor.execute(sql, values) 77 | return db.connection.commit() 78 | 79 | def get_comment(self, item): 80 | sql = "SELECT * FROM comments WHERE douban_comment_id=%s" % item["douban_comment_id"] 81 | cursor.execute(sql) 82 | return cursor.fetchone() 83 | 84 | def save_comment(self, item): 85 | keys = item.keys() 86 | values = tuple(item.values()) 87 | fields = ",".join(keys) 88 | temp = ",".join(["%s"] * len(keys)) 89 | sql = "INSERT INTO comments (%s) VALUES (%s)" % (fields, temp) 90 | cursor.execute(sql, values) 91 | return db.connection.commit() 92 | 93 | def process_item(self, item, spider): 94 | try: 95 | if isinstance(item, Subject): 96 | """ 97 | subject 98 | """ 99 | exist = self.get_subject(item) 100 | if not exist: 101 | self.save_subject(item) 102 | elif isinstance(item, MovieMeta): 103 | """ 104 | meta 105 | """ 106 | exist = self.get_movie_meta(item) 107 | if not exist: 108 | self.save_movie_meta(item) 109 | else: 110 | self.update_movie_meta(item) 111 | elif isinstance(item, BookMeta): 112 | """ 113 | meta 114 | """ 115 | exist = self.get_book_meta(item) 116 | if not exist: 117 | self.save_book_meta(item) 118 | else: 119 | self.update_book_meta(item) 120 | elif isinstance(item, Comment): 121 | """ 122 | comment 123 | """ 124 | exist = self.get_comment(item) 125 | if not exist: 126 | self.save_comment(item) 127 | except Exception as e: 128 | logging.warning(item) 129 | logging.error(e) 130 | return item 131 | 132 | 133 | class CoverPipeline(ImagesPipeline): 134 | def process_item(self, item, spider): 135 | if "meta" not in spider.name: 136 | return item 137 | info = self.spiderinfo 138 | requests = arg_to_iter(self.get_media_requests(item, info)) 139 | dlist = [self._process_request(r, info, item) for r in requests] 140 | dfd = DeferredList(dlist, consumeErrors=1) 141 | return dfd.addCallback(self.item_completed, item, info) 142 | 143 | def file_path(self, request, response=None, info=None, *, item=None): 144 | guid = hashlib.sha1(to_bytes(request.url)).hexdigest() 145 | return "%s%s/%s%s/%s.jpg" % (guid[9], guid[19], guid[29], guid[39], guid) 146 | 147 | def get_media_requests(self, item, info): 148 | if item["cover"]: 149 | return Request(item["cover"]) 150 | 151 | def item_completed(self, results, item, info): 152 | image_paths = [x["path"] for ok, x in results if ok] 153 | if image_paths: 154 | item["cover"] = image_paths[0] 155 | else: 156 | item["cover"] = "" 157 | return item 158 | -------------------------------------------------------------------------------- /scrapy/douban/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for douban project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = "douban" 11 | 12 | SPIDER_MODULES = ["douban.spiders"] 13 | NEWSPIDER_MODULE = "douban.spiders" 14 | 15 | LOG_LEVEL = "DEBUG" 16 | IMAGES_STORE = "../storage/" 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = ( 20 | "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148" 21 | ) 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | # CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | # DOWNLOAD_DELAY = 3 33 | # The download delay setting will honor only one of: 34 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | CONCURRENT_REQUESTS_PER_IP = 1 36 | 37 | # Disable cookies (enabled by default) 38 | COOKIES_ENABLED = True 39 | 40 | # Disable Telnet Console (enabled by default) 41 | # TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | # DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | # } 48 | 49 | # Enable or disable spider middlewares 50 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 51 | # SPIDER_MIDDLEWARES = { 52 | # 'douban.middlewares.DoubanSpiderMiddleware': 543, 53 | # } 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 57 | # DOWNLOADER_MIDDLEWARES = { 58 | # 'douban.middlewares.DoubanDownloaderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable extensions 62 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 63 | # EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | # } 66 | 67 | # Configure item pipelines 68 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | "douban.pipelines.CoverPipeline": 1, 71 | "douban.pipelines.DoubanPipeline": 300, 72 | } 73 | 74 | # Enable and configure the AutoThrottle extension (disabled by default) 75 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 76 | # AUTOTHROTTLE_ENABLED = True 77 | # The initial download delay 78 | # AUTOTHROTTLE_START_DELAY = 5 79 | # The maximum download delay to be set in case of high latencies 80 | # AUTOTHROTTLE_MAX_DELAY = 60 81 | # The average number of requests Scrapy should be sending in parallel to 82 | # each remote server 83 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 84 | # Enable showing throttling stats for every response received: 85 | # AUTOTHROTTLE_DEBUG = False 86 | 87 | # Enable and configure HTTP caching (disabled by default) 88 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 89 | # HTTPCACHE_ENABLED = True 90 | # HTTPCACHE_EXPIRATION_SECS = 0 91 | # HTTPCACHE_DIR = 'httpcache' 92 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 93 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 94 | 
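The README points out that douban.middlewares.ProxyMiddleware is not enabled by default. A minimal sketch of how it could be registered in settings.py, assuming you have a working proxy endpoint for it to use (the priority 543 simply mirrors the commented-out middleware template above; the proxy URL itself is hard-coded in ProxyMiddleware.process_request and would need to point at your own proxy or paid proxy pool):

# Sketch: register the proxy middleware shipped with the project so that every
# request is routed through the proxy configured in ProxyMiddleware.process_request.
DOWNLOADER_MIDDLEWARES = {
    "douban.middlewares.ProxyMiddleware": 543,
}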
-------------------------------------------------------------------------------- /scrapy/douban/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy/douban/spiders/book_comment.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import douban.database as db 4 | from douban.items import Comment 5 | from scrapy import Request, Spider 6 | 7 | cursor = db.connection.cursor() 8 | 9 | 10 | class BookCommentSpider(Spider): 11 | name = "book_comment" 12 | allowed_domains = ["book.douban.com"] 13 | 14 | def start_requests(self): 15 | sql = "SELECT douban_id FROM books WHERE douban_id NOT IN \ 16 | (SELECT douban_id FROM comments GROUP BY douban_id) ORDER BY douban_id DESC" 17 | cursor.execute(sql) 18 | books = cursor.fetchall() 19 | baseurl = "https://m.douban.com/rexxar/api/v2/book/%s/interests?count=5&order_by=hot" 20 | referer = "https://m.douban.com/book/subject/%s/?from=showing" 21 | for book in books: 22 | yield Request( 23 | baseurl % book["douban_id"], headers={"Referer": referer % book["douban_id"]}, 24 | ) 25 | 26 | def parse(self, response): 27 | douban_id = response.url.split("/")[-2] 28 | items = json.loads(response.body)["interests"] 29 | for item in items: 30 | comment = Comment() 31 | comment["douban_id"] = douban_id 32 | comment["douban_comment_id"] = item["id"] 33 | comment["douban_user_nickname"] = item["user"]["name"] 34 | comment["douban_user_avatar"] = item["user"]["avatar"] 35 | comment["douban_user_url"] = item["user"]["url"] 36 | comment["content"] = item["comment"] 37 | comment["votes"] = item["vote_count"] 38 | yield comment 39 | -------------------------------------------------------------------------------- /scrapy/douban/spiders/book_meta.py: -------------------------------------------------------------------------------- 1 | import douban.database as db 2 | import douban.util as util 3 | from douban.items import BookMeta 4 | from scrapy import Spider 5 | 6 | cursor = db.connection.cursor() 7 | 8 | 9 | class BookMetaSpider(Spider): 10 | name = "book_meta" 11 | user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \ 12 | (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36" 13 | allowed_domains = ["book.douban.com"] 14 | sql = 'SELECT * FROM subjects WHERE type="book" AND douban_id NOT IN \ 15 | (SELECT douban_id FROM books) ORDER BY douban_id' 16 | cursor.execute(sql) 17 | books = cursor.fetchall() 18 | start_urls = ("https://book.douban.com/subject/%s/" % i["douban_id"] for i in books) 19 | 20 | def set_douban_id(self, meta, response): 21 | meta["douban_id"] = response.url[32:-1] 22 | return meta 23 | 24 | def set_cover(self, meta, response): 25 | regex = '//img[@rel="v:photo"]/@src' 26 | match = response.xpath(regex).get() 27 | if match: 28 | if match.find("default") == -1: 29 | meta["cover"] = match.replace("spst", "lpst").replace("mpic", "lpic") 30 | else: 31 | meta["cover"] = "" 32 | return meta 33 | 34 | def set_slug(self, meta, response): 35 | meta["slug"] = util.shorturl(meta["douban_id"]) 36 | return meta 37 | 38 | def set_name(self, meta, response): 39 | regex = "//title/text()" 40 | match = response.xpath(regex).get() 41 | if match: 42 | meta["name"] = 
match[:-5].strip() 43 | return meta 44 | 45 | def set_alt_name(self, meta, response): 46 | regex = '//text()[preceding-sibling::span[text()="原作名:"]][following\ 47 | -sibling::br]' 48 | match = response.xpath(regex).get() 49 | if match: 50 | meta["alt_name"] = match 51 | return meta 52 | 53 | def set_sub_name(self, meta, response): 54 | regex = '//text()[preceding-sibling::span[text()="副标题:"]][following\ 55 | -sibling::br]' 56 | match = response.xpath(regex).get() 57 | if match: 58 | meta["sub_name"] = match 59 | return meta 60 | 61 | def set_author(self, meta, response): 62 | regex = '//a[parent::span[child::span[text()=" 作者"]]]/text()' 63 | matches = response.xpath(regex).getall() 64 | if matches: 65 | meta["authors"] = "/".join((i.strip() for i in matches)) 66 | return meta 67 | 68 | def set_summary(self, meta, response): 69 | regex = '//div[@id="link-report"]//div[@class="intro"]' 70 | matches = response.xpath(regex) 71 | if matches: 72 | items = matches[-1].xpath("p/text()").getall() 73 | meta["summary"] = "".join(("
<p>%s</p>
" % i for i in items)) 74 | return meta 75 | 76 | def set_author_intro(self, meta, response): 77 | regex = '//div[@class="indent "]//div[@class="intro"]' 78 | matches = response.xpath(regex) 79 | if matches: 80 | items = matches[-1].xpath("p/text()").getall() 81 | meta["author_intro"] = "".join(("%s
" % i for i in items)) 82 | return meta 83 | 84 | def set_translator(self, meta, response): 85 | regex = '//a[parent::span[child::span[text()=" 译者"]]]/text()' 86 | matches = response.xpath(regex).getall() 87 | if matches: 88 | meta["translators"] = "/".join((i.strip() for i in matches)) 89 | return meta 90 | 91 | def set_series(self, meta, response): 92 | regex = '//a[preceding-sibling::span[text()="丛书:"]][following\ 93 | -sibling::br]/text()' 94 | matches = response.xpath(regex).getall() 95 | if matches: 96 | meta["series"] = "/".join((i.strip() for i in matches)) 97 | return meta 98 | 99 | def set_publisher(self, meta, response): 100 | regex = '//text()[preceding-sibling::span[text()="出版社:"]][following\ 101 | -sibling::br]' 102 | match = response.xpath(regex).get() 103 | if match: 104 | meta["publisher"] = match 105 | return meta 106 | 107 | def set_publish_date(self, meta, response): 108 | regex = '//text()[preceding-sibling::span[text()="出版年:"]][following\ 109 | -sibling::br]' 110 | match = response.xpath(regex).get() 111 | if match: 112 | meta["publish_date"] = match 113 | return meta 114 | 115 | def set_pages(self, meta, response): 116 | regex = '//text()[preceding-sibling::span[text()="页数:"]][following\ 117 | -sibling::br]' 118 | match = response.xpath(regex).get() 119 | if match: 120 | meta["pages"] = match 121 | return meta 122 | 123 | def set_price(self, meta, response): 124 | regex = '//text()[preceding-sibling::span[text()="定价:"]][following\ 125 | -sibling::br]' 126 | match = response.xpath(regex).get() 127 | if match: 128 | meta["price"] = match 129 | return meta 130 | 131 | def set_binding(self, meta, response): 132 | regex = '//text()[preceding-sibling::span[text()="装帧:"]][following\ 133 | -sibling::br]' 134 | match = response.xpath(regex).get() 135 | if match: 136 | meta["binding"] = match 137 | return meta 138 | 139 | def set_isbn(self, meta, response): 140 | regex = '//text()[preceding-sibling::span[text()="ISBN:"]][following\ 141 | -sibling::br]' 142 | match = response.xpath(regex).get() 143 | if match: 144 | meta["isbn"] = match 145 | return meta 146 | 147 | def set_score(self, meta, response): 148 | regex = '//strong[@property="v:average"]/text()' 149 | match = response.xpath(regex).get() 150 | if match: 151 | score = match.strip() 152 | if score: 153 | meta["douban_score"] = score 154 | return meta 155 | 156 | def set_votes(self, meta, response): 157 | regex = '//span[@property="v:votes"]/text()' 158 | match = response.xpath(regex).get() 159 | if match: 160 | votes = match.strip() 161 | if votes: 162 | meta["douban_votes"] = votes 163 | return meta 164 | 165 | def set_tags(self, meta, response): 166 | regex = '//a[@class=" tag"]/text()' 167 | matches = response.xpath(regex).getall() 168 | if matches: 169 | meta["tags"] = "/".join((i.strip() for i in matches)) 170 | return meta 171 | 172 | def parse(self, response): 173 | meta = BookMeta() 174 | self.set_douban_id(meta, response) 175 | self.set_cover(meta, response) 176 | self.set_name(meta, response) 177 | self.set_sub_name(meta, response) 178 | self.set_alt_name(meta, response) 179 | self.set_summary(meta, response) 180 | self.set_author(meta, response) 181 | self.set_author_intro(meta, response) 182 | self.set_translator(meta, response) 183 | self.set_series(meta, response) 184 | self.set_publisher(meta, response) 185 | self.set_publish_date(meta, response) 186 | self.set_pages(meta, response) 187 | self.set_price(meta, response) 188 | self.set_binding(meta, response) 189 | self.set_isbn(meta, response) 190 | 
self.set_score(meta, response) 191 | self.set_votes(meta, response) 192 | self.set_tags(meta, response) 193 | self.set_slug(meta, response) 194 | return meta 195 | -------------------------------------------------------------------------------- /scrapy/douban/spiders/book_subject.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | from douban.items import Subject 5 | from scrapy.linkextractors import LinkExtractor 6 | from scrapy.spiders import CrawlSpider, Request, Rule 7 | 8 | 9 | class BookSubjectSpider(CrawlSpider): 10 | name = "book_subject" 11 | user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko" 12 | allowed_domains = ["book.douban.com"] 13 | start_urls = ["https://book.douban.com/subject/26628811/"] 14 | rules = ( 15 | Rule( 16 | LinkExtractor(allow=("https://book.douban.com/subject/(\\d)+/$")), 17 | callback="parse_item", 18 | follow=True, 19 | process_request="cookie", 20 | ), 21 | ) 22 | 23 | def cookie(self, request, response): 24 | bid = "".join(random.choice(string.ascii_letters + string.digits) for x in range(11)) 25 | request.cookies["bid"] = bid 26 | request = request.replace(url=request.url.replace("?", "/?")) 27 | return request 28 | 29 | def start_requests(self): 30 | for url in self.start_urls: 31 | bid = "".join(random.choice(string.ascii_letters + string.digits) for x in range(11)) 32 | yield Request(url, cookies={"bid": bid}) 33 | 34 | def set_douban_id(self, subject, response): 35 | subject["douban_id"] = response.url[32:-1] 36 | return subject 37 | 38 | def parse_item(self, response): 39 | subject = Subject() 40 | self.set_douban_id(subject, response) 41 | subject["type"] = "book" 42 | return subject 43 | -------------------------------------------------------------------------------- /scrapy/douban/spiders/movie_comment.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import douban.database as db 4 | from douban.items import Comment 5 | from scrapy import Request, Spider 6 | 7 | cursor = db.connection.cursor() 8 | 9 | 10 | class MovieCommentSpider(Spider): 11 | name = "movie_comment" 12 | allowed_domains = ["movie.douban.com"] 13 | 14 | def start_requests(self): 15 | sql = "SELECT douban_id FROM movies WHERE douban_id NOT IN \ 16 | (SELECT douban_id FROM comments GROUP BY douban_id) ORDER BY douban_id DESC" 17 | cursor.execute(sql) 18 | movies = cursor.fetchall() 19 | baseurl = "https://m.douban.com/rexxar/api/v2/movie/%s/interests?count=5&order_by=hot" 20 | referer = "https://m.douban.com/movie/subject/%s/?from=showing" 21 | for movie in movies: 22 | yield Request( 23 | baseurl % movie["douban_id"], headers={"Referer": referer % movie["douban_id"]}, 24 | ) 25 | 26 | def parse(self, response): 27 | douban_id = response.url.split("/")[-2] 28 | items = json.loads(response.body)["interests"] 29 | for item in items: 30 | comment = Comment() 31 | comment["douban_id"] = douban_id 32 | comment["douban_comment_id"] = item["id"] 33 | comment["douban_user_nickname"] = item["user"]["name"] 34 | comment["douban_user_avatar"] = item["user"]["avatar"] 35 | comment["douban_user_url"] = item["user"]["url"] 36 | comment["content"] = item["comment"] 37 | comment["votes"] = item["vote_count"] 38 | yield comment 39 | -------------------------------------------------------------------------------- /scrapy/douban/spiders/movie_meta.py: -------------------------------------------------------------------------------- 1 
| import douban.database as db 2 | import douban.util as util 3 | import douban.validator as validator 4 | from douban.items import MovieMeta 5 | from scrapy import Spider 6 | 7 | cursor = db.connection.cursor() 8 | 9 | 10 | class MovieMetaSpider(Spider): 11 | name = "movie_meta" 12 | allowed_domains = ["movie.douban.com"] 13 | user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko" 14 | sql = 'SELECT * FROM subjects WHERE type="movie" AND douban_id NOT IN \ 15 | (SELECT douban_id FROM movies) ORDER BY douban_id DESC' 16 | cursor.execute(sql) 17 | movies = cursor.fetchall() 18 | start_urls = ("https://movie.douban.com/subject/%s/" % i["douban_id"] for i in movies) 19 | 20 | def set_douban_id(self, meta, response): 21 | meta["douban_id"] = response.url[33:-1] 22 | return meta 23 | 24 | def set_type(self, meta, response): 25 | regex = '//text()[preceding-sibling::span[text()="集数:"]][fo\ 26 | llowing-sibling::br]' 27 | match = response.xpath(regex).get() 28 | if match: 29 | meta["type"] = "tv" 30 | else: 31 | meta["type"] = "movie" 32 | return meta 33 | 34 | def set_cover(self, meta, response): 35 | regex = '//img[@rel="v:image"]/@src' 36 | match = response.xpath(regex).get() 37 | if match: 38 | meta["cover"] = match.replace("s_ratio_poster", "l_ratio_poster") 39 | else: 40 | meta["cover"] = "" 41 | return meta 42 | 43 | def set_name(self, meta, response): 44 | regex = "//title/text()" 45 | match = response.xpath(regex).get() 46 | if match: 47 | meta["name"] = match[:-5].strip() 48 | return meta 49 | 50 | def set_slug(self, meta, response): 51 | meta["slug"] = util.shorturl(meta["douban_id"]) 52 | return meta 53 | 54 | def set_year(self, meta, response): 55 | regex = '//span[@class="year"]/text()' 56 | match = response.xpath(regex).get() 57 | if match: 58 | meta["year"] = validator.match_year(match) 59 | return meta 60 | 61 | def set_directors(self, meta, response): 62 | regex = '//a[@rel="v:directedBy"]/text()' 63 | matches = response.xpath(regex).getall() 64 | meta["directors"] = validator.process_slash_str("/".join(matches)) 65 | return meta 66 | 67 | def set_writers(self, meta, response): 68 | regex = '//span[preceding-sibling::span[text()="编剧"]]/a/text()' 69 | matches = response.xpath(regex).getall() 70 | meta["writers"] = validator.process_slash_str("/".join(matches)) 71 | return meta 72 | 73 | def set_actors(self, meta, response): 74 | regex = '//a[@rel="v:starring"]/text()' 75 | matches = response.xpath(regex).getall() 76 | meta["actors"] = validator.process_slash_str("/".join(matches)) 77 | return meta 78 | 79 | def set_genres(self, meta, response): 80 | regex = '//span[@property="v:genre"]/text()' 81 | matches = response.xpath(regex).getall() 82 | meta["genres"] = "/".join(matches) 83 | return meta 84 | 85 | def set_official_site(self, meta, response): 86 | regex = '//a[preceding-sibling::span[text()="官方网站:"]][following-si\ 87 | bling::br]/@href' 88 | match = response.xpath(regex).get() 89 | if match: 90 | meta["official_site"] = validator.process_url(match) 91 | return meta 92 | 93 | def set_regions(self, meta, response): 94 | regex = '//text()[preceding-sibling::span[text()="制片国家/地区:"]][fo\ 95 | llowing-sibling::br]' 96 | match = response.xpath(regex).get() 97 | if match: 98 | meta["regions"] = match 99 | return meta 100 | 101 | def set_languages(self, meta, response): 102 | regex = '//text()[preceding-sibling::span[text()="语言:"]][following-s\ 103 | ibling::br]' 104 | match = response.xpath(regex).get() 105 | if match: 106 | meta["languages"] = match 
107 | return meta 108 | 109 | def set_release_date(self, meta, response): 110 | regex = '//span[@property="v:initialReleaseDate"]/@content' 111 | match = response.xpath(regex).get() 112 | if match: 113 | release_date = validator.str_to_date(validator.match_date(match)) 114 | if release_date: 115 | meta["release_date"] = release_date 116 | return meta 117 | 118 | def set_runtime(self, meta, response): 119 | regex = '//span[@property="v:runtime"]/@content' 120 | match = response.xpath(regex).get() 121 | if match: 122 | meta["mins"] = match 123 | return meta 124 | 125 | def set_alias(self, meta, response): 126 | regex = '//text()[preceding-sibling::span[text()="又名:"]][following-s\ 127 | ibling::br]' 128 | match = response.xpath(regex).get() 129 | if match: 130 | meta["alias"] = validator.process_slash_str(match) 131 | return meta 132 | 133 | def set_imdb_id(self, meta, response): 134 | regex = '//a[preceding-sibling::span[text()="IMDb链接:"]][following-si\ 135 | bling::br]/@href' 136 | match = response.xpath(regex).get() 137 | if match: 138 | meta["imdb_id"] = match.strip().split("?")[0][27:] 139 | return meta 140 | 141 | def set_score(self, meta, response): 142 | regex = '//strong[@property="v:average"]/text()' 143 | match = response.xpath(regex).get() 144 | if match: 145 | meta["douban_score"] = match 146 | return meta 147 | 148 | def set_votes(self, meta, response): 149 | regex = '//span[@property="v:votes"]/text()' 150 | match = response.xpath(regex).get() 151 | if match: 152 | meta["douban_votes"] = match 153 | return meta 154 | 155 | def set_tags(self, meta, response): 156 | regex = '//div[@class="tags-body"]/a/text()' 157 | matches = response.xpath(regex).getall() 158 | meta["tags"] = "/".join(matches) 159 | return meta 160 | 161 | def set_storyline(self, meta, response): 162 | regex = '//span[@class="all hidden"]/text()' 163 | matches = response.xpath(regex).getall() 164 | if matches: 165 | meta["storyline"] = "