├── .gitignore ├── README.md ├── beat.py ├── caoe.py ├── conf.py ├── control.py ├── init.py ├── log.py ├── models.py ├── parse.py ├── requirements.txt ├── schedulers.py ├── show_log.py ├── spider.py ├── test.py ├── utils.py └── worker.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Mtime 2 | ===== 3 | 4 | A spider... ^.^ 5 | 6 | 7 | #### 这是爬取Mtime时光网的爬虫,用到的技术: 8 | 9 | 1. Mongodb 10 | 2. mongoengine 11 | 12 | 13 | #### 本项目特性: 14 | 15 | 1. 支持随机UA 16 | 2. 构造和Mtime相同的头信息 17 | 3. 爬取间隔自适应(爬取限制自动增加间隔,恢复正常自动恢复间隔) 18 | 4. 较详细的日志 19 | 5. 支持daemon方式启动 20 | 21 | #### 各文件作用 22 | 23 | 1. conf.py # 相关mtime设置的api地址,数据库地址,爬取间隔等设置 24 | 2. beat.py # MQ的任务生产者,通过mtime的搜索接口根据年代遍历,将要爬取的电影的唯一ID列表放到我写的一个简单的消息队列(mongodb) 25 | 3. worker.py # 使用多进程池类的map方法,模拟多进程并发消费MQ. 每个消息对应不同的爬取任务和爬取的电影IDs 26 | 4. control.py # 可以将程序放到后台,提供一个类似start/restart/stop模式的功能. 实现简单的crontab 27 | 5. init.py # 项目开始前执行的初始化,生成beat和worker的执行间隔(在conf.py配置),被他们读取和修改 28 | 6. models.py # mongodb存储爬下来的电影数据模型 29 | 7. parse.py # 页面解析 30 | 8. schedulers.py # 任务执行的数据模型 31 | 9. show_log.py # 将分布式机器的日志通过一个统一的socket接口汇集起来 32 | 10. spider.py # 页面爬取 33 | 11. utils.py # 功能函数 34 | 12. caoe.py # 豆瓣的CaoE, 父进程死掉后帮助杀掉子进程 35 | 36 | #### 爬取流程 37 | 38 | 1. beat.py 按年份获取每年要爬的电影id和库内本年的id取差集,将要爬的放到mongodb的MQ 39 | 2. worker.py定时从数据库取要爬的电影MQ. 40 | 3. 
从parse.py里面找到实际爬取本次任务的Parse类 41 | 4. 调用spider.py中对应本次任务的Spider类爬取页面分析 42 | 5. 使用Xpath解析页面获得分析后的结果 43 | 6. 获取models里面入库的模型save之 44 | 7. 根据上面5 获取的数据对AliasName类也去重复累加 45 | 8. 一次任务完成. 继续重复2 46 | 47 | 48 | #### 使用 49 | 50 | 1. 初始化任务 51 | 52 | ``` 53 | $pip install -r requirements.txt 54 | $python init.py 55 | ``` 56 | 57 | 2. 产生任务(全局只需要一个) 58 | 59 | ``` 60 | $python beat.py start 61 | ``` 62 | 63 | ####. 分布式跑任务的每个服务器跑一个worker程序 64 | 65 | ``` 66 | python work.py start 67 | ``` 68 | -------------------------------------------------------------------------------- /beat.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | ''' 3 | 把需要抓取的电影id发送给MQ, 他是一切任务的生成源 4 | ''' 5 | from mongoengine.errors import NotUniqueError 6 | 7 | from parse import get_movie_ids, get_movie_pages 8 | from utils import get_unfinished, group, sleep2 9 | from spider import Search 10 | from conf import (SEARCH_PAGE, SEARCH_API, MIN_YEAR, TASK_BEAT_NUM, TASK_BEAT, 11 | VERIFY_INTERVAL) 12 | from models import YearFinished, IdFinished 13 | from schedulers import Message 14 | from control import Scheduler, periodic, run 15 | from log import error, debug, warn 16 | 17 | scheduler = Scheduler('beat') 18 | 19 | 20 | def get_year(): 21 | '''根据年份从前向后,获取当前要执行的第一个年份(min)''' 22 | obj = YearFinished.objects 23 | if obj: 24 | c_year = obj.first() 25 | return c_year.year 26 | else: 27 | return MIN_YEAR - 1 28 | 29 | 30 | def fetch(year, page): 31 | s = Search(params={'Ajax_CallBack': True, 32 | 'Ajax_CallBackType': 'Mtime.Channel.Pages.SearchService', # noqa 33 | 'Ajax_CallBackMethod': 'SearchMovieByCategory', 34 | 'Ajax_CrossDomain': 1, 35 | 'Ajax_CallBackArgument10': year, 36 | 'Ajax_CallBackArgument14': '1', 37 | 'Ajax_CallBackArgument16': '1', 38 | 'Ajax_CallBackArgument18': page, 39 | 'Ajax_CallBackArgument19': '1', 40 | 'Ajax_CallBackArgument9': year, 41 | 'Ajax_CallBackArgument17': 8, 42 | 'Ajax_CallBackArgument8': '', 43 | 'Ajax_RequestUrl': 
SEARCH_PAGE.format(year=year) 44 | }) 45 | s.fetch(SEARCH_API) 46 | return s 47 | 48 | 49 | def mtime_beat(): 50 | '''每次任务只跑一年的''' 51 | y_list = [] 52 | y = get_year() + 1 # 要抓取的年份 53 | debug('Fetch Year: {} starting...'.format(y)) 54 | instance = fetch(y, 1) 55 | page = get_movie_pages(instance) 56 | if page is None: 57 | warn('Movie"page has not fetched') 58 | # 执行间隔自适应 59 | if scheduler.get_interval < TASK_BEAT * 7: 60 | scheduler.change_interval(incr=True) 61 | return 62 | ids = get_movie_ids(instance) 63 | if ids is None: 64 | # 间隔自适应也不能太大 65 | warn('Movie has not fetched') 66 | if scheduler.get_interval < TASK_BEAT * 7: 67 | scheduler.change_interval(incr=True) 68 | return 69 | # 当任务继续能执行的时候,回到默认的间隔 70 | if scheduler.get_interval > TASK_BEAT: 71 | debug('Interval back to default') 72 | scheduler.change_interval(TASK_BEAT) 73 | y_list.extend(ids) 74 | if not y_list: 75 | # 本年没有电影 76 | debug('Year: {} has not movie'.format(y)) 77 | YearFinished(year=y).save() 78 | sleep2() 79 | return mtime_beat() 80 | if page > 1: 81 | p = 2 82 | while p <= page: 83 | instance = fetch(y, p) 84 | debug('Fetch Year:{} Page:{}'.format(y, p)) 85 | ids = get_movie_ids(instance) 86 | if ids is None: 87 | # 间隔自适应也不能太大 88 | if scheduler.get_interval < TASK_BEAT * 7: 89 | scheduler.change_interval(incr=True) 90 | # 出现需要验证码 手动输入或者等待一段时间后重试,直到能正常使用 91 | sleep2(VERIFY_INTERVAL) 92 | continue 93 | ids = [] 94 | y_list.extend(ids) 95 | p += 1 96 | sleep2() 97 | obj = IdFinished.objects(year=y).first() 98 | if obj is not None: 99 | has_finished = obj.ids 100 | else: 101 | has_finished = [] 102 | to_process = get_unfinished(has_finished, y_list) 103 | # 给相应队列添加任务 104 | for payload in group(to_process, TASK_BEAT_NUM): 105 | for task in ['Fullcredits', 'Movie', 'Comment', 'Character', 106 | 'MicroComment', 'Scenes', 'Awards', 'Plot', 107 | 'Details']: 108 | debug('Push payload: {} to {} Queue'.format(payload, task)) 109 | try: 110 | Message(year=y, task=task, payload=payload).save() 111 | # 
Hack一下 112 | #Message.objects.get_or_create(year=y, task=task, payload=payload) 113 | except NotUniqueError: 114 | debug('Duplicate insert: [{}], payload: {}'.format(task, payload)) 115 | # 当前年份数据已经入MQ 116 | YearFinished(year=y).save() 117 | debug('Year: {} done'.format(y)) 118 | 119 | 120 | def main(): 121 | periodic(scheduler, mtime_beat) 122 | scheduler.run() 123 | 124 | 125 | if __name__ == '__main__': 126 | # 假如有各种奇怪的问题,可以使用下注释的不放在后台 127 | #main() 128 | run(main, __file__) 129 | -------------------------------------------------------------------------------- /caoe.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import os 3 | import sys 4 | import time 5 | from signal import signal, SIGINT, SIGQUIT, SIGTERM, SIGCHLD, SIGHUP, pause, SIG_DFL 6 | 7 | __all__ = ['install'] 8 | 9 | 10 | def install(fork=True, sig=SIGTERM): 11 | def _reg(gid): 12 | handler = make_quit_signal_handler(gid, sig) 13 | signal(SIGINT, handler) 14 | signal(SIGQUIT, handler) 15 | signal(SIGTERM, handler) 16 | signal(SIGCHLD, make_child_die_signal_handler(gid, sig)) 17 | 18 | if not fork: 19 | _reg(os.getpid()) 20 | return 21 | 22 | pid = os.fork() 23 | if pid == 0: 24 | # child process 25 | os.setpgrp() 26 | pid = os.fork() 27 | if pid != 0: 28 | exit_when_parent_or_child_dies(sig) 29 | else: 30 | # parent process 31 | gid = pid 32 | _reg(gid) 33 | while True: 34 | pause() 35 | 36 | 37 | def make_quit_signal_handler(gid, sig=SIGTERM): 38 | def handler(signum, frame): 39 | signal(SIGTERM, SIG_DFL) 40 | try: 41 | os.killpg(gid, sig) 42 | except os.error as ex: 43 | if ex.errno != errno.ESRCH: 44 | raise 45 | return handler 46 | 47 | 48 | def make_child_die_signal_handler(gid, sig=SIGTERM): 49 | def handler(signum, frame): 50 | try: 51 | pid, status = os.wait() 52 | except OSError: 53 | # sometimes there is no child processes already 54 | status = 0 55 | 56 | try: 57 | signal(SIGTERM, SIG_DFL) 58 | os.killpg(gid, sig) 59 | finally: 60 | 
sys.exit((status & 0xff00) >> 8) 61 | return handler 62 | 63 | 64 | def exit_when_parent_or_child_dies(sig): 65 | gid = os.getpgrp() 66 | signal(SIGCHLD, make_child_die_signal_handler(gid)) 67 | 68 | try: 69 | import prctl 70 | signal(SIGHUP, make_quit_signal_handler(gid)) 71 | # give me SIGHUP if my parent dies 72 | prctl.set_pdeathsig(SIGHUP) 73 | pause() 74 | 75 | except ImportError: 76 | # fallback to polling status of parent 77 | while True: 78 | if os.getppid() == 1: 79 | # parent died, suicide 80 | signal(SIGTERM, SIG_DFL) 81 | os.killpg(gid, sig) 82 | sys.exit() 83 | time.sleep(5) 84 | -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | '''所有配置''' 3 | 4 | # MONGODB数据库 5 | HOST = '127.0.0.1' 6 | PORT = 27017 7 | DATABASE = 'mtime' 8 | 9 | # 爬取页面间的间隔, 单位s 10 | INTERVAL = 8 11 | # 提示验证码的重试间隔 12 | VERIFY_INTERVAL = 1200 13 | 14 | # 网卡 15 | IFNAME = 'eth0' 16 | 17 | # 接收日志的服务器IP 18 | SERVER_HOST = '127.0.0.1' 19 | 20 | # 爬取的年份设置 21 | MIN_YEAR = 1194 22 | 23 | # 任务周期设置, 单位s 24 | TASK_BEAT = 600 25 | # 每个任务被分配的电影数 26 | TASK_BEAT_NUM = 20 27 | # 每个worker爬取任务的间隔 单位s 28 | TASK_WORKER = 60 29 | 30 | # 电影查询, 根据年代, 电影名 31 | SEARCH_PAGE = 'http://movie.mtime.com/movie/search/section/#sortType=8&viewType=1&year={year}' # noqa 32 | 33 | # MTIME的搜索结果是通过api和javascript动态添加的 34 | SEARCH_API = 'http://service.channel.mtime.com/service/search.mcs' 35 | 36 | # 获取电影基本信息 37 | MOVIE_API = 'http://service.mtime.com/database/databaseService.m' 38 | 39 | MOVIE_PAGE = 'http://http://movie.mtime.com/{id}/&t={timestamp}' 40 | 41 | # 获取评论的评论转发赞 42 | 43 | COMMENT_API = 'http://service.library.mtime.com/Movie.api' 44 | -------------------------------------------------------------------------------- /control.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | '''调度的抽象''' 4 | import os 5 | 
import sys
import time
import heapq
import atexit
from datetime import timedelta, datetime
from collections import namedtuple
from signal import SIGKILL
from os.path import join, splitext, basename

from schedulers import Task
from log import info, debug

# A scheduled event: when to fire, tie-break priority, callback and its args.
Event = namedtuple('Event', 'time priority action argument')


# TODO priority
class Scheduler(object):

    '''Heap-based periodic scheduler backed by a mongodb Task document.

    The run interval is persisted in the Task document (created by init.py)
    so that beat/worker processes can adapt it at runtime via
    ``change_interval`` -- e.g. back off when mtime starts throttling.
    '''

    def __init__(self, task_name):
        # task_name: 'beat' or 'worker'; the Task document must already
        # exist (init.py creates it), otherwise self.task is None.
        self._queue = []
        self.task = Task.objects(type=task_name).first()

    def change_interval(self, interval=None, incr=False, decr=False):
        '''Change the execution interval of this task.

        ``incr`` doubles the current interval (back-off on throttling),
        ``decr`` halves it; otherwise the explicit ``interval`` is stored.
        '''
        if incr:
            interval = self.task.interval * 2
        elif decr:
            interval = self.task.interval / 2
        if interval is None:
            # No change requested: keep the stored value instead of
            # clobbering the DB interval with None (bug in the original,
            # which also had a no-op "interval = interval" branch here).
            debug('change_interval called without a change; keeping ' +
                  str(self.task.interval))
            return
        debug('Change interval to: ' + str(interval))
        self.task.update(set__interval=interval)

    @property
    def get_interval(self):
        '''Current execution interval in seconds, as stored in the DB.'''
        return self.task.interval

    def start(self, priority, action, argument):
        '''Queue ``action(*argument)`` to fire one interval from now.

        Returns the queued Event (mirrors stdlib sched.scheduler.enter).
        '''
        now = datetime.now()
        next_time = now + timedelta(seconds=self.task.interval)
        event = Event(next_time, priority, action, argument)
        heapq.heappush(self._queue, event)
        return event

    def run(self):
        '''Run queued events until the queue is empty.

        Structure mirrors stdlib sched.scheduler.run: peek the heap head,
        sleep until it is due, then pop and execute it.
        '''
        q = self._queue
        pop = heapq.heappop
        s = time.sleep
        while q:
            next_time, priority, action, argument = checked_event = q[0]
            now = datetime.now()
            if now < next_time:
                s((next_time - now).total_seconds())
            else:
                event = pop(q)
                if event is checked_event:
                    debug('Excute Cron Starting...')
                    action(*argument)
                    debug('Excute Cron Success')
                    # remember when this task last ran (for ops/debugging)
                    self.task.update(set__last_run_at=now)
                    s(0)  # yield the CPU to other processes
                else:
                    # A more urgent event was queued while we slept;
                    # put this one back and re-examine the heap.
                    heapq.heappush(q, event)


def periodic(scheduler, action, actionargs=()):
    '''Run ``action`` once and re-arm it on ``scheduler`` (periodic cron).'''
    scheduler.start(1, periodic,
                    (scheduler, action, actionargs))
    action(*actionargs)
78 | 79 | 80 | class Daemon(object): 81 | 82 | '''将程序做成daemon版''' 83 | 84 | def __init__(self, run, pidfile=None, stdin='/dev/null', stdout=None, 85 | stderr=None, default=None): 86 | self.run = run 87 | self.stdin = stdin 88 | self.default = default 89 | self.here = os.path.abspath(os.path.dirname(__file__)) 90 | self.stdout = self.path(stdout) 91 | self.stderr = self.path(stderr) 92 | self.pidfile = self.path(pidfile, suffix='.pid') 93 | 94 | def path(self, std, suffix='.log'): 95 | return join(self.here, 'logs', 96 | splitext(basename(self.default))[0]) + suffix 97 | 98 | def daemonize(self): 99 | 100 | try: 101 | pid = os.fork() 102 | if pid > 0: 103 | sys.exit(0) 104 | except OSError, e: 105 | sys.stderr.write("fork #1 failed: %d (%s)\n" % 106 | (e.errno, e.strerror)) 107 | sys.exit(1) 108 | 109 | os.chdir(self.here) 110 | os.setsid() 111 | os.umask(022) 112 | try: 113 | pid = os.fork() 114 | if pid > 0: 115 | sys.exit(0) 116 | except OSError, e: 117 | sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, 118 | e.strerror)) 119 | sys.exit(1) 120 | if sys.platform != 'darwin': 121 | sys.stdout.flush() 122 | sys.stderr.flush() 123 | si = open(self.stdin, 'r') 124 | so = open(self.stdout, 'a+') 125 | se = open(self.stderr, 'a+', 0) 126 | os.dup2(si.fileno(), sys.stdin.fileno()) 127 | os.dup2(so.fileno(), sys.stdout.fileno()) 128 | os.dup2(se.fileno(), sys.stderr.fileno()) 129 | 130 | atexit.register(self.delpid) 131 | pid = str(os.getpid()) 132 | open(self.pidfile, 'w+').write("%s\n" % pid) 133 | 134 | def delpid(self): 135 | os.remove(self.pidfile) 136 | 137 | def start(self): 138 | pid = self.get_pid() 139 | if pid: 140 | message = "pidfile %s already exist. Daemon already running?\n" 141 | sys.stderr.write(message % self.pidfile) 142 | sys.exit(1) 143 | 144 | self.daemonize() 145 | self.run() 146 | 147 | def stop(self): 148 | pid = self.get_pid() 149 | if not pid: 150 | message = "pidfile %s does not exist. 
Daemon not running?\n" 151 | sys.stderr.write(message % self.pidfile) 152 | sys.exit(1) 153 | return 154 | 155 | try: 156 | while 1: 157 | os.kill(pid, SIGKILL) 158 | time.sleep(0.1) 159 | except OSError, err: 160 | err = str(err) 161 | if err.find("No such process") > 0: 162 | if os.path.exists(self.pidfile): 163 | os.remove(self.pidfile) 164 | else: 165 | print str(err) 166 | sys.exit(1) 167 | 168 | def restart(self): 169 | self.stop() 170 | self.start() 171 | 172 | def get_pid(self): 173 | try: 174 | with open(self.pidfile) as pf: 175 | pid = int(pf.read().strip()) 176 | except IOError: 177 | pid = None 178 | except SystemExit: 179 | pid = None 180 | return pid 181 | 182 | def alive(self): 183 | pid = self.get_pid() 184 | if pid is None: 185 | pid = 0 186 | if sys.platform != 'darwin': 187 | if os.path.exists('/proc/%d' % pid): 188 | print pid 189 | else: 190 | print 0 191 | else: 192 | print pid 193 | 194 | 195 | def run(main, default): 196 | daemon = Daemon(run=main, default=default) 197 | if len(sys.argv) == 2: 198 | arg = sys.argv[1] 199 | if arg in ['start', 'stop', 'restart', 'alive']: 200 | if arg != 'alive': 201 | info(sys.argv[0] + ' ' + arg) 202 | getattr(daemon, arg)() 203 | else: 204 | print "Unknown command" 205 | sys.exit(2) 206 | sys.exit(0) 207 | else: 208 | print "usage: %s start|stop|restart" % sys.argv[0] 209 | sys.exit(2) 210 | -------------------------------------------------------------------------------- /init.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from datetime import datetime 4 | 5 | from conf import TASK_BEAT, TASK_WORKER 6 | from schedulers import Task 7 | 8 | 9 | def init_task_db(): 10 | # init beat 11 | task = Task.objects.get_or_create(type='beat')[0] 12 | task.update(set__interval=TASK_BEAT, set__last_run_at=datetime.now()) 13 | worker = Task.objects.get_or_create(type='worker')[0] 14 | worker.update(set__interval=TASK_WORKER, set__last_run_at=datetime.now()) 
15 | 16 | 17 | def main(): 18 | init_task_db() 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /log.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | '''日志操作, 从我的项目中拷贝''' 3 | import struct 4 | import pickle 5 | import logging.handlers 6 | from functools import partial 7 | from logging import StreamHandler, Formatter, getLogger, DEBUG, makeLogRecord 8 | 9 | from conf import SERVER_HOST 10 | from utils import get_ip_address 11 | 12 | 13 | def logger(): 14 | '''设置logging记录日志''' 15 | FORMAT = '%(asctime)-15s %(clientip)-15s %(levelname)-8s %(module)-20s %(funcName)-15s %(message)s' # noqa 16 | DATE_FORMAT = '%Y-%m-%d %H:%M:%S' 17 | formatter = Formatter(fmt=FORMAT, datefmt=DATE_FORMAT) 18 | handler = StreamHandler() 19 | sockethandler = logging.handlers.SocketHandler(SERVER_HOST, 20 | logging.handlers.DEFAULT_TCP_LOGGING_PORT) # noqa 21 | handler.setFormatter(formatter) 22 | for_logger = getLogger('Tencent') 23 | for_logger.setLevel(DEBUG) 24 | for_logger.addHandler(handler) 25 | for_logger.addHandler(sockethandler) 26 | return for_logger 27 | 28 | # 添加自定义的客户端ip字段 29 | d = {'clientip': get_ip_address()} 30 | 31 | logger = logger() 32 | debug = partial(logger.debug, extra=d) 33 | info = partial(logger.info, extra=d) 34 | warn = partial(logger.warn, extra=d) 35 | # error类型的日志记录堆栈 36 | error = partial(logger.error, exc_info=1, extra=d) 37 | 38 | 39 | def handle_log(socket, address): 40 | '''搜集各client日志到服务端''' 41 | chunk = socket.recv(4) 42 | if len(chunk) < 4: 43 | return 44 | slen = struct.unpack('>L', chunk)[0] 45 | chunk = socket.recv(slen) 46 | while len(chunk) < slen: 47 | chunk = chunk + socket.recv(slen - len(chunk)) 48 | obj = pickle.loads(chunk) 49 | record = makeLogRecord(obj) 50 | name = record.name 51 | logger = getLogger(name) 52 | logger.handle(record) 53 | socket.close() 54 | 
-------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from datetime import datetime 3 | from mongoengine import * # noqa 4 | 5 | from conf import HOST, PORT, DATABASE 6 | 7 | connect(DATABASE, host=HOST, port=PORT) 8 | 9 | COLORS = ((0, u'黑白'), (1, '彩色')) 10 | 11 | 12 | class MtimeMixin(object): 13 | 14 | '''大部分模型都有movieid''' 15 | movieid = IntField(required=True) # 基础类 16 | 17 | 18 | class AliasName(Document): 19 | 20 | '''数据库中存在的名字和别名(英文名等)''' 21 | name = StringField(max_length=60, required=True) # 数据库中存在的名字 22 | alias = ListField(StringField(max_length=60, required=True)) # 这个人的别名 23 | 24 | 25 | class Actor(EmbeddedDocument): 26 | 27 | '''演员信息''' 28 | mid = IntField(default=0, required=True) # 演员链接的唯一ID 29 | poster = StringField(max_length=100) # 海报缩略图 30 | name = StringField(max_length=60, required=True) # 演员名字 31 | play = StringField(max_length=60, required=True) # 剧中人物 32 | 33 | 34 | class Director(EmbeddedDocument): 35 | 36 | '''导演信息''' 37 | mid = IntField(default=0) # 演员链接的唯一ID 38 | name = StringField(max_length=60) # 导演名字 39 | cnname = StringField(max_length=60) # 可能有中文翻译过来的名字 40 | poster = StringField(max_length=100) # 海报缩略图 41 | 42 | 43 | class Fullcredits(Document, MtimeMixin): 44 | 45 | '''演职员表''' 46 | director = ListField(EmbeddedDocumentField(Director)) # 导演 47 | writer = ListField(StringField(max_length=30, required=True)) # 编剧 48 | actor = ListField(EmbeddedDocumentField(Actor)) # 演员 49 | produced = ListField(StringField(max_length=60, required=True)) # 制作人 50 | originalmusic = ListField( 51 | StringField(max_length=60, required=True)) # 原创音乐 52 | cinematography = ListField(StringField(max_length=60, required=True)) # 摄影 53 | filmediting = ListField(StringField(max_length=60, required=True)) # 剪辑 54 | # casting = ListField(StringField(max_length=60, required=True)) # 选角导演 55 | # productiondesigner = 
ListField(StringField(max_length=60, 56 | # required=True)) # 艺术指导 57 | artdirection = ListField(StringField(max_length=60, required=True)) # 美术设计 58 | # setdecoration = ListField(StringField(max_length=60, required=True)) # 59 | # 布景师 60 | costumedesign = ListField( 61 | StringField(max_length=60, required=True)) # 服装设计 62 | # visualeffects = ListField(StringField(max_length=60, required=True)) # 63 | # 视觉特效 64 | assistantdirector = ListField( 65 | StringField(max_length=60, required=True)) # 副导演/助理导演 66 | 67 | 68 | class EmbeddedReleaseInfo(EmbeddedDocument): 69 | encountry = StringField(max_length=30, required=True) # 英文国家名 70 | cncountry = StringField(max_length=30, required=True) # 中文国家名 71 | releasetime = DateTimeField(default=datetime.now(), required=True) # 上映时间 72 | 73 | 74 | # 电影信息 75 | class Movie(Document, MtimeMixin): 76 | # name = StringField(max_length=30, required=True) # 电影名 77 | rating = FloatField(required=True) # 评分 78 | #evaluate = StringField(max_length=30, required=True) # 评价 79 | ratingcount = IntField(default=0, required=True) # 评分人数 80 | want = IntField(default=0, required=True) # 想看 81 | favorited = IntField(default=0, required=True) # 收藏数 82 | # poster = ListField(EmbeddedDocumentField(Poster)) # 海报缩略图 83 | #fullcredits = ReferenceField(Fullcredits) 84 | #details = ReferenceField(Details) 85 | #plot = ListField(EmbeddedDocumentField(Plot)) 86 | #awards = ListField(EmbeddedDocumentField(Awards)) 87 | #scenes = ReferenceField(Scenes) 88 | #company = ReferenceField(Company) 89 | 90 | 91 | class EmbeddedContent(EmbeddedDocument): 92 | type = StringField(max_length=10, required=True) # 比如文本,视频,图片, 内嵌 93 | content = StringField() # 内容 94 | 95 | 96 | class EmbeddedComment(EmbeddedDocument): 97 | name = StringField(max_length=30, required=True) # 发评论人 98 | commenter_url = StringField(max_length=100) # 评论人的url 99 | ac = IntField(default=0, required=True) # 点赞数 100 | rc = IntField(default=0, required=True) # 转发数 101 | cc = IntField(default=0, 
required=True) # 评论数 102 | url = StringField(max_length=100, required=True) # 原文url 103 | poster = StringField(max_length=100) # 原文的海报图 104 | image = StringField(max_length=120, required=True) # 评论人图片url 105 | title = StringField(max_length=60) # 标题 106 | score = FloatField() # 评分, 只是看过的人会评分,但不评分 107 | content = ListField(EmbeddedDocumentField(EmbeddedContent)) # 评论内容 108 | shortcontent = StringField(default='') # 评论内容的简略, 也就是mtime直接显示的那部分 109 | publishdate = DateTimeField(default=datetime.now()) # 发表时间 110 | 111 | meta = {'allow_inheritance': True} 112 | 113 | 114 | class EmbeddedMicroComment(EmbeddedComment): 115 | content = StringField() # 评论内容格式不同 116 | 117 | 118 | class Comment(Document, MtimeMixin): 119 | comments = ListField(EmbeddedDocumentField(EmbeddedComment)) # 长评 120 | 121 | 122 | class MicroComment(Document, MtimeMixin): 123 | microcomments = ListField( 124 | EmbeddedDocumentField(EmbeddedMicroComment)) # 微评 125 | 126 | 127 | class Company(EmbeddedDocument): 128 | 129 | '''制作/发行信息''' 130 | # release = ListField(StringField()) # 发行 131 | # make = ListField(StringField()) # 制作 132 | # stunt = ListField(StringField()) # 特技制作 133 | # other = ListField(StringField()) # 其他公司 134 | name = StringField(max_length=60, required=True) # 公司名字 135 | country = StringField(max_length=30) # 公司所在国家 136 | 137 | 138 | # Delete in next version 139 | class ScenesComment(EmbeddedDocument): 140 | content = StringField(required=True) # 评论内容 141 | who = StringField(max_length=30, required=True) # 评论者 142 | 143 | 144 | class Dialogue(EmbeddedDocument): 145 | endialogue = StringField(required=True) # 英文对白 146 | cndialogue = StringField(required=True) # 中文对白翻译 147 | 148 | # END 149 | # 幕后花絮 150 | 151 | 152 | class EmbeddedScenes(EmbeddedDocument): 153 | title = StringField(max_length=30, required=True) # 主题 154 | content = ListField(StringField()) 155 | 156 | 157 | class Scenes(Document, MtimeMixin): 158 | 159 | '''幕后揭秘 update:新版本很多字段都没有了''' 160 | #comment = 
ListField(EmbeddedDocumentField(ScenesComment)) 161 | # make = ListField(StringField()) # 幕后制作 162 | scene = ListField(EmbeddedDocumentField(EmbeddedScenes)) # 花絮 163 | #dialogue = ListField(EmbeddedDocumentField(Dialogue)) 164 | # goofs = ListField(StringField()) # 穿帮镜头 165 | 166 | 167 | # 获奖记录 168 | class Awardspeople(EmbeddedDocument): 169 | 170 | '''s实现起来比较麻烦, 展示没用''' 171 | name = ListField(StringField(max_length=60, required=True)) # 获奖或者提名人 172 | awardtype = StringField(max_length=30, required=True) # 具体奖项名字 比如最佳影片 173 | 174 | 175 | class Awardsinfo(EmbeddedDocument): 176 | type = StringField(max_length=30, required=True) # 提名或者获奖 177 | #peoples = ListField(StringField(max_length=30)) 178 | # Hacks: 内嵌的列表第一样式上面Awardspeople的name, 第二项是awardtype 179 | peoples = ListField(ListField(required=True)) # 获奖的人, 但不是必选,有些奖项是整个电影的成就 180 | 181 | 182 | class Oneawards(EmbeddedDocument): 183 | name = StringField(max_length=30, required=True) # 奖项名, 比如 奥斯卡金像奖 184 | period = IntField(required=True) # 届 185 | year = IntField(required=True) # 年份 186 | # nominatecount = IntField(required=True) # 提名的次数 这个其实可以根据具体情况计算 187 | # awardcount = IntField(required=True) # 获奖次数 188 | awards = ListField(EmbeddedDocumentField(Awardsinfo)) # 获奖的具体情况: 奖项-人物 189 | # nominate = ListField(EmbeddedDocumentField(Awardinfo)) # 提名的具体情况 190 | 191 | 192 | class Awards(Document, MtimeMixin): 193 | 194 | '''获奖记录''' 195 | awards = ListField(EmbeddedDocumentField(Oneawards)) 196 | # end 197 | 198 | 199 | class Plot(Document, MtimeMixin): 200 | 201 | '''剧情''' 202 | content = ListField(StringField()) # 剧情片段 203 | # publisher = StringField() # 发布者, 新版已经不存在 204 | # publishdate = DateTimeField(default=datetime.now(), required=True) # 205 | # 发布时间, 新版已经不存在 206 | 207 | 208 | class Details(Document, MtimeMixin): 209 | 210 | '''详细信息''' 211 | enalias = ListField(StringField()) # 中文片名 212 | cnalias = ListField(StringField()) # 外文片名 213 | # type = ListField(StringField()) # 电影类型 214 | time = StringField(max_length=60) 
# 片长 215 | # country = StringField(max_length=60, required=True) # 国家/地区 216 | language = ListField(StringField(max_length=10)) # 对白语言 217 | # color = StringField(required=True, choices=COLORS) # 色彩 218 | # format = StringField(max_length=30, required=True) # 幅面 219 | # mixin = ListField(StringField(max_length=20)) # 混音 220 | # mpaa = StringField() # MPAA评级 221 | # level = ListField(StringField(max_length=30)) # 级别 222 | cost = StringField() # 制作成本 223 | date = ListField(DateTimeField()) # 拍摄日期 224 | # camera = StringField() # 摄影机 225 | # filmformat = StringField() # 摄制格式 226 | # printformat = StringField() # 洗印格式 227 | release = ListField(EmbeddedDocumentField(EmbeddedReleaseInfo)) # 新增的发布情况 228 | publish = ListField(EmbeddedDocumentField(Company)) # 发行公司 229 | make = ListField(EmbeddedDocumentField(Company)) # 制作公司 230 | site = ListField(StringField(max_length=60, required=True)) # 官方网址 231 | # 关联电影? 232 | 233 | 234 | class IdFinished(Document, MtimeMixin): 235 | 236 | '''完成的电影ids, 防各种原因重新抓取''' 237 | year = IntField(required=True) # 分配的电影的年份 238 | ids = ListField(required=True) # 电影唯一id 239 | meta = { 240 | 'indexes': ['-year'] 241 | } 242 | 243 | 244 | class YearFinished(Document): 245 | 246 | '''完成的电影的年份''' 247 | year = IntField(required=True) # 完成的年份 248 | meta = { 249 | 'indexes': ['-year'], 250 | 'ordering': ['-year'] 251 | } 252 | 253 | 254 | class EmbeddedCharacter(EmbeddedDocument): 255 | 256 | '''角色介绍, 新版增加''' 257 | bigposter = StringField(max_length=100) # 角色大图, 小图在演职员表里面会记录 258 | name = StringField(max_length=30, required=True) # 角色 259 | introduction = StringField() # 角色介绍 260 | 261 | 262 | class Character(Document, MtimeMixin): 263 | character = ListField(EmbeddedDocumentField(EmbeddedCharacter)) 264 | -------------------------------------------------------------------------------- /parse.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | ''' 3 | 使用Xpath解析html页面 4 | ''' 5 | import re 6 | import 
copy 7 | from lxml import etree 8 | from collections import defaultdict 9 | from datetime import datetime 10 | from urllib2 import HTTPError 11 | 12 | from spider import Spider, Movie, Comment 13 | from conf import MOVIE_API, MOVIE_PAGE, COMMENT_API 14 | from log import debug, warn 15 | 16 | movie_regex = re.compile(r'http://movie.mtime.com/(\d+)/') 17 | people_regex = re.compile(r'http://people.mtime.com/(\d+)/') 18 | movie_page_regex = re.compile(r'pageindex=(\\)?"(\d+)(\\)?(\\)?"') 19 | awardinfo_regex = re.compile(ur'(\d+).*第(\d+)届') 20 | detail_country_regex = re.compile(r'\[(.*)\]') 21 | # 这是mtime的防爬后的提示关键句 22 | mtime_vcodeValid_regex = re.compile(r'\"vcodeValid\":false,\"isRobot\":true') 23 | 24 | date_regex = re.compile(ur'(\d+)年(\d+)月(\d+)日') 25 | name_regex = re.compile(ur'([\u4e00-\u9fa5]+)\s+(.*)') # 匹配中英文名字 26 | favoritedCount_regex = re.compile(r'\"favoritedCount\":(\d+),') 27 | rating_regex = re.compile(r'"rating":(\d.?\d),') 28 | ratingCount_regex = re.compile(r'"ratingCount":(\d+),') 29 | wantToSeeCount_regex = re.compile(r'"wantToSeeCount":(\d+),') 30 | comment_regex = re.compile( 31 | r'\"reviewPraiseCount\":\[(.*)\].*\"reviewPraiseStatus\".*\"reviewShareCount\":\[(.*)\].*\"reviewCommentCount\":\[(.*)\]') # noqa 32 | 33 | movie_url = 'http://movie.mtime.com/{}/{}' 34 | 35 | 36 | def make_datetime(text): 37 | '''通过中文类型的文本解析成datetime类型的日期结果''' 38 | make = lambda t: datetime(int(t[0]), int(t[1]), int(t[2])) 39 | t = date_regex.findall(text) 40 | if t: 41 | if len(t) == 1: 42 | return make(t[0]) 43 | else: 44 | return [make(i) for i in t] 45 | else: 46 | return datetime.now() 47 | 48 | 49 | class Parse(object): 50 | 51 | '''爬取标准类''' 52 | 53 | def __init__(self, movie_id): 54 | self.id = movie_id 55 | self._alias = defaultdict(set) 56 | self.set_url() 57 | self.d = defaultdict(list) 58 | 59 | def set_url(self, url): 60 | self.url = url 61 | self.original_url = url # 其中获取评论页或自动跳转走,这里保留原url供解析下一页使用 62 | 63 | def xpath(self): 64 | raise 
NotImplementedError() 65 | 66 | @property 67 | def alias(self): 68 | '''别名系统''' 69 | return self._alias 70 | 71 | def spider(self): 72 | # 请求头增加cc 73 | s = Spider(additional_headers={'Cache-Control': 'max-age=0'}) 74 | try: 75 | s.fetch(self.url) 76 | except HTTPError as e: 77 | # 检查该电影相关页面是否存在 78 | if e.msg == 'Not Found': 79 | return 80 | # 因为中文被编码成utf-8之后变成'/u2541'之类的形式,lxml一遇到"/"就会认为其标签结束 81 | return etree.HTML(s.content.decode('utf-8')) 82 | 83 | def __call__(self): 84 | '''调用类''' 85 | self.page = self.spider() 86 | if self.page is None: 87 | return 88 | hasnext = self.xpath() is not None 89 | self.d['movieid'] = self.id 90 | return self.d, hasnext 91 | 92 | def check_next_page(self): 93 | '''检查是否有下一页''' 94 | return self.page.xpath('//a[@id=\"key_nextpage\\"]') 95 | 96 | 97 | # Delete in next version 98 | class ReleaseInfoParse(Parse): 99 | 100 | '''新版(2014, 3, 17)发行数据已经合并到Details里面''' 101 | 102 | def set_url(self): 103 | self.url = movie_url.format(self.id, 'releaseinfo.html') 104 | 105 | def xpath(self): 106 | all = self.page.xpath('//dl[@class="release_date_list"]/dd') 107 | for elem in all: 108 | en = elem.xpath('span/a')[0].text 109 | cn = elem.xpath('span/em')[0].text 110 | date = elem.xpath('span[@class="date"]')[0].text 111 | match = date_regex.search(date) 112 | if match: 113 | t = match.groups() 114 | date = datetime(int(t[0]), int(t[1]), int(t[2])) 115 | else: 116 | date = datetime.now() 117 | self.d['country'] += [{'encountry': en, 'cncountry': cn, 118 | 'releasetime': date}] 119 | 120 | # END 121 | 122 | 123 | class DetailsParse(Parse): 124 | 125 | def set_url(self): 126 | self.url = movie_url.format(self.id, 'details.html') 127 | 128 | def xpath(self): 129 | part = self.page.xpath('//dl[@class="wp50 fl"]') 130 | # 第一部分是中文外文数据 131 | aliases = part[0].xpath('dd') 132 | cnalias = [a.text.strip() for a in aliases[0].xpath('p')] 133 | enalias = [a.text.strip() for a in aliases[1].xpath('p')] 134 | try: 135 | time = aliases[2].xpath('p')[0].text 136 | 
date = make_datetime(other[1].xpath('p')[0].text) 137 | language = (i.text.encode('utf-8').replace('/', '').strip() 138 | for i in other[2].xpath('p/a')) 139 | site = [other[3].xpath('p/a')[0].text, # 官网缩写 140 | other[3].xpath('p/a')[0].attrib['href']] # 官网url 141 | except IndexError: 142 | warn('{} has not some info'.format(self.id)) 143 | # 制作成本, 拍摄日期等数据 144 | other = part[1].xpath('dd') 145 | cost = other[0].xpath('p')[0].text 146 | # 发行信息 147 | part = self.page.xpath( 148 | '//dl[@id="releaseDateRegion"]/dd//div/ul/li/div[@class="countryname"]/p/span') # noqa 149 | release = [] 150 | for p in part: 151 | encountry = p.text.strip() 152 | cncountry = p.getparent().text.strip() 153 | time_text = p.getparent().getparent().getparent().xpath( 154 | 'div[@class=\"datecont\"]')[0].text 155 | releasetime = make_datetime(time_text) 156 | release.append({'encountry': encountry, 'cncountry': cncountry, 157 | 'releasetime': releasetime}) 158 | part = self.page.xpath( 159 | '//dl[@id="companyRegion"]/dd/div/div[@class="fl wp49"]') 160 | detail = defaultdict(list) 161 | for p in part: 162 | if p.xpath('h4')[0].text == u'制作公司': 163 | cur_type = 'make' 164 | else: 165 | cur_type = 'publish' 166 | for p2 in p.xpath('ul/li'): 167 | name = p2.xpath('a')[0].text 168 | country_info = p2.xpath('span')[0].text 169 | match = detail_country_regex.findall(country_info) 170 | if match: 171 | detail[cur_type] += [{'name': name, 'country': match[0]}] 172 | else: 173 | detail[cur_type] += [{'name': name}] 174 | # details = {'enalias': enalias, 'cnalias': cnalias, 'time': time, 175 | # 'language': language, 'cost': cost, 'date': date, 176 | # 'release': release, 'site': site} 177 | d = locals() 178 | d.pop('self') 179 | detail.update(d) 180 | self.d.update(detail) 181 | 182 | 183 | class AwardsParse(Parse): 184 | 185 | def set_url(self): 186 | self.url = movie_url.format(self.id, 'awards.html') 187 | 188 | def xpath(self): 189 | all = self.page.xpath('//div[@id="awardInfo_data"]/dd') 190 | 
for elem in all: 191 | name = elem.xpath('h3/b')[0].text 192 | info = defaultdict(list) 193 | year, period, awards = 0, 0, '未知' 194 | try: 195 | yp = elem.xpath('h3/span/a')[0].text 196 | except: 197 | # 可能获了一个大奖的好几届的奖 198 | for e in elem.xpath('dl/child::*'): 199 | if e.tag == 'dt': 200 | if info: 201 | # 因为是一个dl里面包含多个年份届数的数据, 都要独立提交 202 | self.d['awards'] += [dict( 203 | name=name, year=year, period=period, 204 | awards=awards)] 205 | info = defaultdict(list) 206 | 207 | if e.attrib.get('style'): 208 | yp = e.xpath('a')[0].text 209 | year, period = awardinfo_regex.findall(yp)[0] 210 | else: 211 | cur_type = e.text 212 | elif e.tag == 'dd': 213 | awardtype = e.xpath('span')[0].text 214 | try: 215 | people = e.xpath('a')[0].text 216 | except IndexError: 217 | people = '' 218 | info[cur_type] += [(people, awardtype)] 219 | else: 220 | year, period = awardinfo_regex.findall(yp)[0] 221 | for e in elem.xpath('dl/child::*'): 222 | if e.tag == 'dt': 223 | cur_type = e.text 224 | elif e.tag == 'dd': 225 | awardtype = e.xpath('span')[0].text 226 | try: 227 | people = e.xpath('a')[0].text 228 | except IndexError: 229 | people = '' 230 | info[cur_type] += [(people, awardtype)] 231 | awards = [] 232 | for k, v in info.items(): 233 | awards.append(dict(type=k, peoples=v)) 234 | self.d['awards'] += [dict(name=name, year=year, period=period, 235 | awards=awards)] 236 | 237 | 238 | class CommentParse(Parse): 239 | 240 | def set_url(self): 241 | self.url = movie_url.format(self.id, 'comment.html') 242 | 243 | def xpath(self): 244 | all = self.page.xpath('//dl[@class="clearfix"]') 245 | # 变态的mtime获取评论的方法是通过api服务 246 | blogids = [i.attrib['blogid'] 247 | for i in self.page.xpath('//div[@class=\"db_comtool\"]')] 248 | s = Comment(params={'Ajax_CallBackArgument0': ','.join(blogids), 249 | 'Ajax_CallBackArgument1': '', 250 | 'Ajax_RequestUrl': self.url}) 251 | s.fetch(COMMENT_API) 252 | comment_api = comment_regex.findall(s.content) 253 | for index, i in enumerate(all): 254 | comments 
= i.xpath('dd[@class=\"comboxcont\"]/div') 255 | if not comments: 256 | # 奇怪的是,第一个不是我要的div 257 | continue 258 | hasposter = i.xpath('div[@class=\"fr\"]/a/img') 259 | if hasposter: 260 | poster = hasposter[0].attrib['src'] 261 | else: 262 | poster = '' 263 | comment = comments[0] 264 | t = comment.xpath('h3/a')[0] 265 | title = t.text # 文章标题 266 | url = t.attrib['href'] 267 | try: 268 | shortcontent = comment.xpath('p')[0].text.strip() 269 | except AttributeError: 270 | # 某些坪林没显示缩略文 271 | shortcontent = '' 272 | combox = i.xpath('dd[@class=\"comboxuser2\"]/div')[0] 273 | image = combox.xpath('a/img')[0].attrib['src'] 274 | name = combox.xpath('a/img')[0].attrib['alt'] 275 | commenter_url = combox.xpath('a/img')[0].attrib['src'] 276 | date = combox.xpath('p')[1].xpath('a')[0].attrib['entertime'] 277 | publishdate = datetime.strptime(date, '%Y-%m-%d %H:%M:%S') 278 | hasnext = self.check_next_page() 279 | self.url = url 280 | # 重新设置要爬的页面 281 | content = self.get_content() 282 | look = combox.xpath('p')[2].text 283 | score = 0 284 | if look: 285 | # 表示看过 286 | score = float(combox.xpath('p')[2].xpath('span')[0].text) 287 | ac, rc, cc = 0, 0, 0 288 | if comment_api: 289 | ac, rc, cc = comment_api[0] 290 | p = lambda x: x.split(',')[index - 1] # 多了一个div 291 | ac, rc, cc = p(ac), p(rc), p(cc) 292 | self.d['comments'] += [{'commenter_url': commenter_url, 293 | 'ac': ac, 'rc': rc, 'url': url, 294 | 'poster': poster, 'image': image, 295 | 'title': title, 'name': name, 296 | 'score': score, 'content': content, 297 | 'shortcontent': shortcontent, 'cc': cc, 298 | 'publishdate': publishdate}] 299 | if hasnext: 300 | '''判断还有下一页会传回去继续累加页面,直到没有下一页''' 301 | return True 302 | 303 | def get_content(self): 304 | '''爬取长评论页''' 305 | ret = self.spider() 306 | all = ret.xpath('//div[@class="db_mediacont db_commentcont"]/p') 307 | contents = [] 308 | for elem in all: 309 | istext = elem.xpath('text()') 310 | if istext: 311 | if istext[0].strip(): 312 | # 文本, 否则空行 313 | cur_type = 'text' 314 
| content = istext[0].strip() 315 | else: 316 | continue 317 | isembed = elem.xpath('embed') 318 | if isembed: 319 | # 内嵌flash之类 320 | cur_type = 'embed' 321 | content = str(isembed[0].attrib) 322 | isimage = elem.xpath('img') 323 | if isimage: 324 | # 图片 325 | cur_type = 'image' 326 | image = [] 327 | for i in isimage: 328 | image.append(i.attrib['src']) 329 | content = ','.join(image) 330 | contents.append({'type': cur_type, 'content': content}) 331 | return contents 332 | 333 | 334 | class MicroCommentParse(Parse): 335 | 336 | def set_url(self): 337 | self.url = movie_url.format(self.id, 'shortcomment.html') 338 | 339 | def xpath(self): 340 | all = self.page.xpath( 341 | '//div[@class="db_shortcomment db_shortcomlist"]/dl/dd/div') 342 | tweetids = [i.attrib['tweetid'] for i in all] 343 | s = Comment(params={'Ajax_CallBackArgument0': '', 344 | 'Ajax_CallBackArgument1': ','.join(tweetids), 345 | 'Ajax_RequestUrl': self.url}) 346 | s.fetch(COMMENT_API) 347 | comment_api = comment_regex.findall(s.content) 348 | for index, elem in enumerate(all): 349 | content = elem.xpath('h3')[0].text 350 | user = elem.xpath('div[@class="comboxuser"]/div')[0] 351 | url = user.xpath('a')[0].attrib['href'] 352 | info = user.xpath('p')[0].xpath('a')[0] 353 | commenter_url = info.attrib['href'] 354 | name = info.text 355 | image = user.xpath('a/img')[0].attrib['src'] 356 | try: 357 | score = float(user[0].xpath('p')[1].xpath('span/span')[0].text) 358 | except (IndexError, TypeError, ValueError): 359 | score = 0 360 | date = user.xpath('p')[1].xpath('a')[0].attrib['entertime'] 361 | publishdate = datetime.strptime(date, '%Y-%m-%d %H:%M:%S') 362 | hasnext = self.check_next_page() 363 | ac, rc, cc = 0, 0, 0 364 | if comment_api: 365 | ac, rc, cc = comment_api[0] 366 | p = lambda x: x.split(',')[index] 367 | ac, rc, cc = p(ac), p(rc), p(cc) 368 | ret = copy.deepcopy(locals()) 369 | ret.pop('self') 370 | self.d['microcomments'] += [ret] 371 | if hasnext: 372 | return True 373 | 374 | 375 | 
class CharacterParse(Parse): 376 | 377 | def set_url(self): 378 | self.url = movie_url.format(self.id, 'characters.html') 379 | 380 | def xpath(self): 381 | all = self.page.xpath('//dd[@class=\"cha_box\"]') 382 | for elem in all: 383 | character = {} 384 | bigposter = '' 385 | img = elem.xpath('img') 386 | if img: 387 | bigposter = img[0].attrib['src'] 388 | character['bigposter'] = bigposter 389 | name = elem.xpath('div/div/p[@class="enname"]')[0].text 390 | intro = elem.xpath('div/div[@class=\"cha_mid\"]')[0].text 391 | character['introduction'] = intro 392 | character['name'] = name 393 | self.d['character'] += [character] 394 | 395 | 396 | class ScenesParse(Parse): 397 | 398 | def set_url(self): 399 | self.url = movie_url.format(self.id, 'behind_the_scene.html') 400 | 401 | def xpath(self): 402 | all = self.page.xpath('//div[@class="revealed_modle"]') 403 | if not all: 404 | # Mtime 前段不够严谨 405 | all = self.page.xpath('//div[@class="revealed_modle "]') 406 | if not all: 407 | return 408 | for elem in all: 409 | xpath = '' 410 | try: 411 | title = elem.xpath(xpath + 'h3')[0].text 412 | except IndexError: 413 | xpath = 'div/' 414 | title = elem.xpath(xpath + 'h3')[0].text 415 | l = [] 416 | for i in elem.xpath(xpath + 'div/p|div/dl/dd|dl/dd'): 417 | l.extend(filter(lambda x: x.strip(), i.xpath('text()'))) 418 | self.d['scene'] += [{'title': title, 'content': l}] 419 | 420 | 421 | class PlotParse(Parse): 422 | 423 | def set_url(self): 424 | self.url = movie_url.format(self.id, 'plots.html') 425 | 426 | def xpath(self): 427 | all = self.page.xpath('//div[@class="plots_box"]') 428 | for elem in all: 429 | l = [] 430 | all_p = elem.xpath('div/p') 431 | for p in all_p: 432 | try: 433 | # 第一个字特殊处理:大写 434 | other = p.xpath('span/text()')[1] 435 | txt = p.xpath('span/text()')[0] + other 436 | except IndexError: 437 | # 段落中的非第一段 438 | txt = p.xpath('text()')[0] 439 | l.append(txt) 440 | # 保留了多段之间的u'\u3000\u3000' 441 | self.d['content'] += l 442 | 443 | 444 | class 
FullcreditsParse(Parse): 445 | 446 | def set_url(self): 447 | self.url = movie_url.format(self.id, 'fullcredits.html') 448 | 449 | def xpath(self): 450 | common = self.page.xpath('//div[@class="credits_list"]') 451 | type = ['director', 'writer', 'produced', 'cinematography', 452 | 'filmediting', 'originalmusic', 'artdirection', 453 | 'costumedesign', 'assistantdirector'] 454 | 455 | if len(type) > len(common): 456 | # 有些老电影没有全部数据 457 | l = len(common) 458 | else: 459 | l = len(type) 460 | for offset in range(l): 461 | c = common[offset] 462 | for i in c.xpath('p'): 463 | name = i.xpath('a')[0].text 464 | if name is None: 465 | continue 466 | match = name_regex.findall(name) 467 | if match: 468 | match = match[0] 469 | self._alias[match[1]].add(match[0]) 470 | name = match[1] 471 | self.d[type[offset]] += [name] 472 | 473 | # 导演信息, 其实我感觉导演可能有多个,单个烦了好几个电影导演都一个.没找到xpath范例 474 | director = common[0] 475 | img = director.xpath('div/a/img') 476 | # 可能有图片 477 | director_dict = {} 478 | if img: 479 | director_dict['poster'] = img[0].attrib['src'] 480 | try: 481 | href = director.xpath('div/a')[0].attrib['href'] 482 | people = people_regex.findall(href) 483 | director_dict['mid'] = people[0] 484 | except IndexError: 485 | warn('[{}] No director'.format(self.id)) 486 | cn = director.xpath('div/h3/a') 487 | if cn: 488 | name = director.xpath('div/p/a')[0].text 489 | director_dict['name'] = name 490 | self._alias[name].add(cn[0].text) 491 | self.d['director'] = [director_dict] 492 | # end 493 | # 获取演员信息 494 | self.get_actor() 495 | 496 | def get_actor(self): 497 | actor = self.page.xpath('//div[@class="db_actor"]/dl/dd') 498 | for a in actor: 499 | one_actor = {} 500 | path = 'div[@class="actor_tit"]/div/' 501 | try: 502 | href = a.xpath(path + 'a')[0].attrib['href'] 503 | name_path = 'div[@class="character_tit"]/div/h3' 504 | except IndexError: 505 | path = 'div[@class="actor_tit"]/' 506 | name_path = 'div/div/h3' 507 | href = a.xpath(path + 'h3/a')[0].attrib['href'] 508 | 
people = people_regex.findall(href) 509 | one_actor['mid'] = people[0] 510 | img = a.xpath(path + 'a/img') 511 | if img: 512 | one_actor['poster'] = img[0].attrib['src'] 513 | try: 514 | name = a.xpath(path + 'h3/a')[0].text 515 | except IndexError: 516 | # 只有中文名 517 | name = None 518 | one_actor['name'] = name 519 | cn = a.xpath(path + 'h3/a') 520 | if cn: 521 | cnname = cn[0].text 522 | if name is None: 523 | name = cnname 524 | self._alias[name].add(cnname) 525 | try: 526 | play = a.xpath(name_path)[-1].text 527 | except IndexError: 528 | # 无饰演角色信息 529 | play = '' 530 | one_actor['play'] = play 531 | self.d['actor'] += [one_actor] 532 | 533 | # 通过搜索接口获取要爬取的电影ids 534 | 535 | 536 | def get_movie_ids(instance): 537 | '''获取电影在mtime的唯一id''' 538 | if mtime_vcodeValid_regex.search(instance.content): 539 | return 540 | return movie_regex.findall(instance.content) 541 | 542 | 543 | def get_movie_pages(instance): 544 | '''获取当前年份包含电影的页数''' 545 | try: 546 | return max([int(i[1]) for i in 547 | movie_page_regex.findall(instance.content)]) 548 | except ValueError: 549 | # 只有一页 550 | if mtime_vcodeValid_regex.search(instance.content): 551 | return 552 | return 1 553 | # end 554 | 555 | 556 | def checkmatch(regex, instance, type=int): 557 | '''抽象代码做多项正则匹配''' 558 | match = regex.findall(instance.content) 559 | if not match: 560 | return 0 561 | else: 562 | return type(match[0]) 563 | 564 | 565 | # 通过javascript获取评分等信息 566 | def get_movie_info(id): 567 | s = Movie(params={'Ajax_CallBackArgument1': id, 568 | 'Ajax_RequestUrl': MOVIE_PAGE.format( 569 | id=id, timestamp=Movie.get_timestamp())}) 570 | s.fetch(MOVIE_API) 571 | favorited = checkmatch(favoritedCount_regex, s) 572 | rating = checkmatch(rating_regex, s, float) 573 | ratingcount = checkmatch(ratingCount_regex, s) 574 | want = checkmatch(wantToSeeCount_regex, s) 575 | del s, id 576 | return locals() 577 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | mongoengine 2 | lxml 3 | -------------------------------------------------------------------------------- /schedulers.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | '''任务调度models''' 3 | from datetime import datetime 4 | from mongoengine import (Document, IntField, StringField, DateTimeField, 5 | connect, ListField, BooleanField) 6 | 7 | from conf import HOST, PORT, DATABASE 8 | 9 | connect(DATABASE, host=HOST, port=PORT) 10 | 11 | 12 | class Task(Document): 13 | '''设置的任务数据, 针对worker, beat''' 14 | type = StringField(max_length=60, required=True) # 任务类型 15 | last_run_at = DateTimeField( 16 | default=datetime.now(), required=True) # 上一次任务的执行时间 17 | interval = IntField(default=3600, required=True) # 任务间隔,单位秒 18 | 19 | 20 | class Message(Document): 21 | '''MQ存放在mongodb中的格式''' 22 | task = StringField(max_length=60, required=True, unique_with=['payload']) # 任务类型 23 | year = IntField(default=1900) # 为了最后更新ids到IdFinished 24 | payload = ListField(StringField(max_length=20)) # 函数执行的参数 25 | # 任务状态, 0 未执行, 1 运行中, 2 已完成, 3 失败 26 | retry = IntField(default=0) # 错误重试次数 27 | state = IntField(default=0, required=True) 28 | error = StringField(default='', required=True) # 失败日志 29 | inprocess = BooleanField(default=False, required=True) # 是否在处理中 30 | 31 | meta = { 32 | #'index_drop_dups': True, 33 | 'indexes': [('state', 'inprocess'), 'year'], 34 | } 35 | -------------------------------------------------------------------------------- /show_log.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from gevent.server import StreamServer 4 | 5 | from log import handle_log 6 | 7 | 8 | if __name__ == '__main__': 9 | server = StreamServer(('0.0.0.0', 9020), handle_log) 10 | server.serve_forever() 11 | -------------------------------------------------------------------------------- 
/spider.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | ''' 3 | 爬虫 4 | ''' 5 | import zlib 6 | import urllib 7 | import urllib2 8 | import cookielib 9 | try: 10 | from cStringIO import StringIO 11 | except: 12 | from StringIO import StringIO 13 | from gzip import GzipFile 14 | from datetime import datetime 15 | from collections import OrderedDict 16 | 17 | from utils import get_user_agent 18 | from log import debug 19 | 20 | 21 | # deflate support 22 | def deflate(data): 23 | try: 24 | return zlib.decompress(data, -zlib.MAX_WBITS) 25 | except zlib.error: 26 | return zlib.decompress(data) 27 | 28 | 29 | class ContentEncodingProcessor(urllib2.BaseHandler): 30 | '''A handler to add gzip capabilities to urllib2 requests''' 31 | cookiejar = None 32 | 33 | def __init__(self, cookie_support, additional_headers): 34 | self.additional_headers = additional_headers 35 | if cookie_support: 36 | self.cookiejar = cookielib.CookieJar() 37 | 38 | def http_request(self, req): 39 | # 默认的头信息 40 | req.add_header('Accept-Encoding', 'gzip, deflate') 41 | req.add_header('User-Agent', get_user_agent()) 42 | req.add_header('Accept-Language', 43 | 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3') 44 | if self.additional_headers is not None: 45 | req.headers.update(self.additional_headers) 46 | if self.cookiejar is not None: 47 | self.cookiejar.add_cookie_header(req) 48 | return req 49 | 50 | def http_response(self, req, resp): 51 | if self.cookiejar is not None: 52 | self.cookiejar.extract_cookies(resp, req) 53 | # 页面没有压缩,直接返回,比如调用API返回JSON数据 54 | if resp.headers.get("content-encoding") not in ('gzip', 'deflate'): 55 | return resp 56 | old_resp = resp 57 | content = resp.read() 58 | 59 | # gzip 60 | if resp.headers.get("content-encoding") == "gzip": 61 | gz = GzipFile( 62 | fileobj=StringIO(content), 63 | mode="r" 64 | ) 65 | # deflate 66 | elif resp.headers.get("content-encoding") == "deflate": 67 | gz = StringIO(deflate(content)) 68 | resp 
= urllib2.addinfourl( 69 | gz, old_resp.headers, old_resp.url, old_resp.code) 70 | resp.msg = old_resp.msg 71 | return resp 72 | 73 | 74 | class Spider(object): 75 | def __init__(self, cookie_support=True, additional_headers=None, 76 | params={}): 77 | self.cookie_support = cookie_support 78 | self.additional_headers = additional_headers 79 | self.params = params 80 | 81 | def make_query(self): 82 | '''基本队列''' 83 | return {} 84 | 85 | def fetch(self, url): 86 | debug('Fetch Url: {} start...'.format(url)) 87 | opener = urllib2.build_opener( 88 | ContentEncodingProcessor(self.cookie_support, 89 | self.additional_headers), 90 | urllib2.HTTPHandler) 91 | urllib2.install_opener(opener) 92 | params = urllib.urlencode(self.make_query()) 93 | if params: 94 | url = '{}?{}'.format(url, params) 95 | req = urllib2.Request(url) 96 | self.content = urllib2.urlopen(req).read() 97 | debug('Fetch Url: {} done'.format(url)) 98 | 99 | @classmethod 100 | def get_timestamp(cls): 101 | now = datetime.now() 102 | timestamp = '' 103 | for i in (now.year, now.month, now.day, now.hour, now.minute, 104 | now.second, str(now.microsecond)[:5]): 105 | timestamp += str(i) 106 | return timestamp 107 | 108 | 109 | class Search(Spider): 110 | '''搜索电影用的爬虫''' 111 | def make_query(self): 112 | params = self.params 113 | if not isinstance(params, OrderedDict): 114 | d = OrderedDict() 115 | d['Ajax_CallBack'] = params['Ajax_CallBack'] 116 | d['Ajax_CallBackType'] = params['Ajax_CallBackType'] 117 | d['Ajax_CallBackMethod'] = params['Ajax_CallBackMethod'] 118 | d['Ajax_CrossDomain'] = params['Ajax_CrossDomain'] 119 | d['Ajax_RequestUrl'] = params['Ajax_RequestUrl'] 120 | d['t'] = self.get_timestamp() 121 | for i in range(20): 122 | param = 'Ajax_CallBackArgument' + str(i) 123 | d[param] = params.get(param, 0) 124 | return d 125 | else: 126 | return params 127 | 128 | 129 | class Movie(Spider): 130 | 131 | def make_query(self): 132 | params = self.params 133 | if not isinstance(params, OrderedDict): 134 | 
# TODO 优化,从beat剥离 135 | d = OrderedDict() 136 | d['Ajax_CallBack'] = True 137 | service = 'Mtime.Community.Controls.CommunityPages.DatabaseService' 138 | d['Ajax_CallBackType'] = service 139 | d['Ajax_CallBackMethod'] = 'LoadData2' 140 | d['Ajax_CrossDomain'] = 1 141 | d['Ajax_RequestUrl'] = params['Ajax_RequestUrl'] 142 | d['Ajax_CallBackArgument0'] = 1 143 | d['Ajax_CallBackArgument1'] = params['Ajax_CallBackArgument1'] 144 | return d 145 | else: 146 | return params 147 | 148 | 149 | class Comment(Spider): 150 | 151 | def make_query(self): 152 | params = self.params 153 | if not isinstance(params, OrderedDict): 154 | d = OrderedDict() 155 | d['Ajax_CallBack'] = True 156 | d['Ajax_CallBackType'] = 'Mtime.Library.Services' 157 | d['Ajax_CallBackMethod'] = 'GetMovieReviewAndTweetCountInfo' 158 | d['Ajax_CrossDomain'] = 1 159 | d['Ajax_RequestUrl'] = params['Ajax_RequestUrl'] 160 | d['t'] = self.get_timestamp() 161 | d['Ajax_CallBackArgument0'] = params['Ajax_CallBackArgument0'] 162 | d['Ajax_CallBackArgument1'] = params['Ajax_CallBackArgument1'] 163 | return d 164 | else: 165 | return params 166 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import models 3 | import parse 4 | from schedulers import Message 5 | 6 | def real_mapper(queryset): 7 | this = Message.objects(task=queryset.task, payload=queryset.payload) 8 | STATE = True 9 | Model = getattr(models, queryset.task) 10 | this.update(set__inprocess=True) 11 | if queryset.task == 'Movie': 12 | for process in queryset.payload: 13 | ret = parse.get_movie_info(process) 14 | ret['movieid'] = process 15 | models.Movie(**ret).save() 16 | return 17 | Parse = getattr(parse, queryset.task + 'Parse') 18 | for process in queryset.payload: 19 | try: 20 | p = Parse(process) 21 | count = 1 22 | while 1: 23 | haspage = p() 24 | if haspage is None: 25 | # 很可能404 26 | break 27 | 
result, hasnext = haspage 28 | Model(**result).save() 29 | # 别名体系, 这样只需要全局记录一个人物就知道他们的全部别名 30 | for k, v in p._alias.items(): 31 | models.AliasName.objects.get_or_create( 32 | name=k)[0].update(add_to_set__alias=v) 33 | if hasnext: 34 | count += 1 35 | url = p.original_url 36 | p.set_url(url.replace('.html', '-{}.html'.format(count))) 37 | else: 38 | #没有下一页就退出循环 39 | break 40 | except: 41 | raise 42 | STATE = False 43 | else: 44 | models.IdFinished.objects( 45 | year=queryset.year ).update(add_to_set__ids=[process]) 46 | if STATE: 47 | this.update(set__state=2) 48 | else: 49 | this.update(set__state=3) 50 | this.update(set__inprocess=False) 51 | 52 | all = Message.objects(state__ne=2) 53 | 54 | for i in all: 55 | try: 56 | real_mapper(i) 57 | except: 58 | raise 59 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | ''' 3 | 功能函数 4 | ''' 5 | import time 6 | import fcntl 7 | import struct 8 | import socket 9 | import random 10 | import base64 11 | 12 | from conf import INTERVAL, IFNAME 13 | 14 | 15 | def get_user_agent(): 16 | '''Modify from rom http://pastebin.com/zYPWHnc6''' 17 | platform = random.choice(['Macintosh', 'Windows', 'X11']) 18 | if platform == 'Macintosh': 19 | os = random.choice(['68K', 'PPC']) 20 | elif platform == 'Windows': 21 | os = random.choice(['Win3.11', 'WinNT3.51', 'WinNT4.0', 22 | 'Windows NT 5.0', 'Windows NT 5.1', 23 | 'Windows NT 5.2', 'Windows NT 6.0', 24 | 'Windows NT 6.1', 'Windows NT 6.2', 25 | 'Win95', 'Win98', 'Win 9x 4.90', 'WindowsCE']) 26 | elif platform == 'X11': 27 | os = random.choice(['Linux i686', 'Linux x86_64']) 28 | 29 | browser = random.choice(['chrome', 'firefox', 'ie']) 30 | if browser == 'chrome': 31 | webkit = str(random.randint(500, 599)) 32 | version = str(random.randint(0, 24)) + '.0' + \ 33 | str(random.randint(0, 1500)) + '.' 
+ \ 34 | str(random.randint(0, 999)) 35 | return 'Mozilla/5.0 (' + os + ') AppleWebKit/' + webkit + \ 36 | '.0 (KHTML, live Gecko) Chrome/' + version + ' Safari/' + webkit 37 | elif browser == 'firefox': 38 | year = str(random.randint(2000, 2012)) 39 | month = random.randint(1, 12) 40 | if month < 10: 41 | month = '0' + str(month) 42 | else: 43 | month = str(month) 44 | day = random.randint(1, 30) 45 | if day < 10: 46 | day = '0' + str(day) 47 | else: 48 | day = str(day) 49 | gecko = year + month + day 50 | version = random.choice(map(lambda x: str(x) + '.0', range(1, 16))) 51 | return 'Mozilla/5.0 (' + os + '; rv:' + version + ') Gecko/' + \ 52 | gecko + ' Firefox/' + version 53 | elif browser == 'ie': 54 | version = str(random.randint(1, 10)) + '.0' 55 | engine = str(random.randint(1, 5)) + '.0' 56 | option = random.choice([True, False]) 57 | if option: 58 | token = random.choice(['.NET CLR', 'SV1', 'Tablet PC', 'WOW64', 59 | 'Win64; IA64', 'Win64; x64']) + '; ' 60 | elif option is False: 61 | token = '' 62 | return 'Mozilla/5.0 (compatible; MSIE ' + version + '; ' + os + \ 63 | '; ' + token + 'Trident/' + engine + ')' 64 | 65 | 66 | def get_unfinished(has, last): 67 | '''获取last里面有而has里面没有的数据列表''' 68 | return list(set(last).difference(set(has))) 69 | 70 | 71 | def encode(s): 72 | return base64.b64encode(s) 73 | 74 | 75 | def decode(s): 76 | return base64.b64decode(s) 77 | 78 | 79 | def group(seq, size): 80 | '''列表分组: 每组size个''' 81 | l = len(seq) 82 | for i in range(0, l, size): 83 | yield seq[i:i + size] 84 | 85 | 86 | def sleep2(interval=None): 87 | '''sleep一定时间''' 88 | num = interval if interval is not None else INTERVAL 89 | time.sleep(num) 90 | 91 | 92 | def get_ip_address(ifname=IFNAME): 93 | '''获取网卡的ip地址''' 94 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 95 | return socket.inet_ntoa(fcntl.ioctl( 96 | s.fileno(), 97 | 0x8915, 98 | struct.pack('256s', ifname[:15]) 99 | )[20:24]) 100 | 
-------------------------------------------------------------------------------- /worker.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | '''Workers处理任务''' 3 | # 因为比较简单的项目,就都在一起了,没有拆分 4 | import multiprocessing 5 | # https://github.com/douban/CaoE, 父进程死掉杀掉子进程 6 | import caoe 7 | caoe.install() 8 | import parse 9 | import models 10 | from schedulers import Message 11 | from log import error, warn 12 | from control import Scheduler, periodic, run 13 | 14 | terminating = None 15 | scheduler = Scheduler('worker') 16 | 17 | 18 | # Fixed 不能CTRL-C http://stackoverflow.com/questions/14579474/multiprocessing-pool-spawning-new-childern-after-terminate-on-linux-python2-7 19 | def initializer(terminating_): 20 | # This places terminating in the global namespace of the worker subprocesses. 21 | # This allows the worker function to access `terminating` even though it is 22 | # not passed as an argument to the function. 23 | global terminating 24 | terminating = terminating_ 25 | 26 | 27 | class Worker(object): 28 | '''执行任务类''' 29 | def __init__(self, map_func, num_workers=None, **kwargs): 30 | self.map_func = map_func 31 | self.inputs = Message.objects(state__ne=2, inprocess__ne=True) 32 | self.pool = multiprocessing.Pool(num_workers, **kwargs) 33 | 34 | def run(self, chunksize=1): 35 | try: 36 | self.pool.map(self.map_func, self.inputs, chunksize=chunksize) 37 | except KeyboardInterrupt: 38 | warn("^C pressed") 39 | self.pool.terminate() 40 | except: 41 | import traceback 42 | traceback.print_exc() 43 | 44 | 45 | def mapper(queryset): 46 | try: 47 | if not terminating.is_set(): 48 | real_mapper(queryset) 49 | except KeyboardInterrupt: 50 | terminating.set() 51 | 52 | 53 | def real_mapper(queryset): 54 | this = Message.objects(task=queryset.task, payload=queryset.payload) 55 | STATE = True 56 | Model = getattr(models, queryset.task) 57 | this.update(set__inprocess=True) 58 | if queryset.task == 'Movie': 59 | for process 
in queryset.payload: 60 | ret = parse.get_movie_info(process) 61 | ret['movieid'] = process 62 | models.Movie(**ret).save() 63 | return 64 | Parse = getattr(parse, queryset.task + 'Parse') 65 | for process in queryset.payload: 66 | try: 67 | p = Parse(process) 68 | count = 1 69 | while 1: 70 | haspage = p() 71 | if haspage is None: 72 | # 很可能404 73 | break 74 | result, hasnext = haspage 75 | Model(**result).save() 76 | # 别名体系, 这样只需要全局记录一个人物就知道他们的全部别名 77 | for k, v in p._alias.items(): 78 | models.AliasName.objects.get_or_create( 79 | name=k)[0].update(add_to_set__alias=v) 80 | if hasnext: 81 | count += 1 82 | url = p.original_url 83 | p.set_url(url.replace('.html', '-{}.html'.format(count))) 84 | else: 85 | #没有下一页就退出循环 86 | break 87 | except: 88 | STATE = False 89 | else: 90 | models.IdFinished.objects( 91 | year=queryset.year).update(add_to_set__ids=[process]) 92 | if STATE: 93 | this.update(set__state=2) 94 | else: 95 | this.update(set__state=3) 96 | this.update(set__inprocess=False) 97 | 98 | 99 | def mtime_worker(): 100 | terminating = multiprocessing.Event() 101 | w = Worker(mapper, initializer=initializer, initargs=(terminating, )) 102 | try: 103 | w.run() 104 | except: 105 | error('Other error') 106 | 107 | def main(): 108 | periodic(scheduler, mtime_worker) 109 | scheduler.run() 110 | 111 | 112 | if __name__ == '__main__': 113 | run(main, __file__) 114 | --------------------------------------------------------------------------------