├── .gitignore
├── README.md
├── crawler
│   ├── __init__.py
│   ├── clean.py
│   ├── deep.py
│   ├── job_engine.py
│   ├── log4f.py
│   ├── request.py
│   ├── scanner.py
│   ├── settings.py
│   ├── test_cl.py
│   └── wechat.py
├── dianping.py
├── settings.py
└── test
    ├── __init__.py
    ├── test_tools.py
    └── testdata.html

/.gitignore:
--------------------------------------------------------------------------------
1 | # .gitignore by JackonYang
2 | 
3 | database.sqlite3
4 | cache/*
5 | 
6 | 
7 | ### Django ###
8 | media/*
9 | static/*
10 | db.sqlite3
11 | local_settings.py
12 | *.log
13 | 
14 | 
15 | ### python ###
16 | *.py[cod]
17 | __pycache__/
18 | # C extensions
19 | *.so
20 | # Installer logs
21 | pip-log.txt
22 | 
23 | 
24 | ### Vim ###
25 | .ropeproject/*
26 | [._]*.s[a-w][a-z]
27 | [._]s[a-w][a-z]
28 | *~
29 | 
30 | ### Pycharm ###
31 | .idea/*
32 | 
33 | 
34 | ### Mac ###
35 | .DS_Store
36 | 
37 | 
38 | # SVN
39 | .svn
40 | 
41 | 
42 | !PLACEHOLDER
43 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Dianping Crawler
2 | ================
3 | 
4 | Pages crawled:
5 | 
6 | 1. shop profile
7 | 2. shop review
8 | 3. user profile
9 | 
10 | 
11 | ## Usage
12 | 
13 | #### Scanner
14 | 
15 | Take shop reviews as an example. The downloaded pages are stored under
16 | `/home/jackon/media/dianping/reviews`,
17 | and we want to extract every user-id from those review pages.
18 | 
19 | ```python
20 | import re
21 | from scanner import Scanner
22 | 
23 | uid_ptn = re.compile(r'href="/member/(\d+)(?:\?[^"]+)?"')
24 | json_name = 'uid.json'
25 | 
26 | s = Scanner(json_name, uid_ptn)
27 | s.scan('/home/jackon/media/dianping/reviews')
28 | 
29 | for k, v in s.data.items():
30 |     print '{} items in {}'.format(len(v), k)
31 | ```
32 | 
33 | Once the scan finishes, the output looks like this:
34 | ```shell
35 | 20 items in 6845514_1.html
36 | 20 items in 550426_18.html
37 | 0 items in 3926803_2.html
38 | 20 items in 4550817_72.html
39 | 0 items in 6006104_3.html
40 | 0 items in 22281825_3.html
41 | 20 items in 2817364_18.html
42 | 20 items in 18221165_1.html
43 | 20 items in 550099_10.html
44 | 20 items in 21293756_2.html
45 | 20 items in 586687_31.html
46 | 20 items in 20815806_10.html
47 | ```
48 | 
49 | 
50 | #### Shell commands for compressing / extracting the data
51 | 
52 | ```shell
53 | $ time 7z x shop_prof_20150821.7z
54 | # Folders: 1
55 | # Files: 164805
56 | # Size: 18068875270
57 | # Compressed: 359098116
58 | 
59 | # real 164m37.408s
60 | # user 18m2.913s
61 | # sys 39m50.784s
62 | $ ls shop_prof | wc -l
63 | # 164805
64 | $ 7z l shop_prof_20150821.7z | grep '.html' | wc -l
65 | # 164805
66 | ```
67 | 
--------------------------------------------------------------------------------
/crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackonYang/dianping-crawler/f01abb31b11fcb6a469042d339112a2c6779f29a/crawler/__init__.py
--------------------------------------------------------------------------------
/crawler/clean.py:
--------------------------------------------------------------------------------
1 | # -*- Encoding: utf-8 -*-
2 | __author__ = 'vivian'
3 | import re
4 | import os
5 | import json
6 | 
7 | from scanner import Scanner
8 | 
9 | revp_ptn = re.compile(r'(\d+)_(\d+)')
10 | revp_list = {}
11 | data = {}
12 | ret_ex = []
13 | ret_vac = []
14 | path = 'D:/github/data/shop_review'
15 | 
16 | uid_ptn = re.compile(r'href="/member/(\d+)(?:\?[^"]+)?"')
17 | filename = 'uid.json'
18 | s = Scanner(filename, uid_ptn)  # collect the user ids on every shop review page; saved to uid.json,
19 |                                 # key: review-page filename, value: user ids found on that page
20 | s.scan(path, save_period=500)
21 | 
22 | count = 0
23 | rid = []
24 | for k, v in s.data.items():  # shops whose first review page has no reviews; saved to rid.json
25 |     if len(v) == 0 and k.endswith('_1.html'):
26 |         count += 1
27 |         rid.append(k)
28 | print count  # number of shops that match
29 | with open("rid.json", 'w') as fp:
30 |     json.dump(rid, fp)
31 | 
32 | 
33 | for f in os.listdir(path):  # scan the filenames (shop id + review page no) in the target dir; saved to revp.json
34 | 
35 |     # key: shop id, value: the review pages crawled for that shop
36 |     revp_ret = revp_ptn.findall(f)
37 |     for k, v in revp_ret:
38 |         if k in revp_list:
39 |             revp_list[k].append(v)
40 |         else:
41 |             revp_list[k] = [v]
42 | 
43 | with open("revp.json", 'w') as fp:
44 |     json.dump(revp_list, fp)
45 | 
46 | with open("uid.json", 'r') as fd:
47 |     data = json.load(fd)
48 | 
49 | for k, v in revp_list.items():  # for each shop, check the last crawled review page: collect files that are
50 |     file = k + '_' + str(len(v)) + '.html'  # missing (ret_ex) and files with reviewer ids (ret_vac); saved to result.json
51 |     if file not in data:
52 |         ret_ex.append(file)
53 |     elif data[file] != []:
54 |         ret_vac.append(file)
55 |     else:
56 |         pass
57 | 
58 | js_dic = {'these do not exist!': ret_ex, 'these are not empty!': ret_vac}
59 | with open("result.json", 'w') as fr:
60 |     json.dump(js_dic, fr, indent=4)
--------------------------------------------------------------------------------
/crawler/deep.py:
--------------------------------------------------------------------------------
1 | # -*- Encoding: utf-8 -*-
2 | import os
3 | import re
4 | import json
5 | 
6 | path = 'D:/github/data/shop_review'
7 | info_ptn = re.compile(ur'href="/member/(\d*)">.*?title="(.*?)"', re.DOTALL)  # match a user id and the user's contribution score
8 | 
9 | err = []
10 | id_sc = {}
11 | re_id = {}
12 | 
13 | for f in os.listdir(path):  # walk the saved review pages, one file per shop page
14 |     filename = path + '/' + f
15 |     ids = []
16 |     try:
17 |         with open(filename, 'r') as fp:
18 |             content = ''.join(fp.readlines())
19 |     except IOError:
20 |         err.append(filename)  # files that cannot be opened go into err
21 |     else:
22 |         info_ret = info_ptn.findall(content)
23 |         for id, sc in info_ret:  # collect users without a contribution score into ids
24 |             id_sc[id] = sc
25 |             if len(sc) == 0:
26 |                 ids.append(id)
27 |         re_id[f] = ids  # key: shop page filename, value: the user ids whose profiles still need to be crawled
28 | 29 | 30 | with open('err.json','w') as fe: # 打不开的文件 31 | json.dump(err,fe,indent=4) 32 | with open('id_sc.json','w') as fi: # 用户及他的贡献值 33 | json.dump(id_sc,fi,indent=4) 34 | with open('re_id.json','w') as fd: # 没有贡献值的用户 35 | json.dump(re_id,fd,indent=4) 36 | -------------------------------------------------------------------------------- /crawler/job_engine.py: -------------------------------------------------------------------------------- 1 | # -*- Encoding: utf-8 -*- 2 | import redis 3 | from wechat import send 4 | 5 | from os.path import dirname, join 6 | from log4f import debug_logger 7 | 8 | BASE_DIR = dirname(__file__) 9 | log = debug_logger(join(BASE_DIR, 'log/download'), 'root.download') 10 | 11 | 12 | class JobPool: 13 | def __init__(self, job_name, 14 | host='localhost', port=6379, db=0, 15 | timeout=10): 16 | self.timeout = timeout 17 | self.db = redis.StrictRedis(host, port, db) 18 | self.total_tbl = '{}:total'.format(job_name) 19 | self.todo_tbl = '{}:todo'.format(job_name) 20 | self.name = job_name 21 | 22 | def init_data(self, total, done): 23 | self.db.delete(self.total_tbl) 24 | self.db.delete(self.todo_tbl) 25 | 26 | todo = set(total) - set(done) 27 | 28 | self.db.sadd(self.total_tbl, *total) 29 | self.db.rpush(self.todo_tbl, *todo) 30 | 31 | def count_todo(self): 32 | return self.db.llen(self.todo_tbl) 33 | 34 | def count_total(self): 35 | return self.db.scard(self.total_tbl) 36 | 37 | def run(self, callback, recursive=False): 38 | print '{} start.TODO/Total: {}/{}'.\ 39 | format(self.name, self.count_todo(), self.count_total()) 40 | key = self._next() 41 | while key: 42 | log.info('downloading {}-{}'.format(self.name, key)) 43 | try: 44 | items = callback(key) 45 | log.info( 46 | '{} items in {}-{}'.format( 47 | len(items), self.name, key)) 48 | if len(items) == 0: 49 | send(u'Warning! May be banned. key={}'.format(key)) 50 | if recursive: 51 | self._add(*items) 52 | except Exception as e: 53 | log.error('{}. ID={}'.format(e, key)) 54 | send(u'Info! meet exceptions. key={}, err={}'.format(key, e)) 55 | self.db.rpush(self.todo_tbl, key) 56 | key = self._next() 57 | 58 | info = '{} done. 
{} got'.format(self.name, self.count_total()) 59 | print(info) 60 | log.warning(info) 61 | 62 | def _next(self): 63 | key = self.db.blpop(self.todo_tbl, self.timeout) 64 | return key and key[1] 65 | 66 | def _add(self, *keys): 67 | for item in keys: 68 | if self.db.sadd(self.total_tbl, item): 69 | self.db.rpush(self.todo_tbl, item) 70 | 71 | 72 | if __name__ == '__main__': 73 | job_name = 'job_test' 74 | total = [str(i) for i in range(1, 9)] 75 | done = [str(i) for i in range(3, 9, 2)] 76 | 77 | job = JobPool(job_name, db=9, timeout=2) 78 | 79 | job.init_data(total, done) 80 | job.run(lambda key: [11, 12]) 81 | 82 | job.init_data(total, done) 83 | job.run(lambda key: [11, 12], recursive=True) 84 | -------------------------------------------------------------------------------- /crawler/log4f.py: -------------------------------------------------------------------------------- 1 | # -*- Encoding: utf-8 -*- 2 | """log in 4 files""" 3 | import logging 4 | import os 5 | 6 | 7 | DEFAULT_LOG_LEVEL = logging.DEBUG 8 | 9 | 10 | def get_4f_logger(formatter, path, name=''): 11 | log = logging.getLogger(name) 12 | log.setLevel(DEFAULT_LOG_LEVEL) 13 | 14 | if not os.path.exists(path): 15 | os.makedirs(path) 16 | 17 | lvls = ['debug', 'info', 'warn', 'error'] 18 | 19 | for lvl in lvls: 20 | logfile = os.path.join(path, '{}.log'.format(lvl.lower())) 21 | hdlr = logging.FileHandler(logfile) 22 | hdlr.setLevel(getattr(logging, lvl.upper())) 23 | hdlr.setFormatter(formatter) 24 | log.addHandler(hdlr) 25 | return log 26 | 27 | 28 | def debug_logger(log_dir='log', logger_name='debug'): 29 | log_format = ('%(asctime)s|%(levelname)s|%(message)s' 30 | '|%(filename)s-%(lineno)s') 31 | return get_4f_logger(logging.Formatter(log_format), log_dir, logger_name) 32 | 33 | 34 | if __name__ == '__main__': 35 | log = debug_logger() 36 | log.error('test log') 37 | log.info('info log') 38 | -------------------------------------------------------------------------------- /crawler/request.py: -------------------------------------------------------------------------------- 1 | # -*- Encoding: utf-8 -*- 2 | import re 3 | import redis 4 | import socket 5 | from httplib2 import Http 6 | 7 | import time 8 | import random 9 | 10 | from os.path import dirname, join 11 | from log4f import debug_logger 12 | import settings 13 | 14 | BASE_DIR = dirname(__file__) 15 | log = debug_logger(join(BASE_DIR, 'log/request'), 'root.request') 16 | 17 | r = redis.StrictRedis(**settings.REDIS_CONN) 18 | 19 | 20 | def wait(f): 21 | lock_name = 'http-lock' 22 | 23 | def _wrap_func(*args, **kwargs): 24 | t = r.ttl(lock_name) 25 | if t > 0: 26 | time.sleep(t) 27 | 28 | n_t = int(random.uniform(settings.DELAY_BOTTOM, settings.DELAY_TOP)) 29 | r.setex(lock_name, n_t, 'locking') 30 | return f(*args, **kwargs) 31 | return _wrap_func 32 | 33 | headers_templates = { 34 | 'Connection': 'keep-alive', 35 | 'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64)' 36 | 'AppleWebKit/537.36 (KHTML, like Gecko)' 37 | 'Chrome/44.0.2403.125 Safari/537.36'), 38 | 'Content-type': 'application/x-www-form-urlencoded', 39 | 'Accept': '*/*', 40 | 'Accept-Charset': 'UTF-8,*;q=0.5', 41 | 'Accept-Encoding': 'gzip,deflate,sdch', 42 | 'Accept-Language': 'zh-CN,zh;q=0.8', 43 | 'Cache-Control': 'no-cache', 44 | 'Host': 'www.dianping.com', 45 | 'Referer': 'http://www.dianping.com/', 46 | 'DNT': '1', 47 | } 48 | 49 | 50 | @wait 51 | def request(url, timeout=2, method='GET', filename=None): 52 | """return None if timeout""" 53 | h = Http(timeout=timeout) 54 | try: 55 | log.debug('request 
{}'.format(url)) 56 | rsp, content = h.request(url, method, headers=headers_templates) 57 | except socket.timeout: 58 | return None 59 | 60 | if filename: 61 | with open(filename, 'w') as f: 62 | f.write(content) 63 | log.debug('response saved. filename={}'.format(filename)) 64 | 65 | return content 66 | 67 | 68 | def request_pages(key, page_range, url_ptn, find_items, resend=3, 69 | min_num=0, max_failed=5, filename_ptn=None): 70 | """request a list of pages in page_range 71 | 72 | """ 73 | items_total = set() # items will be out of order if some pages failed 74 | failed = set() 75 | 76 | for page in page_range: 77 | 78 | filename = filename_ptn and filename_ptn.format(key, page) 79 | page_url = url_ptn.format(key=key, page=page) 80 | content = request(page_url, filename=filename) 81 | 82 | if content is not None: 83 | items_page = find_items(content, key) 84 | if items_page and len(items_page) > min_num: 85 | items_total.update(items_page) 86 | else: 87 | log.debug('nothing in page {} of {}'.format(page, key)) 88 | break 89 | else: 90 | log.warning('failed to request page {} of {}'.format(page, key)) 91 | failed.add(page) 92 | if len(failed) > max_failed: 93 | log.error('more timeout than {}'.format(max_failed)) 94 | return 95 | 96 | if failed: 97 | if not resend: 98 | return None 99 | log.debug('resend failed pages of {}'.format(key)) 100 | items_more = request_pages(key, failed, url_ptn, find_items, 101 | resend-1, min_num, filename_ptn) 102 | if items_more is None: 103 | return None 104 | items_total.update(items_more) 105 | return items_total 106 | 107 | 108 | if __name__ == '__main__': 109 | 110 | url = 'http://www.dianping.com/shop/{key}/review_more?pageno={page}' 111 | find_uid = lambda content, key: \ 112 | re.compile(r'href="/member/(\d+)">(.+?)').findall(content) 113 | 114 | uid = '5195730' # 45 reviews on 2015.8.3 115 | pages = range(1, 9) 116 | 117 | ret = request_pages(uid, pages, url, find_uid, resend=3, 118 | min_num=0, max_failed=5, filename_ptn=None) 119 | for user, name in ret: 120 | print user, name 121 | print len(ret) 122 | -------------------------------------------------------------------------------- /crawler/scanner.py: -------------------------------------------------------------------------------- 1 | # -*- Encoding: utf-8 -*- 2 | import re 3 | import os 4 | from os.path import join 5 | import json 6 | 7 | 8 | class Scanner: 9 | def __init__(self, cache_filename, ptn): 10 | self.cache_name = cache_filename 11 | self.ptn = ptn 12 | self.data = self.load() 13 | 14 | def scan(self, path, save_period=2000): 15 | print 'scan {}'.format(path) 16 | total = {fn for fn in os.listdir(path) 17 | if os.path.isfile(join(path, fn))} 18 | todo = total - set(self.data.keys()) 19 | 20 | print 'scanning {}/{}'.format(len(todo), len(total)) 21 | for i, filename in enumerate(todo): 22 | with open(join(path, filename), 'r') as f: 23 | c = ''.join(f.readlines()) 24 | self.data[filename] = list(set(self.ptn.findall(c))) 25 | 26 | if i % save_period == 0: 27 | print '...saving. {} done.'.format(i+1) 28 | self.save() 29 | 30 | self.save() 31 | print 'scan finished. 
{} new files.'.format(len(todo)) 32 | 33 | def load(self): 34 | data = dict() 35 | if os.path.exists(self.cache_name): 36 | with open(self.cache_name, 'r') as fr: 37 | data = json.load(fr) 38 | return data 39 | 40 | def save(self): 41 | with open(self.cache_name, 'wb') as fw: 42 | json.dump(self.data, fw, indent=4) 43 | 44 | 45 | if __name__ == '__main__': 46 | fn = 'test.json' 47 | if os.path.exists(fn): 48 | os.remove(fn) 49 | 50 | ptn = re.compile(r'import (\w+)') 51 | s = Scanner(fn, ptn) 52 | 53 | s.scan('.', save_period=7) 54 | print '----------' 55 | s.scan('.', save_period=7) 56 | print '----------' 57 | s.scan('..', save_period=7) 58 | -------------------------------------------------------------------------------- /crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | REDIS_CONN = { 3 | 'host': '127.0.0.1', 4 | 'port': 6379, 5 | 'db': 8, 6 | } 7 | 8 | WECHAT_CONN = { 9 | 'username': '', 10 | 'password': '', 11 | } 12 | 13 | DELAY_BOTTOM = 5 14 | DELAY_TOP = 10 15 | 16 | NOTIFY_IDS = [ 17 | '2271762240', # k 18 | '98160640', # v 19 | ] 20 | 21 | MSG_SIGNATURE = 'AutoSend by Dianping Crawler' 22 | 23 | 24 | try: 25 | from local_settings import * 26 | except Exception as e: 27 | pass 28 | -------------------------------------------------------------------------------- /crawler/test_cl.py: -------------------------------------------------------------------------------- 1 | # -*- Encoding: utf-8 -*- 2 | __author__ = 'vivian' 3 | import re 4 | import os 5 | import json 6 | 7 | from scanner import Scanner 8 | 9 | revp_ptn = re.compile(r'(\d+)_(\d+)') 10 | revp_list = {} 11 | for f in os.listdir('D:\github\data\shop_review'): 12 | revp_ret = revp_ptn.findall(f) 13 | for k,v in revp_ret: 14 | if k in revp_list: 15 | revp_list[k].append(v) 16 | else: 17 | revp_list[k] = [v] 18 | with open("revp.json",'a') as fp: 19 | json.dump(revp_list,fp) 20 | 21 | 22 | 23 | 24 | #sid_ptn = re.compile('') -------------------------------------------------------------------------------- /crawler/wechat.py: -------------------------------------------------------------------------------- 1 | # -*- Encoding: utf-8 -*- 2 | import json 3 | import redis 4 | from os.path import join, dirname 5 | from wechat_sdk import WechatExt 6 | # from wechat_sdk.exceptions import NeedLoginError 7 | 8 | from log4f import debug_logger 9 | import settings 10 | 11 | 12 | LOGIN_TIMEOUT = 4 * 3600 # 4 hours 13 | r = redis.StrictRedis(**settings.REDIS_CONN) 14 | log = debug_logger(join(dirname(__file__), 'log/notify'), 'root.notify') 15 | 16 | 17 | def login(username, password): 18 | d = r.get(username) 19 | if d: 20 | log.info('lazy login. use cookie, username={}'.format(username)) 21 | return WechatExt(username, password, login=False, **json.loads(d)) 22 | else: 23 | print username, password 24 | wechat = WechatExt(username, password, login=False) 25 | wechat.login() 26 | log.info('login to wechat server. username={}'.format(username)) 27 | r.setex(username, LOGIN_TIMEOUT, 28 | json.dumps(wechat.get_token_cookies(), indent=4)) 29 | return wechat 30 | 31 | 32 | def init_info(): 33 | rsp_str = login(**settings.WECHAT_CONN).get_user_list() 34 | for u in json.loads(rsp_str)['contacts']: 35 | name = u['nick_name'].encode('utf8') 36 | fakeid = u['id'] 37 | r.set(fakeid, name) 38 | print '{:12} -- {}'.format(fakeid, name) 39 | 40 | 41 | def send(msg_text): 42 | msg = u'{}. 
{}'.format(msg_text, settings.MSG_SIGNATURE)
43 |     for fakeid in settings.NOTIFY_IDS:
44 |         name = r.get(fakeid) or fakeid
45 |         try:
46 |             login(**settings.WECHAT_CONN).send_message(fakeid, msg)
47 |             log.info(u'msg sent. user={}'.format(name))
48 |         except Exception:
49 |             log.info(u'failed to send msg. user={}'.format(name))
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     init_info()
54 |     # send(u'亲爱的')
--------------------------------------------------------------------------------
/dianping.py:
--------------------------------------------------------------------------------
1 | # -*- Encoding: utf-8 -*-
2 | import os
3 | from os.path import join, exists
4 | import re
5 | 
6 | from crawler.request import request, request_pages
7 | from crawler.scanner import Scanner
8 | from crawler.job_engine import JobPool
9 | 
10 | from settings import cache_root, db
11 | 
12 | page_types = ['user_prof', 'shop_prof', 'reviews']
13 | page_path = {name: join(cache_root, name) for name in page_types}
14 | for path in page_path.values():
15 |     if not exists(path):
16 |         os.makedirs(path)
17 | 
18 | uid_ptn = re.compile(r'href="/member/(\d+)(?:\?[^"]+)?"')
19 | sid_ptn = re.compile(r'href="/shop/(\d+)(?:\?[^"]+)?"')
20 | rev_ptn = re.compile(r'<[^>]+id="rev_(\d+)"')
21 | find_rev = lambda c, key: set(rev_ptn.findall(c))
22 | 
23 | 
24 | def grab_user_prof(key):
25 |     url = 'http://www.dianping.com/member/{}'.format(key)
26 |     fn = join(page_path['user_prof'], '{}_user.html'.format(key))
27 |     c = request(url, filename=fn)
28 |     return set(uid_ptn.findall(c))
29 | 
30 | 
31 | def grab_shop_prof(key):
32 |     url = 'http://www.dianping.com/shop/{}'.format(key)
33 |     fn = join(page_path['shop_prof'], '{}_shop.html'.format(key))
34 |     c = request(url, filename=fn)
35 |     return set(sid_ptn.findall(c)) - {key}
36 | 
37 | 
38 | def grab_reviews(key, max_page=100):
39 |     url = 'http://www.dianping.com/shop/{key}/review_more?pageno={page}'
40 | 
41 |     filename_ptn = join(page_path['reviews'], '{}_{}.html')  # key, page
42 |     return request_pages(key, range(1, max_page), url, find_rev,
43 |                          filename_ptn=filename_ptn)
44 | 
45 | 
46 | def init_user_prof(host, job_name='user_prof'):
47 |     s = Scanner(join(cache_root, 'uid.json'), uid_ptn)
48 |     s.scan(page_path['reviews'])
49 |     total = {v for vs in s.data.values() for v in vs}
50 |     done = {fn[:fn.find('_')]
51 |             for fn in os.listdir(page_path[job_name])}
52 |     job = JobPool(job_name, host, db=db)
53 |     job.init_data(total, done)
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     """
58 |     from test.test_tools import request, request_pages
59 | 
60 |     uid = '3601131'
61 |     print grab_user_prof(uid)
62 | 
63 |     sid = '5195730'  # 45 reviews on 2015.8.3
64 |     print grab_shop_prof(sid)
65 |     print grab_reviews(sid)
66 |     """
67 | 
68 |     job = JobPool('user_prof', 'jackon.me', db=db)
69 |     job.run(grab_user_prof)
70 | 
--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from os.path import join, dirname
3 | 
4 | cache_root = join(dirname(__file__), 'cache')
5 | 
6 | db = 8
7 | 
8 | try:
9 |     from local_settings import *
10 | except Exception as e:
11 |     pass
12 | 
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackonYang/dianping-crawler/f01abb31b11fcb6a469042d339112a2c6779f29a/test/__init__.py
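`test/test_tools.py` (shown next) provides offline stand-ins for `request` and `request_pages`, mirroring the swap hinted at in the commented-out block of `dianping.py`'s `__main__`. Below is a minimal sketch of driving the fake `request_pages` directly, assuming it is run from the repository root; the shop id, page range, and `filename_ptn` value are illustrative only, not taken from the repo:

```python
# -*- Encoding: utf-8 -*-
# Offline smoke test: no HTTP traffic, every "response" is test/testdata.html.
import re

from test.test_tools import request_pages  # fake helper; prints urls/filenames

url_ptn = 'http://www.dianping.com/shop/{key}/review_more?pageno={page}'
uid_ptn = re.compile(r'href="/member/(\d+)(?:\?[^"]+)?"')
find_uid = lambda content, key: set(uid_ptn.findall(content))

# the fake request_pages formats filename_ptn with (key, page), so it must be a string
ret = request_pages('5195730', range(1, 4), url_ptn, find_uid,
                    filename_ptn='cache/{}_{}.html')
print '{} user ids found in testdata.html'.format(len(ret))
```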
-------------------------------------------------------------------------------- /test/test_tools.py: -------------------------------------------------------------------------------- 1 | # -*- Encoding: utf-8 -*- 2 | from os.path import join, dirname 3 | 4 | 5 | testdata = join(dirname(__file__), 'testdata.html') 6 | 7 | 8 | def request(url, filename): 9 | print '\n---------------- requesting ------------' 10 | print 'url: {}'.format(url) 11 | print 'filename: {}'.format(filename) 12 | print '' 13 | with open(testdata, 'r') as f: 14 | text = ''.join(f.readlines()) 15 | return text 16 | 17 | 18 | def request_pages(key, page_range, url_ptn, find_items, resend=3, 19 | min_num=0, max_failed=5, filename_ptn=None): 20 | print '\n---------------- requesting pages ------------' 21 | for i in page_range[:3]: # first 3 pages 22 | print 'url: {}'.format(url_ptn.format(key=key, page=i)) 23 | print 'filename: {}'.format(filename_ptn.format(key, i)) 24 | print '' 25 | 26 | with open(testdata, 'r') as f: 27 | text = ''.join(f.readlines()) 28 | return find_items(text, key) 29 | -------------------------------------------------------------------------------- /test/testdata.html: -------------------------------------------------------------------------------- 1 | yy19870506yy 2 | dpuser_81243665348 3 | 60 | 61 | 630 | --------------------------------------------------------------------------------
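Taken together, the intended flow is: download review pages, scan them for user ids, seed a Redis-backed `JobPool`, and let it drive `grab_user_prof`. The sketch below shows that flow end to end, assuming a local Redis is running, the review pages have already been saved, and the script is run from the repository root; the reviews path and `db` number are placeholders, and the real entry point remains `dianping.py`:

```python
# -*- Encoding: utf-8 -*-
# Sketch only: seed a JobPool with user ids scanned from saved review pages,
# then crawl each user profile. Requires a reachable Redis instance.
import re

from crawler.scanner import Scanner
from crawler.job_engine import JobPool
from dianping import grab_user_prof

uid_ptn = re.compile(r'href="/member/(\d+)(?:\?[^"]+)?"')

s = Scanner('uid.json', uid_ptn)
s.scan('/home/jackon/media/dianping/reviews')  # dir of downloaded review pages

total = {uid for uids in s.data.values() for uid in uids}
done = set()  # nothing crawled yet

job = JobPool('user_prof', host='localhost', db=8)
job.init_data(total, done)
job.run(grab_user_prof, recursive=True)  # newly seen uids are queued as well
```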