├── README.md ├── qrcode_for_gh_61c6224cfae9_258.jpg └── spiderWanghong ├── db.sql ├── huajiao.py ├── mysql.py └── wanghong.py /README.md: -------------------------------------------------------------------------------- 1 | # PythonPractice 2 | 3 | ### 使用说明 4 | * 1. 使用db.sql建立mysql数据库 5 | * 2. 在wanghong.py的BoseModel定义里设置mysql的连接参数 6 | * 3. 安装python库pymysql, requests, BeautifulSoup 7 | * 4. 运行以下命令, 会提示支持的操作 8 | ``` 9 | # python3 wanghong.py 10 | Usage: python3 wanghong.py [spider_womiyouxuan_actors|spider_yixia_videos|spider_yixia_follows|womiyouxuan_actors_count|yixia_videos_count|yixia_actors_count] 11 | ``` 12 | * 5. 运行某一个命令,比如: 13 | ``` 14 | # python3 wanghong.py spider_yixia_follows 15 | ``` 16 | | 命令      | 含义          | 逻辑  | 17 | | :------------- |:-------------|:-----| 18 | | spider_womiyouxuan_actors     | 爬取沃米优选的主播信息 | 遍历每个分页并将主播信息写入数据表Tbl_WMYX_Actor | 19 | | spider_yixia_videos     | 爬取一下网的视频    |  从数据库中取出最新爬取的主播数据,进而爬取每个主播的视频数据,写入数据表Tbl_YiXia_Video | 20 | | spider_yixia_follows | 爬取一下网的主播    |  从数据库中取出最新爬取的主播数据,进而爬取每个主播关注的人的数据,写入数据表Tbl_YiXia_Actor | 21 | |yixia_videos_count|查看爬取的一下网视频总数| 22 | |yixia_actors_count|查看爬取的一下网主播总数| 23 | 24 | ### 已实现对以下直播类网站的数据爬取: 25 | * 花椒(http://www.huajiao.com/) 26 | * 一下(http://www.yixia.com/u/paike_oq7pzk336s) 27 | ``` 28 | ### 访问主播页面,从该页面获取到suid和主播个人信息 29 | uid = 'paike_oq7pzk336s' 30 | ret = YiXia().parse_user_page(uid) 31 | print(ret) 32 | """ 33 | {'relayed': '4', 'avatar': 'http://tp2.sinaimg.cn/2714280233/180/5728135083/0', 'video_count': '140', 'suid': 'ZPWwDeYSvPUb23SL', 'uid': 'paike_oq7pzk336s', 'follow': '13', 'followed': '21031136', 'descr': '微信订阅:dapapi。微博:papi酱。', 'location': '北京 崇文区', 'nickname': 'papi酱', 'praised': '0'} 34 | """ 35 | 36 | ### 获取某用户的关注列表 37 | suid = 'ZPWwDeYSvPUb23SL' 38 | page = 1 39 | ret = YiXia().get_follow_list(suid, page) 40 | print(ret) 41 | """ 42 | [{'followed': '169054', 'nickname': 'lyxp', 'follow': '3', 'descr': 'ta很懒什么都没有留下', 'uid': 'wxsso_nz297durpu', 'avatar': 
'http://wx.qlogo.cn/mmopen/gobtgL6xn9Z6KMsibqkqWeOa8Npickk1XKUbrwIWASjw40vdNWUT74PxVIdFe8FmAQu80Yq01rx4WL74rULianT2iaSz5PKgAedH/0', 'suid': '64tfU0JCV~O2YyFVR7sRGw__', 'video_count': '11'}, {'followed': '6827071', 'nickname': '最神奇的视频', 'follow': '11', 'descr': '搞笑,预告,你的喜怒哀乐这里都能看到,通过视频,让你感', 'uid': 'sina_0udpfn0a2h', 'avatar': 'http://tp4.sinaimg.cn/2141823055/180/5621846443/0', 'suid': 'lfMtGJsFJMlMhYm2', 'video_count': '3455'}, {'followed': '352', 'nickname': '扬名止过', 'follow': '14', 'descr': '波澜不惊,荣辱不争。', 'uid': 'paike_8iqcuo8pko', 'avatar': 'http://tp2.sinaimg.cn/1583429645/180/5621703354/1', 'suid': 'gn2U51iUx4PT6k8-', 'video_count': '0'}, {'followed': '499', 'nickname': '段蓓珊', 'follow': '13', 'descr': '……', 'uid': 'paike_c4i54d6ey2', 'avatar': 'http://tp2.sinaimg.cn/1670302465/180/5632141584/0', 'suid': '1Kev5Dmc1H7SMMnX', 'video_count': '1'}, {'followed': '145', 'nickname': '胖大星Alis', 'follow': '0', 'descr': 'ta很懒什么都没有留下', 'uid': 'paike_76o4l8zotz', 'avatar': 'http://tp3.sinaimg.cn/1760582170/180/5709471341/0', 'suid': 'epu~2vdSHF23E0Q-', 'video_count': '1'}, {'followed': '295', 'nickname': '文史_海巴子', 'follow': '0', 'descr': 'ta很懒什么都没有留下', 'uid': 'paike_7bnuhrz12h', 'avatar': 'http://tp2.sinaimg.cn/2624069177/180/5634691164/1', 'suid': 'CGTQC2jMVAA4Me26', 'video_count': '0'}, {'followed': '5880191', 'nickname': '英国那些事儿', 'follow': '45', 'descr': '一个在英国爱吐槽的主页君.没事爱分享英国最搞最有意思大', 'uid': 'paike_t9y36wkt4c', 'avatar': 'http://tp3.sinaimg.cn/2549228714/180/40021372518/1', 'suid': 'Ii9QcPCa~novHdgc', 'video_count': '744'}, {'followed': '12312', 'nickname': '每天搞笑排行榜', 'follow': '6', 'descr': 'ta很懒什么都没有留下', 'uid': 'paike_oqbmsp87kq', 'avatar': 'http://tp3.sinaimg.cn/2281122894/180/5661656420/0', 'suid': 'PQX0xTUI4fgV~s3v', 'video_count': '0'}, {'followed': '3414317', 'nickname': '史上第一最最搞', 'follow': '7', 'descr': 'ta很懒什么都没有留下', 'uid': 'paike_pomohtzbiw', 'avatar': 'http://tp1.sinaimg.cn/1134796120/180/40069206893/0', 'suid': '3Xlno6tiKcXS6noq', 'video_count': '5000'}, 
{'followed': '63631', 'nickname': '霍泥芳', 'follow': '8', 'descr': '<夏天有风吹过>里,我是内向叛逆的半夏;<幸福生活在招', 'uid': 'paike_4kf51dy2de', 'avatar': 'http://tp1.sinaimg.cn/1277126544/180/5641596294/0', 'suid': 'yVwNg6clktoWe-Ib', 'video_count': '10'}, {'followed': '20308', 'nickname': 'M大王叫我来巡', 'follow': '0', 'descr': 'ta很懒什么都没有留下', 'uid': 'paike_rx2xp66tks', 'avatar': 'http://tp4.sinaimg.cn/1720173771/180/40048639291/1', 'suid': 'tJ2tClKrqCYm6uDc', 'video_count': '26'}, {'followed': '7195252', 'nickname': 'gogoboi', 'follow': '12', 'descr': '冒着脑残的炮火前进,前进,前进进!工作联系:gogob', 'uid': 'paike_bg95tflssd', 'avatar': 'http://tp2.sinaimg.cn/1706372681/180/40017354355/1', 'suid': 's5u1-93x2yMZx6NM', 'video_count': '20'}, {'followed': '8929355', 'nickname': '秒拍', 'follow': '659', 'descr': '秒拍-10秒拍大片!', 'uid': 'paike_i1dudsh696', 'avatar': 'http://dynimg3.yixia.com/square.124/storage.video.sina.com.cn/user-icon/EfFEP4pOsmYCl0Nf_480__1438164133711.jpg', 'suid': 'EfFEP4pOsmYCl0Nf', 'video_count': '622'}] 43 | """ 44 | 45 | ### 获取某用户的视频列表 46 | suid = 'ZPWwDeYSvPUb23SL' 47 | page = 1 48 | ret = YiXia().get_video_list(suid, page) 49 | print(ret) 50 | """ 51 | [{'scid': 'Svl4iqHkBsM~DCNCf0WPsQ__', 'detail_page': 'http://www.yixia.com/show/Svl4iqHkBsM~DCNCf0WPsQ__.htm', 'praised': 2321, 'discussed': 3258, 'flash': 'http://wscdn.miaopai.com/splayer2.2.0.swf?scid=Svl4iqHkBsM~DCNCf0WPsQ__&fromweibo=false&fromweibo=false&token=', 'img': 'http://wsacdn4.miaopai.com/stream/Svl4iqHkBsM~DCNCf0WPsQ___tmp_11_409_.jpg', 'title': '“难道只有我一个人觉得吗?”是呀!当然只有你一个人觉得!你多厉害呀!你最与众不同啦!你存在感爆棚!(祝大家一周&周一愉快嗷~比心~最近的雾霾超好吸超带感超咳咳咳咳咳咳咳咳咳', 'pub_date': '17:44', 'watched': 4680000}, {'scid': 'd5xoiWIzy9edsWtNhNZBEw__', 'detail_page': 'http://www.yixia.com/show/d5xoiWIzy9edsWtNhNZBEw__.htm', 'praised': 29000, 'discussed': 4347, 'flash': 'http://wscdn.miaopai.com/splayer2.2.0.swf?scid=d5xoiWIzy9edsWtNhNZBEw__&fromweibo=false&fromweibo=false&token=', 'img': 'http://wsacdn1.miaopai.com/stream/d5xoiWIzy9edsWtNhNZBEw___tmp_11_354_.jpg', 
'title': '“现在的观众,根本不知道什么才是好电影”,资深影迷pa某酱表示。近期影片盘点,该看什么?看点在哪儿?pa某酱让你更迷惑。(本视频纯属胡说八道,不接受任何反驳,比心️', 'pub_date': '12-17', 'watched': 8200000}, {'scid': 'd3Ph834EJZtuSNeSL7AJng__', 'detail_page': 'http://www.yixia.com/show/d3Ph834EJZtuSNeSL7AJng__.htm', 'praised': 27000, 'discussed': 56, 'flash': 'http://wscdn.miaopai.com/splayer2.2.0.swf?scid=d3Ph834EJZtuSNeSL7AJng__&fromweibo=false&fromweibo=false&token=', 'img': 'http://wsacdn3.miaopai.com/stream/d3Ph834EJZtuSNeSL7AJng___tmp_11_741_.jpg', 'title': 'papi酱不定期更新的日常——pa老师的英语课。同学们', 'pub_date': '12-16', 'watched': 20240000}, {'scid': 'ZzRKTzzvM6WgNZbLRO2HUg__', 'detail_page': 'http://www.yixia.com/show/ZzRKTzzvM6WgNZbLRO2HUg__.htm', 'praised': 29000, 'discussed': 93, 'flash': 'http://wscdn.miaopai.com/splayer2.2.0.swf?scid=ZzRKTzzvM6WgNZbLRO2HUg__&fromweibo=false&fromweibo=false&token=', 'img': 'http://qncdn.miaopai.com/stream/ZzRKTzzvM6WgNZbLRO2HUg___qnweb_14818081966424.jpg', 'title': '“爱所有人,信任一些人,不伤害任何人。”这句莎剧的台词,是我在自己的视频中一直想要传达的,也是我静下来的时候不断回想的。不知多少人能接受这个视频里这样的我,希望你们看完后能认识并且接受一个或许不太熟悉的papi。(实不相瞒,这个视频,我是捂着眼睛看的(评论里不要截图给我(我羞赧...', 'pub_date': '12-15', 'watched': 21190000}] 52 | """ 53 | ``` 54 | * 沃米优选(http://video.51wom.com/) 55 | 56 | ### TODO: 57 | * 映客(http://www.inke.cn/hotlive_list.html) 58 | * 斗鱼(https://www.douyu.com/) 59 | * 微信公众号 60 | 61 | ### 代码逻辑请参考以下文章: 62 | #####[Python初学者之网络爬虫](http://mp.weixin.qq.com/s/vNcQtXWjGHnc6JMjt_vWiQ "Python初学者之网络爬虫") 63 | #####[Python初学者之网络爬虫(二)](http://mp.weixin.qq.com/s/WoLKDnaFBcJ-u3msAqtDNw "Python初学者之网络爬虫(二)") 64 | -------------------------------------------------------------------------------- /qrcode_for_gh_61c6224cfae9_258.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/octans/PythonPractice/87884e720f499f94f02ecf3bf94dc8b850ee247b/qrcode_for_gh_61c6224cfae9_258.jpg -------------------------------------------------------------------------------- /spiderWanghong/db.sql: 
--------------------------------------------------------------------------------
DROP DATABASE IF EXISTS `wanghong`;
CREATE DATABASE `wanghong` DEFAULT CHARACTER SET utf8mb4 DEFAULT COLLATE utf8mb4_general_ci;
USE `wanghong`;
set names utf8mb4;


DROP TABLE IF EXISTS `user`;
CREATE TABLE `user` (
    `id` INT UNSIGNED,
    `name` VARCHAR(100),
    `order` INT UNSIGNED,
    PRIMARY KEY (`id`)
);

#DROP TABLE IF EXISTS `Tbl_Huajiao_Live`;
CREATE TABLE `Tbl_Huajiao_Live` (
    `FLiveId` INT UNSIGNED NOT NULL,
    `FUserId` INT UNSIGNED NOT NULL,
    `FWatches` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '观看人数',
    `FPraises` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '赞数',
    `FReposts` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'unknown',
    `FReplies` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'unknown',
    `FPublishTimestamp` INT UNSIGNED NOT NULL COMMENT '发布日期',
    `FTitle` VARCHAR(100) NOT NULL DEFAULT '' COMMENT '直播名称',
    `FImage` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '直播封面',
    `FLocation` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '地点',
    `FScrapedTime` timestamp NOT NULL COMMENT '爬虫更新时间',
    PRIMARY KEY (`FLiveId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

#DROP TABLE IF EXISTS `Tbl_Huajiao_User`;
CREATE TABLE `Tbl_Huajiao_User` (
    `FUserId` INT UNSIGNED NOT NULL,
    `FUserName` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '昵称',
    `FLevel` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '等级',
    `FFollow` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '关注数',
    `FFollowed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '粉丝数',
    `FSupported` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '赞数',
    `FExperience` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '经验值',
    `FAvatar` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '头像地址',
    `FScrapedTime` timestamp NOT NULL COMMENT '爬虫时间',
    PRIMARY KEY (`FUserId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;


## aggregated actor table (all platforms)
#DROP TABLE IF EXISTS `Tbl_Actor`;
CREATE TABLE `Tbl_Actor` (
    `id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
    `uid` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '唯一标识用户',
    `nickname` VARCHAR(100) NOT NULL DEFAULT '' COMMENT '昵称',
    `follow` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '关注数',
    `followed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '粉丝数',
    `praised` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '被赞数',
    `avatar` VARCHAR(200) NOT NULL DEFAULT '' COMMENT '头像',
    `pid` TINYINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '平台id, 1-花椒,2-一下',
    `scraped_time` timestamp NOT NULL COMMENT '爬虫更新时间',
    PRIMARY KEY (`id`),
    UNIQUE INDEX `INDEX_uid_pid` (`uid`, `pid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

# WoMiYouXuan (51wom.com) actor table
#DROP TABLE IF EXISTS `Tbl_WMYX_Actor`;
CREATE TABLE `Tbl_WMYX_Actor` (
    `id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
    `uuid` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '沃米优选唯一id',
    `user_id` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '平台id',
    `platform` TINYINT NOT NULL DEFAULT 0 COMMENT '直播平台',
    `nickname` VARCHAR(100) NOT NULL DEFAULT '' COMMENT '昵称',
    `followed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '粉丝数',
    `avg_watched` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '平均观看人数',
    `price_dict` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '推广方式和价格',
    `type_label` VARCHAR(50) NOT NULL DEFAULT '' COMMENT '资源分类',
    `geo_range` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '推广覆盖地域',
    `sex` TINYINT NOT NULL DEFAULT 0 COMMENT '性别',
    `avatar` VARCHAR(200) NOT NULL DEFAULT '' COMMENT '头像',
    `min_price` INT(10) UNSIGNED NOT NULL DEFAULT 0 COMMENT 'min_price',
    `max_price` INT(10) UNSIGNED NOT NULL DEFAULT 0 COMMENT 'max_price',
    `address` VARCHAR(30) NOT NULL DEFAULT '' COMMENT 'address',
    `scraped_time` timestamp NOT NULL COMMENT '爬虫更新时间',
    PRIMARY KEY (`id`),
    UNIQUE INDEX `INDEX_uuid` (`uuid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

## YiXia actor table
#DROP TABLE IF EXISTS `Tbl_YiXia_Actor`;
CREATE TABLE `Tbl_YiXia_Actor` (
    `id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
    `uid` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '唯一标识用户',
    `suid` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '唯一标识用户',
    `nickname` VARCHAR(100) NOT NULL DEFAULT '' COMMENT '昵称',
    `follow` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '关注数',
    `followed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '粉丝数',
    `video_count` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '视频数',
    `relayed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '转发数,主播转发被人的',
    `praised` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '被赞数',
    `location` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '位置',
    `avatar` VARCHAR(200) NOT NULL DEFAULT '' COMMENT '头像',
    `descr` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '简介',
    `scraped_time` timestamp NOT NULL COMMENT '爬虫更新时间',
    PRIMARY KEY (`id`),
    UNIQUE INDEX `INDEX_uid` (`uid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

## YiXia video table
#DROP TABLE IF EXISTS `Tbl_YiXia_Video`;
CREATE TABLE `Tbl_YiXia_Video` (
    `id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
    `scid` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '唯一标识视频',
    `pub_date` VARCHAR(100) NOT NULL DEFAULT '' COMMENT '发布日期',
    `watched` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '观看数',
    `praised` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '被赞数',
    `discussed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '评论数',
    `img` VARCHAR(200) NOT NULL DEFAULT '' COMMENT '封面',
    `title` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '标题',
    `detail_page` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '详情页',
    `flash` VARCHAR(255) NOT NULL DEFAULT '' COMMENT 'falsh地址',
    `scraped_time` timestamp NOT NULL COMMENT '爬虫更新时间',
    PRIMARY KEY (`id`),
    UNIQUE INDEX `INDEX_scid` (`scid`)
121 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /spiderWanghong/huajiao.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from urllib.request import urlopen 3 | from bs4 import BeautifulSoup 4 | import re 5 | import json 6 | import pymysql 7 | import time 8 | import datetime 9 | from mysql import Model 10 | from mysql import Mysql 11 | 12 | 13 | def getNowTime(): 14 | return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())) 15 | 16 | # filter out live ids from a url 17 | def filterLiveIds(url): 18 | html = urlopen(url) 19 | liveIds = set() 20 | bsObj = BeautifulSoup(html, "html.parser") 21 | for link in bsObj.findAll("a", href=re.compile("^(/l/)")): 22 | if 'href' in link.attrs: 23 | newPage = link.attrs['href'] 24 | liveId = re.findall("[0-9]+", newPage) 25 | liveIds.add(liveId[0]) 26 | return liveIds 27 | 28 | # get live ids from recommand page 29 | def getLiveIdsFromRecommendPage(): 30 | liveIds = set() 31 | liveIds = filterLiveIds("http://www.huajiao.com/category/1000") | filterLiveIds("http://www.huajiao.com/category/1000?pageno=2") 32 | return liveIds 33 | 34 | # get user id from live page 35 | def getUserId(liveId): 36 | html = urlopen("http://www.huajiao.com/" + "l/" + str(liveId)) 37 | bsObj = BeautifulSoup(html, "html.parser") 38 | text = bsObj.title.get_text() 39 | res = re.findall("[0-9]+", text) 40 | return res[0] 41 | 42 | 43 | # get user data from user page 44 | def getUserData(userId): 45 | print('getUserData: userId=' + userId ) 46 | html = urlopen("http://www.huajiao.com/user/" + str(userId)) 47 | bsObj = BeautifulSoup(html, "html.parser") 48 | data = dict() 49 | try: 50 | userInfoObj = bsObj.find("div", {"id":"userInfo"}) 51 | data['FAvatar'] = userInfoObj.find("div", {"class": "avatar"}).img.attrs['src'] 52 | userId = 
userInfoObj.find("p", {"class":"user_id"}).get_text() 53 | data['FUserId'] = re.findall("[0-9]+", userId)[0] 54 | tmp = userInfoObj.h3.get_text('|', strip=True).split('|') 55 | data['FUserName'] = tmp[0] 56 | data['FLevel'] = tmp[1] 57 | tmp = userInfoObj.find("ul", {"class":"clearfix"}).get_text('|', strip=True).split('|') 58 | data['FFollow'] = tmp[0] 59 | data['FFollowed'] = tmp[2] 60 | data['FSupported'] = tmp[4] 61 | data['FExperience'] = tmp[6] 62 | 63 | return data 64 | except AttributeError: 65 | #traceback.print_exc() 66 | print(str(userId) + ":html parse error in getUserData()") 67 | return 0 68 | 69 | # get user history lives 70 | def getUserLives(userId): 71 | print('getUserLives: userId=' + str(userId)) 72 | try: 73 | url = "http://webh.huajiao.com/User/getUserFeeds?fmt=json&uid=" + str(userId) 74 | html = urlopen(url).read().decode('utf-8') 75 | jsonData = json.loads(html) 76 | if jsonData['errno'] != 0: 77 | print(str(userId) + "error occured in getUserFeeds for: " + jsonData['msg']) 78 | return 0 79 | 80 | return jsonData['data']['feeds'] 81 | except Exception as e: 82 | print(e) 83 | return 0 84 | 85 | 86 | def getTimestamp(): 87 | return (time.mktime(datetime.datetime.now().timetuple())) 88 | 89 | # update user live data 90 | def replaceUserLive(data): 91 | try: 92 | kvs = dict() 93 | kvs['FLiveId'] = int(data['relateid']) 94 | kvs['FUserId'] = int(data['FUserId']) 95 | kvs['FWatches'] = int(data['watches']) 96 | kvs['FPraises'] = int(data['praises']) 97 | kvs['FReposts'] = int(data['reposts']) 98 | kvs['FReplies'] = int(data['replies']) 99 | kvs['FPublishTimestamp'] = int(data['publishtimestamp']) 100 | kvs['FTitle'] = data['title'] 101 | kvs['FImage'] = data['image'] 102 | kvs['FLocation'] = data['location'] 103 | kvs['FScrapedTime'] = getNowTime() 104 | Live().insert(kvs, 1) 105 | except pymysql.err.InternalError as e: 106 | print(e) 107 | 108 | # spider user ids 109 | def spiderUserDatas(): 110 | for liveId in getLiveIdsFromRecommendPage(): 
111 | userId = getUserId(liveId) 112 | userData = getUserData(userId) 113 | try: 114 | if userData: 115 | User().insert(userData, 1) 116 | except pymysql.err.InternalError as e: 117 | print(e) 118 | print(userData) 119 | 120 | return 1 121 | 122 | # spider user lives 123 | def spiderUserLives(): 124 | userIds = User().select("FUserId").limit(100).fetch_all() 125 | for userId in userIds: 126 | liveDatas = getUserLives(userId[0]) 127 | try: 128 | for liveData in liveDatas: 129 | liveData['feed']['FUserId'] = userId[0] 130 | replaceUserLive(liveData['feed']) 131 | except Exception as e: 132 | print(e) 133 | 134 | return 1 135 | 136 | 137 | class BoseModel(Model): 138 | conn = Mysql(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='123456', db='wanghong', charset='utf8') 139 | 140 | 141 | class User(BoseModel): 142 | tbl = "Tbl_Huajiao_User" 143 | 144 | 145 | class Live(BoseModel): 146 | tbl = "Tbl_Huajiao_Live" 147 | 148 | 149 | def main(argv): 150 | if len(argv) < 2: 151 | print("Usage: python3 huajiao.py [spiderUserDatas|spiderUserLives]") 152 | exit() 153 | if argv[1] == 'spiderUserDatas': 154 | spiderUserDatas() 155 | elif argv[1] == 'spiderUserLives': 156 | spiderUserLives() 157 | elif argv[1] == 'getUserCount': 158 | count = User().select("count(\"FUserId\")").fetch_one() 159 | print(count[0]) 160 | elif argv[1] == 'getLiveCount': 161 | count = Live().select("count(\"FLiveId\")").fetch_one() 162 | print(count[0]) 163 | else: 164 | print("Usage: python3 huajiao.py [spiderUserDatas|spiderUserLives|getUserCount|getLiveCount]") 165 | 166 | if __name__ == '__main__': 167 | main(sys.argv) 168 | 169 | -------------------------------------------------------------------------------- /spiderWanghong/mysql.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | 3 | 4 | def addslashes(s): 5 | try: 6 | d = {'"': '\\"', "'": "\\'", "\0": "\\\0", "\\": "\\\\"} 7 | return ''.join(d.get(c, c) for c in s) 8 | 
except: 9 | return s 10 | 11 | 12 | class Mysql(): 13 | def __new__(cls, **params): 14 | cls.connect(**params) 15 | return cls 16 | 17 | def __del__(cls): 18 | cls.close() 19 | 20 | @classmethod 21 | def connect(cls, **params): 22 | cls.conn = pymysql.connect(**params) 23 | cls.cursor = cls.conn.cursor() 24 | cls.cursor.execute("set names utf8mb4") 25 | 26 | @classmethod 27 | def close(cls): 28 | cls.cursor.close() 29 | cls.conn.close() 30 | 31 | @classmethod 32 | def query(cls, sql, is_dict=0): 33 | if is_dict: 34 | cls.cursor = cls.conn.cursor(pymysql.cursors.DictCursor) 35 | cls.cursor.execute(sql) 36 | return cls 37 | 38 | 39 | class Model: 40 | sql = '' 41 | 42 | def select(self, select_str): 43 | if select_str.find(",") == -1: 44 | select_str = select_str 45 | else: 46 | fields = list() 47 | for f in select_str.split(","): 48 | if f.find('as') > 0: 49 | p = f.split(" as ") 50 | fields.append(p[0].strip() + ' as `' + p[1].strip() + '`') 51 | else: 52 | fields.append('`' + f.strip() + '`') 53 | select_str = ",".join(fields) 54 | self.sql = "SELECT " + select_str + " FROM " + self.tbl 55 | return self 56 | 57 | def where(self, string): 58 | self.sql = self.sql + " WHERE " + string 59 | return self 60 | 61 | def order_by(self, string): 62 | self.sql = self.sql + " ORDER BY " + string 63 | return self 64 | 65 | def limit(self, num): 66 | self.sql = self.sql + " LIMIT " + str(num) 67 | return self 68 | 69 | def fetch_all(self, is_dict=0): 70 | return self.conn.query(self.sql, is_dict).cursor.fetchall() 71 | 72 | def fetch_one(self): 73 | return self.conn.query(self.sql).cursor.fetchone() 74 | 75 | def insert(self, data, replace=None): 76 | fields = list() 77 | for a in data.keys(): 78 | fields.append('`' + a + '`') 79 | sqlFields = ",".join(fields) 80 | 81 | values = list() 82 | for v in data.values(): 83 | v = addslashes(v) 84 | v = "\"" + v + "\"" if type(v) is type("a") else str(v) 85 | values.append(v) 86 | sqlValues = ",".join(values) 87 | 88 | action = 
"INSERT" if replace is None else "REPLACE" 89 | sql = action + " INTO " + self.tbl + " (" + sqlFields + ") VALUES (" + sqlValues + ")" 90 | self.conn.query(sql).conn.commit() 91 | 92 | def update(self, where, **data): 93 | pass 94 | 95 | def delete(self, where='1'): 96 | sql = "DELETE FROM " + self.tbl + " WHERE " + where 97 | self.conn.query(sql).conn.commit() 98 | -------------------------------------------------------------------------------- /spiderWanghong/wanghong.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import json 5 | import sys 6 | import time 7 | from mysql import Model 8 | from mysql import Mysql 9 | 10 | """ 11 | class Cache(): 12 | ''' 13 | 将_csrf的值存储到文件 14 | ''' 15 | cacheFile = './csrf.cache' 16 | fileObj = '' 17 | 18 | def read(self): 19 | if os.path.isfile(self.cacheFile): 20 | self.fileObj = open(self.cacheFile, 'r') 21 | return self.fileObj.read() 22 | else: 23 | return '' 24 | 25 | def write(self, string): 26 | self.fileObj = open(self.cacheFile, 'w') 27 | self.fileObj.write(string) 28 | 29 | def __del__(self): 30 | if self.fileObj != '': 31 | self.fileObj.close() 32 | """ 33 | 34 | 35 | def get_current_time(): 36 | 37 | return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 38 | 39 | 40 | class Website: 41 | session = requests.session() 42 | 43 | htmlParser = BeautifulSoup 44 | 45 | jsonParser = json 46 | 47 | headers = { 48 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/' 49 | '54.0.2840.98 Safari/537.36' 50 | } 51 | 52 | def get(self, url, params=None): 53 | if params is None: 54 | params = {} 55 | return self.session.get(url, params=params, headers=self.headers) 56 | 57 | def get_html(self, url, params=None): 58 | """ 59 | GET请求, 用于网站返回html时 60 | """ 61 | r = self.get(url, params) 62 | return self.htmlParser(r.text, 'html.parser') 63 | 64 | def 
get_json(self, url, params=None): 65 | """ 66 | GET请求, 用于网站返回json时 67 | """ 68 | 69 | r = self.get(url, params) 70 | 71 | return self.jsonParser.loads(r.text) 72 | 73 | def post_url_encoded(self, url, params): 74 | """ 75 | POST方式:Content-Type: application/x-www-form-urlencoded 76 | """ 77 | pass 78 | 79 | def post_multi_part(self, url, params): 80 | """ 81 | POST方式:Content-Type:multipart/form-data 82 | """ 83 | 84 | kwargs = dict() 85 | for (k, v) in params.items(): 86 | kwargs.setdefault(k, (None, v)) 87 | r = self.session.post(url, files=kwargs, headers=self.headers) 88 | 89 | return self.htmlParser(r.text, "html.parser") 90 | 91 | 92 | class YiXia(Website): 93 | 94 | def parse_user_page(self, uid): 95 | """ 96 | 访问主播页面,也是视频列表页,从该页面获取到suid和主播个人信息 97 | """ 98 | 99 | print(get_current_time() + ':' + self.__class__.__name__ + ':parse_user_page, uid=' + uid) 100 | user = dict() 101 | user['uid'] = uid 102 | url = 'http://www.yixia.com/u/' + uid 103 | bs = self.get_html(url) 104 | 105 | div = bs.find('div', {'class': 'box1'}) 106 | user['nickname'] = div.h1.a.get_text(strip=True) # 昵称 107 | 108 | stat = div.ol.get_text(strip=True) 109 | stat = re.split('关注\||粉丝', stat) 110 | user['follow'] = stat[0].strip() # 关注数 111 | user['followed'] = stat[1].strip() # 粉丝数 112 | 113 | user['avatar'] = bs.find('div', {'class': 'nav_div1'}).a.img.attrs['src'] # 头像 114 | 115 | user['suid'] = bs.find('div', {'class': 'nav_div1'}).find('button').attrs['suid'] # suid 116 | 117 | tmp = bs.find('div', {'class': 'nav_div3'}).get_text('@#$%', strip=True).split('@#$%') 118 | user['location'] = tmp[0] 119 | user['descr'] = tmp[1] 120 | 121 | tmp = bs.find('div', {'class': 'n_b_con'}).get_text(strip=True) 122 | tmp = re.split('视频|转发|赞', tmp) 123 | user['video_count'] = tmp[0] # 视频数 124 | user['relayed'] = tmp[1] # 转发数 125 | user['praised'] = tmp[2] # 被赞数 126 | 127 | return user 128 | 129 | def get_follow_list(self, suid, page=1): 130 | """ 131 | 获取某用户的关注列表 132 | 
页面地址:http://www.yixia.com/u/$uid/relation/follow.htm 瀑布流展示关注列表, 133 | ajax接口地址:http://www.yixia.com/gu/follow?page=1&suid=$suid 134 | """ 135 | 136 | print(get_current_time() + ':' + self.__class__.__name__ + ':get_follow_list, suid=' + suid + ' page=' + str(page)) 137 | 138 | url = 'http://www.yixia.com/gu/follow' # ajax接口 139 | params = { 140 | 'page': page, 141 | 'suid': suid, 142 | } 143 | res = self.get_json(url, params) 144 | if res['msg'] == '': 145 | return list() 146 | 147 | res = BeautifulSoup(res['msg'], 'html.parser') 148 | boxs = res.findAll('div', {'class': 'box'}) 149 | users = list() 150 | for box in boxs: 151 | user = dict() 152 | user['suid'] = suid 153 | top = box.find('div', {'class': 'box_top'}) 154 | user['avatar'] = top.img.attrs['src'] 155 | user['uid'] = top.a.attrs['href'] 156 | user['uid'] =re.split('http://www.yixia.com/u/', user['uid']) 157 | user['uid'] = user['uid'][1] 158 | user['suid'] = top.div.h2.button.attrs['suid'] 159 | user['nickname'] = box.find('div', {'class': 'top_txt'}).a.get_text(strip=True) 160 | 161 | center = box.find('div', {'class': 'box_center'}).get_text(strip=True) 162 | center = re.split('视频|关注|粉丝', center) 163 | user['video_count'] = center[0] # 视频数 164 | user['follow'] = center[1] # 关注数 165 | user['followed'] = center[2] # 粉丝数 166 | user['descr'] = box.find('p', {'class': 'box_bottom'}).b.get_text(strip=True) 167 | users.append(user) 168 | return users 169 | 170 | def get_video_list(self, suid, page=1): 171 | """ 172 | AJAX请求视频列表 173 | """ 174 | 175 | url = 'http://www.yixia.com/gu/u' 176 | payload = { 177 | 'page': page, 178 | 'suid': suid, 179 | 'fen_type': 'channel' 180 | } 181 | json_obj = self.get_json(url, params=payload) 182 | msg = json_obj['msg'] 183 | msg = BeautifulSoup(msg, 'html.parser') 184 | 185 | ''' 186 | 解析视频标题 187 | ''' 188 | titles = list() 189 | ps = msg.findAll('p') 190 | for p in ps: 191 | titles.append(p.get_text(strip=True)) # 视频标题 192 | 193 | ''' 194 | 解析视频赞和评论数 195 | ''' 196 | stats 
= list() 197 | divs = msg.findAll('div', {'class': 'list clearfix'}) 198 | for div in divs: 199 | tmp = div.ol.get_text(strip=True) 200 | tmp = re.split('赞|\|评论', tmp) 201 | stats.append(tmp) 202 | 203 | ''' 204 | 解析视频其他数据 205 | ''' 206 | videos = list() 207 | divs = msg.findAll('div', {'class': 'D_video'}) 208 | for (k, div) in enumerate(divs): 209 | video = dict() 210 | video['scid'] = div.attrs['data-scid'] 211 | video['img'] = div.find('div', {'class': 'video_img'}).img.attrs['src'] # 视频封面 212 | video['flash'] = div.find('div', {'class': 'video_flash'}).attrs['va'] # 视频flash地址 213 | intro = div.find('div', {'class': 'introduction'}) 214 | head_area = intro.find('div', {'class': 'D_head_name'}).h2 215 | video['detail_page'] = head_area.a.attrs['href'] # 视频详情地址 216 | video['pub_date'] = head_area.b.get_text(strip=True) # 视频日期 217 | head_area.a.decompose() 218 | tmp = head_area.get_text(strip=True) 219 | tmp = re.split('观看', tmp) 220 | 221 | def format_num(string): 222 | 223 | # 判断是否有逗号,比如8,189 224 | try: 225 | index = string.index(',') 226 | string = string.replace(',', '') 227 | except ValueError: 228 | string = string 229 | 230 | # 判断是否有小数点 231 | try: 232 | index = string.index('.') 233 | is_float = True 234 | except ValueError: 235 | is_float = False 236 | 237 | # 是否有万字 238 | t = string[len(string)-1] 239 | if t == '万': 240 | num = string.replace('万', '') 241 | if is_float: 242 | ret = int(float(num) * 10000) 243 | else: 244 | ret = int(num) * 10000 245 | else: 246 | if is_float: 247 | ret = float(string) 248 | else: 249 | ret = int(string) 250 | 251 | return ret 252 | 253 | try: 254 | video['watched'] = format_num(tmp[0]) #观看量 255 | video['title'] = titles[k] # 标题 256 | video['praised'] = format_num(stats[k][1]) # 赞 257 | video['discussed'] = format_num((stats[k][2])) # 评论 258 | except (ValueError, IndexError) as e: 259 | print(e) 260 | else: 261 | videos.append(video) 262 | 263 | return videos 264 | 265 | def spider_videos(self, suid, video_count): 266 | 
page = 1 267 | current = 0 268 | tbl_video = YiXiaVideo() 269 | while current < int(video_count): 270 | print(get_current_time() + ':' + 'spider_videos: suid=' + suid + ', page=' + str(page)) 271 | videos = self.get_video_list(suid, page) 272 | for video in videos: 273 | tbl_video.insert(video, replace=True) 274 | current += len(videos) 275 | page += 1 276 | return True 277 | 278 | def spider_follows(self, suid): 279 | page = 1 280 | tbl_user = YiXiaActor() 281 | while True: 282 | users = self.get_follow_list(suid, page) 283 | if len(users) <= 0: 284 | break; 285 | for user in users: 286 | tbl_user.insert(user, replace=True) 287 | page += 1 288 | 289 | return True 290 | 291 | 292 | 293 | class WoMiYouXuan(Website): 294 | """ 295 | 网红数据分析平台:沃米优选 http://www.51wom.com/ 296 | """ 297 | 298 | csrf = '' 299 | 300 | def __init__(self): 301 | self.first_kiss() 302 | 303 | def first_kiss(self): 304 | """ 305 | 首次请求获取cookies和csrf, 将cookies和csrf放入后续每次发请求的头信息里; 306 | 其中cookies由requests.session()自动处理 307 | """ 308 | 309 | url = 'http://video.51wom.com/' 310 | html = self.get_html(url) 311 | self.csrf = html.find('meta', {'name': 'csrf-token'}).attrs['content'] 312 | 313 | def parse_actor_list_page(self, page=1): 314 | """ 315 | 从主播列表页获取主播信息 316 | """ 317 | 318 | ''' 319 | 构造参数->发送请求 320 | ''' 321 | url = 'http://video.51wom.com/media/' + str(page) + '.html' 322 | keys = ('_csrf', 'stage-name', 'platform', ' industry', 'price', 'follower_num', 'follower_area', 323 | 'page', 'is_video_platform', 'sort_by_price', 'type_by_price') 324 | params = dict() 325 | for key in keys: 326 | params.setdefault(key, '') 327 | params['_csrf'] = self.csrf 328 | params['page'] = str(page) 329 | html = self.post_multi_part(url, params) 330 | 331 | ''' 332 | 总条目数 333 | ''' 334 | total = int(html.find('div', {'id': 'w0'}).find('span', {'class': 'gross'}).i.get_text(strip=True)) 335 | 336 | ''' 337 | 解析主播列表 338 | ''' 339 | trs = html.find('div', {'id': 'table-list'}).table.findAll('tr') 340 | 
trs.pop(0) # 去除标题行 341 | actor_list = list() 342 | for tr in trs: 343 | actor_dict = dict() 344 | 345 | tds = tr.find_all('td') 346 | 347 | actor_dict['address'] = tds[0].span.attrs['data-address'] 348 | actor_dict['uuid'] = tds[0].span.attrs['data-uuid'] 349 | 350 | def format_price(price_str): 351 | p = price_str.split('.') 352 | p = 0 if p[0] == '' else p[0] 353 | return p 354 | actor_dict['max_price'] = format_price(tds[0].span.attrs['data-max-price']) 355 | actor_dict['min_price'] = format_price(tds[0].span.attrs['data-min-price']) 356 | 357 | as_ = tds[1].find_all('a') 358 | actor_dict['avatar'] = as_[0].img.attrs['src'] # 头像 359 | actor_dict['nickname'] = as_[1].get_text(strip=True) # 昵称 360 | sex = tds[1].find('i', {'class': 'note'}).img.attrs['src'] 361 | index_tmp = sex.find('.png') 362 | actor_dict['sex'] = sex[index_tmp - 1:index_tmp] # 性别:1-男,2-女 363 | actor_dict['geo_range'] = tds[1].find('span', {'class': 'name synopsis'}).get_text(strip=True) # 地域范围 364 | actor_dict['type_label'] = tds[1].li.get_text(strip=True) # 资源分类 365 | 366 | platform = tds[2].img.attrs['src'] 367 | index_tmp = platform.find('.png') 368 | actor_dict['platform'] = platform[index_tmp - 1:index_tmp] # 平台:5-秒拍, 369 | 370 | user_id = tds[3].span.get_text(strip=True).split('ID:') 371 | actor_dict['user_id'] = user_id[1].strip() # 用户在平台的user id 372 | 373 | actor_dict['followed'] = tds[4].span.get_text(strip=True) # 粉丝数 374 | 375 | prices = tds[5].find_all('p', {'class', 'p-price'}) # 报价方式,比如"视频原创+发布","线上直播","线下直播" 376 | price_dict = dict() 377 | for price in prices: 378 | price = price.get_text(strip=True).split(':') 379 | try: 380 | price_dict[price[0]] = price[1] 381 | except IndexError: 382 | pass 383 | actor_dict['price_dict'] = price_dict 384 | 385 | avg_watched = tds[6].get_text(strip=True) # 平均观看人数 386 | mode = re.compile(r'\d+') 387 | tmp = mode.findall(avg_watched) 388 | try: 389 | avg_watched = tmp[0] 390 | except IndexError: 391 | pass 392 | actor_dict['avg_watched'] = 
avg_watched 393 | actor_list.append(actor_dict) 394 | 395 | return {'total': total, 'page': page, 'items_count': len(actor_list), 'items': actor_list} 396 | 397 | def spider_actors(self): 398 | page = 1 399 | tbl_actor = WMYXActor() 400 | while True: 401 | ret = self.parse_actor_list_page(page) 402 | for actor in ret['items']: 403 | actor['price_dict'] = json.dumps(actor['price_dict']) 404 | tbl_actor.insert(actor, replace=True) 405 | if ret['items_count'] * ret['page'] < ret['total']: 406 | page += 1 407 | else: 408 | break 409 | 410 | 411 | class BoseModel(Model): 412 | conn = Mysql(host='127.0.0.1', user='root', passwd='123456', db='wanghong', charset='utf8') 413 | 414 | 415 | class WMYXActor(BoseModel): 416 | tbl = "Tbl_WMYX_Actor" 417 | 418 | 419 | class YiXiaActor(BoseModel): 420 | tbl = "Tbl_YiXia_Actor" 421 | 422 | 423 | class YiXiaVideo(BoseModel): 424 | tbl = "Tbl_YiXia_Video" 425 | 426 | class HuaJiaoActor(BoseModel): 427 | tbl = "Tbl_Huajiao_User" 428 | 429 | class Actor(BoseModel): 430 | tbl = "Tbl_Actor" 431 | 432 | 433 | def agg_actors(): 434 | Actor().delete() 435 | 436 | # 一下网 437 | actors = YiXiaActor().select('uid, nickname, follow, followed, praised, avatar, 2 as pid')\ 438 | .order_by('followed desc').limit(500).fetch_all(is_dict=1) 439 | try: 440 | for actor in actors: 441 | Actor().insert(actor) 442 | except Exception as e: 443 | print(e) 444 | 445 | # 花椒网 446 | actors = HuaJiaoActor().select('FUserId as uid, FUserName as nickname, FFollow as follow, FFollowed as followed,\ 447 | FSupported as praised, FAvatar as avatar, 1 as pid').order_by('FFollowed')\ 448 | .limit(500).fetch_all(is_dict=1) 449 | try: 450 | for actor in actors: 451 | Actor().insert(actor) 452 | except Exception as e: 453 | print(e) 454 | 455 | def spider_yixia_videos(): 456 | # yixia_actors = WMYXActor().select('user_id').where('platform=5').order_by('scraped_time desc').fetch_all() 457 | yixia_actors = YiXiaActor().select('uid').order_by('scraped_time 
desc').limit(100).fetch_all() 458 | y = YiXia() 459 | for actor in yixia_actors: 460 | uid = actor[0] 461 | actor = y.parse_user_page(uid) 462 | YiXiaActor().insert(actor, replace=True) 463 | y.spider_videos(actor['suid'], actor['video_count']) 464 | 465 | 466 | def spider_womiyouxuan_actors(): 467 | WoMiYouXuan().spider_actors() 468 | 469 | 470 | def spider_yixia_follows(): 471 | suids = YiXiaActor().select('suid').order_by('scraped_time desc, id desc').limit(20).fetch_all() 472 | if len(suids) <= 0: 473 | suids = [{'ZPWwDeYSvPUb23SL'}] 474 | for suid in suids: 475 | YiXia().spider_follows(suid[0]) 476 | 477 | def main(argv): 478 | useage = "Usage: python3 wanghong.py [spider_womiyouxuan_actors|spider_yixia_videos|spider_yixia_follows|" \ 479 | "womiyouxuan_actors_count|" \ 480 | "yixia_videos_count|yixia_actors_count" \ 481 | "|agg_actors]" 482 | if len(argv) < 2: 483 | print(useage) 484 | exit() 485 | 486 | if argv[1] == 'spider_womiyouxuan_actors': 487 | spider_womiyouxuan_actors() 488 | elif argv[1] == 'spider_yixia_videos': 489 | print(get_current_time() + ':' + 'spider_yixia_videos start') 490 | spider_yixia_videos() 491 | print(get_current_time() + ':' + 'spider_yixia_videos end') 492 | elif argv[1] == 'spider_yixia_follows': 493 | print(get_current_time() + ':' + 'spider_yixia_follows start') 494 | spider_yixia_follows() 495 | print(get_current_time() + ':' + 'spider_yixia_follows end') 496 | elif argv[1] == 'womiyouxuan_actors_count': 497 | count = WMYXActor().select("count(\"id\")").fetch_one() 498 | print(count[0]) 499 | elif argv[1] == 'yixia_videos_count': 500 | count = YiXiaVideo().select("count(\"id\")").fetch_one() 501 | print(count[0]) 502 | elif argv[1] == 'yixia_actors_count': 503 | count = YiXiaActor().select("count(\"id\")").fetch_one() 504 | print(count[0]) 505 | elif argv[1] == 'agg_actors': 506 | agg_actors() 507 | else: 508 | print(useage) 509 | 510 | if __name__ == '__main__': 511 | main(sys.argv) 512 | 513 | 514 | 515 | 516 | 517 | 518 | 
--------------------------------------------------------------------------------