├── README.md ├── qrcode_for_gh_61c6224cfae9_258.jpg └── spiderWanghong ├── db.sql ├── huajiao.py ├── mysql.py └── wanghong.py /README.md: -------------------------------------------------------------------------------- 1 | # PythonPractice 2 | 3 | ### 使用说明 4 | * 1. 使用db.sql建立mysql数据库 5 | * 2. 在wanghong.py的BoseModel定义里设置mysql的连接参数 6 | * 3. 安装python库pymysql, requests, BeautifulSoup 7 | * 4. 运行以下命令, 会提示支持的操作 8 | ``` 9 | # python3 wanghong.py 10 | Usage: python3 wanghong.py [spider_womiyouxuan_actors|spider_yixia_videos|spider_yixia_follows|womiyouxuan_actors_count|yixia_videos_count|yixia_actors_count] 11 | ``` 12 | * 5. 运行某一个命令,比如: 13 | ``` 14 | # python3 wanghong.py spider_yixia_follows 15 | ``` 16 | | 命令      | 含义          | 逻辑  | 17 | | :------------- |:-------------|:-----| 18 | | spider_womiyouxuan_actors     | 爬取沃米优选的主播信息 | 遍历每个分页并将主播信息写入数据表Tbl_WMYX_Actor | 19 | | spider_yixia_videos     | 爬取一下网的视频    |  从数据库中取出最新爬取的主播数据,进而爬取每个主播的视频数据,写入数据表Tbl_YiXia_Video | 20 | | spider_yixia_follows | 爬取一下网的主播    |  从数据库中取出最新爬取的主播数据,进而爬取每个主播关注的人的数据,写入数据表Tbl_YiXia_Actor | 21 | |yixia_videos_count|查看爬取的一下网视频总数| 22 | |yixia_actors_count|查看爬取的一下网主播总数| 23 | 24 | ### 已实现对以下直播类网站的数据爬取: 25 | * 花椒(http://www.huajiao.com/) 26 | * 一下(http://www.yixia.com/u/paike_oq7pzk336s) 27 | ``` 28 | ### 访问主播页面,从该页面获取到suid和主播个人信息 29 | uid = 'paike_oq7pzk336s' 30 | ret = YiXia().parse_user_page(uid) 31 | print(ret) 32 | """ 33 | {'relayed': '4', 'avatar': 'http://tp2.sinaimg.cn/2714280233/180/5728135083/0', 'video_count': '140', 'suid': 'ZPWwDeYSvPUb23SL', 'uid': 'paike_oq7pzk336s', 'follow': '13', 'followed': '21031136', 'descr': '微信订阅:dapapi。微博:papi酱。', 'location': '北京 崇文区', 'nickname': 'papi酱', 'praised': '0'} 34 | """ 35 | 36 | ### 获取某用户的关注列表 37 | suid = 'ZPWwDeYSvPUb23SL' 38 | page = 1 39 | ret = YiXia().get_follow_list(suid, page) 40 | print(ret) 41 | """ 42 | [{'followed': '169054', 'nickname': 'lyxp', 'follow': '3', 'descr': 'ta很懒什么都没有留下', 'uid': 'wxsso_nz297durpu', 'avatar': 
'http://wx.qlogo.cn/mmopen/gobtgL6xn9Z6KMsibqkqWeOa8Npickk1XKUbrwIWASjw40vdNWUT74PxVIdFe8FmAQu80Yq01rx4WL74rULianT2iaSz5PKgAedH/0', 'suid': '64tfU0JCV~O2YyFVR7sRGw__', 'video_count': '11'}, {'followed': '6827071', 'nickname': '最神奇的视频', 'follow': '11', 'descr': '搞笑,预告,你的喜怒哀乐这里都能看到,通过视频,让你感', 'uid': 'sina_0udpfn0a2h', 'avatar': 'http://tp4.sinaimg.cn/2141823055/180/5621846443/0', 'suid': 'lfMtGJsFJMlMhYm2', 'video_count': '3455'}, {'followed': '352', 'nickname': '扬名止过', 'follow': '14', 'descr': '波澜不惊,荣辱不争。', 'uid': 'paike_8iqcuo8pko', 'avatar': 'http://tp2.sinaimg.cn/1583429645/180/5621703354/1', 'suid': 'gn2U51iUx4PT6k8-', 'video_count': '0'}, {'followed': '499', 'nickname': '段蓓珊', 'follow': '13', 'descr': '……', 'uid': 'paike_c4i54d6ey2', 'avatar': 'http://tp2.sinaimg.cn/1670302465/180/5632141584/0', 'suid': '1Kev5Dmc1H7SMMnX', 'video_count': '1'}, {'followed': '145', 'nickname': '胖大星Alis', 'follow': '0', 'descr': 'ta很懒什么都没有留下', 'uid': 'paike_76o4l8zotz', 'avatar': 'http://tp3.sinaimg.cn/1760582170/180/5709471341/0', 'suid': 'epu~2vdSHF23E0Q-', 'video_count': '1'}, {'followed': '295', 'nickname': '文史_海巴子', 'follow': '0', 'descr': 'ta很懒什么都没有留下', 'uid': 'paike_7bnuhrz12h', 'avatar': 'http://tp2.sinaimg.cn/2624069177/180/5634691164/1', 'suid': 'CGTQC2jMVAA4Me26', 'video_count': '0'}, {'followed': '5880191', 'nickname': '英国那些事儿', 'follow': '45', 'descr': '一个在英国爱吐槽的主页君.没事爱分享英国最搞最有意思大', 'uid': 'paike_t9y36wkt4c', 'avatar': 'http://tp3.sinaimg.cn/2549228714/180/40021372518/1', 'suid': 'Ii9QcPCa~novHdgc', 'video_count': '744'}, {'followed': '12312', 'nickname': '每天搞笑排行榜', 'follow': '6', 'descr': 'ta很懒什么都没有留下', 'uid': 'paike_oqbmsp87kq', 'avatar': 'http://tp3.sinaimg.cn/2281122894/180/5661656420/0', 'suid': 'PQX0xTUI4fgV~s3v', 'video_count': '0'}, {'followed': '3414317', 'nickname': '史上第一最最搞', 'follow': '7', 'descr': 'ta很懒什么都没有留下', 'uid': 'paike_pomohtzbiw', 'avatar': 'http://tp1.sinaimg.cn/1134796120/180/40069206893/0', 'suid': '3Xlno6tiKcXS6noq', 'video_count': '5000'}, 
{'followed': '63631', 'nickname': '霍泥芳', 'follow': '8', 'descr': '<夏天有风吹过>里,我是内向叛逆的半夏;<幸福生活在招', 'uid': 'paike_4kf51dy2de', 'avatar': 'http://tp1.sinaimg.cn/1277126544/180/5641596294/0', 'suid': 'yVwNg6clktoWe-Ib', 'video_count': '10'}, {'followed': '20308', 'nickname': 'M大王叫我来巡', 'follow': '0', 'descr': 'ta很懒什么都没有留下', 'uid': 'paike_rx2xp66tks', 'avatar': 'http://tp4.sinaimg.cn/1720173771/180/40048639291/1', 'suid': 'tJ2tClKrqCYm6uDc', 'video_count': '26'}, {'followed': '7195252', 'nickname': 'gogoboi', 'follow': '12', 'descr': '冒着脑残的炮火前进,前进,前进进!工作联系:gogob', 'uid': 'paike_bg95tflssd', 'avatar': 'http://tp2.sinaimg.cn/1706372681/180/40017354355/1', 'suid': 's5u1-93x2yMZx6NM', 'video_count': '20'}, {'followed': '8929355', 'nickname': '秒拍', 'follow': '659', 'descr': '秒拍-10秒拍大片!', 'uid': 'paike_i1dudsh696', 'avatar': 'http://dynimg3.yixia.com/square.124/storage.video.sina.com.cn/user-icon/EfFEP4pOsmYCl0Nf_480__1438164133711.jpg', 'suid': 'EfFEP4pOsmYCl0Nf', 'video_count': '622'}] 43 | """ 44 | 45 | ### 获取某用户的视频列表 46 | suid = 'ZPWwDeYSvPUb23SL' 47 | page = 1 48 | ret = YiXia().get_video_list(suid, page) 49 | print(ret) 50 | """ 51 | [{'scid': 'Svl4iqHkBsM~DCNCf0WPsQ__', 'detail_page': 'http://www.yixia.com/show/Svl4iqHkBsM~DCNCf0WPsQ__.htm', 'praised': 2321, 'discussed': 3258, 'flash': 'http://wscdn.miaopai.com/splayer2.2.0.swf?scid=Svl4iqHkBsM~DCNCf0WPsQ__&fromweibo=false&fromweibo=false&token=', 'img': 'http://wsacdn4.miaopai.com/stream/Svl4iqHkBsM~DCNCf0WPsQ___tmp_11_409_.jpg', 'title': '“难道只有我一个人觉得吗?”是呀!当然只有你一个人觉得!你多厉害呀!你最与众不同啦!你存在感爆棚!(祝大家一周&周一愉快嗷~比心~最近的雾霾超好吸超带感超咳咳咳咳咳咳咳咳咳', 'pub_date': '17:44', 'watched': 4680000}, {'scid': 'd5xoiWIzy9edsWtNhNZBEw__', 'detail_page': 'http://www.yixia.com/show/d5xoiWIzy9edsWtNhNZBEw__.htm', 'praised': 29000, 'discussed': 4347, 'flash': 'http://wscdn.miaopai.com/splayer2.2.0.swf?scid=d5xoiWIzy9edsWtNhNZBEw__&fromweibo=false&fromweibo=false&token=', 'img': 'http://wsacdn1.miaopai.com/stream/d5xoiWIzy9edsWtNhNZBEw___tmp_11_354_.jpg', 
'title': '“现在的观众,根本不知道什么才是好电影”,资深影迷pa某酱表示。近期影片盘点,该看什么?看点在哪儿?pa某酱让你更迷惑。(本视频纯属胡说八道,不接受任何反驳,比心️', 'pub_date': '12-17', 'watched': 8200000}, {'scid': 'd3Ph834EJZtuSNeSL7AJng__', 'detail_page': 'http://www.yixia.com/show/d3Ph834EJZtuSNeSL7AJng__.htm', 'praised': 27000, 'discussed': 56, 'flash': 'http://wscdn.miaopai.com/splayer2.2.0.swf?scid=d3Ph834EJZtuSNeSL7AJng__&fromweibo=false&fromweibo=false&token=', 'img': 'http://wsacdn3.miaopai.com/stream/d3Ph834EJZtuSNeSL7AJng___tmp_11_741_.jpg', 'title': 'papi酱不定期更新的日常——pa老师的英语课。同学们', 'pub_date': '12-16', 'watched': 20240000}, {'scid': 'ZzRKTzzvM6WgNZbLRO2HUg__', 'detail_page': 'http://www.yixia.com/show/ZzRKTzzvM6WgNZbLRO2HUg__.htm', 'praised': 29000, 'discussed': 93, 'flash': 'http://wscdn.miaopai.com/splayer2.2.0.swf?scid=ZzRKTzzvM6WgNZbLRO2HUg__&fromweibo=false&fromweibo=false&token=', 'img': 'http://qncdn.miaopai.com/stream/ZzRKTzzvM6WgNZbLRO2HUg___qnweb_14818081966424.jpg', 'title': '“爱所有人,信任一些人,不伤害任何人。”这句莎剧的台词,是我在自己的视频中一直想要传达的,也是我静下来的时候不断回想的。不知多少人能接受这个视频里这样的我,希望你们看完后能认识并且接受一个或许不太熟悉的papi。(实不相瞒,这个视频,我是捂着眼睛看的(评论里不要截图给我(我羞赧...', 'pub_date': '12-15', 'watched': 21190000}] 52 | """ 53 | ``` 54 | * 沃米优选(http://video.51wom.com/) 55 | 56 | ### TODO: 57 | * 映客(http://www.inke.cn/hotlive_list.html) 58 | * 斗鱼(https://www.douyu.com/) 59 | * 微信公众号 60 | 61 | ### 代码逻辑请参考以下文章: 62 | #####[Python初学者之网络爬虫](http://mp.weixin.qq.com/s/vNcQtXWjGHnc6JMjt_vWiQ "Python初学者之网络爬虫") 63 | #####[Python初学者之网络爬虫(二)](http://mp.weixin.qq.com/s/WoLKDnaFBcJ-u3msAqtDNw "Python初学者之网络爬虫(二)") 64 | -------------------------------------------------------------------------------- /qrcode_for_gh_61c6224cfae9_258.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/octans/PythonPractice/87884e720f499f94f02ecf3bf94dc8b850ee247b/qrcode_for_gh_61c6224cfae9_258.jpg -------------------------------------------------------------------------------- /spiderWanghong/db.sql: 
--------------------------------------------------------------------------------
DROP DATABASE IF EXISTS `wanghong`;
CREATE DATABASE `wanghong` DEFAULT CHARACTER SET utf8mb4 DEFAULT COLLATE utf8mb4_general_ci;
USE `wanghong`;
set names utf8mb4;


DROP TABLE IF EXISTS `user`;
CREATE TABLE `user` (
    `id` INT UNSIGNED,
    `name` VARCHAR(100),
    `order` INT UNSIGNED,
    PRIMARY KEY (`id`)
);

#DROP TABLE IF EXISTS `Tbl_Huajiao_Live`;
CREATE TABLE `Tbl_Huajiao_Live` (
    `FLiveId` INT UNSIGNED NOT NULL,
    `FUserId` INT UNSIGNED NOT NULL,
    `FWatches` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '观看人数',
    `FPraises` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '赞数',
    `FReposts` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'unknown',
    `FReplies` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT 'unknown',
    `FPublishTimestamp` INT UNSIGNED NOT NULL COMMENT '发布日期',
    `FTitle` VARCHAR(100) NOT NULL DEFAULT '' COMMENT '直播名称',
    `FImage` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '直播封面',
    `FLocation` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '地点',
    `FScrapedTime` timestamp NOT NULL COMMENT '爬虫更新时间',
    PRIMARY KEY (`FLiveId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

#DROP TABLE IF EXISTS `Tbl_Huajiao_User`;
CREATE TABLE `Tbl_Huajiao_User` (
    `FUserId` INT UNSIGNED NOT NULL,
    `FUserName` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '昵称',
    `FLevel` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '等级',
    `FFollow` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '关注数',
    `FFollowed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '粉丝数',
    `FSupported` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '赞数',
    `FExperience` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '经验值',
    `FAvatar` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '头像地址',
    `FScrapedTime` timestamp NOT NULL COMMENT '爬虫时间',
    PRIMARY KEY (`FUserId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;


## aggregated actor table (all platforms)
#DROP TABLE IF EXISTS `Tbl_Actor`;
CREATE TABLE `Tbl_Actor` (
    `id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
    `uid` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '唯一标识用户',
    `nickname` VARCHAR(100) NOT NULL DEFAULT '' COMMENT '昵称',
    `follow` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '关注数',
    `followed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '粉丝数',
    `praised` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '被赞数',
    `avatar` VARCHAR(200) NOT NULL DEFAULT '' COMMENT '头像',
    `pid` TINYINT UNSIGNED NOT NULL DEFAULT 0 COMMENT '平台id, 1-花椒,2-一下',
    `scraped_time` timestamp NOT NULL COMMENT '爬虫更新时间',
    PRIMARY KEY (`id`),
    UNIQUE INDEX `INDEX_uid_pid` (`uid`, `pid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

# WoMiYouXuan (51wom.com) actor table
#DROP TABLE IF EXISTS `Tbl_WMYX_Actor`;
CREATE TABLE `Tbl_WMYX_Actor` (
    `id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
    `uuid` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '沃米优选唯一id',
    `user_id` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '平台id',
    `platform` TINYINT NOT NULL DEFAULT 0 COMMENT '直播平台',
    `nickname` VARCHAR(100) NOT NULL DEFAULT '' COMMENT '昵称',
    `followed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '粉丝数',
    `avg_watched` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '平均观看人数',
    `price_dict` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '推广方式和价格',
    `type_label` VARCHAR(50) NOT NULL DEFAULT '' COMMENT '资源分类',
    `geo_range` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '推广覆盖地域',
    `sex` TINYINT NOT NULL DEFAULT 0 COMMENT '性别',
    `avatar` VARCHAR(200) NOT NULL DEFAULT '' COMMENT '头像',
    `min_price` INT(10) UNSIGNED NOT NULL DEFAULT 0 COMMENT 'min_price',
    `max_price` INT(10) UNSIGNED NOT NULL DEFAULT 0 COMMENT 'max_price',
    `address` VARCHAR(30) NOT NULL DEFAULT '' COMMENT 'address',
    `scraped_time` timestamp NOT NULL COMMENT '爬虫更新时间',
    PRIMARY KEY (`id`),
    UNIQUE INDEX `INDEX_uuid` (`uuid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

## YiXia actor table
#DROP TABLE IF EXISTS `Tbl_YiXia_Actor`;
CREATE TABLE `Tbl_YiXia_Actor` (
    `id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
    `uid` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '唯一标识用户',
    `suid` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '唯一标识用户',
    `nickname` VARCHAR(100) NOT NULL DEFAULT '' COMMENT '昵称',
    `follow` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '关注数',
    `followed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '粉丝数',
    `video_count` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '视频数',
    `relayed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '转发数,主播转发被人的',
    `praised` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '被赞数',
    `location` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '位置',
    `avatar` VARCHAR(200) NOT NULL DEFAULT '' COMMENT '头像',
    `descr` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '简介',
    `scraped_time` timestamp NOT NULL COMMENT '爬虫更新时间',
    PRIMARY KEY (`id`),
    UNIQUE INDEX `INDEX_uid` (`uid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

## YiXia video table
#DROP TABLE IF EXISTS `Tbl_YiXia_Video`;
CREATE TABLE `Tbl_YiXia_Video` (
    `id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
    `scid` VARCHAR(30) NOT NULL DEFAULT '' COMMENT '唯一标识视频',
    `pub_date` VARCHAR(100) NOT NULL DEFAULT '' COMMENT '发布日期',
    `watched` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '观看数',
    `praised` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '被赞数',
    `discussed` INT UNSIGNED NOT NULL DEFAULT 0 COMMENT '评论数',
    `img` VARCHAR(200) NOT NULL DEFAULT '' COMMENT '封面',
    `title` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '标题',
    `detail_page` VARCHAR(255) NOT NULL DEFAULT '' COMMENT '详情页',
    `flash` VARCHAR(255) NOT NULL DEFAULT '' COMMENT 'falsh地址',
    `scraped_time` timestamp NOT NULL COMMENT '爬虫更新时间',
    PRIMARY KEY (`id`),
    UNIQUE INDEX `INDEX_scid` (`scid`)
121 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /spiderWanghong/huajiao.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from urllib.request import urlopen 3 | from bs4 import BeautifulSoup 4 | import re 5 | import json 6 | import pymysql 7 | import time 8 | import datetime 9 | from mysql import Model 10 | from mysql import Mysql 11 | 12 | 13 | def getNowTime(): 14 | return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())) 15 | 16 | # filter out live ids from a url 17 | def filterLiveIds(url): 18 | html = urlopen(url) 19 | liveIds = set() 20 | bsObj = BeautifulSoup(html, "html.parser") 21 | for link in bsObj.findAll("a", href=re.compile("^(/l/)")): 22 | if 'href' in link.attrs: 23 | newPage = link.attrs['href'] 24 | liveId = re.findall("[0-9]+", newPage) 25 | liveIds.add(liveId[0]) 26 | return liveIds 27 | 28 | # get live ids from recommand page 29 | def getLiveIdsFromRecommendPage(): 30 | liveIds = set() 31 | liveIds = filterLiveIds("http://www.huajiao.com/category/1000") | filterLiveIds("http://www.huajiao.com/category/1000?pageno=2") 32 | return liveIds 33 | 34 | # get user id from live page 35 | def getUserId(liveId): 36 | html = urlopen("http://www.huajiao.com/" + "l/" + str(liveId)) 37 | bsObj = BeautifulSoup(html, "html.parser") 38 | text = bsObj.title.get_text() 39 | res = re.findall("[0-9]+", text) 40 | return res[0] 41 | 42 | 43 | # get user data from user page 44 | def getUserData(userId): 45 | print('getUserData: userId=' + userId ) 46 | html = urlopen("http://www.huajiao.com/user/" + str(userId)) 47 | bsObj = BeautifulSoup(html, "html.parser") 48 | data = dict() 49 | try: 50 | userInfoObj = bsObj.find("div", {"id":"userInfo"}) 51 | data['FAvatar'] = userInfoObj.find("div", {"class": "avatar"}).img.attrs['src'] 52 | userId = 
userInfoObj.find("p", {"class":"user_id"}).get_text() 53 | data['FUserId'] = re.findall("[0-9]+", userId)[0] 54 | tmp = userInfoObj.h3.get_text('|', strip=True).split('|') 55 | data['FUserName'] = tmp[0] 56 | data['FLevel'] = tmp[1] 57 | tmp = userInfoObj.find("ul", {"class":"clearfix"}).get_text('|', strip=True).split('|') 58 | data['FFollow'] = tmp[0] 59 | data['FFollowed'] = tmp[2] 60 | data['FSupported'] = tmp[4] 61 | data['FExperience'] = tmp[6] 62 | 63 | return data 64 | except AttributeError: 65 | #traceback.print_exc() 66 | print(str(userId) + ":html parse error in getUserData()") 67 | return 0 68 | 69 | # get user history lives 70 | def getUserLives(userId): 71 | print('getUserLives: userId=' + str(userId)) 72 | try: 73 | url = "http://webh.huajiao.com/User/getUserFeeds?fmt=json&uid=" + str(userId) 74 | html = urlopen(url).read().decode('utf-8') 75 | jsonData = json.loads(html) 76 | if jsonData['errno'] != 0: 77 | print(str(userId) + "error occured in getUserFeeds for: " + jsonData['msg']) 78 | return 0 79 | 80 | return jsonData['data']['feeds'] 81 | except Exception as e: 82 | print(e) 83 | return 0 84 | 85 | 86 | def getTimestamp(): 87 | return (time.mktime(datetime.datetime.now().timetuple())) 88 | 89 | # update user live data 90 | def replaceUserLive(data): 91 | try: 92 | kvs = dict() 93 | kvs['FLiveId'] = int(data['relateid']) 94 | kvs['FUserId'] = int(data['FUserId']) 95 | kvs['FWatches'] = int(data['watches']) 96 | kvs['FPraises'] = int(data['praises']) 97 | kvs['FReposts'] = int(data['reposts']) 98 | kvs['FReplies'] = int(data['replies']) 99 | kvs['FPublishTimestamp'] = int(data['publishtimestamp']) 100 | kvs['FTitle'] = data['title'] 101 | kvs['FImage'] = data['image'] 102 | kvs['FLocation'] = data['location'] 103 | kvs['FScrapedTime'] = getNowTime() 104 | Live().insert(kvs, 1) 105 | except pymysql.err.InternalError as e: 106 | print(e) 107 | 108 | # spider user ids 109 | def spiderUserDatas(): 110 | for liveId in getLiveIdsFromRecommendPage(): 
111 | userId = getUserId(liveId) 112 | userData = getUserData(userId) 113 | try: 114 | if userData: 115 | User().insert(userData, 1) 116 | except pymysql.err.InternalError as e: 117 | print(e) 118 | print(userData) 119 | 120 | return 1 121 | 122 | # spider user lives 123 | def spiderUserLives(): 124 | userIds = User().select("FUserId").limit(100).fetch_all() 125 | for userId in userIds: 126 | liveDatas = getUserLives(userId[0]) 127 | try: 128 | for liveData in liveDatas: 129 | liveData['feed']['FUserId'] = userId[0] 130 | replaceUserLive(liveData['feed']) 131 | except Exception as e: 132 | print(e) 133 | 134 | return 1 135 | 136 | 137 | class BoseModel(Model): 138 | conn = Mysql(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='123456', db='wanghong', charset='utf8') 139 | 140 | 141 | class User(BoseModel): 142 | tbl = "Tbl_Huajiao_User" 143 | 144 | 145 | class Live(BoseModel): 146 | tbl = "Tbl_Huajiao_Live" 147 | 148 | 149 | def main(argv): 150 | if len(argv) < 2: 151 | print("Usage: python3 huajiao.py [spiderUserDatas|spiderUserLives]") 152 | exit() 153 | if argv[1] == 'spiderUserDatas': 154 | spiderUserDatas() 155 | elif argv[1] == 'spiderUserLives': 156 | spiderUserLives() 157 | elif argv[1] == 'getUserCount': 158 | count = User().select("count(\"FUserId\")").fetch_one() 159 | print(count[0]) 160 | elif argv[1] == 'getLiveCount': 161 | count = Live().select("count(\"FLiveId\")").fetch_one() 162 | print(count[0]) 163 | else: 164 | print("Usage: python3 huajiao.py [spiderUserDatas|spiderUserLives|getUserCount|getLiveCount]") 165 | 166 | if __name__ == '__main__': 167 | main(sys.argv) 168 | 169 | -------------------------------------------------------------------------------- /spiderWanghong/mysql.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | 3 | 4 | def addslashes(s): 5 | try: 6 | d = {'"': '\\"', "'": "\\'", "\0": "\\\0", "\\": "\\\\"} 7 | return ''.join(d.get(c, c) for c in s) 8 | 
except: 9 | return s 10 | 11 | 12 | class Mysql(): 13 | def __new__(cls, **params): 14 | cls.connect(**params) 15 | return cls 16 | 17 | def __del__(cls): 18 | cls.close() 19 | 20 | @classmethod 21 | def connect(cls, **params): 22 | cls.conn = pymysql.connect(**params) 23 | cls.cursor = cls.conn.cursor() 24 | cls.cursor.execute("set names utf8mb4") 25 | 26 | @classmethod 27 | def close(cls): 28 | cls.cursor.close() 29 | cls.conn.close() 30 | 31 | @classmethod 32 | def query(cls, sql, is_dict=0): 33 | if is_dict: 34 | cls.cursor = cls.conn.cursor(pymysql.cursors.DictCursor) 35 | cls.cursor.execute(sql) 36 | return cls 37 | 38 | 39 | class Model: 40 | sql = '' 41 | 42 | def select(self, select_str): 43 | if select_str.find(",") == -1: 44 | select_str = select_str 45 | else: 46 | fields = list() 47 | for f in select_str.split(","): 48 | if f.find('as') > 0: 49 | p = f.split(" as ") 50 | fields.append(p[0].strip() + ' as `' + p[1].strip() + '`') 51 | else: 52 | fields.append('`' + f.strip() + '`') 53 | select_str = ",".join(fields) 54 | self.sql = "SELECT " + select_str + " FROM " + self.tbl 55 | return self 56 | 57 | def where(self, string): 58 | self.sql = self.sql + " WHERE " + string 59 | return self 60 | 61 | def order_by(self, string): 62 | self.sql = self.sql + " ORDER BY " + string 63 | return self 64 | 65 | def limit(self, num): 66 | self.sql = self.sql + " LIMIT " + str(num) 67 | return self 68 | 69 | def fetch_all(self, is_dict=0): 70 | return self.conn.query(self.sql, is_dict).cursor.fetchall() 71 | 72 | def fetch_one(self): 73 | return self.conn.query(self.sql).cursor.fetchone() 74 | 75 | def insert(self, data, replace=None): 76 | fields = list() 77 | for a in data.keys(): 78 | fields.append('`' + a + '`') 79 | sqlFields = ",".join(fields) 80 | 81 | values = list() 82 | for v in data.values(): 83 | v = addslashes(v) 84 | v = "\"" + v + "\"" if type(v) is type("a") else str(v) 85 | values.append(v) 86 | sqlValues = ",".join(values) 87 | 88 | action = 
"INSERT" if replace is None else "REPLACE" 89 | sql = action + " INTO " + self.tbl + " (" + sqlFields + ") VALUES (" + sqlValues + ")" 90 | self.conn.query(sql).conn.commit() 91 | 92 | def update(self, where, **data): 93 | pass 94 | 95 | def delete(self, where='1'): 96 | sql = "DELETE FROM " + self.tbl + " WHERE " + where 97 | self.conn.query(sql).conn.commit() 98 | -------------------------------------------------------------------------------- /spiderWanghong/wanghong.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import json 5 | import sys 6 | import time 7 | from mysql import Model 8 | from mysql import Mysql 9 | 10 | """ 11 | class Cache(): 12 | ''' 13 | 将_csrf的值存储到文件 14 | ''' 15 | cacheFile = './csrf.cache' 16 | fileObj = '' 17 | 18 | def read(self): 19 | if os.path.isfile(self.cacheFile): 20 | self.fileObj = open(self.cacheFile, 'r') 21 | return self.fileObj.read() 22 | else: 23 | return '' 24 | 25 | def write(self, string): 26 | self.fileObj = open(self.cacheFile, 'w') 27 | self.fileObj.write(string) 28 | 29 | def __del__(self): 30 | if self.fileObj != '': 31 | self.fileObj.close() 32 | """ 33 | 34 | 35 | def get_current_time(): 36 | 37 | return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 38 | 39 | 40 | class Website: 41 | session = requests.session() 42 | 43 | htmlParser = BeautifulSoup 44 | 45 | jsonParser = json 46 | 47 | headers = { 48 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/' 49 | '54.0.2840.98 Safari/537.36' 50 | } 51 | 52 | def get(self, url, params=None): 53 | if params is None: 54 | params = {} 55 | return self.session.get(url, params=params, headers=self.headers) 56 | 57 | def get_html(self, url, params=None): 58 | """ 59 | GET请求, 用于网站返回html时 60 | """ 61 | r = self.get(url, params) 62 | return self.htmlParser(r.text, 'html.parser') 63 | 64 | def 
get_json(self, url, params=None): 65 | """ 66 | GET请求, 用于网站返回json时 67 | """ 68 | 69 | r = self.get(url, params) 70 | 71 | return self.jsonParser.loads(r.text) 72 | 73 | def post_url_encoded(self, url, params): 74 | """ 75 | POST方式:Content-Type: application/x-www-form-urlencoded 76 | """ 77 | pass 78 | 79 | def post_multi_part(self, url, params): 80 | """ 81 | POST方式:Content-Type:multipart/form-data 82 | """ 83 | 84 | kwargs = dict() 85 | for (k, v) in params.items(): 86 | kwargs.setdefault(k, (None, v)) 87 | r = self.session.post(url, files=kwargs, headers=self.headers) 88 | 89 | return self.htmlParser(r.text, "html.parser") 90 | 91 | 92 | class YiXia(Website): 93 | 94 | def parse_user_page(self, uid): 95 | """ 96 | 访问主播页面,也是视频列表页,从该页面获取到suid和主播个人信息 97 | """ 98 | 99 | print(get_current_time() + ':' + self.__class__.__name__ + ':parse_user_page, uid=' + uid) 100 | user = dict() 101 | user['uid'] = uid 102 | url = 'http://www.yixia.com/u/' + uid 103 | bs = self.get_html(url) 104 | 105 | div = bs.find('div', {'class': 'box1'}) 106 | user['nickname'] = div.h1.a.get_text(strip=True) # 昵称 107 | 108 | stat = div.ol.get_text(strip=True) 109 | stat = re.split('关注\||粉丝', stat) 110 | user['follow'] = stat[0].strip() # 关注数 111 | user['followed'] = stat[1].strip() # 粉丝数 112 | 113 | user['avatar'] = bs.find('div', {'class': 'nav_div1'}).a.img.attrs['src'] # 头像 114 | 115 | user['suid'] = bs.find('div', {'class': 'nav_div1'}).find('button').attrs['suid'] # suid 116 | 117 | tmp = bs.find('div', {'class': 'nav_div3'}).get_text('@#$%', strip=True).split('@#$%') 118 | user['location'] = tmp[0] 119 | user['descr'] = tmp[1] 120 | 121 | tmp = bs.find('div', {'class': 'n_b_con'}).get_text(strip=True) 122 | tmp = re.split('视频|转发|赞', tmp) 123 | user['video_count'] = tmp[0] # 视频数 124 | user['relayed'] = tmp[1] # 转发数 125 | user['praised'] = tmp[2] # 被赞数 126 | 127 | return user 128 | 129 | def get_follow_list(self, suid, page=1): 130 | """ 131 | 获取某用户的关注列表 132 | 
页面地址:http://www.yixia.com/u/$uid/relation/follow.htm 瀑布流展示关注列表, 133 | ajax接口地址:http://www.yixia.com/gu/follow?page=1&suid=$suid 134 | """ 135 | 136 | print(get_current_time() + ':' + self.__class__.__name__ + ':get_follow_list, suid=' + suid + ' page=' + str(page)) 137 | 138 | url = 'http://www.yixia.com/gu/follow' # ajax接口 139 | params = { 140 | 'page': page, 141 | 'suid': suid, 142 | } 143 | res = self.get_json(url, params) 144 | if res['msg'] == '': 145 | return list() 146 | 147 | res = BeautifulSoup(res['msg'], 'html.parser') 148 | boxs = res.findAll('div', {'class': 'box'}) 149 | users = list() 150 | for box in boxs: 151 | user = dict() 152 | user['suid'] = suid 153 | top = box.find('div', {'class': 'box_top'}) 154 | user['avatar'] = top.img.attrs['src'] 155 | user['uid'] = top.a.attrs['href'] 156 | user['uid'] =re.split('http://www.yixia.com/u/', user['uid']) 157 | user['uid'] = user['uid'][1] 158 | user['suid'] = top.div.h2.button.attrs['suid'] 159 | user['nickname'] = box.find('div', {'class': 'top_txt'}).a.get_text(strip=True) 160 | 161 | center = box.find('div', {'class': 'box_center'}).get_text(strip=True) 162 | center = re.split('视频|关注|粉丝', center) 163 | user['video_count'] = center[0] # 视频数 164 | user['follow'] = center[1] # 关注数 165 | user['followed'] = center[2] # 粉丝数 166 | user['descr'] = box.find('p', {'class': 'box_bottom'}).b.get_text(strip=True) 167 | users.append(user) 168 | return users 169 | 170 | def get_video_list(self, suid, page=1): 171 | """ 172 | AJAX请求视频列表 173 | """ 174 | 175 | url = 'http://www.yixia.com/gu/u' 176 | payload = { 177 | 'page': page, 178 | 'suid': suid, 179 | 'fen_type': 'channel' 180 | } 181 | json_obj = self.get_json(url, params=payload) 182 | msg = json_obj['msg'] 183 | msg = BeautifulSoup(msg, 'html.parser') 184 | 185 | ''' 186 | 解析视频标题 187 | ''' 188 | titles = list() 189 | ps = msg.findAll('p') 190 | for p in ps: 191 | titles.append(p.get_text(strip=True)) # 视频标题 192 | 193 | ''' 194 | 解析视频赞和评论数 195 | ''' 196 | stats 
= list() 197 | divs = msg.findAll('div', {'class': 'list clearfix'}) 198 | for div in divs: 199 | tmp = div.ol.get_text(strip=True) 200 | tmp = re.split('赞|\|评论', tmp) 201 | stats.append(tmp) 202 | 203 | ''' 204 | 解析视频其他数据 205 | ''' 206 | videos = list() 207 | divs = msg.findAll('div', {'class': 'D_video'}) 208 | for (k, div) in enumerate(divs): 209 | video = dict() 210 | video['scid'] = div.attrs['data-scid'] 211 | video['img'] = div.find('div', {'class': 'video_img'}).img.attrs['src'] # 视频封面 212 | video['flash'] = div.find('div', {'class': 'video_flash'}).attrs['va'] # 视频flash地址 213 | intro = div.find('div', {'class': 'introduction'}) 214 | head_area = intro.find('div', {'class': 'D_head_name'}).h2 215 | video['detail_page'] = head_area.a.attrs['href'] # 视频详情地址 216 | video['pub_date'] = head_area.b.get_text(strip=True) # 视频日期 217 | head_area.a.decompose() 218 | tmp = head_area.get_text(strip=True) 219 | tmp = re.split('观看', tmp) 220 | 221 | def format_num(string): 222 | 223 | # 判断是否有逗号,比如8,189 224 | try: 225 | index = string.index(',') 226 | string = string.replace(',', '') 227 | except ValueError: 228 | string = string 229 | 230 | # 判断是否有小数点 231 | try: 232 | index = string.index('.') 233 | is_float = True 234 | except ValueError: 235 | is_float = False 236 | 237 | # 是否有万字 238 | t = string[len(string)-1] 239 | if t == '万': 240 | num = string.replace('万', '') 241 | if is_float: 242 | ret = int(float(num) * 10000) 243 | else: 244 | ret = int(num) * 10000 245 | else: 246 | if is_float: 247 | ret = float(string) 248 | else: 249 | ret = int(string) 250 | 251 | return ret 252 | 253 | try: 254 | video['watched'] = format_num(tmp[0]) #观看量 255 | video['title'] = titles[k] # 标题 256 | video['praised'] = format_num(stats[k][1]) # 赞 257 | video['discussed'] = format_num((stats[k][2])) # 评论 258 | except (ValueError, IndexError) as e: 259 | print(e) 260 | else: 261 | videos.append(video) 262 | 263 | return videos 264 | 265 | def spider_videos(self, suid, video_count): 266 | 
page = 1 267 | current = 0 268 | tbl_video = YiXiaVideo() 269 | while current < int(video_count): 270 | print(get_current_time() + ':' + 'spider_videos: suid=' + suid + ', page=' + str(page)) 271 | videos = self.get_video_list(suid, page) 272 | for video in videos: 273 | tbl_video.insert(video, replace=True) 274 | current += len(videos) 275 | page += 1 276 | return True 277 | 278 | def spider_follows(self, suid): 279 | page = 1 280 | tbl_user = YiXiaActor() 281 | while True: 282 | users = self.get_follow_list(suid, page) 283 | if len(users) <= 0: 284 | break; 285 | for user in users: 286 | tbl_user.insert(user, replace=True) 287 | page += 1 288 | 289 | return True 290 | 291 | 292 | 293 | class WoMiYouXuan(Website): 294 | """ 295 | 网红数据分析平台:沃米优选 http://www.51wom.com/ 296 | """ 297 | 298 | csrf = '' 299 | 300 | def __init__(self): 301 | self.first_kiss() 302 | 303 | def first_kiss(self): 304 | """ 305 | 首次请求获取cookies和csrf, 将cookies和csrf放入后续每次发请求的头信息里; 306 | 其中cookies由requests.session()自动处理 307 | """ 308 | 309 | url = 'http://video.51wom.com/' 310 | html = self.get_html(url) 311 | self.csrf = html.find('meta', {'name': 'csrf-token'}).attrs['content'] 312 | 313 | def parse_actor_list_page(self, page=1): 314 | """ 315 | 从主播列表页获取主播信息 316 | """ 317 | 318 | ''' 319 | 构造参数->发送请求 320 | ''' 321 | url = 'http://video.51wom.com/media/' + str(page) + '.html' 322 | keys = ('_csrf', 'stage-name', 'platform', ' industry', 'price', 'follower_num', 'follower_area', 323 | 'page', 'is_video_platform', 'sort_by_price', 'type_by_price') 324 | params = dict() 325 | for key in keys: 326 | params.setdefault(key, '') 327 | params['_csrf'] = self.csrf 328 | params['page'] = str(page) 329 | html = self.post_multi_part(url, params) 330 | 331 | ''' 332 | 总条目数 333 | ''' 334 | total = int(html.find('div', {'id': 'w0'}).find('span', {'class': 'gross'}).i.get_text(strip=True)) 335 | 336 | ''' 337 | 解析主播列表 338 | ''' 339 | trs = html.find('div', {'id': 'table-list'}).table.findAll('tr') 340 | 
trs.pop(0) # 去除标题行 341 | actor_list = list() 342 | for tr in trs: 343 | actor_dict = dict() 344 | 345 | tds = tr.find_all('td') 346 | 347 | actor_dict['address'] = tds[0].span.attrs['data-address'] 348 | actor_dict['uuid'] = tds[0].span.attrs['data-uuid'] 349 | 350 | def format_price(price_str): 351 | p = price_str.split('.') 352 | p = 0 if p[0] == '' else p[0] 353 | return p 354 | actor_dict['max_price'] = format_price(tds[0].span.attrs['data-max-price']) 355 | actor_dict['min_price'] = format_price(tds[0].span.attrs['data-min-price']) 356 | 357 | as_ = tds[1].find_all('a') 358 | actor_dict['avatar'] = as_[0].img.attrs['src'] # 头像 359 | actor_dict['nickname'] = as_[1].get_text(strip=True) # 昵称 360 | sex = tds[1].find('i', {'class': 'note'}).img.attrs['src'] 361 | index_tmp = sex.find('.png') 362 | actor_dict['sex'] = sex[index_tmp - 1:index_tmp] # 性别:1-男,2-女 363 | actor_dict['geo_range'] = tds[1].find('span', {'class': 'name synopsis'}).get_text(strip=True) # 地域范围 364 | actor_dict['type_label'] = tds[1].li.get_text(strip=True) # 资源分类 365 | 366 | platform = tds[2].img.attrs['src'] 367 | index_tmp = platform.find('.png') 368 | actor_dict['platform'] = platform[index_tmp - 1:index_tmp] # 平台:5-秒拍, 369 | 370 | user_id = tds[3].span.get_text(strip=True).split('ID:') 371 | actor_dict['user_id'] = user_id[1].strip() # 用户在平台的user id 372 | 373 | actor_dict['followed'] = tds[4].span.get_text(strip=True) # 粉丝数 374 | 375 | prices = tds[5].find_all('p', {'class', 'p-price'}) # 报价方式,比如"视频原创+发布","线上直播","线下直播" 376 | price_dict = dict() 377 | for price in prices: 378 | price = price.get_text(strip=True).split(':') 379 | try: 380 | price_dict[price[0]] = price[1] 381 | except IndexError: 382 | pass 383 | actor_dict['price_dict'] = price_dict 384 | 385 | avg_watched = tds[6].get_text(strip=True) # 平均观看人数 386 | mode = re.compile(r'\d+') 387 | tmp = mode.findall(avg_watched) 388 | try: 389 | avg_watched = tmp[0] 390 | except IndexError: 391 | pass 392 | actor_dict['avg_watched'] = 
avg_watched 393 | actor_list.append(actor_dict) 394 | 395 | return {'total': total, 'page': page, 'items_count': len(actor_list), 'items': actor_list} 396 | 397 | def spider_actors(self): 398 | page = 1 399 | tbl_actor = WMYXActor() 400 | while True: 401 | ret = self.parse_actor_list_page(page) 402 | for actor in ret['items']: 403 | actor['price_dict'] = json.dumps(actor['price_dict']) 404 | tbl_actor.insert(actor, replace=True) 405 | if ret['items_count'] * ret['page'] < ret['total']: 406 | page += 1 407 | else: 408 | break 409 | 410 | 411 | class BoseModel(Model): 412 | conn = Mysql(host='127.0.0.1', user='root', passwd='123456', db='wanghong', charset='utf8') 413 | 414 | 415 | class WMYXActor(BoseModel): 416 | tbl = "Tbl_WMYX_Actor" 417 | 418 | 419 | class YiXiaActor(BoseModel): 420 | tbl = "Tbl_YiXia_Actor" 421 | 422 | 423 | class YiXiaVideo(BoseModel): 424 | tbl = "Tbl_YiXia_Video" 425 | 426 | class HuaJiaoActor(BoseModel): 427 | tbl = "Tbl_Huajiao_User" 428 | 429 | class Actor(BoseModel): 430 | tbl = "Tbl_Actor" 431 | 432 | 433 | def agg_actors(): 434 | Actor().delete() 435 | 436 | # 一下网 437 | actors = YiXiaActor().select('uid, nickname, follow, followed, praised, avatar, 2 as pid')\ 438 | .order_by('followed desc').limit(500).fetch_all(is_dict=1) 439 | try: 440 | for actor in actors: 441 | Actor().insert(actor) 442 | except Exception as e: 443 | print(e) 444 | 445 | # 花椒网 446 | actors = HuaJiaoActor().select('FUserId as uid, FUserName as nickname, FFollow as follow, FFollowed as followed,\ 447 | FSupported as praised, FAvatar as avatar, 1 as pid').order_by('FFollowed')\ 448 | .limit(500).fetch_all(is_dict=1) 449 | try: 450 | for actor in actors: 451 | Actor().insert(actor) 452 | except Exception as e: 453 | print(e) 454 | 455 | def spider_yixia_videos(): 456 | # yixia_actors = WMYXActor().select('user_id').where('platform=5').order_by('scraped_time desc').fetch_all() 457 | yixia_actors = YiXiaActor().select('uid').order_by('scraped_time 
desc').limit(100).fetch_all() 458 | y = YiXia() 459 | for actor in yixia_actors: 460 | uid = actor[0] 461 | actor = y.parse_user_page(uid) 462 | YiXiaActor().insert(actor, replace=True) 463 | y.spider_videos(actor['suid'], actor['video_count']) 464 | 465 | 466 | def spider_womiyouxuan_actors(): 467 | WoMiYouXuan().spider_actors() 468 | 469 | 470 | def spider_yixia_follows(): 471 | suids = YiXiaActor().select('suid').order_by('scraped_time desc, id desc').limit(20).fetch_all() 472 | if len(suids) <= 0: 473 | suids = [{'ZPWwDeYSvPUb23SL'}] 474 | for suid in suids: 475 | YiXia().spider_follows(suid[0]) 476 | 477 | def main(argv): 478 | useage = "Usage: python3 wanghong.py [spider_womiyouxuan_actors|spider_yixia_videos|spider_yixia_follows|" \ 479 | "womiyouxuan_actors_count|" \ 480 | "yixia_videos_count|yixia_actors_count" \ 481 | "|agg_actors]" 482 | if len(argv) < 2: 483 | print(useage) 484 | exit() 485 | 486 | if argv[1] == 'spider_womiyouxuan_actors': 487 | spider_womiyouxuan_actors() 488 | elif argv[1] == 'spider_yixia_videos': 489 | print(get_current_time() + ':' + 'spider_yixia_videos start') 490 | spider_yixia_videos() 491 | print(get_current_time() + ':' + 'spider_yixia_videos end') 492 | elif argv[1] == 'spider_yixia_follows': 493 | print(get_current_time() + ':' + 'spider_yixia_follows start') 494 | spider_yixia_follows() 495 | print(get_current_time() + ':' + 'spider_yixia_follows end') 496 | elif argv[1] == 'womiyouxuan_actors_count': 497 | count = WMYXActor().select("count(\"id\")").fetch_one() 498 | print(count[0]) 499 | elif argv[1] == 'yixia_videos_count': 500 | count = YiXiaVideo().select("count(\"id\")").fetch_one() 501 | print(count[0]) 502 | elif argv[1] == 'yixia_actors_count': 503 | count = YiXiaActor().select("count(\"id\")").fetch_one() 504 | print(count[0]) 505 | elif argv[1] == 'agg_actors': 506 | agg_actors() 507 | else: 508 | print(useage) 509 | 510 | if __name__ == '__main__': 511 | main(sys.argv) 512 | 513 | 514 | 515 | 516 | 517 | 518 | 
--------------------------------------------------------------------------------