├── .gitignore ├── README.md ├── crawl_weibo.py ├── util ├── change_scale.py ├── current.py └── str_check.py └── weibo ├── crawl.py └── read_preset.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | **.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # img-crawler 2 | 3 | ## 版本 4 | * v1.0.0 19-08-21 5 | 6 | ## 功能 7 | * **使用img-crawler爬取微博用户的相册图片** 8 | * 交互式命令行运行方式 9 | * 可以键入单个微博用户的ID爬取,也可以将多个爬取目标的别名、用户微博ID、爬取选项单独写入一个txt文件,程序可以读取文件对文件中目标进行陆续爬取 10 | * 爬取间隔、请求间歇,防止过于频繁导致IP被封 11 | * 自定义爬取图片的存放地址 12 | * 对时间解析,命名统一格式,采用"xx-xx-xx_64位md码"。方便排序、归类查找。 13 | 14 | 15 | ## 步骤 16 | 17 | * 下载clone项目到本地 18 | * 安装python 19 | * `python crawl_weibo.py` 运行,在命令行中交互 20 | * use preset or not: 是否使用预设(xxx.txt) 21 | * preset_path: 预设的地址,默认为项目根目录的weibo_uid.txt 22 | * weibo_id即用户的微博ID 23 | * object dir:爬取图片存放的根目录 24 | 25 | 26 | ## 预设 27 | > 注: <>代表必须的选项,[]为可选的 28 | 29 | * 文本格式文件 30 | * 每行数据为一个待爬取目标,以回车enter换行 31 | * 每行格式:[#] [nickname] [options] 32 | * 前缀“#”代表跳过此行不爬取 33 | * 中间用空格隔开,tab等也行,只要保证在一行且中间有空白即可 34 | * nickname意味别名,即你为该用户的爬取图片的文件夹命名的名字,若无该选项,则自动采用用户的微博昵称作为名字 35 | * weibo_id即用户的微博ID 36 | * options 待加 37 | * 例: 38 | ```text 39 | Me 1234567890 40 | 9876543210 41 | # 这行不爬 1112223330 42 | ``` 43 | 44 | ## 其他 45 | * 项目借鉴 *[johnnyzhang1992/imageSpider](https://github.com/johnnyzhang1992/imageSpider)* 46 | * 敬请使用,反馈改进 -------------------------------------------------------------------------------- /crawl_weibo.py: -------------------------------------------------------------------------------- 1 | # from selenium import webdriver 2 | 3 | # External import 4 | 5 | import re 6 | import time 7 | import requests 8 | import json 9 | import os 10 | 11 | # Internal import 12 | 13 | import util.current as current 14 | from util.change_scale import encode_b64 15 | from weibo.read_preset import read_preset 16 | 
from util.str_check import is_contain_chinese

# Fixed global variables ------------------

# Regexes validating interactive yes/no answers.  Used with re.match, so only
# the beginning of the user's input has to match (e.g. "yes", "Y", "no", "Not").
y_n_regex = '([yY](es)?)|([Nn](o|ot)?)'
y_regex = '[yY](es)?'
n_regex = '[Nn](o|ot)?'

weibo_url = "https://m.weibo.cn/"

# -------------------------------------------


# Customizable global variables -------------

# Containerid prefixes glued around the user id when building API URLs.
# NOTE(review): these appear to be m.weibo.cn profile containerid prefixes
# (2304xx / 2302xx families) -- confirm against the mobile API.
pre_id = '230413'
post_id = '230283'

'''
Candidate values for weibo_type (the feed filter in the containerid):
WEIBO_SECOND_PROFILE_WEIBO          all posts
WEIBO_SECOND_PROFILE_WEIBO_ORI      original posts only
WEIBO_SECOND_PROFILE_WEIBO_VIDEO    video posts
WEIBO_SECOND_PROFILE_WEIBO_ARTICAL  articles
WEIBO_SECOND_PROFILE_WEIBO_WEIBO_SECOND_PROFILE_WEIBO_PIC  listed as "articles"
    in the original; the name looks garbled -- presumably meant
    WEIBO_SECOND_PROFILE_WEIBO_PIC (pictures).  TODO confirm.
'''
weibo_type = 'WEIBO_SECOND_PROFILE_WEIBO_ORI'

# NOTE(review): request_params is never referenced by the visible code --
# likely a leftover from the desktop (weibo.com) API.
request_params = {"ajwvr": "6", "domain": "100505", "domain_op": "100505", "feed_type": "0", "is_all": "1",
                  "is_tag": "0", "is_search": "0"}

# Request headers mimicking the mobile web client.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'm.weibo.cn',
    'Pragma': 'no-cache',
    'Referer': "url",
    # User-Agent: adjust to match your own machine/browser.
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    'X-Requested-With': 'XMLHttpRequest'
}

# Cookie: obtain from the logged-in web version of Weibo (search online for
# how); injected into headers before each page request.
cookie = 'MLOGIN=0; _T_WM=ec3cbb7caac2b6d765aa1c64e065ee7c; OUTFOX_SEARCH_USER_ID_NCOO=200622491.85643607; WEIBOCN_FROM=1110006030; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2302832101822767%26from%3Dpage_100306%26fid%3D2304132101822767_-_WEIBO_SECOND_PROFILE_PIC%26uicode%3D10000011'

# Default preset file and default download root directory.
preset_default_path = "./weibo_uid.txt"
root = "_pics/"

# -------------------------------------------


# Simple global variables -------------------

cur_page = 1 73 | n = 1 74 | page_pics_num = 0 75 | total_pics_num = 0 76 | none_sign = False 77 | page_total = 0 78 | uid = "" 79 | name = "" 80 | 81 | 82 | # ------------------------------------------- 83 | 84 | 85 | def generate_url(_uid): 86 | return 'https://m.weibo.cn/api/container/getIndex?containerid=' + pre_id + _uid + '_-_' \ 87 | + weibo_type + '&luicode=10000011&lfid=' + post_id + _uid 88 | 89 | 90 | def crawl_imgs_of_one_user(_user): 91 | global uid, name 92 | uid = _user["uid"] 93 | name = "" if (_user["name"] is None) else _user["name"] 94 | _url = generate_url(uid) 95 | 96 | # 总页数 97 | global page_total, none_sign 98 | page_total = int(get_total_page(_url)) 99 | none_sign = False 100 | 101 | # 遍历每一页 102 | for i in range(1, page_total): 103 | if none_sign: 104 | print(' (无) 当前页数: ' + str(i) + ', 总页数: ' + str(page_total - 1) + ";") 105 | time.sleep(0.2) 106 | else: 107 | headers['Cookie'] = cookie 108 | print(_url) 109 | if i > 1: 110 | _url = _url + '&page_type=03&page=' + str(i) 111 | # print(_url) 112 | response = requests.get(_url, headers=headers) 113 | # print(response.url) 114 | html = response.text 115 | _json = json.loads(html) 116 | get_cur_page_weibo(_json, i) 117 | # 休眠1秒 118 | time.sleep(1) 119 | if page_total > 10: 120 | if i % 10 == 0 and not none_sign: 121 | # 每爬10页休眠10秒 122 | time.sleep(10) 123 | print() 124 | print("用户[%s]: %s 爬取完成" % (uid, name)) 125 | print() 126 | 127 | 128 | # 保存图片到本地 129 | def save_image(img_src, date, pid, i): 130 | # print(img_src) 131 | _date = time.strptime(date,"%a %b %d %H:%M:%S +0800 %Y") 132 | _date = f"{_date[0]%100}-{_date[1]}-{_date[2]}" 133 | _dir = root + str(name) 134 | if not os.path.exists(_dir): 135 | os.makedirs(_dir) 136 | 137 | # if is_contain_chinese(date): 138 | # date = current.get_date() 139 | # elif date[0:2] == "20": 140 | # date = date[2:] 141 | # else: 142 | # if len(date) == 5: 143 | # date = str(current.get_year()) + "-" + date 144 | _name = _dir + '/' + str(_date) + '_' + 
str(encode_b64(int(i)))[2:] + '_' + str(pid + 1) + '.jpg'

    if not os.path.exists(_name):
        r = requests.get(img_src)
        r.raise_for_status()

        with open(_name, "wb") as f:
            f.write(r.content)
            print(" %s 爬取完成" % _name)
    else:
        # Skip files already downloaded on a previous run.
        print(" %s 文件已存在" % _name)


# Parse one page of the feed JSON and download every picture on it.
def get_cur_page_weibo(_json, i):
    """Process page i of the feed: resolve the user name on the first page,
    detect the end of pagination, and save all pictures of every post card."""
    global name
    if i == 1:
        # First page: fall back to the user's screen name when no nickname
        # was supplied in the preset.
        if len(name) == 0:
            name = _json['data']['cards'][0]['mblog']['user']['screen_name']
        print("开始爬取 用户[%s]: %s" % (uid, name))

    _cards = _json['data']['cards']
    _cardListInfo = _json['data']['cardlistInfo']
    global cur_page
    # page_total = _cardListInfo['total']  # total number of feed pages
    cur_page = i  # current page number
    global none_sign
    # A null 'page' field marks the end of the paginated feed.
    if _cardListInfo['page'] is None:
        none_sign = True

    print(' 当前页数: ' + str(cur_page) + ', 总页数: ' + str(page_total - 1) + ";")
    # card_type 9 is a regular weibo post card.
    for card in _cards:
        if card['card_type'] == 9:
            # if card['mblog']['weibo_position'] == 1:
            card['mblog'].setdefault('pics', False)
            if card['mblog']['pics']:
                for x in range(len(card['mblog']['pics'])):
                    # Save picture x of this post.  Note save_image's 'pid'
                    # receives the picture index and its 'i' receives the
                    # post's mid.
                    save_image(card['mblog']['pics'][x]['large']['url'], card['mblog']['created_at'], x,
                               card['mblog']['mid'])
                    time.sleep(1)
            # print(card['mblog'])


# Fetch the feed once and read the total page count from the response.
def get_total_page(_url):
    _response = requests.get(_url, headers=headers)
    print(_response.url)
    print()
    _html = _response.text
    __json = json.loads(_html)
    return __json['data']['cardlistInfo']['total']  # total number of feed pages


# Ask the user for the download root directory (empty / bad path keeps the default).
def set_root_path():
    global root
    val = input("3)input your object dir path (default current dir): ").strip()
    if len(val) == 0:
        print(" use current directory...")
    elif not os.path.exists(val):
        print(" path '%s' does not exist, will use current directory..."
% val) 206 | else: 207 | root = val 208 | 209 | 210 | def crawl(): 211 | use_preset = input('1)use preset or not? y|n: ').strip() 212 | while not re.match(y_n_regex, use_preset): 213 | print(" wrong input, try again! ") 214 | use_preset = input('2)use preset or not? y|n: ').strip() 215 | 216 | if re.match(y_regex, use_preset): 217 | preset_path = input(' input preset path (default "%s"): ' % preset_default_path).strip() 218 | path_len = len(preset_path) 219 | if not path_len == 0 and os.path.exists(preset_path): 220 | print(" preset path exists") 221 | else: 222 | preset_path = preset_default_path 223 | if path_len == 0: 224 | print(' will use default path: "%s"' % 225 | preset_path) 226 | else: 227 | print(' preset path does not exist, will use default path: "%s"' % 228 | preset_path) 229 | users = read_preset(preset_path) 230 | set_root_path() 231 | for user in users: 232 | crawl_imgs_of_one_user(user) 233 | time.sleep(1) 234 | elif re.match(n_regex, use_preset): 235 | _uid = input('2)input weibo id: ') 236 | set_root_path() 237 | crawl_imgs_of_one_user({"name": None, "uid": _uid}) 238 | 239 | 240 | if __name__ == '__main__': 241 | crawl() 242 | -------------------------------------------------------------------------------- /util/change_scale.py: -------------------------------------------------------------------------------- 1 | def baseN(num, b): 2 | return ((num == 0) and "0") or \ 3 | (baseN(num // b, b).lstrip("0") + "0123456789abcdefghijklmnopqrstuvwxyz"[num % b]) 4 | def encode_b64(n): 5 | table = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#$' 6 | result = [] 7 | temp = n 8 | if 0 == temp: 9 | result.append('0') 10 | else: 11 | while 0 < temp: 12 | result.append(table[temp % 64]) 13 | temp //= 64 14 | return ''.join([x for x in reversed(result)]) -------------------------------------------------------------------------------- /util/current.py: -------------------------------------------------------------------------------- 1 | from 
from datetime import datetime


def get_year():
    """Return the current year as two digits (e.g. 25 for 2025)."""
    return datetime.now().year % 100


def get_month():
    """Return the current month (1-12)."""
    return datetime.now().month


def get_day():
    """Return the current day of month (1-31)."""
    return datetime.now().day


def get_date():
    """Return today's date as 'YY-MM-DD' (zero padded)."""
    return datetime.now().strftime('%Y-%m-%d')[2:]


# ------------------------------------------------------------------------------
# /util/str_check.py:
# ------------------------------------------------------------------------------
def is_contain_chinese(check_str):
    """Return True if check_str contains at least one CJK unified ideograph.

    :param check_str: {str} string to inspect
    :return: {bool} True when a Chinese character is found, else False
    """
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False


# ------------------------------------------------------------------------------
# /weibo/crawl.py: (not vendored in this dump; see upstream)
# https://raw.githubusercontent.com/oGsLP/img-crawler/5672108a7340bbb5b00d02b672b26436c1402753/weibo/crawl.py
# ------------------------------------------------------------------------------
# /weibo/read_preset.py:
# ------------------------------------------------------------------------------
def read_preset(path):
    """Read a crawl preset file and return a list of {'name', 'uid'} dicts.

    Each non-blank line is one target: '[nickname] <uid>'.  Lines whose first
    non-blank character is '#' are skipped.  Fields may be separated by any
    whitespace run (spaces or tabs), as the README promises.

    :param path: path to the preset text file
    :return: list of dicts; 'name' is None when no nickname was given
    """
    users = []
    # utf-8 explicitly: presets may contain non-ASCII nicknames.
    with open(path, encoding="utf-8") as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                # Blank line or commented-out target (the original printed
                # "Wrong arguments" for blank lines; now skipped silently).
                continue
            # Fix: split on any whitespace run -- the original split(" ")
            # broke on tabs and on multiple consecutive spaces.
            args = stripped.split()
            if len(args) == 2:
                users.append({'name': args[0], 'uid': args[1]})
            elif len(args) == 1 and len(args[0]) == 10:
                # Bare 10-digit weibo uid with no nickname.
                users.append({'name': None, 'uid': args[0]})
            else:
                print("Wrong arguments: %s"
                      % line)
    return users