├── .gitignore
├── README.md
├── crawl_weibo.py
├── util
    ├── change_scale.py
    ├── current.py
    └── str_check.py
└── weibo
    ├── crawl.py
    └── read_preset.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | **.txt


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # img-crawler
 2 | 
 3 | ## 版本
 4 | * v1.0.0 19-08-21
 5 | 
 6 | ## 功能
 7 | * **使用img-crawler爬取微博用户的相册图片**
 8 | * 交互式命令行运行方式
 9 | * 可以键入单个微博用户的ID爬取，也可以将多个爬取目标的别名、用户微博ID、爬取选项单独写入一个txt文件，程序可以读取文件对文件中目标进行陆续爬取
10 | * 爬取间隔、请求间歇，防止过于频繁导致IP被封
11 | * 自定义爬取图片的存放地址
12 | * 对时间解析，命名统一格式，采用"xx-xx-xx_64位md码"。方便排序、归类查找。
13 | 
14 | 
15 | ## 步骤
16 | 
17 | * 下载clone项目到本地
18 | * 安装python
19 | * `python crawl_weibo.py` 运行，在命令行中交互
20 | * use preset or not: 是否使用预设(xxx.txt)
21 | * preset_path: 预设的地址，默认为项目根目录的weibo_uid.txt
22 | * weibo_id即用户的微博ID
23 | * object dir：爬取图片存放的根目录
24 | 
25 | 
26 | ## 预设
27 | > 注: <>代表必须的选项，[]为可选的
28 | 
29 | * 文本格式文件
30 | * 每行数据为一个待爬取目标，以回车enter换行
31 | * 每行格式：[#] [nickname] <weibo_id> [options]
32 |     * 前缀“#”代表跳过此行不爬取
33 |     * 中间用空格隔开，tab等也行，只要保证在一行且中间有空白即可
34 |     * nickname意味别名，即你为该用户的爬取图片的文件夹命名的名字，若无该选项，则自动采用用户的微博昵称作为名字
35 |     * weibo_id即用户的微博ID
36 |     * options 待加
37 | * 例：
38 |     ```text
39 |     Me 1234567890 
40 |     9876543210
41 |     # 这行不爬 1112223330
42 |     ```
43 |   
44 | ## 其他
45 | * 项目借鉴 *[johnnyzhang1992/imageSpider](https://github.com/johnnyzhang1992/imageSpider)*
46 | * 敬请使用，反馈改进


--------------------------------------------------------------------------------
/crawl_weibo.py:
--------------------------------------------------------------------------------
  1 | # from selenium import webdriver
  2 | 
  3 | # External import
  4 | 
  5 | import re
  6 | import time
  7 | import requests
  8 | import json
  9 | import os
 10 | 
 11 | # Internal import
 12 | 
 13 | import util.current as current
 14 | from util.change_scale import encode_b64
 15 | from weibo.read_preset import read_preset
 16 | from util.str_check import is_contain_chinese
 17 | 
 18 | # Fixed global variables ------------------
 19 | 
 20 | y_n_regex = '([yY](es)?)|([Nn](o|ot)?)'
 21 | y_regex = '[yY](es)?'
 22 | n_regex = '[Nn](o|ot)?'
 23 | 
 24 | weibo_url = "https://m.weibo.cn/"
 25 | 
 26 | # -------------------------------------------
 27 | 
 28 | 
 29 | # Customizable global variables -------------
 30 | 
 31 | pre_id = '230413'
 32 | post_id = '230283'
 33 | 
 34 | '''
 35 | WEIBO_SECOND_PROFILE_WEIBO 全部
 36 | WEIBO_SECOND_PROFILE_WEIBO_ORI 原创
 37 | WEIBO_SECOND_PROFILE_WEIBO_VIDEO 视频
 38 | WEIBO_SECOND_PROFILE_WEIBO_ARTICAL 文章
 39 | WEIBO_SECOND_PROFILE_WEIBO_WEIBO_SECOND_PROFILE_WEIBO_PIC 文章
 40 | '''
 41 | weibo_type = 'WEIBO_SECOND_PROFILE_WEIBO_ORI'
 42 | 
 43 | request_params = {"ajwvr": "6", "domain": "100505", "domain_op": "100505", "feed_type": "0", "is_all": "1",
 44 |                   "is_tag": "0", "is_search": "0"}
 45 | 
 46 | headers = {
 47 |     'Accept': 'application/json, text/plain, */*',
 48 |     'Accept-Encoding': 'gzip, deflate, br',
 49 |     'Accept-Language': 'zh-CN,zh;q=0.9',
 50 |     'Cache-Control': 'no-cache',
 51 |     'Connection': 'keep-alive',
 52 |     'Content-Type': 'application/x-www-form-urlencoded',
 53 |     'Host': 'm.weibo.cn',
 54 |     'Pragma': 'no-cache',
 55 |     'Referer': "url",
 56 |     # User-Agent 根据自己的电脑修改
 57 |     'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
 58 |     'X-Requested-With': 'XMLHttpRequest'
 59 | }
 60 | 
 61 | # cookie 去网页版获取，具体可以百度
 62 | cookie = 'MLOGIN=0; _T_WM=ec3cbb7caac2b6d765aa1c64e065ee7c; OUTFOX_SEARCH_USER_ID_NCOO=200622491.85643607; WEIBOCN_FROM=1110006030; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2302832101822767%26from%3Dpage_100306%26fid%3D2304132101822767_-_WEIBO_SECOND_PROFILE_PIC%26uicode%3D10000011'
 63 | 
 64 | preset_default_path = "./weibo_uid.txt"
 65 | root = "_pics/"
 66 | 
 67 | # -------------------------------------------
 68 | 
 69 | 
 70 | # Simple global variables -------------------
 71 | 
 72 | cur_page = 1
 73 | n = 1
 74 | page_pics_num = 0
 75 | total_pics_num = 0
 76 | none_sign = False
 77 | page_total = 0
 78 | uid = ""
 79 | name = ""
 80 | 
 81 | 
 82 | # -------------------------------------------
 83 | 
 84 | 
 85 | def generate_url(_uid):
 86 |     return 'https://m.weibo.cn/api/container/getIndex?containerid=' + pre_id + _uid + '_-_' \
 87 |            + weibo_type + '&luicode=10000011&lfid=' + post_id + _uid
 88 | 
 89 | 
 90 | def crawl_imgs_of_one_user(_user):
 91 |     global uid, name
 92 |     uid = _user["uid"]
 93 |     name = "" if (_user["name"] is None) else _user["name"]
 94 |     _url = generate_url(uid)
 95 | 
 96 |     # 总页数
 97 |     global page_total, none_sign
 98 |     page_total = int(get_total_page(_url))
 99 |     none_sign = False
100 | 
101 |     # 遍历每一页
102 |     for i in range(1, page_total):
103 |         if none_sign:
104 |             print('  (无) 当前页数: ' + str(i) + ', 总页数: ' + str(page_total - 1) + ";")
105 |             time.sleep(0.2)
106 |         else:
107 |             headers['Cookie'] = cookie
108 |             print(_url)
109 |             if i > 1:
110 |                 _url = _url + '&page_type=03&page=' + str(i)
111 |             # print(_url)
112 |             response = requests.get(_url, headers=headers)
113 |             # print(response.url)
114 |             html = response.text
115 |             _json = json.loads(html)
116 |             get_cur_page_weibo(_json, i)
117 |             # 休眠1秒
118 |             time.sleep(1)
119 |             if page_total > 10:
120 |                 if i % 10 == 0 and not none_sign:
121 |                     # 每爬10页休眠10秒
122 |                     time.sleep(10)
123 |     print()
124 |     print("用户[%s]: %s 爬取完成" % (uid, name))
125 |     print()
126 | 
127 | 
128 | # 保存图片到本地
129 | def save_image(img_src, date, pid, i):
130 |     # print(img_src)
131 |     _date = time.strptime(date,"%a %b %d %H:%M:%S +0800 %Y")
132 |     _date = f"{_date[0]%100}-{_date[1]}-{_date[2]}"
133 |     _dir = root + str(name)
134 |     if not os.path.exists(_dir):
135 |         os.makedirs(_dir)
136 | 
137 |     # if is_contain_chinese(date):
138 |     #     date = current.get_date()
139 |     # elif date[0:2] == "20":
140 |     #     date = date[2:]
141 |     # else:
142 |     #     if len(date) == 5:
143 |     #         date = str(current.get_year()) + "-" + date
144 |     _name = _dir + '/' + str(_date) + '_' + str(encode_b64(int(i)))[2:] + '_' + str(pid + 1) + '.jpg'
145 | 
146 |     if not os.path.exists(_name):
147 |         r = requests.get(img_src)
148 |         r.raise_for_status()
149 | 
150 |         with open(_name, "wb") as f:
151 |             f.write(r.content)
152 |         print("    %s  爬取完成" % _name)
153 |     else:
154 |         print("    %s  文件已存在" % _name)
155 | 
156 | 
157 | # 获取当前页的数据
158 | def get_cur_page_weibo(_json, i):
159 |     global name
160 |     if i == 1:
161 |         if len(name) == 0:
162 |             name = _json['data']['cards'][0]['mblog']['user']['screen_name']
163 |         print("开始爬取 用户[%s]: %s" % (uid, name))
164 | 
165 |     _cards = _json['data']['cards']
166 |     _cardListInfo = _json['data']['cardlistInfo']
167 |     global cur_page
168 |     # page_total = _cardListInfo['total']  # 你要爬取的微博的页数
169 |     cur_page = i  # 当前微博页数
170 |     global none_sign
171 |     if _cardListInfo['page'] is None:
172 |         none_sign = True
173 | 
174 |     print('  当前页数: ' + str(cur_page) + ', 总页数: ' + str(page_total - 1) + ";")
175 |     # 打印微博
176 |     for card in _cards:
177 |         if card['card_type'] == 9:
178 |             # if card['mblog']['weibo_position'] == 1:
179 |             card['mblog'].setdefault('pics', False)
180 |             if card['mblog']['pics']:
181 |                 for x in range(len(card['mblog']['pics'])):
182 |                     # 保存图片到本地
183 |                     save_image(card['mblog']['pics'][x]['large']['url'], card['mblog']['created_at'], x,
184 |                                card['mblog']['mid'])
185 |                     time.sleep(1)
186 |                     # print(card['mblog'])
187 | 
188 | 
189 | # 获取总页数
190 | def get_total_page(_url):
191 |     _response = requests.get(_url, headers=headers)
192 |     print(_response.url)
193 |     print()
194 |     _html = _response.text
195 |     __json = json.loads(_html)
196 |     return __json['data']['cardlistInfo']['total']  # 你要爬取的微博的页数
197 | 
198 | 
199 | def set_root_path():
200 |     global root
201 |     val = input("3)input your object dir path (default current dir): ").strip()
202 |     if len(val) == 0:
203 |         print("  use current directory...")
204 |     elif not os.path.exists(val):
205 |         print("  path '%s' does not exist, will use current directory..." % val)
206 |     else:
207 |         root = val
208 | 
209 | 
210 | def crawl():
211 |     use_preset = input('1)use preset or not? y|n: ').strip()
212 |     while not re.match(y_n_regex, use_preset):
213 |         print("  wrong input, try again! ")
214 |         use_preset = input('2)use preset or not? y|n: ').strip()
215 | 
216 |     if re.match(y_regex, use_preset):
217 |         preset_path = input('  input preset path (default "%s"): ' % preset_default_path).strip()
218 |         path_len = len(preset_path)
219 |         if not path_len == 0 and os.path.exists(preset_path):
220 |             print("    preset path exists")
221 |         else:
222 |             preset_path = preset_default_path
223 |             if path_len == 0:
224 |                 print('    will use default path: "%s"' %
225 |                       preset_path)
226 |             else:
227 |                 print('    preset path does not exist, will use default path: "%s"' %
228 |                       preset_path)
229 |         users = read_preset(preset_path)
230 |         set_root_path()
231 |         for user in users:
232 |             crawl_imgs_of_one_user(user)
233 |             time.sleep(1)
234 |     elif re.match(n_regex, use_preset):
235 |         _uid = input('2)input weibo id: ')
236 |         set_root_path()
237 |         crawl_imgs_of_one_user({"name": None, "uid": _uid})
238 | 
239 | 
240 | if __name__ == '__main__':
241 |     crawl()
242 | 


--------------------------------------------------------------------------------
/util/change_scale.py:
--------------------------------------------------------------------------------
 1 | def baseN(num, b):
 2 |     return ((num == 0) and "0") or \
 3 |         (baseN(num // b, b).lstrip("0") + "0123456789abcdefghijklmnopqrstuvwxyz"[num % b])
 4 | def encode_b64(n):
 5 |     table = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#$'
 6 |     result = []
 7 |     temp = n
 8 |     if 0 == temp:
 9 |       result.append('0')
10 |     else:
11 |       while 0 < temp:
12 |         result.append(table[temp % 64])
13 |         temp //= 64
14 |     return ''.join([x for x in reversed(result)])


--------------------------------------------------------------------------------
/util/current.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | 
 4 | def get_year():
 5 |     return datetime.now().year % 100
 6 | 
 7 | 
 8 | def get_month():
 9 |     return datetime.now().month
10 | 
11 | 
12 | def get_day():
13 |     return datetime.now().day
14 | 
15 | 
16 | def get_date():
17 |     return datetime.now().strftime('%Y-%m-%d')[2:]
18 | 


--------------------------------------------------------------------------------
/util/str_check.py:
--------------------------------------------------------------------------------
 1 | def is_contain_chinese(check_str):
 2 |     """
 3 |     判断字符串中是否包含中文
 4 |     :param check_str: {str} 需要检测的字符串
 5 |     :return: {bool} 包含返回True， 不包含返回False
 6 |     """
 7 |     for ch in check_str:
 8 |         if u'\u4e00' <= ch <= u'\u9fff':
 9 |             return True
10 |     return False
11 | 


--------------------------------------------------------------------------------
/weibo/crawl.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/oGsLP/img-crawler/5672108a7340bbb5b00d02b672b26436c1402753/weibo/crawl.py


--------------------------------------------------------------------------------
/weibo/read_preset.py:
--------------------------------------------------------------------------------
 1 | def read_preset(path):
 2 |     users = []
 3 |     with open(path) as fr:
 4 |         for line in fr:
 5 |             if line[0] == "#":
 6 |                 continue
 7 |             else:
 8 |                 args = line.strip().split(" ")
 9 |                 if len(args) == 2:
10 |                     users.append({'name': args[0], 'uid': args[1]})
11 |                 elif len(args) == 1 and len(line.strip()) == 10:
12 |                     users.append({'name': None, 'uid': line.strip()})
13 |                 else:
14 |                     print("Wrong arguments: %s"
15 |                           % line)
16 |     return users
17 | 


--------------------------------------------------------------------------------