├── download └── 下载的文件在这里.txt ├── .gitignore ├── share-url.txt ├── v0200f230000bkmk0rimac2mte5mhai0.mp4 ├── requirements.txt ├── share-url.bat ├── fuck-byted-acrawler.js ├── amemv-video-ripper.py └── README.md /download/下载的文件在这里.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .DS_Store -------------------------------------------------------------------------------- /share-url.txt: -------------------------------------------------------------------------------- 1 | http://v.douyin.com/fKq7su/ -------------------------------------------------------------------------------- /v0200f230000bkmk0rimac2mte5mhai0.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yalarc/GetDouYinApplication/HEAD/v0200f230000bkmk0rimac2mte5mhai0.mp4 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2019.3.9 2 | chardet==3.0.4 3 | idna==2.8 4 | PySocks==1.6.8 5 | requests==2.21.0 6 | six==1.12.0 7 | urllib3==1.24.3 8 | -------------------------------------------------------------------------------- /share-url.bat: -------------------------------------------------------------------------------- 1 | http://v.douyin.com/64kGps/ 2 | http://v.douyin.com/6417bb/ 3 | http://v.douyin.com/64kGps/ 4 | http://v.douyin.com/64eXMG/ 5 | http://v.douyin.com/6Q4CYU/ 6 | http://v.douyin.com/6cry3g/ 7 | http://v.douyin.com/6cq8tX/ 8 | http://v.douyin.com/6whRKJ/ 9 | http://v.douyin.com/6cBdKF/ 10 | http://v.douyin.com/6cY3oK/ 11 | http://v.douyin.com/6cyX5A/ 12 | http://v.douyin.com/6cjU2b/ 13 | http://v.douyin.com/6crJtr/ 14 | http://v.douyin.com/6w74Gs/ 15 | 
http://v.douyin.com/6wKvcs/ 16 | http://v.douyin.com/M125mw/ 17 | http://v.douyin.com/M15wUe/ 18 | http://v.douyin.com/M125ff/ 19 | http://v.douyin.com/haeoGF/ 20 | http://v.douyin.com/h5T63S/ 21 | http://v.douyin.com/haL6Vc/ 22 | http://v.douyin.com/Pp5VM9/ 23 | http://v.douyin.com/PpsgV2/ 24 | http://v.douyin.com/PpuLaR/ 25 | http://v.douyin.com/Pp7vJV/ 26 | http://v.douyin.com/PpaFb2/ 27 | http://v.douyin.com/PtNSES/ 28 | http://v.douyin.com/PtR5NX/ 29 | http://v.douyin.com/PGKHHU/ 30 | http://v.douyin.com/Pt13dX/ 31 | http://v.douyin.com/PGoGHo/ 32 | http://v.douyin.com/PtF5o5/ 33 | http://v.douyin.com/PG7oUa/ 34 | http://v.douyin.com/PGpKu8/ 35 | http://v.douyin.com/PGG7oN/ 36 | http://v.douyin.com/PtRtcW/ 37 | http://v.douyin.com/PGtVDF/ 38 | http://v.douyin.com/PtLmGs/ 39 | http://v.douyin.com/PG3otm/ 40 | http://v.douyin.com/PG36Yw/ 41 | http://v.douyin.com/PGwPpc/ 42 | http://v.douyin.com/PGGExs/ 43 | http://v.douyin.com/PGEMb4/ 44 | http://v.douyin.com/PtFk27/ 45 | http://v.douyin.com/PGK1PY/ 46 | http://v.douyin.com/PGK3VK/ 47 | http://v.douyin.com/PGvufg/ 48 | http://v.douyin.com/PGohgw/ 49 | http://v.douyin.com/PtdhnR/ 50 | -------------------------------------------------------------------------------- /fuck-byted-acrawler.js: -------------------------------------------------------------------------------- 1 | function generateSignature(userId) { 2 | this.navigator = { 3 | userAgent: "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1" 4 | } 5 | var e = {} 6 | 7 | var r = (function () { 8 | function e(e, a, r) { 9 | return (b[e] || (b[e] = t("x,y", "return x " + e + " y")))(r, a) 10 | } 11 | 12 | function a(e, a, r) { 13 | return (k[r] || (k[r] = t("x,y", "return new x[y](" + Array(r + 1).join(",x[++y]").substr(1) + ")")))(e, a) 14 | } 15 | 16 | function r(e, a, r) { 17 | var n, t, s = {}, b = s.d = r ? 
r.d + 1 : 0; 18 | for (s["$" + b] = s, t = 0; t < b; t++) s[n = "$" + t] = r[n]; 19 | for (t = 0, b = s.length = a.length; t < b; t++) s[t] = a[t]; 20 | return c(e, 0, s) 21 | } 22 | 23 | function c(t, b, k) { 24 | function u(e) { 25 | v[x++] = e 26 | } 27 | 28 | function f() { 29 | return g = t.charCodeAt(b++) - 32, t.substring(b, b += g) 30 | } 31 | 32 | function l() { 33 | try { 34 | y = c(t, b, k) 35 | } catch (e) { 36 | h = e, y = l 37 | } 38 | } 39 | 40 | for (var h, y, d, g, v = [], x = 0; ;) switch (g = t.charCodeAt(b++) - 32) { 41 | case 1: 42 | u(!v[--x]); 43 | break; 44 | case 4: 45 | v[x++] = f(); 46 | break; 47 | case 5: 48 | u(function (e) { 49 | var a = 0, r = e.length; 50 | return function () { 51 | var c = a < r; 52 | return c && u(e[a++]), c 53 | } 54 | }(v[--x])); 55 | break; 56 | case 6: 57 | y = v[--x], u(v[--x](y)); 58 | break; 59 | case 8: 60 | if (g = t.charCodeAt(b++) - 32, l(), b += g, g = t.charCodeAt(b++) - 32, y === c) b += g; else if (y !== l) return y; 61 | break; 62 | case 9: 63 | v[x++] = c; 64 | break; 65 | case 10: 66 | u(s(v[--x])); 67 | break; 68 | case 11: 69 | y = v[--x], u(v[--x] + y); 70 | break; 71 | case 12: 72 | for (y = f(), d = [], g = 0; g < y.length; g++) d[g] = y.charCodeAt(g) ^ g + y.length; 73 | u(String.fromCharCode.apply(null, d)); 74 | break; 75 | case 13: 76 | y = v[--x], h = delete v[--x][y]; 77 | break; 78 | case 14: 79 | v[x++] = t.charCodeAt(b++) - 32; 80 | break; 81 | case 59: 82 | u((g = t.charCodeAt(b++) - 32) ? (y = x, v.slice(x -= g, y)) : []); 83 | break; 84 | case 61: 85 | u(v[--x][t.charCodeAt(b++) - 32]); 86 | break; 87 | case 62: 88 | g = v[--x], k[0] = 65599 * k[0] + k[1].charCodeAt(g) >>> 0; 89 | break; 90 | case 65: 91 | h = v[--x], y = v[--x], v[--x][y] = h; 92 | break; 93 | case 66: 94 | u(e(t[b++], v[--x], v[--x])); 95 | break; 96 | case 67: 97 | y = v[--x], d = v[--x], u((g = v[--x]).x === c ? r(g.y, y, k) : g.apply(d, y)); 98 | break; 99 | case 68: 100 | u(e((g = t[b++]) < "<" ? 
(b--, f()) : g + g, v[--x], v[--x])); 101 | break; 102 | case 70: 103 | u(!1); 104 | break; 105 | case 71: 106 | v[x++] = n; 107 | break; 108 | case 72: 109 | v[x++] = +f(); 110 | break; 111 | case 73: 112 | u(parseInt(f(), 36)); 113 | break; 114 | case 75: 115 | if (v[--x]) { 116 | b++; 117 | break 118 | } 119 | case 74: 120 | g = t.charCodeAt(b++) - 32 << 16 >> 16, b += g; 121 | break; 122 | case 76: 123 | u(k[t.charCodeAt(b++) - 32]); 124 | break; 125 | case 77: 126 | y = v[--x], u(v[--x][y]); 127 | break; 128 | case 78: 129 | g = t.charCodeAt(b++) - 32, u(a(v, x -= g + 1, g)); 130 | break; 131 | case 79: 132 | g = t.charCodeAt(b++) - 32, u(k["$" + g]); 133 | break; 134 | case 81: 135 | h = v[--x], v[--x][f()] = h; 136 | break; 137 | case 82: 138 | u(v[--x][f()]); 139 | break; 140 | case 83: 141 | h = v[--x], k[t.charCodeAt(b++) - 32] = h; 142 | break; 143 | case 84: 144 | v[x++] = !0; 145 | break; 146 | case 85: 147 | v[x++] = void 0; 148 | break; 149 | case 86: 150 | u(v[x - 1]); 151 | break; 152 | case 88: 153 | h = v[--x], y = v[--x], v[x++] = h, v[x++] = y; 154 | break; 155 | case 89: 156 | u(function () { 157 | function e() { 158 | return r(e.y, arguments, k) 159 | } 160 | 161 | return e.y = f(), e.x = c, e 162 | }()); 163 | break; 164 | case 90: 165 | v[x++] = null; 166 | break; 167 | case 91: 168 | v[x++] = h; 169 | break; 170 | case 93: 171 | h = v[--x]; 172 | break; 173 | case 0: 174 | return v[--x]; 175 | default: 176 | u((g << 16 >> 16) - 16) 177 | } 178 | } 179 | 180 | var n = this, t = n.Function, s = Object.keys || function (e) { 181 | var a = {}, r = 0; 182 | for (var c in e) a[r++] = c; 183 | return a.length = r, a 184 | }, b = {}, k = {}; 185 | return r 186 | })() 187 | 188 | ('gr$Daten Иb/s!l y͒yĹg,(lfi~ah`{mv,-n|jqewVxp{rvmmx,&effkx[!cs"l".Pq%widthl"@q&heightl"vr*getContextx$"2d[!cs#l#,*;?|u.|uc{uq$fontl#vr(fillTextx$$龘ฑภ경2<[#c}l#2q*shadowBlurl#1q-shadowOffsetXl#$$limeq+shadowColorl#vr#arcx88802[%c}l#vr&strokex[ c}l"v,)}eOmyoZB]mx[ 
cs!0s$l$Pb>>s!0s%yA0s"l"l!r&lengthb&l!l Bd>&+l!l &+l!l 6d>&+l!l &+ s,y=o!o!]/q"13o!l q"10o!],l 2d>& s.{s-yMo!o!]0q"13o!]*Ld>>b|s!o!l q"10o!],l!& s/yIo!o!].q"13o!],o!]*Jd>>b|&o!]+l &+ s0l-l!&l-l!i\'1z141z4b/@d 0: 53 | print(file_name + " 已经爬取过了,文件保存在 " + file_path + " 放弃爬取") 54 | return 55 | 56 | # print("Downloading %s from %s.\n" % (file_name, medium_url)) 57 | # VIDEOID_DICT[VIDEO_ID] = 1 # 记录已经下载的视频 58 | retry_times = 0 59 | while retry_times < RETRY: 60 | try: 61 | res = requests.get(medium_url, headers=headers, stream=True, timeout=TIMEOUT, allow_redirects=False,verify=False) 62 | # if resp.status_code == 403: 63 | # retry_times = RETRY 64 | # print("Access Denied when retrieve %s.\n" % medium_url) 65 | # raise Exception("Access Denied") 66 | # if resp.status_code == 200: 67 | 68 | resp_url = res.headers['Location'] 69 | print("Downloading_LONG %s from %s\n" % (file_name, resp_url)) 70 | resp = requests.get(resp_url, stream=True, timeout=TIMEOUT,verify=False) 71 | with open(file_path, 'wb') as fh: 72 | for chunk in resp.iter_content(chunk_size=1024, decode_unicode=True): 73 | fh.write(chunk) 74 | break 75 | except: 76 | pass 77 | retry_times += 1 78 | else: 79 | try: 80 | os.remove(file_path) 81 | except OSError: 82 | pass 83 | print("Failed to retrieve %s from %s.\n" % (uri, medium_url)) 84 | time.sleep(1) 85 | 86 | 87 | def get_real_address(url): 88 | requests.packages.urllib3.disable_warnings() 89 | if url.find('v.douyin.com') < 0: 90 | return url 91 | res = requests.get(url, headers=HEADERS, allow_redirects=False,verify=False) 92 | return res.headers['Location'] if res.status_code == 302 else None 93 | 94 | 95 | def get_dytk(url): 96 | requests.packages.urllib3.disable_warnings() 97 | res = requests.get(url, headers=HEADERS,verify=False) 98 | if not res: 99 | return None 100 | dytk = re.findall("dytk: '(.*)'", res.content.decode('utf-8')) 101 | if len(dytk): 102 | return dytk[0] 103 | return None 104 | 105 | 106 | class DownloadWorker(Thread): 107 | def 
__init__(self, queue): 108 | Thread.__init__(self) 109 | self.queue = queue 110 | 111 | def run(self): 112 | while True: 113 | medium_type, uri, download_url, target_folder = self.queue.get() 114 | download(medium_type, uri, download_url, target_folder) 115 | self.queue.task_done() 116 | 117 | 118 | class CrawlerScheduler(object): 119 | 120 | def __init__(self, items): 121 | self.numbers = [] 122 | self.challenges = [] 123 | self.musics = [] 124 | for i in range(len(items)): 125 | url = get_real_address(items[i]) 126 | if not url: 127 | continue 128 | if re.search('share/user', url): 129 | self.numbers.append(url) 130 | if re.search('share/challenge', url): 131 | self.challenges.append(url) 132 | if re.search('share/music', url): 133 | self.musics.append(url) 134 | 135 | self.queue = Queue.Queue() 136 | self.scheduling() 137 | 138 | @staticmethod 139 | def generateSignature(value): 140 | p = os.popen('node fuck-byted-acrawler.js %s' % value) 141 | return p.readlines()[0] 142 | 143 | @staticmethod 144 | def calculateFileMd5(filename): 145 | hmd5 = hashlib.md5() 146 | fp = open(filename, "rb") 147 | hmd5.update(fp.read()) 148 | return hmd5.hexdigest() 149 | 150 | def scheduling(self): 151 | for x in range(THREADS): 152 | worker = DownloadWorker(self.queue) 153 | worker.daemon = True 154 | worker.start() 155 | 156 | for url in self.numbers: 157 | self.download_user_videos(url) 158 | for url in self.challenges: 159 | self.download_challenge_videos(url) 160 | for url in self.musics: 161 | self.download_music_videos(url) 162 | 163 | def download_user_videos(self, url): 164 | number = re.findall(r'share/user/(\d+)', url) 165 | if not len(number): 166 | return 167 | dytk = get_dytk(url) 168 | hostname = urllib.parse.urlparse(url).hostname 169 | if hostname != 't.tiktok.com' and not dytk: 170 | return 171 | user_id = number[0] 172 | params = parse.parse_qs(parse.urlparse(url).query) 173 | sec_uid = params['sec_uid'][0] 174 | video_count = self._download_user_media(sec_uid, 
dytk, url) 175 | self.queue.join() 176 | print("\nAweme number %s, video number %s\n\n" % 177 | (user_id, str(video_count))) 178 | print("\nFinish Downloading All the videos from %s\n\n" % user_id) 179 | 180 | def download_challenge_videos(self, url): 181 | challenge = re.findall('share/challenge/(\d+)', url) 182 | if not len(challenge): 183 | return 184 | challenges_id = challenge[0] 185 | video_count = self._download_challenge_media(challenges_id, url) 186 | self.queue.join() 187 | print("\nAweme challenge #%s, video number %d\n\n" % 188 | (challenges_id, video_count)) 189 | print("\nFinish Downloading All the videos from #%s\n\n" % challenges_id) 190 | 191 | def download_music_videos(self, url): 192 | music = re.findall('share/music/(\d+)', url) 193 | if not len(music): 194 | return 195 | musics_id = music[0] 196 | video_count = self._download_music_media(musics_id, url) 197 | self.queue.join() 198 | print("\nAweme music @%s, video number %d\n\n" % 199 | (musics_id, video_count)) 200 | print("\nFinish Downloading All the videos from @%s\n\n" % musics_id) 201 | 202 | def _join_download_queue(self, aweme, target_folder): 203 | try: 204 | if aweme.get('video', None): 205 | uri = aweme['video']['play_addr']['uri'] 206 | download_url = "https://aweme.snssdk.com/aweme/v1/play/?{0}" 207 | download_params = { 208 | 'video_id': uri, 209 | 'line': '0', 210 | 'ratio': '720p', 211 | 'media_type': '4', 212 | 'vr_type': '0', 213 | 'test_cdn': 'None', 214 | 'improve_bitrate': '0', 215 | 'iid': '35628056608', 216 | 'device_id': '46166618998', 217 | 'os_api': '18', 218 | 'app_name': 'aweme', 219 | 'channel': 'App%20Store', 220 | 'idfa': '00000000-0000-0000-0000-000000000000', 221 | 'device_platform': 'iphone', 222 | 'build_number': '27014', 223 | 'vid': '2ED380A7-F09C-6C9E-90F5-862D58F3129C', 224 | 'openudid': '21dae85eeac1da35a69e2a0ffeaeef61c78a2e98', 225 | 'device_type': 'iPhone8%2C2', 226 | 'app_version': '2.7.0', 227 | 'version_code': '2.7.0', 228 | 'os_version': '12.0', 
229 | 'screen_width': '1242', 230 | 'aid': '1128', 231 | 'ac': 'WIFI', 232 | 'is_play_url': '1' 233 | 234 | } 235 | if aweme.get('hostname') == 't.tiktok.com': 236 | download_url = 'http://api.tiktokv.com/aweme/v1/play/?{0}' 237 | download_params = { 238 | 'video_id': uri, 239 | 'line': '0', 240 | 'ratio': '720p', 241 | 'media_type': '4', 242 | 'vr_type': '0', 243 | 'test_cdn': 'None', 244 | 'improve_bitrate': '0', 245 | 'version_code': '1.7.2', 246 | 'language': 'en', 247 | 'app_name': 'trill', 248 | 'vid': 'D7B3981F-DD46-45A1-A97E-428B90096C3E', 249 | 'app_version': '1.7.2', 250 | 'device_id': '6619780206485964289', 251 | 'channel': 'App Store', 252 | 'mcc_mnc': '', 253 | 'tz_offset': '28800' 254 | } 255 | share_info = aweme.get('share_info', {}) 256 | url = download_url.format( 257 | '&'.join([key + '=' + download_params[key] for key in download_params])) 258 | self.queue.put(('video', 259 | uri, 260 | url, target_folder)) 261 | else: 262 | if aweme.get('image_infos', None): 263 | image = aweme['image_infos']['label_large'] 264 | self.queue.put( 265 | ('image', image['uri'], image['url_list'][0], target_folder)) 266 | 267 | except KeyError: 268 | return 269 | except UnicodeDecodeError: 270 | print("Cannot decode response data from DESC %s" % aweme['desc']) 271 | return 272 | 273 | # def __download_favorite_media(self, user_id, dytk, hostname, signature, favorite_folder, video_count): 274 | # if not os.path.exists(favorite_folder): 275 | # os.makedirs(favorite_folder) 276 | # # favorite_video_url = "https://%s/aweme/8v1/aweme/favorite/" % hostname 277 | # favorite_video_url = "https://%s/web/api/v2/aweme/like/" % hostname 278 | # favorite_video_params = { 279 | # 'user_id': str(user_id), 280 | # 'count': '21', 281 | # 'max_cursor': '0', 282 | # 'aid': '1128', 283 | # '_signature': signature, 284 | # 'dytk': dytk 285 | # } 286 | # max_cursor = None 287 | # while True: 288 | # if max_cursor: 289 | # favorite_video_params['max_cursor'] = str(max_cursor) 290 | # res 
= requests.get(favorite_video_url, 291 | # headers=HEADERS, params=favorite_video_params) 292 | # contentJson = json.loads(res.content.decode('utf-8')) 293 | # favorite_list = contentJson.get('aweme_list', []) 294 | # for aweme in favorite_list: 295 | # video_count += 1 296 | # aweme['hostname'] = hostname 297 | # self._join_download_queue(aweme, favorite_folder) 298 | # if contentJson.get('has_more'): 299 | # max_cursor = contentJson.get('max_cursor') 300 | # else: 301 | # break 302 | # return video_count 303 | 304 | def _download_user_media(self, user_id, dytk, url): 305 | current_folder = os.getcwd() 306 | target_folder = os.path.join(current_folder, 'download/%s' % user_id) 307 | if not os.path.isdir(target_folder): 308 | os.mkdir(target_folder) 309 | 310 | if not user_id: 311 | print("Number %s does not exist" % user_id) 312 | return 313 | hostname = urllib.parse.urlparse(url).hostname 314 | signature = self.generateSignature(str(user_id)) 315 | # user_video_url = "https://%s/aweme/v1/aweme/post/" % hostname 316 | user_video_url = "https://%s/web/api/v2/aweme/post/" % hostname 317 | # user_video_params = { 318 | # 'user_id': str(user_id), 319 | # 'count': '21', 320 | # 'max_cursor': '0', 321 | # 'aid': '1128', 322 | # '_signature': signature, 323 | # 'dytk': dytk 324 | # } 325 | user_video_params = { 326 | 'sec_uid': str(user_id), 327 | 'count': '21', 328 | 'max_cursor': '0', 329 | 'aid': '1128', 330 | '_signature': signature, 331 | 'dytk': dytk 332 | } 333 | if hostname == 't.tiktok.com': 334 | user_video_params.pop('dytk') 335 | user_video_params['aid'] = '1180' 336 | 337 | max_cursor, video_count = None, 0 338 | while True: 339 | if max_cursor: 340 | user_video_params['max_cursor'] = str(max_cursor) 341 | res = requests.get(user_video_url, headers=HEADERS, 342 | params=user_video_params,verify=False) 343 | contentJson = json.loads(res.content.decode('utf-8')) 344 | aweme_list = contentJson.get('aweme_list', []) 345 | for aweme in aweme_list: 346 | 
video_count += 1 347 | aweme['hostname'] = hostname 348 | self._join_download_queue(aweme, target_folder) 349 | if contentJson.get('has_more'): 350 | max_cursor = contentJson.get('max_cursor') 351 | else: 352 | break 353 | # if True: 354 | # favorite_folder = target_folder + '/favorite' 355 | # video_count = self.__download_favorite_media( 356 | # user_id, dytk, hostname, signature, favorite_folder, video_count) 357 | 358 | if video_count == 0: 359 | print("There's no video in number %s." % user_id) 360 | 361 | return video_count 362 | 363 | def _download_challenge_media(self, challenge_id, url): 364 | if not challenge_id: 365 | print("Challenge #%s does not exist" % challenge_id) 366 | return 367 | current_folder = os.getcwd() 368 | target_folder = os.path.join( 369 | current_folder, 'download/#%s' % challenge_id) 370 | if not os.path.isdir(target_folder): 371 | os.mkdir(target_folder) 372 | 373 | hostname = urllib.parse.urlparse(url).hostname 374 | signature = self.generateSignature(str(challenge_id) + '9' + '0') 375 | 376 | challenge_video_url = "https://%s/aweme/v1/challenge/aweme/" % hostname 377 | challenge_video_params = { 378 | 'ch_id': str(challenge_id), 379 | 'count': '9', 380 | 'cursor': '0', 381 | 'aid': '1128', 382 | 'screen_limit': '3', 383 | 'download_click_limit': '0', 384 | '_signature': signature 385 | } 386 | 387 | cursor, video_count = None, 0 388 | while True: 389 | if cursor: 390 | challenge_video_params['cursor'] = str(cursor) 391 | challenge_video_params['_signature'] = self.generateSignature( 392 | str(challenge_id) + '9' + str(cursor)) 393 | res = requests.get(challenge_video_url, 394 | headers=HEADERS, params=challenge_video_params,verify=False) 395 | try: 396 | contentJson = json.loads(res.content.decode('utf-8')) 397 | except: 398 | print(res.content) 399 | aweme_list = contentJson.get('aweme_list', []) 400 | if not aweme_list: 401 | break 402 | for aweme in aweme_list: 403 | aweme['hostname'] = hostname 404 | video_count += 1 405 | 
self._join_download_queue(aweme, target_folder) 406 | print("number: ", video_count) 407 | if contentJson.get('has_more'): 408 | cursor = contentJson.get('cursor') 409 | else: 410 | break 411 | if video_count == 0: 412 | print("There's no video in challenge %s." % challenge_id) 413 | return video_count 414 | 415 | def _download_music_media(self, music_id, url): 416 | if not music_id: 417 | print("Challenge #%s does not exist" % music_id) 418 | return 419 | current_folder = os.getcwd() 420 | target_folder = os.path.join(current_folder, 'download/@%s' % music_id) 421 | if not os.path.isdir(target_folder): 422 | os.mkdir(target_folder) 423 | 424 | hostname = urllib.parse.urlparse(url).hostname 425 | signature = self.generateSignature(str(music_id)) 426 | music_video_url = "https://%s/aweme/v1/music/aweme/?{0}" % hostname 427 | music_video_params = { 428 | 'music_id': str(music_id), 429 | 'count': '9', 430 | 'cursor': '0', 431 | 'aid': '1128', 432 | 'screen_limit': '3', 433 | 'download_click_limit': '0', 434 | '_signature': signature 435 | } 436 | if hostname == 't.tiktok.com': 437 | for key in ['screen_limit', 'download_click_limit', '_signature']: 438 | music_video_params.pop(key) 439 | music_video_params['aid'] = '1180' 440 | 441 | cursor, video_count = None, 0 442 | while True: 443 | if cursor: 444 | music_video_params['cursor'] = str(cursor) 445 | music_video_params['_signature'] = self.generateSignature( 446 | str(music_id) + '9' + str(cursor)) 447 | 448 | url = music_video_url.format( 449 | '&'.join([key + '=' + music_video_params[key] for key in music_video_params])) 450 | res = requests.get(url, headers=HEADERS,verify=False) 451 | contentJson = json.loads(res.content.decode('utf-8')) 452 | aweme_list = contentJson.get('aweme_list', []) 453 | if not aweme_list: 454 | break 455 | for aweme in aweme_list: 456 | aweme['hostname'] = hostname 457 | video_count += 1 458 | self._join_download_queue(aweme, target_folder) 459 | if contentJson.get('has_more'): 460 | 
cursor = contentJson.get('cursor') 461 | else: 462 | break 463 | if video_count == 0: 464 | print("There's no video in music %s." % music_id) 465 | return video_count 466 | 467 | 468 | def usage(): 469 | print("1. Please create file share-url.txt under this same directory.\n" 470 | "2. In share-url.txt, you can specify amemv share page url separated by " 471 | "comma/space/tab/CR. Accept multiple lines of text\n" 472 | "3. Save the file and retry.\n\n" 473 | "Sample File Content:\nurl1,url2\n\n" 474 | "Or use command line options:\n\n" 475 | "Sample:\npython amemv-video-ripper.py url1,url2\n\n\n") 476 | print(u"未找到share-url.txt文件,请创建.\n" 477 | u"请在文件中指定抖音分享页面URL,并以 逗号/空格/tab/表格鍵/回车符 分割,支持多行.\n" 478 | u"保存文件并重试.\n\n" 479 | u"例子: url1,url12\n\n" 480 | u"或者直接使用命令行参数指定链接\n" 481 | u"例子: python amemv-video-ripper.py url1,url2") 482 | 483 | 484 | def parse_sites(fileName): 485 | with open(fileName, "rb") as f: 486 | txt = f.read().rstrip().lstrip() 487 | txt = codecs.decode(txt, 'utf-8') 488 | txt = txt.replace("\t", ",").replace( 489 | "\r", ",").replace("\n", ",").replace(" ", ",") 490 | txt = txt.split(",") 491 | numbers = list() 492 | for raw_site in txt: 493 | site = raw_site.lstrip().rstrip() 494 | if site: 495 | numbers.append(site) 496 | return numbers 497 | 498 | 499 | download_favorite = False 500 | 501 | if __name__ == "__main__": 502 | content, opts, args = None, None, [] 503 | 504 | try: 505 | if len(sys.argv) >= 2: 506 | opts, args = getopt.getopt(sys.argv[1:], "hi:o:", ["favorite"]) 507 | except getopt.GetoptError as err: 508 | usage() 509 | sys.exit(2) 510 | 511 | if not args: 512 | # check the sites file 513 | filename = "share-url.txt" 514 | if os.path.exists(filename): 515 | content = parse_sites(filename) 516 | else: 517 | usage() 518 | sys.exit(1) 519 | else: 520 | content = (args[0] if args else '').split(",") 521 | 522 | if len(content) == 0 or content[0] == "": 523 | usage() 524 | sys.exit(1) 525 | 526 | if opts: 527 | for o, val in opts: 528 | if 
o in ("--favorite"): 529 | download_favorite = True 530 | break 531 | 532 | CrawlerScheduler(content) 533 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 文章链接:https://mp.weixin.qq.com/s?__biz=MzI4OTQxNjU2Mw==&mid=2247483973&idx=1&sn=e5f1fc8141993bedab4968cff9ac50d2&chksm=ec2e3659db59bf4f6dd2c8346980c96dacb99e01d518c233b20278c2f5de6b672e37d632bf88&token=829399530&lang=zh_CN#rd 2 | 3 | ### 常见问题:1.报 node 命令找不到 :需要安装 node.js 环境 2.执行完成后,下载的失败率很高,修改 device_id 参数 4.长视频下载问题:长视频下载在分享链接爬取不到(我是没找到),app里可以看到,但是videoid不一样,app的加密不好破解,所以暂时长视频只能下载前20秒 5.修改抖音获取视频列表接口,签名破解大概需要20-30秒时间,运行不报错的情况下请耐心等待哦 4 | 5 | 6 | 导读:直接上干货,爬取抖音小姐姐视频列表,并去水印下载(仅供学习使用,不做商业用途,如有侵权,联系作者删除);接18年初,Python基础篇更新。 7 | 8 | 比如我想获取抖音网红“惠子”小姐姐的主页列表视频,第一步在抖音上打开惠子的主页,右上角点击一下,可以看到一个分享按钮,点击分享,找到复制链接-> http://v.douyin.com/9GEGSp/ 。把链接放到浏览器中短链接被自动解析,变成长链接: 9 | https://www.iesdouyin.com/share/user/73838190950?u_code=128dfi636&sec_uid=MS4wLjABAAAAHmQ4DqHKN8IdfWWd52sYaGS6zaZaOTghOZ4ysZ0z_YM&timestamp=1571884619&utm_source=copy&utm_campaign=client_share&utm_medium=android&share_app_name=douyin ,在长链接中就可以看到一些用户信息,有没有用我们先列出来! 
10 | 11 | 12 | key | value 13 | ---|--- 14 | user | 73838190950 15 | u_code |128dfi636 16 | sec_uid |MS4wLjABAAAAHmQ4DqHKN8IdfWWd52sYaGS6zaZaOTghOZ4ysZ0z_YM 17 | timestamp |1571884619 18 | utm_source |copy 19 | utm_campaign |client_share 20 | utm_medium |android 21 | share_app_name |douyin 22 | 23 | 打开浏览器开发者工具,找到对应的视频列表请求接口,一个一个排查终于找到这个链接:https://www.iesdouyin.com/web/api/v2/aweme/post/?sec_uid=MS4wLjABAAAAHmQ4DqHKN8IdfWWd52sYaGS6zaZaOTghOZ4ysZ0z_YM&count=21&max_cursor=0&aid=1128&_signature=QOtJJBARHVwzHUNLqlT-mEDrST&dytk=593d265a74e3384e06112b423ef268da 24 | 25 | key | value 26 | ---|--- 27 | sec_uid | MS4wLjABAAAAHmQ4DqHKN8IdfWWd52sYaGS6zaZaOTghOZ4ysZ0z_YM 28 | count |21 29 | max_cursor |1567769380000 30 | aid |1128 31 | _signature |F1OCixATSudkpYjkPsX5FRdTgp 32 | dytk |593d265a74e3384e06112b423ef268da 33 | 34 | 返回的数据: 35 | 36 | ``` 37 | Json: 38 | { 39 | "max_cursor": 1569668211000, 40 | "min_cursor": 1571815003000, 41 | "has_more": true, 42 | -"extra": { 43 | "now": 1571888892000, 44 | "logid": "2019102411481201001404709304158BDD" 45 | }, 46 | "status_code": 0, 47 | -"aweme_list": [ 48 | -{ 49 | -"statistics": { 50 | "aweme_id": "6750893105127378180", 51 | "comment_count": 1240, 52 | "digg_count": 30000, 53 | "play_count": 675000, 54 | "share_count": 79, 55 | "forward_count": 17 56 | }, 57 | "image_infos": null, 58 | "uniqid_position": null, 59 | "long_video": null, 60 | "aweme_id": "6750893105127378180", 61 | +"text_extra": [ … ], 62 | "position": null, 63 | "geofencing": null, 64 | "promotions": null, 65 | "desc": "#看啥啥都缺 ,爱买女孩绝不认输。", 66 | "aweme_type": 4, 67 | "comment_list": null, 68 | "video_text": null, 69 | "cha_list": null, 70 | -"video": { 71 | +"cover": { … }, 72 | "width": 720, 73 | -"origin_cover": { 74 | -"url_list": [ 75 | "http://p3-dy.byteimg.com/large/tos-cn-p-0015/6e83730009fe4fc2a3eeddbf06b0dbbf_1571815007.jpeg", 76 | "http://p9-dy.byteimg.com/large/tos-cn-p-0015/6e83730009fe4fc2a3eeddbf06b0dbbf_1571815007.jpeg", 77 | 
"http://p1-dy.byteimg.com/large/tos-cn-p-0015/6e83730009fe4fc2a3eeddbf06b0dbbf_1571815007.jpeg" 78 | ], 79 | "uri": "large/tos-cn-p-0015/6e83730009fe4fc2a3eeddbf06b0dbbf_1571815007" 80 | }, 81 | "has_watermark": false, 82 | -"play_addr_lowbr": { 83 | "uri": "v0200ff80000bmnvs5ignbh26fqqufbg", 84 | -"url_list": [ 85 | "https://aweme.snssdk.com/aweme/v1/play/?video_id=v0200ff80000bmnvs5ignbh26fqqufbg&line=0&ratio=540p&media_type=4&vr_type=0&improve_bitrate=0&is_play_url=1", 86 | "https://api.amemv.com/aweme/v1/play/?video_id=v0200ff80000bmnvs5ignbh26fqqufbg&line=1&ratio=540p&media_type=4&vr_type=0&improve_bitrate=0&is_play_url=1" 87 | ] 88 | }, 89 | "bit_rate": null, 90 | "vid": "v0200ff80000bmnvs5ignbh26fqqufbg", 91 | -"play_addr": { 92 | "uri": "v0200ff80000bmnvs5ignbh26fqqufbg", 93 | -"url_list": [ 94 | "https://aweme.snssdk.com/aweme/v1/play/?video_id=v0200ff80000bmnvs5ignbh26fqqufbg&line=0&ratio=540p&media_type=4&vr_type=0&improve_bitrate=0&is_play_url=1", 95 | "https://api.amemv.com/aweme/v1/play/?video_id=v0200ff80000bmnvs5ignbh26fqqufbg&line=1&ratio=540p&media_type=4&vr_type=0&improve_bitrate=0&is_play_url=1" 96 | ] 97 | }, 98 | "height": 1280, 99 | -"dynamic_cover": { 100 | -"url_list": [ 101 | "https://p3-dy.byteimg.com/obj/tos-cn-p-0015/f4f71ff403574d768a87e7ef3501a7cc_1571815009", 102 | "https://p9-dy.byteimg.com/obj/tos-cn-p-0015/f4f71ff403574d768a87e7ef3501a7cc_1571815009", 103 | "https://p1-dy.byteimg.com/obj/tos-cn-p-0015/f4f71ff403574d768a87e7ef3501a7cc_1571815009" 104 | ], 105 | "uri": "tos-cn-p-0015/f4f71ff403574d768a87e7ef3501a7cc_1571815009" 106 | }, 107 | "ratio": "540p", 108 | -"download_addr": { 109 | "uri": "v0200ff80000bmnvs5ignbh26fqqufbg", 110 | -"url_list": [ 111 | "https://aweme.snssdk.com/aweme/v1/play/?video_id=v0200ff80000bmnvs5ignbh26fqqufbg&line=0&ratio=540p&watermark=0&media_type=4&vr_type=0&improve_bitrate=0&logo_name=aweme_self", 112 | 
"https://api.amemv.com/aweme/v1/play/?video_id=v0200ff80000bmnvs5ignbh26fqqufbg&line=1&ratio=540p&watermark=0&media_type=4&vr_type=0&improve_bitrate=0&logo_name=aweme_self" 113 | ] 114 | }, 115 | "duration": 61824 116 | }, 117 | "video_labels": null, 118 | "label_top_text": null 119 | } 120 | ] 121 | } 122 | ``` 123 | 通过返回的参数可以看到我们需要的数据都在这里,在这里不着急解析数据,通过对比请求参数,别的参数都是现成的在主页代码中都可以找到,基本可以确定` _signature `参数是加密字符串,接下来我们就跟踪这个参数的形成过程。通过搜索,确定了它在`index_10ae3b3.js`中生成的 ,截图如下: 124 | 125 | 通过截图我们知道,`signature` 是通过`_bytedAcrawler`对象获取的,顺着我们查看它的生成过程:截图如下: 126 | 127 | 它是在`base_327cc85.js`生成的,截图如下: 128 | 129 | 通过分析,`_signature` 获取比较复杂,js代码已经被混淆压制,直接分析算法过程比较难,但是我们可以通过执行签名的算法代码,并返回对应的签名结果。把被压过的js保存下来,执行`_bytedAcrawler.sign("")`获取参数签名。 130 | 131 | 分析完成后,开始python模拟手机数据请求: 132 | 133 | 1.读取主页链接:支持同时爬去多个小姐姐的主页视频列表,在`share-url.txt`中输入每个URL通过逗号/空格/tab/表格鍵/回车符 分割,支持多行,也可以使用命令进行指定链接`python amemv-video-ripper.py url1,url2...`,解析文本数据/命令行数据; 134 | 135 | ``` 136 | content, opts, args = None, None, [] 137 | 138 | try: 139 | if len(sys.argv) >= 2: 140 | opts, args = getopt.getopt(sys.argv[1:], "hi:o:", ["favorite"]) 141 | except getopt.GetoptError as err: 142 | usage() 143 | sys.exit(2) 144 | 145 | if not args: 146 | # check the sites file 147 | filename = "share-url.txt" 148 | if os.path.exists(filename): 149 | content = parse_sites(filename) 150 | else: 151 | usage() 152 | sys.exit(1) 153 | else: 154 | content = (args[0] if args else '').split(",") 155 | 156 | if len(content) == 0 or content[0] == "": 157 | usage() 158 | sys.exit(1) 159 | 160 | if opts: 161 | for o, val in opts: 162 | if o in ("--favorite"): 163 | download_favorite = True 164 | break 165 | 166 | CrawlerScheduler(content) 167 | ``` 168 | 2.获取列表视频: 169 | 170 | ``` 171 | class CrawlerScheduler(object): 172 | 173 | def __init__(self, items): 174 | self.numbers = [] 175 | self.challenges = [] 176 | self.musics = [] 177 | for i in range(len(items)): 178 | url = get_real_address(items[i]) 179 | if not url: 180 | continue 181 | if 
re.search('share/user', url): 182 | self.numbers.append(url) 183 | if re.search('share/challenge', url): 184 | self.challenges.append(url) 185 | if re.search('share/music', url): 186 | self.musics.append(url) 187 | 188 | self.queue = Queue.Queue() 189 | self.scheduling() 190 | 191 | #通过node执行fuck-byted-acrawler.js得到签名 192 | @staticmethod 193 | def generateSignature(value): 194 | p = os.popen('node fuck-byted-acrawler.js %s' % value) 195 | return p.readlines()[0] 196 | 197 | @staticmethod 198 | def calculateFileMd5(filename): 199 | hmd5 = hashlib.md5() 200 | fp = open(filename, "rb") 201 | hmd5.update(fp.read()) 202 | return hmd5.hexdigest() 203 | 204 | def scheduling(self): 205 | for x in range(THREADS): 206 | worker = DownloadWorker(self.queue) 207 | worker.daemon = True 208 | worker.start() 209 | 210 | for url in self.numbers: 211 | self.download_user_videos(url) 212 | for url in self.challenges: 213 | self.download_challenge_videos(url) 214 | for url in self.musics: 215 | self.download_music_videos(url) 216 | 217 | def download_user_videos(self, url): 218 | number = re.findall(r'share/user/(\d+)', url) 219 | if not len(number): 220 | return 221 | dytk = get_dytk(url) 222 | hostname = urllib.parse.urlparse(url).hostname 223 | if hostname != 't.tiktok.com' and not dytk: 224 | return 225 | user_id = number[0] 226 | video_count = self._download_user_media(user_id, dytk, url) 227 | self.queue.join() 228 | print("\nAweme number %s, video number %s\n\n" % 229 | (user_id, str(video_count))) 230 | print("\nFinish Downloading All the videos from %s\n\n" % user_id) 231 | 232 | def download_challenge_videos(self, url): 233 | challenge = re.findall('share/challenge/(\d+)', url) 234 | if not len(challenge): 235 | return 236 | challenges_id = challenge[0] 237 | video_count = self._download_challenge_media(challenges_id, url) 238 | self.queue.join() 239 | print("\nAweme challenge #%s, video number %d\n\n" % 240 | (challenges_id, video_count)) 241 | print("\nFinish Downloading 
All the videos from #%s\n\n" % challenges_id) 242 | 243 | def download_music_videos(self, url): 244 | music = re.findall('share/music/(\d+)', url) 245 | if not len(music): 246 | return 247 | musics_id = music[0] 248 | video_count = self._download_music_media(musics_id, url) 249 | self.queue.join() 250 | print("\nAweme music @%s, video number %d\n\n" % 251 | (musics_id, video_count)) 252 | print("\nFinish Downloading All the videos from @%s\n\n" % musics_id) 253 | 254 | def _join_download_queue(self, aweme, target_folder): 255 | try: 256 | if aweme.get('video', None): 257 | uri = aweme['video']['play_addr']['uri'] 258 | download_url = "https://aweme.snssdk.com/aweme/v1/play/?{0}" 259 | download_params = { 260 | 'video_id': uri, 261 | 'line': '0', 262 | 'ratio': '720p', 263 | 'media_type': '4', 264 | 'vr_type': '0', 265 | 'test_cdn': 'None', 266 | 'improve_bitrate': '0', 267 | 'iid': '35628056608', 268 | 'device_id': '46166618999', 269 | 'os_api': '18', 270 | 'app_name': 'aweme', 271 | 'channel': 'App%20Store', 272 | 'idfa': '00000000-0000-0000-0000-000000000000', 273 | 'device_platform': 'iphone', 274 | 'build_number': '27014', 275 | 'vid': '2ED380A7-F09C-6C9E-90F5-862D58F3129C', 276 | 'openudid': '21dae85eeac1da35a69e2a0ffeaeef61c78a2e98', 277 | 'device_type': 'iPhone8%2C2', 278 | 'app_version': '2.7.0', 279 | 'version_code': '2.7.0', 280 | 'os_version': '12.0', 281 | 'screen_width': '1242', 282 | 'aid': '1128', 283 | 'ac': 'WIFI' 284 | } 285 | if aweme.get('hostname') == 't.tiktok.com': 286 | download_url = 'http://api.tiktokv.com/aweme/v1/play/?{0}' 287 | download_params = { 288 | 'video_id': uri, 289 | 'line': '0', 290 | 'ratio': '720p', 291 | 'media_type': '4', 292 | 'vr_type': '0', 293 | 'test_cdn': 'None', 294 | 'improve_bitrate': '0', 295 | 'version_code': '1.7.2', 296 | 'language': 'en', 297 | 'app_name': 'trill', 298 | 'vid': 'D7B3981F-DD46-45A1-A97E-428B90096C3E', 299 | 'app_version': '1.7.2', 300 | 'device_id': '6619780206485964289', 301 | 'channel': 
'App Store', 302 | 'mcc_mnc': '', 303 | 'tz_offset': '28800' 304 | } 305 | share_info = aweme.get('share_info', {}) 306 | url = download_url.format( 307 | '&'.join([key + '=' + download_params[key] for key in download_params])) 308 | self.queue.put(('video', 309 | uri + "-" + share_info.get('share_desc', uri), 310 | url, target_folder)) 311 | else: 312 | if aweme.get('image_infos', None): 313 | image = aweme['image_infos']['label_large'] 314 | self.queue.put( 315 | ('image', image['uri'], image['url_list'][0], target_folder)) 316 | 317 | except KeyError: 318 | return 319 | except UnicodeDecodeError: 320 | print("Cannot decode response data from DESC %s" % aweme['desc']) 321 | return 322 | 323 | 324 | def _download_user_media(self, user_id, dytk, url): 325 | current_folder = os.getcwd() 326 | target_folder = os.path.join(current_folder, 'download/%s' % user_id) 327 | if not os.path.isdir(target_folder): 328 | os.mkdir(target_folder) 329 | 330 | if not user_id: 331 | print("Number %s does not exist" % user_id) 332 | return 333 | hostname = urllib.parse.urlparse(url).hostname 334 | signature = self.generateSignature(str(user_id)) 335 | user_video_url = "https://%s/aweme/v1/aweme/post/" % hostname 336 | user_video_params = { 337 | 'user_id': str(user_id), 338 | 'count': '21', 339 | 'max_cursor': '0', 340 | 'aid': '1128', 341 | '_signature': signature, 342 | 'dytk': dytk 343 | } 344 | if hostname == 't.tiktok.com': 345 | user_video_params.pop('dytk') 346 | user_video_params['aid'] = '1180' 347 | 348 | max_cursor, video_count = None, 0 349 | while True: 350 | if max_cursor: 351 | user_video_params['max_cursor'] = str(max_cursor) 352 | res = requests.get(user_video_url, headers=HEADERS, 353 | params=user_video_params) 354 | contentJson = json.loads(res.content.decode('utf-8')) 355 | aweme_list = contentJson.get('aweme_list', []) 356 | for aweme in aweme_list: 357 | video_count += 1 358 | aweme['hostname'] = hostname 359 | self._join_download_queue(aweme, target_folder) 
360 | if contentJson.get('has_more'): 361 | max_cursor = contentJson.get('max_cursor') 362 | else: 363 | break 364 | # if True: 365 | # favorite_folder = target_folder + '/favorite' 366 | # video_count = self.__download_favorite_media( 367 | # user_id, dytk, hostname, signature, favorite_folder, video_count) 368 | 369 | if video_count == 0: 370 | print("There's no video in number %s." % user_id) 371 | 372 | return video_count 373 | 374 | def _download_challenge_media(self, challenge_id, url): 375 | if not challenge_id: 376 | print("Challenge #%s does not exist" % challenge_id) 377 | return 378 | current_folder = os.getcwd() 379 | target_folder = os.path.join( 380 | current_folder, 'download/#%s' % challenge_id) 381 | if not os.path.isdir(target_folder): 382 | os.mkdir(target_folder) 383 | 384 | hostname = urllib.parse.urlparse(url).hostname 385 | signature = self.generateSignature(str(challenge_id) + '9' + '0') 386 | 387 | challenge_video_url = "https://%s/aweme/v1/challenge/aweme/" % hostname 388 | challenge_video_params = { 389 | 'ch_id': str(challenge_id), 390 | 'count': '9', 391 | 'cursor': '0', 392 | 'aid': '1128', 393 | 'screen_limit': '3', 394 | 'download_click_limit': '0', 395 | '_signature': signature 396 | } 397 | 398 | cursor, video_count = None, 0 399 | while True: 400 | if cursor: 401 | challenge_video_params['cursor'] = str(cursor) 402 | challenge_video_params['_signature'] = self.generateSignature( 403 | str(challenge_id) + '9' + str(cursor)) 404 | res = requests.get(challenge_video_url, 405 | headers=HEADERS, params=challenge_video_params) 406 | try: 407 | contentJson = json.loads(res.content.decode('utf-8')) 408 | except: 409 | print(res.content) 410 | aweme_list = contentJson.get('aweme_list', []) 411 | if not aweme_list: 412 | break 413 | for aweme in aweme_list: 414 | aweme['hostname'] = hostname 415 | video_count += 1 416 | self._join_download_queue(aweme, target_folder) 417 | print("number: ", video_count) 418 | if contentJson.get('has_more'): 
419 | cursor = contentJson.get('cursor') 420 | else: 421 | break 422 | if video_count == 0: 423 | print("There's no video in challenge %s." % challenge_id) 424 | return video_count 425 | 426 | def _download_music_media(self, music_id, url): 427 | if not music_id: 428 | print("Challenge #%s does not exist" % music_id) 429 | return 430 | current_folder = os.getcwd() 431 | target_folder = os.path.join(current_folder, 'download/@%s' % music_id) 432 | if not os.path.isdir(target_folder): 433 | os.mkdir(target_folder) 434 | 435 | hostname = urllib.parse.urlparse(url).hostname 436 | signature = self.generateSignature(str(music_id)) 437 | music_video_url = "https://%s/aweme/v1/music/aweme/?{0}" % hostname 438 | music_video_params = { 439 | 'music_id': str(music_id), 440 | 'count': '9', 441 | 'cursor': '0', 442 | 'aid': '1128', 443 | 'screen_limit': '3', 444 | 'download_click_limit': '0', 445 | '_signature': signature 446 | } 447 | if hostname == 't.tiktok.com': 448 | for key in ['screen_limit', 'download_click_limit', '_signature']: 449 | music_video_params.pop(key) 450 | music_video_params['aid'] = '1180' 451 | 452 | cursor, video_count = None, 0 453 | while True: 454 | if cursor: 455 | music_video_params['cursor'] = str(cursor) 456 | music_video_params['_signature'] = self.generateSignature( 457 | str(music_id) + '9' + str(cursor)) 458 | 459 | url = music_video_url.format( 460 | '&'.join([key + '=' + music_video_params[key] for key in music_video_params])) 461 | res = requests.get(url, headers=HEADERS) 462 | contentJson = json.loads(res.content.decode('utf-8')) 463 | aweme_list = contentJson.get('aweme_list', []) 464 | if not aweme_list: 465 | break 466 | for aweme in aweme_list: 467 | aweme['hostname'] = hostname 468 | video_count += 1 469 | self._join_download_queue(aweme, target_folder) 470 | if contentJson.get('has_more'): 471 | cursor = contentJson.get('cursor') 472 | else: 473 | break 474 | if video_count == 0: 475 | print("There's no video in music %s." 
% music_id) 476 | return video_count 477 | ``` 478 | 3.下载视频: 479 | 480 | ``` 481 | #下载相关的逻辑 482 | def download(medium_type, uri, medium_url, target_folder): 483 | 484 | headers = copy.copy(HEADERS) 485 | file_name = uri 486 | if medium_type == 'video': 487 | file_name += '.mp4' 488 | headers['user-agent'] = 'Aweme/27014 CFNetwork/974.2.1 Darwin/18.0.0' 489 | elif medium_type == 'image': 490 | file_name += '.jpg' 491 | file_name = file_name.replace("/", "-") 492 | else: 493 | return 494 | 495 | file_path = os.path.join(target_folder, file_name) 496 | if os.path.isfile(file_path): 497 | print(file_name + " 已经爬取过了,文件保存在 " + file_path + " 放弃爬取") 498 | return 499 | 500 | print("Downloading %s from %s.\n" % (file_name, medium_url)) 501 | # VIDEOID_DICT[VIDEO_ID] = 1 # 记录已经下载的视频 502 | retry_times = 0 503 | while retry_times < RETRY: 504 | try: 505 | resp = requests.get(medium_url, headers=headers, stream=True, timeout=TIMEOUT) 506 | if resp.status_code == 403: 507 | retry_times = RETRY 508 | print("Access Denied when retrieve %s.\n" % medium_url) 509 | raise Exception("Access Denied") 510 | with open(file_path, 'wb') as fh: 511 | for chunk in resp.iter_content(chunk_size=1024): 512 | fh.write(chunk) 513 | break 514 | except: 515 | pass 516 | retry_times += 1 517 | else: 518 | try: 519 | os.remove(file_path) 520 | except OSError: 521 | pass 522 | print("Failed to retrieve %s from %s.\n" % (uri, medium_url)) 523 | time.sleep(1) 524 | 525 | ``` 526 | 4.其他: 527 | 528 | ``` 529 | #通过短链接-获取长链接 530 | def get_real_address(url): 531 | if url.find('v.douyin.com') < 0: 532 | return url 533 | res = requests.get(url, headers=HEADERS, allow_redirects=False) 534 | return res.headers['Location'] if res.status_code == 302 else None 535 | 536 | # 得到dytk参数 537 | def get_dytk(url): 538 | res = requests.get(url, headers=HEADERS) 539 | if not res: 540 | return None 541 | dytk = re.findall("dytk: '(.*)'", res.content.decode('utf-8')) 542 | if len(dytk): 543 | return dytk[0] 544 | return None 
# Download manager
class DownloadWorker(Thread):
    """Worker thread that drains the shared download queue.

    Every queue item is expected to be a 4-tuple
    ``(medium_type, uri, download_url, target_folder)``; it is forwarded
    unchanged to the module-level ``download`` function.
    """

    def __init__(self, queue):
        """Remember the queue this worker consumes.

        Args:
            queue: Object providing a blocking ``get()`` and ``task_done()``.
        """
        super().__init__()
        self.queue = queue

    def run(self):
        """Process queue items forever.

        ``task_done()`` is called for every item taken from the queue, even
        when the item is malformed or the download raises. Otherwise a single
        failure would leave the item unacknowledged and the scheduler's
        ``queue.join()`` would never return. The failure is reported and the
        worker moves on to the next item instead of dying.
        """
        while True:
            item = self.queue.get()
            try:
                medium_type, uri, download_url, target_folder = item
                download(medium_type, uri, download_url, target_folder)
            except Exception as err:
                # Thread boundary: report the problem and keep the worker alive.
                print("Failed to process queue item %r: %r" % (item, err))
            finally:
                self.queue.task_done()


# README notes that followed this snippet:
#   5. Execution screenshot:
#   6. Getting the source code:
#   7. Note on watermark removal: the Douyin list response already contains
#      both the watermark-free and the watermarked video links, so no
#      processing of the video watermark is involved.