├── images ├── logo.jpg └── pay.jpg ├── requirements.txt ├── README.md ├── LICENSE ├── utils.py └── douyin_spider.py /images/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangke19/TikTokSpider/HEAD/images/logo.jpg -------------------------------------------------------------------------------- /images/pay.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangke19/TikTokSpider/HEAD/images/pay.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | backcall==0.1.0 3 | certifi==2018.8.24 4 | chardet==3.0.4 5 | decorator==4.3.0 6 | idna==2.7 7 | ipython==6.5.0 8 | ipython-genutils==0.2.0 9 | jedi==0.12.1 10 | parso==0.3.1 11 | pexpect==4.6.0 12 | pickleshare==0.7.4 13 | prompt-toolkit==1.0.15 14 | ptyprocess==0.6.0 15 | Pygments==2.2.0 16 | requests==2.20.0 17 | simplegeneric==0.8.1 18 | six==1.11.0 19 | tqdm==4.25.0 20 | traitlets==4.3.2 21 | urllib3==1.24.2 22 | wcwidth==0.1.7 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 抖音爬虫 2 | 3 | 配置虚拟环境和安装依赖包默认都会,略过 4 | 5 | 6 | 7 | 8 | 9 | ## 使用方法 10 | 11 | 12 | 13 | - 抖音主页 - 分享 - 复制链接 - 发送到电脑 14 | - 点击链接 - 将网址复制后使用浏览器访问 15 | - 打开Devtools - 选择手机预览模式 16 | - 点击"**作品**",找到加载视频时对应的url 17 | - 运行脚本,依次输入url、user_agent,用户ID(step2里有) 18 | 19 | 20 | 21 | ## 完整演示 22 | 23 | https://www.bilibili.com/video/BV1Pg4y187sy 24 | 25 | 26 | 27 | 28 | 29 | ## If It Helps 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 huangke 4 | 5 
| Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: utils.py

""" Interactive-input and command-line helpers used by the Douyin spider. """

# Standard library
import os
import re
import sys
from argparse import ArgumentParser


def input_user_agent():
    '''
    Prompt for the User-Agent of the request captured in DevTools.

    :return: the entered value with surrounding whitespace removed
    '''
    print("\n抖音要求user_agent和当前请求匹配,请输入当前请求中的user_agent值\n")
    return input("Dev_tools中的user_agent: ").strip()


def input_request_url():
    '''
    Prompt for the URL of the request that loads the user's videos.

    :return: the entered value with surrounding whitespace removed
    '''
    print("\n请输入加载作品对应的url,注意检查返回值中aweme_list是否为空\n")
    return input("Dev_tools中的url: ").strip()


def get_id_from_cmd(cmd_args):
    '''
    Read the user id from the command line (``--uid``).

    :param cmd_args: list of command-line arguments (without the program name)
    :return: the id as an int, or None when ``--uid`` is absent or 0
    '''
    # argparse always returns a (truthy) Namespace, so no emptiness check is
    # needed; `or None` keeps the original "0 means not given" behavior.
    return parse_args(cmd_args).user_id or None


def get_id_from_input():
    '''
    Prompt for the Douyin user id.

    :return: the entered id with surrounding whitespace removed
    '''
    # Strip here so the value returned to the caller is the same text that
    # is_valid_id() validates (it strips before matching).
    return input('\n请输入你要爬取的抖音用户id: ').strip()


def is_valid_id(_id):
    '''
    Check whether a Douyin user id is well formed (ASCII digits only).

    :param _id: user id, as str or int
    :return: True when the id consists solely of the digits 0-9
    '''
    if not _id:
        return False
    # re.ASCII restricts \d to 0-9; without it, digits from other scripts
    # would pass validation and end up in the request URL.
    if not re.fullmatch(r'\d+', str(_id).strip(), re.ASCII):
        sys.stdout.write("请输入正确格式的抖音id\n")
        return False
    return True


def makedir(name):
    '''
    Create the directory used to store one user's videos.

    :param name: directory path (the username)
    :return: None
    '''
    # exist_ok avoids the check-then-create race and also creates missing
    # parent directories; a pre-existing *file* with this name still raises.
    os.makedirs(name, exist_ok=True)


def parse_args(args):
    '''
    Parse the command-line arguments.

    :param args: list of command-line arguments (without the program name)
    :return: argparse Namespace whose ``user_id`` is an int or None
    '''
    parser = ArgumentParser()
    parser.add_argument('--uid', dest='user_id', type=int, help='用户的抖音id')
    return parser.parse_args(args)

# ---- end of utils.py; the next file in this dump is /douyin_spider.py ----
#!/usr/bin/python
# -*- coding: utf-8 -*-
# File: douyin_spider.py

''' A small Douyin spider: collects and downloads one user's videos. '''

#############################
#
# Author: Huang Ke
# Email: huangkwell@163.com
# WeChat: 760208296
# Revived: 2020/4/6
#
#############################


# Standard library
import re
import sys
from time import sleep

# Third-party
import requests

# Local helpers
from utils import (
    get_id_from_cmd,
    is_valid_id,
    get_id_from_input,
    input_user_agent,
    input_request_url,
    makedir
)

# Accumulated [name, url] pairs and the 1-based page counter used in messages.
VIDEO_URLS, PAGE = [], 1

# NOTE: both values are prompted for at import time; kept that way because
# the rest of the module reads them as module-level constants.
URL = input_request_url()
HEADERS = {
    'user-agent': input_user_agent()
}

# Seconds to wait for any single HTTP request.
REQUEST_TIMEOUT = 15

# NOTE(review): the HTML tags of this pattern were stripped when the file was
# extracted (only the `(.*?)` group survived, split across lines). The tags
# below are a reconstruction -- confirm against the upstream repository and
# the markup of the share page before relying on it.
NICKNAME_PATTERN = '<p class="nickname">(.*?)</p>'

# Path separators and control characters that must not appear in a file name.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/\x00-\x1f]')


def get_douyin_id():
    '''
    Obtain the Douyin user id from the command line or, failing that, by
    prompting until a valid id (or an empty line) is entered.

    :return: user id, or None when the user enters nothing
    '''
    cmd_id = get_id_from_cmd(sys.argv[1:])
    if cmd_id and is_valid_id(cmd_id):
        return cmd_id

    # An invalid --uid falls through to the prompt; re-reading argv (as the
    # previous recursive version did) would loop forever on the same value.
    while True:
        typed = get_id_from_input()
        if not typed:
            return None
        if is_valid_id(typed):
            # is_valid_id validates the stripped text, so return that text.
            return str(typed).strip()


def get_username(user_id):
    '''
    Fetch the user's nickname from the share page.

    :param user_id: Douyin user id
    :returns: username, or None when it could not be retrieved
    '''
    url = "https://www.amemv.com/share/user/%s" % user_id
    try:
        print("\n获取用户名,建立文件夹中...\n")
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        return re.findall(NICKNAME_PATTERN, response.text)[0]
    except (TypeError, IndexError):
        sys.stdout.write("提示: 请确认输入的是用户ID,而不是抖音号或单个视频的id\n")
        # A single None (not a tuple) so callers' `if not username` works.
        return None
    except requests.exceptions.RequestException:
        sys.stdout.write("连接错误,未能获取正确数据\n")
        return None


def get_all_video_urls(user_id, max_cursor):
    '''
    Collect the source URLs of all of the user's videos, page by page.

    :param user_id: Douyin user id (unused here; the id is part of URL)
    :param max_cursor: cursor of the first page to request

    :return: VIDEO_URLS, the module-level list of [name, url] pairs
             collected so far (possibly partial on error)
    '''
    global PAGE
    cursor = max_cursor

    try:
        while True:
            page_url = re.sub('max_cursor=0', 'max_cursor=%s' % cursor, URL)

            print('\n正在收集第%s页视频地址\n' % (PAGE))
            response = requests.get(
                page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)

            if response.status_code != 200:
                print(response.status_code)
                return VIDEO_URLS
            # Report success only once the status has been checked.
            print('第%s页视频地址获取成功\n' % (PAGE))

            data = response.json()
            aweme_list = data['aweme_list']
            if not aweme_list:
                print("请检查输入的url地址,在Devtools里确认Response中aweme_list列表不为空")
                return VIDEO_URLS

            for item in aweme_list:
                name = item.get('desc')
                video_url = item.get('video').get(
                    'play_addr').get('url_list')[0]
                VIDEO_URLS.append([name, video_url])
                print(VIDEO_URLS[-1])

            # Keep paging while the API reports more data.
            next_cursor = data.get('max_cursor')
            if data['has_more'] is True and next_cursor != 0:
                sleep(2)
                PAGE += 1
                cursor = next_cursor
            else:
                return VIDEO_URLS
    except Exception as e:
        # Best effort: report the problem and keep what was collected.
        print('failed,', e)
        return VIDEO_URLS


def download_video(index, username, name, url, retry=3):
    '''
    Download one video into the user's directory, showing progress.

    :param index: ordinal of the video (used in messages)
    :param username: directory the file is written to
    :param name: video name, used as the file name
    :param url: video address
    :param retry: remaining retries on a non-200 response

    :return: None
    '''

    print("\n下载第%s个视频: %s" % (index, name))
    try:
        # Probe without following redirects to learn the real video address;
        # fall back to the given url when the server answers directly.
        probe = requests.get(
            url,
            stream=True,
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT,
            allow_redirects=False)
        video_url = probe.headers.get('Location', url)
        probe.close()

        # stream=True so the body is read chunk by chunk below instead of
        # being buffered in memory before the progress bar starts.
        video_response = requests.get(
            video_url,
            stream=True,
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT)
        try:
            if video_response.status_code == 200:
                # 0 means "size unknown": download without a progress bar.
                video_size = int(
                    video_response.headers.get('Content-Length', 0))
                file_name = _UNSAFE_FILENAME_CHARS.sub('_', str(name))
                with open('%s/%s.mp4' % (username, file_name), 'wb') as f:
                    data_length = 0
                    for data in video_response.iter_content(chunk_size=1024):
                        data_length += len(data)
                        f.write(data)
                        if video_size:
                            done = min(
                                50, int(50 * data_length / video_size))
                            sys.stdout.write("\r下载进度: [%s%s]" % (
                                '█' * done, ' ' * (50 - done)))
                            sys.stdout.flush()
                return None
        finally:
            video_response.close()

        # Non-200: retry until the budget is used up, then give up.
        if retry:
            return download_video(index, username, name, url, retry - 1)
        return None
    except Exception as e:
        print('download failed,', name, e)
        return None


def download_all_videos(videl_urls, username):
    '''
    Download every collected video.

    :param videl_urls: iterable of [name, url] pairs
    :param username: directory the files are written to
    :return: None
    '''
    for index, item in enumerate(videl_urls, 1):
        name, url = item[0], item[1]
        # A missing or empty description gets a generated name.
        if not name:
            name = username + '_' + str(index)
        download_video(index, username, name, url)
        sleep(2)


def main():
    '''
    Entry point: resolve the user, collect video URLs, download them.

    :return: None
    '''
    _id = get_douyin_id()
    if not _id:
        return

    username = get_username(_id)
    if not username:
        return
    makedir(username)

    video_urls = get_all_video_urls(_id, 0)
    if not video_urls:
        return

    download_all_videos(video_urls, username)


if __name__ == '__main__':
    main()