├── images
│   ├── logo.jpg
│   └── pay.jpg
├── requirements.txt
├── README.md
├── LICENSE
├── utils.py
└── douyin_spider.py
/images/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangke19/TikTokSpider/HEAD/images/logo.jpg
--------------------------------------------------------------------------------
/images/pay.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangke19/TikTokSpider/HEAD/images/pay.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | appnope==0.1.0
2 | backcall==0.1.0
3 | certifi==2018.8.24
4 | chardet==3.0.4
5 | decorator==4.3.0
6 | idna==2.7
7 | ipython==6.5.0
8 | ipython-genutils==0.2.0
9 | jedi==0.12.1
10 | parso==0.3.1
11 | pexpect==4.6.0
12 | pickleshare==0.7.4
13 | prompt-toolkit==1.0.15
14 | ptyprocess==0.6.0
15 | Pygments==2.2.0
16 | requests==2.20.0
17 | simplegeneric==0.8.1
18 | six==1.11.0
19 | tqdm==4.25.0
20 | traitlets==4.3.2
21 | urllib3==1.24.2
22 | wcwidth==0.1.7
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## 抖音爬虫
2 |
3 | 配置虚拟环境和安装依赖包默认都会,略过
4 |
5 |
6 |
7 |
8 |
9 | ## 使用方法
10 |
11 |
12 |
13 | - 抖音主页 - 分享 - 复制链接 - 发送到电脑
14 | - 点击链接 - 将网址复制后使用浏览器访问
15 | - 打开Devtools - 选择手机预览模式
16 | - 点击"**作品**",找到加载视频时对应的url
17 | - 运行脚本,依次输入 url、user_agent、用户ID(用户ID见第2步)
18 |
19 |
20 |
21 | ## 完整演示
22 |
23 | https://www.bilibili.com/video/BV1Pg4y187sy
24 |
25 |
26 |
27 |
28 |
29 | ## If It Helps
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 huangke
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """ user-agent池 """
5 |
6 | # 导入标准库
7 | import os
8 | import re
9 | import sys
10 | from argparse import ArgumentParser
11 |
12 |
def input_user_agent():
    '''
    Prompt for the user_agent value copied from the browser dev tools.

    Prints a notice that the user_agent has to match the current request,
    then reads one line from standard input.

    :return: the entered text with surrounding whitespace removed
    '''
    print("\n抖音要求user_agent和当前请求匹配,请输入当前请求中的user_agent值\n")
    return input("Dev_tools中的user_agent: ").strip()
17 |
18 |
def input_request_url():
    '''
    Prompt for the request url copied from the browser dev tools.

    Prints a reminder to check that aweme_list in the response is not
    empty, then reads one line from standard input.

    :return: the entered text with surrounding whitespace removed
    '''
    print("\n请输入加载作品对应的url,注意检查返回值中aweme_list是否为空\n")
    return input("Dev_tools中的url: ").strip()
23 |
24 |
def get_id_from_cmd(cmd_args):
    '''
    Read the user id from the command-line arguments.

    :param cmd_args: list of command-line arguments (without the program name)
    :return: the parsed user_id when one was supplied and is truthy,
             otherwise None
    '''
    parsed = parse_args(cmd_args)
    if not parsed:
        return None

    # A missing option (None) and a zero id both collapse to None.
    return parsed.user_id or None
40 |
41 |
def get_id_from_input():
    '''
    Ask the user to type the id of the Douyin account to crawl.

    :return: the raw line read from standard input (not stripped)
    '''
    return input('\n请输入你要爬取的抖音用户id: ')
50 |
51 |
def is_valid_id(_id):
    '''
    Tell whether a Douyin user id consists solely of digits.

    Falsy values are rejected silently. Any other value is converted to
    text, stripped of surrounding whitespace and matched against a
    digits-only pattern; a notice is written to stdout when it fails.

    :param _id: candidate user id (string or number)
    :return: True when the id is all digits, otherwise False
    '''
    if _id:
        candidate = str(_id).strip()
        if re.match('^\\d+$', candidate) is not None:
            return True
        sys.stdout.write("请输入正确格式的抖音id\n")
    return False
65 |
66 |
def makedir(name):
    '''
    Create the directory used for one user's files unless it already exists.

    :param name: directory path (the username)
    :return: None
    '''
    if os.path.isdir(name):
        # Nothing to do when the directory is already there.
        return
    os.mkdir(name)
78 |
79 |
def parse_args(args):
    '''
    Parse the command-line arguments understood by the spider.

    The only option is ``--uid``, converted to int and stored as ``user_id``.

    :param args: list of command-line arguments (without the program name)
    :return: the argparse namespace holding ``user_id`` (None when omitted)
    '''
    cli = ArgumentParser()
    cli.add_argument('--uid', dest='user_id', type=int, help='用户的抖音id')
    namespace = cli.parse_args(args)
    return namespace
90 |
--------------------------------------------------------------------------------
/douyin_spider.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 |
4 | ''' 一只抖音小爬虫 '''
5 |
6 | #############################
7 | #
8 | # Author: Huang Ke
9 | # Email: huangkwell@163.com
10 | # 微信: 760208296
11 | # 复活时间: 2020/4/6
12 | #
13 | #############################
14 |
15 |
16 | # 导入标准库
17 | import re
18 | import sys
19 | from time import sleep
20 |
21 | # 导入第三方库
22 | import requests
23 |
24 | # 全局变量
25 | from utils import (
26 | get_id_from_cmd,
27 | is_valid_id,
28 | get_id_from_input,
29 | input_user_agent,
30 | input_request_url,
31 | makedir
32 | )
33 |
# Accumulator for the collected videos and the page counter used in
# progress messages.
VIDEO_URLS = []
PAGE = 1

# Both values are read from standard input while the module is loading:
# first the request url, then the user-agent.
URL = input_request_url()
HEADERS = {'user-agent': input_user_agent()}
40 |
41 |
def get_douyin_id():
    '''
    Obtain the Douyin user id, preferring the command line over a prompt.

    The command-line id is used when it is present and valid. Otherwise the
    user is prompted repeatedly until a valid id is typed or an empty line
    is entered. An invalid command-line id falls through to the prompt
    instead of being re-read from the same unchanged ``sys.argv``, which
    could never succeed.

    :return: the user id (int from the command line, stripped string from
             the prompt), or None when the user enters nothing
    '''
    cmd_id = get_id_from_cmd(sys.argv[1:])
    if cmd_id and is_valid_id(cmd_id):
        return cmd_id

    # Iterate instead of recursing so repeated bad input cannot exhaust
    # the interpreter's recursion limit.
    while True:
        typed_id = get_id_from_input()
        if not typed_id:
            return None
        if is_valid_id(typed_id):
            # is_valid_id validates the stripped text, so return the same
            # stripped form that was actually checked.
            return str(typed_id).strip()
63 |
64 |
def get_username(user_id):
    '''
    Fetch a user's share page and extract the user name from it.

    :param user_id: Douyin user id
    :returns: the first pattern match in the page text, or None when the
              page could not be fetched or nothing matched
    '''
    url = "https://www.amemv.com/share/user/%s" % user_id
    try:
        print("\n获取用户名,建立文件夹中...\n")
        response = requests.request("GET", url, headers=HEADERS, timeout=15)
        # NOTE(review): in this copy of the file the pattern is only
        # newline + group + newline; it looks like the HTML tags that
        # originally surrounded the group were stripped during extraction.
        # Restore the pattern from the upstream repository before relying
        # on this function.
        return re.findall('\n(.*?)\n', response.text)[0]
    except (TypeError, IndexError):
        sys.stdout.write("提示: 请确认输入的是用户ID,而不是抖音号或单个视频的id\n")
        # A single None (not a tuple) so that callers' falsy checks work.
        return None
    except requests.exceptions.RequestException:
        sys.stdout.write("连接错误,未能获取正确数据\n")
        return None


def get_all_video_urls(user_id, max_cursor):
    '''
    Collect the source url of every video, following pagination.

    Pages are requested one after another from the module-level ``URL``
    with its ``max_cursor`` query value replaced by the current cursor.
    Each video is appended to the module-level ``VIDEO_URLS`` as
    ``[description, url]`` and ``PAGE`` is advanced per page.

    :param user_id: Douyin user id (kept for interface compatibility;
                    the request itself is built from ``URL``)
    :param max_cursor: cursor of the first page to request
    :return: ``VIDEO_URLS`` with everything collected so far, also when a
             page fails part-way through
    '''
    global PAGE
    cursor = max_cursor
    try:
        while True:
            # Replace whatever cursor the pasted url carries, not only
            # "max_cursor=0"; otherwise a non-zero pasted cursor would make
            # every iteration fetch the same page.
            page_url = re.sub(
                r'max_cursor=\d+', 'max_cursor=%s' % cursor, URL)

            print('\n正在收集第%s页视频地址\n' % (PAGE))
            response = requests.request(
                "GET", page_url, headers=HEADERS, timeout=15)
            if response.status_code != 200:
                print(response.status_code)
                return VIDEO_URLS
            print('第%s页视频地址获取成功\n' % (PAGE))

            data = response.json()
            aweme_list = data['aweme_list']
            if not aweme_list:
                print("请检查输入的url地址,在Devtools里确认Response中aweme_list列表不为空")
                return VIDEO_URLS

            for aweme in aweme_list:
                name = aweme.get('desc')
                video_url = aweme.get('video').get(
                    'play_addr').get('url_list')[0]
                VIDEO_URLS.append([name, video_url])
                print(VIDEO_URLS[-1])

            next_cursor = data.get('max_cursor')
            has_next_page = data['has_more'] is True and next_cursor != 0
            # An unchanged cursor would request the same page again.
            if not has_next_page or next_cursor == cursor:
                return VIDEO_URLS

            sleep(2)
            PAGE += 1
            cursor = next_cursor
    except Exception as e:
        # Best-effort boundary: report the problem and keep what was
        # collected before it happened.
        print('failed,', e)
        return VIDEO_URLS


def _save_video(video_response, path):
    '''
    Stream a response body to ``path``, drawing a progress bar when the
    response announces its size.

    :param video_response: streamed ``requests`` response
    :param path: destination file path
    :return: None
    '''
    try:
        video_size = int(video_response.headers.get('Content-Length', 0))
    except ValueError:
        video_size = 0

    data_length = 0
    with open(path, 'wb') as f:
        for data in video_response.iter_content(chunk_size=1024):
            f.write(data)
            data_length += len(data)
            if video_size > 0:
                # Clamp: decoded content may exceed the announced length.
                done = min(50, int(50 * data_length / video_size))
                sys.stdout.write("\r下载进度: [%s%s]" % (
                    '█' * done, ' ' * (50 - done)))
                sys.stdout.flush()


def download_video(index, username, name, url, retry=3):
    '''
    Download one video into the user's directory, showing progress.

    :param index: ordinal number of the video
    :param username: user name, used as the target directory
    :param name: video name, used as the file name
    :param url: video address
    :param retry: how many more attempts are made after a non-200 answer

    :return: None
    '''
    print("\n下载第%s个视频: %s" % (index, name))
    # Path separators and other characters that are invalid in file names
    # would make open() fail, so replace them.
    safe_name = re.sub(r'[\\/:*?"<>|\r\n]', '_', str(name))
    try:
        while True:
            # Only the redirect target is needed from the first request;
            # the context manager releases the streamed connection.
            with requests.get(
                    url,
                    stream=True,
                    headers=HEADERS,
                    timeout=15,
                    allow_redirects=False) as response:
                video_url = response.headers.get('Location', url)

            # Stream the body so the progress bar reflects the transfer
            # instead of a copy of an already buffered download.
            with requests.get(
                    video_url,
                    stream=True,
                    headers=HEADERS,
                    timeout=15) as video_response:
                if video_response.status_code == 200:
                    _save_video(
                        video_response,
                        '%s/%s.mp4' % (username, safe_name))
                    return None

            # Retry a limited number of times, then give up.
            if not retry:
                return None
            retry -= 1
    except Exception as e:
        # Best-effort boundary: one failed video must not abort the batch.
        print('download failed,', name, e)
        return None


def download_all_videos(videl_urls, username):
    '''
    Download every collected video, pausing between downloads.

    :param videl_urls: iterable of ``[name, url]`` pairs
    :param username: user name, used as directory and fallback file name
    :return: None
    '''
    for index, (name, url) in enumerate(videl_urls, 1):
        if not name:
            # Videos without a description get a generated name.
            name = username + '_' + str(index)
        download_video(index, username, name, url)
        sleep(2)


def main():
    '''
    Entry point: resolve the user, collect the video urls, download them.

    :return: None
    '''
    _id = get_douyin_id()
    if not _id:
        return

    username = get_username(_id)
    if not username:
        return
    makedir(username)

    video_urls = get_all_video_urls(_id, 0)
    if not video_urls:
        return

    download_all_videos(video_urls, username)


if __name__ == '__main__':
    main()