├── images
    ├── logo.jpg
    └── pay.jpg
├── requirements.txt
├── README.md
├── LICENSE
├── utils.py
└── douyin_spider.py


/images/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangke19/TikTokSpider/HEAD/images/logo.jpg


--------------------------------------------------------------------------------
/images/pay.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangke19/TikTokSpider/HEAD/images/pay.jpg


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | appnope==0.1.0
 2 | backcall==0.1.0
 3 | certifi==2018.8.24
 4 | chardet==3.0.4
 5 | decorator==4.3.0
 6 | idna==2.7
 7 | ipython==6.5.0
 8 | ipython-genutils==0.2.0
 9 | jedi==0.12.1
10 | parso==0.3.1
11 | pexpect==4.6.0
12 | pickleshare==0.7.4
13 | prompt-toolkit==1.0.15
14 | ptyprocess==0.6.0
15 | Pygments==2.2.0
16 | requests==2.20.0
17 | simplegeneric==0.8.1
18 | six==1.11.0
19 | tqdm==4.25.0
20 | traitlets==4.3.2
21 | urllib3==1.24.2
22 | wcwidth==0.1.7
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## 抖音爬虫
 2 | 
 3 | 虚拟环境的配置与依赖包的安装（`pip install -r requirements.txt`）不在此赘述。
 4 | 
 5 | <img src="https://github.com/huangke19/TikTokSpider/blob/master/images/logo.jpg" width="400px" />
 6 | 
 7 | 
 8 | 
 9 | ## 使用方法
10 | 
11 | 
12 | 
13 | - 抖音主页 - 分享 - 复制链接 - 发送到电脑
14 | - 点击链接 - 将网址复制后使用浏览器访问
15 | - 打开Devtools - 选择手机预览模式
16 | - 点击"**作品**"，找到加载视频时对应的url
17 | - 运行脚本 `python douyin_spider.py`，依次输入 url、user_agent 和用户 ID（用户 ID 可在第 2 步的网址中找到）
18 | 
19 | 
20 | 
21 | ## 完整演示
22 | 
23 | https://www.bilibili.com/video/BV1Pg4y187sy
24 | 
25 | 
26 | 
27 | 
28 | 
29 | ## If It Helps 
30 | 
31 | <img src="https://github.com/huangke19/TikTokSpider/blob/master/images/pay.jpg" width="300px" />
32 | 
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 huangke
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """ Prompt, validation, directory and command-line helpers for the Douyin spider """
 5 | 
 6 | # 导入标准库
 7 | import os
 8 | import re
 9 | import sys
10 | from argparse import ArgumentParser
11 | 
12 | 
def input_user_agent():
    """Ask the user for the user-agent of the current browser request.

    :return:    the entered user-agent with surrounding whitespace removed
    """
    # Douyin rejects requests whose user-agent differs from the one that
    # produced the copied URL, so the value is taken straight from Devtools.
    print("\n抖音要求user_agent和当前请求匹配，请输入当前请求中的user_agent值\n")
    return input("Dev_tools中的user_agent: ").strip()
17 | 
18 | 
def input_request_url():
    """Ask the user for the request URL that loads the user's videos.

    :return:    the entered URL with surrounding whitespace removed
    """
    print("\n请输入加载作品对应的url，注意检查返回值中aweme_list是否为空\n")
    return input("Dev_tools中的url: ").strip()
23 | 
24 | 
def get_id_from_cmd(cmd_args):
    """Read the user id from the command-line arguments.

    :param cmd_args:    command-line arguments (without the program name)
    :return:            the value given via ``--uid``, or None when it is
                        absent or falsy
    """
    parsed = parse_args(cmd_args)
    if parsed and parsed.user_id:
        return parsed.user_id
    return None
40 | 
41 | 
def get_id_from_input():
    """Prompt the user for the Douyin user id to crawl.

    :return:    the raw text typed by the user (not stripped, not validated)
    """
    return input('\n请输入你要爬取的抖音用户id: ')
50 | 
51 | 
def is_valid_id(_id):
    """Tell whether *_id* looks like a Douyin user id (digits only).

    Surrounding whitespace is ignored. A hint is written to stdout when a
    non-empty value has the wrong format; falsy values are rejected silently.

    :param _id:  user_id (str or int)
    :return:     bool
    """
    if _id:
        if re.match(r'^\d+$', str(_id).strip()) is not None:
            return True
        sys.stdout.write("请输入正确格式的抖音id\n")
    return False
65 | 
66 | 
def makedir(name):
    """Create the directory *name* unless it already exists.

    :param name:    directory path (the spider passes the user's nickname)
    :return:        None
    :raises OSError: when the path cannot be created, e.g. it already exists
                     as a regular file
    """
    # exist_ok=True makes the call idempotent without the check-then-create
    # race of isdir() + mkdir(); makedirs also creates missing parents.
    os.makedirs(name, exist_ok=True)
78 | 
79 | 
def parse_args(args):
    """Parse the spider's command-line arguments.

    :param args:    list of argument strings (without the program name)
    :return:        argparse Namespace whose ``user_id`` attribute holds the
                    integer given via ``--uid``, or None when it is absent
    """
    cli = ArgumentParser()
    cli.add_argument('--uid', type=int, dest='user_id', help='用户的抖音id')
    namespace = cli.parse_args(args)
    return namespace
90 | 


--------------------------------------------------------------------------------
/douyin_spider.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | ''' 一只抖音小爬虫 '''
  5 | 
  6 | #############################
  7 | #
  8 | #  Author: Huang Ke
  9 | #  Email: huangkwell@163.com
 10 | #  微信: 760208296
 11 | #  复活时间: 2020/4/6
 12 | #
 13 | #############################
 14 | 
 15 | 
 16 | # 导入标准库
 17 | import re
 18 | import sys
 19 | from time import sleep
 20 | 
 21 | # 导入第三方库
 22 | import requests
 23 | 
 24 | # Local module imports
 25 | from utils import (
 26 |     get_id_from_cmd,
 27 |     is_valid_id,
 28 |     get_id_from_input,
 29 |     input_user_agent,
 30 |     input_request_url,
 31 |     makedir
 32 | )
 33 | 
 34 | VIDEO_URLS, PAGE = [], 1
 35 | 
 36 | URL = input_request_url()
 37 | HEADERS = {
 38 |     'user-agent': input_user_agent()
 39 | }
 40 | 
 41 | 
 42 | def get_douyin_id():
 43 |     '''
 44 |     获取抖音用户id
 45 | 
 46 |     :return:    user_id
 47 |     '''
 48 |     _id1 = get_id_from_cmd(sys.argv[1:])
 49 |     if _id1:
 50 |         if is_valid_id(_id1):
 51 |             return _id1
 52 |         else:
 53 |             return get_douyin_id()
 54 | 
 55 |     _id2 = get_id_from_input()
 56 |     if _id2:
 57 |         if is_valid_id(_id2):
 58 |             return _id2
 59 |         else:
 60 |             return get_douyin_id()
 61 | 
 62 |     return None
 63 | 
 64 | 
 65 | def get_username(user_id):
 66 |     '''
 67 |     获取用户名
 68 | 
 69 |     :param user_id:     用户抖音id
 70 |     :returns:           username
 71 |     '''
 72 |     url = "https://www.amemv.com/share/user/%s" % user_id
 73 |     headers = HEADERS
 74 |     try:
 75 |         print("\n获取用户名，建立文件夹中...\n")
 76 |         response = requests.request("GET", url, headers=headers)
 77 |         name = re.findall('<p class="nickname">(.*?)</p>', response.text)[0]
 78 |         return name
 79 |     except (TypeError, IndexError):
 80 |         sys.stdout.write("提示： 请确认输入的是用户ID，而不是抖音号或单个视频的id\n")
 81 |         return None, None
 82 |     except requests.exceptions:
 83 |         sys.stdout.write("连接错误，未能获取正确数据\n")
 84 |         return None, None
 85 | 
 86 | 
 87 | def get_all_video_urls(user_id, max_cursor):
 88 |     '''
 89 |     递归获取用户所有视频的源地址url
 90 | 
 91 |     :param user_id:     用户抖音id
 92 |     :param max_cursor:  下一页地址游标
 93 | 
 94 |     :return:            urls
 95 |     '''
 96 | 
 97 |     url = re.sub('max_cursor=0', 'max_cursor=%s' % max_cursor, URL, )
 98 | 
 99 |     try:
100 |         global PAGE
101 |         print('\n正在收集第%s页视频地址\n' % (PAGE))
102 |         response = requests.request("GET", url, headers=HEADERS)
103 |         print('第%s页视频地址获取成功\n' % (PAGE))
104 | 
105 |         if response.status_code == 200:
106 |             data = response.json()
107 |             l = data['aweme_list']
108 |             if l == []:
109 |                 print("请检查输入的url地址，在Devtools里确认Response中aweme_list列表不为空")
110 |                 return VIDEO_URLS
111 | 
112 |             for li in data['aweme_list']:
113 |                 name = li.get('desc')
114 |                 url = li.get('video').get('play_addr').get('url_list')[0]
115 |                 VIDEO_URLS.append([name, url])
116 |                 print(VIDEO_URLS[-1])
117 | 
118 |             # 下拉获取更多视频
119 |             if data['has_more'] is True and data.get('max_cursor') != 0:
120 |                 sleep(2)
121 |                 PAGE += 1
122 |                 return get_all_video_urls(
123 |                     user_id, data.get('max_cursor'))
124 |             else:
125 |                 return VIDEO_URLS
126 |         else:
127 |             print(response.status_code)
128 |             return
129 |     except Exception as e:
130 |         print('failed,', e)
131 |         return VIDEO_URLS
132 | 
133 | 
def download_video(index, username, name, url, retry=3):
    '''
    Download one video into ``<username>/<name>.mp4`` and show the progress.

    The play address answers with a redirect; its ``Location`` header is the
    real file URL. The file is streamed to disk in chunks.

    :param index:       ordinal number of the video (for messages only)
    :param username:    user name, used as the target directory
    :param name:        video title, used as the file name
    :param url:         play address of the video
    :param retry:       how many times to retry after a non-200 response

    :return:            None
    '''
    # Path separators in the title would point outside the user directory.
    file_path = '%s/%s.mp4' % (username, re.sub(r'[\\/]', '_', str(name)))

    # One initial attempt plus `retry` retries; a negative value means none.
    for _ in range(max(retry, 0) + 1):
        print("\n下载第%s个视频: %s" % (index, name))
        try:
            # Only the headers of the first response are needed, so it is
            # closed right after reading the redirect target.
            with requests.get(
                    url,
                    stream=True,
                    headers=HEADERS,
                    timeout=15,
                    allow_redirects=False) as response:
                # Fall back to the play address itself if no redirect is sent.
                video_url = response.headers.get('Location', url)

            # stream=True keeps the file out of memory and makes the progress
            # bar reflect the actual transfer.
            with requests.get(
                    video_url,
                    stream=True,
                    headers=HEADERS,
                    timeout=15) as video_response:
                if video_response.status_code != 200:
                    continue

                # The header may be absent; 0 disables the progress bar.
                video_size = int(
                    video_response.headers.get('Content-Length', 0))
                with open(file_path, 'wb') as f:
                    data_length = 0
                    for data in video_response.iter_content(chunk_size=1024):
                        data_length += len(data)
                        f.write(data)
                        if video_size > 0:
                            done = int(50 * data_length / video_size)
                            sys.stdout.write("\r下载进度: [%s%s]" % (
                                '█' * done, ' ' * (50 - done)))
                            sys.stdout.flush()
                return None
        except Exception as e:
            # Best effort: report the failure and move on to the next video.
            print('download failed,', name, e)
            return None
    return None
181 | 
182 | 
def download_all_videos(videl_urls, username):
    """
    Download every collected video, pausing two seconds between downloads.

    :param videl_urls:  sequence of ``[name, url]`` items
    :param username:    user name; used as the target directory and as the
                        prefix of generated file names
    :return:            None
    """
    for index, item in enumerate(videl_urls, 1):
        name, url = item[0], item[1]
        # Videos without a description (empty or missing) get a generated
        # name so they do not all end up in the same file.
        if not name:
            name = username + '_' + str(index)
        download_video(index, username, name, url)
        sleep(2)
195 | 
196 | 
def main():
    '''
    Entry point: resolve the user, collect the video URLs, download them.

    Stops early when no user id is given, the user name cannot be
    determined, or no video URL was collected.

    :return: None
    '''
    _id = get_douyin_id()
    if not _id:
        return

    username = get_username(_id)
    # Failure is reported as a falsy or non-string value; only a real,
    # non-empty name may be used as a directory name.
    if not username or not isinstance(username, str):
        return
    makedir(username)

    video_urls = get_all_video_urls(_id, 0)
    if not video_urls:
        return

    download_all_videos(video_urls, username)


if __name__ == '__main__':
    main()
219 | 


--------------------------------------------------------------------------------