├── README.md ├── .gitignore └── tumblr.py /README.md: -------------------------------------------------------------------------------- 1 | # python3 tumblr多线程爬虫 2 | 给定 tumblr 用户的 username ,下载图片以及视频资源。 3 | 声明一下:这是一个正经的爬虫,所爬取的资源跟你填入的 username 有关系,请勿随意开车。 4 | 另外,由于tumblr被墙,请使用代理爬取。 5 | 6 | # 资源存储 7 | >图片存放在tumblr.py所在文件夹下的'/etc/img'中,修改imgDir即可修改文件存放位置。 8 | >视频存放在tumblr.py所在文件夹下的'/etc/mp4'中,修改videoDir即可修改文件存放位置。 9 | 10 | # 运行 11 | >python tumblr.py username1[,username2,username3...] 12 | 13 | # import 14 | >import tumblr 15 | >tumblr.tumblr_id('username1,username2,username3...') 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
# -*- coding:utf-8 -*-
"""Multi-threaded Tumblr downloader.

For each username, request the blog's JSON feed, extract photo and video
URLs with regular expressions, and save the photos under ``./etc/img/`` and
the videos under ``./etc/mp4/``. One worker thread is started per username.
"""
import os
import re
import sys
import time
import urllib.request
from threading import Lock, current_thread, Thread

USER_AGENT = (
    'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/55.0.2883.75 Mobile Safari/537.36'
)

# Regex matching photo URLs (still backslash-escaped) in the feed text.
IMG_PATTERN = re.compile(r'photo-url-500":"(.{80,120}500.jpg)')
# Regex matching video source URLs (still backslash-escaped) in the feed text.
VIDEO_PATTERN = re.compile(
    r'source src=\\"(.{80,130})" type.*video-player-500')


def main():
    """Download media for every username given on the command line."""
    usernames = sys.argv[1:]
    if not usernames:
        # Nothing to do; previously this fell through to a download for ''.
        print('usage: python tumblr.py username1[,username2,username3...]')
        return
    print(' '.join(usernames))
    tumblr_id(usernames)


def parse_usernames(args):
    """Flatten *args* into a list of clean usernames.

    Each item may be a comma-separated string or a list/tuple/set of such
    strings (nested to any depth). Surrounding whitespace is stripped and
    empty names are dropped; order is preserved.

    Args:
        args: Iterable of strings and/or containers of strings.

    Returns:
        A list of non-empty username strings.
    """
    names = []
    for item in args:
        if isinstance(item, (list, tuple, set, frozenset)):
            names.extend(parse_usernames(item))
        else:
            names.extend(part.strip() for part in str(item).split(','))
    return [name for name in names if name]


def image_filename(url, username):
    """Build the local file name for a photo URL.

    Takes the last path segment of *url*, removes the ``_500`` and ``_r1``
    markers and replaces ``tumblr`` with *username*.
    """
    basename = url.split("/")[-1]
    basename = basename.replace('_500', '').replace('_r1', '')
    return basename.replace('tumblr', '%s' % username)


def _fetch_feed(username):
    """Return the decoded text of *username*'s JSON feed."""
    url = 'http://%s.tumblr.com/api/read/json?start=0&num=200' % username
    req = urllib.request.Request(url)
    req.add_header('User-Agent', USER_AGENT)
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('UTF-8')


def _save(url, dest):
    """Fetch *url* into the file *dest*; return True on success.

    A failed transfer is reported and skipped so that one bad URL does not
    abort the remaining downloads of the same blog.
    """
    directory = os.path.dirname(dest)
    if directory:
        # exist_ok avoids the check-then-create race between worker threads.
        os.makedirs(directory, exist_ok=True)
    try:
        urllib.request.urlretrieve(url, dest)
    except (OSError, ValueError) as err:
        print('At', time.ctime(), 'Failed to download %s: %s' % (url, err))
        return False
    return True


def download(args, max_images=100):
    """Download the photos and videos of one Tumblr blog.

    Args:
        args: The blog's username (the ``<name>`` in ``<name>.tumblr.com``).
        max_images: Non-negative cap on the number of photos fetched; only
            the first ``max_images`` matches in the feed are downloaded.

    Raises:
        Any error raised while requesting or decoding the feed itself.
        Failures of individual photo/video transfers are printed and skipped.
    """
    print('At', time.ctime(), '开始下载%s' % args)
    data = _fetch_feed(args)
    print('%s start at ' % args, time.ctime())

    # Photo download.
    imgDir = "./etc/img/"
    for match in IMG_PATTERN.findall(data)[:max_images]:
        photo_url = match.replace('\\', '')  # drop the escaping backslashes
        imgfilename = image_filename(photo_url, args)
        print('At', time.ctime(),
              'Downloadiing %s from userID %s' % (imgfilename, args))
        _save(photo_url, "%s%s" % (imgDir, imgfilename))
    print('图片已完成下载%s' % args)

    # Video download.
    videoDir = "./etc/mp4/"
    for match in VIDEO_PATTERN.findall(data):
        video_url = match.replace('\\', '')  # drop the escaping backslashes
        videofilename = video_url.split("/")[-1] + '.mp4'
        print('At', time.ctime(),
              'Downloadiing %s from %s' % (videofilename, args))
        _save(video_url, "%s%s" % (videoDir, videofilename))
    print('视频已完成下载%s' % args)


def tumblr_id(*args):
    """Download several blogs concurrently, one thread per username.

    Accepts usernames as separate arguments, as comma-separated strings, or
    as lists of either, e.g. ``tumblr_id('a,b')``, ``tumblr_id('a', 'b')``
    or ``tumblr_id(['a', 'b'])``. Returns once every worker has finished.
    With no usable username no thread is started.
    """
    usernames = parse_usernames(args)
    threads = [Thread(target=download, args=(name,)) for name in usernames]
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()


if __name__ == '__main__':
    main()