├── README.md
├── .gitignore
└── tumblr.py


/README.md:
--------------------------------------------------------------------------------
 1 | # python3 tumblr多线程爬虫
 2 | 给定 tumblr 用户的 username ，下载图片以及视频资源。  
 3 | 声明一下:这是一个正经的爬虫，所爬取的资源跟你填入的 username 有关系，请勿随意开车。  
 4 | 另外，由于tumblr被墙，请使用代理爬取。
 5 | 
 6 | # 资源存储
 7 | >图片存放在运行脚本时当前工作目录下的'./etc/img/'中，修改imgDir即可修改文件存放位置。  
 8 | >视频存放在运行脚本时当前工作目录下的'./etc/mp4/'中，修改videoDir即可修改文件存放位置。
 9 | 
10 | # 运行
11 | >python tumblr.py username1[,username2,username3...]
12 | 
13 | # import
14 | >import tumblr  
15 | >tumblr.tumblr_id('username1,username2,username3...')
16 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 


--------------------------------------------------------------------------------
/tumblr.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | import time, re, os, urllib.request,sys
 3 | from threading import Lock, current_thread, Thread
 4 | 
 5 | def main():
 6 |     username = []
 7 |     for i in range(1,len(sys.argv)):
 8 |         username.append(sys.argv[i])
 9 |         print(' '.join(username))
10 |     tumblr_id(username)
def download(args):
    """Download the photos and videos found in one Tumblr blog's JSON feed.

    The feed is requested once (``start=0&num=200``) and scanned with regular
    expressions. At most 100 photos are saved under ``imgDir`` and every
    matched video is saved under ``videoDir``; both paths are relative to the
    current working directory. A file that fails to download is reported and
    skipped so the remaining files are still fetched.

    Args:
        args: Tumblr username, used as the blog's sub-domain.
    """
    print('At', time.ctime(), '开始下载%s' % args)
    url = 'http://%s.tumblr.com/api/read/json?start=0&num=200' % args
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Mobile Safari/537.36')
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as response:
        data = response.read().decode('UTF-8')
    print('%s start at ' % args, time.ctime())
    img = r'photo-url-500":"(.{80,120}500.jpg)'  # regex matching photo URLs
    video = r'source src=\\"(.{80,130})" type.*video-player-500'  # regex matching video URLs
    imgDir = "./etc/img/"
    videoDir = "./etc/mp4/"

    # Photo download: only the first 100 matches are fetched.
    for match in re.findall(img, data)[:100]:
        t = match.replace('\\', '')  # drop the JSON escape backslashes
        # Derive the local file name: strip size/revision markers and swap
        # the "tumblr" prefix for the username.
        imgfilename = t.split("/")[-1].replace('_500', '').replace('_r1', '').replace('tumblr', '%s' % args)
        print('At', time.ctime(), 'Downloadiing %s from userID %s' % (imgfilename, args))
        _fetch_file(t, imgDir, imgfilename)
    print('图片已完成下载%s' % args)

    # Video download.
    for match in re.findall(video, data):
        d = match.replace('\\', '')
        videofilename = d.split("/")[-1] + '.mp4'
        print('At', time.ctime(), 'Downloadiing %s from %s' % (videofilename, args))
        if _fetch_file(d, videoDir, videofilename):
            print('视频已完成下载%s' % args)


def _fetch_file(url, directory, filename):
    """Save *url* as *filename* inside *directory*; return True on success."""
    # exist_ok avoids the check-then-create race between download threads.
    os.makedirs(directory, exist_ok=True)
    try:
        urllib.request.urlretrieve(url, os.path.join(directory, filename))
    except OSError as exc:  # URLError/HTTPError are OSError subclasses
        print('At', time.ctime(), 'Failed to download %s: %s' % (url, exc))
        return False
    return True
44 | 
def tumblr_id(*args):
    """Download several Tumblr blogs concurrently, one thread per username.

    Args:
        *args: Usernames in any mix of forms: a single username, a
            comma-separated string such as ``'user1,user2'``, or a
            list/tuple of such strings. Surrounding whitespace is ignored
            and empty entries are skipped.

    Returns:
        None. The call blocks until every download thread has finished.
    """
    usernames = _collect_usernames(args)
    threads = [Thread(target=download, args=(name,)) for name in usernames]
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()


def _collect_usernames(values):
    """Flatten nested sequences and comma-separated strings into clean names."""
    names = []
    for value in values:
        if isinstance(value, (list, tuple, set)):
            names.extend(_collect_usernames(value))
            continue
        for part in str(value).split(','):
            part = part.strip()
            if part:
                names.append(part)
    return names
56 | 
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == '__main__':
    main()
59 | 


--------------------------------------------------------------------------------