├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── snapshoots └── results.png ├── source.txt ├── tumblr.py └── user.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | 
of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | tumblr_spider is being sponsored by the following tool; please help to support us by taking a look and signing up to a free trial. 
# -*- coding:utf-8 -*-
"""Multi-threaded Tumblr crawler.

Starting from one popular blogger's username, worker threads fetch each
blog's front page, record embedded Tumblr video URLs into ``source.txt``,
harvest usernames from "reblog" links into ``user.txt``, and feed those
usernames back into a shared queue for recursive crawling.

Usage: python tumblr.py username
"""
import signal
import sys
import threading
import queue
import time

import requests
from bs4 import BeautifulSoup

# Substring identifying an <iframe src> as a Tumblr-hosted video.
VIDEO_MARKER = 'https://www.tumblr.com/video'
# Seconds before a stalled HTTP request is abandoned; without a timeout
# a single dead blog would hang a worker thread forever.
REQUEST_TIMEOUT = 10

# Serializes file writes and guards the shared dedup sets in Tumblr.
mutex = threading.Lock()
is_exit = False


class Tumblr(threading.Thread):
    """Worker thread: repeatedly pulls a username off the queue and crawls it."""

    # Class-level (shared across ALL workers) dedup sets.  The original
    # per-instance lists meant each of the 10 threads kept its own "seen"
    # list, so the same users/videos were re-crawled and re-written by
    # every worker.  Sets also make membership tests O(1).  Guarded by
    # the module-level `mutex`.
    total_user = set()
    total_url = set()

    def __init__(self, user_queue):
        # NOTE: parameter renamed from `queue`, which shadowed the
        # stdlib `queue` module; the only caller passes it positionally.
        self.user_queue = user_queue
        # Append mode so results accumulate across restarts.
        self.f_user = open('user.txt', 'a+')
        self.f_source = open('source.txt', 'a+')

        threading.Thread.__init__(self)

    def _claim(self, seen, item):
        """Atomically record *item* in the shared set *seen*.

        Returns True only for the first thread to see this item, so each
        user/URL is processed exactly once across all workers.
        """
        with mutex:
            if not item or item in seen:
                return False
            seen.add(item)
            return True

    def download(self, url):
        """Fetch one blog page; record new video sources and usernames."""
        try:
            res = requests.get(url, timeout=REQUEST_TIMEOUT)
            res.raise_for_status()
        except requests.RequestException as e:
            # Best-effort crawl: an unreachable blog must not kill the
            # worker thread (the original let the exception propagate).
            print(u'请求失败: %s (%s)' % (url, e))
            return

        # Explicit parser: bs4 warns (and may pick different parsers on
        # different machines) when none is given.
        soup = BeautifulSoup(res.text, 'html.parser')

        tmp_source = []
        for iframe in soup.find_all('iframe'):
            source = iframe.get('src', '').strip()
            if VIDEO_MARKER in source and self._claim(Tumblr.total_url, source):
                tmp_source.append(source)
                print(u'新增链接:' + source)

        tmp_user = []
        for user in soup.find_all(class_='reblog-link'):
            username = user.text.strip()
            if self._claim(Tumblr.total_user, username):
                self.user_queue.put(username)
                tmp_user.append(username)
                print(u'新增用户:' + username)

        # One locked batch write per page keeps lines from interleaving
        # between threads.
        with mutex:
            if tmp_user:
                self.f_user.write('\n'.join(tmp_user) + '\n')
            if tmp_source:
                self.f_source.write('\n'.join(tmp_source) + '\n')

    def run(self):
        global is_exit
        try:
            while not is_exit:
                try:
                    # Bounded wait so the loop re-checks is_exit; the
                    # original blocking get() could never observe it.
                    user = self.user_queue.get(timeout=5)
                except queue.Empty:
                    continue
                self.download('http://%s.tumblr.com/' % user)
                time.sleep(2)  # be polite between requests
        finally:
            # Close even if download() raises unexpectedly.
            self.f_user.close()
            self.f_source.close()


def handler(signum, frame):
    """SIGINT/SIGTERM handler: flag workers to stop and exit the main thread."""
    global is_exit
    is_exit = True
    print("receive a signal %d, is_exit = %d" % (signum, is_exit))
    sys.exit(0)


def main():
    if len(sys.argv) < 2:
        print('usage: python tumblr.py username')
        sys.exit(1)  # non-zero: usage error
    username = sys.argv[1]

    NUM_WORKERS = 10
    q = queue.Queue()
    # Seed the crawl with the username given on the command line.
    q.put(username)

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    threads = []
    for _ in range(NUM_WORKERS):
        worker = Tumblr(q)
        # Attribute form: Thread.setDaemon() is deprecated.  Daemon
        # workers die with the main thread on sys.exit() in handler().
        worker.daemon = True
        worker.start()
        threads.append(worker)

    # Keep the main thread alive so it can receive signals.  The original
    # `while True` with an inner `break` never exited (the break only left
    # the for-loop), and Thread.isAlive() was removed in Python 3.9 —
    # is_alive() is the supported spelling.
    while any(t.is_alive() for t in threads):
        time.sleep(1)


if __name__ == '__main__':
    main()