├── .gitignore ├── LICENSE ├── README.md ├── javbus.py └── javbus_gevent.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Yorking Yuan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 番号磁链获取器 2 | 获取Javbus上的数据并保存到MongoDB数据库 3 | 4 | ### 依赖库 5 | - requests 6 | - BeautifulSoup4 7 | - progress 8 | - re 9 | - math 10 | - random 11 | - pymongo 12 | 13 | ### 使用方法 14 | 1. `git clone https://github.com/MyFaith/JavbusGetter.git` 15 | 2. `pip install BeautifulSoup4 requests pymongo` 16 | 3. 修改javbus.py中的服务器配置 `mongo = MongoClient(host='192.168.199.217')` 17 | 4. `python javbus.py -page 10 -thread 4 -type 1` (page 页数 thread 启用线程数 type 1有码 2无码) 18 | 19 | ### 运行结果 20 | ![1.png](https://ooo.0o0.ooo/2017/03/04/58ba86e297b31.png) 21 | -------------------------------------------------------------------------------- /javbus.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import requests, re, math, random, time, sys 4 | from bs4 import BeautifulSoup 5 | from pymongo import MongoClient 6 | import threading 7 | import queue 8 | import argparse 9 | 10 | mongo = MongoClient(host='192.168.199.217') 11 | db = mongo.javbus 12 | mutex = threading.Lock() 13 | 14 | class Javbus(threading.Thread): 15 | def __init__(self, page_queue): 16 | self.type = '' 17 | self.page_queue = page_queue 18 | self.avs_queue = queue.Queue() 19 | self.s = requests.Session() 20 | self.header = { 21 | 'Referer': 'http://www.javbus.com', 22 | 'Cookie': 'existmag=all', 23 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' 24 | } 25 | threading.Thread.__init__(self) 26 | 27 | def run(self): 28 | while not self.page_queue.empty(): 29 | url = self.page_queue.get() 30 | # 判断是获取什么类型 31 | if url.find('uncensored') != -1: 32 | self.type = 2 33 | else: 34 | self.type = 1 35 | self.get_datas(url) 36 | time.sleep(2) 37 | 38 | def get_datas(self, url): 39 | avs = [] 40 | # 获取某一页的HTML 41 | print('正在获取 %s 的数据...' %url) 42 | html = self.s.get(url, headers=self.header).text 43 | soup = BeautifulSoup(html, 'html.parser') 44 | divs = soup.find_all(class_='item') 45 | # 获取基本数据 46 | for item in divs: 47 | av = item.find(class_='photo-info') 48 | title = item.find(class_='photo-frame').img['title'] 49 | fh = av.span.date.next 50 | time = av.span.date.next.next.next.next 51 | link = 'https://www.javbus.com/%s' %fh 52 | info = { 53 | 'title': title, 54 | 'fh': fh, 55 | 'time': time, 56 | 'link': link 57 | } 58 | avs.append(info) 59 | # 获取磁链 60 | for item in avs: 61 | url = item['link'] 62 | last = url.replace('.', '-') 63 | html = self.s.get(url, headers=self.header).text 64 | # 由于磁力链接是ajax方式获取,所以获取数据,构成ajax链接 65 | gid = re.search(r'var gid = (\d*?);', html).group(1) 66 | lang = 'zh' 67 | uc = re.search(r'var uc = (\d*?);', html).group(1) 68 | img = re.search(r"var img = '(.*?)';", html).group(1) 69 | floor = math.floor(random.random() * 1e3 + 1) 70 | # 请求数据 71 | ajax_url = 'https://www.javbus.com/ajax/uncledatoolsbyajax.php?gid=%s&lang=%s&img=%s&uc=%s&floor=%s' %(gid, lang, img, uc, floor) 72 | ajax_result = self.s.get(ajax_url, headers=self.header) 73 | soup = BeautifulSoup(ajax_result.text, 'html.parser') 74 | try: 75 | magnet = soup.find('td').a['href'] 76 | except Exception: 77 | magnet = 'unissued' 78 | # append 79 | item['img'] = img 80 | item['magnet'] = magnet 81 | print('[取到数据]\n标题:%s\n番号:%s\n时间:%s\n图片:%s\n链接:%s\n磁链:%s\n' %(item['title'], item['fh'], item['time'], item['img'], item['link'], item['magnet'])) 82 | self.avs_queue.put(item) 83 | # 存储数据 84 | mutex.acquire() 85 | while not self.avs_queue.empty(): 86 | item = self.avs_queue.get() 87 | # 判断是获取什么类型 88 | if self.type == 1: 89 | db.censored.insert({ 90 | 'title': item['title'], 91 | 'fh': item['fh'], 92 | 'time': item['time'], 93 | 'image': item['img'], 94 | 'link': item['link'], 95 | 'magnet': item['magnet'] 96 | }) 97 | elif self.type == 2: 98 | db.uncensored.insert({ 99 | 'title': item['title'], 100 | 'fh': item['fh'], 101 | 'time': item['time'], 102 | 'image': item['img'], 103 | 'link': item['link'], 104 | 'magnet': item['magnet'] 105 | }) 106 | print('[写入数据库]%s' %item['title']) 107 | mutex.release() 108 | 109 | def main(max_page, thread_num, av_type): 110 | # 构建页面队列 111 | page_queue = queue.Queue() 112 | # 判断类型 113 | if av_type == 1: 114 | url = 'http://www.javbus.com/page/page_num' 115 | elif av_type == 2: 116 | url = 'http://www.javbus.com/uncensored/page/page_num' 117 | else: 118 | print('类型不正确, 1: 有码 2: 无码') 119 | sys.exit(0) 120 | for page in range(1, max_page): 121 | page_queue.put(url.replace('page_num', str(page))) 122 | threads = [] 123 | # 开启4个线程 124 | for i in range(thread_num): 125 | javbus = Javbus(page_queue) 126 | javbus.setDaemon(True) 127 | javbus.start() 128 | threads.append(javbus) 129 | # 判断 130 | # while True: 131 | # for i in threads: 132 | # if not i.isAlive(): 133 | # break 134 | # time.sleep(1) 135 | for i in threads: 136 | i.join() 137 | 138 | if __name__ == '__main__': 139 | parser = argparse.ArgumentParser('javbus') 140 | parser.add_argument('-page', dest='page', default=5, type=int, help='获取的页数') 141 | parser.add_argument('-thread', dest='thread', default=4, type=int, help='启动的线程数') 142 | parser.add_argument('-type', dest='type', default=1, type=int, help='1: 有码 2: 无码') 143 | args = parser.parse_args() 144 | main(args.page, args.thread, args.type) 145 | -------------------------------------------------------------------------------- /javbus_gevent.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey; monkey.patch_all() 2 | import gevent 3 | import requests 4 | import time 5 | import queue 6 | from pyquery import PyQuery 7 | from mongoengine import * 8 | import re 9 | import math 10 | import random 11 | 12 | connect('javbus', host='myfaith.io', port=27017) 13 | 14 | class Avs(Document): 15 | title = StringField() 16 | fh = StringField() 17 | time = StringField() 18 | image = StringField() 19 | link = StringField() 20 | magnet = StringField() 21 | type = StringField() 22 | 23 | def fetch(pageQueue, type): 24 | avs_queue = queue.Queue() 25 | s = requests.Session() 26 | header = { 27 | 'Referer': 'http://www.javbus.com', 28 | 'Cookie': 'existmag=all', 29 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' 30 | } 31 | # Run 32 | while not pageQueue.empty(): 33 | avs = [] 34 | avs_queue = queue.Queue() 35 | url = pageQueue.get() 36 | print('正在获取 %s 的数据...' %url) 37 | html = s.get(url, headers=header).text 38 | pq = PyQuery(html) 39 | # Get FH 40 | for item in pq('.item').items(): 41 | av = pq(item).find('.photo-info span') 42 | title = pq(item).find('.photo-frame img').attr('title') 43 | fh = pq(av).find('date').eq(0).text() 44 | time = pq(av).find('date').eq(1).text() 45 | link = 'https://www.javbus.com/%s' %fh 46 | info = { 47 | 'title': title, 48 | 'fh': fh, 49 | 'time': time, 50 | 'link': link 51 | } 52 | avs.append(info) 53 | # Get Magnet 54 | for item in avs: 55 | url = item['link'] 56 | html = s.get(url, headers=header).text 57 | # 由于磁力链接是ajax方式获取,所以获取数据,构成ajax链接 58 | gid = re.search(r'var gid = (\d*?);', html).group(1) 59 | lang = 'zh' 60 | uc = re.search(r'var uc = (\d*?);', html).group(1) 61 | img = re.search(r"var img = '(.*?)';", html).group(1) 62 | floor = math.floor(random.random() * 1e3 + 1) 63 | # 请求数据 64 | ajax_url = 'https://www.javbus.com/ajax/uncledatoolsbyajax.php?gid=%s&lang=%s&img=%s&uc=%s&floor=%s' %(gid, lang, img, uc, floor) 65 | ajax_result = s.get(ajax_url, headers=header) 66 | pq = PyQuery(ajax_result.text) 67 | try: 68 | magnet = pq('td a').attr('href') 69 | except Exception: 70 | magnet = 'unissued' 71 | # append 72 | item['img'] = img 73 | item['magnet'] = magnet 74 | item['type'] = type 75 | print('[取到数据]\n标题:%s\n番号:%s\n时间:%s\n图片:%s\n链接:%s\n磁链:%s\n' %(item['title'], item['fh'], item['time'], item['img'], item['link'], item['magnet'])) 76 | avs_queue.put(item) 77 | # Sve Data 78 | while not avs_queue.empty(): 79 | item = avs_queue.get() 80 | # 判断是获取什么类型 81 | av = Avs( 82 | title=item['title'], 83 | fh=item['fh'], 84 | time=item['time'], 85 | image=item['img'], 86 | link=item['link'], 87 | magnet=item['magnet'], 88 | type=str(item['type']) 89 | ) 90 | av.save() 91 | print('[写入数据库]%s' %item['title']) 92 | 93 | def main(minPage=1, maxPage=10, type=1): 94 | pageQueue = queue.Queue() 95 | if type == 1: 96 | url = 'http://www.javbus.com/page/{pageNum}' 97 | else: 98 | url = 'http://www.javbus.com/uncensored/page/{pageNum}' 99 | for page in range(minPage, maxPage): 100 | pageQueue.put(url.format(pageNum=str(page))) 101 | gevent.joinall([gevent.spawn(fetch, pageQueue, type) for i in range(maxPage)]) 102 | 103 | if __name__ == '__main__': 104 | main(1, 50, 0) 105 | --------------------------------------------------------------------------------