├── .gitignore
├── LICENSE
├── README.md
├── javbus.py
└── javbus_gevent.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Yorking Yuan
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 番号磁链获取器
 2 | 获取Javbus上的数据并保存到MongoDB数据库
 3 | 
 4 | ### 依赖库
 5 | - requests
 6 | - BeautifulSoup4
 7 | - progress
 8 | - re
 9 | - math
10 | - random
11 | - pymongo
12 | 
13 | ### 使用方法
14 | 1. `git clone https://github.com/MyFaith/JavbusGetter.git`
15 | 2. `pip install BeautifulSoup4 requests pymongo`
16 | 3. 修改javbus.py中的服务器配置 `mongo = MongoClient(host='192.168.199.217')`
17 | 4. `python javbus.py -page 10 -thread 4 -type 1` (page 页数 thread 启用线程数 type 1有码 2无码)
18 | 
19 | ### 运行结果
20 | ![1.png](https://ooo.0o0.ooo/2017/03/04/58ba86e297b31.png)
21 | 


--------------------------------------------------------------------------------
/javbus.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | import requests, re, math, random, time, sys
  4 | from bs4 import BeautifulSoup
  5 | from pymongo import MongoClient
  6 | import threading
  7 | import queue
  8 | import argparse
  9 | 
# MongoDB client created at import time; the README instructs users to edit
# this host so it points at their own server.
mongo = MongoClient(host='192.168.199.217')
# The "javbus" database; Javbus.get_datas writes into its censored/uncensored
# collections.
db = mongo.javbus
# Lock held by Javbus.get_datas while a worker writes a page's records.
mutex = threading.Lock()
 13 | 
 14 | class Javbus(threading.Thread):
 15 |     def __init__(self, page_queue):
 16 |         self.type = ''
 17 |         self.page_queue = page_queue
 18 |         self.avs_queue = queue.Queue()
 19 |         self.s = requests.Session()
 20 |         self.header = {
 21 |             'Referer': 'http://www.javbus.com',
 22 |             'Cookie': 'existmag=all',
 23 |             'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
 24 |         }
 25 |         threading.Thread.__init__(self)
 26 | 
 27 |     def run(self):
 28 |         while not self.page_queue.empty():
 29 |             url = self.page_queue.get()
 30 |             # 判断是获取什么类型
 31 |             if url.find('uncensored') != -1:
 32 |                 self.type = 2
 33 |             else:
 34 |                 self.type = 1
 35 |             self.get_datas(url)
 36 |             time.sleep(2)
 37 | 
 38 |     def get_datas(self, url):
 39 |         avs = []
 40 |         # 获取某一页的HTML
 41 |         print('正在获取 %s 的数据...' %url)
 42 |         html = self.s.get(url, headers=self.header).text
 43 |         soup = BeautifulSoup(html, 'html.parser')
 44 |         divs = soup.find_all(class_='item')
 45 |         # 获取基本数据
 46 |         for item in divs:
 47 |             av = item.find(class_='photo-info')
 48 |             title = item.find(class_='photo-frame').img['title']
 49 |             fh = av.span.date.next
 50 |             time = av.span.date.next.next.next.next
 51 |             link = 'https://www.javbus.com/%s' %fh
 52 |             info = {
 53 |                 'title': title,
 54 |                 'fh': fh,
 55 |                 'time': time,
 56 |                 'link': link
 57 |             }
 58 |             avs.append(info)
 59 |         # 获取磁链 
 60 |         for item in avs:
 61 |             url = item['link']
 62 |             last = url.replace('.', '-')
 63 |             html = self.s.get(url, headers=self.header).text
 64 |             # 由于磁力链接是ajax方式获取，所以获取数据，构成ajax链接
 65 |             gid = re.search(r'var gid = (\d*?);', html).group(1)
 66 |             lang = 'zh'
 67 |             uc = re.search(r'var uc = (\d*?);', html).group(1)
 68 |             img = re.search(r"var img = '(.*?)';", html).group(1)
 69 |             floor = math.floor(random.random() * 1e3 + 1)
 70 |             # 请求数据
 71 |             ajax_url = 'https://www.javbus.com/ajax/uncledatoolsbyajax.php?gid=%s&lang=%s&img=%s&uc=%s&floor=%s' %(gid, lang, img, uc, floor)
 72 |             ajax_result = self.s.get(ajax_url, headers=self.header)
 73 |             soup = BeautifulSoup(ajax_result.text, 'html.parser')
 74 |             try:
 75 |                 magnet = soup.find('td').a['href']
 76 |             except Exception:
 77 |                 magnet = 'unissued'
 78 |             # append
 79 |             item['img'] = img
 80 |             item['magnet'] = magnet
 81 |             print('[取到数据]\n标题：%s\n番号：%s\n时间：%s\n图片：%s\n链接：%s\n磁链：%s\n' %(item['title'], item['fh'], item['time'], item['img'], item['link'], item['magnet']))
 82 |             self.avs_queue.put(item)
 83 |         # 存储数据
 84 |         mutex.acquire()
 85 |         while not self.avs_queue.empty():
 86 |             item = self.avs_queue.get()
 87 |             # 判断是获取什么类型
 88 |             if self.type == 1:
 89 |                 db.censored.insert({
 90 |                     'title': item['title'],
 91 |                     'fh': item['fh'],
 92 |                     'time': item['time'],
 93 |                     'image': item['img'],
 94 |                     'link': item['link'],
 95 |                     'magnet': item['magnet']
 96 |                 })
 97 |             elif self.type == 2:
 98 |                 db.uncensored.insert({
 99 |                     'title': item['title'],
100 |                     'fh': item['fh'],
101 |                     'time': item['time'],
102 |                     'image': item['img'],
103 |                     'link': item['link'],
104 |                     'magnet': item['magnet']
105 |                 })
106 |             print('[写入数据库]%s' %item['title'])
107 |         mutex.release()
108 | 
def main(max_page, thread_num, av_type):
    """Queue the listing pages and scrape them with a pool of worker threads.

    max_page: number of listing pages to fetch (pages 1 through max_page).
    thread_num: number of worker threads to start.
    av_type: 1 for the censored listing, 2 for the uncensored listing; any
        other value prints an error message and exits the process.
    """
    # Pick the URL template for the requested listing type.
    if av_type == 1:
        url = 'http://www.javbus.com/page/page_num'
    elif av_type == 2:
        url = 'http://www.javbus.com/uncensored/page/page_num'
    else:
        print('类型不正确, 1: 有码 2: 无码')
        # NOTE(review): exits with status 0 although this is an error path;
        # kept as-is so callers checking the exit status see no change.
        sys.exit(0)

    # Build the page queue. range() excludes its stop value, so +1 is needed
    # for "-page N" to really fetch N pages.
    page_queue = queue.Queue()
    for page in range(1, max_page + 1):
        page_queue.put(url.replace('page_num', str(page)))

    # Start the workers, then wait until all of them have finished.
    threads = []
    for _ in range(thread_num):
        javbus = Javbus(page_queue)
        javbus.daemon = True
        javbus.start()
        threads.append(javbus)
    for worker in threads:
        worker.join()
137 | 
if __name__ == '__main__':
    # Command-line entry point: three integer options, each with a default.
    cli = argparse.ArgumentParser('javbus')
    option_specs = (
        ('page', 5, '获取的页数'),
        ('thread', 4, '启动的线程数'),
        ('type', 1, '1: 有码 2: 无码'),
    )
    for option, fallback, description in option_specs:
        cli.add_argument('-' + option, dest=option, default=fallback, type=int, help=description)
    options = cli.parse_args()
    main(options.page, options.thread, options.type)
145 | 


--------------------------------------------------------------------------------
/javbus_gevent.py:
--------------------------------------------------------------------------------
  1 | from gevent import monkey; monkey.patch_all()
  2 | import gevent
  3 | import requests
  4 | import time
  5 | import queue
  6 | from pyquery import PyQuery
  7 | from mongoengine import *
  8 | import re
  9 | import math
 10 | import random
 11 | 
# Module-level side effect: registers the mongoengine connection to the
# 'javbus' database, which the Avs document below is saved through.
connect('javbus', host='myfaith.io', port=27017)
 13 | 
class Avs(Document):
    """One scraped item as stored through mongoengine; populated by fetch()."""
    # Title attribute of the item's cover image on the listing page.
    title = StringField()
    # Text of the first <date> element of the listing entry (the item code).
    fh = StringField()
    # Text of the second <date> element of the listing entry.
    time = StringField()
    # Value of the "img" script variable on the item's detail page.
    image = StringField()
    # URL of the item's detail page.
    link = StringField()
    # href of the first link in the ajax magnet table.
    magnet = StringField()
    # The listing type passed to fetch(), stored as a string.
    type = StringField()
 22 | 
 23 | def fetch(pageQueue, type):
 24 |     avs_queue = queue.Queue()
 25 |     s = requests.Session()
 26 |     header = {
 27 |         'Referer': 'http://www.javbus.com',
 28 |         'Cookie': 'existmag=all',
 29 |         'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
 30 |     }
 31 |     # Run
 32 |     while not pageQueue.empty():
 33 |         avs = []
 34 |         avs_queue = queue.Queue()
 35 |         url = pageQueue.get()
 36 |         print('正在获取 %s 的数据...' %url)
 37 |         html = s.get(url, headers=header).text
 38 |         pq = PyQuery(html)
 39 |         # Get FH
 40 |         for item in pq('.item').items():
 41 |             av = pq(item).find('.photo-info span')
 42 |             title = pq(item).find('.photo-frame img').attr('title')
 43 |             fh = pq(av).find('date').eq(0).text()
 44 |             time = pq(av).find('date').eq(1).text()
 45 |             link = 'https://www.javbus.com/%s' %fh
 46 |             info = {
 47 |                 'title': title,
 48 |                 'fh': fh,
 49 |                 'time': time,
 50 |                 'link': link
 51 |             }
 52 |             avs.append(info)
 53 |         # Get Magnet
 54 |         for item in avs:
 55 |             url = item['link']
 56 |             html = s.get(url, headers=header).text
 57 |             # 由于磁力链接是ajax方式获取，所以获取数据，构成ajax链接
 58 |             gid = re.search(r'var gid = (\d*?);', html).group(1)
 59 |             lang = 'zh'
 60 |             uc = re.search(r'var uc = (\d*?);', html).group(1)
 61 |             img = re.search(r"var img = '(.*?)';", html).group(1)
 62 |             floor = math.floor(random.random() * 1e3 + 1)
 63 |             # 请求数据
 64 |             ajax_url = 'https://www.javbus.com/ajax/uncledatoolsbyajax.php?gid=%s&lang=%s&img=%s&uc=%s&floor=%s' %(gid, lang, img, uc, floor)
 65 |             ajax_result = s.get(ajax_url, headers=header)
 66 |             pq = PyQuery(ajax_result.text)
 67 |             try:
 68 |                 magnet = pq('td a').attr('href')
 69 |             except Exception:
 70 |                 magnet = 'unissued'
 71 |             # append
 72 |             item['img'] = img
 73 |             item['magnet'] = magnet
 74 |             item['type'] = type
 75 |             print('[取到数据]\n标题：%s\n番号：%s\n时间：%s\n图片：%s\n链接：%s\n磁链：%s\n' %(item['title'], item['fh'], item['time'], item['img'], item['link'], item['magnet']))
 76 |             avs_queue.put(item)
 77 |         # Sve Data
 78 |         while not avs_queue.empty():
 79 |             item = avs_queue.get()
 80 |             # 判断是获取什么类型
 81 |             av = Avs(
 82 |                 title=item['title'],
 83 |                 fh=item['fh'],
 84 |                 time=item['time'],
 85 |                 image=item['img'],
 86 |                 link=item['link'],
 87 |                 magnet=item['magnet'],
 88 |                 type=str(item['type'])
 89 |             )
 90 |             av.save()
 91 |             print('[写入数据库]%s' %item['title'])
 92 | 
 93 | def main(minPage=1, maxPage=10, type=1):
 94 |     pageQueue = queue.Queue()
 95 |     if type == 1:
 96 |         url = 'http://www.javbus.com/page/{pageNum}'
 97 |     else:
 98 |         url = 'http://www.javbus.com/uncensored/page/{pageNum}'
 99 |     for page in range(minPage, maxPage):
100 |         pageQueue.put(url.format(pageNum=str(page)))
101 |     gevent.joinall([gevent.spawn(fetch, pageQueue, type) for i in range(maxPage)])
102 | 
if __name__ == '__main__':
    # type=0 is not 1, so main() takes its else branch (the uncensored listing).
    main(minPage=1, maxPage=50, type=0)
105 | 


--------------------------------------------------------------------------------