Javbus
├── .idea
│   ├── Javbus.iml
│   ├── codeStyles
│   │   └── codeStyleConfig.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── workspace.xml
├── Javbus
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-36.pyc
│   │   ├── items.cpython-36.pyc
│   │   ├── middlewares.cpython-36.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   └── settings.cpython-36.pyc
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-36.pyc
│       │   └── javbus.cpython-36.pyc
│       └── javbus.py
├── README.MD
├── run.py
└── scrapy.cfg
/Javbus/.idea/Javbus.iml:
--------------------------------------------------------------------------------
(PyCharm module file; the XML markup was stripped when this dump was produced and is not recoverable)
--------------------------------------------------------------------------------
/Javbus/.idea/codeStyles/codeStyleConfig.xml:
--------------------------------------------------------------------------------
(PyCharm code-style configuration; XML markup stripped in this dump)
--------------------------------------------------------------------------------
/Javbus/.idea/misc.xml:
--------------------------------------------------------------------------------
(PyCharm project settings; XML markup stripped in this dump)
--------------------------------------------------------------------------------
/Javbus/.idea/modules.xml:
--------------------------------------------------------------------------------
(PyCharm module registry; XML markup stripped in this dump)
--------------------------------------------------------------------------------
/Javbus/.idea/workspace.xml:
--------------------------------------------------------------------------------
(PyCharm workspace state; XML markup stripped in this dump)
--------------------------------------------------------------------------------
/Javbus/Javbus/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasbozhi/Javbus-Crawler/84eaab95197b5b3ad4c442de404630f98fa86e52/Javbus/Javbus/__init__.py
--------------------------------------------------------------------------------
/Javbus/Javbus/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasbozhi/Javbus-Crawler/84eaab95197b5b3ad4c442de404630f98fa86e52/Javbus/Javbus/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Javbus/Javbus/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasbozhi/Javbus-Crawler/84eaab95197b5b3ad4c442de404630f98fa86e52/Javbus/Javbus/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/Javbus/Javbus/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasbozhi/Javbus-Crawler/84eaab95197b5b3ad4c442de404630f98fa86e52/Javbus/Javbus/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/Javbus/Javbus/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasbozhi/Javbus-Crawler/84eaab95197b5b3ad4c442de404630f98fa86e52/Javbus/Javbus/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/Javbus/Javbus/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasbozhi/Javbus-Crawler/84eaab95197b5b3ad4c442de404630f98fa86e52/Javbus/Javbus/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/Javbus/Javbus/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class JavbusItem(scrapy.Item):
    # Title
    title = scrapy.Field()
    # Detail-page URL
    url = scrapy.Field()
    # Designation (番号)
    car = scrapy.Field()
    # Release date
    openTime = scrapy.Field()
    # Date tag
    timeTag = scrapy.Field()
    # Cover image
    cover = scrapy.Field()
    # Duration
    duration = scrapy.Field()
    # Director
    director = scrapy.Field()
    # Production studio
    makeCompany = scrapy.Field()
    # Publisher
    publishCompany = scrapy.Field()
    # Genres
    genre = scrapy.Field()
    # Actors
    actor = scrapy.Field()
    # Full-size image
    Image = scrapy.Field()
    # Magnet links
    source = scrapy.Field()
    # Category (censored / uncensored / western)
    type = scrapy.Field()
--------------------------------------------------------------------------------
/Javbus/Javbus/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class JavbusSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class JavbusDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class JavbusProxyMiddleware(object):
    def __init__(self):
        # Local proxy every request is routed through
        self.ip = "http://127.0.0.1:1080"

    def process_request(self, request, spider):
        ip = self.ip
        # print("currently using proxy {}".format(ip))
        request.meta['proxy'] = ip
--------------------------------------------------------------------------------
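A note on JavbusProxyMiddleware above: the proxy URL is hardcoded to http://127.0.0.1:1080. Below is a minimal sketch of a more flexible variant that reads the URL from the project settings via Scrapy's from_crawler hook; the PROXY_URL setting name is an assumption, not something the repo defines.

class JavbusProxyMiddleware(object):
    def __init__(self, proxy_url):
        self.proxy_url = proxy_url

    @classmethod
    def from_crawler(cls, crawler):
        # PROXY_URL is hypothetical; add e.g. PROXY_URL = "http://127.0.0.1:1080"
        # to settings.py to make use of it.
        return cls(crawler.settings.get("PROXY_URL", "http://127.0.0.1:1080"))

    def process_request(self, request, spider):
        # Route every outgoing request through the configured proxy.
        request.meta['proxy'] = self.proxy_url
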
/Javbus/Javbus/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

from scrapy.exceptions import DropItem


class MongoDBPipeline(object):

    def __init__(self, settings):
        host = settings['MONGODB_SERVER']
        port = settings['MONGODB_PORT']
        connection = pymongo.MongoClient(host, port)
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    @classmethod
    def from_crawler(cls, crawler):
        # The original read settings through the deprecated scrapy.conf module;
        # from_crawler is the supported way to reach them.
        return cls(crawler.settings)

    def process_item(self, item, spider):
        # Drop items that carry an empty field value. (The original tested
        # `if not data`, i.e. the field *name*, which never fires.)
        for field in item:
            if not item[field]:
                raise DropItem("Missing {}!".format(field))
        # Upsert keyed on the detail-page URL so re-crawls update in place
        # (replaces the deprecated pymongo Collection.update call).
        self.collection.replace_one({"url": item["url"]}, dict(item), upsert=True)
        spider.logger.debug("Item added to MongoDB database!")
        return item


class JavbusPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
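To check what MongoDBPipeline stored, here is a minimal sketch using pymongo directly; it assumes the default MONGODB_* values from settings.py and pymongo >= 3.7 (for count_documents).

import pymongo

# Same connection values as settings.py; adjust if you changed them.
client = pymongo.MongoClient("127.0.0.1", 27017)
collection = client["fuli"]["javbus"]

print(collection.count_documents({}))                   # number of stored works
print(collection.find_one({}, {"title": 1, "car": 1}))  # sample document
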
/Javbus/Javbus/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-


# Scrapy settings for Javbus project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Javbus'

SPIDER_MODULES = ['Javbus.spiders']
NEWSPIDER_MODULE = 'Javbus.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Javbus (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Javbus.middlewares.JavbusSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'Javbus.middlewares.JavbusProxyMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Javbus.pipelines.MongoDBPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB connection settings used by MongoDBPipeline (see pipelines.py)
MONGODB_SERVER = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB = "fuli"
MONGODB_COLLECTION = "javbus"
--------------------------------------------------------------------------------
/Javbus/Javbus/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/Javbus/Javbus/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasbozhi/Javbus-Crawler/84eaab95197b5b3ad4c442de404630f98fa86e52/Javbus/Javbus/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/Javbus/Javbus/spiders/__pycache__/javbus.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasbozhi/Javbus-Crawler/84eaab95197b5b3ad4c442de404630f98fa86e52/Javbus/Javbus/spiders/__pycache__/javbus.cpython-36.pyc
--------------------------------------------------------------------------------
/Javbus/Javbus/spiders/javbus.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from ..items import JavbusItem
from scrapy.http import Request
import re
import math
import random


class JavbusSpider(scrapy.Spider):
    name = 'javbus'
    allowed_domains = ['javbus.pw', "javbus.org", "javbus.com"]

    def start_requests(self):
        urls = ["https://www.javbus.org/actresses"]
        for url in urls:
            # Many works are missing from the censored/uncensored listings, so
            # every designation is reached by crawling each actress's works page.
            if "actresses" not in url:
                yield Request(url=url, callback=self.parse_works)
            else:
                yield Request(url=url, callback=self.parse)

    def parse(self, response):
        """Walk the actress index, following each actress and the next page."""
        actress = response.xpath("//div[@id='waterfall']/div")
        for element in actress:
            url = element.xpath("a/@href").extract_first()
            name = element.xpath("a/div[2]/span/text()").extract_first()
            print(name)
            yield Request(url=url, callback=self.parse_works)
        if "/uncensored" in response.url:
            baseUrl = response.url.replace("/uncensored", "").split("/page")[0]
        else:
            baseUrl = response.url.split("/page")[0]
        next_url = response.xpath("//ul[@class='pagination pagination-lg']/li[last()]/a/@href").extract_first()
        if next_url:
            next_page = baseUrl + next_url
            print(next_page)
            yield Request(url=next_page, callback=self.parse)

    def parse_works(self, response):
        """Scrape one listing page of works and follow its pagination."""
        elements = response.xpath("//div[@id='waterfall']/div")

        for element in elements:
            item = JavbusItem()
            # Category is inferred from the listing URL
            if ".org" in response.url:
                movie_type = "欧美"  # western
            elif "uncensored" in response.url:
                movie_type = "无码"  # uncensored
            else:
                movie_type = "有码"  # censored
            # Detail-page link
            url = element.xpath("a/@href").extract_first()
            if url:
                # Cover image
                cover = element.xpath("a/div[1]/img/@src").extract_first()
                # Title
                title = element.xpath("a/div[2]/span/text()").extract_first()
                # Designation (番号)
                car = element.xpath("a/div[2]/span/date[1]/text()").extract_first()
                # Release date
                openTime = element.xpath("a/div[2]/span/date[2]/text()").extract_first()
                # Date tag
                timeTag = element.xpath("a/div[2]/span/div/button/text()").extract_first()
                item["url"] = url
                item["cover"] = cover
                item["car"] = car
                item["title"] = title
                item["openTime"] = openTime
                item["timeTag"] = timeTag
                item["type"] = movie_type
                # Descend to the detail page for the remaining fields
                yield Request(url=url, meta={"item": item}, callback=self.parse_detail)
        if "/uncensored" in response.url:
            baseUrl = response.url.replace("/uncensored", "").split("/page")[0]
        else:
            baseUrl = response.url.split("/page")[0]
        next_url = response.xpath("//ul[@class='pagination pagination-lg']/li[last()]/a/@href").extract_first()
        if next_url:
            next_page = baseUrl + next_url
            yield Request(url=next_page, callback=self.parse_works)

    def parse_detail(self, response):
        """
        Scrape the detail page of a single work.

        :param response: detail-page response
        :return: a request for the magnet-link AJAX endpoint
        """
        item = response.meta.get("item")
        # NOTE: the HTML tags inside the regex literals below were stripped when
        # this dump was produced; the markup shown is a reconstruction of the
        # detail-page HTML, not the verbatim original.
        # Duration
        pattern1 = re.compile(r"長度:</span> (.*?)</p>")
        matcher1 = pattern1.search(response.text)
        if matcher1:
            item["duration"] = matcher1.group(1)
        # Director
        pattern2 = re.compile(r'導演:</span> <a href=".*?">(.*?)</a>')
        matcher2 = pattern2.search(response.text)
        if matcher2:
            item["director"] = matcher2.group(1)
        # Production studio
        pattern3 = re.compile(r'製作商:</span> <a href=".*?">(.*?)</a>')
        matcher3 = pattern3.search(response.text)
        if matcher3:
            item["makeCompany"] = matcher3.group(1)
        # Publisher
        pattern4 = re.compile(r'發行商:</span> <a href=".*?">(.*?)</a>')
        matcher4 = pattern4.search(response.text)
        if matcher4:
            item["publishCompany"] = matcher4.group(1)
        # Genres
        pattern5 = re.compile(r'href="https://www.javbus.*/genre/.*">(.*?)</a>')
        matcher5 = re.findall(pattern5, response.text)
        if matcher5:
            item["genre"] = list(set(matcher5))[1:]
        # Actors (markup fully stripped in the dump; star links are assumed)
        pattern6 = re.compile(r'href="https://www.javbus.*/star/.*">(.*?)</a>')
        matcher6 = re.findall(pattern6, response.text)
        if matcher6:
            item["actor"] = list(set(matcher6))[1:]
        # Full-size image
        Image = response.xpath("//div[@class='col-md-9 screencap']/a/@href").extract_first()
        item["Image"] = Image
        # The magnet list is loaded by the page via AJAX; gid and img come from
        # inline JavaScript, and floor is a random integer as in the site's own JS.
        pattern7 = re.compile(r"var gid = (\d+)", re.S)
        matcher7 = pattern7.search(response.text)
        pattern8 = re.compile(r"img = '(.*?)'", re.S)
        matcher8 = pattern8.search(response.text)
        if matcher7 and matcher8:
            gid = matcher7.group(1)
            img = matcher8.group(1)
            magnetUrl = "https://www.javbus.pw/ajax/uncledatoolsbyajax.php?gid={}&lang=zh&img={}&uc=0&floor={}".format(
                gid, img, math.floor(random.random() * 1000 + 1))
            yield Request(url=magnetUrl, callback=self.parseMagnet, meta={"item": item})

    def parseMagnet(self, response):
        """
        Parse the magnet-link table returned by the AJAX endpoint.

        :param response: magnet-list response
        :return: the completed item
        """
        item = response.meta.get("item")

        elements = response.xpath("//tr")
        info = []
        for element in elements:
            sourceInfo = {}
            # Magnet link
            magnetUrl = element.xpath("td[1]/a/@href").extract_first()
            # Designation (番号)
            fanhao = element.xpath("td[1]/a/text()").extract_first()
            sourceInfo["fanhao"] = fanhao.strip() if fanhao else ""
            sourceInfo["magnetUrl"] = magnetUrl
            # File size
            size = element.xpath("td[2]/a/text()").extract_first()
            sourceInfo["size"] = size.strip() if size else ""
            # Upload date
            openTime = element.xpath("td[3]/a/text()").extract_first()
            sourceInfo["openTime"] = openTime.strip() if openTime else ""
            info.append(sourceInfo)
        item["source"] = info
        yield item
--------------------------------------------------------------------------------
/Javbus/README.MD:
--------------------------------------------------------------------------------
### Python crawler for all javbus works by designation (番号)

[javbus](https://www.javbus.com/)

##### Scrapes every work in the censored, uncensored, and western categories, along with their magnet links, and stores everything in MongoDB.

##### Requires a running MongoDB instance; the only third-party libraries used are pymongo and scrapy.

Installation:

    pip install pymongo

    pip install scrapy  (installing scrapy on Windows may need extra native dependencies; look up the platform-specific steps if it fails)

Usage:

Point the MONGODB_* settings in settings.py at your own database. Because the site is blocked in some regions, on Windows you will need a proxy to reach it, and the DOWNLOADER_MIDDLEWARES block in the settings must be enabled; the relevant settings are shown after the screenshot below.


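For reference, these are the pieces of settings.py the instructions above refer to (values shown are the repo defaults; point MONGODB_* at your own database, and the proxy URL itself lives in middlewares.py):

    DOWNLOADER_MIDDLEWARES = {
        'Javbus.middlewares.JavbusProxyMiddleware': 543,
    }

    MONGODB_SERVER = "127.0.0.1"
    MONGODB_PORT = 27017
    MONGODB_DB = "fuli"
    MONGODB_COLLECTION = "javbus"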
--------------------------------------------------------------------------------
/Javbus/run.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Author: shenshi
# Date: 2018/12/16 15:21
from scrapy import cmdline

cmdline.execute("scrapy crawl javbus".split())
--------------------------------------------------------------------------------
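run.py above shells out to the scrapy CLI. A minimal sketch of the equivalent in-process launch, using Scrapy's documented CrawlerProcess API (not part of the original repo):

# -*- coding: utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py and run the javbus spider in-process.
process = CrawlerProcess(get_project_settings())
process.crawl("javbus")
process.start()  # blocks until the crawl finishes
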
/Javbus/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = Javbus.settings

[deploy]
#url = http://localhost:6800/
project = Javbus
--------------------------------------------------------------------------------