├── .gitattributes ├── .gitignore ├── README.md ├── config.js ├── data ├── actresses │ └── has.json └── uncensored │ └── actresses │ └── unhas.json ├── db.js ├── getData.js ├── index.js ├── index_arr.js ├── javbus.sql ├── logger.js ├── package.json ├── test.js └── tools.js /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .idea -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## javbus-magnet-spider 2 | javbus网站爬虫 3 | ### 声明:该项目仅供学习交流,请勿用于其他用途! 4 | 项目参考于[看知乎](https://github.com/atonasting/zhihuspider),定时任务未实现,也不打算实现,需要实现的可fork代码加[node-schedule](https://github.com/node-schedule/node-schedule)实现功能!! 
5 | ### 环境配置 6 | 1.装个mysql数据库,5.5或5.6均可,图省事可以直接用lnmp或lamp来装,回头还能直接在浏览器看日志; 7 | 8 | 2.先安个node.js环境; 9 | 10 | 3.执行npm -g install forever,安装forever好让爬虫在后台跑; 11 | 12 | 4.把所有代码整到本地(整=git clone); 13 | 14 | 5.在项目目录下执行npm install安装依赖库; 15 | 16 | 17 | 6.建立一个空mysql数据库和一个有完整权限的用户,执行代码里的javbus.sql,创建数据库结构; 18 | 19 | 7.编辑config.js,标明(必须)的配置项必须填写或修改,其余项可以暂时不改: 20 | 21 | ``` javascript 22 | exports.txtPath = "./txt/";//生成txt文件的路径(必须) 23 | exports.sPicPath = "./sPic/";//保存小图的路径(必须)(遍历txt下载的时候需要使用) 24 | exports.bPicPath = "./bPic/";//保存大图的路径(必须)(遍历txt下载的时候需要使用) 25 | exports.dbconfig = { 26 | host: '127.0.0.1',//数据库服务器(必须) 27 | user: 'root',//数据库用户名(必须) 28 | password: 'password',//数据库密码(必须) 29 | database: 'database',//数据库名(必须) 30 | port: 3306,//数据库服务器端口(必须) 31 | poolSize: 20, 32 | acquireTimeout: 30000 33 | }; 34 | //https://www.javbus3.com/有码 35 | //https://www.javbus3.com/uncensored/无码 36 | //https://www.javbus.org/欧美 37 | exports.urlpre = "https://www.javbus3.com/uncensored/";//爬虫地址;(必须) 38 | exports.urlType=0;//自己控制,0无码1有码2欧美(必须) 39 | exports.indexUrl="https://www.javbus3.com/";//爬虫主站 40 | ``` 41 | 42 | 43 | 8.最后开爬`forever start index.js -a` 44 | 45 | -------------------------------------------------------------------------------- /config.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by dianjie on 2017/1/7. 
3 | */ 4 | exports.txtPath = "./txt/";//生成txt文件的路径(必须) 5 | exports.sPicPath = "./sPic/";//保存小图的路径(必须)(遍历txt下载的时候需要使用) 6 | exports.bPicPath = "./bPic/";//保存大图的路径(必须)(遍历txt下载的时候需要使用 7 | exports.dbconfig = { 8 | host: '127.0.0.1',//数据库服务器(必须) 9 | user: 'root',//数据库用户名(必须) 10 | password: 'password',//数据库密码(必须) 11 | database: 'database',//数据库名(必须) 12 | port: 3306,//数据库服务器端口(必须) 13 | poolSize: 20, 14 | acquireTimeout: 30000 15 | }; 16 | //https://www.javbus3.com/有码 17 | //https://www.javbus3.com/uncensored/无码 18 | //https://www.javbus.org/欧美 19 | exports.urlpre = "https://www.javbus3.com/uncensored/";//爬虫地址;(必须) 20 | exports.urlType=0;//自己控制,0无码1有码2欧美(必须) 21 | exports.indexUrl="https://www.javbus3.com/";//爬虫主站 22 | -------------------------------------------------------------------------------- /db.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by dianjie on 2017/1/8. 3 | */ 4 | var mysql = require('mysql'); 5 | var config = require('./config'); 6 | var pool;//普通查询池 7 | 8 | //初始化 9 | exports.init = function () { 10 | init(); 11 | } 12 | 13 | function init() { 14 | pool = mysql.createPool(config.dbconfig); 15 | } 16 | 17 | //单句执行 18 | exports.query = function (sql, callback) { 19 | if (!pool || pool._closed) init(); 20 | pool.getConnection(function (err, conn) { 21 | if (err) { 22 | callback(err); 23 | return; 24 | } 25 | conn.query(sql, function (err, rows) { 26 | if (err) { 27 | callback(err); 28 | return; 29 | } 30 | conn.release(); 31 | callback(null, rows); 32 | }); 33 | }); 34 | } 35 | //防注入转换 36 | exports.escape = mysql.escape; 37 | 38 | exports.end = function () { 39 | pool.end(function (err) { 40 | if (err) 41 | console.error("Destory db pool error: " + err); 42 | else 43 | console.log("MySQL pool ended successfully."); 44 | }); 45 | } 46 | -------------------------------------------------------------------------------- /getData.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2017/1/10. 3 | */ 4 | 5 | //有690页无大概400 6 | //https://www.javbus.com/actresses/ 7 | //https://www.javbus.com/uncensored/actresses/ 8 | var config = require("./config"); 9 | var db = require("./db"); 10 | var tools = require("./tools"); 11 | var logger = require("./logger"); 12 | var async = require('async'); 13 | var cheerio = require('cheerio'); 14 | let pageArr=[...Array(400).keys()].slice(1); 15 | // let spider="https://www.javbus.com/actresses/"; 16 | let spider="https://www.javbus.com/uncensored/actresses/"; 17 | function main() { 18 | async.mapLimit(pageArr, 5, function (value, callback) { 19 | getUrl(value).then(function (res) { 20 | setTimeout(function () { 21 | callback(null, res); 22 | }, 1000); 23 | }).catch(function () { 24 | callback(null, []); 25 | }) 26 | }, function (error, results) { 27 | let pre=spider.replace("http://", "").replace("https://", ""); 28 | tools.saveTxt(`${pre}unhas.txt`,JSON.stringify(results),"./data/").then(function () { 29 | logger.log("好了好了!!") 30 | }).catch(function (rej) { 31 | logger.log("保存txt炸了啊!!,原因:"+rej) 32 | }) 33 | }); 34 | } 35 | function getUrl(page) { 36 | return new Promise(function (resolve, reject) { 37 | console.log(page); 38 | tools.get(spider + page, '', function (err, res) { 39 | if (err) { 40 | logger.log(err); 41 | //炸了,再执行自己 42 | if (err !== 404) { 43 | getUrl(page) 44 | }else { 45 | reject() 46 | } 47 | return false; 48 | } 49 | let previewArr= []; 50 | let $ = cheerio.load(res, {decodeEntities: false, ignoreWhitespace: true}); 51 | $('#waterfall .item').each(function (idx, element) { 52 | let url = $(this).children('.avatar-box').attr('href'); 53 | //没地址退出本次循环 54 | if (!url) return; 55 | previewArr.push(url); 56 | }); 57 | resolve(previewArr) 58 | }, 2); 59 | }) 60 | 61 | } 62 | tools.existsPath("./data/").then(function () { 63 | main(); 64 | }); 65 | 
-------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by dianjie on 2017/1/8. 3 | */ 4 | var config = require("./config"); 5 | var db = require("./db"); 6 | var tools = require("./tools"); 7 | var logger = require("./logger"); 8 | var async = require('async'); 9 | var cheerio = require('cheerio'); 10 | var mysql = require('mysql'); 11 | var fs = require('fs'); 12 | var taskStartTime = "00:05:00";//任务将于每天此时间开始 13 | var nextStartTime;//下次执行任务的具体时间 14 | var page = 1;//页码 15 | var threads=4;//线程数 16 | var timer=1000;//一条线程结束后的等待时间(毫秒) 17 | var pageTimer=5000;//爬一页后的等待时间(毫秒) 18 | function maintask() { 19 | console.log(`page>>>>>${page}`); 20 | tools.get(config.urlpre + 'page/' + page, '', function (err, res) { 21 | if (err) { 22 | logger.log(err); 23 | //炸了,再执行自己 24 | if (err !== 404) { 25 | maintask(); 26 | } 27 | return false; 28 | } 29 | var previewObj = []; 30 | var $ = cheerio.load(res, {decodeEntities: false, ignoreWhitespace: true}); 31 | var items=$('#waterfall .item'); 32 | var itemLen=items.length; 33 | // 没有可爬链接,爬下一个链接 34 | if (!itemLen) { 35 | page++; 36 | maintask(); 37 | return; 38 | } 39 | items.each(function (idx, element) { 40 | let url = $(this).children('.movie-box').attr('href'); 41 | //没地址退出本次循环 42 | if (!url) return; 43 | //图片html实体 44 | let imgEle = $(this).find('.photo-frame img'); 45 | //番号以及更新日期html实体 46 | let nD = $(this).find('.photo-info date'); 47 | let picUrl = imgEle.attr('src'); 48 | let title = imgEle.attr('title'); 49 | //各种标签文字 50 | let tag = $(this).find('.photo-info .item-tag').text(); 51 | //将标签格式化成数组 52 | let tagArr = tag.split(/\s/); 53 | let isHD = tagArr.includes("高清"), 54 | isSubtitled = tagArr.includes('字幕'), 55 | picName = tools.getUrlFileName(picUrl), 56 | name = nD.eq(0).text(), 57 | date = nD.eq(1).text(); 58 | date=new Date(date)=='Invalid Date'?"1990-01-01":date; 59 | let insertSql 
= "INSERT INTO preview SET ?"; 60 | let inserts = { 61 | title, 62 | name, 63 | updateTime: date, 64 | picUrl: picName, 65 | type: config.urlType, 66 | isHD, 67 | isSubtitled 68 | }; 69 | insertSql = mysql.format(insertSql, inserts); 70 | db.query('SELECT * FROM preview WHERE name =' + mysql.escape(name), function (err, res) { 71 | if (err) { 72 | logger.log(err); 73 | return false; 74 | } 75 | if (!res.length) { 76 | db.query(insertSql, function (err) { 77 | if (err) { 78 | logger.log(err); 79 | return false; 80 | } 81 | }); 82 | } else { 83 | let id = res[0].id; 84 | db.query("UPDATE preview SET `updateTime` = '" + date + "' WHERE `id` = " + id, function (err, res) { 85 | if (err) { 86 | logger.log(err); 87 | return false; 88 | } 89 | }) 90 | } 91 | }); 92 | previewObj.push({ 93 | url, 94 | sPic: picUrl, 95 | title, 96 | name 97 | }); 98 | }); 99 | async.mapLimit(previewObj, threads, function (obj, callback) { 100 | // 延时再执行 101 | let timeFun = function (res) { 102 | setTimeout(function () { 103 | callback(null, Object.assign({}, obj, res)); 104 | }, timer); 105 | }; 106 | saveDetail(obj).then(function (res) { 107 | timeFun(res); 108 | }).catch(function () { 109 | //重试一次 110 | saveDetail(obj).then(function (res) { 111 | timeFun(res); 112 | }).catch(function (rej) { 113 | timeFun(rej); 114 | }); 115 | }); 116 | }, function (error, results) { 117 | let pre=config.urlpre.replace("http://", "").replace("https://", ""); 118 | tools.saveTxt(`${pre+page}.txt`,JSON.stringify(results)).then(function () { 119 | page++; 120 | setTimeout(maintask, pageTimer) 121 | }).catch(function (rej) { 122 | logger.log("保存txt炸了啊!!,原因:"+rej) 123 | }) 124 | }); 125 | }, 2); 126 | 127 | } 128 | //存详情 129 | function saveDetail(obj) { 130 | let promise = new Promise(function (resolve, reject) { 131 | let {url, title, name}=obj; 132 | tools.get(url, '', function (err, res) { 133 | if (err) { 134 | logger.log(err); 135 | reject({}); 136 | return false; 137 | } 138 | var $ = cheerio.load(res, 
{decodeEntities: false, ignoreWhitespace: true}); 139 | let query = $('body script').eq(2).text(); 140 | //磁力获取查询参数 141 | query = query.replace(/\s/ig, "").replace(/var/ig, '').replace(/;/ig, '&').replace(/'&?/ig, ''); 142 | let infoArr = []; 143 | //番号信息数组文本 144 | $('.movie .info p').each(function (idx, element) { 145 | infoArr.push($(this).text()) 146 | }); 147 | 148 | let bPicUrl = $(".screencap .bigImage").attr('href'); 149 | let bPicName = tools.getUrlFileName(bPicUrl); 150 | //时长索引 151 | let timeIndex = infoArr.findIndex(function (value) { 152 | return value.indexOf("長度:") == 0 ? true : false; 153 | }); 154 | //发行日期索引 155 | let releaseDateIndex = infoArr.findIndex(function (value) { 156 | return value.indexOf("發行日期:") == 0 ? true : false; 157 | }); 158 | //演员索引 159 | let actorIndex = infoArr.findIndex(function (value) { 160 | return value.indexOf("演員:") == 0 ? true : false; 161 | }); 162 | let time = tools.getCenterText(infoArr[timeIndex], "長度:", "分鐘"); 163 | let releaseDate = tools.getCenterText(infoArr[releaseDateIndex], "發行日期:"); 164 | releaseDate=new Date(releaseDate)=='Invalid Date'?"1990-01-01":releaseDate; 165 | let actor = infoArr[actorIndex + 1] == "推薦:" ? 
"" : infoArr[actorIndex + 1]; 166 | time = tools.replaceText(time); 167 | releaseDate = tools.replaceText(releaseDate); 168 | actor = tools.replaceText(actor).replace(/\s+/ig, "&&&"); 169 | // console.log(time,releaseDate,actor) 170 | let insertSql = "INSERT INTO detail SET ?"; 171 | let inserts = { 172 | title, 173 | name, 174 | releaseDate, 175 | time, 176 | actor, 177 | bPic: bPicName, 178 | type: config.urlType, 179 | }; 180 | insertSql = mysql.format(insertSql, inserts); 181 | //返回大图地址 182 | let callBackResult = {bPic: bPicUrl}; 183 | db.query('SELECT * FROM detail WHERE name =' + mysql.escape(name), function (err, res) { 184 | if (err) { 185 | logger.log(err); 186 | reject(callBackResult); 187 | return false; 188 | } 189 | if (!res.length) { 190 | db.query(insertSql, function (err) { 191 | if (err) { 192 | reject(callBackResult); 193 | logger.log(err); 194 | return false; 195 | } 196 | }); 197 | } 198 | // 执行看有没更新或未存 199 | saveMagnet(obj, query).then(function () { 200 | resolve(callBackResult) 201 | }).catch(function () { 202 | reject(callBackResult) 203 | }) 204 | }); 205 | }); 206 | }); 207 | return promise; 208 | } 209 | //存磁力 210 | function saveMagnet(obj, query) { 211 | let promise = new Promise(function (resolve, reject) { 212 | let {url, title, name}=obj; 213 | //获取磁力 214 | tools.get(config.indexUrl + "ajax/uncledatoolsbyajax.php?" 
+ query, url, function (err, res) { 215 | if (err) { 216 | logger.log(err); 217 | reject({}); 218 | return false; 219 | } 220 | let $ = cheerio.load(res, {decodeEntities: false, ignoreWhitespace: true}); 221 | let td = $('tr[height=35px] td'); 222 | let tdLen = td.length; 223 | if(tdLen){ 224 | for (let i = 1; i <= tdLen / 3; i++) { 225 | let oneRow = $(td[i * 3 - 3]).children('a').first(); 226 | let magnet = oneRow.attr('href'); 227 | let magnetName = tools.replaceText(oneRow.text()); 228 | let size = $(td[i * 3 - 2]).children('a').first().text(); 229 | size = tools.replaceText(size); 230 | let tdText = $('tr').eq(i - 1).text(); 231 | let isHD = tdText.indexOf('HD') !== -1 ? true : false; 232 | let isSubtitled = tdText.indexOf('SUB') !== -1 ? true : false; 233 | // console.log(magnetName); 234 | db.query('SELECT * FROM magnet WHERE magnet =' + mysql.escape(magnet), function (error, respon) { 235 | if (error) { 236 | logger.log(error); 237 | return false; 238 | } 239 | if (!respon.length) { 240 | let insertSql = "INSERT INTO magnet SET ?"; 241 | let inserts = { 242 | name, 243 | magnet, 244 | magnetName, 245 | size, 246 | isHD, 247 | isSubtitled, 248 | createTime: tools.getDateString() 249 | }; 250 | insertSql = mysql.format(insertSql, inserts); 251 | db.query(insertSql, function (errinfo) { 252 | if (errinfo) { 253 | logger.log(errinfo); 254 | return false; 255 | } 256 | }); 257 | } 258 | }); 259 | 260 | } 261 | resolve() 262 | }else { 263 | resolve() 264 | } 265 | }) 266 | }); 267 | return promise; 268 | } 269 | function main() { 270 | tools.existsPath(config.txtPath).then(function () { 271 | maintask() 272 | }).catch(function () { 273 | console.log(`大哥自己建${config.txtPath}文件目录吧`); 274 | }) 275 | } 276 | main(); 277 | -------------------------------------------------------------------------------- /index_arr.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by dianjie on 2017/1/8. 
3 | */ 4 | var config = require("./config"); 5 | var db = require("./db"); 6 | var tools = require("./tools"); 7 | var logger = require("./logger"); 8 | var async = require('async'); 9 | var cheerio = require('cheerio'); 10 | var mysql = require('mysql'); 11 | var fs = require('fs'); 12 | var taskStartTime = "00:05:00";//任务将于每天此时间开始 13 | var nextStartTime;//下次执行任务的具体时间 14 | 15 | 16 | //数组爬的配置不在config.js文件上配置 17 | //----------------------------必须自行手动控制 18 | var indexUrl="https://www.javbus3.com/";//爬虫主站用于获取磁力链接组拼 19 | var urlType=0;//自己控制,0无码1有码2欧美(必须)控制不好,会影响数据库 20 | var linkArr=require('./data/uncensored/actresses/unhas.json'); 21 | //---------------------------- 22 | 23 | var page = 2;//页码 24 | var linkIndex=16742;//链接索引位置 25 | var threads=4;//线程数 26 | var timer=1000;//一条线程结束后的等待时间(毫秒) 27 | var pageTimer=5000;//爬一页后的等待时间(毫秒) 28 | console.log(`类型>>>>>${urlType}`); 29 | function maintask() { 30 | console.log(`page>>>>>${page},arrIndex>>>>>${linkIndex}`); 31 | tools.get(linkArr[linkIndex] + '/' + page, '', function (err, res) { 32 | if (err) { 33 | logger.log(err); 34 | //炸了,再执行自己 35 | if (err !== 404) { 36 | maintask(); 37 | }else { 38 | if(linkIndex 0) { 182 | checkdir += "/" + paths[i]; 183 | if (!fs.existsSync(checkdir)) 184 | fs.mkdirSync(checkdir); 185 | } 186 | } 187 | fs.writeFileSync(textPath+path, data, 'utf8'); 188 | resolve(); 189 | } 190 | }) 191 | } 192 | //创建目录 193 | exports.existsPath=function(path) { 194 | return new Promise(function (resolve, reject) { 195 | // 判断目录是否存在 196 | fs.exists(path, function (exists) { 197 | // 不存在创建 198 | if (!exists) { 199 | fs.mkdir(path, function (err) { 200 | if (err) { 201 | reject() 202 | } else { 203 | resolve() 204 | } 205 | }) 206 | } else { 207 | resolve() 208 | } 209 | }); 210 | }) 211 | 212 | } --------------------------------------------------------------------------------