├── douban-spider ├── src │ ├── main │ │ ├── resources │ │ │ ├── config.properties │ │ │ ├── config │ │ │ │ ├── zhihu-config.properties │ │ │ │ ├── douban.properties │ │ │ │ └── common.properties │ │ │ ├── file │ │ │ │ ├── test.ser │ │ │ │ └── proxy.ser │ │ │ ├── img │ │ │ │ ├── getauth.PNG │ │ │ │ └── movedata.PNG │ │ │ ├── jdbc.properties │ │ │ ├── log4j.properties │ │ │ ├── create_table_sql.sql │ │ │ ├── sql │ │ │ │ ├── douban-move.sql │ │ │ │ └── zhihu.sql │ │ │ ├── c3p0-config.xml │ │ │ ├── mybatis-config.xml │ │ │ └── generatorConfig.xml │ │ └── java │ │ │ └── com │ │ │ └── yao │ │ │ └── spider │ │ │ ├── common │ │ │ ├── intf │ │ │ │ ├── Composer.java │ │ │ │ ├── IdInterface.java │ │ │ │ └── Condition.java │ │ │ ├── dao │ │ │ │ ├── IBaseDao.java │ │ │ │ └── Impl │ │ │ │ │ └── BaseDaoImpl.java │ │ │ ├── exception │ │ │ │ ├── ConvertException.java │ │ │ │ └── HttpClientException.java │ │ │ ├── constants │ │ │ │ └── Constants.java │ │ │ ├── config │ │ │ │ └── CommonConfig.java │ │ │ ├── OKHttp2Utils.java │ │ │ ├── task │ │ │ │ └── GetProxyTask.java │ │ │ └── util │ │ │ │ └── FileUtil.java │ │ │ ├── zimuku │ │ │ ├── ZimuMain.java │ │ │ ├── dao │ │ │ │ ├── ZimuFileInfoMapper.java │ │ │ │ ├── ZimuInfoMapper.java │ │ │ │ ├── ZimuHtmlMapper.java │ │ │ │ ├── ZimuInfoExtendMapper.java │ │ │ │ ├── ZimuFileInfoMapper.xml │ │ │ │ ├── ZimuHtmlMapper.xml │ │ │ │ ├── ZimuInfoExtendMapper.xml │ │ │ │ └── ZimuInfoMapper.xml │ │ │ ├── service │ │ │ │ ├── ZimuFileInfoService.java │ │ │ │ ├── ZimuInfoService.java │ │ │ │ ├── ZimuHtmlService.java │ │ │ │ ├── ZimuInfoExtendService.java │ │ │ │ └── impl │ │ │ │ │ ├── ZimuFileInfoServiceImpl.java │ │ │ │ │ ├── ZimuHtmlServiceImpl.java │ │ │ │ │ ├── ZimuInfoExtendServiceImpl.java │ │ │ │ │ └── ZimuInfoServiceImpl.java │ │ │ ├── domain │ │ │ │ ├── ZimuHtml.java │ │ │ │ ├── ZimuInfoExtend.java │ │ │ │ ├── ZimuFileInfo.java │ │ │ │ └── ZimuInfo.java │ │ │ └── task │ │ │ │ └── ZimuPageListTask.java │ │ │ ├── sqkfq │ │ │ ├── T.java │ │ │ 
├── service │ │ │ │ ├── SqkfqUserService.java │ │ │ │ ├── SqkfqBaomingService.java │ │ │ │ ├── SqkfqUserServiceImpl.java │ │ │ │ └── SqkfqBaomingServiceImpl.java │ │ │ ├── dao │ │ │ │ ├── SqkfqUserMapper.java │ │ │ │ ├── SqkfqBaomingMapper.java │ │ │ │ ├── SqkfqBaomingMapper.xml │ │ │ │ └── SqkfqUserMapper.xml │ │ │ ├── bean │ │ │ │ ├── SqjjBean.java │ │ │ │ └── BaoMingDetailBean.java │ │ │ ├── parses │ │ │ │ └── SqkfaUserParser.java │ │ │ ├── domain │ │ │ │ └── SqkfqBaoming.java │ │ │ └── manager │ │ │ │ └── UserInfoDetailManager.java │ │ │ ├── zhihu │ │ │ ├── entity │ │ │ │ ├── UserTest.java │ │ │ │ └── UserToken.java │ │ │ ├── dao │ │ │ │ ├── IUserDao.java │ │ │ │ ├── IUserTokenDao.java │ │ │ │ └── Impl │ │ │ │ │ ├── UserDaoImpl.java │ │ │ │ │ └── UserTokenDaoImpl.java │ │ │ ├── mapper │ │ │ │ ├── UserTokenMapper.java │ │ │ │ ├── UserMapper.java │ │ │ │ └── UserTokenMapper.xml │ │ │ ├── config │ │ │ │ └── ZhiHuConfig.java │ │ │ ├── ZhiHuHttpClient.java │ │ │ └── parsers │ │ │ │ └── ZhiHuUserParser.java │ │ │ ├── core │ │ │ ├── parser │ │ │ │ └── IPageParser.java │ │ │ ├── constants │ │ │ │ ├── ParserConstants.java │ │ │ │ └── ProxyConstants.java │ │ │ ├── util │ │ │ │ ├── HtmlParser.java │ │ │ │ ├── ProxyUtil.java │ │ │ │ ├── MyBatiesUtils.java │ │ │ │ └── MyIOutils.java │ │ │ ├── factory │ │ │ │ ├── C3P0DataSourceFactory.java │ │ │ │ └── ParserFactory.java │ │ │ ├── http │ │ │ │ ├── HttpResponseBean.java │ │ │ │ └── client │ │ │ │ │ └── BaseHttpClient.java │ │ │ └── entity │ │ │ │ ├── RequestParams.java │ │ │ │ └── Page.java │ │ │ ├── douban │ │ │ ├── mapper │ │ │ │ └── MoveMapper.java │ │ │ ├── dao │ │ │ │ ├── IMoveDao.java │ │ │ │ └── Impl │ │ │ │ │ └── MoveDaoImpl.java │ │ │ ├── entity │ │ │ │ ├── BaseInfo.java │ │ │ │ └── move │ │ │ │ │ └── MoveList.java │ │ │ ├── constants │ │ │ │ └── DBConstants.java │ │ │ ├── task │ │ │ │ ├── move │ │ │ │ │ ├── StartWithTypeTask.java │ │ │ │ │ └── SpiderWithTypeTask.java │ │ │ │ ├── SpiderDouBanInfo.java │ │ │ │ ├── 
DouBanDetailInfoDownLoadTask.java │ │ │ │ ├── AbstractTaskDeprecated.java │ │ │ │ └── DouBanInfoListPageTask.java │ │ │ ├── exectors │ │ │ │ └── ExecutorsPool.java │ │ │ ├── parsers │ │ │ │ └── move │ │ │ │ │ ├── MoveListParser.java │ │ │ │ │ ├── MoveParser.java │ │ │ │ │ ├── MoveDetailInfoParser.java │ │ │ │ │ └── MoveParserDeprecated.java │ │ │ ├── utils │ │ │ │ └── DBUtil.java │ │ │ └── DoubanHttpClient.java │ │ │ ├── exception │ │ │ └── SpiderRuntimeException.java │ │ │ ├── StartClass.java │ │ │ └── proxytool │ │ │ ├── parses │ │ │ ├── mimiip │ │ │ │ └── MimiipProxyListParser.java │ │ │ ├── xicidaili │ │ │ │ └── XicidailiProxyListParser.java │ │ │ ├── ip66 │ │ │ │ └── Ip66ProxyListParser.java │ │ │ ├── kuaidaili │ │ │ │ └── KuaidailiProxyListParser.java │ │ │ └── ip181 │ │ │ │ └── Ip181ProxyListParser.java │ │ │ ├── task │ │ │ ├── ProxySerializeTask.java │ │ │ ├── ProxyTestTask.java │ │ │ └── ProxyPageTask.java │ │ │ ├── ProxyPool.java │ │ │ └── entity │ │ │ └── Proxy.java │ └── test │ │ └── java │ │ └── com │ │ └── yao │ │ ├── test │ │ ├── sjmstest │ │ │ ├── ParentPrint.java │ │ │ ├── SonAPrint.java │ │ │ ├── SonBPrint.java │ │ │ ├── Main.java │ │ │ ├── PrintFactory.java │ │ │ └── Demo.java │ │ ├── AbstractTaskTest │ │ │ ├── TestAbstractTask.java │ │ │ ├── TestMain.java │ │ │ └── TestTask.java │ │ ├── extendtest │ │ │ ├── Father.java │ │ │ ├── SonTest.java │ │ │ └── MainFS.java │ │ ├── test │ │ │ ├── DeafaultInt.java │ │ │ ├── Main.java │ │ │ ├── OtherTest.java │ │ │ ├── ExceptionTest.java │ │ │ ├── ChuShiHuaTest.java │ │ │ ├── EncodeTest.java │ │ │ ├── ThreadRetryTest.java │ │ │ └── ListBL.java │ │ ├── logtest │ │ │ ├── AbstractLog.java │ │ │ └── SonLog.java │ │ ├── ContractTest.java │ │ ├── TryFinallYTest.java │ │ ├── parserTest │ │ │ ├── MoveDetailInfoParserTest.java │ │ │ └── ParserTest.java │ │ ├── serializable │ │ │ ├── ArraySerializable.java │ │ │ └── ProxySerializable.java │ │ └── doubantest │ │ │ └── GetDouBanPageTest.java │ │ ├── douban │ │ ├── 
douban │ │ │ ├── utils │ │ │ │ └── DBUtilTest.java │ │ │ └── task │ │ │ │ └── move │ │ │ │ └── SpiderWithTypeTaskTest.java │ │ └── core │ │ │ └── dao │ │ │ └── Impl │ │ │ └── BaseDaoImplTest.java │ │ ├── plan │ │ └── TrackingExecutor.java │ │ └── spider │ │ └── proxytool │ │ └── parses │ │ └── ip181 │ │ └── Ip181ProxyListParserTest.java └── image │ ├── spider01.PNG │ └── spider02.PNG ├── proxy.ser ├── 字幕下载.ljobx ├── .gitignore ├── 每日计划.txt └── zimuku.sql /douban-spider/src/main/resources/config.properties: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /douban-spider/src/main/resources/config/zhihu-config.properties: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /proxy.ser: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/proxy.ser -------------------------------------------------------------------------------- /字幕下载.ljobx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/字幕下载.ljobx -------------------------------------------------------------------------------- /douban-spider/src/main/resources/config/douban.properties: -------------------------------------------------------------------------------- 1 | douban.index_url = https://movie.douban.com 2 | -------------------------------------------------------------------------------- /douban-spider/image/spider01.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/douban-spider/image/spider01.PNG -------------------------------------------------------------------------------- 
/douban-spider/image/spider02.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/douban-spider/image/spider02.PNG -------------------------------------------------------------------------------- /douban-spider/src/main/resources/file/test.ser: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/douban-spider/src/main/resources/file/test.ser -------------------------------------------------------------------------------- /douban-spider/src/main/resources/file/proxy.ser: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/douban-spider/src/main/resources/file/proxy.ser -------------------------------------------------------------------------------- /douban-spider/src/main/resources/img/getauth.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/douban-spider/src/main/resources/img/getauth.PNG -------------------------------------------------------------------------------- /douban-spider/src/main/resources/img/movedata.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/douban-spider/src/main/resources/img/movedata.PNG -------------------------------------------------------------------------------- /douban-spider/src/main/resources/jdbc.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/douban-spider/src/main/resources/jdbc.properties -------------------------------------------------------------------------------- 
/douban-spider/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/douban-spider/src/main/resources/log4j.properties -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/common/intf/Composer.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.common.intf; 2 | 3 | public interface Composer{ 4 | T getComposerId(V v); 5 | } 6 | -------------------------------------------------------------------------------- /douban-spider/src/main/resources/config/common.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanyao19940801/douban-spider/HEAD/douban-spider/src/main/resources/config/common.properties -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/common/intf/IdInterface.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.common.intf; 2 | 3 | public interface IdInterface { 4 | 5 | public E getId(); 6 | } 7 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/ZimuMain.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku; 2 | 3 | /** 4 | * @create by 单耀 5 | * @create date 2020/3/25 6 | */ 7 | public class ZimuMain { 8 | } 9 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/T.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.sqkfq; 2 | 3 | /** 4 | * @author 单耀 5 | * @version 1.0 6 | * @description 7 | * @date 
2021/2/3 11:05 8 | */ 9 | public class T { 10 | } 11 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/sjmstest/ParentPrint.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.sjmstest; 2 | 3 | /** 4 | * Created by 单耀 on 2018/1/26. 5 | */ 6 | public interface ParentPrint { 7 | void print(String classname); 8 | } 9 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/AbstractTaskTest/TestAbstractTask.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.AbstractTaskTest; 2 | 3 | public abstract class TestAbstractTask implements Runnable { 4 | public void run() { 5 | 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/AbstractTaskTest/TestMain.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.AbstractTaskTest; 2 | 3 | public class TestMain { 4 | public static void main(String[] args) { 5 | TestTask testTask = new TestTask(); 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/extendtest/Father.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.extendtest; 2 | 3 | /** 4 | * Created by shanyao on 2018/5/9. 
5 | */ 6 | public class Father { 7 | public void test1() { 8 | System.out.println("fa"); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/entity/UserTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zhihu.entity; 2 | 3 | /** 4 | * Created by user on 2018/3/29. 5 | */ 6 | public class UserTest { 7 | private String following_count; 8 | private String locations; 9 | 10 | } 11 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/parser/IPageParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.parser; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * Created by 单耀 on 2018/1/30. 7 | */ 8 | public interface IPageParser { 9 | public List parser(String html); 10 | } 11 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/dao/IUserDao.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zhihu.dao; 2 | 3 | import com.yao.spider.zhihu.entity.User; 4 | 5 | /** 6 | * Created by shanyao on 2018/3/29. 7 | */ 8 | public interface IUserDao { 9 | int inserSelective(User user); 10 | } 11 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/extendtest/SonTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.extendtest; 2 | 3 | /** 4 | * Created by shanyao on 2018/5/9. 
5 | */ 6 | public class SonTest extends Father { 7 | public void test2() { 8 | System.out.println("son"); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/dao/ZimuFileInfoMapper.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.dao; 2 | 3 | import com.yao.spider.zimuku.domain.ZimuFileInfo; 4 | 5 | public interface ZimuFileInfoMapper { 6 | int insert(ZimuFileInfo record); 7 | 8 | ZimuFileInfo selectByPrimaryKey(Long id); 9 | } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/constants/ParserConstants.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.constants; 2 | 3 | public class ParserConstants { 4 | /** 5 | * 解析页面返回数据列表的类型 6 | */ 7 | public static String MOVE_LIST = "MOVE_LIST"; 8 | public static String MOVE_DETAIL = "MOVE_DETAIL"; 9 | 10 | } 11 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/common/dao/IBaseDao.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.common.dao; 2 | 3 | import org.apache.ibatis.session.SqlSession; 4 | 5 | /** 6 | * Created by shanyao on 2018/3/10. 7 | */ 8 | public interface IBaseDao { 9 | 10 | public SqlSession getSession(); 11 | 12 | public void insert(T extity); 13 | } 14 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/sjmstest/SonAPrint.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.sjmstest; 2 | 3 | /** 4 | * Created by 单耀 on 2018/1/26. 
5 | */ 6 | public class SonAPrint implements ParentPrint{ 7 | public void print(String classname) { 8 | System.out.println("Clas A"); 9 | System.out.println(classname); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/sjmstest/SonBPrint.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.sjmstest; 2 | 3 | /** 4 | * Created by 单耀 on 2018/1/26. 5 | */ 6 | public class SonBPrint implements ParentPrint { 7 | public void print(String classname) { 8 | System.out.println("Class B"); 9 | System.out.println(classname); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/common/intf/Condition.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.common.intf; 2 | 3 | public interface Condition{ 4 | public static final Condition TRUE = new Condition() { 5 | public boolean match(Object o) { 6 | return true; 7 | } 8 | }; 9 | 10 | boolean match(T t); 11 | } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/common/exception/ConvertException.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.common.exception; public class ConvertException extends RuntimeException { private static final long serialVersionUID = 969400608685990299L; public ConvertException(String msg) { super(msg); } public ConvertException(String msg, Exception ex) { super(msg, ex); } } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/dao/ZimuInfoMapper.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.dao; 2 | 3 | import 
com.yao.spider.zimuku.domain.ZimuInfo; 4 | 5 | import java.util.List; 6 | 7 | public interface ZimuInfoMapper { 8 | int insert(ZimuInfo record); 9 | 10 | ZimuInfo selectByPrimaryKey(Long id); 11 | 12 | int batchInsert(List zimuInfoList); 13 | } -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/test/DeafaultInt.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.test; 2 | 3 | /** 4 | * Created by 单耀 on 2018/2/8. 5 | */ 6 | public class DeafaultInt { 7 | private int defaultV; 8 | 9 | public static void main(String[] args) { 10 | DeafaultInt deafaultInt = new DeafaultInt(); 11 | System.out.println(deafaultInt.defaultV); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/test/Main.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.test; 2 | 3 | import com.yao.spider.zhihu.task.ZhiHuUserListTask; 4 | 5 | /** 6 | * Created by 单耀 on 2018/2/6. 
7 | */ 8 | public class Main { 9 | public static void main(String[] args) { 10 | 11 | ZhiHuUserListTask task = new ZhiHuUserListTask("",true); 12 | // task.test(); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/service/SqkfqUserService.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.sqkfq.service; 2 | 3 | import com.yao.spider.sqkfq.domain.SqkfqUser; 4 | import org.apache.ibatis.session.SqlSession; 5 | 6 | public interface SqkfqUserService { 7 | void insert(SqlSession session, SqkfqUser parser); 8 | 9 | void updateJiguan(SqlSession session1, String jiguan, Long userMind); 10 | } 11 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/mapper/UserTokenMapper.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zhihu.mapper; 2 | 3 | import com.yao.spider.zhihu.entity.UserToken; 4 | 5 | public interface UserTokenMapper { 6 | int deleteByPrimaryKey(String userToken); 7 | 8 | int insert(UserToken record); 9 | 10 | int insertSelective(UserToken record); 11 | 12 | UserToken selectByPrimaryKey(String userToken); 13 | } -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/logtest/AbstractLog.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.logtest; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | /** 7 | * Created by 单耀 on 2018/2/8. 
8 | */ 9 | public abstract class AbstractLog { 10 | private static Logger logger = LoggerFactory.getLogger(AbstractLog.class); 11 | 12 | public void printLog(){ 13 | logger.info("parent"); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/dao/IUserTokenDao.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zhihu.dao; 2 | 3 | import com.yao.spider.zhihu.entity.UserToken; 4 | 5 | /** 6 | * Created by user on 2018/4/2. 7 | */ 8 | public interface IUserTokenDao { 9 | void insertSelective(UserToken userToken); 10 | 11 | public UserToken selectByPrimaryKey(String userToken); 12 | 13 | boolean judgeAndInsert(UserToken userToken); 14 | } 15 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/test/OtherTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.test; 2 | 3 | /** 4 | * Created by user on 2018/2/8. 
5 | */ 6 | public class OtherTest { 7 | public static void main(String[] args) { 8 | int i = 534; 9 | for (int persent = 100; persent > 0; persent -= 10) { 10 | System.out.println(persent); 11 | System.out.println(persent - 10); 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/dao/SqkfqUserMapper.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.sqkfq.dao; 2 | 3 | import com.yao.spider.sqkfq.domain.SqkfqUser; 4 | import org.apache.ibatis.annotations.Param; 5 | 6 | public interface SqkfqUserMapper { 7 | 8 | int insert(SqkfqUser record); 9 | 10 | SqkfqUser selectByPrimaryKey(Long id); 11 | 12 | void updateJiguan(@Param("jiguan") String jiguan, @Param("userMid") Long userMid); 13 | } -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/AbstractTaskTest/TestTask.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.AbstractTaskTest; 2 | 3 | import com.yao.spider.core.entity.Page; 4 | import com.yao.spider.core.task.AbstractTask; 5 | 6 | public class TestTask extends TestAbstractTask { 7 | 8 | public void retry() { 9 | 10 | } 11 | 12 | public void handle(Page page) { 13 | 14 | } 15 | 16 | public void run() { 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/sjmstest/Main.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.sjmstest; 2 | 3 | /** 4 | * Created by 单耀 on 2018/1/26. 
5 | */ 6 | public class Main { 7 | public static void main(String[] args) { 8 | ParentPrint print = PrintFactory.getParentPrint(SonAPrint.class); 9 | print.print("A"); 10 | 11 | ParentPrint print1 = PrintFactory.getParentPrint(SonBPrint.class); 12 | print.print("B"); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/service/ZimuFileInfoService.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.service; 2 | 3 | import com.yao.spider.zimuku.domain.ZimuFileInfo; 4 | import org.apache.ibatis.session.SqlSession; 5 | 6 | /** 7 | * Created by shanyao on 2020/3/28 8 | */ 9 | public interface ZimuFileInfoService { 10 | void insert(ZimuFileInfo fileInfo, SqlSession session); 11 | 12 | void isnert(ZimuFileInfo zimuInfo, SqlSession session); 13 | } 14 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/logtest/SonLog.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.logtest; 2 | 3 | import org.junit.Test; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | /** 8 | * Created by 单耀 on 2018/2/8. 
9 | */ 10 | public class SonLog extends AbstractLog { 11 | private static Logger logger= LoggerFactory.getLogger(SonLog.class); 12 | @Test 13 | public void printLogSon() { 14 | logger.info("son"); 15 | printLog(); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/util/HtmlParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.util; 2 | 3 | import com.yao.spider.core.factory.ParserFactory; 4 | import com.yao.spider.core.parser.IPageParser; 5 | 6 | import java.util.List; 7 | 8 | public class HtmlParser { 9 | IPageParser pageParser; 10 | 11 | public List parser(String html,String type) { 12 | pageParser = ParserFactory.getParserByProductType(type); 13 | return pageParser.parser(html); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/mapper/MoveMapper.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.mapper; 2 | 3 | 4 | import com.yao.spider.douban.entity.move.Move; 5 | 6 | public interface MoveMapper { 7 | int deleteByPrimaryKey(String id); 8 | 9 | int insert(Move record); 10 | 11 | int insertSelective(Move record); 12 | 13 | Move selectByPrimaryKey(String id); 14 | 15 | int updateByPrimaryKeySelective(Move record); 16 | 17 | int updateByPrimaryKey(Move record); 18 | } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/factory/C3P0DataSourceFactory.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.factory; 2 | 3 | import com.mchange.v2.c3p0.ComboPooledDataSource; 4 | import org.apache.ibatis.datasource.unpooled.UnpooledDataSourceFactory; 5 | 6 | /** 7 | * Created by shanyao on 
2018/3/10. 8 | */ 9 | public class C3P0DataSourceFactory extends UnpooledDataSourceFactory { 10 | public C3P0DataSourceFactory() { 11 | this.dataSource = new ComboPooledDataSource(); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/service/ZimuInfoService.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.service; 2 | 3 | import com.yao.spider.core.util.MyBatiesUtils; 4 | import com.yao.spider.zimuku.domain.ZimuInfo; 5 | import org.apache.ibatis.session.SqlSession; 6 | 7 | import java.util.List; 8 | 9 | public interface ZimuInfoService { 10 | public void batchInsert(List zimuInfoList, SqlSession session); 11 | 12 | public void isnert(ZimuInfo zimuInfo, SqlSession session); 13 | } 14 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/service/ZimuHtmlService.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.service; 2 | 3 | import com.yao.spider.zimuku.domain.ZimuHtml; 4 | import org.apache.ibatis.session.SqlSession; 5 | 6 | import java.util.List; 7 | 8 | public interface ZimuHtmlService { 9 | public void insert(ZimuHtml zimuHtml, SqlSession session); 10 | 11 | Long selectMaxId(SqlSession session); 12 | 13 | List selectByRange(Long startId, Long endId, SqlSession session); 14 | } 15 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/dao/IMoveDao.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.dao; 2 | 3 | 4 | import com.yao.spider.douban.entity.move.Move; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * Created by shanyao on 2018/3/10. 
10 | */ 11 | public interface IMoveDao{ 12 | public void insert(Move move); 13 | 14 | void inserSelective(Move move); 15 | 16 | void insertList(List moveList); 17 | 18 | Move selectByPrimaryKey(String id); 19 | 20 | void update(Move move); 21 | } 22 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/mapper/UserMapper.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zhihu.mapper; 2 | 3 | import com.yao.spider.zhihu.entity.User; 4 | 5 | public interface UserMapper { 6 | 7 | int deleteByPrimaryKey(String userToken); 8 | 9 | int insert(User record); 10 | 11 | int insertSelective(User record); 12 | 13 | 14 | User selectByPrimaryKey(String userToken); 15 | 16 | 17 | 18 | int updateByPrimaryKeySelective(User record); 19 | 20 | int updateByPrimaryKey(User record); 21 | } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/entity/BaseInfo.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.entity; 2 | 3 | /** 4 | * Created by 单耀 on 2018/1/30. 
5 | */ 6 | public class BaseInfo { 7 | //TODO 8 | private String id; 9 | private String rate; 10 | private String title; 11 | private String url; 12 | 13 | public BaseInfo(String id, String rate, String title, String url) { 14 | this.id = id; 15 | this.rate = rate; 16 | this.title = title; 17 | this.url = url; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/entity/UserToken.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zhihu.entity; 2 | 3 | public class UserToken { 4 | private String userToken; 5 | 6 | public UserToken() {} 7 | 8 | public UserToken(String userToken) { 9 | this.userToken = userToken; 10 | } 11 | 12 | public String getUserToken() { 13 | return userToken; 14 | } 15 | 16 | public void setUserToken(String userToken) { 17 | this.userToken = userToken == null ? null : userToken.trim(); 18 | } 19 | } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/util/ProxyUtil.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.util; 2 | 3 | 4 | import com.yao.spider.proxytool.entity.Proxy; 5 | 6 | public class ProxyUtil { 7 | /** 8 | * Decides whether a proxy should be discarded. 9 | * A proxy is discarded once its failure count reaches 3 or more. 10 | * 11 | * @param proxy the proxy whose failure count is inspected 12 | * @return {@code true} if {@code proxy.getFailureTimes()} is at least 3 13 | */ 14 | public static boolean isDiscardProxy(Proxy proxy){ 15 | return proxy.getFailureTimes() >= 3; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/dao/SqkfqBaomingMapper.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.sqkfq.dao; 2 | 3 | import com.yao.spider.sqkfq.domain.SqkfqBaoming; 4 | import 
org.apache.ibatis.annotations.Param; 5 | 6 | import java.util.List; 7 | 8 | public interface SqkfqBaomingMapper { 9 | int insert(SqkfqBaoming record); 10 | 11 | SqkfqBaoming selectByPrimaryKey(Long id); 12 | 13 | List selectAll(); 14 | 15 | List selectByZipCodeAndOpt(@Param("code") Integer code, @Param("opt") Long opt); 16 | } -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/test/ExceptionTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.test; 2 | 3 | /** 4 | * Created by 单耀 on 2018/1/27. 5 | */ 6 | public class ExceptionTest { 7 | public static void main(String[] args) { 8 | ExceptionTest test = new ExceptionTest(); 9 | try { 10 | test.method(); 11 | } catch (Exception e) { 12 | e.printStackTrace(); 13 | } 14 | } 15 | 16 | public void method() throws Exception { 17 | int i =1; 18 | float j = i/0; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | zhihu-spider/.idea/libraries/Maven__org_jsoup_jsoup_1_7_2.xml 2 | *.class 3 | douban-spider/.idea/ 4 | douban-spider/target/classes 5 | douban-spider/douban-spider.iml 6 | douban-spider/src/main/java/com.yao/spider/log/ 7 | douban-spider/src/main/java/com/yao/spider/log/ 8 | douban-spider/src/main/java/com.yao/douban/log/info.log 9 | logs/spiderweb/info.log_ 2018-05-12-22 10 | logs/spiderweb/info.log_ 2018-08-12-16 11 | logs/spiderweb/info.log_ 2018-06-03-22 12 | logs/spiderweb/info.log_ 2018-05-12-23 13 | logs/spiderweb/info.log_ 2018-08-12-17 14 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/service/SqkfqBaomingService.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.sqkfq.service; 2 | 3 | 
import com.yao.spider.sqkfq.domain.SqkfqBaoming; 4 | import org.apache.ibatis.session.SqlSession; 5 | 6 | import java.util.List; 7 | 8 | public interface SqkfqBaomingService { 9 | public void insert(SqkfqBaoming sqkfqBaoming, SqlSession session); 10 | 11 | public List selectAll(SqlSession session); 12 | 13 | public List selectByZipCodeAndOpt(SqlSession session, Integer code, Long opt); 14 | } 15 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/dao/ZimuHtmlMapper.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.dao; 2 | 3 | import com.yao.spider.zimuku.domain.ZimuHtml; 4 | import org.apache.ibatis.annotations.Param; 5 | import org.springframework.stereotype.Repository; 6 | 7 | import java.util.List; 8 | 9 | @Repository 10 | public interface ZimuHtmlMapper { 11 | int insert(ZimuHtml record); 12 | 13 | ZimuHtml selectByPrimaryKey(Long id); 14 | 15 | ZimuHtml selectMax(); 16 | 17 | List selectByRange(@Param("startId") Long startId, @Param("endId") Long endId); 18 | 19 | } -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/sjmstest/PrintFactory.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.sjmstest; 2 | 3 | /** 4 | * Created by 单耀 on 2018/1/26. 
5 | */ 6 | public class PrintFactory { 7 | public static ParentPrint getParentPrint(Class clazz) { 8 | try { 9 | ParentPrint print = (ParentPrint) clazz.newInstance(); 10 | return print; 11 | } catch (InstantiationException e) { 12 | e.printStackTrace(); 13 | } catch (IllegalAccessException e) { 14 | e.printStackTrace(); 15 | } 16 | return null; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/douban/douban/utils/DBUtilTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.douban.douban.utils; 2 | 3 | import com.yao.TestConsants; 4 | import com.yao.spider.douban.utils.DBUtil; 5 | import junit.framework.TestCase; 6 | import org.junit.Test; 7 | 8 | /** 9 | * Created by user on 2018/2/8. 10 | */ 11 | public class DBUtilTest extends TestCase { 12 | @Test 13 | public void testGetTypeList() throws Exception { 14 | DBUtil.getType("move"); 15 | } 16 | 17 | public void testGetTypeMap() throws Exception { 18 | 19 | DBUtil.getTypeMap("move", TestConsants.context); 20 | } 21 | } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/http/HttpResponseBean.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.http; 2 | 3 | /** 4 | * Created by xuya on 2018/2/13. 
/**
 * Plain holder for the outcome of an HTTP call: a status code and the
 * response body as text. Both values are {@code null} until set.
 */
public class HttpResponseBean {

    /** Response body text. */
    private String response;

    /** HTTP status code. */
    private Integer status;

    /** @return the stored response body, or null if none was set */
    public String getResponse() {
        return this.response;
    }

    /** @param response response body to store */
    public void setResponse(String response) {
        this.response = response;
    }

    /** @return the stored status code, or null if none was set */
    public Integer getStatus() {
        return this.status;
    }

    /** @param status status code to store */
    public void setStatus(Integer status) {
        this.status = status;
    }
}
| } -------------------------------------------------------------------------------- /每日计划.txt: -------------------------------------------------------------------------------- 1 | 豆瓣爬虫项目 2 | 2018-1-25计划 3 | 1.查询如何将maven下载的包转移到lib文件夹中 4 | 2.开始项目搭建 5 | 2018-1-29 6 | 0.测试代理是否可以下载 7 | 1.完成代理下载模块 8 | 2.开始编写豆瓣电影 9 | 3.将代理序列化到本地 10 | 4.config文件 11 | 5.parser要和代理的一样可拓展 12 | 2018-1-30 13 | 0.测试代理是否可以下载√ 14 | 1.完成代理下载模块√ 15 | 2.开始编写豆瓣电影 16 | 3.将代理序列化到本地 17 | 4.config文件 18 | 5.parser要和代理的一样可拓展√ 19 | 6.看看两个Test有什么区别√ 20 | 2018-1-31 21 | 1.豆瓣的parser要和代理的一样可拓展(parser的返回是否能用泛型?)√ 22 | 2.完成电影信息列表的parser√ 23 | 3.完成电影详细信息的parser√ 24 | 2018-2-6 25 | 1.完成详细信息下载任务√ 26 | 2.完成数据库连接池配置 27 | 3.完成数据库表的建立 28 | 29 | 其他的一些想法 30 | 1.想一想可拓展性,比如可以下载其他信息,如音乐,书籍.. 31 | 2.最好可以通过修改一个配置就可以改变下载的信息 32 | 3.用户可以自己定制下载内容 33 | 4.将代理序列化到本地 34 | 5.config文件 -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/service/ZimuInfoExtendService.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.service; 2 | 3 | import com.yao.spider.zimuku.domain.ZimuInfoExtend; 4 | import org.apache.ibatis.session.SqlSession; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * Created by shanyao on 2020/3/28 10 | */ 11 | public interface ZimuInfoExtendService { 12 | void insert(ZimuInfoExtend zimuInfoExtend, SqlSession session); 13 | 14 | Long selectMaxId(SqlSession session); 15 | 16 | List selectByRange(Long startId, Long endId, SqlSession session); 17 | 18 | void update(ZimuInfoExtend builderZimuInfoExtend, SqlSession session); 19 | } 20 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/ContractTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.test; 2 | 3 | import com.yao.spider.core.task.AbstractTask; 4 | import
com.yao.spider.zhihu.task.ZhiHuUserListTask; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | /** 9 | * Created by shanyao on 2018/5/10. 10 | */ 11 | public class ContractTest { 12 | static Logger logger = LoggerFactory.getLogger(ContractTest.class); 13 | public static void main(String[] args) { 14 | // ZhiHuUserListTask abstractTask = new ZhiHuUserListTask("test",false); 15 | ContractTest test = new ContractTest(); 16 | } 17 | 18 | static { 19 | System.out.println("1"); 20 | } 21 | 22 | 23 | } 24 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/exception/SpiderRuntimeException.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.exception; 2 | 3 | import org.mule.config.i18n.Message; 4 | 5 | /** 6 | * Created by shanyao on 2020/3/29 7 | */ 8 | public class SpiderRuntimeException extends RuntimeException { 9 | private static final long serialVersionUID = 6728041560892553159L; 10 | 11 | public SpiderRuntimeException(Message message) { 12 | super(message.getMessage()); 13 | } 14 | 15 | public SpiderRuntimeException(Message message, Throwable cause) { 16 | super(message.getMessage(), cause); 17 | } 18 | 19 | public SpiderRuntimeException(Throwable cause) { 20 | super(cause); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/constants/DBConstants.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.constants; 2 | 3 | /** 4 | * Created by user on 2018/2/8. 
/**
 * URL templates and start parameters for the Douban movie crawl. The
 * templates are {@link String#format} patterns.
 */
public class DBConstants {
    // Type name and type value substituted into MOVE_START_URL_TYPE for the first request.
    public static String MOVE_START_TYPE_NAME = "惊悚片";
    public static String MOVE_START_TYPE_VALUE = "10";

    // Page listing all popular movie tags; placeholders: type name, type value.
    public static String MOVE_START_URL_TYPE = "https://movie.douban.com/typerank?type_name=%s&type=%s&interval_id=90:80&action=";

    // Movie list for a popular tag; placeholders: type, two interval bounds, start offset.
    public static String MOVE_TOP_LIST_URL = "https://movie.douban.com/j/chart/top_list?type=%s&interval_id=%d:%d&action=&start=%d&limit=20";

    // Number of entries for a popular tag within a percentile range; placeholders: type, two interval bounds.
    public static String MOVE_PERSENT_COUNT_URL = "https://movie.douban.com/j/chart/top_list_count?type=%s&interval_id=%d:%d";
}
10 | */ 11 | public class StartWithTypeTask implements Runnable{ 12 | private static DoubanHttpClient doubanHttpClient = DoubanHttpClient.getInstance(); 13 | private static volatile boolean SUCCESS = true; 14 | public void run() { 15 | Map mapTpe = DBUtil.getType("move"); 16 | for (String name : mapTpe.keySet()) { 17 | doubanHttpClient.getDownLoadMoveListExector().execute(new SpiderWithTypeTask(name, mapTpe.get(name), true)); 18 | } 19 | } 20 | 21 | 22 | } 23 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/common/constants/Constants.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.common.constants; 2 | 3 | /** 4 | * Created by 单耀 on 2018/1/25. 5 | */ 6 | public class Constants { 7 | public static final String STANDARD = "standard"; 8 | public static int SocketTimeout = 5000; 9 | public static int ConnectionTimeout = 5000; 10 | public static int TIMEOUT = 10000; 11 | 12 | public static String STRTY_URL_MOVE = "https://movie.douban.com/j/search_subjects?type=%s&tag=%s&sort=rank&page_limit=%d&page_start=%d"; 13 | //每次查询信息条数 14 | public static int LIMIT = 20; 15 | //查询条件 16 | public static String TAG = "豆瓣高分"; 17 | //信息类别 18 | public static String TYPE = "move"; 19 | //是否进行深度爬虫 20 | public static boolean ISDEEP = false; 21 | public static boolean ISCONTINUE = true; 22 | 23 | } 24 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/TryFinallYTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.test; 2 | 3 | /** 4 | * Created by user on 2018/2/8. 
/**
 * Demonstrates that a {@code return} inside {@code finally} overrides the
 * value returned from the {@code try} or {@code catch} block.
 */
public class TryFinallYTest {
    /** Prints the value produced by {@link #test()}. */
    public static void main(String[] args) {
        String outcome = test();
        System.out.println(outcome);
    }

    /**
     * Calls {@link #testException()}, which always throws, so the catch
     * branch prints "2" and the stack trace; the finally branch then prints
     * "3" and its return replaces the pending "2".
     *
     * @return always "3"
     */
    public static String test() {
        try {
            testException();
            System.out.println("1");
            return "1";
        } catch (Exception caught) {
            System.out.println("2");
            caught.printStackTrace();
            return "2";
        } finally {
            System.out.println("3");
            // Intentional: this return is the behaviour being demonstrated.
            return "3";
        }
    }

    /**
     * Always fails.
     *
     * @throws Exception with the message "test"
     */
    public static void testException() throws Exception {
        throw new Exception("test");
    }
}
com.yao.spider.core.http.client.BaseHttpClient; 6 | import com.yao.spider.core.parser.IPageParser; 7 | import com.yao.spider.douban.DoubanHttpClient; 8 | import com.yao.spider.douban.parsers.move.MoveDetailInfoParser; 9 | import org.junit.Test; 10 | 11 | /** 12 | * Created by shanyao on 2018/3/14. 13 | */ 14 | public class MoveDetailInfoParserTest { 15 | @Test 16 | public void parser() throws Exception { 17 | String url = "https://movie.douban.com/subject/26346327/"; 18 | IPageParser parser = ParserFactory.getParserClass(MoveDetailInfoParser.class); 19 | Page page = BaseHttpClient.getInstance().getPage(url); 20 | parser.parser(page.getHtml()); 21 | } 22 | 23 | } -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/douban/douban/task/move/SpiderWithTypeTaskTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.douban.douban.task.move; 2 | 3 | import com.yao.TestConsants; 4 | import com.yao.spider.douban.task.AbstractTaskDeprecated; 5 | import com.yao.spider.douban.task.move.SpiderWithTypeTask; 6 | import com.yao.spider.douban.utils.DBUtil; 7 | import junit.framework.TestCase; 8 | 9 | import java.util.Map; 10 | 11 | /** 12 | * Created by user on 2018/2/8. 
/**
 * Loads the feature switches from {@code /config/common.properties} on the
 * classpath once, when the class is initialised.
 *
 * <p>Loading is best-effort: if the resource is missing or unreadable the
 * problem is reported on stderr and every switch keeps its default
 * {@code false}, so class initialisation never fails.
 */
public class CommonConfig {
    private static final String CONFIG_RESOURCE = "/config/common.properties";

    // Whether results are saved to the database (property "db.enable").
    public static boolean dbEnable;
    // Whether Zhihu is crawled (property "future.zhihu").
    public static boolean FUTURE_ZHIHU;
    // Whether Douban is crawled (property "future.douban").
    public static boolean FUTURE_DOUBAN;

    static {
        // try-with-resources closes the stream; a null resource is simply skipped on close.
        try (java.io.InputStream in = CommonConfig.class.getResourceAsStream(CONFIG_RESOURCE)) {
            if (in == null) {
                // getResourceAsStream returns null for a missing resource; Properties.load(null) would throw.
                System.err.println("CommonConfig: resource " + CONFIG_RESOURCE
                        + " not found on classpath; all switches stay false");
            } else {
                Properties p = new Properties();
                p.load(in);
                dbEnable = Boolean.parseBoolean(p.getProperty("db.enable"));
                FUTURE_DOUBAN = Boolean.parseBoolean(p.getProperty("future.douban"));
                FUTURE_ZHIHU = Boolean.parseBoolean(p.getProperty("future.zhihu"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
# Table-creation SQL for the Douban movie crawler.
# One row per movie, keyed by `id`; every other column is nullable.
# Apart from the three numeric columns at the end, all values are stored as text.
CREATE TABLE `move` (
  `id` varchar(8) NOT NULL,
  `name` varchar(200) DEFAULT NULL,
  `url` varchar(100) DEFAULT NULL,
  `othername` varchar(100) DEFAULT NULL COMMENT '又名',
  `director` varchar(50) DEFAULT NULL COMMENT '导演',
  `screenwriter` varchar(50) DEFAULT NULL COMMENT '编剧',
  `mainaactors` varchar(500) DEFAULT NULL COMMENT '主演',
  `type` varchar(10) DEFAULT NULL COMMENT '类型',
  `region` varchar(50) DEFAULT NULL COMMENT '制片地区',
  `language` varchar(20) DEFAULT NULL COMMENT '语言',
  `showdate` varchar(10) DEFAULT NULL COMMENT '上市日期',
  `runtime` varchar(5) DEFAULT NULL COMMENT '片长',
  `imdb` varchar(100) DEFAULT NULL COMMENT 'imdb链接',
  # NOTE(review): the two count-like columns below are declared double, same as the score — confirm this is intended.
  `score` double DEFAULT NULL COMMENT '豆瓣评分',
  `votecount` double DEFAULT NULL COMMENT '评价人数',
  `evaluationTotal` double DEFAULT NULL COMMENT '评价总数',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.exectors; 2 | 3 | import java.util.concurrent.ThreadPoolExecutor; 4 | 5 | /** 6 | * Created by shanyao on 2018/5/13. 7 | */ 8 | public class ExecutorsPool { //TODO 这样写好像不行? 9 | 10 | private ThreadPoolExecutor downLoadMoveListExector; 11 | private ThreadPoolExecutor downLoadMoveInfoExector; 12 | 13 | public ThreadPoolExecutor getDownLoadMoveListExector() { 14 | return downLoadMoveListExector; 15 | } 16 | 17 | public void setDownLoadMoveListExector(ThreadPoolExecutor downLoadMoveListExector) { 18 | this.downLoadMoveListExector = downLoadMoveListExector; 19 | } 20 | 21 | public ThreadPoolExecutor getDownLoadMoveInfoExector() { 22 | return downLoadMoveInfoExector; 23 | } 24 | 25 | public void setDownLoadMoveInfoExector(ThreadPoolExecutor downLoadMoveInfoExector) { 26 | this.downLoadMoveInfoExector = downLoadMoveInfoExector; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/entity/RequestParams.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.entity; 2 | 3 | import com.yao.spider.proxytool.entity.Proxy; 4 | 5 | import java.util.concurrent.atomic.AtomicInteger; 6 | 7 | /** 8 | * Created by shanyao on 2018/5/10. 
9 | */ 10 | @Deprecated 11 | public class RequestParams { 12 | private String url; 13 | private boolean isUserProxy; 14 | private AtomicInteger retryTimes; 15 | 16 | public String getUrl() { 17 | return url; 18 | } 19 | 20 | public void setUrl(String url) { 21 | this.url = url; 22 | } 23 | 24 | public boolean isUserProxy() { 25 | return isUserProxy; 26 | } 27 | 28 | public void setUserProxy(boolean userProxy) { 29 | isUserProxy = userProxy; 30 | } 31 | 32 | 33 | public AtomicInteger getRetryTimes() { 34 | return retryTimes; 35 | } 36 | 37 | public void setRetryTimes(AtomicInteger retryTimes) { 38 | this.retryTimes = retryTimes; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/service/SqkfqUserServiceImpl.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.sqkfq.service; 2 | 3 | import com.yao.spider.sqkfq.dao.SqkfqUserMapper; 4 | import com.yao.spider.sqkfq.domain.SqkfqUser; 5 | import org.apache.ibatis.session.SqlSession; 6 | 7 | /** 8 | * @author 单耀 9 | * @version 1.0 10 | * @description 11 | * @date 2021/2/3 18:04 12 | */ 13 | public class SqkfqUserServiceImpl implements SqkfqUserService { 14 | @Override 15 | public void insert(SqlSession session, SqkfqUser user) { 16 | SqkfqUserMapper mapper = session.getMapper(SqkfqUserMapper.class); 17 | mapper.insert(user); 18 | session.commit(); 19 | session.close(); 20 | } 21 | 22 | @Override 23 | public void updateJiguan(SqlSession session, String jiguan, Long userMind) { 24 | SqkfqUserMapper mapper = session.getMapper(SqkfqUserMapper.class); 25 | mapper.updateJiguan(jiguan, userMind); 26 | session.commit(); 27 | session.close(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/serializable/ArraySerializable.java: 
-------------------------------------------------------------------------------- 1 | package com.yao.test.serializable; 2 | 3 | import com.yao.spider.core.constants.ProxyConstants; 4 | import com.yao.spider.core.util.MyIOutils; 5 | import com.yao.spider.proxytool.ProxyPool; 6 | import com.yao.spider.proxytool.entity.Proxy; 7 | 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import java.util.concurrent.DelayQueue; 11 | 12 | /** 13 | * Created by shanyao on 2018/3/26. 14 | * 关于ArrayList内部实这个讲的很不错 15 | * http://www.importnew.com/18024.html 16 | */ 17 | public class ArraySerializable { 18 | public static void main(String[] args) { 19 | List list = new ArrayList(); 20 | Proxy proxy = new Proxy("1",1); 21 | list.add(proxy); 22 | MyIOutils.serializeObject(list,"test.ser"); 23 | 24 | List proxyList = (List) MyIOutils.deserializeObject(ProxyConstants.PROXYSER_FILE_NMAE); 25 | ProxyPool.proxyQueue = new DelayQueue(proxyList); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/douban/core/dao/Impl/BaseDaoImplTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.douban.core.dao.Impl; 2 | 3 | import com.yao.spider.douban.dao.IMoveDao; 4 | import com.yao.spider.douban.dao.Impl.MoveDaoImpl; 5 | import com.yao.spider.douban.entity.move.Move; 6 | import org.junit.Test; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | /** 11 | * Created by shanyao on 2018/3/10. 
12 | */ 13 | public class BaseDaoImplTest { 14 | private static Logger logger = LoggerFactory.getLogger(BaseDaoImplTest.class); 15 | @Test 16 | public void insert() throws Exception { 17 | logger.info("test"); 18 | Move move = new Move(); 19 | move.setId("111"); 20 | IMoveDao dao = new MoveDaoImpl(); 21 | dao.insert(move); 22 | } 23 | 24 | @Test 25 | public void insertSelective() throws Exception { 26 | Move move = new Move(); 27 | move.setId("001"); 28 | IMoveDao dao = new MoveDaoImpl(); 29 | dao.inserSelective(move); 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/StartClass.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider; 2 | 3 | import com.yao.spider.common.config.CommonConfig; 4 | import com.yao.spider.douban.DoubanHttpClient; 5 | import com.yao.spider.proxytool.ProxyHttpClient; 6 | import com.yao.spider.zhihu.ZhiHuHttpClient; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | /** 11 | * Created by 单耀 on 2018/1/24. 
/**
 * Response envelope bean with four fields: code, message, count and a data
 * list. All fields are {@code null} until set.
 *
 * @author 单耀
 * @version 1.0
 * @date 2021/2/3 10:29
 */
public class SqjjBean {
    private List data;
    private Integer count;
    private String msg;
    private Integer code;

    /** @return the data list, or null if none was set */
    public List getData() {
        return this.data;
    }

    /** @param data data list to store (kept by reference) */
    public void setData(List data) {
        this.data = data;
    }

    /** @return the count, or null if none was set */
    public Integer getCount() {
        return this.count;
    }

    /** @param count count to store */
    public void setCount(Integer count) {
        this.count = count;
    }

    /** @return the message, or null if none was set */
    public String getMsg() {
        return this.msg;
    }

    /** @param msg message to store */
    public void setMsg(String msg) {
        this.msg = msg;
    }

    /** @return the code, or null if none was set */
    public Integer getCode() {
        return this.code;
    }

    /** @param code code to store */
    public void setCode(Integer code) {
        this.code = code;
    }
}
com.yao.spider.douban.entity.move.MoveList; 4 | import com.yao.spider.core.parser.IPageParser; 5 | import net.sf.json.JSONArray; 6 | import net.sf.json.JSONObject; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | import java.util.List; 11 | 12 | /** 13 | * Created by 单耀 on 2018/2/4. 14 | */ 15 | @Deprecated 16 | public class MoveListParser implements IPageParser { 17 | private static Logger logger = LoggerFactory.getLogger(MoveListParser.class); 18 | public List parser(String html) { 19 | try { 20 | JSONObject jsonObject = JSONObject.fromObject(html); 21 | JSONArray jsonArray = jsonObject.getJSONArray("subjects"); 22 | List move1s = (List) JSONArray.toCollection(jsonArray, MoveList.class); 23 | return move1s; 24 | } catch (Exception e) { 25 | logger.error(e.getMessage(), e); 26 | } 27 | return null; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/service/impl/ZimuFileInfoServiceImpl.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.service.impl; 2 | 3 | import com.yao.spider.zimuku.dao.ZimuFileInfoMapper; 4 | import com.yao.spider.zimuku.dao.ZimuInfoMapper; 5 | import com.yao.spider.zimuku.domain.ZimuFileInfo; 6 | import com.yao.spider.zimuku.domain.ZimuInfo; 7 | import com.yao.spider.zimuku.service.ZimuFileInfoService; 8 | import org.apache.ibatis.session.SqlSession; 9 | 10 | /** 11 | * Created by shanyao on 2020/3/28 12 | */ 13 | public class ZimuFileInfoServiceImpl implements ZimuFileInfoService { 14 | public void insert(ZimuFileInfo fileInfo, SqlSession session) { 15 | ZimuFileInfoMapper mapper = session.getMapper(ZimuFileInfoMapper.class); 16 | mapper.insert(fileInfo); 17 | session.commit(); 18 | } 19 | 20 | public void isnert(ZimuFileInfo zimuInfo, SqlSession session) { 21 | try { 22 | ZimuFileInfoMapper mapper = session.getMapper(ZimuFileInfoMapper.class); 23 | 
mapper.insert(zimuInfo); 24 | session.commit(); 25 | } catch (Exception e) { 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/service/SqkfqBaomingServiceImpl.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.sqkfq.service; 2 | 3 | import com.yao.spider.sqkfq.dao.SqkfqBaomingMapper; 4 | import com.yao.spider.sqkfq.domain.SqkfqBaoming; 5 | import org.apache.ibatis.session.SqlSession; 6 | 7 | import java.util.List; 8 | 9 | public class SqkfqBaomingServiceImpl implements SqkfqBaomingService{ 10 | 11 | public void insert(SqkfqBaoming sqkfqBaoming, SqlSession session) { 12 | SqkfqBaomingMapper mapper = session.getMapper(SqkfqBaomingMapper.class); 13 | mapper.insert(sqkfqBaoming); 14 | session.commit(); 15 | session.close(); 16 | } 17 | 18 | public List selectAll(SqlSession session) { 19 | SqkfqBaomingMapper mapper = session.getMapper(SqkfqBaomingMapper.class); 20 | return mapper.selectAll(); 21 | } 22 | 23 | @Override 24 | public List selectByZipCodeAndOpt(SqlSession session, Integer code, Long opt) { 25 | SqkfqBaomingMapper mapper = session.getMapper(SqkfqBaomingMapper.class); 26 | return mapper.selectByZipCodeAndOpt(code, opt); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/sjmstest/Demo.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.sjmstest; 2 | 3 | /** 4 | * Created by 单耀 on 2018/1/26. 
/**
 * Compares creating an object reflectively with creating it through
 * {@code new}.
 */
public class Demo {
    /**
     * Creates one Demo through reflection and one with {@code new}, printing
     * each. Any reflection failure is reported with a stack trace.
     */
    public static void main(String[] args) {
        try {
            // Class.forName(String) asks the JVM to find and load the named class
            // and returns the Class object for it.
            Class clazz = Class.forName("com.yao.test.sjmstest.Demo");
            // clazz.newInstance() yields an instance of that class, equivalent to new Demo().
            Demo reflective = (Demo) clazz.newInstance();
            reflective.method(reflective);
            // This instance behaves the same as the reflective one above.
            Demo direct = new Demo();
            direct.method(direct);
        } catch (ClassNotFoundException | InstantiationException | IllegalAccessException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the given instance to standard output.
     *
     * @param demo instance to print
     */
    public void method(Demo demo) {
        System.out.println(demo);
    }
}
9 | */ 10 | public class BaseDaoImpl implements IBaseDao{ 11 | /* private Class entityClass; 12 | 13 | public BaseDaoImpl(Class entityClass) { 14 | ParameterizedType pt = (ParameterizedType) this.getClass() 15 | .getGenericSuperclass(); 16 | entityClass = (Class) pt.getActualTypeArguments()[0]; 17 | }*/ 18 | 19 | public SqlSession getSession() { 20 | return MyBatiesUtils.getSqlSession(); 21 | } 22 | //TODO 不能用 23 | public void insert(T extity) { 24 | try { 25 | getSession().insert(extity.getClass().getName() + ".insert", extity); 26 | getSession().commit(); 27 | } catch (Exception e) { 28 | e.printStackTrace(); 29 | getSession().rollback(); 30 | } finally { 31 | getSession().close(); 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/proxytool/parses/mimiip/MimiipProxyListParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool.parses.mimiip; 2 | 3 | 4 | import com.yao.spider.core.parser.IPageParser; 5 | import com.yao.spider.proxytool.entity.Proxy; 6 | import com.yao.spider.core.constants.ProxyConstants; 7 | import org.jsoup.Jsoup; 8 | import org.jsoup.nodes.Document; 9 | import org.jsoup.select.Elements; 10 | 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | /** 15 | * Created by 单耀 on 2017/12/5. 
16 | */ 17 | public class MimiipProxyListParser implements IPageParser { 18 | public List parser(String hmtl) { 19 | Document document = Jsoup.parse(hmtl); 20 | Elements elements = document.select("table[class=list] tr"); 21 | List proxyList = new ArrayList(elements.size()); 22 | for (int i = 1; i < elements.size(); i++){ 23 | String ip = elements.get(i).select("td:eq(0)").first().text(); 24 | String port = elements.get(i).select("td:eq(1)").first().text(); 25 | proxyList.add(new Proxy(ip, Integer.valueOf(port), ProxyConstants.TIME_INTERVAL, "mmiip")); 26 | } 27 | return proxyList; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/service/impl/ZimuHtmlServiceImpl.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.service.impl; 2 | 3 | import com.yao.spider.zimuku.dao.ZimuHtmlMapper; 4 | import com.yao.spider.zimuku.dao.ZimuInfoMapper; 5 | import com.yao.spider.zimuku.domain.ZimuHtml; 6 | import com.yao.spider.zimuku.service.ZimuHtmlService; 7 | import org.apache.ibatis.session.SqlSession; 8 | 9 | import java.util.List; 10 | 11 | public class ZimuHtmlServiceImpl implements ZimuHtmlService { 12 | 13 | public void insert(ZimuHtml zimuHtml, SqlSession session) { 14 | ZimuHtmlMapper mapper = session.getMapper(ZimuHtmlMapper.class); 15 | mapper.insert(zimuHtml); 16 | session.commit(); 17 | } 18 | 19 | public Long selectMaxId(SqlSession session) { 20 | ZimuHtmlMapper mapper = session.getMapper(ZimuHtmlMapper.class); 21 | ZimuHtml html = mapper.selectMax(); 22 | return html.getId(); 23 | } 24 | 25 | public List selectByRange(Long startId, Long endId, SqlSession session) { 26 | ZimuHtmlMapper mapper = session.getMapper(ZimuHtmlMapper.class); 27 | List htmls = mapper.selectByRange(startId, endId); 28 | return htmls; 29 | } 30 | } 31 | 
-------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/parsers/move/MoveParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.parsers.move; 2 | 3 | import com.yao.spider.douban.entity.move.Move; 4 | import com.yao.spider.core.parser.IPageParser; 5 | import net.sf.json.JSONArray; 6 | import net.sf.json.JsonConfig; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by user on 2018/2/9. 15 | */ 16 | public class MoveParser implements IPageParser { 17 | private static Logger logger = LoggerFactory.getLogger(MoveParser.class); 18 | 19 | public List parser(String html) { 20 | try { 21 | String[] excludes = new String[]{"rating", "rank","cover_url","cover_url"}; 22 | JsonConfig jsonConfig = new JsonConfig(); 23 | jsonConfig.setExcludes(excludes); 24 | JSONArray jsonArray = JSONArray.fromObject(html, jsonConfig); 25 | List list = (List) jsonArray.toCollection(jsonArray, Move.class); 26 | return list; 27 | } catch (Exception e) { 28 | logger.error(e.getMessage(), e); 29 | } 30 | return new ArrayList(); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/entity/Page.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.entity; 2 | 3 | import com.yao.spider.proxytool.entity.Proxy; 4 | 5 | /** 6 | * Created by 单耀 on 2018/1/27. 
7 | * 8 | * 用于封装返回页面信息 9 | */ 10 | public class Page { 11 | /** 12 | * 请求页面的url 13 | */ 14 | private String url; 15 | /** 16 | * 请求页面状态吗 17 | */ 18 | private int statusCode; 19 | /** 20 | * 请求页面详细信息 21 | */ 22 | private String html; 23 | /** 24 | * 请求页面代理 25 | */ 26 | private Proxy proxy; 27 | 28 | public String getUrl() { 29 | return url; 30 | } 31 | 32 | public void setUrl(String url) { 33 | this.url = url; 34 | } 35 | 36 | public int getStatusCode() { 37 | return statusCode; 38 | } 39 | 40 | public void setStatusCode(int statusCode) { 41 | this.statusCode = statusCode; 42 | } 43 | 44 | public String getHtml() { 45 | return html; 46 | } 47 | 48 | public void setHtml(String html) { 49 | this.html = html; 50 | } 51 | 52 | public Proxy getProxy() { 53 | return proxy; 54 | } 55 | 56 | public void setProxy(Proxy proxy) { 57 | this.proxy = proxy; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/proxytool/parses/xicidaili/XicidailiProxyListParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool.parses.xicidaili; 2 | 3 | import com.yao.spider.core.parser.IPageParser; 4 | import com.yao.spider.proxytool.entity.Proxy; 5 | import com.yao.spider.core.constants.ProxyConstants; 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | 15 | /** 16 | * Created by 单耀 on 2017/12/5. 
/**
 * One entry of a movie list: identifier, rating, title and detail URL.
 */
public class MoveList {

    private String id;
    private Double rate;
    private String title;
    private String url;

    /** @return the movie identifier */
    public String getId() {
        return this.id;
    }

    /** @param id the movie identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the rating, possibly {@code null} */
    public Double getRate() {
        return this.rate;
    }

    /** @param rate the rating */
    public void setRate(Double rate) {
        this.rate = rate;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the detail page URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the detail page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders all four fields, each wrapped in single quotes; unset fields
     * print as {@code null}.
     */
    @Override
    public String toString() {
        return String.format(
                "MoveList{id='%s', rate='%s', title='%s', url='%s'}",
                id, rate, title, url);
    }
}
/douban-spider/src/main/java/com/yao/spider/proxytool/parses/ip66/Ip66ProxyListParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool.parses.ip66; 2 | 3 | import com.yao.spider.core.parser.IPageParser; 4 | import com.yao.spider.proxytool.entity.Proxy; 5 | import com.yao.spider.core.constants.ProxyConstants; 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | /** 15 | * Created by 单耀 on 2017/12/17. 16 | */ 17 | public class Ip66ProxyListParser implements IPageParser { 18 | public List parser(String content) { 19 | List proxyList = new ArrayList(); 20 | if (content == null || content.equals("")){ 21 | return proxyList; 22 | } 23 | Document document = Jsoup.parse(content); 24 | Elements elements = document.select("table tr:gt(1)"); 25 | for (Element element : elements){ 26 | String ip = element.select("td:eq(0)").first().text(); 27 | String port = element.select("td:eq(1)").first().text(); 28 | proxyList.add(new Proxy(ip, Integer.valueOf(port), ProxyConstants.TIME_INTERVAL, "ip66")); 29 | } 30 | return proxyList; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /douban-spider/src/main/resources/sql/zhihu.sql: -------------------------------------------------------------------------------- 1 | #创建知乎用户表 2 | SET FOREIGN_KEY_CHECKS=0; 3 | 4 | -- ---------------------------- 5 | -- Table structure for user 6 | -- ---------------------------- 7 | DROP TABLE IF EXISTS `user`; 8 | CREATE TABLE `user` ( 9 | `user_token` varchar(100) NOT NULL, 10 | `location` varchar(255) DEFAULT NULL, 11 | `business` varchar(255) DEFAULT NULL, 12 | `sex` varchar(255) DEFAULT NULL, 13 | `company` varchar(255) DEFAULT NULL, 14 | `education` varchar(255) DEFAULT NULL, 15 | `username` varchar(255) DEFAULT NULL, 
16 | `url` varchar(255) DEFAULT NULL, 17 | `agrees` int(11) DEFAULT NULL, 18 | `thanks` int(11) DEFAULT NULL, 19 | `asks` int(11) DEFAULT NULL, 20 | `answers` int(11) DEFAULT NULL, 21 | `articles` int(11) DEFAULT NULL, 22 | `followees` int(11) DEFAULT NULL, 23 | `followers` int(11) DEFAULT NULL, 24 | `userId` varchar(255) DEFAULT NULL, 25 | PRIMARY KEY (`user_token`) 26 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 27 | 28 | #创建usertoken表 29 | SET FOREIGN_KEY_CHECKS=0; 30 | 31 | -- ---------------------------- 32 | -- Table structure for usertoken 33 | -- ---------------------------- 34 | DROP TABLE IF EXISTS `usertoken`; 35 | CREATE TABLE `usertoken` ( 36 | `user_token` varchar(100) NOT NULL, 37 | PRIMARY KEY (`user_token`) 38 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/serializable/ProxySerializable.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.serializable; 2 | 3 | import com.yao.spider.core.util.MyIOutils; 4 | import com.yao.spider.proxytool.entity.Proxy; 5 | 6 | import java.io.*; 7 | 8 | /** 9 | * Created by shanyao on 2018/3/26. 
/**
 * Unchecked exception for failed HTTP calls, optionally carrying the HTTP
 * status and the raw response text.
 */
public class HttpClientException extends RuntimeException {

    private static final long serialVersionUID = -1413861554881895281L;

    private Integer status;
    private String httpResult;

    /**
     * @param msg description of the failure
     */
    public HttpClientException(String msg) {
        super(msg);
    }

    /**
     * @param msg    description of the failure
     * @param status HTTP status of the response
     * @param result raw response text
     */
    public HttpClientException(String msg, Integer status, String result) {
        super(msg);
        this.status = status;
        // Assign the field directly: calling the overridable setter from a
        // constructor would run subclass code on a half-built object.
        this.httpResult = result;
    }

    /**
     * @param msg   description of the failure
     * @param cause underlying exception
     */
    public HttpClientException(String msg, Throwable cause) {
        super(msg, cause);
    }

    /**
     * @param msg    description of the failure
     * @param cause  underlying exception
     * @param status HTTP status of the response
     * @param result raw response text
     */
    public HttpClientException(String msg, Throwable cause, Integer status, String result) {
        super(msg, cause);
        this.status = status;
        this.httpResult = result;
    }

    /** @return HTTP status, or {@code null} when none was recorded */
    public Integer getStatus() {
        return status;
    }

    /** @param status HTTP status of the response */
    public void setStatus(Integer status) {
        this.status = status;
    }

    /** @return raw response text, or {@code null} when none was recorded */
    public String getHttpResult() {
        return httpResult;
    }

    /** @param httpResult raw response text */
    public void setHttpResult(String httpResult) {
        this.httpResult = httpResult;
    }

    /**
     * @return the status, response text and original message combined into
     *         one string
     */
    @Override
    public String getMessage() {
        return "status:" + status + ", httpResult:" + httpResult + ", msg:" + super.getMessage();
    }
}
13 | */ 14 | public class UserDaoImpl implements IUserDao{ 15 | private static Logger logger = LoggerFactory.getLogger(UserDaoImpl.class); 16 | public int inserSelective(User user) { 17 | SqlSession session = MyBatiesUtils.getSqlSession(); 18 | try { 19 | UserMapper mapper = session.getMapper(UserMapper.class); 20 | if (user != null) { 21 | User u = mapper.selectByPrimaryKey(user.getUserToken()); 22 | if (u == null) { 23 | mapper.insertSelective(user); 24 | session.commit(); 25 | return 1; 26 | } else { 27 | mapper.updateByPrimaryKey(user); 28 | } 29 | } 30 | } catch (Exception e) { 31 | logger.error(e.getMessage(), e); 32 | } finally { 33 | session.close(); 34 | } 35 | return 0; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/task/SpiderDouBanInfo.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.task; 2 | 3 | import com.yao.spider.common.constants.Constants; 4 | import com.yao.spider.douban.DoubanHttpClient; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.net.URLEncoder; 9 | 10 | /** 11 | * Created by 单耀 on 2018/2/5. 
12 | */ 13 | public class SpiderDouBanInfo implements Runnable { 14 | public static volatile boolean isContinue = true; 15 | private DoubanHttpClient doubanHttpClient = DoubanHttpClient.getInstance(); 16 | //电影开始条数 17 | public static volatile int MOVE_START = 0; 18 | private static Logger logger = LoggerFactory.getLogger(SpiderDouBanInfo.class); 19 | public void run() { 20 | while (isContinue) { 21 | //TODO 后期改成可拓展,即可以查询其他信息 22 | try { 23 | String url = String.format(Constants.STRTY_URL_MOVE, Constants.TYPE, URLEncoder.encode(Constants.TAG,"UTF-8"), Constants.LIMIT, MOVE_START); 24 | logger.info("开始条数:" + MOVE_START); 25 | //TODO 改成future 放回结果 26 | doubanHttpClient.getDownLoadMoveListExector().execute(new DouBanInfoListPageTask( url, true, 0, MOVE_START)); 27 | MOVE_START += Constants.LIMIT; 28 | Thread.sleep(10000); 29 | 30 | } catch (Exception e) { 31 | logger.error(e.getMessage(), e); 32 | } 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/proxytool/parses/kuaidaili/KuaidailiProxyListParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool.parses.kuaidaili; 2 | 3 | import com.yao.spider.core.constants.ProxyConstants; 4 | import com.yao.spider.core.parser.IPageParser; 5 | import com.yao.spider.proxytool.entity.Proxy; 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | /** 15 | * Created by user on 2018/3/28. 
16 | * 代理地址:https://www.kuaidaili.com/free/inha/2223/ 17 | */ 18 | public class KuaidailiProxyListParser implements IPageParser { 19 | public List parser(String context) { 20 | Document document = Jsoup.parse(context); 21 | Elements tbody = document.select("tbody"); 22 | Elements tr = tbody.select("tr"); 23 | List proxyList = new ArrayList(tr.size()); 24 | for (Element element : tr) { 25 | String ip = element.select("[data-title=IP]").text(); 26 | String port = element.select("[data-title=PORT]").text(); 27 | if (ip != null && !"".equals(ip) && port != null && !"".equals(port)) { 28 | Proxy proxy = new Proxy(ip,Integer.valueOf(port), ProxyConstants.TIME_INTERVAL, "kauidaili"); 29 | proxyList.add(proxy); 30 | } 31 | } 32 | return proxyList; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/proxytool/parses/ip181/Ip181ProxyListParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool.parses.ip181; 2 | 3 | import com.yao.spider.core.parser.IPageParser; 4 | import com.yao.spider.proxytool.entity.Proxy; 5 | import com.yao.spider.core.constants.ProxyConstants; 6 | import jdk.nashorn.internal.parser.JSONParser; 7 | import net.sf.json.JSONArray; 8 | import net.sf.json.JSONObject; 9 | import net.sf.json.JsonConfig; 10 | import org.jsoup.Jsoup; 11 | import org.jsoup.nodes.Document; 12 | import org.jsoup.nodes.Element; 13 | import org.jsoup.select.Elements; 14 | 15 | import java.util.ArrayList; 16 | import java.util.List; 17 | 18 | 19 | /** 20 | * Created by 单耀 on 2017/12/17. 
21 | */ 22 | public class Ip181ProxyListParser implements IPageParser { 23 | public List parser(String content) { 24 | List proxyList = null; 25 | try { 26 | JSONObject object = JSONObject.fromObject(content); 27 | 28 | String[] encludes = new String[] {"position" }; 29 | JsonConfig jsonConfig = new JsonConfig(); 30 | jsonConfig.setExcludes(encludes); 31 | JSONArray array = object.getJSONArray("RESULT"); 32 | JSONArray jsonArray = JSONArray.fromObject(array.toString(),jsonConfig); 33 | proxyList = (List) jsonArray.toCollection(jsonArray, Proxy.class); 34 | } catch (Exception e) { 35 | } 36 | return proxyList; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/factory/ParserFactory.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.factory; 2 | 3 | import com.yao.spider.core.constants.ParserConstants; 4 | import com.yao.spider.core.parser.IPageParser; 5 | import com.yao.spider.douban.parsers.move.MoveDetailInfoParser; 6 | import com.yao.spider.douban.parsers.move.MoveParser; 7 | 8 | import java.util.Comparator; 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | 12 | /** 13 | * Created by 单耀 on 2018/1/30. 
14 | */ 15 | public class ParserFactory { 16 | private static Map parseMap = new HashMap(); 17 | public static IPageParser getParserClass(Class clzz) { 18 | try { 19 | //利用java放射机制 20 | return (IPageParser) clzz.newInstance(); 21 | } catch (InstantiationException e) { 22 | e.printStackTrace(); 23 | } catch (IllegalAccessException e) { 24 | e.printStackTrace(); 25 | } 26 | return null; 27 | 28 | } 29 | public static IPageParser getParserByProductType(String type) { 30 | try { 31 | //利用java放射机制 32 | return (IPageParser) parseMap.get(type).newInstance(); 33 | } catch (InstantiationException e) { 34 | e.printStackTrace(); 35 | } catch (IllegalAccessException e) { 36 | e.printStackTrace(); 37 | } 38 | return null; 39 | 40 | } 41 | 42 | static { 43 | parseMap.put(ParserConstants.MOVE_LIST,MoveParser.class); 44 | parseMap.put(ParserConstants.MOVE_DETAIL,MoveDetailInfoParser.class); 45 | 46 | } 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/config/ZhiHuConfig.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zhihu.config; 2 | 3 | import java.io.IOException; 4 | import java.util.Properties; 5 | 6 | /** 7 | * Created by shanyao on 2018/3/28. 8 | */ 9 | public class ZhiHuConfig { 10 | public static String startURL; 11 | public static String startUserToken; 12 | 13 | // public final static String USER_FOLLOWEES_URL = "https://www.zhihu.com/api/v4/members/%s/followees?include=data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics&offset=%d&limit=20"; 14 | /* public final static String FOLLOWEES_API = "https://www.zhihu.com/api/v4/members/%s/followees?" 
+ 15 | "include=data[*].educations,employments,answer_count,business,locations,articles_count,follower_count," + 16 | "gender,following_count,question_count,voteup_count,thanked_count,is_followed,is_following," + 17 | "badge[?(type=best_answerer)].topics&offset=%d&limit=20";*/ 18 | public static String FOLLOWEES_API; 19 | public static String authorization; 20 | 21 | static { 22 | Properties p = new Properties(); 23 | try { 24 | p.load(ZhiHuConfig.class.getResourceAsStream("/config/zhihu-config.properties")); 25 | startURL = p.getProperty("startURL"); 26 | FOLLOWEES_API = p.getProperty("FOLLOWEES_API"); 27 | startUserToken = p.getProperty("startUserToken"); 28 | authorization = p.getProperty("authorization"); 29 | } catch (IOException e) { 30 | e.printStackTrace(); 31 | } 32 | } 33 | } 34 | 35 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/mapper/UserTokenMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | user_token 10 | 11 | 12 | 18 | 19 | 20 | delete from usertoken 21 | where user_token = #{userToken,jdbcType=VARCHAR} 22 | 23 | 24 | insert into usertoken (user_token) 25 | values (#{userToken,jdbcType=VARCHAR}) 26 | 27 | 28 | insert into usertoken 29 | 30 | 31 | user_token, 32 | 33 | 34 | 35 | 36 | #{userToken,jdbcType=VARCHAR}, 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/proxytool/task/ProxySerializeTask.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool.task; 2 | 3 | import com.yao.spider.proxytool.ProxyPool; 4 | import com.yao.spider.proxytool.entity.Proxy; 5 | import com.yao.spider.core.util.MyIOutils; 6 | import com.yao.spider.core.constants.ProxyConstants; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 
| import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by user on 2018/3/27. 15 | */ 16 | public class ProxySerializeTask implements Runnable{ 17 | private static final Logger logger = LoggerFactory.getLogger(ProxySerializeTask.class); 18 | public void run() { 19 | while (true) { 20 | try { 21 | //每一分钟进行一次序列化 22 | Thread.sleep(10000); 23 | } catch (InterruptedException e) { 24 | e.printStackTrace(); 25 | } 26 | List proxyArray = null; 27 | ProxyPool.lock.readLock().lock(); 28 | try { 29 | proxyArray = new ArrayList(); 30 | int i = 0; 31 | for (Proxy proxy : ProxyPool.proxySet) { 32 | proxyArray.add(proxy); 33 | } 34 | logger.info("序列化代理:" + proxyArray.size() + "个"); 35 | MyIOutils.serializeObject(proxyArray, ProxyConstants.PROXYSER_FILE_NMAE); 36 | } catch (Exception e) { 37 | logger.error(e.getMessage(), e); 38 | } finally { 39 | ProxyPool.lock.readLock().unlock(); 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/service/impl/ZimuInfoExtendServiceImpl.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.service.impl; 2 | 3 | import com.yao.spider.zimuku.dao.ZimuInfoExtendMapper; 4 | import com.yao.spider.zimuku.domain.ZimuHtml; 5 | import com.yao.spider.zimuku.domain.ZimuInfoExtend; 6 | import com.yao.spider.zimuku.service.ZimuInfoExtendService; 7 | import org.apache.ibatis.session.SqlSession; 8 | 9 | import java.util.List; 10 | 11 | /** 12 | * Created by shanyao on 2020/3/28 13 | */ 14 | public class ZimuInfoExtendServiceImpl implements ZimuInfoExtendService { 15 | public void insert(ZimuInfoExtend zimuInfoExtend, SqlSession session) { 16 | ZimuInfoExtendMapper mapper = session.getMapper(ZimuInfoExtendMapper.class); 17 | mapper.insert(zimuInfoExtend); 18 | session.commit(); 19 | } 20 | 21 | public Long selectMaxId(SqlSession session) { 22 | 
ZimuInfoExtendMapper mapper = session.getMapper(ZimuInfoExtendMapper.class); 23 | ZimuInfoExtend html = mapper.selectMax(); 24 | return html.getId(); 25 | } 26 | 27 | public List selectByRange(Long startId, Long endId, SqlSession session) { 28 | ZimuInfoExtendMapper mapper = session.getMapper(ZimuInfoExtendMapper.class); 29 | List htmls = mapper.selectByRange(startId, endId); 30 | return htmls; 31 | } 32 | 33 | public void update(ZimuInfoExtend builderZimuInfoExtend, SqlSession session) { 34 | ZimuInfoExtendMapper mapper = session.getMapper(ZimuInfoExtendMapper.class); 35 | mapper.update(builderZimuInfoExtend); 36 | session.commit(); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/common/OKHttp2Utils.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.common; 2 | 3 | 4 | import com.squareup.okhttp.*; 5 | 6 | import java.io.IOException; 7 | import java.util.Map; 8 | 9 | public class OKHttp2Utils { 10 | 11 | public static final MediaType MEDIA_TYPE_JSON = MediaType.parse("application/json; charset=utf-8"); 12 | 13 | public static final String HTTP_200_STATUS = "200"; 14 | 15 | public static String sendPost(String url, String param) throws IOException { 16 | OkHttpClient client = new OkHttpClient(); 17 | RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, param); 18 | Request request = new Request.Builder().url(url).post(body).build(); 19 | Response response = client.newCall(request).execute(); 20 | 21 | // if (response.isSuccessful()) { 22 | // String code = String.valueOf(response.code()); 23 | // return StringUtils.isEmpty(code) ? 
HTTP_200_STATUS : code; 24 | // } 25 | return response.body().string(); 26 | } 27 | public static String sendPostWithHeaders(String url, Map headers) throws IOException { 28 | 29 | OkHttpClient client = new OkHttpClient(); 30 | RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, ""); 31 | Request request = new Request.Builder().url(url).headers(Headers.of(headers)).post(body).build(); 32 | Response response = client.newCall(request).execute(); 33 | 34 | // if (response.isSuccessful()) { 35 | // String code = String.valueOf(response.code()); 36 | // return StringUtils.isEmpty(code) ? HTTP_200_STATUS : code; 37 | // } 38 | return response.body().string(); 39 | } 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/domain/ZimuHtml.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.domain; 2 | 3 | import java.util.Date; 4 | 5 | /** 6 | * t_zimu_html 7 | * @author 8 | */ 9 | public class ZimuHtml { 10 | private Long id; 11 | 12 | /** 13 | * 类型1:列表,2:详情 14 | */ 15 | private Integer htmlType; 16 | 17 | /** 18 | * 是否删除 19 | */ 20 | private Integer isDeleted; 21 | 22 | /** 23 | * 创建时间 24 | */ 25 | private Date createTime; 26 | 27 | private Date lastUpdateTime; 28 | 29 | private String htmlValue; 30 | 31 | public Long getId() { 32 | return id; 33 | } 34 | 35 | public void setId(Long id) { 36 | this.id = id; 37 | } 38 | 39 | public Integer getHtmlType() { 40 | return htmlType; 41 | } 42 | 43 | public void setHtmlType(Integer htmlType) { 44 | this.htmlType = htmlType; 45 | } 46 | 47 | public Integer getIsDeleted() { 48 | return isDeleted; 49 | } 50 | 51 | public void setIsDeleted(Integer isDeleted) { 52 | this.isDeleted = isDeleted; 53 | } 54 | 55 | public Date getCreateTime() { 56 | return createTime; 57 | } 58 | 59 | public void setCreateTime(Date createTime) { 60 | this.createTime = createTime; 61 | } 
62 | 63 | public Date getLastUpdateTime() { 64 | return lastUpdateTime; 65 | } 66 | 67 | public void setLastUpdateTime(Date lastUpdateTime) { 68 | this.lastUpdateTime = lastUpdateTime; 69 | } 70 | 71 | public String getHtmlValue() { 72 | return htmlValue; 73 | } 74 | 75 | public void setHtmlValue(String htmlValue) { 76 | this.htmlValue = htmlValue; 77 | } 78 | } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/parses/SqkfaUserParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.sqkfq.parses; 2 | 3 | import com.yao.spider.sqkfq.domain.SqkfqUser; 4 | import com.yao.spider.zimuku.domain.ZimuInfo; 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.select.Elements; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | /** 14 | * @author 单耀 15 | * @version 1.0 16 | * @description 17 | * @date 2021/2/3 17:00 18 | */ 19 | public class SqkfaUserParser { 20 | public static SqkfqUser parser(String html) { 21 | Document document = Jsoup.parse(html); 22 | Elements oddList = document.select(".table-c"); 23 | Elements tr = document.select("tr"); 24 | Elements td = tr.select("td"); 25 | SqkfqUser user = new SqkfqUser(); 26 | user.setUserName(td.get(1).text()); 27 | user.setUserSex(td.get(3).text()); 28 | user.setUserBirthday(td.get(5).text()); 29 | user.setUserHeaderUrl(td.get(6).select("img").attr("src")); 30 | user.setUserIdCard(td.get(10).text()); 31 | user.setUserJiguan(td.get(14).text()); 32 | user.setUserMerage(td.get(16).text()); 33 | user.setUserZzmm(td.get(18).text()); 34 | user.setUserSg(td.get(24).text()); 35 | user.setUserWeight(td.get(26).text()); 36 | user.setUserAddress(td.get(28).text()); 37 | user.setUserPhone(td.get(30).text()); 38 | user.setUserJinji(td.get(34).text()); 39 | user.setUserJinjiPhone(td.get(36).text()); 40 | 
user.setUserXuli(td.get(40).text()); 41 | user.setUserZhuanye(td.get(42).text()); 42 | return user; 43 | 44 | } 45 | 46 | 47 | } 48 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/service/impl/ZimuInfoServiceImpl.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.service.impl; 2 | 3 | import com.yao.spider.core.util.MyBatiesUtils; 4 | import com.yao.spider.zhihu.dao.Impl.UserDaoImpl; 5 | import com.yao.spider.zhihu.entity.User; 6 | import com.yao.spider.zhihu.mapper.UserMapper; 7 | import com.yao.spider.zimuku.dao.ZimuInfoMapper; 8 | import com.yao.spider.zimuku.domain.ZimuInfo; 9 | import com.yao.spider.zimuku.service.ZimuInfoService; 10 | import org.apache.ibatis.session.SqlSession; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | import org.springframework.beans.factory.annotation.Autowired; 14 | import org.springframework.stereotype.Service; 15 | import org.springframework.util.CollectionUtils; 16 | 17 | import javax.xml.ws.Action; 18 | import java.util.List; 19 | 20 | public class ZimuInfoServiceImpl implements ZimuInfoService { 21 | private static Logger logger = LoggerFactory.getLogger(ZimuInfoService.class); 22 | 23 | public void batchInsert(List zimuInfoList, SqlSession session) { 24 | 25 | try { 26 | ZimuInfoMapper mapper = session.getMapper(ZimuInfoMapper.class); 27 | mapper.batchInsert(zimuInfoList); 28 | session.commit(); 29 | } catch (Exception e) { 30 | logger.error(e.getMessage(), e); 31 | } finally { 32 | 33 | } 34 | } 35 | 36 | public void isnert(ZimuInfo zimuInfo, SqlSession session) { 37 | try { 38 | ZimuInfoMapper mapper = session.getMapper(ZimuInfoMapper.class); 39 | mapper.insert(zimuInfo); 40 | session.commit(); 41 | } catch (Exception e) { 42 | logger.error(e.getMessage(), e); 43 | } finally { 44 | 45 | } 46 | } 47 | } 48 | 
-------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/parserTest/ParserTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.parserTest; 2 | 3 | import com.yao.spider.core.entity.Page; 4 | import com.yao.spider.core.factory.ParserFactory; 5 | import com.yao.spider.core.http.client.BaseHttpClient; 6 | import com.yao.spider.core.parser.IPageParser; 7 | import com.yao.spider.douban.DoubanHttpClient; 8 | import com.yao.spider.douban.entity.move.Move; 9 | import com.yao.spider.douban.parsers.move.MoveParser; 10 | import com.yao.spider.douban.parsers.move.MoveParserDeprecated; 11 | import com.yao.douban.douban.parsers.move.TestConsant; 12 | 13 | import java.io.IOException; 14 | import java.util.List; 15 | 16 | /** 17 | * Created by 单耀 on 2018/2/2. 18 | */ 19 | public class ParserTest { 20 | public static void main(String[] args) { 21 | List moves = moveListTest(); 22 | String url = moves.get(0).getUrl(); 23 | url += "?tag=%E7%83%AD%E9%97%A8&from=gaia"; 24 | try { 25 | IPageParser parser = ParserFactory.getParserClass(MoveParserDeprecated.class); 26 | Page page = BaseHttpClient.getInstance().getPage(url); 27 | parser.parser(page.getHtml()); 28 | } catch (IOException e) { 29 | e.printStackTrace(); 30 | } 31 | 32 | } 33 | //获取电影列表 34 | private static List moveListTest() { 35 | IPageParser parser = ParserFactory.getParserClass(MoveParser.class); 36 | // List list = parser.parser(""); 37 | try { 38 | // Page page = DoubanHttpClient.getInstance().getPage(Constants.STRTY_URL_MOVE); 39 | List moves = parser.parser(TestConsant.movelist); 40 | return moves; 41 | 42 | } catch (Exception e) { 43 | e.printStackTrace(); 44 | } 45 | return null; 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /douban-spider/src/main/resources/c3p0-config.xml: 
/**
 * Executor wrapper that remembers which tasks were cut short by a shutdown, so they
 * can be run again the next time the project starts.
 *
 * <p>All lifecycle calls are forwarded to the wrapped {@link ExecutorService}. A task
 * is recorded as cancelled when it finishes while the pool is shut down and its
 * worker thread is still flagged as interrupted.
 *
 * <p>Created by shanyao on 2018/3/19.
 */
public class TrackingExecutor extends AbstractExecutorService {
    private final ExecutorService exec;
    private final Set<Runnable> tasksCancelledShudown =
            Collections.synchronizedSet(new HashSet<Runnable>());

    /**
     * @param exec the executor that actually runs the tasks
     */
    public TrackingExecutor(ExecutorService exec) {
        this.exec = exec;
    }

    @Override
    public void shutdown() {
        exec.shutdown();
    }

    /**
     * Stops the wrapped executor.
     *
     * @return whatever the wrapped executor reports as never started; these are the
     *         internal wrappers created by {@link #execute(Runnable)}, not the
     *         runnables originally submitted
     */
    @Override
    public List<Runnable> shutdownNow() {
        return exec.shutdownNow();
    }

    @Override
    public boolean isShutdown() {
        return exec.isShutdown();
    }

    @Override
    public boolean isTerminated() {
        return exec.isTerminated();
    }

    @Override
    public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException {
        return exec.awaitTermination(timeout, unit);
    }

    /**
     * Runs the task on the wrapped executor and records it if it ended while the
     * pool was shut down and the worker thread was interrupted.
     */
    @Override
    public void execute(final Runnable runnable) {
        exec.execute(new Runnable() {
            public void run() {
                try {
                    runnable.run();
                } finally {
                    // Only meaningful if the task leaves the interrupt flag set on exit.
                    if (isShutdown() && Thread.currentThread().isInterrupted()) {
                        tasksCancelledShudown.add(runnable);
                    }
                }
            }
        });
    }

    /**
     * @return a snapshot of the tasks recorded as cancelled
     * @throws IllegalStateException if the wrapped executor has not terminated yet
     */
    public List<Runnable> getCancelledTasks() {
        if (!exec.isTerminated()) {
            throw new IllegalStateException("");
        }
        return new ArrayList<Runnable>(tasksCancelledShudown);
    }
}
51 | } 52 | 53 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/common/task/GetProxyTask.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.common.task; 2 | 3 | import com.yao.spider.core.constants.ProxyConstants; 4 | import com.yao.spider.core.util.MyIOutils; 5 | import com.yao.spider.proxytool.ProxyPool; 6 | import com.yao.spider.proxytool.entity.Proxy; 7 | import com.yao.spider.zhihu.ZhiHuHttpClient; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import java.util.List; 12 | import java.util.concurrent.DelayQueue; 13 | 14 | /** 15 | * Created by shanyao on 2018/4/1. 16 | */ 17 | public class GetProxyTask implements Runnable { 18 | private static Logger logger = LoggerFactory.getLogger(GetProxyTask.class); 19 | public void run() { 20 | int wokerQueueCount = 0; 21 | long finishedTaskCount = 0; 22 | while (true) { 23 | wokerQueueCount = ZhiHuHttpClient.getInstance().getUserListDownTask().getQueue().size(); 24 | finishedTaskCount = ZhiHuHttpClient.getInstance().getUserListDownTask().getCompletedTaskCount(); 25 | logger.info("进入代理管理线程"); 26 | logger.info("当前队列中任务数量:" + wokerQueueCount + "--- 已经完成task数量:" + finishedTaskCount); 27 | try { 28 | Thread.currentThread().sleep(30000); 29 | 30 | } catch (Exception e) { 31 | e.printStackTrace(); 32 | } 33 | if (ProxyPool.proxyQueue.size() < 50) { 34 | logger.info("当前代理不够重新反序列化代理"); 35 | try { 36 | List proxyList = (List) MyIOutils.deserializeObject(ProxyConstants.PROXYSER_FILE_NMAE); 37 | if (proxyList != null) { 38 | ProxyPool.proxyQueue.addAll(new DelayQueue(proxyList)); 39 | logger.info("反序列化后代理数量:" + ProxyPool.proxyQueue.size()); 40 | } 41 | } catch (Exception e) { 42 | } 43 | } 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/util/MyBatiesUtils.java: 
-------------------------------------------------------------------------------- 1 | package com.yao.spider.core.util; 2 | 3 | import org.apache.ibatis.session.SqlSession; 4 | import org.apache.ibatis.session.SqlSessionFactory; 5 | import org.apache.ibatis.session.SqlSessionFactoryBuilder; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.io.InputStream; 10 | 11 | /** 12 | * Created by shanyao on 2018/3/10. 13 | */ 14 | public class MyBatiesUtils { 15 | private static Logger logger = LoggerFactory.getLogger(MyBatiesUtils.class); 16 | 17 | public static SqlSessionFactory sqlSessionFactory; 18 | 19 | static { 20 | InputStream inputStream = MyBatiesUtils.class.getResourceAsStream("/mybatis-config.xml"); 21 | sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream); 22 | } 23 | 24 | public static SqlSessionFactory getSqlSessionFactory() { 25 | InputStream inputStream = MyBatiesUtils.class.getResourceAsStream("/mybatis-config.xml"); 26 | SqlSessionFactory sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream); 27 | return sqlSessionFactory; 28 | } 29 | 30 | public static SqlSession getSqlSession(){ 31 | SqlSession session = null; 32 | try { 33 | //是个坑,不能每次都创建一个SessionFactory,会导致错误Too many connection的error 34 | //具体可以看http://blog.csdn.net/u013412772/article/details/73648537 35 | // session = getSqlSessionFactory().openSession(); 36 | session = sqlSessionFactory.openSession(); 37 | } catch (Exception e) { 38 | logger.error(e.getMessage(),e); 39 | } 40 | 41 | return session; 42 | } 43 | 44 | /** 45 | * 46 | * @param isAutocommit 在sqlSession对象执行完后是否自动提交 47 | * true是,false 否 48 | * @return 49 | */ 50 | public static SqlSession getSqlSession(boolean isAutocommit) { 51 | return sqlSessionFactory.openSession(isAutocommit); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/dao/ZimuFileInfoMapper.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | id, zimu_id, file_name, file_type, extend_id, download_url, is_deleted, create_time, 17 | last_update_time 18 | 19 | 25 | 26 | insert into t_zimu_file_info (id, zimu_id, file_name, 27 | file_type, extend_id, download_url, 28 | is_deleted, create_time, last_update_time 29 | ) 30 | values (#{id,jdbcType=BIGINT}, #{zimuId,jdbcType=BIGINT}, #{fileName,jdbcType=VARCHAR}, 31 | #{fileType,jdbcType=VARCHAR}, #{extendId,jdbcType=BIGINT}, #{downloadUrl,jdbcType=VARCHAR}, 32 | 0,now(),now() 33 | ) 34 | 35 | -------------------------------------------------------------------------------- /douban-spider/src/main/resources/mybatis-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/parsers/move/MoveDetailInfoParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.parsers.move; 2 | 3 | import com.yao.spider.douban.entity.move.Move; 4 | import com.yao.spider.core.parser.IPageParser; 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.select.Elements; 9 | 10 | import java.util.List; 11 | 12 | /** 13 | * Created by shanyao on 2018/3/14. 
14 | */ 15 | public class MoveDetailInfoParser implements IPageParser { 16 | public List parser(String html) { 17 | Move move = new Move(); 18 | Document document = Jsoup.parse(html); 19 | Elements elements = document.select("div#info"); 20 | //导演 21 | Elements directs = elements.select("[rel=v:directedBy]"); 22 | String _direct = move.getDirector(); 23 | for (Element element : directs) { 24 | if (_direct != null) { 25 | _direct += "/" + element.text(); 26 | } else { 27 | _direct = element.text(); 28 | } 29 | } 30 | move.setDirector(_direct); 31 | //编剧 32 | Elements eAttrs = elements.select("span.attrs"); 33 | Elements screenWriters = eAttrs.get(1).select("a"); 34 | String _screenW = move.getScreenwriter(); 35 | for (Element element : screenWriters) { 36 | if (_screenW != null) { 37 | _screenW += "/" + element.text(); 38 | } else { 39 | _screenW = element.text(); 40 | } 41 | } 42 | 43 | Elements elementsPL = elements.select("span.pl"); 44 | // TODO 语言 45 | // TODO 又名 46 | 47 | //片长 // TODO: 2018/2/4 48 | Elements runtime = elements.select("[property=v:runtime]"); 49 | move.setRuntime(runtime.first().text()); 50 | //IMD连接 51 | Elements imdb = elements.select("[rel=nofollow]"); 52 | move.setImdb(imdb.first().text()); 53 | 54 | return null; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/doubantest/GetDouBanPageTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.doubantest; 2 | 3 | import com.yao.spider.core.entity.Page; 4 | import com.yao.spider.core.factory.ParserFactory; 5 | import com.yao.spider.core.http.client.BaseHttpClient; 6 | import com.yao.spider.core.http.util.HttpClientUtil; 7 | import com.yao.spider.core.parser.IPageParser; 8 | import com.yao.spider.douban.DoubanHttpClient; 9 | import com.yao.spider.douban.entity.move.Move; 10 | import com.yao.spider.douban.parsers.move.MoveListParser; 11 | import 
com.yao.spider.proxytool.ProxyPool; 12 | import com.yao.spider.proxytool.entity.Proxy; 13 | import org.apache.http.HttpHost; 14 | import org.apache.http.client.methods.HttpGet; 15 | 16 | import java.io.IOException; 17 | import java.util.List; 18 | 19 | /** 20 | * Created by 单耀 on 2018/1/28. 21 | */ 22 | public class GetDouBanPageTest { 23 | public static void main(String[] args) { 24 | try { 25 | // String url = "https://movie.douban.com/j/chart/top_list_count?type=11&interval_id=100:90"; 26 | String url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=90%3A80&action=&start=0&limit=1"; 27 | Page page = BaseHttpClient.getInstance().getPage(url); 28 | IPageParser parser = ParserFactory.getParserClass(MoveListParser.class); 29 | List moveList = parser.parser(page.getHtml()); 30 | Proxy proxy1 = new Proxy("106.58.123.193",80,1000,"1"); 31 | ProxyPool.proxyQueue.add(proxy1); 32 | HttpHost proxy = new HttpHost(proxy1.getIp(), proxy1.getPort()); 33 | HttpGet request = new HttpGet(url); 34 | request.setConfig(HttpClientUtil.getRequestConfigBuilder().setProxy(proxy).build()); 35 | // Page page = DoubanHttpClient.getInstance().getPage(request); 36 | // HttpGet request = new HttpGet(Constants.STRTY_URL_MOVE); 37 | HttpClientUtil.getResponse(request); 38 | System.out.println(page.getHtml()); 39 | } catch (IOException e) { 40 | e.printStackTrace(); 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/test/ThreadRetryTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.test; 2 | 3 | import com.yao.spider.core.http.client.BaseHttpClient; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | import java.util.concurrent.LinkedBlockingQueue; 8 | import java.util.concurrent.ThreadFactory; 9 | import java.util.concurrent.ThreadPoolExecutor; 10 | import java.util.concurrent.TimeUnit; 11 | 12 | 
/** 13 | * Created by 单耀 on 2018/1/30. 14 | */ 15 | public class ThreadRetryTest extends BaseHttpClient { 16 | private static Logger logger = LoggerFactory.getLogger(ThreadRetryTest.class); 17 | private static ThreadRetryTest instance; 18 | 19 | private ThreadPoolExecutor downLoadMoveListExector; 20 | 21 | 22 | public static volatile int MOVE_START = 0; 23 | 24 | public ThreadRetryTest() { 25 | init(); 26 | } 27 | private void init () { 28 | 29 | downLoadMoveListExector = new ThreadPoolExecutor(100, 100, 0L, 30 | TimeUnit.SECONDS, 31 | new LinkedBlockingQueue(), 32 | new ThreadFactory() { 33 | public Thread newThread(Runnable r) { 34 | return new Thread(r, "downLoadMoveListExector " + r.hashCode()); 35 | } 36 | }); 37 | 38 | 39 | } 40 | 41 | public static ThreadRetryTest getInstance() { 42 | if (instance == null) { 43 | synchronized (ThreadRetryTest.class) { 44 | if (instance == null) { 45 | instance = new ThreadRetryTest(); 46 | } 47 | } 48 | } 49 | return instance; 50 | } 51 | 52 | public ThreadPoolExecutor getDownLoadMoveListExector() { 53 | return downLoadMoveListExector; 54 | } 55 | 56 | 57 | public void setDownLoadMoveListExector(ThreadPoolExecutor downLoadMoveListExector) { 58 | this.downLoadMoveListExector = downLoadMoveListExector; 59 | } 60 | 61 | 62 | public void startDouBan() { 63 | new Thread(new ChuShiHuaTest(0)).start(); 64 | } 65 | 66 | 67 | } 68 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/dao/ZimuHtmlMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | id, html_type, is_deleted, create_time, last_update_time, html_value 14 | 15 | 16 | 22 | 30 | 31 | 38 | 39 | 40 | insert into t_zimu_html (id, html_type, is_deleted, 41 | create_time, last_update_time, html_value 42 | ) 43 | values (#{id,jdbcType=BIGINT}, #{htmlType,jdbcType=TINYINT}, 0, 44 | now(),now(), 
#{htmlValue,jdbcType=LONGVARCHAR} 45 | ) 46 | 47 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/spider/proxytool/parses/ip181/Ip181ProxyListParserTest.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool.parses.ip181; 2 | 3 | import com.yao.spider.core.entity.Page; 4 | import com.yao.spider.core.factory.ParserFactory; 5 | import com.yao.spider.core.parser.IPageParser; 6 | import com.yao.spider.proxytool.ProxyHttpClient; 7 | import com.yao.spider.proxytool.entity.Proxy; 8 | import com.yao.spider.proxytool.parses.ip66.Ip66ProxyListParser; 9 | import com.yao.spider.proxytool.parses.mimiip.MimiipProxyListParser; 10 | import com.yao.spider.proxytool.parses.xicidaili.XicidailiProxyListParser; 11 | import org.junit.Test; 12 | 13 | import java.io.IOException; 14 | import java.util.List; 15 | 16 | import static org.junit.Assert.*; 17 | 18 | /** 19 | * Created by shanyao on 2018/4/2. 
20 | */ 21 | public class Ip181ProxyListParserTest { 22 | @Test 23 | public void parser() throws Exception { 24 | Page page = ProxyHttpClient.getInstance().getPage("http://www.ip181.com/daili/1.html"); 25 | IPageParser pageParser = ParserFactory.getParserClass(Ip181ProxyListParser.class); 26 | pageParser.parser(page.getHtml()); 27 | } 28 | 29 | @Test 30 | public void test1() throws IOException { 31 | Page page = ProxyHttpClient.getInstance().getPage("http://www.xicidaili.com/nt/1.html"); 32 | IPageParser pageParser = ParserFactory.getParserClass(XicidailiProxyListParser.class); 33 | List list = pageParser.parser(page.getHtml()); 34 | } 35 | @Test 36 | public void test2() throws IOException { 37 | Page page = ProxyHttpClient.getInstance().getPage("http://www.mimiip.com/gngao/1"); 38 | IPageParser pageParser = ParserFactory.getParserClass(MimiipProxyListParser.class); 39 | List list = pageParser.parser(page.getHtml()); 40 | } 41 | @Test 42 | public void test3() throws IOException { 43 | Page page = ProxyHttpClient.getInstance().getPage("http://www.66ip.cn/1.html"); 44 | IPageParser pageParser = ParserFactory.getParserClass(Ip66ProxyListParser.class); 45 | List list = pageParser.parser(page.getHtml()); 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/ZhiHuHttpClient.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zhihu; 2 | 3 | import com.yao.spider.core.http.client.BaseHttpClient; 4 | import com.yao.spider.zhihu.config.ZhiHuConfig; 5 | import com.yao.spider.zhihu.task.ZhiHuUserListTask; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.util.concurrent.LinkedBlockingQueue; 10 | import java.util.concurrent.ThreadFactory; 11 | import java.util.concurrent.ThreadPoolExecutor; 12 | import java.util.concurrent.TimeUnit; 13 | 14 | /** 15 | * Created by user on 2018/3/28. 
16 | */ 17 | public class ZhiHuHttpClient extends BaseHttpClient { 18 | private static final Logger logger = LoggerFactory.getLogger(ZhiHuHttpClient.class); 19 | private static ZhiHuHttpClient zhiHuHttpClient; 20 | 21 | public ThreadPoolExecutor userListDownTask; 22 | 23 | public static ZhiHuHttpClient getInstance() { 24 | if (zhiHuHttpClient == null) { 25 | synchronized (ZhiHuHttpClient.class) { 26 | if (zhiHuHttpClient == null) { 27 | zhiHuHttpClient = new ZhiHuHttpClient(); 28 | } 29 | } 30 | } 31 | return zhiHuHttpClient; 32 | } 33 | 34 | public ZhiHuHttpClient() { 35 | init(); 36 | } 37 | 38 | private void init() { 39 | userListDownTask = new ThreadPoolExecutor(100, 100, 0L, 40 | TimeUnit.SECONDS, 41 | new LinkedBlockingQueue(2000), 42 | new ThreadFactory() { 43 | public Thread newThread(Runnable r) { 44 | return new Thread(r,"userListDownTask" + r.hashCode()); 45 | } 46 | }); 47 | } 48 | 49 | public void startZhiHu() { 50 | String startToken = ZhiHuConfig.startUserToken; 51 | logger.info("当前:"+ ZhiHuConfig.authorization); 52 | String startUrl = String.format(ZhiHuConfig.FOLLOWEES_API, startToken, 0); 53 | userListDownTask.execute(new ZhiHuUserListTask(startUrl, true, startToken)); 54 | } 55 | 56 | 57 | public ThreadPoolExecutor getUserListDownTask() { 58 | return userListDownTask; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/util/MyIOutils.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.util; 2 | 3 | import com.yao.spider.core.constants.ProxyConstants; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | import java.io.*; 8 | 9 | /** 10 | * Created by shanyao on 2018/3/26. 
11 | */ 12 | public class MyIOutils { 13 | private static final Logger logger = LoggerFactory.getLogger(MyIOutils.class); 14 | public static void close(ObjectOutputStream outputStream) { 15 | try { 16 | outputStream.close(); 17 | } catch (IOException e) { 18 | e.printStackTrace(); 19 | } 20 | } 21 | 22 | /** 23 | * 序列化对象 24 | */ 25 | public static void serializeObject(Object object, String fileName) { 26 | ObjectOutputStream oos = null; 27 | String path = ProxyConstants.RESOURCES__FILE_PATH + "/" + fileName; 28 | try { 29 | oos = new ObjectOutputStream(new FileOutputStream(path)); 30 | oos.writeObject(object); 31 | oos.flush(); 32 | } catch (IOException e) { 33 | logger.error(e.getMessage(), e); 34 | } finally { 35 | MyIOutils.close(oos); 36 | } 37 | } 38 | 39 | /** 40 | * 反序列化对象 41 | * @param fileName 文件名,改名称必须在resources/file下面 42 | */ 43 | public static Object deserializeObject(String fileName) { 44 | String path = ProxyConstants.RESOURCES__FILE_PATH + "/" + fileName; 45 | return deserializeObjectByPath(path); 46 | } 47 | /** 48 | * 反序列化对象 49 | * @param filePath 文件路劲 50 | */ 51 | public static Object deserializeObjectByPath(String filePath) { 52 | File file = new File(filePath); 53 | if (!file.exists()) { 54 | file.mkdir(); 55 | } 56 | ObjectInputStream ois =null; 57 | Object object = null; 58 | try { 59 | ois = new ObjectInputStream(new FileInputStream(file)); 60 | object = ois.readObject(); 61 | ois.close(); 62 | } catch (IOException e) { 63 | logger.error(e.getMessage(), e); 64 | } catch (ClassNotFoundException e) { 65 | logger.error(e.getMessage(), e); 66 | } finally { 67 | } 68 | return object; 69 | } 70 | 71 | 72 | 73 | } 74 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/task/DouBanDetailInfoDownLoadTask.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.task; 2 | 3 | import 
com.yao.spider.core.factory.ParserFactory; 4 | import com.yao.spider.core.task.*; 5 | import com.yao.spider.core.task.AbstractTask; 6 | import com.yao.spider.douban.DoubanHttpClient; 7 | import com.yao.spider.douban.dao.IMoveDao; 8 | import com.yao.spider.douban.dao.Impl.MoveDaoImpl; 9 | import com.yao.spider.douban.entity.move.Move; 10 | import com.yao.spider.core.parser.IPageParser; 11 | import com.yao.spider.douban.parsers.move.MoveDetailInfoParser; 12 | import com.yao.spider.proxytool.ProxyPool; 13 | import com.yao.spider.core.entity.Page; 14 | import com.yao.spider.proxytool.entity.Proxy; 15 | import com.yao.spider.core.http.util.HttpClientUtil; 16 | import com.yao.spider.core.util.ProxyUtil; 17 | import org.apache.http.HttpHost; 18 | import org.apache.http.client.methods.HttpGet; 19 | import org.slf4j.Logger; 20 | import org.slf4j.LoggerFactory; 21 | 22 | import java.util.List; 23 | 24 | /** 25 | * Created by 单耀 on 2018/1/28. 26 | * 电影详细信息下载任务 27 | */ 28 | public class DouBanDetailInfoDownLoadTask extends AbstractTask implements Runnable { 29 | private static Logger logger = LoggerFactory.getLogger(DouBanInfoListPageTask.class); 30 | private Move move; 31 | private IMoveDao dao = new MoveDaoImpl(); 32 | 33 | public DouBanDetailInfoDownLoadTask(Move move, boolean isUseProxy) { 34 | this.move = move; 35 | this.isUseProxy = isUseProxy; 36 | if (move != null) super.url = move.getUrl(); 37 | } 38 | 39 | public void run() { 40 | getPage(url); 41 | } 42 | 43 | public void retry() { 44 | 45 | } 46 | 47 | public void handle(Page page) { 48 | IPageParser parser = ParserFactory.getParserClass(MoveDetailInfoParser.class); 49 | List list = parser.parser(page.getHtml()); 50 | if (list != null && list.size() > 0) { 51 | Move _move = list.get(0); 52 | _move.setId(move.getId()); 53 | dao.update(_move); 54 | // _move.setName(move.getTitle()); 55 | // _move.setId(move.getId()); 56 | // _move.setScore(move.getRate()); 57 | //保存到数据库 58 | // logger.info(_move.toString()); 59 | } 
60 | } 61 | } 62 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/dao/Impl/UserTokenDaoImpl.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zhihu.dao.Impl; 2 | 3 | import com.yao.spider.core.util.MyBatiesUtils; 4 | import com.yao.spider.zhihu.dao.IUserTokenDao; 5 | import com.yao.spider.zhihu.entity.UserToken; 6 | import com.yao.spider.zhihu.mapper.UserMapper; 7 | import com.yao.spider.zhihu.mapper.UserTokenMapper; 8 | import org.apache.ibatis.session.SqlSession; 9 | 10 | /** 11 | * Created by user on 2018/4/2. 12 | */ 13 | public class UserTokenDaoImpl implements IUserTokenDao { 14 | public void insertSelective(UserToken userToken) { 15 | SqlSession session = MyBatiesUtils.getSqlSession(); 16 | UserTokenMapper mapper = session.getMapper(UserTokenMapper.class); 17 | try { 18 | UserToken token = mapper.selectByPrimaryKey(userToken.getUserToken()); 19 | if (token == null) { 20 | mapper.insertSelective(userToken); 21 | session.commit(); 22 | } 23 | } catch (Exception e) { 24 | e.printStackTrace(); 25 | } finally { 26 | session.close(); 27 | } 28 | } 29 | 30 | public UserToken selectByPrimaryKey(String userToken) { 31 | SqlSession session = MyBatiesUtils.getSqlSession(); 32 | UserToken token = new UserToken(); 33 | UserTokenMapper mapper = session.getMapper(UserTokenMapper.class); 34 | try { 35 | token = mapper.selectByPrimaryKey(userToken); 36 | } catch (Exception e) { 37 | 38 | } finally { 39 | session.close(); 40 | } 41 | return token; 42 | } 43 | 44 | public boolean judgeAndInsert(UserToken userToken) { 45 | SqlSession session = MyBatiesUtils.getSqlSession(); 46 | UserToken token = new UserToken(); 47 | try { 48 | UserTokenMapper mapper = session.getMapper(UserTokenMapper.class); 49 | if (mapper.selectByPrimaryKey(userToken.getUserToken()) == null) { 50 | mapper.insertSelective(userToken); 51 | session.commit(); 52 | 
/**
 * Plain data holder for an extension value attached to a zimu info record.
 *
 * <p>Every property is a nullable reference exposed through a conventional
 * getter/setter pair; {@link #toString()} prints the id, the owning record id, the
 * reference type, the value and the value type.
 */
public class ZimuInfoExtend {
    private Long id;

    private Long zimuInfoId;

    private Integer refType;

    private String extendValue;

    private Integer extendValueType;

    private Integer isDeleted;

    private Date createTime;

    private Date lastUpdateTime;

    public Long getId() { return this.id; }

    public void setId(Long id) { this.id = id; }

    public Long getZimuInfoId() { return this.zimuInfoId; }

    public void setZimuInfoId(Long zimuInfoId) { this.zimuInfoId = zimuInfoId; }

    public Integer getRefType() { return this.refType; }

    public void setRefType(Integer refType) { this.refType = refType; }

    public String getExtendValue() { return this.extendValue; }

    public void setExtendValue(String extendValue) { this.extendValue = extendValue; }

    public Integer getExtendValueType() { return this.extendValueType; }

    public void setExtendValueType(Integer extendValueType) { this.extendValueType = extendValueType; }

    public Integer getIsDeleted() { return this.isDeleted; }

    public void setIsDeleted(Integer isDeleted) { this.isDeleted = isDeleted; }

    public Date getCreateTime() { return this.createTime; }

    public void setCreateTime(Date createTime) { this.createTime = createTime; }

    public Date getLastUpdateTime() { return this.lastUpdateTime; }

    public void setLastUpdateTime(Date lastUpdateTime) { this.lastUpdateTime = lastUpdateTime; }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ZimuInfoExtend{");
        text.append("id=").append(id);
        text.append(", zimuInfoId=").append(zimuInfoId);
        text.append(", refType=").append(refType);
        text.append(", extendValue='").append(extendValue).append('\'');
        text.append(", extendValueType=").append(extendValueType);
        text.append('}');
        return text.toString();
    }
}
16 | */ 17 | public class MoveDaoImpl extends BaseDaoImpl implements IMoveDao { 18 | 19 | private static Logger logger = LoggerFactory.getLogger(MoveDaoImpl.class); 20 | 21 | public void insert(Move move) { 22 | super.insert(move); 23 | } 24 | 25 | public void inserSelective(Move move) { 26 | SqlSession session = MyBatiesUtils.getSqlSession(true); 27 | try { 28 | MoveMapper moveMapper = session.getMapper(MoveMapper.class); 29 | Move m = moveMapper.selectByPrimaryKey(move.getId()); 30 | if (m == null) { 31 | moveMapper.insertSelective(move); 32 | session.commit(); 33 | } 34 | } catch (Exception e) { 35 | logger.error(e.getMessage(), e); 36 | session.rollback(); 37 | } finally { 38 | //这里并不是断开连接而是将连接池还给连接池,解除占用 39 | session.close(); 40 | } 41 | } 42 | 43 | public void insertList(List moveList) { 44 | for (Move move : moveList) { 45 | inserSelective(move); 46 | } 47 | } 48 | 49 | public Move selectByPrimaryKey(String id) { 50 | SqlSession session = MyBatiesUtils.getSqlSession(true); 51 | MoveMapper moveMapper = session.getMapper(MoveMapper.class); 52 | return moveMapper.selectByPrimaryKey(id); 53 | } 54 | 55 | public void update(Move move) { 56 | SqlSession session = MyBatiesUtils.getSqlSession(true); 57 | try { 58 | MoveMapper moveMapper = session.getMapper(MoveMapper.class); 59 | moveMapper.updateByPrimaryKey(move); 60 | session.commit(); 61 | } catch (Exception e) { 62 | logger.error(e.getMessage(), e); 63 | session.rollback(); 64 | e.printStackTrace(); 65 | } finally { 66 | session.close(); 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/core/constants/ProxyConstants.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.core.constants; 2 | 3 | /** 4 | * Created by 单耀 on 2018/1/26. 
/**
 * Shared constants for proxy crawling and proxy validation.
 *
 * <p>Several fields are intentionally left non-final and public, exactly as before, because
 * code outside this class may assign them.
 */
public class ProxyConstants {
    /**
     * Original note: "whether to crawl new proxies".
     * NOTE(review): the name reads like "use the proxy file only" - confirm the intended polarity.
     */
    public static final boolean ISUSERFILE_ONLY = false;
    public static final String STANDARD = "standard";
    public static int SocketTimeout = 10000;
    public static int ConnectionTimeout = 10000;
    public static int TIMEOUT = 10000;
    /** URL requested through a proxy to test whether that proxy works. */
    public static String PROXYTEST_URL = "http://mcar.cc/forum.php";

    public static final long TIME_INTERVAL = 1000;
    /** Location of the serialized proxy file. */
    public static final String FILE_PATH= "../proxy/proxy.ser";
    /** Classpath root path followed by {@code "file"}; assigned in the static initializer. */
    public static String RESOURCES__FILE_PATH;
    /** File name under which serialized proxies are saved. */
    public static String PROXYSER_FILE_NMAE = "proxy.ser";

    /** User-Agent header values available to requests. */
    public final static String[] userAgentArray = new String[]{
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2623.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2623.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2623.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2623.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2623.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2623.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2623.110 Safari/537.36",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36"
    };

    static {
        // getResource("") may return null (for instance when the classes are not loaded from a
        // directory). Dereferencing it unguarded made the whole class fail to initialize, so
        // fall back to the relative "file" directory in that case.
        java.net.URL classpathRoot = ProxyConstants.class.getClassLoader().getResource("");
        String rootPath = classpathRoot == null ? "" : classpathRoot.getPath();
        RESOURCES__FILE_PATH = rootPath + "file";
    }
}
public void setCreateTime(Date createTime) { 85 | this.createTime = createTime; 86 | } 87 | 88 | public Date getLastUpdateTime() { 89 | return lastUpdateTime; 90 | } 91 | 92 | public void setLastUpdateTime(Date lastUpdateTime) { 93 | this.lastUpdateTime = lastUpdateTime; 94 | } 95 | 96 | @Override 97 | public String toString() { 98 | return "ZimuFileInfo{" + 99 | "zimuId=" + zimuId + 100 | ", fileName='" + fileName + '\'' + 101 | ", fileType='" + fileType + '\'' + 102 | ", extendId=" + extendId + 103 | '}'; 104 | } 105 | } -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/proxytool/ProxyPool.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool; 2 | 3 | import com.yao.spider.proxytool.entity.Proxy; 4 | import com.yao.spider.proxytool.parses.ip181.Ip181ProxyListParser; 5 | import com.yao.spider.proxytool.parses.ip66.Ip66ProxyListParser; 6 | import com.yao.spider.proxytool.parses.kuaidaili.KuaidailiProxyListParser; 7 | import com.yao.spider.proxytool.parses.mimiip.MimiipProxyListParser; 8 | import com.yao.spider.proxytool.parses.xicidaili.XicidailiProxyListParser; 9 | import com.yao.spider.core.constants.ProxyConstants; 10 | 11 | import java.util.HashMap; 12 | import java.util.HashSet; 13 | import java.util.Map; 14 | import java.util.Set; 15 | import java.util.concurrent.DelayQueue; 16 | import java.util.concurrent.locks.ReentrantReadWriteLock; 17 | 18 | /** 19 | * Created by 单耀 on 2018/1/27. 
20 | */ 21 | public class ProxyPool { 22 | //这里也可以简单粗暴的使用sychronized,因为写操作次数远大于读操作,区别并不是特别大 23 | public final static ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); 24 | public final static Set proxySet = new HashSet(); 25 | 26 | public static DelayQueue proxyQueue = new DelayQueue(); 27 | 28 | public static final Map proxyMap = new HashMap(); 29 | 30 | static { 31 | for (int i = 1; i <= 66; i++) { 32 | proxyMap.put("https://www.kuaidaili.com/free/intr/"+ i +"/", KuaidailiProxyListParser.class); 33 | proxyMap.put("https://www.kuaidaili.com/free/inha/" + i + "/", KuaidailiProxyListParser.class);//高匿 34 | } 35 | int pages = 8; 36 | for (int i = 1; i <= pages; i++) { 37 | proxyMap.put("http://www.xicidaili.com/wt/" + i + ".html", XicidailiProxyListParser.class); 38 | proxyMap.put("http://www.xicidaili.com/nn/" + i + ".html", XicidailiProxyListParser.class); 39 | proxyMap.put("http://www.xicidaili.com/wn/" + i + ".html", XicidailiProxyListParser.class); 40 | proxyMap.put("http://www.xicidaili.com/nt/" + i + ".html", XicidailiProxyListParser.class); 41 | proxyMap.put("http://www.ip181.com/daili/" + i + ".html", Ip181ProxyListParser.class); 42 | //高匿 43 | proxyMap.put("http://www.mimiip.com/gngao/" + i, MimiipProxyListParser.class); 44 | //普匿 45 | proxyMap.put("http://www.mimiip.com/gnpu/" + i, MimiipProxyListParser.class); 46 | proxyMap.put("http://www.66ip.cn/" + i + ".html", Ip66ProxyListParser.class); 47 | for (int j = 1; j < 34; j++) { 48 | proxyMap.put("http://www.66ip.cn/areaindex_" + j + "/" + i + ".html", Ip66ProxyListParser.class); 49 | } 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/dao/ZimuInfoExtendMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | id, zimu_info_id, ref_type, extend_value, extend_value_type, is_deleted, 
create_time, 16 | last_update_time 17 | 18 | 19 | update t_zimu_info_extend set extend_value = #{extendValue,jdbcType=VARCHAR} where zimu_info_id = #{zimuInfoId,jdbcType=BIGINT} 20 | 21 | 22 | 28 | 29 | 37 | 38 | 45 | 46 | 47 | 48 | insert into t_zimu_info_extend (id, zimu_info_id, ref_type, 49 | extend_value, extend_value_type, is_deleted, 50 | create_time, last_update_time) 51 | values (#{id,jdbcType=BIGINT}, #{zimuInfoId,jdbcType=BIGINT}, #{refType,jdbcType=TINYINT}, 52 | #{extendValue,jdbcType=VARCHAR}, #{extendValueType,jdbcType=TINYINT}, 0, 53 | now(),now()) 54 | 55 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/proxytool/task/ProxyTestTask.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool.task; 2 | 3 | import com.yao.spider.proxytool.ProxyHttpClient; 4 | import com.yao.spider.proxytool.ProxyPool; 5 | import com.yao.spider.core.entity.Page; 6 | import com.yao.spider.proxytool.entity.Proxy; 7 | import com.yao.spider.core.http.util.HttpClientUtil; 8 | import com.yao.spider.core.constants.ProxyConstants; 9 | import com.yao.spider.core.util.ProxyUtil; 10 | import org.apache.http.HttpHost; 11 | import org.apache.http.client.config.RequestConfig; 12 | import org.apache.http.client.methods.HttpGet; 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | 16 | import java.io.IOException; 17 | 18 | /** 19 | * Created by 单耀 on 2018/1/28. 
20 | * 测试代理任务 21 | * 测试下载的代理是否可用 22 | */ 23 | public class ProxyTestTask implements Runnable { 24 | private static Logger logger = LoggerFactory.getLogger(ProxyTestTask.class); 25 | private Proxy proxy; 26 | 27 | public ProxyTestTask(Proxy proxy) { 28 | this.proxy = proxy; 29 | } 30 | 31 | public void run() { 32 | HttpGet request = new HttpGet(ProxyConstants.PROXYTEST_URL); 33 | try { 34 | HttpHost proxyHost = new HttpHost(proxy.getIp(), proxy.getPort()); 35 | RequestConfig requestConfig = HttpClientUtil.getRequestConfigBuilder().setProxy(proxyHost).build(); 36 | request.setConfig(requestConfig); 37 | // CloseableHttpResponse response = HttpClientUtil.getResponse(request); 38 | Page page = ProxyHttpClient.getInstance().getPage(request); 39 | String logStr = Thread.currentThread().getName() + " " + proxy.getProxyStr() + 40 | " executing request " + page.getUrl() + " response statusCode:" + page.getStatusCode(); 41 | 42 | if (page == null || page.getStatusCode() != 200) { 43 | // logger.warn("该代理不可用:" + logStr); 44 | return; 45 | } 46 | if (page.getStatusCode() == 200) { 47 | ProxyPool.proxyQueue.add(proxy); 48 | logger.debug(proxy.getProxyStr() + "-----代理可用-----"); 49 | // logger.debug(proxy.toString() + "--------" + page.toString()); 50 | ProxyPool.lock.writeLock().lock(); 51 | ProxyPool.proxySet.add(proxy); 52 | ProxyPool.lock.writeLock().unlock(); 53 | System.out.println("目前可用代理数量:"+ProxyPool.proxyQueue.size()); 54 | } 55 | } catch (IOException e) { 56 | // logger.debug("IOException", e); 57 | } finally { 58 | if (request != null) { 59 | request.releaseConnection(); 60 | } 61 | 62 | if (proxy != null && !ProxyUtil.isDiscardProxy(proxy)){ 63 | ProxyPool.proxyQueue.add(proxy); 64 | } 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /zimuku.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `t_zimu_info` ( 2 | `id` BIGINT(10) NOT NULL AUTO_INCREMENT, 3 | 
`zimu_id` BIGINT(10) NOT NULL, 4 | `zimu_title` varchar(500) NOT NULL COMMENT '字母名称', 5 | `zimu_translator` TINYINT(4) NULL DEFAULT NULL COMMENT '翻译字幕组1:YYest', 6 | `zimu_language` INT(10) not NULL COMMENT '1:双;2:繁;3:繁,双;4:英;5:英,双;6:英,繁;7:英,繁,双;8:简;9:简,双;10:繁,简;11:双,繁,简;12:简,英;13:简,英,双;14:简,英,繁;15:all', 7 | `zimu_quality`TINYINT(4) NULL DEFAULT NULL COMMENT '字幕质量', 8 | `zimu_type` INT(10) not NULL COMMENT '1:srt,2:ass\ssa,3:all', 9 | `detail_url` varchar(500) null default null comment '详情页url', 10 | `download_page_url` varchar(500) null default null comment '下载页url', 11 | `is_deleted` TINYINT(4) NOT NULL COMMENT '是否删除', 12 | `create_time` DATETIME NOT NULL COMMENT '创建时间', 13 | `last_update_time` TIMESTAMP NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 14 | PRIMARY KEY (`id`) 15 | ) 16 | COMMENT='字幕' 17 | COLLATE='utf8_general_ci' 18 | ENGINE=InnoDB 19 | AUTO_INCREMENT=3 20 | ; 21 | 22 | CREATE TABLE `t_zimu_info_extend` ( 23 | `id` BIGINT(10) NOT NULL AUTO_INCREMENT, 24 | `zimu_info_id` BIGINT(10) NOT NULL, 25 | `ref_type` TINYINT(10) not NULL COMMENT '1:下载地址', 26 | `extend_value` varchar(500) null default null comment 'ref_type:1:下载url', 27 | `extend_value_type` TINYINT(10) UNSIGNED NULL default null COMMENT '1:', 28 | `is_deleted` TINYINT(4) NOT NULL COMMENT '是否删除', 29 | `create_time` DATETIME NOT NULL COMMENT '创建时间', 30 | `last_update_time` TIMESTAMP NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 31 | PRIMARY KEY (`id`), 32 | INDEX `zimu_info_id` (`zimu_info_id`) 33 | ) 34 | COMMENT='字幕' 35 | COLLATE='utf8_general_ci' 36 | ENGINE=InnoDB 37 | AUTO_INCREMENT=1 38 | ; 39 | 40 | CREATE TABLE `t_zimu_html` ( 41 | `id` BIGINT NOT NULL AUTO_INCREMENT, 42 | `html_value` text NULL default null, 43 | `html_type` TINYINT(4) NOT NULL COMMENT '类型1:列表,2:详情', 44 | `is_deleted` TINYINT NOT NULL COMMENT '是否删除', 45 | `create_time` DATETIME NOT NULL COMMENT '创建时间', 46 | `last_update_time` TIMESTAMP NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE 
CURRENT_TIMESTAMP, 47 | PRIMARY KEY (`id`) 48 | 49 | ) 50 | COMMENT='字幕源码内容' 51 | COLLATE='utf8_general_ci' 52 | ENGINE=InnoDB 53 | AUTO_INCREMENT=0 54 | ; 55 | 56 | 57 | CREATE TABLE `t_zimu_file_info` ( 58 | `id` BIGINT NOT NULL AUTO_INCREMENT, 59 | `zimu_id` BIGINT NOT NULL, 60 | `file_name` VARCHAR(500) NULL COMMENT '名称', 61 | `file_type` varchar(10) null comment '类型', 62 | `extend_id` BIGINT UNSIGNED NOT NULL COMMENT '上级地址', 63 | `download_url` VARCHAR(500) NULL DEFAULT NULL COMMENT '下载url', 64 | `is_deleted` TINYINT NOT NULL COMMENT '是否删除', 65 | `create_time` DATETIME NOT NULL COMMENT '创建时间', 66 | `last_update_time` TIMESTAMP NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 67 | PRIMARY KEY (`id`), 68 | UNIQUE INDEX `zimu_id` (`zimu_id`) 69 | ) 70 | COMMENT='字幕' 71 | COLLATE='utf8_general_ci' 72 | ENGINE=InnoDB 73 | AUTO_INCREMENT=1 74 | ; 75 | -------------------------------------------------------------------------------- /douban-spider/src/main/resources/generatorConfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 57 |
58 | 59 |
60 |
-------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/utils/DBUtil.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.utils; 2 | 3 | import com.yao.spider.core.http.client.BaseHttpClient; 4 | import com.yao.spider.douban.DoubanHttpClient; 5 | import com.yao.spider.douban.constants.DBConstants; 6 | import com.yao.spider.core.entity.Page; 7 | import org.jsoup.Jsoup; 8 | import org.jsoup.nodes.Document; 9 | import org.jsoup.nodes.Element; 10 | import org.jsoup.select.Elements; 11 | 12 | import java.io.IOException; 13 | import java.io.UnsupportedEncodingException; 14 | import java.net.URLEncoder; 15 | import java.util.HashMap; 16 | import java.util.Map; 17 | 18 | /** 19 | * Created by user on 2018/2/8. 20 | * 获取电影类型 21 | */ 22 | public class DBUtil { 23 | public static Map getType(String type) { 24 | synchronized (DBUtil.class) { 25 | Map map = new HashMap(); 26 | boolean SUCCESS = true; 27 | while (SUCCESS) { 28 | if ("move".equals(type)) { 29 | try { 30 | String url = String.format(DBConstants.MOVE_START_URL_TYPE, URLEncoder.encode(DBConstants.MOVE_START_TYPE_NAME, "UTF-8"), DBConstants.MOVE_START_TYPE_VALUE); 31 | Page page = BaseHttpClient.getInstance().getPage(url); 32 | if (page != null && page.getStatusCode() == 200 && !"".equals(page.getHtml())) { 33 | return getTypeMap(type, page.getHtml()); 34 | } 35 | } catch (UnsupportedEncodingException e) { 36 | e.printStackTrace(); 37 | } catch (IOException e) { 38 | e.printStackTrace(); 39 | } 40 | } else if("people".equals(type)) { 41 | //爬取其他信息 42 | } else { 43 | SUCCESS = false; 44 | } 45 | } 46 | return map; 47 | } 48 | } 49 | 50 | public static Map getTypeMap(String type, String context) { 51 | Map map = new HashMap(); 52 | try { 53 | if ("move".equals(type)) { 54 | Document document = Jsoup.parse(context); 55 | Elements elements = 
document.select("[class=types]").select("a"); 56 | for (Element element : elements) { 57 | String attr = element.attributes().get("href"); 58 | String[] temp = attr.split("&"); 59 | String typeName = temp[0].substring(temp[0].indexOf("=") + 1,temp[0].length()); 60 | String typeValue = temp[1].substring(temp[1].indexOf("=") + 1,temp[1].length()); 61 | map.put(typeName, typeValue); 62 | } 63 | } 64 | } catch (Exception e) { 65 | e.printStackTrace(); 66 | } 67 | return map; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/bean/BaoMingDetailBean.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.sqkfq.bean; 2 | 3 | import java.util.Date; 4 | 5 | /** 6 | * @author 单耀 7 | * @version 1.0 8 | * @description 9 | * @date 2021/2/3 10:30 10 | */ 11 | public class BaoMingDetailBean { 12 | private Integer ZPID; 13 | private String BT; 14 | private String BMSJB; 15 | private String BMSJE; 16 | private String FKSJB; 17 | private String FKSJE; 18 | private String DYSJB; 19 | private String DYSJE; 20 | private Integer State; 21 | private Integer BGM; 22 | private Date AddTime; 23 | /** 24 | * 返回操作状态 25 | * -2:重新缴费 -1:审核未通过1:我要报名 100:报名审核中 101:网上缴费 102:支付确认中 103:打印准考证 104:打印准考证 1000:报名未开始 1001:报名已结束 2000:缴费未开始 2001:缴费已截止 3000:打印未开始 3001:打印已截止 3002:查询成绩 26 | */ 27 | private Long OPT; 28 | 29 | public Integer getZPID() { 30 | return ZPID; 31 | } 32 | 33 | public void setZPID(Integer ZPID) { 34 | this.ZPID = ZPID; 35 | } 36 | 37 | public String getBT() { 38 | return BT; 39 | } 40 | 41 | public void setBT(String BT) { 42 | this.BT = BT; 43 | } 44 | 45 | public String getBMSJB() { 46 | return BMSJB; 47 | } 48 | 49 | public void setBMSJB(String BMSJB) { 50 | this.BMSJB = BMSJB; 51 | } 52 | 53 | public String getBMSJE() { 54 | return BMSJE; 55 | } 56 | 57 | public void setBMSJE(String BMSJE) { 58 | this.BMSJE = BMSJE; 59 | } 60 | 
61 | public String getFKSJB() { 62 | return FKSJB; 63 | } 64 | 65 | public void setFKSJB(String FKSJB) { 66 | this.FKSJB = FKSJB; 67 | } 68 | 69 | public String getFKSJE() { 70 | return FKSJE; 71 | } 72 | 73 | public void setFKSJE(String FKSJE) { 74 | this.FKSJE = FKSJE; 75 | } 76 | 77 | public String getDYSJB() { 78 | return DYSJB; 79 | } 80 | 81 | public void setDYSJB(String DYSJB) { 82 | this.DYSJB = DYSJB; 83 | } 84 | 85 | public String getDYSJE() { 86 | return DYSJE; 87 | } 88 | 89 | public void setDYSJE(String DYSJE) { 90 | this.DYSJE = DYSJE; 91 | } 92 | 93 | public Integer getState() { 94 | return State; 95 | } 96 | 97 | public void setState(Integer state) { 98 | State = state; 99 | } 100 | 101 | public Integer getBGM() { 102 | return BGM; 103 | } 104 | 105 | public void setBGM(Integer BGM) { 106 | this.BGM = BGM; 107 | } 108 | 109 | public Date getAddTime() { 110 | return AddTime; 111 | } 112 | 113 | public void setAddTime(Date addTime) { 114 | AddTime = addTime; 115 | } 116 | 117 | public Long getOPT() { 118 | return OPT; 119 | } 120 | 121 | public void setOPT(Long OPT) { 122 | this.OPT = OPT; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /douban-spider/src/test/java/com/yao/test/test/ListBL.java: -------------------------------------------------------------------------------- 1 | package com.yao.test.test; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | 8 | /** 9 | * Created by shanyao on 2018/4/2. 
10 | */ 11 | public class ListBL { 12 | public static void main(String[] args) { 13 | ListBL listBL = new ListBL(); 14 | listBL.testMain(); 15 | } 16 | public void testMain(){ 17 | //初始化 18 | int sum = 10000; 19 | List arrList = new ArrayList(); 20 | List linkList = new LinkedList(); 21 | String con = "test"; 22 | for(int i=0;i list) { 39 | long startTime = 0L,endTime = 0L; 40 | String str; 41 | startTime = System.nanoTime(); 42 | for(int i=list.size() - 1; i>=0; i--){ 43 | str = list.get(i); 44 | } 45 | endTime = System.nanoTime(); 46 | return endTime - startTime; 47 | } 48 | public long testFor02(List list) { 49 | long startTime = 0L,endTime = 0L; 50 | String str; 51 | startTime = System.nanoTime(); 52 | for(int i=0,len = list.size(); i list) { 60 | long startTime = 0L,endTime = 0L; 61 | String str; 62 | startTime = System.nanoTime(); 63 | for(String str1 : list){ 64 | str = str1; 65 | } 66 | endTime = System.nanoTime(); 67 | return endTime - startTime; 68 | } 69 | 70 | 71 | public long testIterator(List list){ 72 | long startTime = 0L, endTime = 0L; 73 | String str; 74 | startTime = System.nanoTime(); 75 | Iterator it = list.iterator(); 76 | while(it.hasNext()){ 77 | str = it.next(); 78 | } 79 | endTime = System.nanoTime(); 80 | return endTime - startTime; 81 | 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/task/ZimuPageListTask.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.task; 2 | 3 | import com.yao.spider.core.entity.Page; 4 | import com.yao.spider.core.factory.ParserFactory; 5 | import com.yao.spider.core.parser.IPageParser; 6 | import com.yao.spider.core.task.AbstractTask; 7 | import com.yao.spider.core.util.MyBatiesUtils; 8 | import com.yao.spider.zhihu.entity.User; 9 | import com.yao.spider.zhihu.parsers.ZhiHuUserParser; 10 | import com.yao.spider.zimuku.domain.ZimuInfo; 11 | 
import com.yao.spider.zimuku.parsers.ZimuParser; 12 | import com.yao.spider.zimuku.service.ZimuInfoService; 13 | import com.yao.spider.zimuku.service.impl.ZimuInfoServiceImpl; 14 | import org.apache.ibatis.session.SqlSession; 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | import java.util.List; 19 | 20 | /** 21 | * @create by 单耀 22 | * @create date 2020/3/25 23 | */ 24 | public class ZimuPageListTask extends AbstractTask { 25 | private static final Logger logger = LoggerFactory.getLogger(ZimuPageListTask.class); 26 | private String url; 27 | private Integer pageIndex = 3; 28 | SqlSession session; 29 | public void run() { 30 | getPage(url); 31 | } 32 | 33 | public void retry() { 34 | 35 | } 36 | 37 | public void handle(Page page) { 38 | IPageParser pageParser = ParserFactory.getParserClass(ZimuParser.class); 39 | if (pageParser != null) { 40 | List list = pageParser.parser(page.getHtml()); 41 | ZimuInfoService service = new ZimuInfoServiceImpl(); 42 | logger.info("pageIndex:" + this.getPageIndex() + " size:" + list.size()); 43 | service.batchInsert(list, this.session); 44 | } 45 | } 46 | 47 | public String getUrl() { 48 | return url; 49 | } 50 | 51 | public void setUrl(String url) { 52 | this.url = url; 53 | } 54 | 55 | public SqlSession getSession() { 56 | return session; 57 | } 58 | 59 | public void setSession(SqlSession session) { 60 | this.session = session; 61 | } 62 | 63 | public Integer getPageIndex() { 64 | return pageIndex; 65 | } 66 | 67 | public void setPageIndex(Integer pageIndex) { 68 | this.pageIndex = pageIndex; 69 | } 70 | 71 | public static void main(String[] args) { 72 | ZimuPageListTask task = new ZimuPageListTask(); 73 | String url = "http://www.zimuku.la/t/HJns0?p="; 74 | SqlSession session = MyBatiesUtils.getSqlSession(); 75 | for (; task.getPageIndex() < 794; task.setPageIndex(task.getPageIndex() + 1)) { 76 | url += task.getPageIndex(); 77 | try { 78 | task.setSession(session); 79 | // 
task.getPage("http://www.zimuku.la/t/HJns0?p=15"); 80 | task.getPage(url, true); 81 | Thread.sleep(2000); 82 | } catch (Exception e) { 83 | logger.error(e.getMessage()); 84 | } 85 | } 86 | session.close(); 87 | } 88 | 89 | 90 | } 91 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/dao/SqkfqBaomingMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | id, user_mid, zpid, title, start_appy_time, end_appy_time, start_pay_time, end_pay_time, 22 | start_print_time, end_print_time, `state`, bgm, add_time, opt 23 | 24 | 30 | 35 | 41 | 42 | insert into t_sqkfq_baoming (id, user_mid, zpid, 43 | title, start_appy_time, end_appy_time, 44 | start_pay_time, end_pay_time, start_print_time, 45 | end_print_time, `state`, bgm, 46 | add_time, opt) 47 | values (#{id,jdbcType=BIGINT}, #{userMid,jdbcType=BIGINT}, #{zpid,jdbcType=INTEGER}, 48 | #{title,jdbcType=VARCHAR}, #{startAppyTime,jdbcType=TIMESTAMP}, #{endAppyTime,jdbcType=TIMESTAMP}, 49 | #{startPayTime,jdbcType=TIMESTAMP}, #{endPayTime,jdbcType=TIMESTAMP}, #{startPrintTime,jdbcType=TIMESTAMP}, 50 | #{endPrintTime,jdbcType=TIMESTAMP}, #{state,jdbcType=INTEGER}, #{bgm,jdbcType=INTEGER}, 51 | #{addTime,jdbcType=TIMESTAMP}, #{opt,jdbcType=BIGINT}) 52 | 53 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/proxytool/entity/Proxy.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool.entity; 2 | 3 | import java.io.Serializable; 4 | import java.util.concurrent.Delayed; 5 | import java.util.concurrent.TimeUnit; 6 | 7 | /** 8 | * Created by 单耀 on 2018/1/26. 
9 | * 在一些字段前面加上 transient关键字,目的是为了序列化时忽略该字段,因为序列化时这些null的字段会占用不必要的空间 10 | * 可以查看ArrayList源码中就是采用这种技术避免null被序列化 11 | * 12 | */ 13 | public class Proxy implements Delayed,Serializable { 14 | 15 | private static final long serialVersionUID = -3231293936247728930L; 16 | 17 | private String ip; 18 | private Integer port; 19 | private long lastSuccessTime; 20 | //来源 21 | private String dataSource; 22 | transient private long timeIntervsl;//任务间隔时间 23 | transient private int failureTimes;//请求失败次数 24 | transient private int successfulTimes;//请求成功次数 25 | 26 | public Proxy() {} 27 | 28 | public Proxy(String ip, int port) { 29 | this.ip = ip; 30 | this.port = port; 31 | } 32 | 33 | public Proxy(String ip, Integer port, long delayTime, String dataSource) { 34 | this.ip = ip; 35 | this.port = port; 36 | this.timeIntervsl = TimeUnit.NANOSECONDS.convert(delayTime,TimeUnit.MILLISECONDS) + System.nanoTime(); 37 | this.dataSource = dataSource; 38 | } 39 | 40 | public String getIp() { 41 | return ip; 42 | } 43 | 44 | public void setIp(String ip) { 45 | this.ip = ip; 46 | } 47 | 48 | public int getPort() { 49 | return port; 50 | } 51 | 52 | public void setPort(int port) { 53 | this.port = port; 54 | } 55 | 56 | public String getDataSource() { 57 | return dataSource; 58 | } 59 | 60 | public void setDataSource(String dataSource) { 61 | this.dataSource = dataSource; 62 | } 63 | 64 | public long getLastSuccessTime() { 65 | return lastSuccessTime; 66 | } 67 | 68 | public void setLastSuccessTime(long lastSuccessTime) { 69 | this.lastSuccessTime = lastSuccessTime; 70 | } 71 | 72 | public long getTimeIntervsl() { 73 | return timeIntervsl; 74 | } 75 | 76 | public void setTimeIntervsl(long timeIntervsl) { 77 | this.timeIntervsl = timeIntervsl; 78 | } 79 | 80 | public long getDelay(TimeUnit unit) { 81 | return 0; 82 | } 83 | 84 | public int compareTo(Delayed o) { 85 | return 0; 86 | } 87 | 88 | public int getFailureTimes() { 89 | return failureTimes; 90 | } 91 | 92 | public void 
setFailureTimes(int failureTimes) { 93 | this.failureTimes = failureTimes; 94 | } 95 | 96 | public int getSuccessfulTimes() { 97 | return successfulTimes; 98 | } 99 | 100 | public void setSuccessfulTimes(int successfulTimes) { 101 | this.successfulTimes = successfulTimes; 102 | } 103 | 104 | public String getProxyStr() { 105 | return ip + ":" + port + "--datasource:" + dataSource + "---failureTimes:" + failureTimes; 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/task/AbstractTaskDeprecated.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.task; 2 | 3 | import com.yao.spider.douban.DoubanHttpClient; 4 | import com.yao.spider.proxytool.ProxyPool; 5 | import com.yao.spider.core.entity.Page; 6 | import com.yao.spider.proxytool.entity.Proxy; 7 | import com.yao.spider.core.http.util.HttpClientUtil; 8 | import com.yao.spider.core.util.ProxyUtil; 9 | import org.apache.http.HttpHost; 10 | import org.apache.http.client.methods.HttpGet; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | /** 15 | * Created by 单耀 on 2018/2/6. 
16 | * TODO 17 | */ 18 | @Deprecated 19 | public abstract class AbstractTaskDeprecated implements Runnable {//TODO 改成泛型,这样打印日志会更佳明显有助排查错误 20 | private static Logger logger = LoggerFactory.getLogger(AbstractTaskDeprecated.class); 21 | protected static DoubanHttpClient doubanHttpClient = DoubanHttpClient.getInstance(); 22 | protected boolean isUseProxy; 23 | protected String url; 24 | protected Proxy currentProxy; 25 | protected int retryTimes; 26 | 27 | public void getPage(String url) { 28 | this.getPage(url, isUseProxy); 29 | } 30 | 31 | public void getPage(String url, boolean isUseProxy) { 32 | this.url = url; 33 | this.isUseProxy = isUseProxy; 34 | 35 | HttpGet request = new HttpGet(url); 36 | try { 37 | Page page = null; 38 | if (isUseProxy) { 39 | currentProxy = ProxyPool.proxyQueue.take(); 40 | HttpHost proxy = new HttpHost(currentProxy.getIp(), currentProxy.getPort()); 41 | request.setConfig(HttpClientUtil.getRequestConfigBuilder().setProxy(proxy).build()); 42 | // page = doubanHttpClient.getPage(request); 43 | } else { 44 | // page = doubanHttpClient.getPage(url); 45 | } 46 | if (page != null && page.getStatusCode() == 200) { 47 | if (currentProxy != null) 48 | currentProxy.setSuccessfulTimes(currentProxy.getSuccessfulTimes() + 1); 49 | handle(page); 50 | // return page; 51 | } else { 52 | currentProxy.setFailureTimes(currentProxy.getFailureTimes() + 1); 53 | retry(); 54 | } 55 | } catch (Exception e) { 56 | currentProxy.setFailureTimes(currentProxy.getFailureTimes() + 1); 57 | // e.printStackTrace(); 58 | // logger.error(e.getMessage(), e); 59 | retry(); 60 | } finally { 61 | if (request != null) { 62 | request.releaseConnection(); 63 | } 64 | 65 | if (currentProxy != null && !ProxyUtil.isDiscardProxy(currentProxy)) { 66 | ProxyPool.proxyQueue.add(currentProxy); 67 | } else { 68 | if (currentProxy != null) 69 | logger.info("丢弃代理:" + currentProxy.getProxyStr()); 70 | } 71 | } 72 | // return null; 73 | } 74 | 75 | public abstract void retry(); 76 | 77 | 
public abstract void handle(Page page); 78 | 79 | } 80 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/task/DouBanInfoListPageTask.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.task; 2 | 3 | import com.yao.spider.common.config.CommonConfig; 4 | import com.yao.spider.common.constants.Constants; 5 | import com.yao.spider.core.task.AbstractTask; 6 | import com.yao.spider.douban.DoubanHttpClient; 7 | import com.yao.spider.douban.dao.IMoveDao; 8 | import com.yao.spider.douban.dao.Impl.MoveDaoImpl; 9 | import com.yao.spider.douban.entity.move.Move; 10 | import com.yao.spider.core.parser.IPageParser; 11 | import com.yao.spider.core.factory.ParserFactory; 12 | import com.yao.spider.douban.parsers.move.MoveParser; 13 | import com.yao.spider.proxytool.ProxyPool; 14 | import com.yao.spider.core.entity.Page; 15 | import com.yao.spider.core.http.util.HttpClientUtil; 16 | import com.yao.spider.core.util.ProxyUtil; 17 | import org.apache.http.HttpHost; 18 | import org.apache.http.client.methods.HttpGet; 19 | import org.slf4j.Logger; 20 | import org.slf4j.LoggerFactory; 21 | 22 | import java.util.List; 23 | 24 | /** 25 | * Created by 单耀 on 2018/1/28. 
26 | * 下载电影信息列表页面 27 | */ 28 | public class DouBanInfoListPageTask extends AbstractTask implements Runnable{ 29 | private static Logger logger = LoggerFactory.getLogger(DouBanInfoListPageTask.class); 30 | private DoubanHttpClient doubanHttpClient = DoubanHttpClient.getInstance(); 31 | private int startNumber; 32 | 33 | public DouBanInfoListPageTask(String url, boolean isUseProxy) { 34 | this.url = url; 35 | this.isUseProxy = isUseProxy; 36 | } 37 | public DouBanInfoListPageTask(String url, boolean isUseProxy, int retryTimes, int startNumber) { 38 | this.url = url; 39 | this.isUseProxy = isUseProxy; 40 | this.startNumber = startNumber; 41 | this.retryTimes = retryTimes; 42 | } 43 | 44 | public void run() { 45 | getPage(url); 46 | } 47 | 48 | 49 | public void retry() { 50 | // logger.info("电影列表重试次数=" + retryTimes + "--开始编号:" + startNumber + "---重试代理:" + currentProxy.getProxyStr() + "---代理失败/成功次数:" + currentProxy.getFailureTimes()+ "/" + currentProxy.getSuccessfulTimes()); 51 | doubanHttpClient.getDownLoadMoveListExector().execute(new DouBanInfoListPageTask(url, true, retryTimes + 1, startNumber)); 52 | } 53 | 54 | public void handle(Page page) { 55 | if (page.getHtml() == null || "".equals(page.getHtml())) { 56 | return; 57 | } 58 | IPageParser parser = ParserFactory.getParserClass(MoveParser.class); 59 | List moveList = parser.parser(page.getHtml()); 60 | if (moveList != null && moveList.size() > 0) { 61 | for (Move move : moveList) { 62 | logger.info(move.toString()); 63 | } 64 | if (CommonConfig.dbEnable) { 65 | IMoveDao moveDao = new MoveDaoImpl(); 66 | moveDao.insertList(moveList); 67 | } 68 | } 69 | //深度爬虫获取电影详细信息 70 | if (Constants.ISDEEP) { 71 | if (moveList != null && moveList.size() > 0) { 72 | for (Move move : moveList) { 73 | logger.info(move.toString()); 74 | doubanHttpClient.getDownLoadMoveInfoExector().execute(new DouBanDetailInfoDownLoadTask(move, true)); 75 | } 76 | } 77 | } 78 | } 79 | } 80 | 
/**
 * Row object for a registration ("baoming") record: identifiers, title, the apply / pay / print
 * time windows, state flags and bookkeeping columns. Plain mutable bean; every property starts
 * out as {@code null}.
 */
public class SqkfqBaoming {

    // identifiers
    private Long id;
    private Long userMid;
    private Integer zpid;
    private String title;

    // time windows (property spelling "Appy" is kept because it is part of the accessor names)
    private Date startAppyTime;
    private Date endAppyTime;
    private Date startPayTime;
    private Date endPayTime;
    private Date startPrintTime;
    private Date endPrintTime;

    // status and bookkeeping
    private Integer state;
    private Integer bgm;
    private Date addTime;
    private Long opt;

    /** @return the record id */
    public Long getId() {
        return this.id;
    }

    /** @param value the record id */
    public void setId(Long value) {
        this.id = value;
    }

    /** @return the user MID */
    public Long getUserMid() {
        return this.userMid;
    }

    /** @param value the user MID */
    public void setUserMid(Long value) {
        this.userMid = value;
    }

    /** @return the zpid value */
    public Integer getZpid() {
        return this.zpid;
    }

    /** @param value the zpid value */
    public void setZpid(Integer value) {
        this.zpid = value;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param value the title */
    public void setTitle(String value) {
        this.title = value;
    }

    /** @return start of the apply window */
    public Date getStartAppyTime() {
        return this.startAppyTime;
    }

    /** @param value start of the apply window */
    public void setStartAppyTime(Date value) {
        this.startAppyTime = value;
    }

    /** @return end of the apply window */
    public Date getEndAppyTime() {
        return this.endAppyTime;
    }

    /** @param value end of the apply window */
    public void setEndAppyTime(Date value) {
        this.endAppyTime = value;
    }

    /** @return start of the pay window */
    public Date getStartPayTime() {
        return this.startPayTime;
    }

    /** @param value start of the pay window */
    public void setStartPayTime(Date value) {
        this.startPayTime = value;
    }

    /** @return end of the pay window */
    public Date getEndPayTime() {
        return this.endPayTime;
    }

    /** @param value end of the pay window */
    public void setEndPayTime(Date value) {
        this.endPayTime = value;
    }

    /** @return start of the print window */
    public Date getStartPrintTime() {
        return this.startPrintTime;
    }

    /** @param value start of the print window */
    public void setStartPrintTime(Date value) {
        this.startPrintTime = value;
    }

    /** @return end of the print window */
    public Date getEndPrintTime() {
        return this.endPrintTime;
    }

    /** @param value end of the print window */
    public void setEndPrintTime(Date value) {
        this.endPrintTime = value;
    }

    /** @return the state flag */
    public Integer getState() {
        return this.state;
    }

    /** @param value the state flag */
    public void setState(Integer value) {
        this.state = value;
    }

    /** @return the bgm value */
    public Integer getBgm() {
        return this.bgm;
    }

    /** @param value the bgm value */
    public void setBgm(Integer value) {
        this.bgm = value;
    }

    /** @return the time the row was added */
    public Date getAddTime() {
        return this.addTime;
    }

    /** @param value the time the row was added */
    public void setAddTime(Date value) {
        this.addTime = value;
    }

    /** @return the opt value */
    public Long getOpt() {
        return this.opt;
    }

    /** @param value the opt value */
    public void setOpt(Long value) {
        this.opt = value;
    }
}
14 | */ 15 | public class DoubanHttpClient { 16 | private static Logger logger = LoggerFactory.getLogger(DoubanHttpClient.class); 17 | private static DoubanHttpClient instance; 18 | 19 | private ThreadPoolExecutor downLoadMoveListExector; 20 | 21 | private ThreadPoolExecutor downLoadMoveInfoExector; 22 | 23 | public static volatile int MOVE_START = 0; 24 | 25 | public DoubanHttpClient() { 26 | init(); 27 | } 28 | private void init () { 29 | 30 | downLoadMoveListExector = new ThreadPoolExecutor(100, 100, 0L, 31 | TimeUnit.SECONDS, 32 | new LinkedBlockingQueue(1000), 33 | new ThreadFactory() { 34 | public Thread newThread(Runnable r) { 35 | return new Thread(r, "downLoadMoveListExector " + r.hashCode()); 36 | } 37 | }); 38 | 39 | downLoadMoveInfoExector = new ThreadPoolExecutor(100, 100, 0L, 40 | TimeUnit.SECONDS, 41 | new LinkedBlockingQueue(1000), 42 | new ThreadFactory() { 43 | public Thread newThread(Runnable r) { 44 | return new Thread(r, "downLoadMoveInfoExector " + r.hashCode()); 45 | } 46 | }); 47 | //设置baohecelve 48 | downLoadMoveInfoExector.setRejectedExecutionHandler(new ThreadPoolExecutor.DiscardPolicy()); 49 | downLoadMoveListExector.setRejectedExecutionHandler(new ThreadPoolExecutor.DiscardPolicy()); 50 | 51 | } 52 | 53 | public static DoubanHttpClient getInstance() { 54 | if (instance == null) { 55 | synchronized (DoubanHttpClient.class) { 56 | if (instance == null) { 57 | instance = new DoubanHttpClient(); 58 | } 59 | } 60 | } 61 | return instance; 62 | } 63 | 64 | public ThreadPoolExecutor getDownLoadMoveListExector() { 65 | return downLoadMoveListExector; 66 | } 67 | 68 | public ThreadPoolExecutor getDownLoadMoveInfoExector() { 69 | return downLoadMoveInfoExector; 70 | } 71 | 72 | public void setDownLoadMoveListExector(ThreadPoolExecutor downLoadMoveListExector) { 73 | this.downLoadMoveListExector = downLoadMoveListExector; 74 | } 75 | 76 | public void setDownLoadMoveInfoExector(ThreadPoolExecutor downLoadMoveInfoExector) { 77 | 
this.downLoadMoveInfoExector = downLoadMoveInfoExector; 78 | } 79 | 80 | public void startDouBan() { 81 | // new Thread(new SpiderDouBanInfo()).start(); 82 | new Thread(new StartWithTypeTask()).start(); 83 | } 84 | 85 | public void stopSpiderDouban(boolean isNoew) { 86 | if (isNoew) { 87 | this.downLoadMoveListExector.shutdownNow(); 88 | this.downLoadMoveInfoExector.shutdownNow(); 89 | } else { 90 | this.downLoadMoveListExector.shutdown(); 91 | while (downLoadMoveListExector.isTerminated()) { 92 | downLoadMoveInfoExector.shutdown(); 93 | } 94 | } 95 | } 96 | 97 | 98 | } 99 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/dao/SqkfqUserMapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | id, user_mid, user_name, user_sex, user_birthday, user_header_url, user_id_card, 27 | user_jiguan, user_merage, user_zzmm, user_sg, user_weight, user_address, user_phone, 28 | user_work, user_jinji, user_jinji_phone, user_xuli, user_zhuanye 29 | 30 | 31 | update t_sqkfq_user 32 | set user_jiguan = #{jiguan} 33 | where user_mid = #{userMid} 34 | 35 | 41 | 42 | insert into t_sqkfq_user (id, user_mid, user_name, 43 | user_sex, user_birthday, user_header_url, 44 | user_id_card, user_jiguan, user_merage, 45 | user_zzmm, user_sg, user_weight, 46 | user_address, user_phone, user_work, 47 | user_jinji, user_jinji_phone, user_xuli, 48 | user_zhuanye) 49 | values (#{id,jdbcType=BIGINT}, #{userMid,jdbcType=BIGINT}, #{userName,jdbcType=VARCHAR}, 50 | #{userSex,jdbcType=VARCHAR}, #{userBirthday,jdbcType=VARCHAR}, #{userHeaderUrl,jdbcType=VARCHAR}, 51 | #{userIdCard,jdbcType=VARCHAR}, #{userJiguan,jdbcType=VARCHAR}, #{userMerage,jdbcType=VARCHAR}, 52 | #{userZzmm,jdbcType=VARCHAR}, #{userSg,jdbcType=VARCHAR}, #{userWeight,jdbcType=VARCHAR}, 53 | 
#{userAddress,jdbcType=VARCHAR}, #{userPhone,jdbcType=VARCHAR}, #{userWork,jdbcType=VARCHAR}, 54 | #{userJinji,jdbcType=VARCHAR}, #{userJinjiPhone,jdbcType=VARCHAR}, #{userXuli,jdbcType=VARCHAR}, 55 | #{userZhuanye,jdbcType=VARCHAR}) 56 | 57 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/common/util/FileUtil.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.common.util; 2 | 3 | import com.yao.spider.exception.SpiderRuntimeException; 4 | import org.apache.commons.io.FilenameUtils; 5 | import org.apache.commons.io.IOUtils; 6 | import org.apache.maven.shared.utils.io.FileUtils; 7 | import org.mule.config.i18n.MessageFactory; 8 | 9 | import java.io.BufferedOutputStream; 10 | import java.io.File; 11 | import java.io.FileOutputStream; 12 | import java.io.IOException; 13 | import java.io.InputStream; 14 | import java.io.OutputStream; 15 | import java.nio.charset.Charset; 16 | import java.util.Enumeration; 17 | import java.util.zip.ZipEntry; 18 | import java.util.zip.ZipFile; 19 | 20 | /** 21 | * Created by shanyao on 2020/3/29 22 | */ 23 | public class FileUtil { 24 | public static void unZipFile() { 25 | File zip = new File("F:\\视频剪辑\\字幕\\1-2335\\[zmk.pw]0fae226f4186c7859dacbbc80f9cbb29.rar"); 26 | File out = new File("F:\\视频剪辑\\字幕\\1-2335\\temp\\tst"); 27 | try { 28 | String baseName = FilenameUtils.getBaseName("F:\\视频剪辑\\字幕\\1-2335\\temp\\tst\\tst.zip"); 29 | System.out.println(baseName); 30 | String extension = FileUtils.extension("[zmk.pw]0b4f5c751ba4d3ba7370cdde1b5572c7.zip"); 31 | System.out.println(extension); 32 | unzip(zip, out); 33 | } catch (IOException e) { 34 | e.printStackTrace(); 35 | } 36 | } 37 | 38 | public static void unzip(File archive, File directory) throws IOException { 39 | ZipFile zip = null; 40 | 41 | if (directory.exists()) { 42 | if (!directory.isDirectory()) { 43 | throw new IOException("Directory is 
not a directory: " + directory); 44 | } 45 | } else if (!directory.mkdirs()) { 46 | throw new IOException("Could not create directory: " + directory); 47 | } 48 | 49 | try { 50 | zip = new ZipFile(archive, Charset.forName("GBK")); 51 | Enumeration entries = zip.entries(); 52 | 53 | while(entries.hasMoreElements()) { 54 | ZipEntry entry = (ZipEntry)entries.nextElement(); 55 | File f = newFile(directory, entry.getName()); 56 | if (entry.isDirectory()) { 57 | if (!f.exists() && !f.mkdirs()) { 58 | throw new IOException("Could not create directory: " + f); 59 | } 60 | } else { 61 | File file = new File(directory, entry.getName()); 62 | 63 | if (!file.getParentFile().exists() && !file.getParentFile().mkdirs()) { 64 | throw new IOException("Unable to create folders for zip entry: " + entry.getName()); 65 | } 66 | 67 | InputStream is = zip.getInputStream(entry); 68 | OutputStream os = new BufferedOutputStream(new FileOutputStream(f)); 69 | IOUtils.copy(is, os); 70 | IOUtils.closeQuietly(is); 71 | IOUtils.closeQuietly(os); 72 | } 73 | } 74 | } finally { 75 | if (zip != null) { 76 | zip.close(); 77 | } 78 | 79 | } 80 | 81 | } 82 | 83 | public static File newFile(File parent, String child) { 84 | try { 85 | return (new File(parent, child)).getCanonicalFile(); 86 | } catch (IOException var3) { 87 | throw new SpiderRuntimeException(MessageFactory.createStaticMessage("Unable to create a canonical file for parent: " + parent + " and child: " + child), var3); 88 | } 89 | } 90 | 91 | public static void main(String[] args) { 92 | unZipFile(); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zimuku/domain/ZimuInfo.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zimuku.domain; 2 | 3 | import java.util.Date; 4 | 5 | public class ZimuInfo { 6 | private Long id; 7 | 8 | private Long zimuId; 9 | 10 | private String zimuTitle; 11 
/**
 * Row object for one subtitle ("zimu") entry: its identifiers, descriptive attributes, page
 * URLs and bookkeeping columns. Plain mutable bean; every property starts out as {@code null}.
 */
public class ZimuInfo {

    // identifiers
    private Long id;
    private Long zimuId;

    // descriptive attributes
    private String zimuTitle;
    private String zimuTranslator;
    private Integer zimuLanguage;
    private Float zimuQuality;
    private Integer zimuType;

    // page addresses
    private String detailUrl;
    private String downloadPageUrl;

    // bookkeeping
    private Integer isDeleted;
    private Date createTime;
    private Date lastUpdateTime;

    // references to related rows
    private Long subId;
    private String subName;
    private Long htmlId;

    /** @return the row id */
    public Long getId() {
        return this.id;
    }

    /** @param value the row id */
    public void setId(Long value) {
        this.id = value;
    }

    /** @return the subtitle id */
    public Long getZimuId() {
        return this.zimuId;
    }

    /** @param value the subtitle id */
    public void setZimuId(Long value) {
        this.zimuId = value;
    }

    /** @return the subtitle title */
    public String getZimuTitle() {
        return this.zimuTitle;
    }

    /** @param value the subtitle title */
    public void setZimuTitle(String value) {
        this.zimuTitle = value;
    }

    /** @return the translator */
    public String getZimuTranslator() {
        return this.zimuTranslator;
    }

    /** @param value the translator */
    public void setZimuTranslator(String value) {
        this.zimuTranslator = value;
    }

    /** @return the language code */
    public Integer getZimuLanguage() {
        return this.zimuLanguage;
    }

    /** @param value the language code */
    public void setZimuLanguage(Integer value) {
        this.zimuLanguage = value;
    }

    /** @return the quality score */
    public Float getZimuQuality() {
        return this.zimuQuality;
    }

    /** @param value the quality score */
    public void setZimuQuality(Float value) {
        this.zimuQuality = value;
    }

    /** @return the type code */
    public Integer getZimuType() {
        return this.zimuType;
    }

    /** @param value the type code */
    public void setZimuType(Integer value) {
        this.zimuType = value;
    }

    /** @return the detail page URL */
    public String getDetailUrl() {
        return this.detailUrl;
    }

    /** @param value the detail page URL */
    public void setDetailUrl(String value) {
        this.detailUrl = value;
    }

    /** @return the download page URL */
    public String getDownloadPageUrl() {
        return this.downloadPageUrl;
    }

    /** @param value the download page URL */
    public void setDownloadPageUrl(String value) {
        this.downloadPageUrl = value;
    }

    /** @return the deleted flag */
    public Integer getIsDeleted() {
        return this.isDeleted;
    }

    /** @param value the deleted flag */
    public void setIsDeleted(Integer value) {
        this.isDeleted = value;
    }

    /** @return the creation time */
    public Date getCreateTime() {
        return this.createTime;
    }

    /** @param value the creation time */
    public void setCreateTime(Date value) {
        this.createTime = value;
    }

    /** @return the last update time */
    public Date getLastUpdateTime() {
        return this.lastUpdateTime;
    }

    /** @param value the last update time */
    public void setLastUpdateTime(Date value) {
        this.lastUpdateTime = value;
    }

    /** @return the sub id */
    public Long getSubId() {
        return this.subId;
    }

    /** @param value the sub id */
    public void setSubId(Long value) {
        this.subId = value;
    }

    /** @return the sub name */
    public String getSubName() {
        return this.subName;
    }

    /** @param value the sub name */
    public void setSubName(String value) {
        this.subName = value;
    }

    /** @return the id of the stored HTML row */
    public Long getHtmlId() {
        return this.htmlId;
    }

    /** @param value the id of the stored HTML row */
    public void setHtmlId(Long value) {
        this.htmlId = value;
    }
}
#{zimuType,jdbcType=INTEGER}, #{detailUrl,jdbcType=VARCHAR}, #{downloadPageUrl,jdbcType=VARCHAR}, 42 | 0, now(), now(), 43 | #{subId,jdbcType=BIGINT}, #{subName,jdbcType=VARCHAR}, #{htmlId,jdbcType=BIGINT} 44 | ) 45 | 46 | 47 | insert into t_zimu_info (zimu_id, zimu_title, 48 | zimu_translator, zimu_language, zimu_quality, 49 | zimu_type, detail_url, download_page_url, 50 | is_deleted, create_time, last_update_time, 51 | sub_id, sub_name, html_id) 52 | values 53 | 54 | (#{info.zimuId,jdbcType=BIGINT}, #{info.zimuTitle,jdbcType=VARCHAR}, 55 | #{info.zimuTranslator,jdbcType=VARCHAR}, #{info.zimuLanguage,jdbcType=INTEGER}, #{info.zimuQuality,jdbcType=REAL}, 56 | #{info.zimuType,jdbcType=INTEGER}, #{info.detailUrl,jdbcType=VARCHAR}, #{info.downloadPageUrl,jdbcType=VARCHAR}, 57 | 0, now(), now(), 58 | #{info.subId,jdbcType=BIGINT}, #{info.subName,jdbcType=VARCHAR},#{info.htmlId,jdbcType=BIGINT}) 59 | 60 | ON DUPLICATE KEY UPDATE 61 | zimu_title=VALUES(zimu_title), 62 | detail_url=VALUES(detail_url) 63 | 64 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/sqkfq/manager/UserInfoDetailManager.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.sqkfq.manager; 2 | 3 | import com.yao.spider.common.OKHttp2Utils; 4 | import com.yao.spider.core.util.MyBatiesUtils; 5 | import com.yao.spider.sqkfq.domain.SqkfqBaoming; 6 | import com.yao.spider.sqkfq.domain.SqkfqUser; 7 | import com.yao.spider.sqkfq.parses.SqkfaUserParser; 8 | import com.yao.spider.sqkfq.service.SqkfqBaomingService; 9 | import com.yao.spider.sqkfq.service.SqkfqBaomingServiceImpl; 10 | import com.yao.spider.sqkfq.service.SqkfqUserService; 11 | import com.yao.spider.sqkfq.service.SqkfqUserServiceImpl; 12 | import org.apache.ibatis.session.SqlSession; 13 | 14 | import java.io.IOException; 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Map; 18 | 19 
| /** 20 | * @author 单耀 21 | * @version 1.0 22 | * @description 23 | * @date 2021/2/3 14:42 24 | */ 25 | public class UserInfoDetailManager { 26 | public static void getUserInfo() { 27 | String idCard = "321321199408017856"; 28 | SqlSession session = MyBatiesUtils.getSqlSession(); 29 | SqkfqBaomingService service = new SqkfqBaomingServiceImpl(); 30 | SqkfqUserService userService = new SqkfqUserServiceImpl(); 31 | List sqkfqBaomings = service.selectByZipCodeAndOpt(session, 57, 3001L); 32 | for (SqkfqBaoming baoming : sqkfqBaomings) { 33 | SqlSession session1 = MyBatiesUtils.getSqlSession(); 34 | String s = simpleRequest(baoming.getUserMid(), idCard); 35 | SqkfqUser parser = SqkfaUserParser.parser(s); 36 | parser.setUserMid(baoming.getUserMid()); 37 | idCard = parser.getUserIdCard(); 38 | userService.insert(session1, parser); 39 | System.out.println(parser.toString()); 40 | } 41 | } 42 | public static void updateJiguan() { 43 | String idCard = "321321199408017856"; 44 | SqlSession session = MyBatiesUtils.getSqlSession(); 45 | SqkfqBaomingService service = new SqkfqBaomingServiceImpl(); 46 | SqkfqUserService userService = new SqkfqUserServiceImpl(); 47 | List sqkfqBaomings = service.selectByZipCodeAndOpt(session, 57, 3001L); 48 | for (SqkfqBaoming baoming : sqkfqBaomings) { 49 | SqlSession session1 = MyBatiesUtils.getSqlSession(); 50 | String s = simpleRequest(baoming.getUserMid(), idCard); 51 | SqkfqUser parser = SqkfaUserParser.parser(s); 52 | parser.setUserMid(baoming.getUserMid()); 53 | idCard = parser.getUserIdCard(); 54 | userService.updateJiguan(session1, parser.getUserJiguan(), parser.getUserMid()); 55 | System.out.println(parser.toString()); 56 | } 57 | } 58 | 59 | public static String simpleRequest(Long mid, String idCard) { 60 | String result = null; 61 | String url = "http://www.sqjkqrc.com/JoinSys/SysMain/Members/MyResume.aspx"; 62 | try { 63 | String cookie = "ASP.NET_SessionId=qktncfx02ed4odyihga3izo5; SYSLogin=MID=" + mid + "&LoginName=" + idCard; 64 | 
result = OKHttp2Utils.sendPostWithHeaders(url, getHeader(cookie)); 65 | } catch (IOException e) { 66 | try { 67 | // 发生异常重新请求一次 68 | result = OKHttp2Utils.sendPost(url, ""); 69 | } catch (IOException e1) { 70 | 71 | } 72 | } 73 | return result; 74 | } 75 | 76 | public static Map getHeader(String cokie) { 77 | Map header = new HashMap<>(); 78 | 79 | header.put("Cookie", cokie); 80 | return header; 81 | } 82 | 83 | public static void main(String[] args) { 84 | // simpleRequest(10580L, "321321199310197414"); 85 | // getUserInfo(); 86 | updateJiguan(); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/proxytool/task/ProxyPageTask.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.proxytool.task; 2 | 3 | import com.yao.spider.core.factory.ParserFactory; 4 | import com.yao.spider.core.parser.IPageParser; 5 | import com.yao.spider.proxytool.ProxyHttpClient; 6 | import com.yao.spider.proxytool.ProxyPool; 7 | import com.yao.spider.core.entity.Page; 8 | import com.yao.spider.proxytool.entity.Proxy; 9 | import com.yao.spider.core.http.util.HttpClientUtil; 10 | import com.yao.spider.core.util.ProxyUtil; 11 | import org.apache.http.HttpHost; 12 | import org.apache.http.client.methods.HttpGet; 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | 16 | import java.util.List; 17 | 18 | /** 19 | * Created by 单耀 on 2018/1/27. 
20 | * 下载代理页面代理任务类 21 | */ 22 | public class ProxyPageTask implements Runnable{ 23 | private static Logger logger = LoggerFactory.getLogger(ProxyPageTask.class); 24 | 25 | //是否继续下载代理 26 | public static volatile boolean isContinueDownProxy = true; 27 | private String url; 28 | private boolean isUseProxy; 29 | private Proxy currentProxy; 30 | private ProxyHttpClient proxyHttpClient = ProxyHttpClient.getInstance(); 31 | 32 | public ProxyPageTask(String url, boolean isUseProxy) { 33 | this.url = url; 34 | this.isUseProxy = isUseProxy; 35 | } 36 | 37 | public void run() { 38 | HttpGet request = null; 39 | try { 40 | Page page = new Page(); 41 | if (isUseProxy && ProxyPool.proxyQueue.size() > 0) { 42 | 43 | currentProxy = ProxyPool.proxyQueue.take(); 44 | HttpHost proxy = new HttpHost(currentProxy.getIp(), currentProxy.getPort()); 45 | request.setConfig(HttpClientUtil.getRequestConfigBuilder().setProxy(proxy).build()); 46 | page = proxyHttpClient.getPage(request); 47 | } else { 48 | page = proxyHttpClient.getPage(url); 49 | } 50 | page.setProxy(currentProxy); 51 | int status = page.getStatusCode(); 52 | /*String logStr = Thread.currentThread().getName() + " " + getProxyStr(currentProxy) + 53 | " executing request url: " + page.getUrl() + " response statusCode:" + status; 54 | 55 | logger.debug(logStr);*/ 56 | 57 | if (status == 200) { 58 | handle(page); 59 | } else { 60 | Thread.sleep(100); 61 | retry(); 62 | } 63 | 64 | } catch (Exception e) { 65 | // logger.error(e.getMessage(), e); 66 | retry(); 67 | } finally { 68 | 69 | if (currentProxy != null && !ProxyUtil.isDiscardProxy(currentProxy)){ 70 | ProxyPool.proxyQueue.add(currentProxy); 71 | } 72 | 73 | if (request != null) { 74 | request.releaseConnection(); 75 | } 76 | } 77 | } 78 | 79 | private void retry() { 80 | proxyHttpClient.getProxyDoloadThreadExector().execute(new ProxyPageTask(url, true)); 81 | } 82 | 83 | //T处理下载页面 84 | public void handle(Page page) { 85 | if (page.getHtml() == null || 
"".equals(page.getHtml())) { 86 | return; 87 | } 88 | IPageParser parser = ParserFactory.getParserClass(ProxyPool.proxyMap.get(url)); 89 | List proxyList = parser.parser(page.getHtml()); 90 | if (isContinueDownProxy) { 91 | if (proxyList != null && proxyList.size() > 0) { 92 | for (Proxy proxy : proxyList) { 93 | //测试代理是否可用 94 | proxyHttpClient.getProxyProxyTestExector().execute(new ProxyTestTask(proxy)); 95 | } 96 | } 97 | } 98 | 99 | 100 | } 101 | 102 | private String getProxyStr(Proxy proxy) { 103 | if (proxy == null) 104 | return ""; 105 | 106 | return proxy.getIp() + ":" + proxy.getPort(); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/parsers/move/MoveParserDeprecated.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.parsers.move; 2 | 3 | import com.yao.spider.douban.entity.move.Move; 4 | import com.yao.spider.core.parser.IPageParser; 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.select.Elements; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by 单耀 on 2018/1/30. 
15 | */ 16 | @Deprecated 17 | public class MoveParserDeprecated implements IPageParser { 18 | public List parser(String html) { 19 | List list = new ArrayList(); 20 | // System.out.println(html); 21 | Move move = new Move(); 22 | try { 23 | Document document = Jsoup.parse(html); 24 | Elements elements = document.select("div#info"); 25 | //导演 26 | Elements directs = elements.select("[rel=v:directedBy]"); 27 | String _direct = move.getDirector(); 28 | for (Element element : directs) { 29 | if (_direct != null) { 30 | _direct += "/" + element.text(); 31 | } else { 32 | _direct = element.text(); 33 | } 34 | } 35 | move.setDirector(_direct); 36 | //编剧 37 | Elements eAttrs = elements.select("span.attrs"); 38 | Elements screenWriters = eAttrs.get(1).select("a"); 39 | String _screenW = move.getScreenwriter(); 40 | for (Element element : screenWriters) { 41 | if (_screenW != null) { 42 | _screenW += "/" + element.text(); 43 | } else { 44 | _screenW = element.text(); 45 | } 46 | } 47 | //z主演 48 | Elements mainaActors = elements.select("[rel=v:starring]"); 49 | String _actors = move.getMainaactors(); 50 | for (Element element : mainaActors) { 51 | if (_actors != null) { 52 | _actors += "/" + element.text(); 53 | } else { 54 | _actors = element.text(); 55 | } 56 | } 57 | move.setMainaactors(_actors); 58 | //z类型 59 | Elements types = elements.select("[property=v:genre]"); 60 | String _type = move.getType(); 61 | for (Element element : types) { 62 | if (_type != null) { 63 | _type += "/" + element.text(); 64 | } else { 65 | _type = element.text(); 66 | } 67 | } 68 | move.setType(_type); 69 | 70 | Elements elementsPL = elements.select("span.pl"); 71 | 72 | //制片国家、地区 TODO 73 | //语言 TODO 74 | 75 | //上映日期 76 | Elements showDates = elements.select("[property=v:initialReleaseDate]"); 77 | String _showdate = move.getMainaactors(); 78 | for (Element element : showDates) { 79 | if (_showdate != null) { 80 | _showdate += "/" + element.text(); 81 | } else { 82 | _showdate = element.text(); 
83 | } 84 | } 85 | move.setShowdate(_showdate); 86 | //片长 // TODO: 2018/2/4 87 | Elements runtime = elements.select("[property=v:runtime]"); 88 | move.setRuntime(runtime.first().text()); 89 | //IMD连接 90 | Elements imdb = elements.select("[rel=nofollow]"); 91 | move.setImdb(imdb.first().text()); 92 | 93 | //评价人数 94 | Elements rating_sum = document.select("[property=v:votes]"); 95 | move.setVotecount(Integer.valueOf(rating_sum.get(0).text())); 96 | } catch (Exception e) { 97 | 98 | } 99 | 100 | list.add(move); 101 | return list; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/zhihu/parsers/ZhiHuUserParser.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.zhihu.parsers; 2 | 3 | import com.yao.spider.core.parser.IPageParser; 4 | import com.yao.spider.zhihu.entity.User; 5 | import net.sf.json.JSONArray; 6 | import net.sf.json.JSONObject; 7 | import org.jsoup.Jsoup; 8 | import org.jsoup.nodes.Document; 9 | import org.jsoup.select.Elements; 10 | 11 | import java.lang.reflect.Field; 12 | import java.util.ArrayList; 13 | 14 | import java.util.List; 15 | 16 | /** 17 | * Created by user on 2018/3/28. 
18 | */ 19 | public class ZhiHuUserParser implements IPageParser{ 20 | public List parser(String html) { 21 | JSONObject object = JSONObject.fromObject(html); 22 | JSONArray jsonArray = object.getJSONArray("data"); 23 | List userList = new ArrayList(20); 24 | //将for循环中要建立的对象在外面先创建,这种做法避免内存中有多份Object对象引用存在,对象很大的话,就耗费内存了 25 | JSONObject jsonObject = null; 26 | User user = null; 27 | //TODO 改为不使用放射机制,效率貌似会比较低 28 | for (int i = 0,len = jsonArray.size(); i < len; i++) { 29 | user = new User(); 30 | jsonObject = jsonArray.getJSONObject(i); 31 | setUserPropertyValue(user, "agrees", jsonObject, "voteup_count"); 32 | setUserPropertyValue(user, "answers", jsonObject, "answer_count"); 33 | setUserPropertyValue(user, "asks", jsonObject, "question_count"); 34 | setUserPropertyValue(user, "username", jsonObject, "name"); 35 | setUserPropertyValue(user, "business", jsonObject.getJSONObject("business"), "business"); 36 | JSONArray educations = jsonObject.getJSONArray("educations"); 37 | 38 | if (educations != null && educations.size() > 1) { 39 | setUserPropertyValue(user, "education", educations.getJSONObject(0).getJSONObject("school"), "name"); 40 | } 41 | JSONArray employments = jsonObject.getJSONArray("employments"); 42 | if (employments != null && employments.size() > 0) { 43 | setUserPropertyValue(user, "position", employments.getJSONObject(0).getJSONObject("job"), "name"); 44 | setUserPropertyValue(user, "company", employments.getJSONObject(0).getJSONObject("company"), "name"); 45 | } 46 | setUserPropertyValue(user, "followees", jsonObject, "following_count"); 47 | setUserPropertyValue(user, "followers", jsonObject, "follower_count"); 48 | JSONArray locations = jsonObject.getJSONArray("locations"); 49 | if (educations != null && educations.size() > 1) { 50 | setUserPropertyValue(user, "location", locations.getJSONObject(0), "name"); 51 | } 52 | setUserPropertyValue(user, "articles", jsonObject, "articles_count"); 53 | setUserPropertyValue(user, "thanks", jsonObject, 
"voteup_count"); 54 | setUserPropertyValue(user, "url", jsonObject, "url"); 55 | setUserPropertyValue(user, "userToken", jsonObject, "url_token"); 56 | setUserPropertyValue(user, "userId", jsonObject, "id"); 57 | 58 | //gender 59 | Integer gender = jsonObject.getInt("gender"); 60 | if (gender != null) { 61 | if (gender == 1) { 62 | user.setSex("男"); 63 | } else { 64 | user.setSex("女"); 65 | } 66 | } 67 | userList.add(user); 68 | 69 | } 70 | return userList; 71 | } 72 | // 73 | 74 | /** 75 | * 通过反射技术对User属性复制 76 | * @param user User实体 77 | * @param propertyName 要赋值的属性名 78 | * @param jsonObject JSON对象 79 | * @param key 从JSON中对应的属性名称 80 | * 学习参考网址: 81 | * 1. https://blog.csdn.net/starryninglong/article/details/60468440 82 | * 2. https://www.cnblogs.com/zhouyalei/archive/2013/09/12/java-reflect.html 83 | */ 84 | private void setUserPropertyValue(User user, String propertyName, JSONObject jsonObject, String key) { 85 | try { 86 | Field field = user.getClass().getDeclaredField(propertyName); 87 | Object o = jsonObject.get(key); 88 | field.setAccessible(true); 89 | field.set(user, o); 90 | } catch (Exception e) { 91 | 92 | } 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /douban-spider/src/main/java/com/yao/spider/douban/task/move/SpiderWithTypeTask.java: -------------------------------------------------------------------------------- 1 | package com.yao.spider.douban.task.move; 2 | 3 | import com.yao.spider.core.task.AbstractTask; 4 | import com.yao.spider.douban.DoubanHttpClient; 5 | import com.yao.spider.douban.constants.DBConstants; 6 | import com.yao.spider.douban.task.DouBanInfoListPageTask; 7 | import com.yao.spider.core.entity.Page; 8 | import net.sf.json.JSONObject; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | /** 13 | * Created by user on 2018/2/8. 
14 | */ 15 | public class SpiderWithTypeTask extends AbstractTask implements Runnable { 16 | private static Logger logger = LoggerFactory.getLogger(SpiderWithTypeTask.class); 17 | private String typeName; 18 | private String typeValue; 19 | private int currentPersent; 20 | private int currentStart; 21 | private Page page; 22 | private DoubanHttpClient doubanHttpClient = DoubanHttpClient.getInstance(); 23 | 24 | public SpiderWithTypeTask(String typeName, String typeValue, boolean isUseProxy) { 25 | this.typeName = typeName; 26 | this.typeValue = typeValue; 27 | super.isUseProxy = isUseProxy; 28 | super.retryTimes = 0; 29 | } 30 | 31 | public SpiderWithTypeTask(String typeName, String typeValue, int currentPersent, int currentStart, boolean isUseProxy, int retryTimes) { 32 | logger.info("retryTime:"+retryTimes+"currentPersent:"+currentPersent); 33 | this.currentPersent = currentPersent; 34 | this.typeName = typeName; 35 | this.typeValue = typeValue; 36 | super.isUseProxy = isUseProxy; 37 | super.retryTimes = retryTimes; 38 | } 39 | 40 | public SpiderWithTypeTask(String url, boolean isUseProxy, int retryTimes) { 41 | super.url = url; 42 | super.isUseProxy = isUseProxy; 43 | super.retryTimes = retryTimes; 44 | } 45 | 46 | 47 | public void run() { 48 | try { 49 | if (retryTimes == 0) { 50 | //获取该标签的总条数 51 | for (int persent = 100; persent > 0; persent -= 10) { 52 | this.currentPersent = persent; 53 | String url = String.format(DBConstants.MOVE_PERSENT_COUNT_URL, typeValue, persent, persent - 10); 54 | getPage(url); 55 | if (page != null) { 56 | newMoveListTask(persent); 57 | Thread.sleep(1000); 58 | } 59 | } 60 | } else { 61 | logger.info("重试:currentPersent=" + currentPersent + "---" + " 减去10" + (currentPersent - 10 )); 62 | String url = String.format(DBConstants.MOVE_PERSENT_COUNT_URL, typeValue, currentPersent, currentPersent - 10); 63 | getPage(url); 64 | if (page != null) { 65 | newMoveListTask(currentPersent); 66 | } 67 | } 68 | } catch (Exception e) { 69 | 
logger.error(e.getMessage(), e); 70 | } 71 | } 72 | 73 | private void newMoveListTask(int persent) throws InterruptedException { 74 | JSONObject object = JSONObject.fromObject(page.getHtml()); 75 | if (object != null) { 76 | int total = object.getInt("total"); 77 | logger.info("请求成功:total=" + total + "TypeName=" + typeName + "TypeValue=" + typeValue + "Persent=" + persent); 78 | for (int start = 0; start < total; start += 20) { 79 | this.currentStart = start; 80 | String listURL = String.format(DBConstants.MOVE_TOP_LIST_URL, typeValue, persent, persent - 10, start); 81 | doubanHttpClient.getDownLoadMoveListExector().execute(new DouBanInfoListPageTask(listURL, true)); 82 | Thread.sleep(1000); 83 | } 84 | } 85 | } 86 | 87 | public void retry() { 88 | logger.info("URL=" + super.url + " 重试次数=" + retryTimes + "--开始编号/百分比:" + currentStart + "/" + currentPersent + "---重试代理:" + currentProxy.getProxyStr() + "---代理失败/成功次数:" + currentProxy.getFailureTimes()+ "/" + currentProxy.getSuccessfulTimes()); 89 | doubanHttpClient.getDownLoadMoveListExector().execute(new SpiderWithTypeTask(typeName, typeValue, currentPersent, currentStart, true, retryTimes + 1)); 90 | } 91 | 92 | public void handle(Page page){ 93 | this.page = page; 94 | } 95 | } 96 | --------------------------------------------------------------------------------