├── .gitignore
├── img
│   └── work.png
├── lib
│   ├── rexdb-1.0.4.jar
│   └── javassist-3.20.0-GA.jar
├── .settings
│   ├── org.eclipse.m2e.core.prefs
│   └── org.eclipse.jdt.core.prefs
├── resource
│   ├── log4j2.properties
│   ├── banner.txt
│   ├── config.yml
│   └── spam.txt
├── .project
├── src
│   └── org
│       ├── youseed
│       │   ├── spider
│       │   │   ├── saver
│       │   │   │   ├── MQBasic.java
│       │   │   │   ├── SpamAnalyzer.java
│       │   │   │   ├── mongo
│       │   │   │   │   ├── UpdateHash.java
│       │   │   │   │   ├── SaveStat.java
│       │   │   │   │   └── NewHash.java
│       │   │   │   ├── es
│       │   │   │   │   ├── UpdateHash.java
│       │   │   │   │   ├── UpdateHashOnTime.java
│       │   │   │   │   └── NewHash.java
│       │   │   │   ├── zsky
│       │   │   │   │   ├── SaveStat.java
│       │   │   │   │   ├── UpdateHash.java
│       │   │   │   │   └── NewHash.java
│       │   │   │   ├── MysqlBasic.java
│       │   │   │   ├── ESBasic.java
│       │   │   │   └── MongoBasic.java
│       │   │   ├── MysqlConn.java
│       │   │   ├── SpiderConfig.java
│       │   │   ├── RabbitMQConn.java
│       │   │   ├── ESConn.java
│       │   │   ├── ConfigUtil.java
│       │   │   └── MongoConn.java
│       │   └── Main.java
│       └── rex
│           └── db
│               └── configuration
│                   ├── R.java
│                   └── Configuration.java
├── .classpath
├── youseed-spider-saver.yml
├── pom.xml
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 |
--------------------------------------------------------------------------------
/img/work.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DHT-open/youseed-spider-saver-public/HEAD/img/work.png
--------------------------------------------------------------------------------
/lib/rexdb-1.0.4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DHT-open/youseed-spider-saver-public/HEAD/lib/rexdb-1.0.4.jar
--------------------------------------------------------------------------------
/lib/javassist-3.20.0-GA.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DHT-open/youseed-spider-saver-public/HEAD/lib/javassist-3.20.0-GA.jar
--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/resource/log4j2.properties:
--------------------------------------------------------------------------------
1 | name=PropertiesConfig
2 | property.filename = logs
3 | appenders = console
4 |
5 | appender.console.type = Console
6 | appender.console.name = STDOUT
7 | appender.console.layout.type = PatternLayout
8 | appender.console.layout.pattern = [%-5level] %d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %c{1} - %msg%n
9 |
10 |
11 | rootLogger.level = info
12 | rootLogger.appenderRef.stdout.ref = STDOUT
13 |
14 |
--------------------------------------------------------------------------------
/resource/banner.txt:
--------------------------------------------------------------------------------
1 | __ __ __ _ __
2 | \ \/ /___ __ __________ ___ ____/ / _________ (_)___/ /__ _____ _________ __ _____ _____
3 | \ / __ \/ / / / ___/ _ \/ _ \/ __ / / ___/ __ \/ / __ / _ \/ ___/ / ___/ __ `/ | / / _ \/ ___/
4 | / / /_/ / /_/ (__ ) __/ __/ /_/ / (__ ) /_/ / / /_/ / __/ / (__ ) /_/ /| |/ / __/ /
5 | /_/\____/\__,_/____/\___/\___/\__,_/ /____/ .___/_/\__,_/\___/_/ /____/\__,_/ |___/\___/_/
6 | /_/
7 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 | 	<name>youseed-spider-saver-public</name>
4 | 	<comment></comment>
5 | 	<projects>
6 | 	</projects>
7 | 	<buildSpec>
8 | 		<buildCommand>
9 | 			<name>org.eclipse.jdt.core.javabuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.m2e.core.maven2Builder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 	</buildSpec>
19 | 	<natures>
20 | 		<nature>org.eclipse.m2e.core.maven2Nature</nature>
21 | 		<nature>org.eclipse.jdt.core.javanature</nature>
22 | 	</natures>
23 | </projectDescription>
24 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.8
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
12 | org.eclipse.jdt.core.compiler.release=disabled
13 | org.eclipse.jdt.core.compiler.source=1.8
14 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/MQBasic.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.logging.log4j.LogManager;
6 | import org.apache.logging.log4j.Logger;
7 |
8 | import com.rabbitmq.client.Channel;
9 |
10 | /**
11 |  * Basic MQ operations: acknowledge and reject messages.
12 | */
13 | public class MQBasic {
14 |
15 | private static Logger logger = LogManager.getLogger(MQBasic.class);
16 |
17 | /**
18 |  * Acknowledge a message.
19 | */
20 | public void confirmMsg(Channel channel, long deliveryTag) {
21 | try {
22 | channel.basicAck(deliveryTag, false);
23 | } catch (IOException e) {
24 | logger.error("Failed to ack message: " + e.getMessage(), e);
25 | }
26 | }
27 |
28 | /**
29 |  * Reject a message and return it to the queue.
30 | */
31 | public void rejectMsg(Channel channel, long deliveryTag) {
32 | try {
33 | channel.basicReject(deliveryTag, true);
34 | } catch (IOException e1) {
35 | logger.error("Failed to reject message: " + e1.getMessage(), e1);
36 | }
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/src/org/rex/db/configuration/R.java:
--------------------------------------------------------------------------------
1 | package org.rex.db.configuration;
2 |
3 | import java.util.List;
4 | import java.util.Properties;
5 |
6 | import org.rex.DB;
7 | import org.rex.RMap;
8 | import org.rex.db.datasource.DataSourceFactory;
9 | import org.rex.db.datasource.SimpleDataSourceFactory;
10 | import org.rex.db.exception.DBException;
11 |
12 | public class R {
13 |
14 | public static void main(String[] args) throws DBException {
15 | Properties props = new Properties();
16 | props.put("driverClassName", "com.mysql.jdbc.Driver");
17 | props.put("url", "jdbc:mysql://localhost:3306/zsky?serverTimezone=GMT%2B8");
18 | props.put("username", "root");
19 | props.put("password", "activezz1983");
20 |
21 | DataSourceFactory factory = new SimpleDataSourceFactory(props);
22 | Configuration conf = new Configuration();
23 | conf.setDefaultDataSource(factory.getDataSource());
24 |
25 | Configuration.setInstance(conf);
26 |
27 | List<RMap> ml = DB.getMapList("select * from complaint");
28 | System.out.println(ml);
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/MysqlConn.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider;
2 |
3 | import java.util.Properties;
4 |
5 | import org.apache.logging.log4j.LogManager;
6 | import org.apache.logging.log4j.Logger;
7 | import org.rex.db.configuration.Configuration;
8 | import org.rex.db.datasource.DataSourceFactory;
9 | import org.rex.db.datasource.SimpleDataSourceFactory;
10 | import org.rex.db.exception.DBException;
11 |
12 | import com.alibaba.fastjson.JSONObject;
13 |
14 | public class MysqlConn {
15 |
16 | private static Logger logger = LogManager.getLogger(MysqlConn.class);
17 |
18 | /**
19 |  * Whether the database connection has been initialized.
20 | */
21 | public static boolean inited = false;
22 |
23 | /**
24 |  * Initialize the database connection.
25 | */
26 | public static synchronized void initDB() {
27 | if(inited) return;
28 |
29 | JSONObject mysql = ConfigUtil.getConfig().getJSONObject("mysql");
30 |
31 | Properties props = new Properties();
32 | props.put("driverClassName", "com.mysql.jdbc.Driver");
33 | props.put("url", mysql.getString("url"));
34 | props.put("username", mysql.getString("user"));
35 | props.put("password", mysql.getString("psw"));
36 |
37 | try {
38 | DataSourceFactory factory = new SimpleDataSourceFactory(props);
39 | Configuration conf = new Configuration();
40 | conf.setDefaultDataSource(factory.getDataSource());
41 | Configuration.setInstance(conf);
42 | inited = true;
43 |
44 | logger.info("MySQL connection initialized");
45 | } catch (DBException e) {
46 | logger.error("Failed to initialize MySQL connection: " + e.getMessage(), e);
47 | }
48 |
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/SpiderConfig.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider;
2 |
3 | public class SpiderConfig {
4 |
5 | //================== MQ configuration
6 | public static final String MQ_VIRTUAL_HOSTS = "/";
7 |
8 | //---- For MongoDB storage
9 | // Incoming data to store
10 | public static final String MQ_STORE_EXCHANGE = "store";
11 |
12 | // New hashes
13 | public static final String MQ_STORE_HASH_QUEUE = "store.new";
14 | public static final String MQ_STORE_HASH_ROUTING = "store.new";
15 |
16 | // Hashes awaiting popularity updates
17 | public static final String MQ_STORE_UPDATE_QUEUE = "store.update";
18 | public static final String MQ_STORE_UPDATE_ROUTING = "store.update";
19 |
20 | // Spider statistics
21 | public static final String MQ_STORE_SPIDER_QUEUE = "store.stat";
22 | public static final String MQ_STORE_SPIDER_ROUTING = "store.stat";
23 |
24 | //---- For the search engine
25 | // Search engine data
26 | public static final String MQ_ES_EXCHANGE = "es";
27 |
28 | // New hashes
29 | public static final String MQ_ES_HASH_QUEUE = "es_movie.new";
30 | public static final String MQ_ES_HASH_ROUTING = "es_movie.new";
31 |
32 | // Hashes awaiting popularity updates
33 | public static final String MQ_ES_UPDATE_QUEUE = "es_movie.update";
34 | public static final String MQ_ES_UPDATE_ROUTING = "es_movie.update";
35 |
36 |
37 | //================== Mongo configuration
38 | public static final String COLL_HASH = "seed_hash";
39 | public static final String COLL_FILE = "seed_filelist";
40 | public static final String COLL_STATE = "seed_stat";
41 |
42 |
43 | //================== ES configuration
44 | public static final String ES_INDEX_SEED = "seed";
45 | public static final String ES_TYPE_SEED = "seed";
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/RabbitMQConn.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider;
2 |
3 | import java.io.IOException;
4 | import java.util.concurrent.TimeoutException;
5 |
6 | import org.apache.logging.log4j.LogManager;
7 | import org.apache.logging.log4j.Logger;
8 |
9 | import com.alibaba.fastjson.JSONObject;
10 | import com.rabbitmq.client.Channel;
11 | import com.rabbitmq.client.Connection;
12 | import com.rabbitmq.client.ConnectionFactory;
13 |
14 | /**
15 |  * RabbitMQ connection helper.
16 | */
17 | public class RabbitMQConn {
18 |
19 | private static Logger logger = LogManager.getLogger(RabbitMQConn.class);
20 |
21 | public Channel getChannel() {
22 | JSONObject config = ConfigUtil.getConfig().getJSONObject("mq");
23 |
24 | String host = config.getString("host");
25 | int port = config.getIntValue("port");
26 | String username = config.getString("username");
27 | String password = config.getString("password");
28 | String virtualHost = config.getString("virtualHost");
29 |
30 | logger.info("--------- RabbitMQ configuration ------------");
31 | logger.info("Host|mq.host: " + host);
32 | logger.info("Port|mq.port: " + port);
33 | logger.info("Username|mq.username: " + username);
34 | logger.info("Virtual host|mq.virtualHost: " + virtualHost);
35 | logger.info("---------------------------------------------");
36 |
37 |
38 | ConnectionFactory factory = new ConnectionFactory();
39 | factory.setHost(host);
40 | factory.setPort(port);
41 | factory.setUsername(username);
42 | factory.setPassword(password);
43 | factory.setVirtualHost(virtualHost);
44 |
45 | Channel channel = null;
46 | try {
47 | Connection connection = factory.newConnection();
48 | channel = connection.createChannel();
49 | } catch (IOException | TimeoutException e) {
50 | logger.error("Failed to connect to RabbitMQ: " + e.getMessage(), e);
51 | }
52 |
53 | return channel;
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/resource/config.yml:
--------------------------------------------------------------------------------
1 | #LOGO
2 | banner: banner.txt
3 |
4 | ########### Write settings ############
5 | # Time of day to push updates to the ES search engine; if unset, the updater stays resident and applies updates continuously
6 | # When set, updates run once a day at the given time, format "hh:mm:ss", e.g. 04:00:00 updates daily at 4 a.m.
7 | esUpdateTime: '00:04:00'
8 |
9 | includeCategories:
10 |
11 | ########### Connection settings ############
12 | # RabbitMQ connection
13 | mq:
14 | host: 127.0.0.1
15 | port: 5672
16 |
17 | username: youseed
18 | password: youseed
19 | virtualHost: /
20 |
21 | # MongoDB connection
22 | mongo:
23 | url: 127.0.0.1
24 | port: 27017
25 | db: seed
26 | admindb:
27 | user:
28 | psw:
29 |
30 | # Elasticsearch connection
31 | es:
32 | url: 127.0.0.1
33 | port: 9300
34 |
35 | # MySQL connection
36 | mysql:
37 | url: jdbc:mysql://localhost:3306/zsky?useUnicode=true&characterEncoding=utf-8&serverTimezone=GMT%2B8
38 | user: root
39 | psw:
40 |
41 |
42 | ######### Global settings #########
43 | # Queue declarations and consumer binding configuration
44 | binding:
45 | es:
46 | exchage: search
47 | new:
48 | queue: search.new.1
49 | routing: 'search.new.*'
50 | update:
51 | queue: search.update.1
52 | routing: 'search.update.*'
53 | mongo:
54 | exchage: store
55 | new:
56 | queue: store.new.0
57 | routing: 'store.new.*'
58 | update:
59 | queue: store.update.0
60 | routing: 'store.update.*'
61 | stat:
62 | queue: store.stat.0
63 | routing: 'store.stat.*'
64 | mysql:
65 | exchage: store
66 | new:
67 | queue: store.new.0
68 | routing: 'store.new.*'
69 | update:
70 | queue: store.update.0
71 | routing: 'store.update.*'
72 | stat:
73 | queue: store.stat.0
74 | routing: 'store.stat.*'
75 |
76 | # Table names, index names, etc. used for storage
77 | store:
78 | es:
79 | index: seed
80 | type: seed
81 | mongo:
82 | hash: seed_hash
83 | filelist: seed_filelist
84 | stat: seed_stat
85 | mysql:
86 | hash: search_hash
87 | filelist: search_filelist
88 | stat: search_statusreport
89 |
90 |
--------------------------------------------------------------------------------
/youseed-spider-saver.yml:
--------------------------------------------------------------------------------
1 | #LOGO
2 | banner: banner.txt
3 |
4 | ########### Write settings ############
5 | # Time of day to push updates to the ES search engine; if unset, the updater stays resident and applies updates continuously
6 | # When set, updates run once a day at the given time, format "hh:mm:ss", e.g. 04:00:00 updates daily at 4 a.m.
7 | esUpdateTime: '00:04:00'
8 |
9 | includeCategories:
10 |
11 | ########### Connection settings ############
12 | # RabbitMQ connection
13 | mq:
14 | host: 127.0.0.1
15 | port: 5672
16 |
17 | username: youseed
18 | password: youseed
19 | virtualHost: /
20 |
21 | # MongoDB connection
22 | mongo:
23 | url: 127.0.0.1
24 | port: 27017
25 | db: seed
26 | admindb:
27 | user:
28 | psw:
29 |
30 | # Elasticsearch connection
31 | es:
32 | url: 127.0.0.1
33 | port: 9300
34 |
35 | # MySQL connection
36 | mysql:
37 | url: jdbc:mysql://localhost:3306/zsky?useUnicode=true&characterEncoding=utf-8&serverTimezone=GMT%2B8
38 | user: root
39 | psw:
40 |
41 |
42 | ######### Global settings #########
43 | # Queue declarations and consumer binding configuration
44 | binding:
45 | es:
46 | exchage: search
47 | new:
48 | queue: search.new.1
49 | routing: 'search.new.*'
50 | update:
51 | queue: search.update.1
52 | routing: 'search.update.*'
53 | mongo:
54 | exchage: store
55 | new:
56 | queue: store.new.0
57 | routing: 'store.new.*'
58 | update:
59 | queue: store.update.0
60 | routing: 'store.update.*'
61 | stat:
62 | queue: store.stat.0
63 | routing: 'store.stat.*'
64 | mysql:
65 | exchage: store
66 | new:
67 | queue: store.new.0
68 | routing: 'store.new.*'
69 | update:
70 | queue: store.update.0
71 | routing: 'store.update.*'
72 | stat:
73 | queue: store.stat.0
74 | routing: 'store.stat.*'
75 |
76 | # Table names, index names, etc. used for storage
77 | store:
78 | es:
79 | index: seed
80 | type: seed
81 | mongo:
82 | hash: seed_hash
83 | filelist: seed_filelist
84 | stat: seed_stat
85 | mysql:
86 | hash: search_hash
87 | filelist: search_filelist
88 | stat: search_statusreport
89 |
90 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/SpamAnalyzer.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.IOException;
5 | import java.io.InputStreamReader;
6 | import java.util.ArrayList;
7 | import java.util.List;
8 |
9 | import org.apache.logging.log4j.LogManager;
10 | import org.apache.logging.log4j.Logger;
11 |
12 | import com.alibaba.fastjson.JSONArray;
13 | import com.alibaba.fastjson.JSONObject;
14 |
15 | /**
16 |  * Detects spam/sensitive keywords in resource data.
17 | */
18 | public class SpamAnalyzer {
19 |
20 | private static Logger logger = LogManager.getLogger(SpamAnalyzer.class);
21 |
22 | static final String SPAM = "spam.txt";
23 |
24 | static List<String> spams = readSpam();
25 |
26 | /**
27 |  * Checks whether the name or the file list (first 5 entries) contains a spam keyword.
28 | */
29 | public static boolean isSpam(String name, JSONArray filelist5) {
30 | StringBuffer sb = new StringBuffer(name).append(" ");
31 |
32 | if(filelist5 != null) {
33 | for (int i = 0; i < filelist5.size(); i++) {
34 | JSONObject file = filelist5.getJSONObject(i);
35 | sb.append(file.getString("path")).append(" ");
36 | }
37 | }
38 |
39 | for(String spam : spams) {
40 | if(sb.toString().contains(spam)) {
41 | return true;
42 | }
43 | }
44 | return false;
45 | }
46 |
47 | /**
48 |  * Loads the keyword list from the classpath.
49 | */
50 | private static List<String> readSpam() {
51 | InputStreamReader ir = new InputStreamReader(SpamAnalyzer.class.getClassLoader().getResourceAsStream(SPAM));
52 | BufferedReader bf = new BufferedReader(ir);
53 |
54 | List<String> list = new ArrayList<>();
55 | String str;
56 | try {
57 | // Read the keyword file line by line
58 | while ((str = bf.readLine()) != null) {
59 | list.add(str);
60 | }
61 |
62 | bf.close();
63 | ir.close();
64 |
65 | } catch (IOException e) {
66 | logger.error("Failed to read the keyword file: " + e.getMessage(), e);
67 | }
68 |
69 | logger.info("Spam keyword count: " + list.size());
70 | return list;
71 | }
72 |
73 | //test
74 | public static void main(String[] args) {
75 | System.out.println(isSpam("porn", null));
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 | 	<modelVersion>4.0.0</modelVersion>
5 | 	<groupId>youseed-spider-saver-public</groupId>
6 | 	<artifactId>youseed-spider-saver-public</artifactId>
7 | 	<version>1.0.0</version>
8 | 	<dependencies>
9 | 		<dependency>
10 | 			<groupId>org.apache.logging.log4j</groupId>
11 | 			<artifactId>log4j-api</artifactId>
12 | 			<version>2.11.1</version>
13 | 		</dependency>
14 | 		<dependency>
15 | 			<groupId>org.apache.logging.log4j</groupId>
16 | 			<artifactId>log4j-core</artifactId>
17 | 			<version>2.11.1</version>
18 | 		</dependency>
19 | 		<dependency>
20 | 			<groupId>org.slf4j</groupId>
21 | 			<artifactId>slf4j-nop</artifactId>
22 | 			<version>1.7.2</version>
23 | 		</dependency>
24 | 		<dependency>
25 | 			<groupId>org.yaml</groupId>
26 | 			<artifactId>snakeyaml</artifactId>
27 | 			<version>1.23</version>
28 | 		</dependency>
29 | 		<dependency>
30 | 			<groupId>com.rabbitmq</groupId>
31 | 			<artifactId>amqp-client</artifactId>
32 | 			<version>5.5.0</version>
33 | 		</dependency>
34 | 		<dependency>
35 | 			<groupId>com.alibaba</groupId>
36 | 			<artifactId>fastjson</artifactId>
37 | 			<version>1.2.49</version>
38 | 		</dependency>
39 | 		<dependency>
40 | 			<groupId>org.mongodb</groupId>
41 | 			<artifactId>mongo-java-driver</artifactId>
42 | 			<version>3.8.1</version>
43 | 		</dependency>
44 | 		<dependency>
45 | 			<groupId>org.elasticsearch.client</groupId>
46 | 			<artifactId>transport</artifactId>
47 | 			<version>6.4.0</version>
48 | 		</dependency>
49 | 		<dependency>
50 | 			<groupId>mysql</groupId>
51 | 			<artifactId>mysql-connector-java</artifactId>
52 | 			<version>6.0.6</version>
53 | 		</dependency>
54 | 	</dependencies>
55 | 	<build>
56 | 		<sourceDirectory>src</sourceDirectory>
57 | 		<plugins>
58 | 			<plugin>
59 | 				<artifactId>maven-compiler-plugin</artifactId>
60 | 				<version>3.7.0</version>
61 | 				<configuration>
62 | 					<source>1.8</source>
63 | 					<target>1.8</target>
64 | 				</configuration>
65 | 			</plugin>
66 | 		</plugins>
67 | 	</build>
68 | </project>
69 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/ESConn.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider;
2 |
3 | import java.net.InetAddress;
4 | import java.net.UnknownHostException;
5 |
6 | import org.apache.logging.log4j.LogManager;
7 | import org.apache.logging.log4j.Logger;
8 | import org.elasticsearch.action.index.IndexRequestBuilder;
9 | import org.elasticsearch.action.bulk.BulkRequestBuilder;
10 | import org.elasticsearch.client.transport.TransportClient;
11 | import org.elasticsearch.common.settings.Settings;
12 | import org.elasticsearch.common.transport.TransportAddress;
13 | import org.elasticsearch.common.xcontent.XContentType;
14 | import org.elasticsearch.transport.client.PreBuiltTransportClient;
15 |
16 | import com.alibaba.fastjson.JSONObject;
17 |
18 | /**
19 | * Elasticsearch
20 | */
21 | public class ESConn {
22 |
23 | private static Logger logger = LogManager.getLogger(ESConn.class);
24 |
25 | private TransportClient client = null;
26 |
27 | public ESConn() {
28 | JSONObject config = ConfigUtil.getConfig().getJSONObject("es");
29 | String url = config.getString("url");
30 | int port = config.getIntValue("port");
31 |
32 | logger.info("--------- Elasticsearch configuration ------------");
33 | logger.info("Host|es.url: " + url);
34 | logger.info("Port|es.port: " + port);
35 | logger.info("---------------------------------------------");
36 |
37 | try {
38 | client = new PreBuiltTransportClient(Settings.EMPTY)
39 | .addTransportAddress(new TransportAddress(InetAddress.getByName(url), port));
40 | } catch (UnknownHostException e) {
41 | throw new RuntimeException(e);
42 | }
43 | }
44 |
45 |
46 | public TransportClient getClient() {
47 | return client;
48 | }
49 |
50 | public IndexRequestBuilder getindexBuilder(String indexName) {
51 | return client.prepareIndex(indexName, "main");
52 | }
53 |
54 | /**
55 |  * Indexes a single JSON document.
56 |  */
57 | public void indexData(String indexName, String json) {
58 | client.prepareIndex(indexName, "main").setSource(json, XContentType.JSON).execute().actionGet();
59 | }
60 |
61 | /**
62 |  * Indexes multiple JSON documents in a single bulk request.
63 |  */
64 | public void indexData(String indexName, String[] json) {
65 | BulkRequestBuilder bulk = client.prepareBulk();
66 | for (int i = 0; i < json.length; i++) {
67 | bulk.add(client.prepareIndex(indexName, "main").setSource(json[i], XContentType.JSON));
68 | }
69 | bulk.execute().actionGet();
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/mongo/UpdateHash.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver.mongo;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.logging.log4j.LogManager;
6 | import org.apache.logging.log4j.Logger;
7 | import org.youseed.spider.MongoConn;
8 | import org.youseed.spider.RabbitMQConn;
9 | import org.youseed.spider.saver.MongoBasic;
10 |
11 | import com.alibaba.fastjson.JSON;
12 | import com.alibaba.fastjson.JSONArray;
13 | import com.rabbitmq.client.AMQP;
14 | import com.rabbitmq.client.Channel;
15 | import com.rabbitmq.client.DefaultConsumer;
16 | import com.rabbitmq.client.Envelope;
17 |
18 | /**
19 |  * Updates hashes pulled from the message queue.
20 | */
21 | public class UpdateHash extends MongoBasic {
22 |
23 | /**
24 |  * Consumer tag.
25 | */
26 | static final String CONSUME_TAG = "mongo-update-consumer";
27 |
28 | private static Logger logger = LogManager.getLogger(UpdateHash.class);
29 |
30 | RabbitMQConn mq = new RabbitMQConn();
31 |
32 | MongoConn mongo = new MongoConn();
33 |
34 | public UpdateHash() {
35 | super();
36 | }
37 |
38 | /**
39 |  * Consume and update hashes.
40 | */
41 | public void consume() throws IOException {
42 |
43 | //Listen for messages
44 | Channel channel = mq.getChannel();
45 | channel.queueDeclare(mqMongoUpdateQueue, true, false, false, null);
46 | channel.queueBind(mqMongoUpdateQueue, mqMongoExchange, mqMongoUpdateRouting);
47 | channel.basicConsume(mqMongoUpdateQueue, false, CONSUME_TAG, new DefaultConsumer(channel) {
48 |
49 | @Override
50 | public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties,
51 | byte[] body) {
52 |
53 | long deliveryTag = envelope.getDeliveryTag();
54 |
55 | //1. Parse the message
56 | JSONArray data = null;
57 | try {
58 | data = JSON.parseArray(new String(body, "UTF-8"));
59 | } catch (Exception e) {
60 | logger.error("JSON parse failed; acking and skipping message: " + e.getMessage(), e);
61 | confirmMsg(channel, deliveryTag);
62 | return;
63 | }
64 |
65 | //2. Update data
66 | //2.1 Validate the payload
67 | int size = data.size();
68 | if(size == 0) {
69 | logger.info("Empty payload, skipping");
70 | confirmMsg(channel, deliveryTag);
71 | return;
72 | }
73 |
74 | logger.info("Pending updates: " + size);
75 |
76 | //2.2 Apply the updates
77 | try {
78 | int cnt = bulkUpdate(mongo, data);
79 | logger.info("Actually updated: " + cnt);
80 | }catch(Exception e) {
81 | logger.error("Update failed: " + e.getMessage(), e);
82 | testMongoConn(mongo, channel, deliveryTag);
83 | return;
84 | }
85 |
86 | //3. Ack the message
87 | confirmMsg(channel, deliveryTag);
88 | }
89 | });
90 | }
91 |
92 | //test
93 | public static void main(String[] args) throws IOException {
94 | UpdateHash main = new UpdateHash();
95 | main.consume();
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/es/UpdateHash.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver.es;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.logging.log4j.LogManager;
6 | import org.apache.logging.log4j.Logger;
7 | import org.elasticsearch.client.transport.TransportClient;
8 | import org.youseed.spider.ESConn;
9 | import org.youseed.spider.RabbitMQConn;
10 | import org.youseed.spider.saver.ESBasic;
11 |
12 | import com.alibaba.fastjson.JSON;
13 | import com.alibaba.fastjson.JSONArray;
14 | import com.rabbitmq.client.AMQP;
15 | import com.rabbitmq.client.Channel;
16 | import com.rabbitmq.client.DefaultConsumer;
17 | import com.rabbitmq.client.Envelope;
18 |
19 | /**
20 |  * Updates hash popularity (stays resident, listening continuously for updates).
21 | */
22 | public class UpdateHash extends ESBasic {
23 |
24 | /**
25 |  * Consumer tag.
26 | */
27 | static final String CONSUME_TAG = "es-update-consumer";
28 |
29 | private static Logger logger = LogManager.getLogger(UpdateHash.class);
30 |
31 | RabbitMQConn mq = new RabbitMQConn();
32 | ESConn es = new ESConn();
33 |
34 | /**
35 |  * Constructor.
36 | */
37 | public UpdateHash() {
38 | super();
39 | logger.info("starting update hash consumer...");
40 | }
41 |
42 | /**
43 |  * Consume and update hashes.
44 | */
45 | public void consume() throws IOException {
46 |
47 | TransportClient client = es.getClient();
48 | Channel channel = mq.getChannel();
49 |
50 | channel.queueDeclare(mqEsUpdateQueue, true, false, false, null);
51 | channel.queueBind(mqEsUpdateQueue, mqEsExchange, mqEsUpdateRouting);
52 | channel.basicConsume(mqEsUpdateQueue, false, CONSUME_TAG, new DefaultConsumer(channel) {
53 |
54 | @Override
55 | public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) {
56 |
57 | logger.info("---------------------");
58 | long deliveryTag = envelope.getDeliveryTag();
59 |
60 | //1. Parse the message
61 | JSONArray data = null;
62 | try {
63 | data = JSON.parseArray(new String(body, "UTF-8"));
64 | logger.info("Received data, entries: " + data.size());
65 | } catch (Exception e) {
66 | logger.error("JSON parse failed; acking and skipping message: " + e.getMessage(), e);
67 | confirmMsg(channel, deliveryTag);
68 | return;
69 | }
70 |
71 | //2. Apply the updates
72 | try {
73 | int cnt = batchUpdateHash(client, data);
74 | logger.info("Updated entries: " + cnt);
75 | }catch(Exception e) {
76 | logger.error("Update error: " + e.getMessage(), e);
77 | testEsConn(client, channel, deliveryTag);
78 | return;
79 | }
80 |
81 | //3. Ack the message
82 | confirmMsg(channel, deliveryTag);
83 | }
84 | });
85 | }
86 |
87 |
88 | /**
89 |  * Test entry point.
90 | */
91 | public static void main(String[] args) throws IOException {
92 | UpdateHash main = new UpdateHash();
93 | main.consume();
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/mongo/SaveStat.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver.mongo;
2 |
3 | import java.io.IOException;
4 | import java.text.SimpleDateFormat;
5 |
6 | import org.apache.logging.log4j.LogManager;
7 | import org.apache.logging.log4j.Logger;
8 | import org.bson.Document;
9 | import org.youseed.spider.MongoConn;
10 | import org.youseed.spider.RabbitMQConn;
11 | import org.youseed.spider.SpiderConfig;
12 | import org.youseed.spider.saver.MongoBasic;
13 |
14 | import com.alibaba.fastjson.JSON;
15 | import com.alibaba.fastjson.JSONObject;
16 | import com.rabbitmq.client.AMQP;
17 | import com.rabbitmq.client.Channel;
18 | import com.rabbitmq.client.DefaultConsumer;
19 | import com.rabbitmq.client.Envelope;
20 |
21 | /**
22 |  * Saves spider statistics.
23 | */
24 | public class SaveStat extends MongoBasic {
25 |
26 | /**
27 |  * Consumer tag.
28 | */
29 | static final String CONSUME_TAG = "mongo-stat-consumer";
30 |
31 | private static Logger logger = LogManager.getLogger(SaveStat.class);
32 |
33 | RabbitMQConn mq = new RabbitMQConn();
34 |
35 | MongoConn mongo = new MongoConn();
36 |
37 | public SaveStat() {
38 | super();
39 | }
40 |
41 | /**
42 |  * Consume and save statistics.
43 | */
44 | public void consume() throws IOException {
45 |
46 | //Listen for messages
47 | Channel channel = mq.getChannel();
48 | channel.queueDeclare(mqMongoStatQueue, true, false, false, null);
49 | channel.queueBind(mqMongoStatQueue, mqMongoExchange, mqMongoStatRouting);
50 | channel.basicConsume(mqMongoStatQueue, false, CONSUME_TAG, new DefaultConsumer(channel) {
51 |
52 | @Override
53 | public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties,
54 | byte[] body) {
55 |
56 | long deliveryTag = envelope.getDeliveryTag();
57 |
58 | //1. Parse the message
59 | JSONObject data = null;
60 | try {
61 | data = JSON.parseObject(new String(body, "UTF-8"));
62 |
63 | String dateStr = data.getString("date");
64 | data.put("date", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(dateStr));
65 | logger.info("Received stats from spider " + data.getString("spider") + "|" + dateStr);
66 | } catch (Exception e) {
67 | logger.error("JSON parse failed; acking and skipping message: " + e.getMessage(), e);
68 | confirmMsg(channel, deliveryTag);
69 | return;
70 | }
71 |
72 | //2. Save the data
73 | try {
74 | mongo.save(SpiderConfig.COLL_STATE, new Document(data));
75 | } catch (Exception e) {
76 | logger.error("Save failed: " + e.getMessage(), e);
77 | testMongoConn(mongo, channel, deliveryTag);
78 | return;
79 | }
80 |
81 | //3. Ack the message
82 | confirmMsg(channel, deliveryTag);
83 | }
84 |
85 | });
86 | }
87 |
88 | //test
89 | public static void main(String[] args) throws IOException {
90 | SaveStat main = new SaveStat();
91 | main.consume();
92 | }
93 |
94 | }
95 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Youseed magnet spider storage service #
2 |
3 | This program is written in Java. It takes data from the RabbitMQ message queue and saves it to a database or a search engine.
4 |
5 | 
6 |
7 | Note: this program is the "save magnet data" box on the right side of the diagram above.
8 |
9 | - DHT spider: [https://github.com/dht-open/youseed-spider-public](https://github.com/DHT-open/youseed-spider-public)
10 | - Spider data storage: [https://github.com/DHT-open/youseed-spider-saver-public](https://github.com/DHT-open/youseed-spider-saver-public)
11 |
12 | *This program is intended for technical study and research only*
13 |
14 | # Features #
15 |
16 | Reads the message queue and saves the data collected by the spider to:
17 |
18 | - the Youseed MongoDB database;
19 | - the Youseed Elasticsearch search engine;
20 | - the "zsky" (纸上烤鱼) magnet search engine database
21 |
22 |
23 | **Note**: this program only handles saving data; it must be used together with the "dht_spider.py" or "dht_spider_zsky.py" spider.
24 |
25 | # Highlights #
26 |
27 | 1. Compatibility: supports MongoDB, MySQL, and the Elasticsearch search engine;
28 | 2. Real-time and scheduled writes: new resources are indexed into Elasticsearch in real time, while existing resources can be updated on a schedule;
29 | 3. Spam detection: resources are checked against the keywords in `spam.txt` and flagged accordingly
30 |
31 |
32 | # Hardware requirements #
33 |
34 | - Memory: about 200 MB
35 |
36 | # Software requirements #
37 |
38 | The following software must be installed:
39 |
40 | - a JDK runtime
41 |
42 | # Installation (CentOS 7 example) #
43 |
44 | ## Install the JDK ##
45 |
46 | yum install java-1.8.0-openjdk.x86_64
47 |
48 | ## Download the program ##
49 |
50 | Download the compiled jar `spider-saver-public-1.0.0.jar` and the configuration file `youseed-spider-saver.yml`.
51 |
52 | ## Edit the configuration ##
53 |
54 | Edit `youseed-spider-saver.yml` and adjust the connection settings:
55 |
56 | #MongoDB connection
57 | mongo:
58 | url: 127.0.0.1
59 | port: 27017
60 | db: seed
61 | admindb:
62 | user:
63 | psw:
64 |
65 | #ES search engine connection
66 | es:
67 | url: 127.0.0.1
68 | port: 9300
69 |
70 | #MySQL connection (for zsky) <------------------ zsky installations edit this connection
71 | mysql:
72 | url: jdbc:mysql://localhost:3306/zsky?useUnicode=true&characterEncoding=utf-8&serverTimezone=GMT%2B8
73 | user: root
74 | psw:
75 |
76 |
77 | # Running #
78 |
79 | Download the compiled program here:
80 | [https://github.com/DHT-open/youseed-spider-saver-public/releases](https://github.com/DHT-open/youseed-spider-saver-public/releases "https://github.com/DHT-open/youseed-spider-saver-public/releases")
81 |
82 | ## Running in the console ##
83 | Start the storage service in the console with the following command (note: the value of --config must be an absolute path):
84 |
85 | java -jar -Xms50m -Xmx128m /opt/spider/app/youseed-spider-saver-public-1.0.0.jar --config=/opt/spider/app/youseed-spider-saver.yml
86 |
87 | The program prints the available commands:
88 |
89 | m: write/update MongoDB
90 | m1: |-------write new resources to Mongo
91 | m2: |-------update Mongo
92 | m3: |-------write statistics to Mongo
93 | es: write/update ES (real-time or scheduled, chosen automatically from the esUpdateTime setting)
94 | es1: |-------write new resources to ES
95 | es2: |-------update ES (stays resident and updates in real time)
96 | es3: |-------update ES (exits after finishing the current batch)
97 | zsky: write/update zsky (纸上烤鱼)
98 | zsky1: |-------write new resources to MySQL
99 | zsky2: |-------update MySQL
100 | zsky3: |-------write statistics to MySQL
101 |
102 | Select an operation (type an option and press Enter):
103 |
104 | Then type `zsky` to save into the zsky database.
105 |
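106 | Tip: to confirm the consumers are draining the queues, `rabbitmqctl list_queues name messages` on the RabbitMQ host lists each queue with its pending message count (standard RabbitMQ tooling; the configuration above uses the default vhost "/").
107 |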
108 | ## Running in the background ##
109 | Use the following command to start the zsky storage service in the background:
110 |
111 | nohup java -jar -Xms50m -Xmx128m /opt/spider/app/youseed-spider-saver-public-1.0.0.jar --config=/opt/spider/app/youseed-spider-saver.yml zsky > /opt/spider/logs/spider-saver-mongo.log 2>&1 &
112 |
113 |
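114 | Alternatively, a service manager can supervise the process. The unit file below is a minimal sketch for systemd; the unit name, the java binary path, and the install paths are assumptions to adapt to your own setup:
115 |
116 | [Unit]
117 | Description=youseed spider saver (zsky)
118 | After=network.target
119 |
120 | [Service]
121 | ExecStart=/usr/bin/java -Xms50m -Xmx128m -jar /opt/spider/app/youseed-spider-saver-public-1.0.0.jar --config=/opt/spider/app/youseed-spider-saver.yml zsky
122 | Restart=on-failure
123 |
124 | [Install]
125 | WantedBy=multi-user.target
126 |
127 | Saved as e.g. /etc/systemd/system/spider-saver.service, it can then be enabled with `systemctl enable --now spider-saver`.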
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/zsky/SaveStat.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver.zsky;
2 |
3 | import java.io.IOException;
4 | import java.text.SimpleDateFormat;
5 |
6 | import org.apache.logging.log4j.LogManager;
7 | import org.apache.logging.log4j.Logger;
8 | import org.rex.DB;
9 | import org.rex.db.Ps;
10 | import org.youseed.spider.MysqlConn;
11 | import org.youseed.spider.RabbitMQConn;
12 | import org.youseed.spider.saver.MysqlBasic;
13 |
14 | import com.alibaba.fastjson.JSON;
15 | import com.alibaba.fastjson.JSONObject;
16 | import com.rabbitmq.client.AMQP;
17 | import com.rabbitmq.client.Channel;
18 | import com.rabbitmq.client.DefaultConsumer;
19 | import com.rabbitmq.client.Envelope;
20 |
21 | /**
22 |  * Saves spider statistics.
23 | */
24 | public class SaveStat extends MysqlBasic {
25 |
26 | /**
27 |  * Consumer tag.
28 | */
29 | static final String CONSUME_TAG = "mysql-zsky-stat-consumer";
30 |
31 | private static Logger logger = LogManager.getLogger(SaveStat.class);
32 |
33 | RabbitMQConn mq = new RabbitMQConn();
34 |
35 | // SQL for inserting a status report row
36 | static final String INSERT_REPORT = "INSERT INTO search_statusreport(date,new_hashes,total_requests, valid_requests) VALUES(?,?,?,?)";
37 |
38 |
39 | public SaveStat() {
40 | super();
41 | }
42 |
43 | /**
44 |  * Consume and save statistics.
45 | */
46 | public void consume() throws IOException {
47 | //Initialize the database connection, then listen for messages
48 | MysqlConn.initDB();
49 | Channel channel = mq.getChannel();
50 | channel.queueDeclare(mqMysqlStatQueue, true, false, false, null);
51 | channel.queueBind(mqMysqlStatQueue, mqMysqlExchange, mqMysqlStatRouting);
52 | channel.basicConsume(mqMysqlStatQueue, false, CONSUME_TAG, new DefaultConsumer(channel) {
53 |
54 | @Override
55 | public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties,
56 | byte[] body) {
57 |
58 | long deliveryTag = envelope.getDeliveryTag();
59 |
60 | //1. Parse the message
61 | JSONObject data = null;
62 | try {
63 | data = JSON.parseObject(new String(body, "UTF-8"));
64 |
65 | String dateStr = data.getString("date");
66 | data.put("date", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(dateStr));
67 | logger.info("Spider " + data.getString("spider") + "|" + dateStr);
68 | } catch (Exception e) {
69 | logger.error("JSON parse failed; acking and skipping message: " + e.getMessage(), e);
70 | confirmMsg(channel, deliveryTag);
71 | return;
72 | }
73 |
74 | //2. Save the data
75 | try {
76 | int newHash = data.getIntValue("num_new");
77 | int total = data.containsKey("total") ? data.getIntValue("total") : 0;
78 | int valid = data.getIntValue("num_new") + data.getIntValue("num_stored");
79 |
80 | DB.update(INSERT_REPORT, new Ps(data.get("date"), newHash, total, valid));
81 |
82 | } catch (Exception e) {
83 | logger.error("Save failed: " + e.getMessage(), e);
84 | testMysqlConn(channel, deliveryTag);
85 | return;
86 | }
87 |
88 | //3. Ack the message
89 | confirmMsg(channel, deliveryTag);
90 | }
91 |
92 | });
93 | }
94 |
95 | //test
96 | public static void main(String[] args) throws IOException {
97 | SaveStat main = new SaveStat();
98 | main.consume();
99 | }
100 |
101 | }
102 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/zsky/UpdateHash.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver.zsky;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.List;
6 |
7 | import org.apache.logging.log4j.LogManager;
8 | import org.apache.logging.log4j.Logger;
9 | import org.rex.DB;
10 | import org.rex.db.Ps;
11 | import org.youseed.spider.MysqlConn;
12 | import org.youseed.spider.RabbitMQConn;
13 | import org.youseed.spider.saver.MysqlBasic;
14 |
15 | import com.alibaba.fastjson.JSON;
16 | import com.alibaba.fastjson.JSONArray;
17 | import com.rabbitmq.client.AMQP;
18 | import com.rabbitmq.client.Channel;
19 | import com.rabbitmq.client.DefaultConsumer;
20 | import com.rabbitmq.client.Envelope;
21 |
22 | /**
23 |  * Updates hashes pulled from the message queue.
24 | */
25 | public class UpdateHash extends MysqlBasic {
26 |
27 | /**
28 |  * Consumer tag.
29 | */
30 | static final String CONSUME_TAG = "mysql-zsky-update-consumer";
31 |
32 | private static Logger logger = LogManager.getLogger(UpdateHash.class);
33 |
34 | RabbitMQConn mq = new RabbitMQConn();
35 |
36 | public UpdateHash() {
37 | super();
38 | }
39 |
40 | final static String UPDATE_HASH = "UPDATE search_hash SET last_seen=now(),requests=requests+1 WHERE info_hash like ?";
41 |
42 | /**
43 |  * Consume and update hashes.
44 | */
45 | public void consume() throws IOException {
46 |
47 | //Initialize the database connection
48 | MysqlConn.initDB();
49 |
50 | //Listen for messages
51 | Channel channel = mq.getChannel();
52 | channel.queueDeclare(mqMysqlUpdateQueue, true, false, false, null);
53 | channel.queueBind(mqMysqlUpdateQueue, mqMysqlExchange, mqMysqlUpdateRouting);
54 | channel.basicConsume(mqMysqlUpdateQueue, false, CONSUME_TAG, new DefaultConsumer(channel) {
55 |
56 | @Override
57 | public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties,
58 | byte[] body) {
59 |
60 | long deliveryTag = envelope.getDeliveryTag();
61 |
62 | //1. Parse the message
63 | JSONArray data = null;
64 | try {
65 | data = JSON.parseArray(new String(body, "UTF-8"));
66 | } catch (Exception e) {
67 | logger.error("JSON parse failed; acking and skipping message: " + e.getMessage(), e);
68 | confirmMsg(channel, deliveryTag);
69 | return;
70 | }
71 |
72 | //2. Update data
73 | //2.1 Validate the payload
74 | int size = data.size();
75 | if(size == 0) {
76 | logger.info("Empty payload, skipping");
77 | confirmMsg(channel, deliveryTag);
78 | return;
79 | }
80 |
81 | logger.info("Pending updates: " + size);
82 |
83 | //2.2 Apply the updates
84 | try {
85 | List<Ps> pss = new ArrayList<>();
86 | for (int i = 0; i < data.size(); i++) {
87 | pss.add(new Ps().add(data.get(i) + "%"));
88 | }
89 |
90 | int[] cnts = DB.batchUpdate(UPDATE_HASH, pss);
91 |
92 | int n = 0;
93 | for (int i = 0; i < cnts.length; i++) {
94 | n += cnts[i];
95 | }
96 | logger.info("Actually updated: " + n);
97 | }catch(Exception e) {
98 | logger.error("Update failed: " + e.getMessage(), e);
99 | testMysqlConn(channel, deliveryTag);
100 | return;
101 | }
102 |
103 | //3. Ack the message
104 | confirmMsg(channel, deliveryTag);
105 | }
106 | });
107 | }
108 |
109 | //test
110 | public static void main(String[] args) throws IOException {
111 | UpdateHash main = new UpdateHash();
112 | main.consume();
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/es/UpdateHashOnTime.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver.es;
2 |
3 | import java.io.IOException;
4 | import java.util.concurrent.TimeoutException;
5 |
6 | import org.apache.logging.log4j.LogManager;
7 | import org.apache.logging.log4j.Logger;
8 | import org.elasticsearch.client.transport.TransportClient;
9 | import org.youseed.spider.ESConn;
10 | import org.youseed.spider.RabbitMQConn;
11 | import org.youseed.spider.saver.ESBasic;
12 |
13 | import com.alibaba.fastjson.JSON;
14 | import com.alibaba.fastjson.JSONArray;
15 | import com.rabbitmq.client.AMQP;
16 | import com.rabbitmq.client.Channel;
17 | import com.rabbitmq.client.DefaultConsumer;
18 | import com.rabbitmq.client.Envelope;
19 |
20 | /**
21 |  * Updates hash popularity (for scheduled runs, e.g. once a day; exits when the batch is done).
22 | */
23 | public class UpdateHashOnTime extends ESBasic {
24 |
25 | /**
26 |  * Consumer tag.
27 | */
28 | static final String CONSUME_TAG = "es-update-consumer";
29 |
30 | private static Logger logger = LogManager.getLogger(UpdateHashOnTime.class);
31 |
32 | RabbitMQConn mq = new RabbitMQConn();
33 | ESConn es = new ESConn();
34 |
35 | /**
36 |  * Constructor.
37 | */
38 | public UpdateHashOnTime() {
39 | super();
40 | logger.info("starting update hash(on time) consumer...");
41 | }
42 |
43 | /**
44 |  * Consume and update hashes.
45 | */
46 | public void consume() throws IOException {
47 | TransportClient client = es.getClient();
48 | Channel channel = mq.getChannel();
49 |
50 | channel.queueDeclare(mqEsUpdateQueue, true, false, false, null);
51 | channel.queueBind(mqEsUpdateQueue, mqEsExchange, mqEsUpdateRouting);
52 |
53 | final long count = channel.messageCount(mqEsUpdateQueue);
54 | logger.info("Hash batches pending update: " + count);
55 |
56 | channel.basicConsume(mqEsUpdateQueue, false, CONSUME_TAG, new DefaultConsumer(channel) {
57 |
58 | //Batches left to process
59 | long batch = count;
60 |
61 | @Override
62 | public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) {
63 |
64 | //Check whether this run's batches have all been processed
65 | if(batch <= 0) {
66 | try {
67 | channel.close();
68 | logger.info("All batches updated; exiting.");
69 | return;
70 | } catch (IOException | TimeoutException e) {
71 | logger.error("Failed to close channel: " + e.getMessage(), e);
72 | }
73 | }else {
74 | logger.info("Batches remaining: " + batch);
75 | batch--;
76 | }
77 |
78 | logger.info("---------------------");
79 | long deliveryTag = envelope.getDeliveryTag();
80 |
81 | //1. Parse the message
82 | JSONArray data = null;
83 | try {
84 | data = JSON.parseArray(new String(body, "UTF-8"));
85 | logger.info("Received data, entries: " + data.size());
86 | } catch (Exception e) {
87 | logger.error("JSON parse failed; acking and skipping message: " + e.getMessage(), e);
88 | confirmMsg(channel, deliveryTag);
89 | return;
90 | }
91 |
92 | //2. Apply the updates
93 | try {
94 | int cnt = batchUpdateHash(client, data);
95 | logger.info("Updated entries: " + cnt);
96 | }catch(Exception e) {
97 | logger.error("Update error: " + e.getMessage(), e);
98 | testEsConn(client, channel, deliveryTag);
99 | return;
100 | }
101 |
102 | //3. Ack the message
103 | confirmMsg(channel, deliveryTag);
104 | }
105 | });
106 | }
107 |
108 |
109 | /**
110 |  * Test entry point.
111 | */
112 | public static void main(String[] args) throws IOException {
113 | UpdateHashOnTime main = new UpdateHashOnTime();
114 | main.consume();
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/ConfigUtil.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider;
2 |
3 | import java.io.ByteArrayOutputStream;
4 | import java.io.FileNotFoundException;
5 | import java.io.FileReader;
6 | import java.io.IOException;
7 | import java.io.InputStream;
8 | import java.util.Map;
9 |
10 | import org.apache.logging.log4j.LogManager;
11 | import org.apache.logging.log4j.Logger;
12 | import org.yaml.snakeyaml.Yaml;
13 |
14 | import com.alibaba.fastjson.JSONObject;
15 |
16 | /**
17 |  * Reads the configuration file.
18 | */
19 | public class ConfigUtil {
20 |
21 | /**
22 |  * Default configuration file name (loaded from the classpath).
23 | */
24 | static final String DEFAULT_CONF = "config.yml";
25 |
26 | static String conf = null;
27 |
28 | private static Logger logger = LogManager.getLogger(ConfigUtil.class);
29 |
30 | //------------------------------- Flattened (properties-style) access
31 | /**
32 |  * Sets the configuration file path.
33 | */
34 | public static void setConfPath(String path) {
35 | conf = path;
36 | logger.info("Using configuration file: " + path);
37 | }
38 |
39 | /**
40 |  * Returns a flattened view of the configuration for easy key lookup.
41 | */
42 | public static JSONObject getProperties() {
43 | JSONObject prop = new JSONObject();
44 | JSONObject config = getConfig();
45 | iterMaps(prop, null, config);
46 | return prop;
47 | }
48 |
49 | private static void iterMaps(JSONObject prop, String key, JSONObject val) {
50 | for (Map.Entry<String, Object> entry : val.entrySet()) {
51 |
52 | String k = entry.getKey();
53 | Object v = entry.getValue();
54 |
55 | String flatKey = key == null ? k : key + "." + k;
56 | if(v instanceof Map) {
57 | iterMaps(prop, flatKey, new JSONObject((Map)v));
58 | }else {
59 | prop.put(flatKey, v);
60 | }
61 | }
62 | }
63 |
64 |
65 | //------------------------------- JSON access
66 | /**
67 |  * Loads the configuration as a FastJSON object.
68 | */
69 | public static JSONObject getConfig() {
70 | Yaml yaml = new Yaml();
71 |
72 | JSONObject config = null;
73 | if(conf == null) {
74 | logger.info("Loading default configuration: " + DEFAULT_CONF);
75 | config = yaml.loadAs(Thread.currentThread().getContextClassLoader().getResourceAsStream(DEFAULT_CONF),
76 | JSONObject.class);
77 | }else {
78 | try {
79 | config = yaml.loadAs(new FileReader(conf), JSONObject.class);
80 | logger.info("Loaded configuration: " + conf);
81 | } catch (FileNotFoundException e) {
82 |
83 | logger.error("Failed to load the configuration file: " + e.getMessage());
84 | logger.info("Loading default configuration: " + DEFAULT_CONF);
85 | config = yaml.loadAs(Thread.currentThread().getContextClassLoader().getResourceAsStream(DEFAULT_CONF),
86 | JSONObject.class);
87 | }
88 | }
89 |
90 | return config;
91 | }
92 |
93 |
94 | //------------------------------- Print the banner
95 | /**
96 |  * Prints the banner file named in the configuration ("banner" key).
97 | */
98 | public static void printBanner() {
99 | printBanner(getConfig().getString("banner"));
100 | }
101 |
102 | /**
103 |  * Prints the contents of a text file from the classpath.
104 | */
105 | public static void printBanner(String path) {
106 | InputStream is = ConfigUtil.class.getClassLoader().getResourceAsStream(path);
107 | byte[] txt;
108 | try {
109 | txt = readStream(is);
110 | System.out.println(new String(txt));
111 | } catch (Exception e) {
112 | e.printStackTrace();
113 | }finally {
114 | try {
115 | is.close();
116 | } catch (IOException e) { // ignore failures while closing
117 | }
118 | }
119 | }
120 |
121 | /**
122 |  * Reads an input stream fully into a byte array.
123 | */
124 | private static byte[] readStream(InputStream inStream) throws Exception {
125 | ByteArrayOutputStream outSteam = new ByteArrayOutputStream();
126 | byte[] buffer = new byte[1024];
127 | int len = -1;
128 | while ((len = inStream.read(buffer)) != -1) {
129 | outSteam.write(buffer, 0, len);
130 | }
131 | outSteam.close();
132 | inStream.close();
133 | return outSteam.toByteArray();
134 | }
135 | }
136 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/zsky/NewHash.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver.zsky;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.logging.log4j.LogManager;
6 | import org.apache.logging.log4j.Logger;
7 | import org.rex.DB;
8 | import org.rex.db.Ps;
9 | import org.rex.db.exception.DBException;
10 | import org.youseed.spider.MysqlConn;
11 | import org.youseed.spider.RabbitMQConn;
12 | import org.youseed.spider.saver.MysqlBasic;
13 |
14 | import com.alibaba.fastjson.JSON;
15 | import com.alibaba.fastjson.JSONArray;
16 | import com.alibaba.fastjson.JSONObject;
17 | import com.rabbitmq.client.AMQP;
18 | import com.rabbitmq.client.Channel;
19 | import com.rabbitmq.client.DefaultConsumer;
20 | import com.rabbitmq.client.Envelope;
21 |
22 | import io.netty.util.internal.StringUtil;
23 |
24 | /**
25 |  * Saves new hashes from the message queue.
26 |  * XXX: no bulk writes; no risk of data loss, but performance is relatively poor.
27 | */
28 | public class NewHash extends MysqlBasic {
29 |
30 | /**
31 |  * Consumer tag.
32 | */
33 | static final String CONSUME_TAG = "mysql-zsky-new-consumer";
34 |
35 | private static Logger logger = LogManager.getLogger(NewHash.class);
36 |
37 | RabbitMQConn mq = new RabbitMQConn();
38 |
39 | public NewHash() {
40 | super();
41 | }
42 |
43 | //New hash
44 | final static String INSERT_HASH = "INSERT IGNORE INTO search_hash(info_hash, category, data_hash, name, extension, source_ip, length, create_time, last_seen, requests) "
45 | + "VALUES (?,?,?,?,?,?,?,now(),now(),1)";
46 | //New filelist
47 | final static String INSERT_FILELIST = "INSERT IGNORE INTO search_filelist(info_hash, file_list) VALUES (?,?)";
48 |
49 | /**
50 |  * Consume and save new hashes.
51 | */
52 | public void consume() throws IOException {
53 |
54 | //Initialize the database connection
55 | MysqlConn.initDB();
56 |
57 | //Listen for messages
58 | Channel channel = mq.getChannel();
59 | channel.queueDeclare(mqMysqlNewQueue, true, false, false, null);
60 | channel.queueBind(mqMysqlNewQueue, mqMysqlExchange, mqMysqlNewRouting);
61 | channel.basicConsume(mqMysqlNewQueue, false, CONSUME_TAG, new DefaultConsumer(channel) {
62 |
63 | @Override
64 | public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties,
65 | byte[] body) {
66 |
67 | long deliveryTag = envelope.getDeliveryTag();
68 |
69 | //1. Parse the message
70 | JSONObject data = null;
71 | try {
72 | data = JSON.parseObject(new String(body, "UTF-8"));
73 | } catch (Exception e) {
74 | logger.error("JSON parse failed; acking and skipping message: " + e.getMessage(), e);
75 | confirmMsg(channel, deliveryTag);
76 | return;
77 | }
78 |
79 | //2. Validate the data
80 | //2.1 Format check
81 | String infoHash = data.getString("info_hash");
82 | String name = data.getString("name");
83 | if(StringUtil.isNullOrEmpty(infoHash) || StringUtil.isNullOrEmpty(name)) {
84 | logger.info("Malformed data, skipping message");
85 | confirmMsg(channel, deliveryTag);
86 | return;
87 | }
88 |
89 | logger.info("New resource: " + infoHash);
90 |
91 | //3. Save the data
92 | try {
93 | //3.1 The hash row
94 | DB.update(INSERT_HASH,new Ps(
95 | infoHash,
96 | data.getString("category"),
97 | data.getString("data_hash"),
98 | data.getString("name"),
99 | data.getString("extension") == null ? null : data.getString("extension").trim(),
100 | data.getString("source_ip"),
101 | data.getLong("length")));
102 |
103 | //3.2 The file list
104 | JSONArray filelist = data.getJSONArray("filelist");
105 | if (filelist != null) {
106 | DB.update(INSERT_FILELIST, new Ps(infoHash, filelist.toJSONString()));
107 | }
108 |
109 | } catch (Exception e) {
110 | logger.error("Save failed: " + e.getMessage(), e);
111 | testMysqlConn(channel, deliveryTag);
112 | return;
113 | }
114 |
115 | //4. Ack the message
116 | confirmMsg(channel, deliveryTag);
117 | }
118 |
119 | });
120 | }
121 |
122 | //test
123 | public static void main(String[] args) throws IOException, DBException {
124 | NewHash main = new NewHash();
125 | main.consume();
126 | }
127 | }
128 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/MysqlBasic.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver;
2 |
3 | import java.sql.SQLException;
4 |
5 | import org.apache.logging.log4j.LogManager;
6 | import org.apache.logging.log4j.Logger;
7 | import org.rex.DB;
8 | import org.rex.db.exception.DBException;
9 | import org.youseed.spider.ConfigUtil;
10 |
11 | import com.alibaba.fastjson.JSONObject;
12 | import com.rabbitmq.client.Channel;
13 |
14 | /**
15 | * Mysql for zsky
16 | */
17 | public class MysqlBasic extends MQBasic{
18 |
19 | private static Logger logger = LogManager.getLogger(MysqlBasic.class);
20 |
21 | protected String mqMysqlExchange = "store";
22 |
23 | protected String mqMysqlNewQueue = "store.new";
24 | protected String mqMysqlNewRouting = "*.new";
25 |
26 | protected String mqMysqlUpdateQueue = "store.update";
27 | protected String mqMysqlUpdateRouting = "*.update";
28 |
29 | protected String mqMysqlStatQueue = "store.stat";
30 | protected String mqMysqlStatRouting = "*.stat";
31 |
32 | protected String tableHash = "search_hash";
33 | protected String tableFilelist = "search_filelist";
34 | protected String tableStat = "search_statusreport";
35 |
36 | /**
37 |  * Constructor.
38 | */
39 | public MysqlBasic() {
40 | ConfigUtil.printBanner();
41 |
42 | JSONObject config = ConfigUtil.getProperties();
43 | mqMysqlExchange = config.containsKey("binding.mysql.exchage") ? config.getString("binding.mysql.exchage") : mqMysqlExchange;
44 | mqMysqlNewQueue = config.containsKey("binding.mysql.new.queue") ? config.getString("binding.mysql.new.queue") : mqMysqlNewQueue;
45 | mqMysqlNewRouting = config.containsKey("binding.mysql.new.routing") ? config.getString("binding.mysql.new.routing") : mqMysqlNewRouting;
46 | mqMysqlUpdateQueue = config.containsKey("binding.mysql.update.queue") ? config.getString("binding.mysql.update.queue") : mqMysqlUpdateQueue;
47 | mqMysqlUpdateRouting = config.containsKey("binding.mysql.update.routing") ? config.getString("binding.mysql.update.routing") : mqMysqlUpdateRouting;
48 | mqMysqlStatQueue = config.containsKey("binding.mysql.stat.queue") ? config.getString("binding.mysql.stat.queue") : mqMysqlStatQueue;
49 | mqMysqlStatRouting = config.containsKey("binding.mysql.stat.routing") ? config.getString("binding.mysql.stat.routing") : mqMysqlStatRouting;
50 |
51 | tableHash = config.containsKey("store.mysql.hash") ? config.getString("store.mysql.hash") : tableHash;
52 | tableFilelist = config.containsKey("store.mysql.filelist") ? config.getString("store.mysql.filelist") : tableFilelist;
53 | tableStat = config.containsKey("store.mysql.stat") ? config.getString("store.mysql.stat") : tableStat;
54 |
55 | logger.info("--------- RabbitMQ/MySQL-ZSKY binding configuration ------------");
56 | logger.info("Exchange|binding.mysql.exchage: " + mqMysqlExchange);
57 | logger.info("New-resource queue|binding.mysql.new.queue: " + mqMysqlNewQueue);
58 | logger.info("New-resource routing|binding.mysql.new.routing: " + mqMysqlNewRouting);
59 | logger.info("Update queue|binding.mysql.update.queue: " + mqMysqlUpdateQueue);
60 | logger.info("Update routing|binding.mysql.update.routing: " + mqMysqlUpdateRouting);
61 | logger.info("Stat queue|binding.mysql.stat.queue: " + mqMysqlStatQueue);
62 | logger.info("Stat routing|binding.mysql.stat.routing: " + mqMysqlStatRouting);
63 | logger.info("Hash table|store.mysql.hash: " + tableHash);
64 | logger.info("Filelist table|store.mysql.filelist: " + tableFilelist);
65 | logger.info("Stat table|store.mysql.stat: " + tableStat);
66 | logger.info("---------------------------------------------");
67 | }
68 |
69 | /**
70 |  * Tests whether the MySQL connection is alive and acks or requeues the message accordingly.
71 | */
72 | public void testMysqlConn(Channel channel, long deliveryTag) {
73 |
74 | boolean alive = false;
75 | try {
76 | alive = DB.getConnection().isValid(1000); // is the connection still alive?
77 | } catch (SQLException | DBException e) {
78 | logger.error("Failed to test the database connection: " + e.getMessage(), e);
79 | }
80 |
81 | if(alive) {
82 | logger.info("MySQL connection OK; acking the message");
83 | confirmMsg(channel, deliveryTag);
84 | } else {
85 | logger.info("MySQL connection down; returning the message to the queue");
86 | rejectMsg(channel, deliveryTag);
87 | try {
88 | logger.info("Sleeping for 60 seconds...");
89 | Thread.sleep(60000);
90 | return;
91 | } catch (InterruptedException e1) {
92 | }
93 | }
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/mongo/NewHash.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver.mongo;
2 |
3 | import java.io.IOException;
4 | import java.util.Calendar;
5 | import java.util.Date;
6 |
7 | import org.apache.logging.log4j.LogManager;
8 | import org.apache.logging.log4j.Logger;
9 | import org.bson.Document;
10 | import org.youseed.spider.MongoConn;
11 | import org.youseed.spider.RabbitMQConn;
12 | import org.youseed.spider.SpiderConfig;
13 | import org.youseed.spider.saver.MongoBasic;
14 | import org.youseed.spider.saver.SpamAnalyzer;
15 |
16 | import com.alibaba.fastjson.JSON;
17 | import com.alibaba.fastjson.JSONArray;
18 | import com.alibaba.fastjson.JSONObject;
19 | import com.rabbitmq.client.AMQP;
20 | import com.rabbitmq.client.Channel;
21 | import com.rabbitmq.client.DefaultConsumer;
22 | import com.rabbitmq.client.Envelope;
23 |
24 | import io.netty.util.internal.StringUtil;
25 |
26 | /**
27 |  * Saves new hashes from the message queue.
28 | */
29 | public class NewHash extends MongoBasic {
30 |
31 | /**
32 |  * Consumer tag.
33 | */
34 | static final String CONSUME_TAG = "mongo-new-consumer";
35 |
36 | private static Logger logger = LogManager.getLogger(NewHash.class);
37 |
38 | RabbitMQConn mq = new RabbitMQConn();
39 |
40 | MongoConn mongo = new MongoConn();
41 |
42 | public NewHash() {
43 | super();
44 | }
45 |
46 | /**
47 |  * Consume and save new hashes.
48 | */
49 | public void consume() throws IOException {
50 | //Listen for messages
51 | Channel channel = mq.getChannel();
52 | channel.queueDeclare(mqMongoNewQueue, true, false, false, null);
53 | channel.queueBind(mqMongoNewQueue, mqMongoExchange, mqMongoNewRouting);
54 | channel.basicConsume(mqMongoNewQueue, false, CONSUME_TAG, new DefaultConsumer(channel) {
55 |
56 | @Override
57 | public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties,
58 | byte[] body) {
59 |
60 | long deliveryTag = envelope.getDeliveryTag();
61 |
62 | //1. parse the message body
63 | JSONObject data = null;
64 | try {
65 | data = JSON.parseObject(new String(body, "UTF-8"));
66 | } catch (Exception e) {
67 | logger.error("JSON parsing failed; acking and skipping the message: " + e.getMessage(), e);
68 | confirmMsg(channel, deliveryTag);
69 | return;
70 | }
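   | // Illustrative message payload (hypothetical values; the field set is inferred
   | // from the reads below, not from a captured message):
   | //   {"info_hash":"<40-hex>","name":"example.mkv","category":"video",
   | //    "data_hash":"...","file_count":2,"filelist":[{"name":"a.mkv","length":1}],
   | //    "extension":".mkv","source_ip":"203.0.113.7","length":12345}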
71 |
72 | //2. save the data
73 | //2.1 validate the format
74 | String infoHash = data.getString("info_hash");
75 | String name = data.getString("name");
76 | if(StringUtil.isNullOrEmpty(infoHash) || StringUtil.isNullOrEmpty(name)) {
77 | logger.info("数据格式不正确,跳过消息");
78 | confirmMsg(channel, deliveryTag);
79 | return;
80 | }
81 |
82 | //2.2 the short hash
83 | String shortHash = infoHash.substring(0, 16);
84 | logger.info("New resource: " + shortHash);
85 |
86 | //2.3 process the file list
87 | int fileCount = 0;
88 | JSONArray filelist_5 = null;
89 |
90 | JSONArray filelist = data.getJSONArray("filelist");
91 | if (filelist != null) {
92 | fileCount = data.getIntValue("file_count");
93 | filelist_5 = filelist.size() > 5 ? new JSONArray(filelist.subList(0, 5)) : filelist;
94 | } else {
95 | fileCount = 0;
96 | filelist_5 = new JSONArray();
97 | }
98 |
99 | //2.4 prepare the files document
100 | Document file = null;
101 |
102 | if (filelist != null) {
103 | file = new Document();
104 | file.put("short_hash", shortHash);
105 | file.put("info_hash", infoHash);
106 | file.put("file_count", fileCount);
107 | file.put("file_list", filelist);
108 | }
109 |
110 | //2.5 prepare the hash document
111 | Document hash = new Document();
112 |
113 | Date now = Calendar.getInstance().getTime();
114 | hash.put("short_hash", shortHash);
115 | hash.put("info_hash", infoHash);
116 | hash.put("category", data.getString("category"));
117 | hash.put("data_hash", data.getString("data_hash"));
118 | hash.put("name", data.getString("name"));
119 | hash.put("file_count", fileCount);
120 | hash.put("filelist_5", filelist_5);
121 | hash.put("extension", data.getString("extension") == null ? null : data.getString("extension").trim());
122 | hash.put("source_ip", data.getString("source_ip"));
123 | hash.put("length", data.getLong("length"));
124 | hash.put("create_time", now);
125 | hash.put("last_seen", now);
126 | hash.put("requests", 1);
127 |
128 | boolean spam = SpamAnalyzer.isSpam(name, filelist_5);
129 | if(spam)
130 | hash.put("spam", spam); // flag the persisted document, not the discarded source JSON
131 |
132 |
133 | //3. persist the data
134 | try {
135 | mongo.save(SpiderConfig.COLL_HASH, hash);
136 | if (file != null)
137 | mongo.save(SpiderConfig.COLL_FILE, file);
138 | } catch (Exception e) {
139 | logger.error("保存失败:" + e.getMessage(), e);
140 | testMongoConn(mongo, channel, deliveryTag);
141 | return;
142 | }
143 |
144 | //4. ack the message
145 | confirmMsg(channel, deliveryTag);
146 | }
147 |
148 | });
149 | }
150 |
151 | //test
152 | public static void main(String[] args) throws IOException {
153 | NewHash main = new NewHash();
154 | main.consume();
155 | }
156 | }
157 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/es/NewHash.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver.es;
2 |
3 | import java.io.IOException;
4 | import java.util.Arrays;
5 | import java.util.Calendar;
6 | import java.util.List;
7 |
8 | import org.apache.logging.log4j.LogManager;
9 | import org.apache.logging.log4j.Logger;
10 | import org.elasticsearch.action.index.IndexResponse;
11 | import org.elasticsearch.client.transport.TransportClient;
12 | import org.youseed.spider.ConfigUtil;
13 | import org.youseed.spider.ESConn;
14 | import org.youseed.spider.RabbitMQConn;
15 | import org.youseed.spider.SpiderConfig;
16 | import org.youseed.spider.saver.ESBasic;
17 | import org.youseed.spider.saver.SpamAnalyzer;
18 |
19 | import com.alibaba.fastjson.JSON;
20 | import com.alibaba.fastjson.JSONArray;
21 | import com.alibaba.fastjson.JSONObject;
22 | import com.rabbitmq.client.AMQP;
23 | import com.rabbitmq.client.Channel;
24 | import com.rabbitmq.client.DefaultConsumer;
25 | import com.rabbitmq.client.Envelope;
26 |
27 | import io.netty.util.internal.StringUtil;
28 |
29 | /**
30 | * Writes new resources to ES
31 | */
32 | public class NewHash extends ESBasic {
33 |
34 | /**
35 | * Consumer tag
36 | */
37 | static final String CONSUME_TAG_PREFIX = "es-new-consumer";
38 |
39 | private static Logger logger = LogManager.getLogger(NewHash.class);
40 |
41 | RabbitMQConn mq = new RabbitMQConn();
42 |
43 | ESConn es = new ESConn();
44 |
45 | /**
46 | * Constructor
47 | */
48 | public NewHash() {
49 | super();
50 | logger.info("starting new hash consumer...");
51 | }
52 |
53 | /**
54 | * Consumes and processes hashes
55 | */
56 | public void consume() throws IOException {
57 |
58 | //only persist the configured categories
59 | String catsConf = ConfigUtil.getConfig().getString("includeCategories");
60 | final List<String> cats = StringUtil.isNullOrEmpty(catsConf) ? null : Arrays.asList(catsConf.split(","));
61 |
62 | //connect to MQ and start consuming
63 | TransportClient client = es.getClient();
64 | Channel channel = mq.getChannel();
65 |
66 | channel.queueDeclare(mqEsNewQueue, true, false, false, null);
67 | channel.queueBind(mqEsNewQueue, mqEsExchange, mqEsNewRouting);
68 | channel.basicConsume(mqEsNewQueue, false, CONSUME_TAG_PREFIX, new DefaultConsumer(channel) {
69 |
70 | @Override
71 | public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties,
72 | byte[] body) {
73 |
74 | long deliveryTag = envelope.getDeliveryTag();
75 |
76 | //1. parse the message body
77 | JSONObject data = null;
78 | try {
79 | data = JSON.parseObject(new String(body, "UTF-8"));
80 | } catch (Exception e) {
81 | logger.error("JSON parsing failed; acking and skipping the message: " + e.getMessage(), e);
82 | confirmMsg(channel, deliveryTag);
83 | return;
84 | }
85 |
86 | //2. analyze the data
87 | //2.1 validate the format
88 | String infoHash = data.getString("info_hash");
89 | String cat = data.getString("category");
90 | String name = data.getString("name");
91 | if(StringUtil.isNullOrEmpty(cat) || StringUtil.isNullOrEmpty(infoHash) || StringUtil.isNullOrEmpty(name)) {
92 | logger.info("数据格式不正确,跳过消息");
93 | confirmMsg(channel, deliveryTag);
94 | return;
95 | }
96 |
97 | //2.2 check the category
98 | if(cats != null) {
99 | //2.2.1 category outside the configured set: skip the message
100 | if(!cats.contains(cat)) {
101 | confirmMsg(channel, deliveryTag);
102 | logger.info("skip: " + cat);
103 | return;
104 | }
105 |
106 |
107 | //2.2.2 if exactly one category is configured, treat the file extension as the category
108 | if(cats.size() == 1) {
109 | String ext = data.getString("extension"); // may be absent: guard against the NPE in the original
110 | if(ext != null && ext.trim().length() > 1)
111 | data.put("category", ext.trim().substring(1)); // drop the leading dot, e.g. ".mp4" -> "mp4"
112 | data.remove("extension");
113 | }
114 |
115 | logger.info("store: " + data.get("category"));
116 | }
117 |
118 | //2.3 the short hash
119 | String shortHash = infoHash.substring(0, 16);
120 | logger.info("New resource: " + shortHash);
121 |
122 | //2.4 no file list: write empty values
123 | if(!data.containsKey("filelist_5")) {
124 | data.put("file_count", 0);
125 | data.put("filelist_5", new JSONArray());
126 | }
127 |
128 | //2.5 spam words (checks name and filelist_5)
129 | boolean spam = SpamAnalyzer.isSpam(name, data.getJSONArray("filelist_5"));
130 | if(spam) {
131 | data.put("spam", true);
132 | }
133 |
134 | //2.6 timestamps (epoch seconds)
135 | long now = Calendar.getInstance().getTimeInMillis()/1000;
136 | data.put("create_time", now);
137 | data.put("last_seen", now);
138 |
139 | //2.7 remove fields that are not indexed
140 | data.remove("data_hash");
141 | data.remove("extension");
142 | data.remove("source_ip");
143 |
144 | //3. write to the search engine
145 | try {
146 | IndexResponse resp = client.prepareIndex(SpiderConfig.ES_INDEX_SEED, SpiderConfig.ES_TYPE_SEED)
147 | .setId(shortHash).setSource(data).execute().actionGet();
148 |
149 | }catch(Exception e) {
150 | logger.error("写入Hash出错: " + e.getMessage(), e);
151 | testEsConn(client, channel, deliveryTag);
152 | return;
153 | }
154 |
155 | //4. ack the message
156 | confirmMsg(channel, deliveryTag);
157 | }
158 | });
159 | }
160 |
161 | //test
162 | public static void main(String[] args) throws IOException {
163 | NewHash main = new NewHash();
164 | main.consume();
165 | }
166 | }
167 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/ESBasic.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver;
2 |
3 | import java.util.Calendar;
4 | import java.util.HashMap;
5 | import java.util.Map;
6 |
7 | import org.apache.logging.log4j.LogManager;
8 | import org.apache.logging.log4j.Logger;
9 | import org.elasticsearch.action.admin.cluster.node.info.NodesInfoRequest;
10 | import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse;
11 | import org.elasticsearch.action.bulk.BulkItemResponse;
12 | import org.elasticsearch.action.bulk.BulkRequestBuilder;
13 | import org.elasticsearch.action.bulk.BulkResponse;
14 | import org.elasticsearch.action.update.UpdateRequest;
15 | import org.elasticsearch.client.transport.TransportClient;
16 | import org.elasticsearch.script.Script;
17 | import org.elasticsearch.script.ScriptType;
18 | import org.youseed.spider.ConfigUtil;
19 | import org.youseed.spider.SpiderConfig;
20 |
21 | import com.alibaba.fastjson.JSONArray;
22 | import com.alibaba.fastjson.JSONObject;
23 | import com.rabbitmq.client.Channel;
24 |
25 | /**
26 | * Common helpers for writing to ES
27 | */
28 | public class ESBasic extends MQBasic {
29 |
30 | private static Logger logger = LogManager.getLogger(ESBasic.class);
31 |
32 | protected String mqEsExchange = "es";
33 |
34 | protected String mqEsNewQueue = "es.new";
35 | protected String mqEsNewRouting = "*.new";
36 |
37 | protected String mqEsUpdateQueue = "es.update";
38 | protected String mqEsUpdateRouting = "*.update";
39 |
40 | protected String esIndex = "seed";
41 | protected String esType = "seed";
42 |
43 | /**
44 | * Constructor
45 | */
46 | public ESBasic() {
47 | JSONObject config = ConfigUtil.getProperties();
48 |
49 | mqEsExchange = config.containsKey("binding.es.exchage") ? config.getString("binding.es.exchage") : mqEsExchange;
50 | mqEsNewQueue = config.containsKey("binding.es.new.queue") ? config.getString("binding.es.new.queue") : mqEsNewQueue;
51 | mqEsNewRouting = config.containsKey("binding.es.new.routing") ? config.getString("binding.es.new.routing") : mqEsNewRouting;
52 | mqEsUpdateQueue = config.containsKey("binding.es.update.queue") ? config.getString("binding.es.update.queue") : mqEsUpdateQueue;
53 | mqEsUpdateRouting = config.containsKey("binding.es.update.routing") ? config.getString("binding.es.update.routing") : mqEsUpdateRouting;
54 | esIndex = config.containsKey("store.es.index") ? config.getString("store.es.index") : esIndex;
55 | esType = config.containsKey("store.es.type") ? config.getString("store.es.type") : esType;
56 |
57 | logger.info("---------RabbitMQ/Elasticsearch绑定配置------------");
58 | logger.info("交换器|binding.es.exchage: " + mqEsExchange);
59 | logger.info("新资源队列|binding.es.new.queue: " + mqEsNewQueue);
60 | logger.info("新资源路由|binding.es.new.routing: " + mqEsNewRouting);
61 | logger.info("更新资源队列|binding.es.update.queue : " + mqEsUpdateQueue);
62 | logger.info("更新资源路由|binding.es.update.routing: " + mqEsUpdateRouting);
63 | logger.info("索引|store.es.index : " + esIndex);
64 | logger.info("索引类型|store.es.type: " + esType);
65 | logger.info("---------------------------------------------");
66 | }
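   | // Illustrative properties sketch (values hypothetical; the keys, including the
   | // historical "exchage" spelling, are exactly the ones read above):
   | //   binding.es.exchage=es
   | //   binding.es.new.queue=es.new
   | //   binding.es.new.routing=*.new
   | //   store.es.index=seed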
67 |
68 | /**
69 | * Bulk-updates hashes; returns the number of successful items
70 | */
71 | public int batchUpdateHash(TransportClient client, JSONArray data) {
72 | int now = (int)(Calendar.getInstance().getTimeInMillis()/1000); // divide before casting, otherwise the millis overflow int
73 | Map<String, Object> params = new HashMap<String, Object>();
74 | params.put("now", now);
75 |
76 | //1. build and run the bulk update
77 | BulkRequestBuilder bulkRequest = client.prepareBulk();
78 | for (int i = 0; i < data.size(); i++) {
79 | String shortHash = data.getString(i);
80 | UpdateRequest updateRequest = new UpdateRequest(SpiderConfig.ES_INDEX_SEED, SpiderConfig.ES_TYPE_SEED, shortHash);
81 | updateRequest.script(new Script(ScriptType.INLINE, Script.DEFAULT_SCRIPT_LANG,
82 | "ctx._source.requests+=1;ctx._source.last_seen=params.now", params));
83 |
84 | bulkRequest.add(updateRequest);
85 | }
86 |
87 | //2. read the results
88 | HashMap<String, String> errList = new HashMap<String, String>();
89 | BulkResponse resp = bulkRequest.execute().actionGet();
90 | if(resp.hasFailures()) {
91 | BulkItemResponse[] item = resp.getItems();
92 | for(int i = 0; i < item.length; i++) {
93 | if(item[i].isFailed()) {
94 | errList.put(item[i].getId(), item[i].getFailure().getMessage());
95 | }
96 | }
97 | logger.error("失败条目: " + errList.size() + "|" + errList.values());
98 | }
99 | return data.size() - errList.size();
100 | }
101 |
102 | /**
103 | * Tests whether the ES connection is alive and acks/rejects the message accordingly
104 | */
105 | public void testEsConn(TransportClient client, Channel channel, long deliveryTag) {
106 | logger.info("测试 es连接....");
107 | try {
108 | NodesInfoResponse response = client.admin().cluster().nodesInfo(new NodesInfoRequest().timeout("30s")).actionGet();
109 | response.getNodesMap();
110 |
111 | logger.info("es连接正常,提交确认至消息队列");
112 | confirmMsg(channel, deliveryTag);
113 | } catch (Exception e) {
114 | logger.info("es连接失败,退回当前数据至队列", e);
115 | rejectMsg(channel, deliveryTag);
116 |
117 | try {
118 | logger.info("暂停60秒...");
119 | Thread.sleep(60000);
120 | } catch (InterruptedException e2) {
121 | }
122 | }
123 | }
124 | }
125 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/saver/MongoBasic.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider.saver;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Calendar;
5 | import java.util.Date;
6 | import java.util.List;
7 |
8 | import org.apache.logging.log4j.LogManager;
9 | import org.apache.logging.log4j.Logger;
10 | import org.bson.Document;
11 | import org.youseed.spider.ConfigUtil;
12 | import org.youseed.spider.MongoConn;
13 | import org.youseed.spider.SpiderConfig;
14 |
15 | import com.alibaba.fastjson.JSONArray;
16 | import com.alibaba.fastjson.JSONObject;
17 | import com.mongodb.bulk.BulkWriteResult;
18 | import com.mongodb.client.MongoCollection;
19 | import com.mongodb.client.model.UpdateOneModel;
20 | import com.mongodb.client.model.UpdateOptions;
21 | import com.mongodb.client.model.WriteModel;
22 | import com.rabbitmq.client.Channel;
23 |
24 | /**
25 | * Common helpers for writing to MongoDB
26 | */
27 | public class MongoBasic extends MQBasic {
28 |
29 | private static Logger logger = LogManager.getLogger(MongoBasic.class);
30 |
31 | protected String mqMongoExchange = "store";
32 |
33 | protected String mqMongoNewQueue = "store.new";
34 | protected String mqMongoNewRouting = "*.new";
35 |
36 | protected String mqMongoUpdateQueue = "store.update";
37 | protected String mqMongoUpdateRouting = "*.update";
38 |
39 | protected String mqMongoStatQueue = "store.stat";
40 | protected String mqMongoStatRouting = "*.stat";
41 |
42 | protected String collHash = "seed_hash";
43 | protected String collFilelist = "seed_filelist";
44 | protected String collStat = "seed_stat";
45 |
46 | /**
47 | * Constructor
48 | */
49 | public MongoBasic() {
50 | ConfigUtil.printBanner();
51 |
52 | JSONObject config = ConfigUtil.getProperties();
53 | mqMongoExchange = config.containsKey("binding.mongo.exchage") ? config.getString("binding.mongo.exchage") : mqMongoExchange;
54 | mqMongoNewQueue = config.containsKey("binding.mongo.new.queue") ? config.getString("binding.mongo.new.queue") : mqMongoNewQueue;
55 | mqMongoNewRouting = config.containsKey("binding.mongo.new.routing") ? config.getString("binding.mongo.new.routing") : mqMongoNewRouting;
56 | mqMongoUpdateQueue = config.containsKey("binding.mongo.update.queue") ? config.getString("binding.mongo.update.queue") : mqMongoUpdateQueue;
57 | mqMongoUpdateRouting = config.containsKey("binding.mongo.update.routing") ? config.getString("binding.mongo.update.routing") : mqMongoUpdateRouting;
58 | mqMongoStatQueue = config.containsKey("binding.mongo.stat.queue") ? config.getString("binding.mongo.stat.queue") : mqMongoStatQueue;
59 | mqMongoStatRouting = config.containsKey("binding.mongo.stat.routing") ? config.getString("binding.mongo.stat.routing") : mqMongoStatRouting;
60 |
61 | collHash = config.containsKey("store.mongo.hash") ? config.getString("store.mongo.hash") : collHash;
62 | collFilelist = config.containsKey("store.mongo.filelist") ? config.getString("store.mongo.filelist") : collFilelist;
63 | collStat = config.containsKey("store.mongo.stat") ? config.getString("store.mongo.stat") : collStat;
64 |
65 |
66 | logger.info("---------RabbitMQ/Mongodb绑定配置------------");
67 | logger.info("交换器|binding.mongo.exchage: " + mqMongoExchange);
68 | logger.info("新资源队列|binding.mongo.new.queue: " + mqMongoNewQueue);
69 | logger.info("新资源路由|binding.mongo.new.routing: " + mqMongoNewRouting);
70 | logger.info("更新资源队列|binding.mongo.update.queue: " + mqMongoUpdateQueue);
71 | logger.info("更新资源路由|binding.mongo.update.routing: " + mqMongoUpdateRouting);
72 | logger.info("爬虫统计队列|binding.mongo.stat.queue: " + mqMongoStatQueue);
73 | logger.info("爬虫统计路由|binding.mongo.stat.routing: " + mqMongoStatRouting);
74 | logger.info("资源明细|store.mongo.index: " + collHash);
75 | logger.info("文件列表|store.mongo.type: " + collFilelist);
76 | logger.info("爬虫统计信息|store.mongo.stat: " + collStat);
77 | logger.info("---------------------------------------------");
78 | }
79 |
80 | /**
81 |  * Returns the value for key if present, otherwise the given default. (The original
82 |  * assigned to the String parameter, which has no effect: Java passes references by value.)
83 |  */
84 | private String setIfExists(JSONObject prop, String param, String key) {
85 | return prop.containsKey(key) ? prop.getString(key) : param;
86 | }
87 |
88 |
89 | /**
90 | * Bulk-updates hashes; returns the number of modified documents
91 | */
92 | public int bulkUpdate(MongoConn mongo, JSONArray shortHashs){
93 | Date now = Calendar.getInstance().getTime();
94 | MongoCollection<Document> coll = mongo.getCollection(SpiderConfig.COLL_HASH);
95 |
96 | List<WriteModel<Document>> requests = new ArrayList<WriteModel<Document>>();
97 | for (int i = 0; i < shortHashs.size(); i++) {
98 | String shortHash = shortHashs.getString(i);
99 | UpdateOneModel<Document> uom = new UpdateOneModel<Document>(
100 | new Document("short_hash", shortHash),
101 | new Document().append("$set", new Document("last_seen", now)).append("$inc", new Document("requests", 1)),
102 | new UpdateOptions().upsert(false));
103 | requests.add(uom);
104 | }
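   | // Each model corresponds to this shell operation (illustrative, assuming the
   | // default seed_hash collection):
   | //   db.seed_hash.updateOne({short_hash: h}, {$set:{last_seen: now}, $inc:{requests: 1}})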
105 | BulkWriteResult bulkWriteResult = coll.bulkWrite(requests);
106 | return bulkWriteResult.getModifiedCount();
107 | }
108 |
109 | /**
110 | * Tests whether the Mongo connection is alive and acks/rejects the message accordingly
111 | */
112 | public void testMongoConn(MongoConn mongo, Channel channel, long deliveryTag) {
113 |
114 | boolean ok = testMongoConn(mongo, SpiderConfig.COLL_HASH);
115 | if(ok) {
116 | logger.info("mongodb连接正常,提交确认至消息队列");
117 | confirmMsg(channel, deliveryTag);
118 | } else {
119 | logger.info("mongodb连接失败,退回当前数据至队列");
120 | rejectMsg(channel, deliveryTag);
121 |
122 | try {
123 | logger.info("Pausing for 60 seconds...");
124 | Thread.sleep(60000);
125 | } catch (InterruptedException e1) {
126 | Thread.currentThread().interrupt(); // restore the interrupt flag instead of swallowing it
127 | }
128 | }
129 | }
130 |
131 | /**
132 | * Tests the Mongo connection by fetching one document
133 | */
134 | public boolean testMongoConn(MongoConn mongo, String collName) {
135 | try {
136 | mongo.getCollection(collName).find().first();
137 | return true;
138 | } catch (Exception e) {
139 | return false;
140 | }
141 | }
142 | }
143 |
--------------------------------------------------------------------------------
/src/org/youseed/Main.java:
--------------------------------------------------------------------------------
1 | package org.youseed;
2 |
3 | import java.io.IOException;
4 | import java.text.ParseException;
5 | import java.text.SimpleDateFormat;
6 | import java.util.Calendar;
7 | import java.util.Date;
8 | import java.util.LinkedHashMap;
9 | import java.util.Map;
10 | import java.util.Scanner;
11 | import java.util.Timer;
12 | import java.util.TimerTask;
13 |
14 | import org.apache.logging.log4j.LogManager;
15 | import org.apache.logging.log4j.Logger;
16 | import org.youseed.spider.ConfigUtil;
17 | import org.youseed.spider.saver.mongo.SaveStat;
18 |
19 | import com.alibaba.fastjson.JSONObject;
20 |
21 | /**
22 | * Entry point
23 | */
24 | public class Main {
25 |
26 | private static Logger logger = LogManager.getLogger(Main.class);
27 |
28 | public static void main(String[] args) throws IOException {
29 |
30 | String order = null;
31 |
32 | //1. parse the arguments; supports "--config=/opt/config.yml" and an operation id such as "m1"
33 | if ((args != null) && (args.length > 0)) {
34 | for (int i = 0; i < args.length; i++) {
35 | //set the config path
36 | if(args[i].startsWith("--config=")) {
37 | String path = args[i].substring(9);
38 |
39 | if(!"".equals(path.trim()))
40 | ConfigUtil.setConfPath(path);
41 | }else {
42 | order = args[i];
43 | }
44 | }
45 | }
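   | // Illustrative invocation (jar name hypothetical):
   | //   java -jar youseed-spider-saver.jar --config=/opt/config.yml m1
   | // With no operation argument, getOperId() below prompts interactively.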
46 |
47 | if(order == null){
48 | order = getOperId();
49 | }
50 |
51 | //2. print the banner
52 | ConfigUtil.printBanner();
53 |
54 | //3. run the selected operation
55 |
56 | //--MongoDB write + update
57 | if ("m".equals(order)) {
58 | new org.youseed.spider.saver.mongo.NewHash().consume();
59 | new org.youseed.spider.saver.mongo.UpdateHash().consume();
60 | new SaveStat().consume();
61 | }
62 |
63 | //--write new resources to MongoDB
64 | else if ("m1".equals(order)) {
65 | new org.youseed.spider.saver.mongo.NewHash().consume();
66 | }
67 |
68 | //--update resources in MongoDB
69 | else if ("m2".equals(order)) {
70 | new org.youseed.spider.saver.mongo.UpdateHash().consume();
71 | }
72 |
73 | //--write spider stats to MongoDB
74 | else if ("m3".equals(order)) {
75 | new SaveStat().consume();
76 | }
77 |
78 | //--ES write + update (picks live or scheduled updates based on esUpdateTime)
79 | else if ("es".equals(order)) {
80 |
81 | //1. start the consumer that writes new resources
82 | new org.youseed.spider.saver.es.NewHash().consume();
83 |
84 | //2. start a resident or a scheduled update consumer, depending on the timer config
85 | JSONObject config = ConfigUtil.getProperties();
86 | String t = config.getString("esUpdateTime");
87 |
88 | //2.1 no time configured: resident listening mode
89 | if(t == null || "".equals(t.trim())) {
90 | logger.info("常驻内存随时更新");
91 | new org.youseed.spider.saver.es.UpdateHash().consume();
92 | return;
93 | }
94 |
95 | Date time = null;
96 | SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
97 | try {
98 | Calendar calNow = Calendar.getInstance();
99 |
100 | Calendar cal = Calendar.getInstance();
101 | cal.setTime(sdf.parse(t));
102 | cal.set(Calendar.YEAR, calNow.get(Calendar.YEAR));
103 | cal.set(Calendar.MONTH, calNow.get(Calendar.MONTH));
104 | cal.set(Calendar.DATE, calNow.get(Calendar.DATE));
105 |
106 | if (cal.getTime().before(calNow.getTime())) {
107 | cal.add(Calendar.DAY_OF_MONTH, 1);
108 | }
109 |
110 | time = cal.getTime();
111 |
112 | } catch (ParseException e) {
113 | logger.error("格式化定时更新ES出错" + e.getMessage(), e);
114 | }
115 |
116 | //2.2 time failed to parse: resident listening mode
117 | if(time == null) {
118 | logger.info("Resident in memory, updating continuously");
119 | new org.youseed.spider.saver.es.UpdateHash().consume();
120 | return;
121 | }
122 |
123 | //2.3 valid time: start the daily scheduled task
124 | logger.info("Scheduled resource updates enabled; daily run time: " + sdf.format(time) +
125 | ", first run at: " + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(time));
126 |
127 | Timer timer = new Timer();
128 | timer.schedule(new TimerTask(){
129 | @Override
130 | public void run() {
131 | try {
132 | logger.info("开始执行更新任务");
133 | new org.youseed.spider.saver.es.UpdateHashOnTime().consume();
134 | } catch (IOException e) {
135 | logger.error("更新任务执行出错" + e.getMessage(), e);
136 | }
137 | logger.info("更新任务执行完毕");
138 | }
139 | }, time, 24 * 60 * 60 * 1000);
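   | // Note: Timer.schedule (unlike scheduleAtFixedRate) is fixed-delay: each run is
   | // scheduled 24h after the previous run actually started, so any late start
   | // propagates to all later runs.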
140 |
141 | }
142 |
143 | //--write new resources to ES
144 | else if ("es1".equals(order)) {
145 | new org.youseed.spider.saver.es.NewHash().consume();
146 | }
147 |
148 | //--update resources in ES (resident, live updates)
149 | else if ("es2".equals(order)) {
150 | new org.youseed.spider.saver.es.UpdateHash().consume();
151 | }
152 |
153 | //--update resources in ES (exits after finishing the current batch)
154 | else if ("es3".equals(order)) {
155 | new org.youseed.spider.saver.es.UpdateHashOnTime().consume();
156 | }
157 |
158 | //--zsky write + update
159 | else if ("zsky".equals(order)) {
160 | new org.youseed.spider.saver.zsky.NewHash().consume();
161 | new org.youseed.spider.saver.zsky.UpdateHash().consume();
162 | new org.youseed.spider.saver.zsky.SaveStat().consume();
163 | }
164 |
165 | //--write new resources to zsky
166 | else if ("zsky1".equals(order)) {
167 | new org.youseed.spider.saver.zsky.NewHash().consume();
168 | }
169 |
170 | //--update resources in zsky
171 | else if ("zsky2".equals(order)) {
172 | new org.youseed.spider.saver.zsky.UpdateHash().consume();
173 | }
174 |
175 | //--write spider stats to zsky
176 | else if ("zsky3".equals(order)) {
177 | new org.youseed.spider.saver.zsky.SaveStat().consume();
178 | }
179 | }
180 |
181 | public static String getOperId() {
182 | Map<String, String> oper = new LinkedHashMap<String, String>();
183 | oper.put("m", "write/update MongoDB");
184 | oper.put("m1", "\t|-------write new resources to Mongo");
185 | oper.put("m2", "\t|-------update Mongo");
186 | oper.put("m3", "\t|-------write stats to Mongo");
187 | oper.put("es", "write/update ES (live or scheduled updates, per esUpdateTime)");
188 | oper.put("es1", "\t|-------write new resources to ES");
189 | oper.put("es2", "\t|-------update ES (resident, live updates)");
190 | oper.put("es3", "\t|-------update ES (exits after the current batch)");
191 | oper.put("zsky", "write/update zsky (纸上烤鱼)");
192 | oper.put("zsky1", "\t|-------write new resources to MySQL");
193 | oper.put("zsky2", "\t|-------update MySQL");
194 | oper.put("zsky3", "\t|-------write stats to MySQL");
195 |
196 | Scanner sc = new Scanner(System.in);
197 |
198 | String order = null;
199 | for (;;) {
200 | if (oper.containsKey(order)) {
201 | sc.close();
202 | return order;
203 | }
204 | for (Map.Entry<String, String> entry : oper.entrySet()) {
205 | String id = entry.getKey();
206 | String desc = entry.getValue();
207 |
208 | System.out.println(id + ": " + desc);
209 | }
210 | System.out.println("");
211 | System.out.println("请选择一项操作(输入编号后回车):");
212 |
213 | order = sc.next();
214 | }
215 | }
216 | }
217 |
--------------------------------------------------------------------------------
/src/org/youseed/spider/MongoConn.java:
--------------------------------------------------------------------------------
1 | package org.youseed.spider;
2 |
3 | import java.util.ArrayList;
4 | import java.util.LinkedHashMap;
5 | import java.util.List;
6 | import java.util.Map;
7 | import java.util.regex.Pattern;
8 |
9 | import org.apache.logging.log4j.LogManager;
10 | import org.apache.logging.log4j.Logger;
11 | import org.bson.Document;
12 |
13 | import com.alibaba.fastjson.JSONObject;
14 | import com.mongodb.BasicDBObject;
15 | import com.mongodb.MongoClient;
16 | import com.mongodb.MongoClientOptions;
17 | import com.mongodb.MongoCredential;
18 | import com.mongodb.ServerAddress;
19 | import com.mongodb.client.MongoCollection;
20 | import com.mongodb.client.MongoCursor;
21 | import com.mongodb.client.MongoDatabase;
22 | import com.mongodb.client.model.Filters;
23 | import com.mongodb.client.model.IndexOptions;
24 | import com.mongodb.client.model.InsertManyOptions;
25 | import com.mongodb.client.result.DeleteResult;
26 | import com.mongodb.client.result.UpdateResult;
27 |
28 | /**
29 | * Obtains a Mongo connection
30 | */
31 | public class MongoConn {
32 |
33 | private static Logger logger = LogManager.getLogger(MongoConn.class);
34 |
35 | MongoClient mongoClient;
36 | MongoDatabase db;
37 |
38 | public MongoConn() {
39 |
40 | JSONObject mongo = ConfigUtil.getConfig().getJSONObject("mongo");
41 |
42 | String url = mongo.getString("url");
43 | int port = mongo.getIntValue("port");
44 | String dbName = mongo.getString("db");
45 |
46 | String user = mongo.getString("user");
47 | String admindb = mongo.getString("admindb");
48 | String psw = mongo.getString("psw");
49 |
50 | logger.info("---------Mongodb配置------------");
51 | logger.info("地址|mongo.url: " + url);
52 | logger.info("端口|mongo.port: " + port);
53 | logger.info("数据库|mongo.db: " + dbName);
54 | logger.info("用户名|mongo.user: " + user);
55 | logger.info("账户验证数据库|mongo.admindb: " + admindb);
56 | logger.info("---------------------------------------------");
57 |
58 |
59 | //connection-timeout options (all currently left at driver defaults)
60 | MongoClientOptions options = MongoClientOptions.builder()
61 | // .threadsAllowedToBlockForConnectionMultiplier(20)
62 | // .connectTimeout(5000)
63 | // .maxWaitTime(5000)
64 | // .socketTimeout(5000)
65 | .build();
66 |
67 |
68 | if(mongo.get("user") == null) {
69 | ServerAddress add = new ServerAddress(url, port);
70 | mongoClient = new MongoClient(add, options);
71 | db = mongoClient.getDatabase(dbName);
72 | }else {
73 | ServerAddress add = new ServerAddress(url, port);
74 | List<ServerAddress> seeds = new ArrayList<ServerAddress>();
75 | seeds.add(add);
76 |
77 | MongoCredential cre = MongoCredential.createCredential(user, admindb, psw.toCharArray());
78 | mongoClient = new MongoClient(seeds, cre, options);
79 | db = mongoClient.getDatabase(dbName);
80 | }
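   | // For reference, the authenticated branch above is equivalent to the connection
   | // string (placeholders, not real values):
   | //   mongodb://<user>:<psw>@<url>:<port>/<db>?authSource=<admindb>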
81 |
82 | }
83 |
84 | /**
85 | * Creates a collection
86 | */
87 | public void createColl(String collectionName) {
88 | db.createCollection(collectionName);
89 | }
90 |
91 | /**
92 | * Creates a unique index
93 | */
94 | public void createIndexUnique(String collectionName, String index) {
95 | IndexOptions indexOptions = new IndexOptions();
96 | indexOptions.unique(true);
97 | getCollection(collectionName).createIndex(new Document().append(index, -1), indexOptions);
98 | }
99 |
100 | /**
101 | * Creates a compound unique index
102 | */
103 | public void createIndexUnique(String collectionName, String[] index) {
104 | IndexOptions indexOptions = new IndexOptions();
105 | indexOptions.unique(true);
106 |
107 | Document idx = new Document();
108 | for (int i = 0; i < index.length; i++) {
109 | idx.append(index[i], 1);
110 | }
111 |
112 | getCollection(collectionName).createIndex(idx, indexOptions);
113 | }
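   | // e.g. (illustrative call, not used elsewhere in this class):
   | //   createIndexUnique("seed_filelist", new String[]{"short_hash", "info_hash"});
   | // builds a single ascending unique index spanning both fields.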
114 |
115 | /**
116 | * Creates an index
117 | */
118 | public void createIndex(String collectionName, String index) {
119 | getCollection(collectionName).createIndex(new Document().append(index, -1));
120 | }
121 |
122 | /**
123 | * Saves a single document (from a JSON string)
124 | */
125 | public void save(String collectionName, String json) {
126 | Document doc = Document.parse(json);
127 | save(collectionName, doc);
128 | }
129 |
130 | /**
131 | * Saves a single document
132 | */
133 | public void save(String collectionName, Document doc) {
134 | MongoCollection<Document> collection = db.getCollection(collectionName, Document.class);
135 | collection.insertOne(doc);
136 | }
137 |
138 | /**
139 | * Saves multiple documents
140 | */
141 | public void save(String collectionName, List<Document> docs) {
142 | MongoCollection<Document> collection = db.getCollection(collectionName, Document.class);
143 | InsertManyOptions options = new InsertManyOptions(); // unordered: faster, and one failure does not abort the whole batch
144 | options.ordered(false);
145 |
146 | collection.insertMany(docs, options);
147 | }
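   | // With unordered inserts, a batch that hits a duplicate key (e.g. on a unique
   | // info_hash index) still persists the remaining documents; the driver reports
   | // the failures afterwards in a MongoBulkWriteException.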
148 |
149 | /**
150 | * Gets a collection
151 | */
152 | public MongoCollection<Document> getCollection(String collectionName) {
153 | return db.getCollection(collectionName);
154 | }
155 |
156 | /**
157 | * Gets all ids in a collection
158 | */
159 | public Map