├── src └── main │ ├── resources │ ├── 原始数据.png │ ├── 数据流向.png │ └── 数据结果展示.png │ └── java │ └── com │ └── cuteximi │ ├── kafka │ ├── SimplePartitioner.java │ ├── Producer.java │ └── KafkaProperties.java │ ├── hive │ ├── ClientInfo.java │ └── HiveJDBC.java │ ├── hbaseSink │ └── AsyncHBaseEventSerializerDemo.java │ ├── hbase │ ├── WeatherBulkLoad.java │ ├── Scanner.java │ └── HBaseOperation.java │ ├── web │ └── DomTool.java │ └── crawl │ └── WeatherCrawler.java ├── .gitignore ├── README.md └── pom.xml /src/main/resources/原始数据.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aikuyun/weather-mrs/HEAD/src/main/resources/原始数据.png -------------------------------------------------------------------------------- /src/main/resources/数据流向.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aikuyun/weather-mrs/HEAD/src/main/resources/数据流向.png -------------------------------------------------------------------------------- /src/main/resources/数据结果展示.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aikuyun/weather-mrs/HEAD/src/main/resources/数据结果展示.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Example user template template 3 | ### Example user template 4 | 5 | # IntelliJ project files 6 | .idea 7 | *.iml 8 | out 9 | gen 10 | -------------------------------------------------------------------------------- /src/main/java/com/cuteximi/kafka/SimplePartitioner.java: -------------------------------------------------------------------------------- 1 | package com.cuteximi.kafka; 2 | 3 | 4 | import kafka.utils.VerifiableProperties; 5 | 6 | /** 7 | * @program: weathermrs 8 | * @description: kafka 简单分区 9 | * @author: TSL 10 | * @create: 2018-11-30 22:34 11 | **/ 12 | public class SimplePartitioner { 13 | public SimplePartitioner(VerifiableProperties props) { 14 | } 15 | 16 | public int partition(Object key, int numPartitions) { 17 | //boolean partition = false; 18 | String partitionKey = (String)key; 19 | 20 | int partition; 21 | try { 22 | partition = Integer.parseInt(partitionKey) % numPartitions; 23 | } catch (NumberFormatException var6) { 24 | partition = 0; 25 | } 26 | 27 | return partition; 28 | } 29 | } -------------------------------------------------------------------------------- /src/main/java/com/cuteximi/kafka/Producer.java: -------------------------------------------------------------------------------- 1 | package com.cuteximi.kafka; 2 | 3 | import java.util.Properties; 4 | import kafka.producer.KeyedMessage; 5 | import kafka.producer.ProducerConfig; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | /** 9 | * @program: weathermrs 10 | * @description: 生产者 11 | * @author: TSL 12 | * @create: 2018-11-30 22:34 13 | **/ 14 | public class Producer extends Thread { 15 | private static final Logger LOG = LoggerFactory.getLogger(Producer.class); 16 | private final kafka.javaapi.producer.Producer producer; 17 | private final String topic; 18 | private final Properties props = new Properties(); 19 | private final String producerType = "producer.type"; 20 | private final String partitionerClass = "partitioner.class"; 21 | private final String serializerClass = "serializer.class"; 22 | private 
final String metadataBrokerList = "metadata.broker.list";
    private final String bootstrapServers = "bootstrap.servers";

    public Producer(String topicName) {
        // Synchronous producer on the old Scala producer API; the broker list is read from
        // KafkaProperties and falls back to localhost:9092 when no config file is found.
        this.props.put(this.producerType, "sync");
        this.props.put(this.partitionerClass, "com.cuteximi.kafka.SimplePartitioner");
        this.props.put(this.serializerClass, "kafka.serializer.StringEncoder");
        this.props.put(this.metadataBrokerList, KafkaProperties.getInstance().getValues(this.bootstrapServers, "localhost:9092"));
        this.producer = new kafka.javaapi.producer.Producer(new ProducerConfig(this.props));
        this.topic = topicName;
    }

    public void run(String line) {
        LOG.info("Producer: start.");
        String[] fields = line.split(",", -1);
        String key = fields[0];
        this.producer.send(new KeyedMessage(this.topic, key, line));
        LOG.info("Producer: send message to " + this.topic);
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Weather Analysis Example

> A project for analyzing historical weather data. A crawler first fetches several years of weather records for provinces, cities, and districts (Beijing in this example) from a weather website; the data is then distributed in real time, collected, and statistically analyzed, and finally presented as charts in a BI tool.

## Raw Data
![](https://github.com/aikuyun/weather-mrs/blob/master/src/main/resources/%E5%8E%9F%E5%A7%8B%E6%95%B0%E6%8D%AE.png)

## Result
![](https://github.com/aikuyun/weather-mrs/blob/master/src/main/resources/%E6%95%B0%E6%8D%AE%E7%BB%93%E6%9E%9C%E5%B1%95%E7%A4%BA.png)

## Solution

1. Create an HBase table.

2. Fetch the weather data with the crawler.

3. Distribute the data in real time with Kafka.

4. Collect the data with Flume and write it into HBase (a sample configuration is sketched at the end of this README).

5. Create a Hive table mapped onto the HBase table.

6. Aggregate and visualize the Hive table with Superset.

## Data Flow
![](https://github.com/aikuyun/weather-mrs/blob/master/src/main/resources/%E6%95%B0%E6%8D%AE%E6%B5%81%E5%90%91.png)

## Code Navigation

- Hive
  - [ClientInfo.java](https://github.com/aikuyun/weather-mrs/blob/master/src/main/java/com/cuteximi/hive/ClientInfo.java)
  - [HiveJDBC.java](https://github.com/aikuyun/weather-mrs/blob/master/src/main/java/com/cuteximi/hive/HiveJDBC.java)
- HBase
  - [HBaseOperation.java](https://github.com/aikuyun/weather-mrs/blob/master/src/main/java/com/cuteximi/hbase/HBaseOperation.java)
  - [Scanner.java](https://github.com/aikuyun/weather-mrs/blob/master/src/main/java/com/cuteximi/hbase/Scanner.java)
  - [WeatherBulkLoad.java](https://github.com/aikuyun/weather-mrs/blob/master/src/main/java/com/cuteximi/hbase/WeatherBulkLoad.java)
- Kafka
  - [KafkaProperties.java](https://github.com/aikuyun/weather-mrs/blob/master/src/main/java/com/cuteximi/kafka/KafkaProperties.java)
  - [Producer.java](https://github.com/aikuyun/weather-mrs/blob/master/src/main/java/com/cuteximi/kafka/Producer.java)
  - [SimplePartitioner.java](https://github.com/aikuyun/weather-mrs/blob/master/src/main/java/com/cuteximi/kafka/SimplePartitioner.java)
- Crawl
  - [WeatherCrawler.java](https://github.com/aikuyun/weather-mrs/blob/master/src/main/java/com/cuteximi/crawl/WeatherCrawler.java)
- Flume
  - [AsyncHBaseEventSerializerDemo.java](https://github.com/aikuyun/weather-mrs/blob/master/src/main/java/com/cuteximi/hbaseSink/AsyncHBaseEventSerializerDemo.java)
- More...
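
## Flume Configuration (Sketch)

The repository ships the custom HBase serializer but no Flume agent file. The snippet below is a minimal sketch of step 4 for Flume 1.6: a Kafka source reads the `weather` topic written by `WeatherCrawler`, and an async HBase sink writes into `hbase_weather` through `AsyncHBaseEventSerializerDemo`. The agent and channel names, the ZooKeeper address, and the channel capacity are placeholders rather than values taken from this project.

```properties
agent.sources  = kafkaSource
agent.channels = memChannel
agent.sinks    = hbaseSink

# Kafka source (Flume 1.6 style); the ZooKeeper address is a placeholder
agent.sources.kafkaSource.type             = org.apache.flume.source.kafka.KafkaSource
agent.sources.kafkaSource.zookeeperConnect = zk-host:2181
agent.sources.kafkaSource.topic            = weather
agent.sources.kafkaSource.channels         = memChannel

agent.channels.memChannel.type     = memory
agent.channels.memChannel.capacity = 10000

# Async HBase sink using the serializer from this repository
agent.sinks.hbaseSink.type         = asynchbase
agent.sinks.hbaseSink.table        = hbase_weather
agent.sinks.hbaseSink.columnFamily = info
agent.sinks.hbaseSink.serializer   = com.cuteximi.hbaseSink.AsyncHBaseEventSerializerDemo
agent.sinks.hbaseSink.serializer.payloadColumn = id,province,city,zone,time,maxTemperature,minTemperature,weather,windDirection,windPower
agent.sinks.hbaseSink.channel      = memChannel
```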
-------------------------------------------------------------------------------- /src/main/java/com/cuteximi/hive/ClientInfo.java: -------------------------------------------------------------------------------- 1 | package com.cuteximi.hive; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.util.Properties; 7 | 8 | /** 9 | * @program: weathermrs 10 | * @description: 客户端信息 11 | * @author: TSL 12 | * @create: 2018-11-30 22:32 13 | **/ 14 | public class ClientInfo { 15 | private String zkQuorum = null; 16 | private String auth = null; 17 | private String saslQop = null; 18 | private String zooKeeperNamespace = null; 19 | private String serviceDiscoveryMode = null; 20 | private String principal = null; 21 | private Properties clientInfo = null; 22 | 23 | public ClientInfo(String hiveclientFile) throws IOException { 24 | FileInputStream fileInputStream = null; 25 | 26 | try { 27 | this.clientInfo = new Properties(); 28 | File propertiesFile = new File(hiveclientFile); 29 | fileInputStream = new FileInputStream(propertiesFile); 30 | this.clientInfo.load(fileInputStream); 31 | } catch (Exception var7) { 32 | throw new IOException(var7); 33 | } finally { 34 | if (fileInputStream != null) { 35 | fileInputStream.close(); 36 | fileInputStream = null; 37 | } 38 | 39 | } 40 | 41 | this.initialize(); 42 | } 43 | 44 | private void initialize() { 45 | this.zkQuorum = this.clientInfo.getProperty("zk.quorum"); 46 | this.auth = this.clientInfo.getProperty("auth"); 47 | this.saslQop = this.clientInfo.getProperty("sasl.qop"); 48 | this.zooKeeperNamespace = this.clientInfo.getProperty("zooKeeperNamespace"); 49 | this.serviceDiscoveryMode = this.clientInfo.getProperty("serviceDiscoveryMode"); 50 | this.principal = this.clientInfo.getProperty("principal"); 51 | } 52 | 53 | public String getZkQuorum() { 54 | return this.zkQuorum; 55 | } 56 | 57 | public String getSaslQop() { 58 | return this.saslQop; 59 | } 60 | 61 | public String getAuth() { 62 | return this.auth; 63 | } 64 | 65 | public String getZooKeeperNamespace() { 66 | return this.zooKeeperNamespace; 67 | } 68 | 69 | public String getServiceDiscoveryMode() { 70 | return this.serviceDiscoveryMode; 71 | } 72 | 73 | public String getPrincipal() { 74 | return this.principal; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/com/cuteximi/kafka/KafkaProperties.java: -------------------------------------------------------------------------------- 1 | package com.cuteximi.kafka; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.util.Properties; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | /** 11 | * @program: weathermrs 12 | * @description: Kafka 的配置 13 | * @author: TSL 14 | * @create: 2018-11-30 22:34 15 | **/ 16 | public class KafkaProperties { 17 | private static final Logger LOG = LoggerFactory.getLogger(KafkaProperties.class); 18 | public static final String TOPIC = "Test"; 19 | private static Properties serverProps = new Properties(); 20 | private static Properties producerProps = new Properties(); 21 | private static Properties consumerProps = new Properties(); 22 | private static Properties clientProps = new Properties(); 23 | private static KafkaProperties instance = null; 24 | private static String KafkaPath = "/opt/client/Kafka/kafka"; 25 | 26 | private KafkaProperties() { 27 | String filePath = KafkaPath + File.separator + "config" + 
File.separator; 28 | 29 | try { 30 | File proFile = new File(filePath + "producer.properties"); 31 | if (proFile.exists()) { 32 | producerProps.load(new FileInputStream(filePath + "producer.properties")); 33 | } 34 | 35 | File conFile = new File(filePath + "producer.properties"); 36 | if (conFile.exists()) { 37 | consumerProps.load(new FileInputStream(filePath + "consumer.properties")); 38 | } 39 | 40 | File serFile = new File(filePath + "server.properties"); 41 | if (serFile.exists()) { 42 | serverProps.load(new FileInputStream(filePath + "server.properties")); 43 | } 44 | 45 | File cliFile = new File(filePath + "client.properties"); 46 | if (cliFile.exists()) { 47 | clientProps.load(new FileInputStream(filePath + "client.properties")); 48 | } 49 | } catch (IOException var6) { 50 | LOG.info("The Exception occured.", var6); 51 | } 52 | 53 | } 54 | 55 | public static synchronized KafkaProperties getInstance() { 56 | if (null == instance) { 57 | instance = new KafkaProperties(); 58 | } 59 | 60 | return instance; 61 | } 62 | 63 | public String getValues(String key, String defValue) { 64 | String rtValue = null; 65 | if (null == key) { 66 | LOG.error("key is null"); 67 | } else { 68 | rtValue = this.getPropertiesValue(key); 69 | } 70 | 71 | if (null == rtValue) { 72 | LOG.warn("KafkaProperties.getValues return null, key is " + key); 73 | rtValue = defValue; 74 | } 75 | 76 | LOG.info("KafkaProperties.getValues: key is " + key + "; Value is " + rtValue); 77 | return rtValue; 78 | } 79 | 80 | private String getPropertiesValue(String key) { 81 | String rtValue = serverProps.getProperty(key); 82 | if (null == rtValue) { 83 | rtValue = producerProps.getProperty(key); 84 | } 85 | 86 | if (null == rtValue) { 87 | rtValue = consumerProps.getProperty(key); 88 | } 89 | 90 | if (null == rtValue) { 91 | rtValue = clientProps.getProperty(key); 92 | } 93 | 94 | return rtValue; 95 | } 96 | } -------------------------------------------------------------------------------- /src/main/java/com/cuteximi/hive/HiveJDBC.java: -------------------------------------------------------------------------------- 1 | package com.cuteximi.hive; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.PreparedStatement; 6 | import java.sql.ResultSet; 7 | import java.sql.ResultSetMetaData; 8 | import java.sql.SQLException; 9 | /** 10 | * @program: weathermrs 11 | * @description: hive 数据库操作 12 | * @author: TSL 13 | * @create: 2018-11-30 22:33 14 | **/ 15 | public class HiveJDBC { 16 | private static final String HIVE_DRIVER = "org.apache.hive.jdbc.HiveDriver"; 17 | private ClientInfo clientInfo; 18 | private boolean isSecurityMode; 19 | 20 | public HiveJDBC(ClientInfo clientInfo, boolean isSecurityMode) { 21 | this.clientInfo = clientInfo; 22 | this.isSecurityMode = isSecurityMode; 23 | } 24 | 25 | public Connection getConnection() throws ClassNotFoundException, SQLException { 26 | StringBuilder sBuilder = (new StringBuilder("jdbc:hive2://")).append(this.clientInfo.getZkQuorum()).append("/"); 27 | if (this.isSecurityMode) { 28 | sBuilder.append(";serviceDiscoveryMode=").append(this.clientInfo.getServiceDiscoveryMode()).append(";zooKeeperNamespace=").append(this.clientInfo.getZooKeeperNamespace()).append(";sasl.qop=").append(this.clientInfo.getSaslQop()).append(";auth=").append(this.clientInfo.getAuth()).append(";principal=").append(this.clientInfo.getPrincipal()).append(";"); 29 | } else { 30 | 
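            // Non-secure mode: only the ZooKeeper service-discovery parameters are appended, yielding a URL such as
            // jdbc:hive2://<zkQuorum>/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2;auth=none
            // (the actual values come from hiveclient.properties via ClientInfo; the ones above are typical placeholders).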
sBuilder.append(";serviceDiscoveryMode=").append(this.clientInfo.getServiceDiscoveryMode()).append(";zooKeeperNamespace=").append(this.clientInfo.getZooKeeperNamespace()).append(";auth=none"); 31 | } 32 | 33 | String url = sBuilder.toString(); 34 | Class.forName("org.apache.hive.jdbc.HiveDriver"); 35 | Connection connection = null; 36 | connection = DriverManager.getConnection(url, "", ""); 37 | return connection; 38 | } 39 | 40 | public static void execDDL(Connection connection, String sql) throws SQLException { 41 | PreparedStatement statement = null; 42 | 43 | try { 44 | statement = connection.prepareStatement(sql); 45 | statement.execute(); 46 | } finally { 47 | if (null != statement) { 48 | statement.close(); 49 | } 50 | 51 | } 52 | 53 | } 54 | 55 | public static void execDML(Connection connection, String sql) throws SQLException { 56 | PreparedStatement statement = null; 57 | ResultSet resultSet = null; 58 | ResultSetMetaData resultMetaData = null; 59 | 60 | try { 61 | statement = connection.prepareStatement(sql); 62 | resultSet = statement.executeQuery(); 63 | resultMetaData = resultSet.getMetaData(); 64 | int columnCount = resultMetaData.getColumnCount(); 65 | 66 | int i; 67 | for(i = 1; i <= columnCount; ++i) { 68 | System.out.print(resultMetaData.getColumnLabel(i) + '\t'); 69 | } 70 | 71 | System.out.println(); 72 | 73 | while(resultSet.next()) { 74 | for(i = 1; i <= columnCount; ++i) { 75 | System.out.print(resultSet.getString(i) + '\t'); 76 | } 77 | 78 | System.out.println(); 79 | } 80 | } finally { 81 | if (null != resultSet) { 82 | resultSet.close(); 83 | } 84 | 85 | if (null != statement) { 86 | statement.close(); 87 | } 88 | 89 | } 90 | 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/java/com/cuteximi/hbaseSink/AsyncHBaseEventSerializerDemo.java: -------------------------------------------------------------------------------- 1 | package com.cuteximi.hbaseSink; 2 | 3 | import com.google.common.base.Charsets; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import org.apache.flume.Context; 7 | import org.apache.flume.Event; 8 | import org.apache.flume.conf.ComponentConfiguration; 9 | import org.apache.flume.sink.hbase.AsyncHbaseEventSerializer; 10 | import org.hbase.async.AtomicIncrementRequest; 11 | import org.hbase.async.PutRequest; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | /** 16 | * @program: weathermrs 17 | * @description: 同步数据 18 | * @author: TSL 19 | * @create: 2018-11-30 22:32 20 | **/ 21 | public class AsyncHBaseEventSerializerDemo implements AsyncHbaseEventSerializer { 22 | private static final Logger LOGGER = LoggerFactory.getLogger(AsyncHBaseEventSerializerDemo.class); 23 | private byte[] table; 24 | private byte[] cf; 25 | private byte[] payload; 26 | private byte[] payloadColumn; 27 | private byte[][] columns; 28 | private byte[] incrementRow; 29 | private byte[] incrementColumn; 30 | 31 | public AsyncHBaseEventSerializerDemo() { 32 | } 33 | 34 | public void initialize(byte[] table, byte[] cf) { 35 | this.table = table; 36 | this.cf = cf; 37 | } 38 | 39 | public List getActions() { 40 | List actions = new ArrayList(); 41 | if (this.columns.length == 0) { 42 | LOGGER.info("the number of columns is 0"); 43 | return actions; 44 | } else { 45 | String[] values = (new String(this.payload)).split(",", -1); 46 | if (this.columns.length != values.length) { 47 | LOGGER.info("column name and column value do not match"); 48 | return actions; 49 | } else { 50 | 
byte[] currentRowkey = (values[0] + "&" + values[4]).getBytes(); 51 | byte[][] vs = new byte[this.columns.length][]; 52 | 53 | for(int i = 0; i < values.length; ++i) { 54 | vs[i] = values[i].getBytes(); 55 | } 56 | 57 | PutRequest put = new PutRequest(this.table, currentRowkey, this.cf, this.columns, vs); 58 | actions.add(put); 59 | return actions; 60 | } 61 | } 62 | } 63 | 64 | public List getIncrements() { 65 | List actions = new ArrayList(); 66 | if (this.incrementColumn != null) { 67 | AtomicIncrementRequest inc = new AtomicIncrementRequest(this.table, this.incrementRow, this.cf, this.incrementColumn); 68 | actions.add(inc); 69 | } 70 | 71 | return actions; 72 | } 73 | 74 | public void cleanUp() { 75 | } 76 | 77 | public void configure(Context context) { 78 | String pCol = context.getString("payloadColumn", "pCol"); 79 | String iCol = context.getString("incrementColumn", "iCol"); 80 | if (pCol != null && !pCol.isEmpty()) { 81 | this.payloadColumn = pCol.getBytes(Charsets.UTF_8); 82 | String[] columnNames = (new String(this.payloadColumn)).split(",", -1); 83 | if (columnNames != null && columnNames.length != 0) { 84 | this.columns = new byte[columnNames.length][]; 85 | 86 | for(int i = 0; i < columnNames.length; ++i) { 87 | this.columns[i] = columnNames[i].getBytes(); 88 | } 89 | } 90 | } 91 | 92 | if (iCol != null && !iCol.isEmpty()) { 93 | this.incrementColumn = iCol.getBytes(Charsets.UTF_8); 94 | } 95 | 96 | this.incrementRow = context.getString("incrementRow", "incRow").getBytes(Charsets.UTF_8); 97 | } 98 | 99 | public void setEvent(Event event) { 100 | this.payload = event.getBody(); 101 | } 102 | 103 | public void configure(ComponentConfiguration conf) { 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/main/java/com/cuteximi/hbase/WeatherBulkLoad.java: -------------------------------------------------------------------------------- 1 | package com.cuteximi.hbase; 2 | 3 | 4 | import java.io.IOException; 5 | import org.apache.commons.logging.Log; 6 | import org.apache.commons.logging.LogFactory; 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.hbase.HBaseConfiguration; 10 | import org.apache.hadoop.hbase.TableName; 11 | import org.apache.hadoop.hbase.client.HTable; 12 | import org.apache.hadoop.hbase.client.Put; 13 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 14 | import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2; 15 | import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles; 16 | import org.apache.hadoop.hbase.util.Bytes; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | /** 24 | * @program: weathermrs 25 | * @description: 导入数据到 hbase 26 | * @author: TSL 27 | * @create: 2018-11-30 22:31 28 | **/ 29 | public class WeatherBulkLoad { 30 | private static final Log LOG = LogFactory.getLog(WeatherBulkLoad.class); 31 | 32 | public WeatherBulkLoad() { 33 | } 34 | 35 | public static void loadIncrementalHFileToHBase(Configuration configuration, Path path, TableName tableName) throws Exception { 36 | Configuration conf = HBaseConfiguration.create(); 37 | LoadIncrementalHFiles loder = new LoadIncrementalHFiles(configuration); 38 | loder.doBulkLoad(path, new 
HTable(conf, tableName)); 39 | } 40 | 41 | public static void main(String[] args) throws Exception { 42 | Configuration conf = HBaseConfiguration.create(); 43 | TableName tableName = TableName.valueOf("hbase_weather"); 44 | HBaseOperation hbase = new HBaseOperation(conf); 45 | String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs(); 46 | if (otherArgs.length != 2) { 47 | System.err.println("Usage: WeatherInfoCollector "); 48 | System.exit(2); 49 | } 50 | 51 | Path srcPath = new Path(otherArgs[0]); 52 | Path descPath = new Path(otherArgs[1]); 53 | hbase.createTable(tableName); 54 | Job job = new Job(conf, "Collect Weather Info"); 55 | job.setJarByClass(WeatherBulkLoad.class); 56 | job.setMapperClass(WeatherBulkLoad.CollectionMapper.class); 57 | job.setMapOutputKeyClass(ImmutableBytesWritable.class); 58 | job.setMapOutputValueClass(Put.class); 59 | job.setOutputKeyClass(Text.class); 60 | job.setOutputValueClass(Text.class); 61 | HTable table = new HTable(conf, tableName); 62 | HFileOutputFormat2.configureIncrementalLoad(job, table, table.getRegionLocator()); 63 | FileInputFormat.addInputPath(job, srcPath); 64 | FileOutputFormat.setOutputPath(job, descPath); 65 | if (job.waitForCompletion(true)) { 66 | loadIncrementalHFileToHBase(conf, descPath, tableName); 67 | } 68 | 69 | hbase.clean(); 70 | System.exit(job.waitForCompletion(true) ? 0 : 1); 71 | } 72 | 73 | public static class CollectionMapper extends Mapper { 74 | public CollectionMapper() { 75 | } 76 | 77 | public void map(Object key, Text value, Mapper.Context context) throws IOException, InterruptedException { 78 | String line = value.toString(); 79 | String[] fields = line.split(",", -1); 80 | byte[] rowkey = Bytes.toBytes(fields[0] + " " + fields[4]); 81 | ImmutableBytesWritable rowKey = new ImmutableBytesWritable(rowkey); 82 | Put put = new Put(rowkey); 83 | put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("id"), Bytes.toBytes(fields[0])); 84 | put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("province"), Bytes.toBytes(fields[1])); 85 | put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("city"), Bytes.toBytes(fields[2])); 86 | put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zone"), Bytes.toBytes(fields[3])); 87 | put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("time"), Bytes.toBytes(fields[4])); 88 | put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("maxTemperature"), Bytes.toBytes(fields[5])); 89 | put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("minTemperature"), Bytes.toBytes(fields[6])); 90 | put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("weather"), Bytes.toBytes(fields[7])); 91 | put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("windDirection"), Bytes.toBytes(fields[8])); 92 | put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("windPower"), Bytes.toBytes(fields[9])); 93 | context.write(rowKey, put); 94 | } 95 | } 96 | } 97 | 98 | -------------------------------------------------------------------------------- /src/main/java/com/cuteximi/web/DomTool.java: -------------------------------------------------------------------------------- 1 | package com.cuteximi.web; 2 | 3 | import java.util.HashSet; 4 | import java.util.Iterator; 5 | import java.util.Set; 6 | import org.apache.commons.logging.Log; 7 | import org.apache.commons.logging.LogFactory; 8 | import org.apache.http.HttpResponse; 9 | import org.apache.http.client.methods.HttpGet; 10 | import org.apache.http.impl.client.DefaultHttpClient; 11 | import org.apache.http.util.EntityUtils; 12 | import org.jsoup.Jsoup; 13 | import 
org.jsoup.nodes.Document; 14 | import org.jsoup.nodes.Element; 15 | import org.jsoup.select.Elements; 16 | 17 | /** 18 | * @program: weathermrs 19 | * @description: 网络工具 20 | * @author: TSL 21 | * @create: 2018-11-30 22:35 22 | **/ 23 | public class DomTool { 24 | private static final Log LOG = LogFactory.getLog(DomTool.class.getName()); 25 | 26 | public DomTool() { 27 | } 28 | 29 | public Document getDom(String url, DefaultHttpClient httpClient, String charset) { 30 | HttpGet get = new HttpGet(url); 31 | HttpResponse response = null; 32 | Object dom = null; 33 | 34 | try { 35 | response = httpClient.execute(get); 36 | int statusCode = response.getStatusLine().getStatusCode(); 37 | if (502 == statusCode) { 38 | for(int i = 1; statusCode != 200 && i < 4; ++i) { 39 | response.getEntity().getContent().close(); 40 | Thread.currentThread(); 41 | Thread.sleep(10000L); 42 | System.out.println("sleep " + i * 10 + "s"); 43 | response = httpClient.execute(get); 44 | statusCode = response.getStatusLine().getStatusCode(); 45 | } 46 | } 47 | 48 | if (statusCode != 200) { 49 | LOG.info("Status code is not 200, ignore...statusCode is " + statusCode + ". url = " + url); 50 | } 51 | 52 | String html = EntityUtils.toString(response.getEntity(), charset); 53 | return Jsoup.parse(html); 54 | } catch (Exception var9) { 55 | var9.printStackTrace(); 56 | return (Document)dom; 57 | } 58 | } 59 | 60 | public Document getDom(String url, DefaultHttpClient httpClient) { 61 | return this.getDom(url, httpClient, "GBK"); 62 | } 63 | 64 | private String getUrl(String zone) { 65 | StringBuilder url = new StringBuilder(); 66 | if (!zone.startsWith("/")) { 67 | url.append("http://www.tianqihoubao.com/").append(zone); 68 | } 69 | 70 | url.append("http://www.tianqihoubao.com").append(zone); 71 | return url.toString(); 72 | } 73 | 74 | public Set getLinks(Document dom, String domSekector, String cssSelector, int type) { 75 | Set links = new HashSet(); 76 | Elements es = this.getElements(dom, domSekector, cssSelector); 77 | if (null == es) { 78 | LOG.info("Get Links error,element is null. Type is " + type); 79 | return links; 80 | } else { 81 | Iterator iterator = es.iterator(); 82 | 83 | while(true) { 84 | while(iterator.hasNext()) { 85 | Element element = (Element)iterator.next(); 86 | switch(type) { 87 | case 1: 88 | String province = element.select("a").text(); 89 | links.add(province + ";" + this.getUrl(element.select("a").attr("href"))); 90 | break; 91 | case 2: 92 | String city = element.select("dt").select("a").text(); 93 | Iterator it = element.select("dd").select("a").iterator(); 94 | 95 | while(it.hasNext()) { 96 | Element ele = (Element)it.next(); 97 | String zone = ele.text(); 98 | String zoneUrl = ele.attr("href"); 99 | String zoneId = zoneUrl.substring(zoneUrl.lastIndexOf("/") + 1, zoneUrl.indexOf(".html")); 100 | links.add(zoneId + ";" + city + ";" + zone + ";" + this.getUrl(zoneUrl)); 101 | } 102 | break; 103 | default: 104 | String url = element.attr("href"); 105 | if (url.contains("lishi")) { 106 | links.add(this.getUrl(url)); 107 | } 108 | } 109 | } 110 | 111 | return links; 112 | } 113 | } 114 | } 115 | 116 | public Elements getElements(Document dom, String domSekector, String cssSelector) { 117 | Element ele = dom.select(domSekector).first(); 118 | return null == ele ? 
null : ele.select(cssSelector); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/main/java/com/cuteximi/hbase/Scanner.java: -------------------------------------------------------------------------------- 1 | package com.cuteximi.hbase; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import org.apache.commons.logging.Log; 6 | import org.apache.commons.logging.LogFactory; 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.hbase.Cell; 9 | import org.apache.hadoop.hbase.CellUtil; 10 | import org.apache.hadoop.hbase.TableName; 11 | import org.apache.hadoop.hbase.client.Connection; 12 | import org.apache.hadoop.hbase.client.ConnectionFactory; 13 | import org.apache.hadoop.hbase.client.Result; 14 | import org.apache.hadoop.hbase.client.ResultScanner; 15 | import org.apache.hadoop.hbase.client.Scan; 16 | import org.apache.hadoop.hbase.client.Table; 17 | import org.apache.hadoop.hbase.filter.FilterList; 18 | import org.apache.hadoop.hbase.filter.SingleColumnValueFilter; 19 | import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp; 20 | import org.apache.hadoop.hbase.filter.FilterList.Operator; 21 | import org.apache.hadoop.hbase.util.Bytes; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | /** 24 | * @program: weathermrs 25 | * @description: 查询数据 26 | * @author: TSL 27 | * @create: 2018-11-30 22:29 28 | **/ 29 | public class Scanner { 30 | private static final Log LOG = LogFactory.getLog(Scanner.class.getName()); 31 | private static String CONF_DIR; 32 | private static Connection conn; 33 | private static Configuration conf; 34 | private static TableName tableName; 35 | public String TABLE_NAME = "hbase_weather"; 36 | 37 | public Scanner(Configuration conf) throws IOException { 38 | conf = conf; 39 | tableName = TableName.valueOf(this.TABLE_NAME); 40 | conn = ConnectionFactory.createConnection(conf); 41 | } 42 | 43 | public void scanData(String province, String city, String zone, String time) { 44 | Table table = null; 45 | ResultScanner rScanner = null; 46 | 47 | try { 48 | table = conn.getTable(tableName); 49 | Scan scan = new Scan(); 50 | scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("province")); 51 | scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("city")); 52 | scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zone")); 53 | scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("time")); 54 | scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("maxTemperature")); 55 | scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("minTemperature")); 56 | scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("weather")); 57 | scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("windDirection")); 58 | scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("windPower")); 59 | FilterList list = new FilterList(Operator.MUST_PASS_ALL); 60 | list.addFilter(new SingleColumnValueFilter(Bytes.toBytes("info"), Bytes.toBytes("province"), CompareOp.EQUAL, Bytes.toBytes(province))); 61 | list.addFilter(new SingleColumnValueFilter(Bytes.toBytes("info"), Bytes.toBytes("city"), CompareOp.EQUAL, Bytes.toBytes(city))); 62 | list.addFilter(new SingleColumnValueFilter(Bytes.toBytes("info"), Bytes.toBytes("zone"), CompareOp.EQUAL, Bytes.toBytes(zone))); 63 | list.addFilter(new SingleColumnValueFilter(Bytes.toBytes("info"), Bytes.toBytes("time"), CompareOp.EQUAL, Bytes.toBytes(time))); 64 | scan.setFilter(list); 65 | rScanner = table.getScanner(scan); 66 | 67 | for(Result r = rScanner.next(); r != 
null; r = rScanner.next()) { 68 | Cell[] var10 = r.rawCells(); 69 | int var11 = var10.length; 70 | 71 | for(int var12 = 0; var12 < var11; ++var12) { 72 | Cell cell = var10[var12]; 73 | System.out.println(Bytes.toString(CellUtil.cloneRow(cell)) + ":" + Bytes.toString(CellUtil.cloneFamily(cell)) + "," + Bytes.toString(CellUtil.cloneQualifier(cell)) + "," + Bytes.toString(CellUtil.cloneValue(cell))); 74 | } 75 | } 76 | 77 | LOG.info("Scan data successfully."); 78 | } catch (IOException var22) { 79 | LOG.error("Scan data failed ", var22); 80 | } finally { 81 | if (rScanner != null) { 82 | rScanner.close(); 83 | } 84 | 85 | if (table != null) { 86 | try { 87 | table.close(); 88 | } catch (IOException var21) { 89 | LOG.error("Close table failed ", var21); 90 | } 91 | } 92 | 93 | } 94 | 95 | } 96 | 97 | public static void main(String[] args) throws IOException { 98 | String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs(); 99 | if (otherArgs.length != 4) { 100 | System.err.println("Usage: WeatherInfoCollector "); 101 | System.exit(4); 102 | } 103 | 104 | String province = otherArgs[0]; 105 | String city = otherArgs[1]; 106 | String zone = otherArgs[2]; 107 | String time = otherArgs[3]; 108 | 109 | try { 110 | Scanner scanner = new Scanner(conf); 111 | scanner.scanData(province, city, zone, time); 112 | } catch (Exception var8) { 113 | LOG.error("Failed to scan HBase because ", var8); 114 | } 115 | 116 | } 117 | 118 | static { 119 | CONF_DIR = System.getProperty("user.dir") + File.separator + "conf" + File.separator; 120 | conn = null; 121 | conf = null; 122 | tableName = null; 123 | } 124 | } 125 | 126 | -------------------------------------------------------------------------------- /src/main/java/com/cuteximi/hbase/HBaseOperation.java: -------------------------------------------------------------------------------- 1 | package com.cuteximi.hbase; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import org.apache.commons.logging.Log; 7 | import org.apache.commons.logging.LogFactory; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.hbase.HColumnDescriptor; 10 | import org.apache.hadoop.hbase.HTableDescriptor; 11 | import org.apache.hadoop.hbase.TableName; 12 | import org.apache.hadoop.hbase.client.Admin; 13 | import org.apache.hadoop.hbase.client.Connection; 14 | import org.apache.hadoop.hbase.client.ConnectionFactory; 15 | import org.apache.hadoop.hbase.client.Put; 16 | import org.apache.hadoop.hbase.client.Table; 17 | import org.apache.hadoop.hbase.io.compress.Compression.Algorithm; 18 | import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; 19 | import org.apache.hadoop.hbase.util.Bytes; 20 | 21 | /** 22 | * @program: weathermrs 23 | * @description: 操作 Hbase 24 | * @author: TSL 25 | * @create: 2018-11-30 22:28 26 | **/ 27 | public class HBaseOperation { 28 | private static final Log LOG = LogFactory.getLog(HBaseOperation.class.getName()); 29 | private Configuration conf = null; 30 | private static Connection conn = null; 31 | 32 | public HBaseOperation(Configuration conf) throws IOException { 33 | this.conf = conf; 34 | conn = ConnectionFactory.createConnection(conf); 35 | } 36 | 37 | public void createTable(TableName tableName) { 38 | HTableDescriptor htd = new HTableDescriptor(tableName); 39 | HColumnDescriptor hcd = new HColumnDescriptor("info"); 40 | hcd.setDataBlockEncoding(DataBlockEncoding.FAST_DIFF); 41 | hcd.setCompressionType(Algorithm.SNAPPY); 42 | htd.addFamily(hcd); 43 | 
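        // The 'info' family is created with FAST_DIFF block encoding and SNAPPY compression (set above);
        // the SNAPPY codec must be available on the cluster, otherwise creating or opening the table will fail.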
        Admin admin = null;

        try {
            admin = conn.getAdmin();
            if (!admin.tableExists(tableName)) {
                LOG.info("Creating table...");
                admin.createTable(htd);
                LOG.info(admin.getClusterStatus());
                LOG.info(admin.listNamespaceDescriptors());
                LOG.info("Table created successfully.");
            } else {
                LOG.warn("table already exists");
            }
        } catch (IOException var14) {
            LOG.error("Create table failed.", var14);
        } finally {
            if (admin != null) {
                try {
                    admin.close();
                } catch (IOException var13) {
                    LOG.error("Failed to close admin ", var13);
                }
            }
        }
    }

    public static void putData(String line, TableName tableName) {
        Table table = null;

        try {
            table = conn.getTable(tableName);
            List puts = new ArrayList();
            String[] fields = line.split(",", -1);
            String rowkey = fields[0] + " " + fields[4];
            Put put = new Put(Bytes.toBytes(rowkey));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("id"), Bytes.toBytes(fields[0]));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("province"), Bytes.toBytes(fields[1]));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("city"), Bytes.toBytes(fields[2]));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zone"), Bytes.toBytes(fields[3]));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("time"), Bytes.toBytes(fields[4]));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("maxTemperature"), Bytes.toBytes(fields[5]));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("minTemperature"), Bytes.toBytes(fields[6]));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("weather"), Bytes.toBytes(fields[7]));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("windDirection"), Bytes.toBytes(fields[8]));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("windPower"), Bytes.toBytes(fields[9]));
            puts.add(put);
            table.put(puts);
        } catch (IOException var15) {
            LOG.error("Put failed ", var15);
        } finally {
            if (table != null) {
                try {
                    table.close();
                } catch (IOException var14) {
                    LOG.error("Close table failed ", var14);
                }
            }
        }
    }

    public void dropTable(TableName tableName) {
        Admin admin = null;

        try {
            admin = conn.getAdmin();
            if (admin.tableExists(tableName)) {
                admin.disableTable(tableName);
                admin.deleteTable(tableName);
            }

            LOG.info("Drop table successfully.");
        } catch (IOException var12) {
            LOG.error("Drop table failed ", var12);
        } finally {
            if (admin != null) {
                try {
                    admin.close();
                } catch (IOException var11) {
                    LOG.error("Close admin failed ", var11);
                }
            }
        }
    }

    public void clean() {
        if (conn != null) {
            try {
                conn.close();
            } catch (Exception var2) {
                LOG.error("Failed to close the connection ", var2);
            }
        }
    }
}
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>weather-crawl</groupId>
    <artifactId>weather-crawl</artifactId>
    <version>mrs</version>

    <properties>
        <hbase.version>1.3.1.0302-mrs-1.7.0</hbase.version>
        <zookeeper.version>3.5.1-mrs-1.7.0</zookeeper.version>
        <hadoop.version>2.8.3-mrs-1.7.0</hadoop.version>
        <jackson.version>1.9.13</jackson.version>
        <flume.version>1.6.0</flume.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <repositories>
        <repository>
            <id>huaweicloudsdk</id>
            <url>https://mirrors.huaweicloud.com/repository/maven/huaweicloudsdk/</url>
            <releases><enabled>true</enabled></releases>
            <snapshots><enabled>true</enabled></snapshots>
        </repository>
        <repository>
            <id>central</id>
            <name>Maven Central</name>
            <url>http://repo2.maven.org/maven2/</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-protocol</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-rest</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbaseFileStream</groupId>
            <artifactId>hbaseFileStream</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-hindex</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <type>test-jar</type>
            <scope>test</scope>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>12.0.1</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-core-asl</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-mapper-asl</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-jaxrs</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-xc</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>org.apache.zookeeper</groupId>
            <artifactId>zookeeper</artifactId>
            <version>${zookeeper.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-auth</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>0.10.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>${flume.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-configuration</artifactId>
            <version>${flume.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flume.flume-ng-sinks</groupId>
            <artifactId>flume-ng-hbase-sink</artifactId>
            <version>1.6.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-site-plugin</artifactId>
                <configuration>
                    <skip>true</skip>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-eclipse-plugin</artifactId>
                <version>2.10</version>
                <configuration>
                    <classpathContainers>
                        <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                    </classpathContainers>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.4</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>com.cuteximi.crawl.WeatherCrawler</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/src/main/java/com/cuteximi/crawl/WeatherCrawler.java:
--------------------------------------------------------------------------------
package com.cuteximi.crawl;

import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.util.Iterator;
import java.util.Set;

import com.cuteximi.hbase.HBaseOperation;
import com.cuteximi.hive.ClientInfo;
import com.cuteximi.hive.HiveJDBC;
import com.cuteximi.web.DomTool;
import com.cuteximi.kafka.Producer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.http.impl.client.DefaultHttpClient;
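// Messages are published through this project's com.cuteximi.kafka.Producer wrapper (the old Scala
// producer API configured via KafkaProperties); the new kafka-clients producer import below stays commented out.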
//import org.apache.kafka.clients.producer.Producer; 21 | import org.jsoup.nodes.Document; 22 | import org.jsoup.nodes.Element; 23 | import org.jsoup.select.Elements; 24 | 25 | /** 26 | * @program: weathermrs 27 | * @description: 爬取天气,以及建表,数据分发等 28 | * @author: TSL 29 | * @create: 2018-11-30 22:27 30 | **/ 31 | public class WeatherCrawler{ 32 | private static final Log LOG = LogFactory.getLog(WeatherCrawler.class.getName()); 33 | private static final int PROVINCE_TYPE = 1; 34 | private static final int ZONE_TYPE = 2; 35 | private static final int WEATHER_TYPE = 3; 36 | public static final String TOPIC = "weather"; 37 | private static DomTool domTool = new DomTool(); 38 | private static TableName hbaseTable = null; 39 | private static HBaseOperation hbase = null; 40 | private static Producer producerThread = null; 41 | 42 | public WeatherCrawler() { 43 | } 44 | 45 | public static void main(String[] args) { 46 | Configuration conf = HBaseConfiguration.create(); 47 | String hbaseTableName = "hbase_weather"; 48 | String hiveTableName = "hive_weather"; 49 | hbaseTable = TableName.valueOf(hbaseTableName); 50 | 51 | try { 52 | hbase = new HBaseOperation(conf); 53 | } catch (IOException var5) { 54 | LOG.error("Failed to init HBase because ", var5); 55 | } 56 | 57 | hbase.createTable(hbaseTable); 58 | createHiveTable(hiveTableName, hbaseTableName); 59 | producerThread = new Producer("weather"); 60 | weatherCrawl("北京", "http://www.tianqihoubao.com/lishi/index.htm"); 61 | LOG.info("-----------finish to crawl weather info-------------------"); 62 | } 63 | 64 | private static void weatherCrawl(String diffProvince, String url) { 65 | DefaultHttpClient httpclient = new DefaultHttpClient(); 66 | Document dom = domTool.getDom(url, httpclient); 67 | Set cityLinks = domTool.getLinks(dom, ".citychk", "dt", 1); 68 | Iterator var5 = cityLinks.iterator(); 69 | 70 | while(true) { 71 | label49: 72 | while(true) { 73 | String province; 74 | String citylink; 75 | do { 76 | if (!var5.hasNext()) { 77 | httpclient.close(); 78 | return; 79 | } 80 | 81 | String citylinks = (String)var5.next(); 82 | province = citylinks.split(";")[0]; 83 | citylink = citylinks.split(";")[1]; 84 | } while(!diffProvince.equals(province)); 85 | 86 | Document cityDom = domTool.getDom(citylink, httpclient, "utf-8"); 87 | if (null == cityDom) { 88 | LOG.info("Error city link = " + citylink); 89 | } else { 90 | Set zonelinks = domTool.getLinks(cityDom, ".citychk", "dl", 2); 91 | Iterator var11 = zonelinks.iterator(); 92 | 93 | while(true) { 94 | while(true) { 95 | if (!var11.hasNext()) { 96 | continue label49; 97 | } 98 | 99 | String zoneLink = (String)var11.next(); 100 | String zoneId = zoneLink.split(";")[0]; 101 | String city = zoneLink.split(";")[1]; 102 | String zone = zoneLink.split(";")[2]; 103 | String zonelink = zoneLink.split(";")[3]; 104 | Document zoneDom = domTool.getDom(zonelink, httpclient); 105 | if (null == zoneDom) { 106 | LOG.info("Error zone link = " + zonelink); 107 | } else { 108 | String regionInfo = zoneId + "," + province + "," + city + "," + zone + ","; 109 | if (null != regionInfo) { 110 | Set weatherLinks = domTool.getLinks(zoneDom, ".wdetail", "a", 3); 111 | Iterator var20 = weatherLinks.iterator(); 112 | 113 | while(var20.hasNext()) { 114 | String weatherlink = (String)var20.next(); 115 | Document weatherDom = domTool.getDom(weatherlink, httpclient); 116 | if (null == weatherDom) { 117 | LOG.info("Error weather link = " + weatherlink); 118 | } else { 119 | crawl2Kafka(weatherDom, regionInfo); 120 | } 121 | } 122 | } 
123 | } 124 | } 125 | } 126 | } 127 | } 128 | } 129 | } 130 | 131 | private static void crawl2Kafka(Document dom, String regionInfo) { 132 | Elements es = domTool.getElements(dom, ".wdetail", "tr"); 133 | if (es.isEmpty()) { 134 | LOG.info("Save weather file error, element is null."); 135 | } else { 136 | es.remove(0); 137 | Iterator var3 = es.iterator(); 138 | 139 | while(var3.hasNext()) { 140 | Element ele = (Element)var3.next(); 141 | Elements cells = ele.select("td"); 142 | StringBuilder line = new StringBuilder(); 143 | line.append(regionInfo); 144 | 145 | for(int index = 0; index < cells.size(); ++index) { 146 | String high; 147 | if (index == 0) { 148 | high = ((Element)cells.get(index)).text().replace("年", "-"); 149 | high = high.replace("月", "-"); 150 | high = high.replace("日", ""); 151 | line.append(high); 152 | } 153 | 154 | if (index == 1 || index == 3) { 155 | line.append(((Element)cells.get(index)).text()); 156 | } 157 | 158 | if (index == 2) { 159 | high = ((Element)cells.get(index)).text().split("/")[0]; 160 | String low = ((Element)cells.get(index)).text().split("/")[1]; 161 | if (high.length() == 0 || low.length() == 0) { 162 | continue; 163 | } 164 | 165 | String high1 = high.substring(0, high.length() - 2); 166 | String low1 = low.substring(0, low.length() - 1); 167 | line.append(high1 + "," + low1.trim()); 168 | } 169 | 170 | if (index != cells.size() - 1) { 171 | line.append(","); 172 | } else { 173 | line.append("\n"); 174 | } 175 | } 176 | 177 | producerThread.run(line.toString()); 178 | } 179 | 180 | } 181 | } 182 | 183 | private static void createHiveTable(String hiveTable, String hbaseTable) { 184 | String clientProperties = System.getProperty("user.dir") + File.separator + "conf" + File.separator + "hiveclient.properties"; 185 | 186 | ClientInfo clientInfo; 187 | boolean isSecurityMode; 188 | try { 189 | clientInfo = new ClientInfo(clientProperties); 190 | isSecurityMode = "KERBEROS".equalsIgnoreCase(clientInfo.getAuth()); 191 | } catch (IOException var9) { 192 | LOG.error("Failed to login because ", var9); 193 | return; 194 | } 195 | 196 | String sql = "CREATE EXTERNAL TABLE IF NOT EXISTS " + hiveTable + "(rowkey string, id string,province string, city string, zone string, time string,maxTemperature int, minTemperature int, weather string, windPower string) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES (\"hbase.columns.mapping\" = \":key,info:id,info:province,info:city,info:zone,info:time,info:maxTemperature,info:minTemperature,info:weather,info:windPower\") TBLPROPERTIES(\"hbase.table.name\" = \"" + hbaseTable + "\")"; 197 | HiveJDBC hiveJdbc = new HiveJDBC(clientInfo, isSecurityMode); 198 | 199 | try { 200 | Connection connection = hiveJdbc.getConnection(); 201 | HiveJDBC.execDDL(connection, sql); 202 | } catch (Exception var8) { 203 | var8.printStackTrace(); 204 | } 205 | 206 | } 207 | } 208 | --------------------------------------------------------------------------------