├── .gitignore
├── README.md
├── people.parquet
│   ├── ._SUCCESS.crc
│   ├── .part-00000-c7a8f5d9-c2f3-4f58-b8b4-fd35faa27324-c000.snappy.parquet.crc
│   └── _SUCCESS
├── pom.xml
├── sb-word-count
│   ├── README.md
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── spark
│           │       ├── Application.java
│           │       ├── config
│           │       │   └── ApplicationConfig.java
│           │       ├── controller
│           │       │   └── WebController.java
│           │       ├── service
│           │       │   └── WordCountService.java
│           │       ├── test
│           │       │   ├── java
│           │       │   │   ├── EmailFilter.java
│           │       │   │   ├── Test.java
│           │       │   │   └── Word2VecTest.java
│           │       │   └── scala
│           │       │       ├── LinearRegression.scala
│           │       │       └── lr.txt
│           │       ├── textmatch
│           │       │   ├── SimHash.java
│           │       │   └── TextMatch.scala
│           │       └── util
│           │           ├── BaseResp.java
│           │           └── ResultStatus.java
│           └── resources
│               ├── application.properties
│               ├── blsmy.txt
│               ├── file
│               │   ├── hadoop.dll
│               │   ├── hdfs.dll
│               │   ├── winutils.exe
│               │   └── zlib1.dll
│               ├── ham.txt
│               ├── log.txt
│               ├── log4j.properties
│               └── spam.txt
├── spark-pi
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── spark
│                       └── SparkPI.java
├── spark-sql
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── com
│           │       └── spark
│           │           ├── JavaSparkSql.java
│           │           └── entity
│           │               └── People.java
│           └── resources
│               └── people.json
└── word-count
    ├── pom.xml
    └── src
        └── main
            ├── java
            │   └── com
            │       └── spark
            │           ├── WordCount.java
            │           └── streaming
            │               └── SparkStreamingDemo.java
            └── resources
                ├── blsmy.txt
                └── stdout

/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Mobile Tools for Java (J2ME)
4 | .mtj.tmp/
5 |
6 | # Package Files #
7 | *.jar
8 | *.war
9 | *.ear
10 |
11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
12 | hs_err_pid*
13 |
14 |
15 | target/
16 | !.mvn/wrapper/maven-wrapper.jar
17 |
18 | ### STS ###
19 | .apt_generated
20 | .classpath
21 | .factorypath
22 | .project
23 | .settings
24 | .springBeans
25 |
26 | ### IntelliJ IDEA ###
27 | .idea
28 | *.iws
29 | *.iml
30 | *.ipr
31 |
32 | ### NetBeans ###
33 | nbproject/private/
34 | builds/
35 | nbbuild/
36 | dist/
37 | nbdist/
38 | .nb-gradle/
39 |
40 | ### log
41 | logs/
42 | log/
43 |
44 | *.parquet/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # quick-spark-process
2 | Examples for learning Spark
3 |
4 | [![LICENSE](https://img.shields.io/badge/license-Anti%20996-blue.svg)](https://github.com/996icu/996.ICU/blob/master/LICENSE)
5 |
6 |
7 | ### word-count
8 | The simplest and most classic example.
9 | A Spark cluster was later set up and HDFS was used to store the input file; a few points are worth noting.
10 | #### How the input file is referenced
11 | ```java
12 | context.textFile("D:\\data\\spark\\blsmy.txt"); // for local testing in IDEA
13 | context.textFile("file:///mnt/data/blsmy.txt"); // for running on the cluster (every worker node must have this file)
14 | context.textFile("hdfs://spark-master:9000/wordcount/blsmy.txt"); // reading the file from HDFS
15 | ```
16 | #### Where the logs go
17 | The web UI exposes two logs, stderr and stdout; stdout is where the program's own output shows up. If you call println(....) in your code, that output lands in the stdout file, while the rest of the Spark runtime logs go to the stderr file.
18 | You can also inspect the log files on disk directly, e.g.:
19 | ```bash
20 | /spark/software/spark/work/app-20180428142302-0003/0/stdout
21 | /spark/software/spark/work/app-20180428142302-0003/0/stderr
22 | ```
23 | #### How to submit it
24 | ```bash
25 | bin/spark-submit \
26 | --master spark://spark-master:7077 \
27 | --driver-memory 1g \
28 | --executor-cores 1 \
29 | --class com.spark.WordCount \
30 | simple/word-count-1.0-SNAPSHOT.jar
31 | ```
32 |
33 |
34 |
35 |
36 | ### spark-pi
37 | Another classic example.
38 |
39 | ### spark-sql
40 | Simple operations with Spark SQL.
41 |
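42 | #### The core transformations
43 | Below is a minimal, self-contained sketch of the pipeline for reference; it mirrors the flatMap, mapToPair and reduceByKey flow in `word-count/src/main/java/com/spark/WordCount.java` (minus the sort-by-count step). The class name `WordCountSketch` and the `local` master are illustrative choices, not files or settings taken from this repo:
44 | ```java
45 | import org.apache.spark.SparkConf;
46 | import org.apache.spark.api.java.JavaPairRDD;
47 | import org.apache.spark.api.java.JavaRDD;
48 | import org.apache.spark.api.java.JavaSparkContext;
49 | import scala.Tuple2;
50 |
51 | import java.util.Arrays;
52 |
53 | public class WordCountSketch {
54 |     public static void main(String[] args) {
55 |         SparkConf conf = new SparkConf().setMaster("local").setAppName("WordCountSketch");
56 |         try (JavaSparkContext context = new JavaSparkContext(conf)) {
57 |             // any of the three textFile() variants above works here
58 |             JavaRDD<String> lines = context.textFile("D:\\data\\spark\\blsmy.txt");
59 |             // split each line into words
60 |             JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
61 |             // pair every word with 1, then sum the 1s per word
62 |             JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
63 |             JavaPairRDD<String, Integer> counts = pairs.reduceByKey((a, b) -> a + b);
64 |             counts.collect().forEach(t -> System.out.println(t._1 + ": " + t._2));
65 |         }
66 |     }
67 | }
68 | ```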
-------------------------------------------------------------------------------- /people.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /people.parquet/.part-00000-c7a8f5d9-c2f3-4f58-b8b4-fd35faa27324-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/people.parquet/.part-00000-c7a8f5d9-c2f3-4f58-b8b4-fd35faa27324-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /people.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/people.parquet/_SUCCESS -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.spark.sample 8 | quick-spark-process 9 | 1.0-SNAPSHOT 10 | 11 | word-count 12 | spark-pi 13 | spark-sql 14 | sb-word-count 15 | 16 | pom 17 | 18 | 19 | 20 | 21 | org.apache.maven.plugins 22 | maven-compiler-plugin 23 | 3.6.0 24 | 25 | 1.8 26 | 1.8 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /sb-word-count/README.md: -------------------------------------------------------------------------------- 1 | # sb-word-count 2 | 只是使用springboot把spark实现的wordcount整合了一下,并咩有做太复杂的计算 3 | 4 | 5 | 6 | *对《巴黎圣母院》英文版做的统计* 7 | [![WX20180602-120140@2x.png](https://i.loli.net/2018/06/02/5b1216a4b14b7.png)](https://i.loli.net/2018/06/02/5b1216a4b14b7.png) 8 | -------------------------------------------------------------------------------- /sb-word-count/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | sb-word-count 6 | com.spark.sample 7 | 1.0-SNAPSHOT 8 | 4.0.0 9 | 10 | 使用springboot整合spark。 11 | 12 | 13 | 14 | org.springframework.boot 15 | spring-boot-starter-parent 16 | 2.0.2.RELEASE 17 | 18 | 19 | 20 | 21 | 2.12 22 | 2.13.0 23 | 2.4.0 24 | 25 | 26 | 27 | 28 | 29 | org.springframework.boot 30 | spring-boot-starter-web 31 | 32 | 33 | org.springframework.boot 34 | spring-boot-starter-logging 35 | 36 | 37 | 38 | 39 | org.springframework.boot 40 | spring-boot-starter-log4j 41 | 42 | 43 | org.springframework.boot 44 | spring-boot-starter-test 45 | test 46 | 47 | 48 | org.springframework.boot 49 | spring-boot-starter-thymeleaf 50 | 51 | 52 | 53 | org.scala-lang 54 | scala-library 55 | ${scala.version} 56 | 57 | 58 | com.fasterxml.jackson.core 59 | jackson-databind 60 | 2.10.1 61 | 62 | 63 | org.apache.spark 64 | spark-core_${spark.core.version} 65 | ${spark.version} 66 | 67 | 68 | org.slf4j 69 | slf4j-log4j12 70 | 71 | 72 | log4j 73 | log4j 74 | 75 | 76 | 77 | 78 | org.apache.spark 79 | spark-launcher_${spark.core.version} 80 | ${spark.version} 81 | 82 | 83 | org.apache.spark 84 | spark-mllib_${spark.core.version} 85 | ${spark.version} 86 | 87 | 88 | org.apache.spark 89 | spark-streaming_${spark.core.version} 90 | ${spark.version} 91 | 92 | 93 | junit 94 | junit 95 | 4.4 96 | test 97 | 98 | 99 | org.specs 100 | specs 101 | 1.2.5 102 | test 103 | 104 | 105 | 106 | org.ansj 107 | ansj_seg 108 | 5.1.1 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 
org.springframework.boot 117 | spring-boot-maven-plugin 118 | 119 | 120 | org.apache.maven.plugins 121 | maven-compiler-plugin 122 | 123 | 1.8 124 | 1.8 125 | 126 | 127 | 128 | org.apache.maven.plugins 129 | maven-surefire-plugin 130 | 2.8.1 131 | 132 | 133 | **/*.java 134 | **/*.scala 135 | 136 | 137 | 138 | 139 | org.scala-tools 140 | maven-scala-plugin 141 | 2.15.2 142 | 143 | 144 | scala-compile-first 145 | process-resources 146 | 147 | compile 148 | 149 | 150 | 151 | scala-test-compile 152 | process-test-resources 153 | 154 | testCompile 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/Application.java: -------------------------------------------------------------------------------- 1 | package spark; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | /** 7 | * Created by wangxc on 2017/3/9. 8 | */ 9 | @SpringBootApplication 10 | public class Application { 11 | 12 | public static void main(String[] args) { 13 | SpringApplication.run(Application.class, args); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/config/ApplicationConfig.java: -------------------------------------------------------------------------------- 1 | package spark.config; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaSparkContext; 5 | import org.springframework.beans.factory.annotation.Autowired; 6 | import org.springframework.beans.factory.annotation.Value; 7 | import org.springframework.context.annotation.Bean; 8 | import org.springframework.context.annotation.Configuration; 9 | import org.springframework.context.support.PropertySourcesPlaceholderConfigurer; 10 | import org.springframework.core.env.Environment; 11 | import org.springframework.util.ResourceUtils; 12 | 13 | import java.io.File; 14 | import java.io.FileNotFoundException; 15 | 16 | /** 17 | * Created by achat1 on 9/22/15. 
18 | */ 19 | @Configuration 20 | public class ApplicationConfig { 21 | 22 | @Autowired 23 | private Environment env; 24 | 25 | @Value("${spark.app.name}") 26 | private String appName; 27 | 28 | @Value("${spark.home}") 29 | private String sparkHome; 30 | 31 | @Value("${spark.master.uri}") 32 | private String masterUri; 33 | 34 | @Bean 35 | public SparkConf sparkConf() throws FileNotFoundException { 36 | File file = ResourceUtils.getFile("classpath:file/winutils.exe"); 37 | System.out.println(appName); 38 | System.out.println(sparkHome); 39 | System.out.println(masterUri); 40 | SparkConf sparkConf = new SparkConf() 41 | .setAppName(appName) 42 | .setSparkHome(sparkHome) 43 | .setMaster(masterUri) 44 | .set("spark.testing.memory", "2147480000"); 45 | 46 | return sparkConf; 47 | } 48 | 49 | @Bean 50 | public JavaSparkContext javaSparkContext() throws FileNotFoundException { 51 | return new JavaSparkContext(sparkConf()); 52 | } 53 | // 54 | // @Bean 55 | // public SparkSession sparkSession() { 56 | // return SparkSession 57 | // .builder() 58 | // .sparkContext(javaSparkContext().sc()) 59 | // .appName("Java Spark SQL basic example") 60 | // .getOrCreate(); 61 | // } 62 | 63 | @Bean 64 | public static PropertySourcesPlaceholderConfigurer propertySourcesPlaceholderConfigurer() { 65 | return new PropertySourcesPlaceholderConfigurer(); 66 | } 67 | 68 | } -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/controller/WebController.java: -------------------------------------------------------------------------------- 1 | package spark.controller; 2 | 3 | import org.apache.log4j.Logger; 4 | import org.springframework.web.bind.annotation.RequestMapping; 5 | import org.springframework.web.bind.annotation.ResponseBody; 6 | import org.springframework.web.bind.annotation.RestController; 7 | import spark.service.WordCountService; 8 | import spark.util.BaseResp; 9 | import spark.util.ResultStatus; 10 | 11 | import javax.annotation.Resource; 12 | import java.io.FileNotFoundException; 13 | import java.util.Map; 14 | 15 | /** 16 | * Created with IDEA 17 | * User: vector 18 | * Data: 2017/4/13 19 | * Time: 18:02 20 | * Description: 21 | */ 22 | @RestController 23 | @RequestMapping("/spark") 24 | public class WebController { 25 | private Logger logger = Logger.getLogger(WebController.class); 26 | 27 | @Resource 28 | private WordCountService wordCountService; 29 | 30 | @RequestMapping("/wordCount") 31 | @ResponseBody 32 | public BaseResp> wordCount(){ 33 | 34 | logger.info("start submit spark tast..."); 35 | Map counts = null; 36 | try { 37 | counts = wordCountService.run(); 38 | } catch (FileNotFoundException e) { 39 | return new BaseResp<>(ResultStatus.error_record_not_found); 40 | } 41 | 42 | return new BaseResp<>(ResultStatus.SUCCESS,counts); 43 | } 44 | 45 | 46 | 47 | @RequestMapping("/hello") 48 | public BaseResp pring(){ 49 | return new BaseResp<>(ResultStatus.SUCCESS,"hihi"); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/service/WordCountService.java: -------------------------------------------------------------------------------- 1 | package spark.service; 2 | 3 | import org.apache.spark.api.java.JavaPairRDD; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.stereotype.Component; 8 | import 
org.springframework.util.ResourceUtils; 9 | import scala.Tuple2; 10 | 11 | import java.io.File; 12 | import java.io.FileNotFoundException; 13 | import java.io.Serializable; 14 | import java.util.Arrays; 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Map; 18 | import java.util.regex.Pattern; 19 | 20 | @Component 21 | public class WordCountService implements Serializable { 22 | private static final Pattern SPACE = Pattern.compile(" "); 23 | 24 | @Autowired 25 | private transient JavaSparkContext sc; 26 | 27 | public Map run() throws FileNotFoundException { 28 | Map result = new HashMap<>(); 29 | File file = ResourceUtils.getFile("classpath:blsmy.txt"); 30 | JavaRDD lines = sc.textFile(file.getAbsolutePath()); 31 | JavaRDD words = lines.flatMap(word -> Arrays.asList(word.split(" ")).iterator()); 32 | JavaPairRDD ones = words.mapToPair(s -> new Tuple2<>(s, 1)); 33 | JavaPairRDD counts = ones.reduceByKey((Integer i1, Integer i2) -> (i1 + i2)); 34 | List> output = counts.collect(); 35 | output.forEach(item -> result.put(item._1(), item._2())); 36 | /** 37 | JavaRDD words = lines.flatMap(new FlatMapFunction() { 38 | @Override public Iterable call(String s) throws Exception { 39 | return Arrays.asList(SPACE.split(s)); 40 | } 41 | }); 42 | JavaPairRDD ones = words.mapToPair(new PairFunction() { 43 | private static final long serialVersionUID = 1L; 44 | public Tuple2 call(String s) { 45 | return new Tuple2(s, 1); 46 | } 47 | }); 48 | JavaPairRDD counts = ones.reduceByKey(new Function2() { 49 | private static final long serialVersionUID = 1L; 50 | 51 | public Integer call(Integer i1, Integer i2) { 52 | return i1 + i2; 53 | } 54 | }); 55 | List> output = counts.collect(); 56 | for (Tuple2 tuple : output) { 57 | result.put(tuple._1(), tuple._2()); 58 | } 59 | */ 60 | return result; 61 | 62 | } 63 | } 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/test/java/EmailFilter.java: -------------------------------------------------------------------------------- 1 | package spark.test.java; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.mllib.classification.LogisticRegressionModel; 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD; 8 | import org.apache.spark.mllib.feature.HashingTF; 9 | import org.apache.spark.mllib.linalg.Vector; 10 | import org.apache.spark.mllib.regression.LabeledPoint; 11 | 12 | import java.util.Arrays; 13 | 14 | /** 15 | * Created with IDEA 16 | * User: vector 17 | * Data: 2017/5/5 18 | * Time: 10:34 19 | * Description: 20 | */ 21 | public class EmailFilter { 22 | public static void main(String[] args) { 23 | SparkConf conf = new SparkConf().setMaster("local").setAppName("垃圾邮件分类"); 24 | JavaSparkContext sc = new JavaSparkContext(conf); 25 | JavaRDD ham = sc.textFile("D:\\githubspace\\springbootquick\\src\\main\\resources\\ham.txt"); 26 | JavaRDD spam = sc.textFile("D:\\githubspace\\springbootquick\\src\\main\\resources\\spam.txt"); 27 | final HashingTF tf = new HashingTF(10000); 28 | JavaRDD posExamples = spam 29 | .map(h -> new LabeledPoint(1, tf.transform(Arrays.asList(h.split(" "))))); 30 | JavaRDD negExamples = ham 31 | .map(s -> new LabeledPoint(0, tf.transform(Arrays.asList(s.split(" "))))); 32 | JavaRDD trainingData = posExamples.union(negExamples); 33 | trainingData.cache(); 34 | LogisticRegressionWithSGD 
lrLearner = new LogisticRegressionWithSGD(); 35 | LogisticRegressionModel model = lrLearner.run(trainingData.rdd()); 36 | Vector posTestExample = tf.transform(Arrays.asList("O M G GET cheap stuff by sending money to ...".split(" "))); 37 | System.out.println(posTestExample.toJson()); 38 | Vector negTestExample = tf 39 | .transform(Arrays.asList("Hi Dad, I started studying Spark the other ...".split(" "))); 40 | System.out.println("Prediction for positive test example: " + model.predict(posTestExample)); 41 | System.out.println("Prediction for negative test example: " + model.predict(negTestExample)); 42 | 43 | 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/test/java/Test.java: -------------------------------------------------------------------------------- 1 | package spark.test.java; 2 | 3 | import org.apache.spark.mllib.linalg.Vector; 4 | import org.apache.spark.mllib.linalg.Vectors; 5 | 6 | /** 7 | * Created with IDEA 8 | * User: vector 9 | * Data: 2017/5/5 10 | * Time: 11:50 11 | * Description: 12 | */ 13 | public class Test { 14 | public static void main(String[] args) { 15 | // 稠密向量 16 | Vector denseVec = Vectors.dense(1.0,2.0,3.0); 17 | System.out.println(denseVec); 18 | // 稠密向量 19 | Vector sparseVec = Vectors.sparse(4,new int[]{0,2},new double[]{1.0,2.0}); 20 | System.out.println(sparseVec); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/test/java/Word2VecTest.java: -------------------------------------------------------------------------------- 1 | package spark.test.java; 2 | 3 | import com.google.common.base.Strings; 4 | import org.apache.spark.SparkConf; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.mllib.feature.Word2Vec; 8 | import org.apache.spark.mllib.feature.Word2VecModel; 9 | import org.apache.spark.sql.SQLContext; 10 | import scala.Tuple2; 11 | 12 | import java.util.Arrays; 13 | import java.util.List; 14 | 15 | /** 16 | * Created with IDEA 17 | * User: vector 18 | * Data: 2017/5/5 19 | * Time: 14:07 20 | * Description: 21 | */ 22 | public class Word2VecTest { 23 | public static void main(String[] args) { 24 | 25 | 26 | } 27 | 28 | static void trainModelAndDo(){ 29 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Word2Vec"); 30 | JavaSparkContext sc = new JavaSparkContext(conf); 31 | SQLContext sqlContext = new SQLContext(sc); 32 | String sentence = Strings.repeat("a b ", 100) + Strings.repeat("a c ", 10); 33 | List words = Arrays.asList(sentence.split(" ")); 34 | List> localDoc = Arrays.asList(words, words); 35 | JavaRDD> doc = sc.parallelize(localDoc); 36 | Word2Vec word2vec = new Word2Vec() 37 | .setVectorSize(10) 38 | .setSeed(42L); 39 | Word2VecModel model = word2vec.fit(doc); 40 | // model.save(sc.sc(),"D:\\data\\sparkModel"); 41 | Tuple2[] syms = model.findSynonyms("a", 2); 42 | System.out.println(syms.length); 43 | System.out.println(syms[0]._1()); 44 | System.out.println(syms[1]._1()); 45 | } 46 | 47 | static void loadModelAndDo(){ 48 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Word2Vec"); 49 | JavaSparkContext sc = new JavaSparkContext(conf); 50 | SQLContext sqlContext = new SQLContext(sc); 51 | // sqlContext. 
52 | } 53 | } 54 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/test/scala/LinearRegression.scala: -------------------------------------------------------------------------------- 1 | package com.spark.scala 2 | 3 | import org.apache.spark.mllib.linalg.Vectors 4 | import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | /** 8 | * Created with IDEA 9 | * User: vector 10 | * Data: 2017/5/5 11 | * Time: 9:58 12 | * Description: 13 | */ 14 | object LinearRegression { 15 | val conf = new SparkConf().setMaster("local").setAppName("LinearRegression") 16 | val sc = new SparkContext(conf) 17 | 18 | def main(args: Array[String]): Unit = { 19 | val data = sc.textFile("D:\\githubspace\\springbootquick\\src\\main\\java\\com\\quick\\scala\\lr.txt") 20 | val parsedData = data.map { line => 21 | val parts = line.split('|') 22 | LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(',').map(_.toDouble))) 23 | }.cache() 24 | 25 | val model = LinearRegressionWithSGD.train(parsedData, 2, 0.1) 26 | 27 | val result = model.predict(Vectors.dense(1, 3)) 28 | 29 | println(model.weights) 30 | println(model.weights.size) 31 | println(result) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/test/scala/lr.txt: -------------------------------------------------------------------------------- 1 | 5|1,1 2 | 8|1,2 3 | 7|2,1 4 | 13|2,3 5 | 18|3,4 -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/textmatch/SimHash.java: -------------------------------------------------------------------------------- 1 | package spark.textmatch; 2 | 3 | import java.math.BigInteger; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.StringTokenizer; 7 | 8 | public class SimHash { 9 | 10 | private String tokens; 11 | 12 | private BigInteger intSimHash; 13 | 14 | private String strSimHash; 15 | 16 | private int hashbits = 64; 17 | 18 | public SimHash(String tokens) { 19 | this.tokens = tokens; 20 | this.intSimHash = this.simHash(); 21 | } 22 | 23 | public SimHash(String tokens, int hashbits) { 24 | this.tokens = tokens; 25 | this.hashbits = hashbits; 26 | this.intSimHash = this.simHash(); 27 | } 28 | 29 | public BigInteger simHash() { 30 | int[] v = new int[this.hashbits]; 31 | StringTokenizer stringTokens = new StringTokenizer(this.tokens); 32 | while (stringTokens.hasMoreTokens()) { 33 | String temp = stringTokens.nextToken(); 34 | BigInteger t = this.hash(temp); 35 | for (int i = 0; i < this.hashbits; i++) { 36 | BigInteger bitmask = new BigInteger("1").shiftLeft(i); 37 | if (t.and(bitmask).signum() != 0) { 38 | v[i] += 1; 39 | } else { 40 | v[i] -= 1; 41 | } 42 | } 43 | } 44 | BigInteger fingerprint = new BigInteger("0"); 45 | StringBuffer simHashBuffer = new StringBuffer(); 46 | for (int i = 0; i < this.hashbits; i++) { 47 | if (v[i] >= 0) { 48 | fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i)); 49 | simHashBuffer.append("1"); 50 | }else{ 51 | simHashBuffer.append("0"); 52 | } 53 | } 54 | this.strSimHash = simHashBuffer.toString(); 55 | System.out.println(this.strSimHash + " length " + this.strSimHash.length()); 56 | return fingerprint; 57 | } 58 | 59 | private BigInteger hash(String source) { 60 | if (source == null || source.length() == 0) { 61 | return new BigInteger("0"); 62 | } else 
{ 63 | char[] sourceArray = source.toCharArray(); 64 | BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7); 65 | BigInteger m = new BigInteger("1000003"); 66 | BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract( 67 | new BigInteger("1")); 68 | for (char item : sourceArray) { 69 | BigInteger temp = BigInteger.valueOf((long) item); 70 | x = x.multiply(m).xor(temp).and(mask); 71 | } 72 | x = x.xor(new BigInteger(String.valueOf(source.length()))); 73 | if (x.equals(new BigInteger("-1"))) { 74 | x = new BigInteger("-2"); 75 | } 76 | return x; 77 | } 78 | } 79 | 80 | /** 81 | * 取两个二进制的异或,统计为1的个数,就是海明距离 82 | * @param other 83 | * @return 84 | */ 85 | 86 | public int hammingDistance(SimHash other) { 87 | 88 | BigInteger x = this.intSimHash.xor(other.intSimHash); 89 | int tot = 0; 90 | 91 | //统计x中二进制位数为1的个数 92 | //我们想想,一个二进制数减去1,那么,从最后那个1(包括那个1)后面的数字全都反了,对吧,然后,n&(n-1)就相当于把后面的数字清0, 93 | //我们看n能做多少次这样的操作就OK了。 94 | 95 | while (x.signum() != 0) { 96 | tot += 1; 97 | x = x.and(x.subtract(new BigInteger("1"))); 98 | } 99 | return tot; 100 | } 101 | 102 | /** 103 | * calculate Hamming Distance between two strings 104 | * 二进制怕有错,当成字符串,作一个,比较下结果 105 | * @author 106 | * @param str1 the 1st string 107 | * @param str2 the 2nd string 108 | * @return Hamming Distance between str1 and str2 109 | */ 110 | public int getDistance(String str1, String str2) { 111 | int distance; 112 | if (str1.length() != str2.length()) { 113 | distance = -1; 114 | } else { 115 | distance = 0; 116 | for (int i = 0; i < str1.length(); i++) { 117 | if (str1.charAt(i) != str2.charAt(i)) { 118 | distance++; 119 | } 120 | } 121 | } 122 | return distance; 123 | } 124 | 125 | /** 126 | * 如果海明距离取3,则分成四块,并得到每一块的bigInteger值 ,作为索引值使用 127 | * @param simHash 128 | * @param distance 129 | * @return 130 | */ 131 | public List subByDistance(SimHash simHash, int distance){ 132 | int numEach = this.hashbits/(distance+1); 133 | List characters = new ArrayList(); 134 | 135 | StringBuffer buffer = new StringBuffer(); 136 | 137 | int k = 0; 138 | for( int i = 0; i < this.intSimHash.bitLength(); i++){ 139 | boolean sr = simHash.intSimHash.testBit(i); 140 | 141 | if(sr){ 142 | buffer.append("1"); 143 | } 144 | else{ 145 | buffer.append("0"); 146 | } 147 | 148 | if( (i+1)%numEach == 0 ){ 149 | BigInteger eachValue = new BigInteger(buffer.toString(),2); 150 | System.out.println("----" +eachValue ); 151 | buffer.delete(0, buffer.length()); 152 | characters.add(eachValue); 153 | } 154 | } 155 | 156 | return characters; 157 | } 158 | 159 | public static void main(String[] args) { 160 | String s = "This is a test string for testing"; 161 | 162 | SimHash hash1 = new SimHash(s, 64); 163 | System.out.println(hash1.intSimHash + " " + hash1.intSimHash.bitLength()); 164 | 165 | hash1.subByDistance(hash1, 3); 166 | 167 | System.out.println("\n"); 168 | s = "This is a test string for testing, This is a test string for testing abcdef"; 169 | SimHash hash2 = new SimHash(s, 64); 170 | System.out.println(hash2.intSimHash+ " " + hash2.intSimHash.bitCount()); 171 | hash1.subByDistance(hash2, 3); 172 | s = "This is a test string for testing als"; 173 | SimHash hash3 = new SimHash(s, 64); 174 | System.out.println(hash3.intSimHash+ " " + hash3.intSimHash.bitCount()); 175 | hash1.subByDistance(hash3, 3); 176 | System.out.println("============================"); 177 | int dis = hash1.getDistance(hash1.strSimHash,hash2.strSimHash); 178 | 179 | System.out.println(hash1.hammingDistance(hash2) + " "+ dis); 180 | 181 | int dis2 = 
hash1.getDistance(hash1.strSimHash,hash3.strSimHash); 182 | 183 | System.out.println(hash1.hammingDistance(hash3) + " " + dis2); 184 | 185 | 186 | 187 | } 188 | } -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/textmatch/TextMatch.scala: -------------------------------------------------------------------------------- 1 | package com.spark.textmatch 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by wangxc on 2017/5/13. 7 | */ 8 | object TextMatch { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("TextMatch").setMaster("local[4]") 11 | val sc = new SparkContext(conf) 12 | val singles = Array("this", "is") 13 | 14 | val sentences = Array("this Date", 15 | "is there something", 16 | "where are something", 17 | "this is a string") 18 | 19 | val rdd = sc.parallelize(sentences) // create RDD 20 | 21 | val keys = singles.toSet // words required as keys. 22 | 23 | val result = rdd.flatMap { sen => 24 | val words = sen.split(" ").toSet; 25 | val common = keys & words; // intersect 26 | common.map(x => (x, sen)) // map as key -> sen 27 | }.groupByKey.mapValues(_.toArray) // group values for a key 28 | .collect 29 | println(result.length) 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/util/BaseResp.java: -------------------------------------------------------------------------------- 1 | package spark.util; 2 | 3 | import java.util.Date; 4 | 5 | /** 6 | * Created by wei on 2016/12/12. 7 | */ 8 | public class BaseResp { 9 | /** 10 | * 返回码 11 | */ 12 | private int code; 13 | 14 | /** 15 | * 返回信息描述 16 | */ 17 | private String message; 18 | 19 | /** 20 | * 返回数据 21 | */ 22 | private T data; 23 | 24 | private long currentTime; 25 | 26 | public int getCode() { 27 | return code; 28 | } 29 | 30 | public void setCode(int code) { 31 | this.code = code; 32 | } 33 | 34 | public String getMessage() { 35 | return message; 36 | } 37 | 38 | public void setMessage(String message) { 39 | this.message = message; 40 | } 41 | 42 | public Object getData() { 43 | return data; 44 | } 45 | 46 | public void setData(T data) { 47 | this.data = data; 48 | } 49 | 50 | public long getCurrentTime() { 51 | return currentTime; 52 | } 53 | 54 | public void setCurrentTime(long currentTime) { 55 | this.currentTime = currentTime; 56 | } 57 | 58 | //提供几种构造方法 59 | public BaseResp(int code, String message, T data) { 60 | this.code = code; 61 | this.message = message; 62 | this.data = data; 63 | this.currentTime = new Date().getTime(); 64 | } 65 | 66 | public BaseResp(ResultStatus resultStatus) { 67 | this.code = resultStatus.getErrorCode(); 68 | this.message = resultStatus.getErrorMsg(); 69 | this.data = data; 70 | this.currentTime = new Date().getTime(); 71 | } 72 | 73 | public BaseResp(ResultStatus resultStatus, T data) { 74 | this.code = resultStatus.getErrorCode(); 75 | this.message = resultStatus.getErrorMsg(); 76 | this.data = data; 77 | this.currentTime = new Date().getTime(); 78 | } 79 | 80 | 81 | } 82 | 83 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/util/ResultStatus.java: -------------------------------------------------------------------------------- 1 | package spark.util; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | /** 7 | 错误码 8 | * @author wei 9 | * 10 | */ 11 | public enum ResultStatus { 12 
| 13 | // -1为通用失败(根据ApiResult.java中的构造方法注释而来) 14 | FAIL(-1, "common fail"), 15 | // 0为成功 16 | SUCCESS(0, "success"), 17 | 18 | error_pic_file(3,"非法图片文件"), 19 | error_pic_upload(4,"图片上传失败"), 20 | error_record_not_found(5, "没有找到对应的数据"), 21 | error_max_page_size(6, "请求记录数超出每次请求最大允许值"), 22 | error_create_failed(7,"新增失败"), 23 | error_update_failed(8,"修改失败"), 24 | error_delete_failed(9,"删除失败"), 25 | error_search_failed(10,"查询失败"), 26 | error_count_failed(11,"查询数据总数失败"), 27 | error_string_to_obj(12,"字符串转java对象失败"), 28 | error_invalid_argument(13,"参数不合法"), 29 | error_update_not_allowed(14,"更新失败:%s"), 30 | error_duplicated_data(15,"数据已存在"), 31 | error_unknown_database_operation(16,"未知数据库操作失败,请联系管理员解决"), 32 | error_column_unique(17,"字段s%违反唯一约束性条件"), 33 | error_file_download(18,"文件下载失败"), 34 | error_file_upload(19,"文件上传失败"), 35 | 36 | //100-511为http 状态码 37 | // --- 4xx Client Error --- 38 | http_status_bad_request(400, "Bad Request"), 39 | http_status_unauthorized(401, "Unauthorized"), 40 | http_status_payment_required(402, "Payment Required"), 41 | http_status_forbidden(403, "Forbidden"), 42 | http_status_not_found(404, "Not Found"), 43 | http_status_method_not_allowed(405, "Method Not Allowed"), 44 | http_status_not_acceptable(406, "Not Acceptable"), 45 | http_status_proxy_authentication_required(407, "Proxy Authentication Required"), 46 | http_status_request_timeout(408, "Request Timeout"), 47 | http_status_conflict(409, "Conflict"), 48 | http_status_gone(410, "Gone"), 49 | http_status_length_required(411, "Length Required"), 50 | http_status_precondition_failed(412, "Precondition Failed"), 51 | http_status_payload_too_large(413, "Payload Too Large"), 52 | http_status_uri_too_long(414, "URI Too Long"), 53 | http_status_unsupported_media_type(415, "Unsupported Media Type"), 54 | http_status_requested_range_not_satisfiable(416, "Requested range not satisfiable"), 55 | http_status_expectation_failed(417, "Expectation Failed"), 56 | http_status_im_a_teapot(418, "I'm a teapot"), 57 | http_status_unprocessable_entity(422, "Unprocessable Entity"), 58 | http_status_locked(423, "Locked"), 59 | http_status_failed_dependency(424, "Failed Dependency"), 60 | http_status_upgrade_required(426, "Upgrade Required"), 61 | http_status_precondition_required(428, "Precondition Required"), 62 | http_status_too_many_requests(429, "Too Many Requests"), 63 | http_status_request_header_fields_too_large(431, "Request Header Fields Too Large"), 64 | 65 | // --- 5xx Server Error --- 66 | http_status_internal_server_error(500, "系统错误"), 67 | http_status_not_implemented(501, "Not Implemented"), 68 | http_status_bad_gateway(502, "Bad Gateway"), 69 | http_status_service_unavailable(503, "Service Unavailable"), 70 | http_status_gateway_timeout(504, "Gateway Timeout"), 71 | http_status_http_version_not_supported(505, "HTTP Version not supported"), 72 | http_status_variant_also_negotiates(506, "Variant Also Negotiates"), 73 | http_status_insufficient_storage(507, "Insufficient Storage"), 74 | http_status_loop_detected(508, "Loop Detected"), 75 | http_status_bandwidth_limit_exceeded(509, "Bandwidth Limit Exceeded"), 76 | http_status_not_extended(510, "Not Extended"), 77 | http_status_network_authentication_required(511, "Network Authentication Required"), 78 | 79 | // --- 8xx common error --- 80 | EXCEPTION(800, "exception"), 81 | INVALID_PARAM(801, "invalid.param"), 82 | INVALID_PRIVI(802, "invalid.privi"), 83 | 84 | //1000以内是系统错误, 85 | no_login(1000,"没有登录"), 86 | config_error(1001,"参数配置表错误"), 87 | user_exist(1002,"用户名已存在"), 88 | 
userpwd_not_exist(1003,"用户名不存在或者密码错误"), 89 | 90 | 91 | 92 | 93 | ; 94 | private static final Logger LOGGER = LoggerFactory.getLogger(ResultStatus.class); 95 | 96 | 97 | private int code; 98 | private String msg; 99 | 100 | ResultStatus(int code, String msg){ 101 | this.code = code; 102 | this.msg = msg; 103 | } 104 | 105 | public static int getCode(String define){ 106 | try { 107 | return ResultStatus.valueOf(define).code; 108 | } catch (IllegalArgumentException e) { 109 | LOGGER.error("undefined error code: {}", define); 110 | return FAIL.getErrorCode(); 111 | } 112 | } 113 | 114 | public static String getMsg(String define){ 115 | try { 116 | return ResultStatus.valueOf(define).msg; 117 | } catch (IllegalArgumentException e) { 118 | LOGGER.error("undefined error code: {}", define); 119 | return FAIL.getErrorMsg(); 120 | } 121 | 122 | } 123 | 124 | public static String getMsg(int code){ 125 | for(ResultStatus err : ResultStatus.values()){ 126 | if(err.code==code){ 127 | return err.msg; 128 | } 129 | } 130 | return "errorCode not defined "; 131 | } 132 | 133 | public int getErrorCode(){ 134 | return code; 135 | } 136 | 137 | public String getErrorMsg(){ 138 | return msg; 139 | } 140 | 141 | } 142 | 143 | -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | 2 | 3 | server.port=8008 4 | 5 | 6 | spark.app.name=springbootspark 7 | spark.home=D:\\develop\\tools\\spark-1.6.2-bin-hadoop2.6\\spark-1.6.2-bin-hadoop2.6 8 | spark.master.uri=local[3] -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/blsmy.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/blsmy.txt -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/file/hadoop.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/file/hadoop.dll -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/file/hdfs.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/file/hdfs.dll -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/file/winutils.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/file/winutils.exe -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/file/zlib1.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/file/zlib1.dll -------------------------------------------------------------------------------- 
/sb-word-count/src/main/resources/ham.txt: -------------------------------------------------------------------------------- 1 | Dear Spark Learner, Thanks so much for attending the Spark Summit 2014! Check out videos of talks from the summit at ... 2 | Hi Mom, Apologies for being late about emailing and forgetting to send you the package. I hope you and bro have been ... 3 | Wow, hey Fred, just heard about the Spark petabyte sort. I think we need to take time to try it out immediately ... 4 | Hi Spark user list, This is my first question to this list, so thanks in advance for your help! I tried running ... 5 | Thanks Tom for your email. I need to refer you to Alice for this one. I haven't yet figured out that part either ... 6 | Good job yesterday! I was attending your talk, and really enjoyed it. I want to try out GraphX ... 7 | Summit demo got whoops from audience! Had to let you know. --Joe 8 | -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/log.txt: -------------------------------------------------------------------------------- 1 | 121.205.198.92 - - [21/Feb/2014:00:00:07 +0800] "GET /archives/417.html HTTP/1.1" 200 11465 "http://shiyanjun.cn/archives/417.html/" "Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0" 2 | 121.205.198.92 - - [21/Feb/2014:00:00:11 +0800] "POST /wp-comments-post.php HTTP/1.1" 302 26 "http://shiyanjun.cn/archives/417.html/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0" 3 | 121.205.198.92 - - [21/Feb/2014:00:00:12 +0800] "GET /archives/417.html/ HTTP/1.1" 301 26 "http://shiyanjun.cn/archives/417.html/" "Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0" 4 | 121.205.198.92 - - [21/Feb/2014:00:00:12 +0800] "GET /archives/417.html HTTP/1.1" 200 11465 "http://shiyanjun.cn/archives/417.html" "Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0" 5 | 121.205.241.229 - - [21/Feb/2014:00:00:13 +0800] "GET /archives/526.html HTTP/1.1" 200 12080 "http://shiyanjun.cn/archives/526.html/" "Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0" 6 | 121.205.241.229 - - [21/Feb/2014:00:00:15 +0800] "POST /wp-comments-post.php HTTP/1.1" 302 26 "http://shiyanjun.cn/archives/526.html/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0" -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/log4j.properties -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/spam.txt: -------------------------------------------------------------------------------- 1 | Dear sir, I am a Prince in a far kingdom you have not heard of. I want to send you money via wire transfer so please ... 2 | Get Viagra real cheap! Send money right away to ... 3 | Oh my gosh you can be really strong too with these drugs found in the rainforest. Get them cheap right now ... 4 | YOUR COMPUTER HAS BEEN INFECTED! YOU MUST RESET YOUR PASSWORD. Reply to this email with your password and SSN ... 5 | THIS IS NOT A SCAM! Send money and get access to awesome stuff really cheap and never have to ... 
6 | -------------------------------------------------------------------------------- /spark-pi/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | spark-pi 6 | com.spark.sample 7 | 1.0-SNAPSHOT 8 | 4.0.0 9 | 10 | 11 | 1.8 12 | 2.4.3 13 | 14 | 15 | 16 | 17 | org.apache.spark 18 | spark-core_2.11 19 | ${spark.version} 20 | 21 | 22 | 23 | 24 | 25 | 26 | maven-compiler-plugin 27 | 28 | ${java.version} 29 | ${java.version} 30 | UTF-8 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /spark-pi/src/main/java/com/spark/SparkPI.java: -------------------------------------------------------------------------------- 1 | package com.spark; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.api.java.function.Function; 7 | import org.apache.spark.api.java.function.Function2; 8 | 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | 12 | /** 13 | * Created with IDEA 14 | * User: vector 15 | * Data: 2018/4/20 0020 16 | * Time: 9:58 17 | * Description: spark-submit --class com.spark.SparkPI --master local /ssd/spark/code/spark-pi/spark-pi-1.0-SNAPSHOT.jar 10 18 | */ 19 | public class SparkPI { 20 | public static void main(String[] args) { 21 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Spark PI"); 22 | // SparkConf conf = new SparkConf().setAppName("Spark PI"); 23 | JavaSparkContext jsc = new JavaSparkContext(conf); 24 | 25 | int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2; 26 | 27 | int n = 100000 * slices; 28 | 29 | List integers = new ArrayList<>(); 30 | for (int i = 0; i < n; i++) { 31 | integers.add(i); 32 | } 33 | 34 | JavaRDD dataSet = jsc.parallelize(integers); 35 | Integer count = dataSet.map((Function) integer -> { 36 | double x = Math.random() * 2 - 1; 37 | double y = Math.random() * 2 - 1; 38 | return (x * x + y * y < 1) ? 
1 : 0; 39 | }).reduce((Function2) (integer, integer2) -> integer + integer2); 40 | 41 | System.out.println("Pi is roughly " + 4.0 * count / n); 42 | 43 | jsc.stop(); 44 | 45 | 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /spark-sql/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | spark-sql 6 | com.spark.sample 7 | 1.0-SNAPSHOT 8 | 4.0.0 9 | 10 | 11 | 12 | 1.8 13 | 2.4.3 14 | 15 | 16 | 17 | 18 | org.apache.spark 19 | spark-core_2.11 20 | ${spark.version} 21 | 22 | 23 | org.apache.spark 24 | spark-sql_2.11 25 | ${spark.version} 26 | 27 | 28 | 29 | com.alibaba 30 | fastjson 31 | 1.2.51 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | maven-compiler-plugin 40 | 41 | ${java.version} 42 | ${java.version} 43 | UTF-8 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /spark-sql/src/main/java/com/spark/JavaSparkSql.java: -------------------------------------------------------------------------------- 1 | package com.spark; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Row; 5 | import org.apache.spark.sql.SaveMode; 6 | import org.apache.spark.sql.SparkSession; 7 | 8 | /** 9 | * Created with IDEA 10 | * User: vector 11 | * Data: 2018/4/20 0020 12 | * Time: 10:41 13 | * Description: 14 | */ 15 | public class JavaSparkSql { 16 | public static void main(String[] args) { 17 | String classFilePath = JavaSparkSql.class.getResource("/people.json").getPath(); 18 | 19 | 20 | SparkSession spark = SparkSession 21 | .builder() 22 | .master("local") 23 | .appName("Java Spark SQL basic example") 24 | .config("spark.some.config.option", "some-value") 25 | .getOrCreate(); 26 | Dataset df = spark.read().json(classFilePath); 27 | 28 | /** 29 | * 显示表的内容 (前20条) 30 | */ 31 | df.show(); 32 | 33 | /** 34 | * 打印节点 (tree 结构) 35 | */ 36 | df.printSchema(); 37 | 38 | /** 39 | * 选择属性显示 并对属性做简单操作 40 | */ 41 | df.select(df.col("name"), df.col("age").plus(1)).show(); 42 | 43 | /** 44 | * 简单的过滤 45 | */ 46 | df.filter(df.col("age").gt(21)).show(); 47 | 48 | /** 49 | * 分组统计 50 | */ 51 | df.groupBy("age").count().show(); 52 | 53 | 54 | df.createOrReplaceTempView("peopleTmp"); 55 | 56 | 57 | // SQL can be run over RDDs that have been registered as tables. 
58 | Dataset teenagers = spark.sql("select name,age from peopleTmp where age > 13 and age <=19"); 59 | teenagers.toJavaRDD().map(row -> "Name: " + row.getString(0)).collect().forEach(System.out::println); 60 | 61 | /** 62 | * parquet file 63 | */ 64 | teenagers.write().mode(SaveMode.Overwrite).parquet("people.parquet"); 65 | 66 | /** 67 | * 对parquet文件做些简单的操作 68 | * 69 | */ 70 | System.out.println("=== Data source: Parquet File ==="); 71 | 72 | 73 | Dataset parquet = spark.read().parquet("people.parquet"); 74 | 75 | parquet.show(); 76 | 77 | parquet.createOrReplaceTempView("parquetPeople"); 78 | 79 | Dataset teenagers2 = spark.sql("select name from parquetPeople where age > 13 and age <= 19"); 80 | 81 | teenagers2.show(); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /spark-sql/src/main/java/com/spark/entity/People.java: -------------------------------------------------------------------------------- 1 | package com.spark.entity; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * Created with IDEA 7 | * User: vector 8 | * Data: 2018/4/20 0020 9 | * Time: 11:14 10 | * Description: 11 | */ 12 | public class People implements Serializable { 13 | private String name; 14 | private int age; 15 | 16 | public String getName() { 17 | return name; 18 | } 19 | 20 | public void setName(String name) { 21 | this.name = name; 22 | } 23 | 24 | public int getAge() { 25 | return age; 26 | } 27 | 28 | public void setAge(int age) { 29 | this.age = age; 30 | } 31 | 32 | @Override 33 | public String toString() { 34 | return "People{" + 35 | "name='" + name + '\'' + 36 | ", age=" + age + 37 | '}'; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /spark-sql/src/main/resources/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael"} 2 | {"name":"Andy", "age":30} 3 | {"name":"Justin", "age":19} -------------------------------------------------------------------------------- /word-count/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | word-count 6 | com.spark.sample 7 | 1.0-SNAPSHOT 8 | 4.0.0 9 | 10 | 1.8 11 | 2.4.3 12 | 13 | 14 | 15 | 16 | org.apache.spark 17 | spark-core_2.11 18 | ${spark.version} 19 | 20 | 21 | org.apache.spark 22 | spark-streaming_2.11 23 | ${spark.version} 24 | 25 | 26 | 27 | commons-io 28 | commons-io 29 | 2.4 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | maven-compiler-plugin 38 | 39 | ${java.version} 40 | ${java.version} 41 | UTF-8 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /word-count/src/main/java/com/spark/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.spark; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.function.FlatMapFunction; 8 | import org.apache.spark.api.java.function.Function2; 9 | import org.apache.spark.api.java.function.PairFunction; 10 | import org.apache.spark.api.java.function.VoidFunction; 11 | import scala.Tuple2; 12 | 13 | import java.util.Arrays; 14 | import java.util.List; 15 | 16 | /** 17 | * @Author: wangxc 18 | * @GitHub: 
https://github.com/vector4wang 19 | * @CSDN: http://blog.csdn.net/qqhjqs?viewmode=contents 20 | * @BLOG: http://vector4wang.tk 21 | * @wxid: BMHJQS 22 | *

23 | * Word count over the English edition of "Notre-Dame de Paris" (《巴黎圣母院》); meant for local learning and testing.
24 | */
25 | public class WordCount {
26 |     public static void main(String[] args) {
27 |
28 |         SparkConf conf = new SparkConf()
29 |                 .setMaster("local")
30 |                 .setAppName("WordCount")
31 |                 .set("spark.cores.max", "1")
32 |                 .set("spark.eventLog.enabled", "true");
33 |         Tuple2<String, String>[] all = conf.getAll();
34 |         for (Tuple2<String, String> stringStringTuple2 : all) {
35 |             System.out.println(stringStringTuple2._1 + ": " + stringStringTuple2._2);
36 |         }
37 |         JavaSparkContext context = new JavaSparkContext(conf);
38 |         // for local testing in IDEA
39 |         String classFilePath = WordCount.class.getResource("/blsmy.txt").getPath();
40 |
41 |         JavaRDD<String> javaRDD = context.textFile(classFilePath);
42 |         // JavaRDD<String> javaRDD = context.textFile("file:///mnt/data/blsmy.txt"); // for cluster runs (every worker node must have this file)
43 |         // JavaRDD<String> javaRDD = context.textFile("hdfs://spark-master:9000/wordcount/blsmy.txt");
44 |
45 |         // split each line into individual words
46 |         JavaRDD<String> words = javaRDD.flatMap((FlatMapFunction<String, String>) s -> {
47 |             String[] split = s.split(" ");
48 |             List<String> strings = Arrays.asList(split);
49 |             return strings.iterator();
50 |         });
51 |
52 |         JavaPairRDD<String, Integer> pairs = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1));
53 |
54 |         JavaPairRDD<String, Integer> reduceByKey = pairs.reduceByKey((Function2<Integer, Integer, Integer>) (integer, integer2) -> integer + integer2);
55 |
56 |         JavaPairRDD<Integer, String> integerStringJavaPairRDD = reduceByKey.mapToPair((PairFunction<Tuple2<String, Integer>, Integer, String>) stringIntegerTuple2 -> new Tuple2<>(stringIntegerTuple2._2, stringIntegerTuple2._1));
57 |
58 |
59 |         JavaPairRDD<String, Integer> mapToPair = integerStringJavaPairRDD.sortByKey(false).mapToPair((PairFunction<Tuple2<Integer, String>, String, Integer>) tuple -> new Tuple2<>(tuple._2, tuple._1));
60 |
61 |         mapToPair.foreach((VoidFunction<Tuple2<String, Integer>>) tuple -> System.out.println(tuple._1 + ": " + tuple._2));
62 |     }
63 | }
64 |
--------------------------------------------------------------------------------
/word-count/src/main/java/com/spark/streaming/SparkStreamingDemo.java:
--------------------------------------------------------------------------------
1 | package com.spark.streaming;
2 |
3 | import org.apache.spark.SparkConf;
4 | import org.apache.spark.streaming.Durations;
5 | import org.apache.spark.streaming.api.java.JavaDStream;
6 | import org.apache.spark.streaming.api.java.JavaPairDStream;
7 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
8 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
9 | import scala.Tuple2;
10 |
11 | import java.util.Arrays;
12 |
13 | /**
14 |  * @author vector
15 |  * @date: 2019/7/4 0004 17:05
16 |  */
17 | public class SparkStreamingDemo {
18 |     public static void main(String[] args) throws InterruptedException {
19 |         SparkConf conf = new SparkConf()
20 |                 .setMaster("local[2]")
21 |                 .setAppName("NetWorkWordCount");
22 |
23 |         JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));
24 |
25 |         // run `nc -lp 9999` on the target machine to open port 9999, then type some text into that session
26 |         JavaReceiverInputDStream<String> lines = jsc.socketTextStream("192.168.1.33", 9999);
27 |         JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
28 |         JavaPairDStream<String, Integer> pairDStream = words.mapToPair(word -> new Tuple2<>(word, 1));
29 |         JavaPairDStream<String, Integer> wordCounts = pairDStream.reduceByKey((i1, i2) -> i1 + i2);
30 |         wordCounts.print();
31 |         jsc.start();
32 |         jsc.awaitTermination();
33 |
34 |     }
35 | }
36 |
--------------------------------------------------------------------------------
/word-count/src/main/resources/blsmy.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/word-count/src/main/resources/blsmy.txt --------------------------------------------------------------------------------
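One more note on the spark-sql module: it ships a People bean (spark-sql/src/main/java/com/spark/entity/People.java) that JavaSparkSql.java never wires in. Below is a minimal sketch of how the people.parquet output written by JavaSparkSql.java could be read back as a typed Dataset<People>. The SparkSession setup and the people.parquet path mirror JavaSparkSql.java; the class name TypedPeopleRead and the explicit cast are illustrative assumptions rather than code from this repo (the JSON-inferred age column is a bigint, so it is cast down before binding to the bean's int field):
```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import com.spark.entity.People;

public class TypedPeopleRead {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local")
                .appName("Typed read of people.parquet")
                .getOrCreate();

        // people.parquet is written by JavaSparkSql with columns name (string) and age (bigint);
        // cast age down to int so it matches People.age before applying the bean encoder
        Dataset<People> people = spark.read()
                .parquet("people.parquet")
                .selectExpr("name", "cast(age as int) as age")
                .as(Encoders.bean(People.class));

        people.show();
        spark.stop();
    }
}
```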