├── .gitignore
├── README.md
├── people.parquet
│   ├── ._SUCCESS.crc
│   ├── .part-00000-c7a8f5d9-c2f3-4f58-b8b4-fd35faa27324-c000.snappy.parquet.crc
│   └── _SUCCESS
├── pom.xml
├── sb-word-count
│   ├── README.md
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── spark
│           │       ├── Application.java
│           │       ├── config
│           │       │   └── ApplicationConfig.java
│           │       ├── controller
│           │       │   └── WebController.java
│           │       ├── service
│           │       │   └── WordCountService.java
│           │       ├── test
│           │       │   ├── java
│           │       │   │   ├── EmailFilter.java
│           │       │   │   ├── Test.java
│           │       │   │   └── Word2VecTest.java
│           │       │   └── scala
│           │       │       ├── LinearRegression.scala
│           │       │       └── lr.txt
│           │       ├── textmatch
│           │       │   ├── SimHash.java
│           │       │   └── TextMatch.scala
│           │       └── util
│           │           ├── BaseResp.java
│           │           └── ResultStatus.java
│           └── resources
│               ├── application.properties
│               ├── blsmy.txt
│               ├── file
│               │   ├── hadoop.dll
│               │   ├── hdfs.dll
│               │   ├── winutils.exe
│               │   └── zlib1.dll
│               ├── ham.txt
│               ├── log.txt
│               ├── log4j.properties
│               └── spam.txt
├── spark-pi
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── spark
│                       └── SparkPI.java
├── spark-sql
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── com
│           │       └── spark
│           │           ├── JavaSparkSql.java
│           │           └── entity
│           │               └── People.java
│           └── resources
│               └── people.json
└── word-count
    ├── pom.xml
    └── src
        └── main
            ├── java
            │   └── com
            │       └── spark
            │           ├── WordCount.java
            │           └── streaming
            │               └── SparkStreamingDemo.java
            └── resources
                ├── blsmy.txt
                └── stdout

/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Mobile Tools for Java (J2ME)
4 | .mtj.tmp/
5 |
6 | # Package Files #
7 | *.jar
8 | *.war
9 | *.ear
10 |
11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
12 | hs_err_pid*
13 |
14 |
15 | target/
16 | !.mvn/wrapper/maven-wrapper.jar
17 |
18 | ### STS ###
19 | .apt_generated
20 | .classpath
21 | .factorypath
22 | .project
23 | .settings
24 | .springBeans
25 |
26 | ### IntelliJ IDEA ###
27 | .idea
28 | *.iws
29 | *.iml
30 | *.ipr
31 |
32 | ### NetBeans ###
33 | nbproject/private/
34 | builds/
35 | nbbuild/
36 | dist/
37 | nbdist/
38 | .nb-gradle/
39 |
40 | ### log
41 | logs/
42 | log/
43 |
44 | *.parquet/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # quick-spark-process
2 | Examples for learning Spark
3 |
4 | [![LICENSE](https://img.shields.io/badge/license-Anti%20996-blue.svg)](https://github.com/996icu/996.ICU/blob/master/LICENSE)
5 |
6 |
7 | ### word-count
8 | The simplest and most classic example.
9 | A Spark cluster was later set up and HDFS was used to store the input file; a few points are worth noting.
10 | #### How the input file is referenced
11 | ```java
12 | context.textFile("D:\\data\\spark\\blsmy.txt"); // for local testing in IDEA
13 | context.textFile("file:///mnt/data/blsmy.txt"); // for running on the cluster (every worker node must have this file)
14 | context.textFile("hdfs://spark-master:9000/wordcount/blsmy.txt"); // reading the file from HDFS
15 | ```
16 | #### Where the logs go
17 | The web UI exposes two logs, stderr and stdout; stdout is where the program's own output shows up. If you call println(....) in your code, that output lands in the stdout file, while the rest of the Spark runtime logs go to the stderr file.
18 | You can also inspect the log files on disk directly, e.g.:
19 | ```bash
20 | /spark/software/spark/work/app-20180428142302-0003/0/stdout
21 | /spark/software/spark/work/app-20180428142302-0003/0/stderr
22 | ```
23 | #### How to submit it
24 | ```bash
25 | bin/spark-submit \
26 | --master spark://spark-master:7077 \
27 | --driver-memory 1g \
28 | --executor-cores 1 \
29 | --class com.spark.WordCount \
30 | simple/word-count-1.0-SNAPSHOT.jar
31 | ```
32 |
33 |
34 |
35 |
36 | ### spark-pi
37 | Another classic example.
38 |
39 | ### spark-sql
40 | Simple operations with Spark SQL.
41 |
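42 | #### The core transformations
43 | Below is a minimal, self-contained sketch of the pipeline for reference; it mirrors the flatMap, mapToPair and reduceByKey flow in `word-count/src/main/java/com/spark/WordCount.java` (minus the sort-by-count step). The class name `WordCountSketch` and the `local` master are illustrative choices, not files or settings taken from this repo:
44 | ```java
45 | import org.apache.spark.SparkConf;
46 | import org.apache.spark.api.java.JavaPairRDD;
47 | import org.apache.spark.api.java.JavaRDD;
48 | import org.apache.spark.api.java.JavaSparkContext;
49 | import scala.Tuple2;
50 |
51 | import java.util.Arrays;
52 |
53 | public class WordCountSketch {
54 |     public static void main(String[] args) {
55 |         SparkConf conf = new SparkConf().setMaster("local").setAppName("WordCountSketch");
56 |         try (JavaSparkContext context = new JavaSparkContext(conf)) {
57 |             // any of the three textFile() variants above works here
58 |             JavaRDD<String> lines = context.textFile("D:\\data\\spark\\blsmy.txt");
59 |             // split each line into words
60 |             JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
61 |             // pair every word with 1, then sum the 1s per word
62 |             JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
63 |             JavaPairRDD<String, Integer> counts = pairs.reduceByKey((a, b) -> a + b);
64 |             counts.collect().forEach(t -> System.out.println(t._1 + ": " + t._2));
65 |         }
66 |     }
67 | }
68 | ```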
-------------------------------------------------------------------------------- /people.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /people.parquet/.part-00000-c7a8f5d9-c2f3-4f58-b8b4-fd35faa27324-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/people.parquet/.part-00000-c7a8f5d9-c2f3-4f58-b8b4-fd35faa27324-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /people.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/people.parquet/_SUCCESS -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.spark.sample 8 | quick-spark-process 9 | 1.0-SNAPSHOT 10 | 11 | word-count 12 | spark-pi 13 | spark-sql 14 | sb-word-count 15 | 16 | pom 17 | 18 | 19 | 20 | 21 | org.apache.maven.plugins 22 | maven-compiler-plugin 23 | 3.6.0 24 | 25 | 1.8 26 | 1.8 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /sb-word-count/README.md: -------------------------------------------------------------------------------- 1 | # sb-word-count 2 | 只是使用springboot把spark实现的wordcount整合了一下,并咩有做太复杂的计算 3 | 4 | 5 | 6 | *对《巴黎圣母院》英文版做的统计* 7 | [![WX20180602-120140@2x.png](https://i.loli.net/2018/06/02/5b1216a4b14b7.png)](https://i.loli.net/2018/06/02/5b1216a4b14b7.png) 8 | -------------------------------------------------------------------------------- /sb-word-count/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | sb-word-count 6 | com.spark.sample 7 | 1.0-SNAPSHOT 8 | 4.0.0 9 | 10 | 使用springboot整合spark。 11 | 12 | 13 | 14 | org.springframework.boot 15 | spring-boot-starter-parent 16 | 2.0.2.RELEASE 17 | 18 | 19 | 20 | 21 | 2.12 22 | 2.13.0 23 | 2.4.0 24 | 25 | 26 | 27 | 28 | 29 | org.springframework.boot 30 | spring-boot-starter-web 31 | 32 | 33 | org.springframework.boot 34 | spring-boot-starter-logging 35 | 36 | 37 | 38 | 39 | org.springframework.boot 40 | spring-boot-starter-log4j 41 | 42 | 43 | org.springframework.boot 44 | spring-boot-starter-test 45 | test 46 | 47 | 48 | org.springframework.boot 49 | spring-boot-starter-thymeleaf 50 | 51 | 52 | 53 | org.scala-lang 54 | scala-library 55 | ${scala.version} 56 | 57 | 58 | com.fasterxml.jackson.core 59 | jackson-databind 60 | 2.10.1 61 | 62 | 63 | org.apache.spark 64 | spark-core_${spark.core.version} 65 | ${spark.version} 66 | 67 | 68 | org.slf4j 69 | slf4j-log4j12 70 | 71 | 72 | log4j 73 | log4j 74 | 75 | 76 | 77 | 78 | org.apache.spark 79 | spark-launcher_${spark.core.version} 80 | ${spark.version} 81 | 82 | 83 | org.apache.spark 84 | spark-mllib_${spark.core.version} 85 | ${spark.version} 86 | 87 | 88 | org.apache.spark 89 | spark-streaming_${spark.core.version} 90 | ${spark.version} 91 | 92 | 93 | junit 94 | junit 95 | 4.4 96 | test 97 | 98 | 99 | org.specs 100 | specs 101 | 1.2.5 102 | test 103 | 104 | 105 | 106 | org.ansj 107 | ansj_seg 108 | 5.1.1 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 
org.springframework.boot 117 | spring-boot-maven-plugin 118 | 119 | 120 | org.apache.maven.plugins 121 | maven-compiler-plugin 122 | 123 | 1.8 124 | 1.8 125 | 126 | 127 | 128 | org.apache.maven.plugins 129 | maven-surefire-plugin 130 | 2.8.1 131 | 132 | 133 | **/*.java 134 | **/*.scala 135 | 136 | 137 | 138 | 139 | org.scala-tools 140 | maven-scala-plugin 141 | 2.15.2 142 | 143 | 144 | scala-compile-first 145 | process-resources 146 | 147 | compile 148 | 149 | 150 | 151 | scala-test-compile 152 | process-test-resources 153 | 154 | testCompile 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/Application.java: -------------------------------------------------------------------------------- 1 | package spark; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | /** 7 | * Created by wangxc on 2017/3/9. 8 | */ 9 | @SpringBootApplication 10 | public class Application { 11 | 12 | public static void main(String[] args) { 13 | SpringApplication.run(Application.class, args); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/config/ApplicationConfig.java: -------------------------------------------------------------------------------- 1 | package spark.config; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaSparkContext; 5 | import org.springframework.beans.factory.annotation.Autowired; 6 | import org.springframework.beans.factory.annotation.Value; 7 | import org.springframework.context.annotation.Bean; 8 | import org.springframework.context.annotation.Configuration; 9 | import org.springframework.context.support.PropertySourcesPlaceholderConfigurer; 10 | import org.springframework.core.env.Environment; 11 | import org.springframework.util.ResourceUtils; 12 | 13 | import java.io.File; 14 | import java.io.FileNotFoundException; 15 | 16 | /** 17 | * Created by achat1 on 9/22/15. 
18 | */ 19 | @Configuration 20 | public class ApplicationConfig { 21 | 22 | @Autowired 23 | private Environment env; 24 | 25 | @Value("${spark.app.name}") 26 | private String appName; 27 | 28 | @Value("${spark.home}") 29 | private String sparkHome; 30 | 31 | @Value("${spark.master.uri}") 32 | private String masterUri; 33 | 34 | @Bean 35 | public SparkConf sparkConf() throws FileNotFoundException { 36 | File file = ResourceUtils.getFile("classpath:file/winutils.exe"); 37 | System.out.println(appName); 38 | System.out.println(sparkHome); 39 | System.out.println(masterUri); 40 | SparkConf sparkConf = new SparkConf() 41 | .setAppName(appName) 42 | .setSparkHome(sparkHome) 43 | .setMaster(masterUri) 44 | .set("spark.testing.memory", "2147480000"); 45 | 46 | return sparkConf; 47 | } 48 | 49 | @Bean 50 | public JavaSparkContext javaSparkContext() throws FileNotFoundException { 51 | return new JavaSparkContext(sparkConf()); 52 | } 53 | // 54 | // @Bean 55 | // public SparkSession sparkSession() { 56 | // return SparkSession 57 | // .builder() 58 | // .sparkContext(javaSparkContext().sc()) 59 | // .appName("Java Spark SQL basic example") 60 | // .getOrCreate(); 61 | // } 62 | 63 | @Bean 64 | public static PropertySourcesPlaceholderConfigurer propertySourcesPlaceholderConfigurer() { 65 | return new PropertySourcesPlaceholderConfigurer(); 66 | } 67 | 68 | } -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/controller/WebController.java: -------------------------------------------------------------------------------- 1 | package spark.controller; 2 | 3 | import org.apache.log4j.Logger; 4 | import org.springframework.web.bind.annotation.RequestMapping; 5 | import org.springframework.web.bind.annotation.ResponseBody; 6 | import org.springframework.web.bind.annotation.RestController; 7 | import spark.service.WordCountService; 8 | import spark.util.BaseResp; 9 | import spark.util.ResultStatus; 10 | 11 | import javax.annotation.Resource; 12 | import java.io.FileNotFoundException; 13 | import java.util.Map; 14 | 15 | /** 16 | * Created with IDEA 17 | * User: vector 18 | * Data: 2017/4/13 19 | * Time: 18:02 20 | * Description: 21 | */ 22 | @RestController 23 | @RequestMapping("/spark") 24 | public class WebController { 25 | private Logger logger = Logger.getLogger(WebController.class); 26 | 27 | @Resource 28 | private WordCountService wordCountService; 29 | 30 | @RequestMapping("/wordCount") 31 | @ResponseBody 32 | public BaseResp> wordCount(){ 33 | 34 | logger.info("start submit spark tast..."); 35 | Map counts = null; 36 | try { 37 | counts = wordCountService.run(); 38 | } catch (FileNotFoundException e) { 39 | return new BaseResp<>(ResultStatus.error_record_not_found); 40 | } 41 | 42 | return new BaseResp<>(ResultStatus.SUCCESS,counts); 43 | } 44 | 45 | 46 | 47 | @RequestMapping("/hello") 48 | public BaseResp pring(){ 49 | return new BaseResp<>(ResultStatus.SUCCESS,"hihi"); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/service/WordCountService.java: -------------------------------------------------------------------------------- 1 | package spark.service; 2 | 3 | import org.apache.spark.api.java.JavaPairRDD; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.stereotype.Component; 8 | import 
org.springframework.util.ResourceUtils; 9 | import scala.Tuple2; 10 | 11 | import java.io.File; 12 | import java.io.FileNotFoundException; 13 | import java.io.Serializable; 14 | import java.util.Arrays; 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Map; 18 | import java.util.regex.Pattern; 19 | 20 | @Component 21 | public class WordCountService implements Serializable { 22 | private static final Pattern SPACE = Pattern.compile(" "); 23 | 24 | @Autowired 25 | private transient JavaSparkContext sc; 26 | 27 | public Map run() throws FileNotFoundException { 28 | Map result = new HashMap<>(); 29 | File file = ResourceUtils.getFile("classpath:blsmy.txt"); 30 | JavaRDD lines = sc.textFile(file.getAbsolutePath()); 31 | JavaRDD words = lines.flatMap(word -> Arrays.asList(word.split(" ")).iterator()); 32 | JavaPairRDD ones = words.mapToPair(s -> new Tuple2<>(s, 1)); 33 | JavaPairRDD counts = ones.reduceByKey((Integer i1, Integer i2) -> (i1 + i2)); 34 | List> output = counts.collect(); 35 | output.forEach(item -> result.put(item._1(), item._2())); 36 | /** 37 | JavaRDD words = lines.flatMap(new FlatMapFunction() { 38 | @Override public Iterable call(String s) throws Exception { 39 | return Arrays.asList(SPACE.split(s)); 40 | } 41 | }); 42 | JavaPairRDD ones = words.mapToPair(new PairFunction() { 43 | private static final long serialVersionUID = 1L; 44 | public Tuple2 call(String s) { 45 | return new Tuple2(s, 1); 46 | } 47 | }); 48 | JavaPairRDD counts = ones.reduceByKey(new Function2() { 49 | private static final long serialVersionUID = 1L; 50 | 51 | public Integer call(Integer i1, Integer i2) { 52 | return i1 + i2; 53 | } 54 | }); 55 | List> output = counts.collect(); 56 | for (Tuple2 tuple : output) { 57 | result.put(tuple._1(), tuple._2()); 58 | } 59 | */ 60 | return result; 61 | 62 | } 63 | } 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/test/java/EmailFilter.java: -------------------------------------------------------------------------------- 1 | package spark.test.java; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.mllib.classification.LogisticRegressionModel; 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD; 8 | import org.apache.spark.mllib.feature.HashingTF; 9 | import org.apache.spark.mllib.linalg.Vector; 10 | import org.apache.spark.mllib.regression.LabeledPoint; 11 | 12 | import java.util.Arrays; 13 | 14 | /** 15 | * Created with IDEA 16 | * User: vector 17 | * Data: 2017/5/5 18 | * Time: 10:34 19 | * Description: 20 | */ 21 | public class EmailFilter { 22 | public static void main(String[] args) { 23 | SparkConf conf = new SparkConf().setMaster("local").setAppName("垃圾邮件分类"); 24 | JavaSparkContext sc = new JavaSparkContext(conf); 25 | JavaRDD ham = sc.textFile("D:\\githubspace\\springbootquick\\src\\main\\resources\\ham.txt"); 26 | JavaRDD spam = sc.textFile("D:\\githubspace\\springbootquick\\src\\main\\resources\\spam.txt"); 27 | final HashingTF tf = new HashingTF(10000); 28 | JavaRDD posExamples = spam 29 | .map(h -> new LabeledPoint(1, tf.transform(Arrays.asList(h.split(" "))))); 30 | JavaRDD negExamples = ham 31 | .map(s -> new LabeledPoint(0, tf.transform(Arrays.asList(s.split(" "))))); 32 | JavaRDD trainingData = posExamples.union(negExamples); 33 | trainingData.cache(); 34 | LogisticRegressionWithSGD 
lrLearner = new LogisticRegressionWithSGD(); 35 | LogisticRegressionModel model = lrLearner.run(trainingData.rdd()); 36 | Vector posTestExample = tf.transform(Arrays.asList("O M G GET cheap stuff by sending money to ...".split(" "))); 37 | System.out.println(posTestExample.toJson()); 38 | Vector negTestExample = tf 39 | .transform(Arrays.asList("Hi Dad, I started studying Spark the other ...".split(" "))); 40 | System.out.println("Prediction for positive test example: " + model.predict(posTestExample)); 41 | System.out.println("Prediction for negative test example: " + model.predict(negTestExample)); 42 | 43 | 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/test/java/Test.java: -------------------------------------------------------------------------------- 1 | package spark.test.java; 2 | 3 | import org.apache.spark.mllib.linalg.Vector; 4 | import org.apache.spark.mllib.linalg.Vectors; 5 | 6 | /** 7 | * Created with IDEA 8 | * User: vector 9 | * Data: 2017/5/5 10 | * Time: 11:50 11 | * Description: 12 | */ 13 | public class Test { 14 | public static void main(String[] args) { 15 | // 稠密向量 16 | Vector denseVec = Vectors.dense(1.0,2.0,3.0); 17 | System.out.println(denseVec); 18 | // 稠密向量 19 | Vector sparseVec = Vectors.sparse(4,new int[]{0,2},new double[]{1.0,2.0}); 20 | System.out.println(sparseVec); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/test/java/Word2VecTest.java: -------------------------------------------------------------------------------- 1 | package spark.test.java; 2 | 3 | import com.google.common.base.Strings; 4 | import org.apache.spark.SparkConf; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.mllib.feature.Word2Vec; 8 | import org.apache.spark.mllib.feature.Word2VecModel; 9 | import org.apache.spark.sql.SQLContext; 10 | import scala.Tuple2; 11 | 12 | import java.util.Arrays; 13 | import java.util.List; 14 | 15 | /** 16 | * Created with IDEA 17 | * User: vector 18 | * Data: 2017/5/5 19 | * Time: 14:07 20 | * Description: 21 | */ 22 | public class Word2VecTest { 23 | public static void main(String[] args) { 24 | 25 | 26 | } 27 | 28 | static void trainModelAndDo(){ 29 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Word2Vec"); 30 | JavaSparkContext sc = new JavaSparkContext(conf); 31 | SQLContext sqlContext = new SQLContext(sc); 32 | String sentence = Strings.repeat("a b ", 100) + Strings.repeat("a c ", 10); 33 | List words = Arrays.asList(sentence.split(" ")); 34 | List> localDoc = Arrays.asList(words, words); 35 | JavaRDD> doc = sc.parallelize(localDoc); 36 | Word2Vec word2vec = new Word2Vec() 37 | .setVectorSize(10) 38 | .setSeed(42L); 39 | Word2VecModel model = word2vec.fit(doc); 40 | // model.save(sc.sc(),"D:\\data\\sparkModel"); 41 | Tuple2[] syms = model.findSynonyms("a", 2); 42 | System.out.println(syms.length); 43 | System.out.println(syms[0]._1()); 44 | System.out.println(syms[1]._1()); 45 | } 46 | 47 | static void loadModelAndDo(){ 48 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Word2Vec"); 49 | JavaSparkContext sc = new JavaSparkContext(conf); 50 | SQLContext sqlContext = new SQLContext(sc); 51 | // sqlContext. 
52 | } 53 | } 54 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/test/scala/LinearRegression.scala: -------------------------------------------------------------------------------- 1 | package com.spark.scala 2 | 3 | import org.apache.spark.mllib.linalg.Vectors 4 | import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | /** 8 | * Created with IDEA 9 | * User: vector 10 | * Data: 2017/5/5 11 | * Time: 9:58 12 | * Description: 13 | */ 14 | object LinearRegression { 15 | val conf = new SparkConf().setMaster("local").setAppName("LinearRegression") 16 | val sc = new SparkContext(conf) 17 | 18 | def main(args: Array[String]): Unit = { 19 | val data = sc.textFile("D:\\githubspace\\springbootquick\\src\\main\\java\\com\\quick\\scala\\lr.txt") 20 | val parsedData = data.map { line => 21 | val parts = line.split('|') 22 | LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(',').map(_.toDouble))) 23 | }.cache() 24 | 25 | val model = LinearRegressionWithSGD.train(parsedData, 2, 0.1) 26 | 27 | val result = model.predict(Vectors.dense(1, 3)) 28 | 29 | println(model.weights) 30 | println(model.weights.size) 31 | println(result) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/test/scala/lr.txt: -------------------------------------------------------------------------------- 1 | 5|1,1 2 | 8|1,2 3 | 7|2,1 4 | 13|2,3 5 | 18|3,4 -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/textmatch/SimHash.java: -------------------------------------------------------------------------------- 1 | package spark.textmatch; 2 | 3 | import java.math.BigInteger; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.StringTokenizer; 7 | 8 | public class SimHash { 9 | 10 | private String tokens; 11 | 12 | private BigInteger intSimHash; 13 | 14 | private String strSimHash; 15 | 16 | private int hashbits = 64; 17 | 18 | public SimHash(String tokens) { 19 | this.tokens = tokens; 20 | this.intSimHash = this.simHash(); 21 | } 22 | 23 | public SimHash(String tokens, int hashbits) { 24 | this.tokens = tokens; 25 | this.hashbits = hashbits; 26 | this.intSimHash = this.simHash(); 27 | } 28 | 29 | public BigInteger simHash() { 30 | int[] v = new int[this.hashbits]; 31 | StringTokenizer stringTokens = new StringTokenizer(this.tokens); 32 | while (stringTokens.hasMoreTokens()) { 33 | String temp = stringTokens.nextToken(); 34 | BigInteger t = this.hash(temp); 35 | for (int i = 0; i < this.hashbits; i++) { 36 | BigInteger bitmask = new BigInteger("1").shiftLeft(i); 37 | if (t.and(bitmask).signum() != 0) { 38 | v[i] += 1; 39 | } else { 40 | v[i] -= 1; 41 | } 42 | } 43 | } 44 | BigInteger fingerprint = new BigInteger("0"); 45 | StringBuffer simHashBuffer = new StringBuffer(); 46 | for (int i = 0; i < this.hashbits; i++) { 47 | if (v[i] >= 0) { 48 | fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i)); 49 | simHashBuffer.append("1"); 50 | }else{ 51 | simHashBuffer.append("0"); 52 | } 53 | } 54 | this.strSimHash = simHashBuffer.toString(); 55 | System.out.println(this.strSimHash + " length " + this.strSimHash.length()); 56 | return fingerprint; 57 | } 58 | 59 | private BigInteger hash(String source) { 60 | if (source == null || source.length() == 0) { 61 | return new BigInteger("0"); 62 | } else 
{ 63 | char[] sourceArray = source.toCharArray(); 64 | BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7); 65 | BigInteger m = new BigInteger("1000003"); 66 | BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract( 67 | new BigInteger("1")); 68 | for (char item : sourceArray) { 69 | BigInteger temp = BigInteger.valueOf((long) item); 70 | x = x.multiply(m).xor(temp).and(mask); 71 | } 72 | x = x.xor(new BigInteger(String.valueOf(source.length()))); 73 | if (x.equals(new BigInteger("-1"))) { 74 | x = new BigInteger("-2"); 75 | } 76 | return x; 77 | } 78 | } 79 | 80 | /** 81 | * 取两个二进制的异或,统计为1的个数,就是海明距离 82 | * @param other 83 | * @return 84 | */ 85 | 86 | public int hammingDistance(SimHash other) { 87 | 88 | BigInteger x = this.intSimHash.xor(other.intSimHash); 89 | int tot = 0; 90 | 91 | //统计x中二进制位数为1的个数 92 | //我们想想,一个二进制数减去1,那么,从最后那个1(包括那个1)后面的数字全都反了,对吧,然后,n&(n-1)就相当于把后面的数字清0, 93 | //我们看n能做多少次这样的操作就OK了。 94 | 95 | while (x.signum() != 0) { 96 | tot += 1; 97 | x = x.and(x.subtract(new BigInteger("1"))); 98 | } 99 | return tot; 100 | } 101 | 102 | /** 103 | * calculate Hamming Distance between two strings 104 | * 二进制怕有错,当成字符串,作一个,比较下结果 105 | * @author 106 | * @param str1 the 1st string 107 | * @param str2 the 2nd string 108 | * @return Hamming Distance between str1 and str2 109 | */ 110 | public int getDistance(String str1, String str2) { 111 | int distance; 112 | if (str1.length() != str2.length()) { 113 | distance = -1; 114 | } else { 115 | distance = 0; 116 | for (int i = 0; i < str1.length(); i++) { 117 | if (str1.charAt(i) != str2.charAt(i)) { 118 | distance++; 119 | } 120 | } 121 | } 122 | return distance; 123 | } 124 | 125 | /** 126 | * 如果海明距离取3,则分成四块,并得到每一块的bigInteger值 ,作为索引值使用 127 | * @param simHash 128 | * @param distance 129 | * @return 130 | */ 131 | public List subByDistance(SimHash simHash, int distance){ 132 | int numEach = this.hashbits/(distance+1); 133 | List characters = new ArrayList(); 134 | 135 | StringBuffer buffer = new StringBuffer(); 136 | 137 | int k = 0; 138 | for( int i = 0; i < this.intSimHash.bitLength(); i++){ 139 | boolean sr = simHash.intSimHash.testBit(i); 140 | 141 | if(sr){ 142 | buffer.append("1"); 143 | } 144 | else{ 145 | buffer.append("0"); 146 | } 147 | 148 | if( (i+1)%numEach == 0 ){ 149 | BigInteger eachValue = new BigInteger(buffer.toString(),2); 150 | System.out.println("----" +eachValue ); 151 | buffer.delete(0, buffer.length()); 152 | characters.add(eachValue); 153 | } 154 | } 155 | 156 | return characters; 157 | } 158 | 159 | public static void main(String[] args) { 160 | String s = "This is a test string for testing"; 161 | 162 | SimHash hash1 = new SimHash(s, 64); 163 | System.out.println(hash1.intSimHash + " " + hash1.intSimHash.bitLength()); 164 | 165 | hash1.subByDistance(hash1, 3); 166 | 167 | System.out.println("\n"); 168 | s = "This is a test string for testing, This is a test string for testing abcdef"; 169 | SimHash hash2 = new SimHash(s, 64); 170 | System.out.println(hash2.intSimHash+ " " + hash2.intSimHash.bitCount()); 171 | hash1.subByDistance(hash2, 3); 172 | s = "This is a test string for testing als"; 173 | SimHash hash3 = new SimHash(s, 64); 174 | System.out.println(hash3.intSimHash+ " " + hash3.intSimHash.bitCount()); 175 | hash1.subByDistance(hash3, 3); 176 | System.out.println("============================"); 177 | int dis = hash1.getDistance(hash1.strSimHash,hash2.strSimHash); 178 | 179 | System.out.println(hash1.hammingDistance(hash2) + " "+ dis); 180 | 181 | int dis2 = 
hash1.getDistance(hash1.strSimHash,hash3.strSimHash); 182 | 183 | System.out.println(hash1.hammingDistance(hash3) + " " + dis2); 184 | 185 | 186 | 187 | } 188 | } -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/textmatch/TextMatch.scala: -------------------------------------------------------------------------------- 1 | package com.spark.textmatch 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by wangxc on 2017/5/13. 7 | */ 8 | object TextMatch { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("TextMatch").setMaster("local[4]") 11 | val sc = new SparkContext(conf) 12 | val singles = Array("this", "is") 13 | 14 | val sentences = Array("this Date", 15 | "is there something", 16 | "where are something", 17 | "this is a string") 18 | 19 | val rdd = sc.parallelize(sentences) // create RDD 20 | 21 | val keys = singles.toSet // words required as keys. 22 | 23 | val result = rdd.flatMap { sen => 24 | val words = sen.split(" ").toSet; 25 | val common = keys & words; // intersect 26 | common.map(x => (x, sen)) // map as key -> sen 27 | }.groupByKey.mapValues(_.toArray) // group values for a key 28 | .collect 29 | println(result.length) 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/util/BaseResp.java: -------------------------------------------------------------------------------- 1 | package spark.util; 2 | 3 | import java.util.Date; 4 | 5 | /** 6 | * Created by wei on 2016/12/12. 7 | */ 8 | public class BaseResp { 9 | /** 10 | * 返回码 11 | */ 12 | private int code; 13 | 14 | /** 15 | * 返回信息描述 16 | */ 17 | private String message; 18 | 19 | /** 20 | * 返回数据 21 | */ 22 | private T data; 23 | 24 | private long currentTime; 25 | 26 | public int getCode() { 27 | return code; 28 | } 29 | 30 | public void setCode(int code) { 31 | this.code = code; 32 | } 33 | 34 | public String getMessage() { 35 | return message; 36 | } 37 | 38 | public void setMessage(String message) { 39 | this.message = message; 40 | } 41 | 42 | public Object getData() { 43 | return data; 44 | } 45 | 46 | public void setData(T data) { 47 | this.data = data; 48 | } 49 | 50 | public long getCurrentTime() { 51 | return currentTime; 52 | } 53 | 54 | public void setCurrentTime(long currentTime) { 55 | this.currentTime = currentTime; 56 | } 57 | 58 | //提供几种构造方法 59 | public BaseResp(int code, String message, T data) { 60 | this.code = code; 61 | this.message = message; 62 | this.data = data; 63 | this.currentTime = new Date().getTime(); 64 | } 65 | 66 | public BaseResp(ResultStatus resultStatus) { 67 | this.code = resultStatus.getErrorCode(); 68 | this.message = resultStatus.getErrorMsg(); 69 | this.data = data; 70 | this.currentTime = new Date().getTime(); 71 | } 72 | 73 | public BaseResp(ResultStatus resultStatus, T data) { 74 | this.code = resultStatus.getErrorCode(); 75 | this.message = resultStatus.getErrorMsg(); 76 | this.data = data; 77 | this.currentTime = new Date().getTime(); 78 | } 79 | 80 | 81 | } 82 | 83 | -------------------------------------------------------------------------------- /sb-word-count/src/main/java/spark/util/ResultStatus.java: -------------------------------------------------------------------------------- 1 | package spark.util; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | /** 7 | 错误码 8 | * @author wei 9 | * 10 | */ 11 | public enum ResultStatus { 12 
| 13 | // -1为通用失败(根据ApiResult.java中的构造方法注释而来) 14 | FAIL(-1, "common fail"), 15 | // 0为成功 16 | SUCCESS(0, "success"), 17 | 18 | error_pic_file(3,"非法图片文件"), 19 | error_pic_upload(4,"图片上传失败"), 20 | error_record_not_found(5, "没有找到对应的数据"), 21 | error_max_page_size(6, "请求记录数超出每次请求最大允许值"), 22 | error_create_failed(7,"新增失败"), 23 | error_update_failed(8,"修改失败"), 24 | error_delete_failed(9,"删除失败"), 25 | error_search_failed(10,"查询失败"), 26 | error_count_failed(11,"查询数据总数失败"), 27 | error_string_to_obj(12,"字符串转java对象失败"), 28 | error_invalid_argument(13,"参数不合法"), 29 | error_update_not_allowed(14,"更新失败:%s"), 30 | error_duplicated_data(15,"数据已存在"), 31 | error_unknown_database_operation(16,"未知数据库操作失败,请联系管理员解决"), 32 | error_column_unique(17,"字段s%违反唯一约束性条件"), 33 | error_file_download(18,"文件下载失败"), 34 | error_file_upload(19,"文件上传失败"), 35 | 36 | //100-511为http 状态码 37 | // --- 4xx Client Error --- 38 | http_status_bad_request(400, "Bad Request"), 39 | http_status_unauthorized(401, "Unauthorized"), 40 | http_status_payment_required(402, "Payment Required"), 41 | http_status_forbidden(403, "Forbidden"), 42 | http_status_not_found(404, "Not Found"), 43 | http_status_method_not_allowed(405, "Method Not Allowed"), 44 | http_status_not_acceptable(406, "Not Acceptable"), 45 | http_status_proxy_authentication_required(407, "Proxy Authentication Required"), 46 | http_status_request_timeout(408, "Request Timeout"), 47 | http_status_conflict(409, "Conflict"), 48 | http_status_gone(410, "Gone"), 49 | http_status_length_required(411, "Length Required"), 50 | http_status_precondition_failed(412, "Precondition Failed"), 51 | http_status_payload_too_large(413, "Payload Too Large"), 52 | http_status_uri_too_long(414, "URI Too Long"), 53 | http_status_unsupported_media_type(415, "Unsupported Media Type"), 54 | http_status_requested_range_not_satisfiable(416, "Requested range not satisfiable"), 55 | http_status_expectation_failed(417, "Expectation Failed"), 56 | http_status_im_a_teapot(418, "I'm a teapot"), 57 | http_status_unprocessable_entity(422, "Unprocessable Entity"), 58 | http_status_locked(423, "Locked"), 59 | http_status_failed_dependency(424, "Failed Dependency"), 60 | http_status_upgrade_required(426, "Upgrade Required"), 61 | http_status_precondition_required(428, "Precondition Required"), 62 | http_status_too_many_requests(429, "Too Many Requests"), 63 | http_status_request_header_fields_too_large(431, "Request Header Fields Too Large"), 64 | 65 | // --- 5xx Server Error --- 66 | http_status_internal_server_error(500, "系统错误"), 67 | http_status_not_implemented(501, "Not Implemented"), 68 | http_status_bad_gateway(502, "Bad Gateway"), 69 | http_status_service_unavailable(503, "Service Unavailable"), 70 | http_status_gateway_timeout(504, "Gateway Timeout"), 71 | http_status_http_version_not_supported(505, "HTTP Version not supported"), 72 | http_status_variant_also_negotiates(506, "Variant Also Negotiates"), 73 | http_status_insufficient_storage(507, "Insufficient Storage"), 74 | http_status_loop_detected(508, "Loop Detected"), 75 | http_status_bandwidth_limit_exceeded(509, "Bandwidth Limit Exceeded"), 76 | http_status_not_extended(510, "Not Extended"), 77 | http_status_network_authentication_required(511, "Network Authentication Required"), 78 | 79 | // --- 8xx common error --- 80 | EXCEPTION(800, "exception"), 81 | INVALID_PARAM(801, "invalid.param"), 82 | INVALID_PRIVI(802, "invalid.privi"), 83 | 84 | //1000以内是系统错误, 85 | no_login(1000,"没有登录"), 86 | config_error(1001,"参数配置表错误"), 87 | user_exist(1002,"用户名已存在"), 88 | 
userpwd_not_exist(1003,"用户名不存在或者密码错误"), 89 | 90 | 91 | 92 | 93 | ; 94 | private static final Logger LOGGER = LoggerFactory.getLogger(ResultStatus.class); 95 | 96 | 97 | private int code; 98 | private String msg; 99 | 100 | ResultStatus(int code, String msg){ 101 | this.code = code; 102 | this.msg = msg; 103 | } 104 | 105 | public static int getCode(String define){ 106 | try { 107 | return ResultStatus.valueOf(define).code; 108 | } catch (IllegalArgumentException e) { 109 | LOGGER.error("undefined error code: {}", define); 110 | return FAIL.getErrorCode(); 111 | } 112 | } 113 | 114 | public static String getMsg(String define){ 115 | try { 116 | return ResultStatus.valueOf(define).msg; 117 | } catch (IllegalArgumentException e) { 118 | LOGGER.error("undefined error code: {}", define); 119 | return FAIL.getErrorMsg(); 120 | } 121 | 122 | } 123 | 124 | public static String getMsg(int code){ 125 | for(ResultStatus err : ResultStatus.values()){ 126 | if(err.code==code){ 127 | return err.msg; 128 | } 129 | } 130 | return "errorCode not defined "; 131 | } 132 | 133 | public int getErrorCode(){ 134 | return code; 135 | } 136 | 137 | public String getErrorMsg(){ 138 | return msg; 139 | } 140 | 141 | } 142 | 143 | -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | 2 | 3 | server.port=8008 4 | 5 | 6 | spark.app.name=springbootspark 7 | spark.home=D:\\develop\\tools\\spark-1.6.2-bin-hadoop2.6\\spark-1.6.2-bin-hadoop2.6 8 | spark.master.uri=local[3] -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/blsmy.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/blsmy.txt -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/file/hadoop.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/file/hadoop.dll -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/file/hdfs.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/file/hdfs.dll -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/file/winutils.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/file/winutils.exe -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/file/zlib1.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/file/zlib1.dll -------------------------------------------------------------------------------- 
/sb-word-count/src/main/resources/ham.txt: -------------------------------------------------------------------------------- 1 | Dear Spark Learner, Thanks so much for attending the Spark Summit 2014! Check out videos of talks from the summit at ... 2 | Hi Mom, Apologies for being late about emailing and forgetting to send you the package. I hope you and bro have been ... 3 | Wow, hey Fred, just heard about the Spark petabyte sort. I think we need to take time to try it out immediately ... 4 | Hi Spark user list, This is my first question to this list, so thanks in advance for your help! I tried running ... 5 | Thanks Tom for your email. I need to refer you to Alice for this one. I haven't yet figured out that part either ... 6 | Good job yesterday! I was attending your talk, and really enjoyed it. I want to try out GraphX ... 7 | Summit demo got whoops from audience! Had to let you know. --Joe 8 | -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/log.txt: -------------------------------------------------------------------------------- 1 | 121.205.198.92 - - [21/Feb/2014:00:00:07 +0800] "GET /archives/417.html HTTP/1.1" 200 11465 "http://shiyanjun.cn/archives/417.html/" "Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0" 2 | 121.205.198.92 - - [21/Feb/2014:00:00:11 +0800] "POST /wp-comments-post.php HTTP/1.1" 302 26 "http://shiyanjun.cn/archives/417.html/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0" 3 | 121.205.198.92 - - [21/Feb/2014:00:00:12 +0800] "GET /archives/417.html/ HTTP/1.1" 301 26 "http://shiyanjun.cn/archives/417.html/" "Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0" 4 | 121.205.198.92 - - [21/Feb/2014:00:00:12 +0800] "GET /archives/417.html HTTP/1.1" 200 11465 "http://shiyanjun.cn/archives/417.html" "Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0" 5 | 121.205.241.229 - - [21/Feb/2014:00:00:13 +0800] "GET /archives/526.html HTTP/1.1" 200 12080 "http://shiyanjun.cn/archives/526.html/" "Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0" 6 | 121.205.241.229 - - [21/Feb/2014:00:00:15 +0800] "POST /wp-comments-post.php HTTP/1.1" 302 26 "http://shiyanjun.cn/archives/526.html/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0" -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/sb-word-count/src/main/resources/log4j.properties -------------------------------------------------------------------------------- /sb-word-count/src/main/resources/spam.txt: -------------------------------------------------------------------------------- 1 | Dear sir, I am a Prince in a far kingdom you have not heard of. I want to send you money via wire transfer so please ... 2 | Get Viagra real cheap! Send money right away to ... 3 | Oh my gosh you can be really strong too with these drugs found in the rainforest. Get them cheap right now ... 4 | YOUR COMPUTER HAS BEEN INFECTED! YOU MUST RESET YOUR PASSWORD. Reply to this email with your password and SSN ... 5 | THIS IS NOT A SCAM! Send money and get access to awesome stuff really cheap and never have to ... 
6 | -------------------------------------------------------------------------------- /spark-pi/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | spark-pi 6 | com.spark.sample 7 | 1.0-SNAPSHOT 8 | 4.0.0 9 | 10 | 11 | 1.8 12 | 2.4.3 13 | 14 | 15 | 16 | 17 | org.apache.spark 18 | spark-core_2.11 19 | ${spark.version} 20 | 21 | 22 | 23 | 24 | 25 | 26 | maven-compiler-plugin 27 | 28 | ${java.version} 29 | ${java.version} 30 | UTF-8 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /spark-pi/src/main/java/com/spark/SparkPI.java: -------------------------------------------------------------------------------- 1 | package com.spark; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.api.java.function.Function; 7 | import org.apache.spark.api.java.function.Function2; 8 | 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | 12 | /** 13 | * Created with IDEA 14 | * User: vector 15 | * Data: 2018/4/20 0020 16 | * Time: 9:58 17 | * Description: spark-submit --class com.spark.SparkPI --master local /ssd/spark/code/spark-pi/spark-pi-1.0-SNAPSHOT.jar 10 18 | */ 19 | public class SparkPI { 20 | public static void main(String[] args) { 21 | SparkConf conf = new SparkConf().setMaster("local").setAppName("Spark PI"); 22 | // SparkConf conf = new SparkConf().setAppName("Spark PI"); 23 | JavaSparkContext jsc = new JavaSparkContext(conf); 24 | 25 | int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2; 26 | 27 | int n = 100000 * slices; 28 | 29 | List integers = new ArrayList<>(); 30 | for (int i = 0; i < n; i++) { 31 | integers.add(i); 32 | } 33 | 34 | JavaRDD dataSet = jsc.parallelize(integers); 35 | Integer count = dataSet.map((Function) integer -> { 36 | double x = Math.random() * 2 - 1; 37 | double y = Math.random() * 2 - 1; 38 | return (x * x + y * y < 1) ? 
1 : 0; 39 | }).reduce((Function2) (integer, integer2) -> integer + integer2); 40 | 41 | System.out.println("Pi is roughly " + 4.0 * count / n); 42 | 43 | jsc.stop(); 44 | 45 | 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /spark-sql/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | spark-sql 6 | com.spark.sample 7 | 1.0-SNAPSHOT 8 | 4.0.0 9 | 10 | 11 | 12 | 1.8 13 | 2.4.3 14 | 15 | 16 | 17 | 18 | org.apache.spark 19 | spark-core_2.11 20 | ${spark.version} 21 | 22 | 23 | org.apache.spark 24 | spark-sql_2.11 25 | ${spark.version} 26 | 27 | 28 | 29 | com.alibaba 30 | fastjson 31 | 1.2.51 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | maven-compiler-plugin 40 | 41 | ${java.version} 42 | ${java.version} 43 | UTF-8 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /spark-sql/src/main/java/com/spark/JavaSparkSql.java: -------------------------------------------------------------------------------- 1 | package com.spark; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Row; 5 | import org.apache.spark.sql.SaveMode; 6 | import org.apache.spark.sql.SparkSession; 7 | 8 | /** 9 | * Created with IDEA 10 | * User: vector 11 | * Data: 2018/4/20 0020 12 | * Time: 10:41 13 | * Description: 14 | */ 15 | public class JavaSparkSql { 16 | public static void main(String[] args) { 17 | String classFilePath = JavaSparkSql.class.getResource("/people.json").getPath(); 18 | 19 | 20 | SparkSession spark = SparkSession 21 | .builder() 22 | .master("local") 23 | .appName("Java Spark SQL basic example") 24 | .config("spark.some.config.option", "some-value") 25 | .getOrCreate(); 26 | Dataset df = spark.read().json(classFilePath); 27 | 28 | /** 29 | * 显示表的内容 (前20条) 30 | */ 31 | df.show(); 32 | 33 | /** 34 | * 打印节点 (tree 结构) 35 | */ 36 | df.printSchema(); 37 | 38 | /** 39 | * 选择属性显示 并对属性做简单操作 40 | */ 41 | df.select(df.col("name"), df.col("age").plus(1)).show(); 42 | 43 | /** 44 | * 简单的过滤 45 | */ 46 | df.filter(df.col("age").gt(21)).show(); 47 | 48 | /** 49 | * 分组统计 50 | */ 51 | df.groupBy("age").count().show(); 52 | 53 | 54 | df.createOrReplaceTempView("peopleTmp"); 55 | 56 | 57 | // SQL can be run over RDDs that have been registered as tables. 
58 | Dataset teenagers = spark.sql("select name,age from peopleTmp where age > 13 and age <=19"); 59 | teenagers.toJavaRDD().map(row -> "Name: " + row.getString(0)).collect().forEach(System.out::println); 60 | 61 | /** 62 | * parquet file 63 | */ 64 | teenagers.write().mode(SaveMode.Overwrite).parquet("people.parquet"); 65 | 66 | /** 67 | * 对parquet文件做些简单的操作 68 | * 69 | */ 70 | System.out.println("=== Data source: Parquet File ==="); 71 | 72 | 73 | Dataset parquet = spark.read().parquet("people.parquet"); 74 | 75 | parquet.show(); 76 | 77 | parquet.createOrReplaceTempView("parquetPeople"); 78 | 79 | Dataset teenagers2 = spark.sql("select name from parquetPeople where age > 13 and age <= 19"); 80 | 81 | teenagers2.show(); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /spark-sql/src/main/java/com/spark/entity/People.java: -------------------------------------------------------------------------------- 1 | package com.spark.entity; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * Created with IDEA 7 | * User: vector 8 | * Data: 2018/4/20 0020 9 | * Time: 11:14 10 | * Description: 11 | */ 12 | public class People implements Serializable { 13 | private String name; 14 | private int age; 15 | 16 | public String getName() { 17 | return name; 18 | } 19 | 20 | public void setName(String name) { 21 | this.name = name; 22 | } 23 | 24 | public int getAge() { 25 | return age; 26 | } 27 | 28 | public void setAge(int age) { 29 | this.age = age; 30 | } 31 | 32 | @Override 33 | public String toString() { 34 | return "People{" + 35 | "name='" + name + '\'' + 36 | ", age=" + age + 37 | '}'; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /spark-sql/src/main/resources/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael"} 2 | {"name":"Andy", "age":30} 3 | {"name":"Justin", "age":19} -------------------------------------------------------------------------------- /word-count/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | word-count 6 | com.spark.sample 7 | 1.0-SNAPSHOT 8 | 4.0.0 9 | 10 | 1.8 11 | 2.4.3 12 | 13 | 14 | 15 | 16 | org.apache.spark 17 | spark-core_2.11 18 | ${spark.version} 19 | 20 | 21 | org.apache.spark 22 | spark-streaming_2.11 23 | ${spark.version} 24 | 25 | 26 | 27 | commons-io 28 | commons-io 29 | 2.4 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | maven-compiler-plugin 38 | 39 | ${java.version} 40 | ${java.version} 41 | UTF-8 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /word-count/src/main/java/com/spark/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.spark; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.function.FlatMapFunction; 8 | import org.apache.spark.api.java.function.Function2; 9 | import org.apache.spark.api.java.function.PairFunction; 10 | import org.apache.spark.api.java.function.VoidFunction; 11 | import scala.Tuple2; 12 | 13 | import java.util.Arrays; 14 | import java.util.List; 15 | 16 | /** 17 | * @Author: wangxc 18 | * @GitHub: 
https://github.com/vector4wang 19 | * @CSDN: http://blog.csdn.net/qqhjqs?viewmode=contents 20 | * @BLOG: http://vector4wang.tk 21 | * @wxid: BMHJQS 22 | *

23 | * Word count over the English edition of "Notre-Dame de Paris" (《巴黎圣母院》); meant for local learning and testing.
24 | */
25 | public class WordCount {
26 |     public static void main(String[] args) {
27 |
28 |         SparkConf conf = new SparkConf()
29 |                 .setMaster("local")
30 |                 .setAppName("WordCount")
31 |                 .set("spark.cores.max", "1")
32 |                 .set("spark.eventLog.enabled", "true");
33 |         Tuple2<String, String>[] all = conf.getAll();
34 |         for (Tuple2<String, String> stringStringTuple2 : all) {
35 |             System.out.println(stringStringTuple2._1 + ": " + stringStringTuple2._2);
36 |         }
37 |         JavaSparkContext context = new JavaSparkContext(conf);
38 |         // for local testing in IDEA
39 |         String classFilePath = WordCount.class.getResource("/blsmy.txt").getPath();
40 |
41 |         JavaRDD<String> javaRDD = context.textFile(classFilePath);
42 |         // JavaRDD<String> javaRDD = context.textFile("file:///mnt/data/blsmy.txt"); // for cluster runs (every worker node must have this file)
43 |         // JavaRDD<String> javaRDD = context.textFile("hdfs://spark-master:9000/wordcount/blsmy.txt");
44 |
45 |         // split each line into individual words
46 |         JavaRDD<String> words = javaRDD.flatMap((FlatMapFunction<String, String>) s -> {
47 |             String[] split = s.split(" ");
48 |             List<String> strings = Arrays.asList(split);
49 |             return strings.iterator();
50 |         });
51 |
52 |         JavaPairRDD<String, Integer> pairs = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1));
53 |
54 |         JavaPairRDD<String, Integer> reduceByKey = pairs.reduceByKey((Function2<Integer, Integer, Integer>) (integer, integer2) -> integer + integer2);
55 |
56 |         JavaPairRDD<Integer, String> integerStringJavaPairRDD = reduceByKey.mapToPair((PairFunction<Tuple2<String, Integer>, Integer, String>) stringIntegerTuple2 -> new Tuple2<>(stringIntegerTuple2._2, stringIntegerTuple2._1));
57 |
58 |
59 |         JavaPairRDD<String, Integer> mapToPair = integerStringJavaPairRDD.sortByKey(false).mapToPair((PairFunction<Tuple2<Integer, String>, String, Integer>) tuple -> new Tuple2<>(tuple._2, tuple._1));
60 |
61 |         mapToPair.foreach((VoidFunction<Tuple2<String, Integer>>) tuple -> System.out.println(tuple._1 + ": " + tuple._2));
62 |     }
63 | }
64 |
--------------------------------------------------------------------------------
/word-count/src/main/java/com/spark/streaming/SparkStreamingDemo.java:
--------------------------------------------------------------------------------
1 | package com.spark.streaming;
2 |
3 | import org.apache.spark.SparkConf;
4 | import org.apache.spark.streaming.Durations;
5 | import org.apache.spark.streaming.api.java.JavaDStream;
6 | import org.apache.spark.streaming.api.java.JavaPairDStream;
7 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
8 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
9 | import scala.Tuple2;
10 |
11 | import java.util.Arrays;
12 |
13 | /**
14 |  * @author vector
15 |  * @date: 2019/7/4 0004 17:05
16 |  */
17 | public class SparkStreamingDemo {
18 |     public static void main(String[] args) throws InterruptedException {
19 |         SparkConf conf = new SparkConf()
20 |                 .setMaster("local[2]")
21 |                 .setAppName("NetWorkWordCount");
22 |
23 |         JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(5));
24 |
25 |         // run `nc -lp 9999` on the target machine to open port 9999, then type some text into that session
26 |         JavaReceiverInputDStream<String> lines = jsc.socketTextStream("192.168.1.33", 9999);
27 |         JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
28 |         JavaPairDStream<String, Integer> pairDStream = words.mapToPair(word -> new Tuple2<>(word, 1));
29 |         JavaPairDStream<String, Integer> wordCounts = pairDStream.reduceByKey((i1, i2) -> i1 + i2);
30 |         wordCounts.print();
31 |         jsc.start();
32 |         jsc.awaitTermination();
33 |
34 |     }
35 | }
36 |
--------------------------------------------------------------------------------
/word-count/src/main/resources/blsmy.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vector4wang/quick-spark-process/d2ddd8cfb84cf4f6f0d5d2b23b503276ec8e579a/word-count/src/main/resources/blsmy.txt --------------------------------------------------------------------------------
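One more note on the spark-sql module: it ships a People bean (spark-sql/src/main/java/com/spark/entity/People.java) that JavaSparkSql.java never wires in. Below is a minimal sketch of how the people.parquet output written by JavaSparkSql.java could be read back as a typed Dataset<People>. The SparkSession setup and the people.parquet path mirror JavaSparkSql.java; the class name TypedPeopleRead and the explicit cast are illustrative assumptions rather than code from this repo (the JSON-inferred age column is a bigint, so it is cast down before binding to the bean's int field):
```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import com.spark.entity.People;

public class TypedPeopleRead {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local")
                .appName("Typed read of people.parquet")
                .getOrCreate();

        // people.parquet is written by JavaSparkSql with columns name (string) and age (bigint);
        // cast age down to int so it matches People.age before applying the bean encoder
        Dataset<People> people = spark.read()
                .parquet("people.parquet")
                .selectExpr("name", "cast(age as int) as age")
                .as(Encoders.bean(People.class));

        people.show();
        spark.stop();
    }
}
```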