tuple : output) {
57 | result.put(tuple._1(), tuple._2());
58 | }
59 | */
60 | return result;
61 |
62 | }
63 | }
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/word-count/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | word-count
6 | com.spark.sample
7 | 1.0-SNAPSHOT
8 | 4.0.0
9 |
10 | 1.8
11 | 2.4.3
12 |
13 |
14 |
15 |
16 | org.apache.spark
17 | spark-core_2.11
18 | ${spark.version}
19 |
20 |
21 | org.apache.spark
22 | spark-streaming_2.11
23 | ${spark.version}
24 |
25 |
26 |
27 | commons-io
28 | commons-io
29 | 2.4
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 | maven-compiler-plugin
38 |
39 | ${java.version}
40 | ${java.version}
41 | UTF-8
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
/word-count/src/main/java/com/spark/WordCount.java:
--------------------------------------------------------------------------------
1 | package com.spark;
2 |
3 | import org.apache.spark.SparkConf;
4 | import org.apache.spark.api.java.JavaPairRDD;
5 | import org.apache.spark.api.java.JavaRDD;
6 | import org.apache.spark.api.java.JavaSparkContext;
7 | import org.apache.spark.api.java.function.FlatMapFunction;
8 | import org.apache.spark.api.java.function.Function2;
9 | import org.apache.spark.api.java.function.PairFunction;
10 | import org.apache.spark.api.java.function.VoidFunction;
11 | import scala.Tuple2;
12 |
13 | import java.util.Arrays;
14 | import java.util.List;
15 |
16 | /**
17 | * @Author: wangxc
18 | * @GitHub: https://github.com/vector4wang
19 | * @CSDN: http://blog.csdn.net/qqhjqs?viewmode=contents
20 | * @BLOG: http://vector4wang.tk
21 | * @wxid: BMHJQS
22 | *
23 | * 《巴黎圣母院》英文版的统计 用于本机学习与测试
24 | */
25 | public class WordCount {
26 | public static void main(String[] args) {
27 |
28 | SparkConf conf = new SparkConf()
29 | .setMaster("local")
30 | .setAppName("WordCount")
31 | .set("spark.cores.max", "1")
32 | .set("spark.eventLog.enabled", "true");
33 | Tuple2[] all = conf.getAll();
34 | for (Tuple2 stringStringTuple2 : all) {
35 | System.out.println(stringStringTuple2._1 + ": " + stringStringTuple2._2);
36 | }
37 | JavaSparkContext context = new JavaSparkContext(conf);
38 | // 用于idea测试
39 | String classFilePath = WordCount.class.getResource("/blsmy.txt").getPath();
40 |
41 | JavaRDD javaRDD = context.textFile(classFilePath);
42 | // JavaRDD javaRDD = context.textFile("file:///mnt/data/blsmy.txt"); -- 用于集群运行(前提,运行的各节点都需要有此文件)
43 | // JavaRDD javaRDD = context.textFile("hdfs://spark-master:9000/wordcount/blsmy.txt");
44 |
45 | //
46 | JavaRDD words = javaRDD.flatMap((FlatMapFunction) s -> {
47 | String[] split = s.split(" ");
48 | List strings = Arrays.asList(split);
49 | return strings.iterator();
50 | });
51 |
52 | JavaPairRDD pairs = words.mapToPair((PairFunction) s -> new Tuple2<>(s, 1));
53 |
54 | JavaPairRDD reduceByKey = pairs.reduceByKey((Function2) (integer, integer2) -> integer + integer2);
55 |
56 | JavaPairRDD integerStringJavaPairRDD = reduceByKey.mapToPair((PairFunction, Integer, String>) stringIntegerTuple2 -> new Tuple2<>(stringIntegerTuple2._2, stringIntegerTuple2._1));
57 |
58 |
59 | JavaPairRDD mapToPair = integerStringJavaPairRDD.sortByKey(false).mapToPair((PairFunction, String, Integer>) tuple -> new Tuple2<>(tuple._2, tuple._1));
60 |
61 | mapToPair.foreach((VoidFunction>) tuple -> System.out.println(tuple._1 + ": " + tuple._2));
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/sb-word-count/src/main/java/spark/util/ResultStatus.java:
--------------------------------------------------------------------------------
1 | package spark.util;
2 |
3 | import org.slf4j.Logger;
4 | import org.slf4j.LoggerFactory;
5 |
6 | /**
7 | 错误码
8 | * @author wei
9 | *
10 | */
11 | public enum ResultStatus {
12 |
13 | // -1为通用失败(根据ApiResult.java中的构造方法注释而来)
14 | FAIL(-1, "common fail"),
15 | // 0为成功
16 | SUCCESS(0, "success"),
17 |
18 | error_pic_file(3,"非法图片文件"),
19 | error_pic_upload(4,"图片上传失败"),
20 | error_record_not_found(5, "没有找到对应的数据"),
21 | error_max_page_size(6, "请求记录数超出每次请求最大允许值"),
22 | error_create_failed(7,"新增失败"),
23 | error_update_failed(8,"修改失败"),
24 | error_delete_failed(9,"删除失败"),
25 | error_search_failed(10,"查询失败"),
26 | error_count_failed(11,"查询数据总数失败"),
27 | error_string_to_obj(12,"字符串转java对象失败"),
28 | error_invalid_argument(13,"参数不合法"),
29 | error_update_not_allowed(14,"更新失败:%s"),
30 | error_duplicated_data(15,"数据已存在"),
31 | error_unknown_database_operation(16,"未知数据库操作失败,请联系管理员解决"),
32 | error_column_unique(17,"字段s%违反唯一约束性条件"),
33 | error_file_download(18,"文件下载失败"),
34 | error_file_upload(19,"文件上传失败"),
35 |
36 | //100-511为http 状态码
37 | // --- 4xx Client Error ---
38 | http_status_bad_request(400, "Bad Request"),
39 | http_status_unauthorized(401, "Unauthorized"),
40 | http_status_payment_required(402, "Payment Required"),
41 | http_status_forbidden(403, "Forbidden"),
42 | http_status_not_found(404, "Not Found"),
43 | http_status_method_not_allowed(405, "Method Not Allowed"),
44 | http_status_not_acceptable(406, "Not Acceptable"),
45 | http_status_proxy_authentication_required(407, "Proxy Authentication Required"),
46 | http_status_request_timeout(408, "Request Timeout"),
47 | http_status_conflict(409, "Conflict"),
48 | http_status_gone(410, "Gone"),
49 | http_status_length_required(411, "Length Required"),
50 | http_status_precondition_failed(412, "Precondition Failed"),
51 | http_status_payload_too_large(413, "Payload Too Large"),
52 | http_status_uri_too_long(414, "URI Too Long"),
53 | http_status_unsupported_media_type(415, "Unsupported Media Type"),
54 | http_status_requested_range_not_satisfiable(416, "Requested range not satisfiable"),
55 | http_status_expectation_failed(417, "Expectation Failed"),
56 | http_status_im_a_teapot(418, "I'm a teapot"),
57 | http_status_unprocessable_entity(422, "Unprocessable Entity"),
58 | http_status_locked(423, "Locked"),
59 | http_status_failed_dependency(424, "Failed Dependency"),
60 | http_status_upgrade_required(426, "Upgrade Required"),
61 | http_status_precondition_required(428, "Precondition Required"),
62 | http_status_too_many_requests(429, "Too Many Requests"),
63 | http_status_request_header_fields_too_large(431, "Request Header Fields Too Large"),
64 |
65 | // --- 5xx Server Error ---
66 | http_status_internal_server_error(500, "系统错误"),
67 | http_status_not_implemented(501, "Not Implemented"),
68 | http_status_bad_gateway(502, "Bad Gateway"),
69 | http_status_service_unavailable(503, "Service Unavailable"),
70 | http_status_gateway_timeout(504, "Gateway Timeout"),
71 | http_status_http_version_not_supported(505, "HTTP Version not supported"),
72 | http_status_variant_also_negotiates(506, "Variant Also Negotiates"),
73 | http_status_insufficient_storage(507, "Insufficient Storage"),
74 | http_status_loop_detected(508, "Loop Detected"),
75 | http_status_bandwidth_limit_exceeded(509, "Bandwidth Limit Exceeded"),
76 | http_status_not_extended(510, "Not Extended"),
77 | http_status_network_authentication_required(511, "Network Authentication Required"),
78 |
79 | // --- 8xx common error ---
80 | EXCEPTION(800, "exception"),
81 | INVALID_PARAM(801, "invalid.param"),
82 | INVALID_PRIVI(802, "invalid.privi"),
83 |
84 | //1000以内是系统错误,
85 | no_login(1000,"没有登录"),
86 | config_error(1001,"参数配置表错误"),
87 | user_exist(1002,"用户名已存在"),
88 | userpwd_not_exist(1003,"用户名不存在或者密码错误"),
89 |
90 |
91 |
92 |
93 | ;
94 | private static final Logger LOGGER = LoggerFactory.getLogger(ResultStatus.class);
95 |
96 |
97 | private int code;
98 | private String msg;
99 |
100 | ResultStatus(int code, String msg){
101 | this.code = code;
102 | this.msg = msg;
103 | }
104 |
105 | public static int getCode(String define){
106 | try {
107 | return ResultStatus.valueOf(define).code;
108 | } catch (IllegalArgumentException e) {
109 | LOGGER.error("undefined error code: {}", define);
110 | return FAIL.getErrorCode();
111 | }
112 | }
113 |
114 | public static String getMsg(String define){
115 | try {
116 | return ResultStatus.valueOf(define).msg;
117 | } catch (IllegalArgumentException e) {
118 | LOGGER.error("undefined error code: {}", define);
119 | return FAIL.getErrorMsg();
120 | }
121 |
122 | }
123 |
124 | public static String getMsg(int code){
125 | for(ResultStatus err : ResultStatus.values()){
126 | if(err.code==code){
127 | return err.msg;
128 | }
129 | }
130 | return "errorCode not defined ";
131 | }
132 |
133 | public int getErrorCode(){
134 | return code;
135 | }
136 |
137 | public String getErrorMsg(){
138 | return msg;
139 | }
140 |
141 | }
142 |
143 |
--------------------------------------------------------------------------------
/sb-word-count/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | sb-word-count
6 | com.spark.sample
7 | 1.0-SNAPSHOT
8 | 4.0.0
9 |
10 | 使用springboot整合spark。
11 |
12 |
13 |
14 | org.springframework.boot
15 | spring-boot-starter-parent
16 | 2.0.2.RELEASE
17 |
18 |
19 |
20 |
21 | 2.12
22 | 2.13.0
23 | 2.4.0
24 |
25 |
26 |
27 |
28 |
29 | org.springframework.boot
30 | spring-boot-starter-web
31 |
32 |
33 | org.springframework.boot
34 | spring-boot-starter-logging
35 |
36 |
37 |
38 |
39 | org.springframework.boot
40 | spring-boot-starter-log4j
41 |
42 |
43 | org.springframework.boot
44 | spring-boot-starter-test
45 | test
46 |
47 |
48 | org.springframework.boot
49 | spring-boot-starter-thymeleaf
50 |
51 |
52 |
53 | org.scala-lang
54 | scala-library
55 | ${scala.version}
56 |
57 |
58 | com.fasterxml.jackson.core
59 | jackson-databind
60 | 2.10.1
61 |
62 |
63 | org.apache.spark
64 | spark-core_${spark.core.version}
65 | ${spark.version}
66 |
67 |
68 | org.slf4j
69 | slf4j-log4j12
70 |
71 |
72 | log4j
73 | log4j
74 |
75 |
76 |
77 |
78 | org.apache.spark
79 | spark-launcher_${spark.core.version}
80 | ${spark.version}
81 |
82 |
83 | org.apache.spark
84 | spark-mllib_${spark.core.version}
85 | ${spark.version}
86 |
87 |
88 | org.apache.spark
89 | spark-streaming_${spark.core.version}
90 | ${spark.version}
91 |
92 |
93 | junit
94 | junit
95 | 4.4
96 | test
97 |
98 |
99 | org.specs
100 | specs
101 | 1.2.5
102 | test
103 |
104 |
105 |
106 | org.ansj
107 | ansj_seg
108 | 5.1.1
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 | org.springframework.boot
117 | spring-boot-maven-plugin
118 |
119 |
120 | org.apache.maven.plugins
121 | maven-compiler-plugin
122 |
123 | 1.8
124 | 1.8
125 |
126 |
127 |
128 | org.apache.maven.plugins
129 | maven-surefire-plugin
130 | 2.8.1
131 |
132 |
133 | **/*.java
134 | **/*.scala
135 |
136 |
137 |
138 |
139 | org.scala-tools
140 | maven-scala-plugin
141 | 2.15.2
142 |
143 |
144 | scala-compile-first
145 | process-resources
146 |
147 | compile
148 |
149 |
150 |
151 | scala-test-compile
152 | process-test-resources
153 |
154 | testCompile
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
--------------------------------------------------------------------------------
/sb-word-count/src/main/java/spark/textmatch/SimHash.java:
--------------------------------------------------------------------------------
1 | package spark.textmatch;
2 |
3 | import java.math.BigInteger;
4 | import java.util.ArrayList;
5 | import java.util.List;
6 | import java.util.StringTokenizer;
7 |
/**
 * SimHash fingerprint of a whitespace-tokenized text.
 *
 * <p>The fingerprint is computed once at construction time. Two fingerprints can
 * be compared with {@link #hammingDistance(SimHash)}, and a fingerprint can be cut
 * into index keys with {@link #subByDistance(SimHash, int)}.
 */
public class SimHash {

    /** Text to fingerprint; split on whitespace by {@link StringTokenizer}. */
    private final String tokens;

    /** Fingerprint as a non-negative integer of at most {@code hashbits} bits. */
    private final BigInteger intSimHash;

    /** Fingerprint as a '0'/'1' string, bit 0 first, exactly {@code hashbits} long. */
    private String strSimHash;

    /** Width of the fingerprint in bits. */
    private final int hashbits;

    /**
     * Builds a 64-bit fingerprint of {@code tokens}.
     *
     * @param tokens whitespace-separated text
     */
    public SimHash(String tokens) {
        this(tokens, 64);
    }

    /**
     * Builds a fingerprint of {@code tokens} with the given width.
     *
     * @param tokens   whitespace-separated text
     * @param hashbits fingerprint width in bits
     */
    public SimHash(String tokens, int hashbits) {
        this.tokens = tokens;
        this.hashbits = hashbits;
        this.intSimHash = this.simHash();
    }

    /**
     * Computes the fingerprint: every token votes +1/-1 on each bit position
     * according to its hash, and a bit is set when its tally is non-negative.
     * Also refreshes the string form and prints it to standard output.
     *
     * @return the fingerprint
     */
    public BigInteger simHash() {
        int[] v = new int[this.hashbits];
        StringTokenizer stringTokens = new StringTokenizer(this.tokens);
        while (stringTokens.hasMoreTokens()) {
            BigInteger t = this.hash(stringTokens.nextToken());
            for (int i = 0; i < this.hashbits; i++) {
                if (t.testBit(i)) {
                    v[i] += 1;
                } else {
                    v[i] -= 1;
                }
            }
        }
        BigInteger fingerprint = BigInteger.ZERO;
        StringBuilder simHashBuffer = new StringBuilder(this.hashbits);
        for (int i = 0; i < this.hashbits; i++) {
            // A tie (including "no tokens at all") counts as a set bit.
            if (v[i] >= 0) {
                fingerprint = fingerprint.setBit(i);
                simHashBuffer.append('1');
            } else {
                simHashBuffer.append('0');
            }
        }
        this.strSimHash = simHashBuffer.toString();
        System.out.println(this.strSimHash + " length " + this.strSimHash.length());
        return fingerprint;
    }

    /**
     * Hashes one token to a non-negative value of at most {@code hashbits} bits
     * (multiply by 1000003, xor each char, mask; finally xor the length).
     */
    private BigInteger hash(String source) {
        if (source == null || source.isEmpty()) {
            return BigInteger.ZERO;
        }
        BigInteger m = BigInteger.valueOf(1000003L);
        BigInteger mask = BigInteger.ONE.shiftLeft(this.hashbits).subtract(BigInteger.ONE);
        BigInteger x = BigInteger.valueOf(((long) source.charAt(0)) << 7);
        for (char item : source.toCharArray()) {
            x = x.multiply(m).xor(BigInteger.valueOf((long) item)).and(mask);
        }
        // x is masked (non-negative) and the length is positive, so the result can
        // never be -1; the former "-1 -> -2" remap was unreachable and is dropped.
        return x.xor(BigInteger.valueOf(source.length()));
    }

    /**
     * Hamming distance between the two fingerprints: the number of set bits in
     * their xor.
     *
     * @param other fingerprint to compare with
     * @return number of differing bit positions
     */
    public int hammingDistance(SimHash other) {
        // Both fingerprints are non-negative, so bitCount() of the xor is exactly
        // the number of 1 bits.
        return this.intSimHash.xor(other.intSimHash).bitCount();
    }

    /**
     * Hamming distance between two strings, used as a cross-check of the binary
     * computation on the '0'/'1' string form.
     *
     * @param str1 the 1st string
     * @param str2 the 2nd string
     * @return number of positions at which the strings differ, or -1 when their
     *         lengths differ
     */
    public int getDistance(String str1, String str2) {
        if (str1.length() != str2.length()) {
            return -1;
        }
        int distance = 0;
        for (int i = 0; i < str1.length(); i++) {
            if (str1.charAt(i) != str2.charAt(i)) {
                distance++;
            }
        }
        return distance;
    }

    /**
     * Cuts the fingerprint of {@code simHash} into {@code distance + 1} equal
     * blocks for use as index keys (e.g. distance 3 gives four blocks).
     *
     * <p>Blocks are taken from bit 0 upwards, each {@code hashbits / (distance + 1)}
     * bits wide, using this instance's {@code hashbits}. Within a block the bits are
     * written lowest-first and the resulting string is parsed as binary, so the
     * lowest fingerprint bit of a block is the highest bit of its value. Bits left
     * over when the width does not divide evenly are ignored.
     *
     * <p>All {@code hashbits} positions are scanned, so leading zero bits in the
     * fingerprint no longer cause trailing blocks to be dropped.
     *
     * @param simHash  fingerprint to split
     * @param distance tolerated Hamming distance; must satisfy
     *                 {@code 0 <= distance < hashbits}
     * @return the block values, lowest block first
     * @throws IllegalArgumentException if {@code distance} is out of range
     */
    public List<BigInteger> subByDistance(SimHash simHash, int distance){
        if (distance < 0 || distance >= this.hashbits) {
            throw new IllegalArgumentException(
                    "distance must be in [0, " + this.hashbits + "): " + distance);
        }
        int numEach = this.hashbits / (distance + 1);
        List<BigInteger> characters = new ArrayList<>(distance + 1);

        StringBuilder buffer = new StringBuilder(numEach);
        for (int i = 0; i < this.hashbits; i++) {
            buffer.append(simHash.intSimHash.testBit(i) ? '1' : '0');

            if ((i + 1) % numEach == 0) {
                BigInteger eachValue = new BigInteger(buffer.toString(), 2);
                System.out.println("----" + eachValue);
                buffer.setLength(0);
                characters.add(eachValue);
            }
        }

        return characters;
    }

    /** Small demo: fingerprints three similar sentences and prints their distances. */
    public static void main(String[] args) {
        String s = "This is a test string for testing";

        SimHash hash1 = new SimHash(s, 64);
        System.out.println(hash1.intSimHash + " " + hash1.intSimHash.bitLength());

        hash1.subByDistance(hash1, 3);

        System.out.println("\n");
        s = "This is a test string for testing, This is a test string for testing abcdef";
        SimHash hash2 = new SimHash(s, 64);
        System.out.println(hash2.intSimHash + " " + hash2.intSimHash.bitCount());
        hash1.subByDistance(hash2, 3);
        s = "This is a test string for testing als";
        SimHash hash3 = new SimHash(s, 64);
        System.out.println(hash3.intSimHash + " " + hash3.intSimHash.bitCount());
        hash1.subByDistance(hash3, 3);
        System.out.println("============================");
        int dis = hash1.getDistance(hash1.strSimHash, hash2.strSimHash);

        System.out.println(hash1.hammingDistance(hash2) + " " + dis);

        int dis2 = hash1.getDistance(hash1.strSimHash, hash3.strSimHash);

        System.out.println(hash1.hammingDistance(hash3) + " " + dis2);
    }
}
--------------------------------------------------------------------------------