├── .cache-main
├── .gitignore
├── README.md
├── inputFile
├── join1
├── join2
├── lr_data.txt
├── product
├── random.data
├── test.data
├── test1
├── test2.data
├── testone.txt
├── u.data
├── user
├── wordCount
└── wordCount2
├── lib
└── test-0.0.1-SNAPSHOT.jar
├── pom.xml
└── src
├── main
└── scala
│ ├── com
│ ├── fun
│ │ └── util
│ │ │ ├── RDDOperateFunction.scala
│ │ │ ├── SparkContextOperateFunction.scala
│ │ │ ├── ZzyLmqDataOperateUtil.scala
│ │ │ └── package.scala
│ ├── spark
│ │ ├── es
│ │ │ ├── SparkLocalESTest.scala
│ │ │ └── Test.scala
│ │ ├── hbase
│ │ │ ├── GetOutSiteSuNingPCToNewTable.scala
│ │ │ ├── PutDataToHbase.scala
│ │ │ ├── SparkGetHbaseToRdd.scala
│ │ │ ├── SparkScanHbaseToRdd.scala
│ │ │ └── hbasetest.scala
│ │ ├── hive
│ │ │ ├── CaseClass.scala
│ │ │ ├── HiveContextTest.scala
│ │ │ ├── SparkPhoenixLoadAndSaveTest.scala
│ │ │ ├── SparkRddToHive.scala
│ │ │ └── SparkToHive.scala
│ │ ├── jdbcrdd
│ │ │ ├── JdbcMysqlRDD.scala
│ │ │ ├── SparkCSVTest.scala
│ │ │ ├── SparkJdbcRDDTest.scala
│ │ │ ├── SparkSecondarySortKey.scala
│ │ │ └── package.scala
│ │ ├── kafka
│ │ │ ├── HashMapEncoder.scala
│ │ │ ├── KafkaProducerCache.scala
│ │ │ ├── RDDKafkaWriter.scala
│ │ │ ├── SparkKafkaRDDReader.scala
│ │ │ ├── SparkWriteDataToKafkaRunMain.scala
│ │ │ └── package.scala
│ │ ├── ml
│ │ │ ├── ALSDemo.scala
│ │ │ ├── ClassifierDemo.scala
│ │ │ └── TestVector.scala
│ │ ├── myrdd
│ │ │ ├── CaseClassUtil.scala
│ │ │ ├── ImplicitParameter.scala
│ │ │ ├── MySelfRDD.scala
│ │ │ ├── TestMain.scala
│ │ │ └── package.scala
│ │ ├── python
│ │ │ └── TestPython.scala
│ │ ├── scala
│ │ │ ├── ImplicitClass.scala
│ │ │ ├── ReflectScala.scala
│ │ │ └── ScalaGramaer.scala
│ │ ├── scalatest
│ │ │ └── ScalaTest.scala
│ │ ├── sparkSql
│ │ │ ├── CaseClassUtil.scala
│ │ │ ├── JavaUseScalaClass.scala
│ │ │ ├── SparkListToDataFrame.scala
│ │ │ └── SparkSQLDemo.scala
│ │ ├── streaming
│ │ │ ├── DataProducter.scala
│ │ │ ├── DirectMysqlInputDStream.scala
│ │ │ ├── JdbcSparkStreamRDD.scala
│ │ │ ├── MapWithStateTest.scala
│ │ │ ├── MysqlManager.scala
│ │ │ ├── SpartStreamingTest.scala
│ │ │ ├── UpdateStateByKeyTest.scala
│ │ │ └── package.scala
│ │ └── util
│ │ │ ├── KafkaClusterManager.scala
│ │ │ ├── SparkKryoRegistrators.scala
│ │ │ └── SparkKryoSerializerTest.scala
│ └── test
│ │ ├── CheckHbaseDataWithMysql.scala
│ │ ├── HbaseUtil.scala
│ │ ├── HttpAsyncClientsTest.scala
│ │ ├── JsonTest.scala
│ │ ├── KafkaLogTest.scala
│ │ ├── ReflectScala.scala
│ │ ├── SparkWithLocalTest.scala
│ │ ├── Test.scala
│ │ ├── TestJava.java
│ │ └── Utilities.scala
│ ├── hdfs-site.xml
│ ├── hive-site.xml
│ └── log4j.properties
└── test
└── scala
└── samples
├── junit.scala
├── scalatest.scala
└── specs.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | # use glob syntax.
2 | syntax: glob
3 | *.ser
4 | *.class
5 | *~
6 | *.bak
7 | #*.off
8 | *.old
9 |
10 | # eclipse conf file
11 | .settings
12 | .classpath
13 | .project
14 | .manager
15 | .scala_dependencies
16 |
17 | # idea
18 | .idea
19 | *.iml
20 |
21 | # building
22 | target
23 | build
24 | null
25 | tmp*
26 | temp*
27 | dist
28 | test-output
29 | build.log
30 |
31 | # other scm
32 | .svn
33 | .CVS
34 | .hg*
35 |
36 | # switch to regexp syntax.
37 | # syntax: regexp
38 | # ^\.pc/
39 |
40 | #SHITTY output not in target directory
41 | build.log
42 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # spark-test
2 | Spark Version Test Code
3 |
--------------------------------------------------------------------------------
/inputFile/join1:
--------------------------------------------------------------------------------
1 | 1 a a a
2 | 2 b b b
3 | 3 c c c
--------------------------------------------------------------------------------
/inputFile/join2:
--------------------------------------------------------------------------------
1 | 1 aa aaa aaaa
2 | 2 bb bbb bbbb
3 | 4 vv vvv vvvv
--------------------------------------------------------------------------------
/inputFile/product:
--------------------------------------------------------------------------------
1 | 1
2 | 2
3 | 3
4 | 4
5 | 5
6 | 6
7 | 7
8 | 8
9 | 9
--------------------------------------------------------------------------------
/inputFile/test.data:
--------------------------------------------------------------------------------
1 | 1,1,5.0
2 | 1,2,1.0
3 | 1,3,5.0
4 | 1,4,1.0
5 | 1,5,4.5
6 | 1,6,0.0
7 | 2,1,5.0
8 | 2,2,1.0
9 | 2,3,5.0
10 | 2,4,1.0
11 | 2,6,0.0
12 | 2,5,0.0
13 | 3,1,1.0
14 | 3,2,5.0
15 | 3,3,1.0
16 | 3,4,5.0
17 | 3,6,0.0
18 | 3,5,0.0
19 | 4,1,1.0
20 | 4,2,5.0
21 | 4,3,1.0
22 | 4,4,5.0
23 | 4,6,3.5
24 | 4,5,0.0
--------------------------------------------------------------------------------
/inputFile/test1:
--------------------------------------------------------------------------------
1 | 20160701,1,1,5000
2 | 20160701,1,2,20
3 | 20160701,1,3,100
4 | 20160701,2,1,2000
5 | 20160701,2,2,2000
6 | 20160701,2,3,2000
7 | 20160701,3,1,4000
8 | 20160701,3,2,3000
9 | 20160701,3,3,2000
10 | 20160701,4,1,1000
11 | 20160701,4,2,1000
12 | 20160701,4,3,1000
13 | 20160701,5,1,500
14 | 20160701,5,2,500
15 | 20160701,5,3,500
16 | 20160701,6,1,5000
17 | 20160701,6,2,5000
18 | 20160701,6,3,5000
19 | 20160702,1,1,5000
20 | 20160702,1,2,20
21 | 20160702,1,3,100
22 | 20160702,2,1,2000
23 | 20160702,2,2,2000
24 | 20160702,2,3,2000
25 | 20160702,3,1,4000
26 | 20160702,3,2,3000
27 | 20160702,3,3,2000
28 | 20160702,4,1,1000
29 | 20160702,4,2,1000
30 | 20160702,4,3,1000
31 | 20160702,5,1,500
32 | 20160702,5,2,500
33 | 20160702,5,3,500
34 | 20160702,6,1,5000
35 | 20160702,6,2,5000
36 | 20160702,6,3,5000
37 | 20160703,1,1,5000
38 | 20160703,1,2,20
39 | 20160703,1,3,100
40 | 20160703,2,1,2000
41 | 20160703,2,2,2000
42 | 20160703,2,3,2000
43 | 20160703,3,1,4000
44 | 20160703,3,2,3000
45 | 20160703,3,3,2000
46 | 20160703,4,1,1000
47 | 20160703,4,2,1000
48 | 20160703,4,3,1000
49 | 20160703,5,1,500
50 | 20160703,5,2,500
51 | 20160703,5,3,500
52 | 20160703,6,1,5000
53 | 20160703,6,2,5000
54 | 20160703,6,3,5000
55 | 20160704,1,1,5000
56 | 20160704,1,2,20
57 | 20160704,1,3,100
58 | 20160704,2,1,2000
59 | 20160704,2,2,2000
60 | 20160704,2,3,2000
61 | 20160704,3,1,4000
62 | 20160704,3,2,3000
63 | 20160704,3,3,2000
64 | 20160704,4,1,1000
65 | 20160704,4,2,1000
66 | 20160704,4,3,1000
67 | 20160704,5,1,500
68 | 20160704,5,2,500
69 | 20160704,5,3,500
70 | 20160704,6,1,5000
71 | 20160704,6,2,5000
72 | 20160704,6,3,5000
73 | 20160704,1,1,5000
74 | 20160704,1,2,20
75 | 20160704,1,3,100
76 | 20160704,2,1,2000
77 | 20160704,2,2,2000
78 | 20160704,2,3,2000
79 | 20160704,3,1,4000
80 | 20160704,3,2,3000
81 | 20160704,3,3,2000
82 | 20160704,4,1,1000
83 | 20160704,4,2,1000
84 | 20160704,4,3,1000
85 | 20160704,5,1,500
86 | 20160704,5,2,500
87 | 20160704,5,3,500
88 | 20160704,6,1,5000
89 | 20160704,6,2,5000
90 | 20160704,6,3,5000
91 | 20160705,1,1,5000
92 | 20160705,1,2,20
93 | 20160705,1,3,100
94 | 20160705,2,1,2000
95 | 20160705,2,2,2000
96 | 20160705,2,3,2000
97 | 20160705,3,1,4000
98 | 20160705,3,2,3000
99 | 20160705,3,3,2000
100 | 20160705,4,1,1000
101 | 20160705,4,2,1000
102 | 20160705,4,3,1000
103 | 20160705,5,1,500
104 | 20160705,5,2,500
105 | 20160705,5,3,500
106 | 20160705,6,1,5000
107 | 20160705,6,2,5000
108 | 20160705,6,3,5000
109 | 20160705,1,1,5000
110 | 20160705,1,2,20
111 | 20160705,1,3,100
112 | 20160705,2,1,2000
113 | 20160705,2,2,2000
114 | 20160705,2,3,2000
115 | 20160705,3,1,4000
116 | 20160705,3,2,3000
117 | 20160705,3,3,2000
118 | 20160705,4,1,1000
119 | 20160705,4,2,1000
120 | 20160705,4,3,1000
121 | 20160705,5,1,500
122 | 20160705,5,2,500
123 | 20160705,5,3,500
124 | 20160705,6,1,5000
125 | 20160705,6,2,5000
126 | 20160705,6,3,5000
127 | 20160706,1,1,5000
128 | 20160706,1,2,20
129 | 20160706,1,3,100
130 | 20160706,2,1,2000
131 | 20160706,2,2,2000
132 | 20160706,2,3,2000
133 | 20160706,3,1,4000
134 | 20160706,3,2,3000
135 | 20160706,3,3,2000
136 | 20160706,4,1,1000
137 | 20160706,4,2,1000
138 | 20160706,4,3,1000
139 | 20160706,5,1,500
140 | 20160706,5,2,500
141 | 20160706,5,3,500
142 | 20160706,6,1,5000
143 | 20160706,6,2,5000
144 | 20160706,6,3,5000
145 | 20160707,1,1,5000
146 | 20160707,1,2,20
147 | 20160707,1,3,100
148 | 20160707,2,1,2000
149 | 20160707,2,2,2000
150 | 20160707,2,3,2000
151 | 20160707,3,1,4000
152 | 20160707,3,2,3000
153 | 20160707,3,3,2000
154 | 20160707,4,1,1000
155 | 20160707,4,2,1000
156 | 20160707,4,3,1000
157 | 20160707,5,1,500
158 | 20160707,5,2,500
159 | 20160707,5,3,500
160 | 20160707,6,1,5000
161 | 20160707,6,2,5000
162 | 20160707,6,3,5000
163 | 20160708,1,1,5000
164 | 20160708,1,2,20
165 | 20160708,1,3,100
166 | 20160708,2,1,2000
167 | 20160708,2,2,2000
168 | 20160708,2,3,2000
169 | 20160708,3,1,4000
170 | 20160708,3,2,3000
171 | 20160708,3,3,2000
172 | 20160708,4,1,1000
173 | 20160708,4,2,1000
174 | 20160708,4,3,1000
175 | 20160708,5,1,500
176 | 20160708,5,2,500
177 | 20160708,5,3,500
178 | 20160708,6,1,5000
179 | 20160708,6,2,5000
180 | 20160708,6,3,5000
181 | 20160709,1,1,5000
182 | 20160709,1,2,20
183 | 20160709,1,3,100
184 | 20160709,2,1,2000
185 | 20160709,2,2,2000
186 | 20160709,2,3,2000
187 | 20160709,3,1,4000
188 | 20160709,3,2,3000
189 | 20160709,3,3,2000
190 | 20160709,4,1,1000
191 | 20160709,4,2,1000
192 | 20160709,4,3,1000
193 | 20160709,5,1,500
194 | 20160709,5,2,500
195 | 20160709,5,3,500
196 | 20160709,6,1,5000
197 | 20160709,6,2,5000
198 | 20160709,6,3,5000
199 | 20160710,1,1,5000
200 | 20160710,1,2,20
201 | 20160710,1,3,100
202 | 20160710,2,1,2000
203 | 20160710,2,2,2000
204 | 20160710,2,3,2000
205 | 20160710,3,1,4000
206 | 20160710,3,2,3000
207 | 20160710,3,3,2000
208 | 20160710,4,1,1000
209 | 20160710,4,2,1000
210 | 20160710,4,3,1000
211 | 20160710,5,1,500
212 | 20160710,5,2,500
213 | 20160710,5,3,500
214 | 20160710,6,1,5000
215 | 20160710,6,2,5000
216 | 20160710,6,3,5000
217 | 20160711,1,1,5000
218 | 20160711,1,2,20
219 | 20160711,1,3,100
220 | 20160711,2,1,2000
221 | 20160711,2,2,2000
222 | 20160711,2,3,2000
223 | 20160711,3,1,4000
224 | 20160711,3,2,3000
225 | 20160711,3,3,2000
226 | 20160711,4,1,1000
227 | 20160711,4,2,1000
228 | 20160711,4,3,1000
229 | 20160711,5,1,500
230 | 20160711,5,2,500
231 | 20160711,5,3,500
232 | 20160711,6,1,5000
233 | 20160711,6,2,5000
234 | 20160711,6,3,5000
235 | 20160712,1,1,5000
236 | 20160712,1,2,20
237 | 20160712,1,3,100
238 | 20160712,2,1,2000
239 | 20160712,2,2,2000
240 | 20160712,2,3,2000
241 | 20160712,3,1,4000
242 | 20160712,3,2,3000
243 | 20160712,3,3,2000
244 | 20160712,4,1,1000
245 | 20160712,4,2,1000
246 | 20160712,4,3,1000
247 | 20160712,5,1,500
248 | 20160712,5,2,500
249 | 20160712,5,3,500
250 | 20160712,6,1,5000
251 | 20160712,6,2,5000
252 | 20160712,6,3,5000
253 | 20160713,1,1,5000
254 | 20160713,1,2,20
255 | 20160713,1,3,100
256 | 20160713,2,1,2000
257 | 20160713,2,2,2000
258 | 20160713,2,3,2000
259 | 20160713,3,1,4000
260 | 20160713,3,2,3000
261 | 20160713,3,3,2000
262 | 20160713,4,1,1000
263 | 20160713,4,2,1000
264 | 20160713,4,3,1000
265 | 20160713,5,1,500
266 | 20160713,5,2,500
267 | 20160713,5,3,500
268 | 20160713,6,1,5000
269 | 20160713,6,2,5000
270 | 20160713,6,3,5000
271 | 20160714,1,1,5000
272 | 20160714,1,2,20
273 | 20160714,1,3,100
274 | 20160714,2,1,2000
275 | 20160714,2,2,2000
276 | 20160714,2,3,2000
277 | 20160714,3,1,4000
278 | 20160714,3,2,3000
279 | 20160714,3,3,2000
280 | 20160714,4,1,1000
281 | 20160714,4,2,1000
282 | 20160714,4,3,1000
283 | 20160714,5,1,500
284 | 20160714,5,2,500
285 | 20160714,5,3,500
286 | 20160714,6,1,5000
287 | 20160714,6,2,5000
288 | 20160714,6,3,5000
289 | 20160715,1,1,5000
290 | 20160715,1,2,20
291 | 20160715,1,3,100
292 | 20160715,2,1,2000
293 | 20160715,2,2,2000
294 | 20160715,2,3,2000
295 | 20160715,3,1,4000
296 | 20160715,3,2,3000
297 | 20160715,3,3,2000
298 | 20160715,4,1,1000
299 | 20160715,4,2,1000
300 | 20160715,4,3,1000
301 | 20160715,5,1,500
302 | 20160715,5,2,500
303 | 20160715,5,3,500
304 | 20160715,6,1,5000
305 | 20160715,6,2,5000
306 | 20160715,6,3,5000
307 | 20160716,1,1,5000
308 | 20160716,1,2,20
309 | 20160716,1,3,100
310 | 20160716,2,1,2000
311 | 20160716,2,2,2000
312 | 20160716,2,3,2000
313 | 20160716,3,1,4000
314 | 20160716,3,2,3000
315 | 20160716,3,3,2000
316 | 20160716,4,1,1000
317 | 20160716,4,2,1000
318 | 20160716,4,3,1000
319 | 20160716,5,1,500
320 | 20160716,5,2,500
321 | 20160716,5,3,500
322 | 20160716,6,1,5000
323 | 20160716,6,2,5000
324 | 20160716,6,3,5000
325 | 20160717,1,1,5000
326 | 20160717,1,2,20
327 | 20160717,1,3,100
328 | 20160717,2,1,2000
329 | 20160717,2,2,2000
330 | 20160717,2,3,2000
331 | 20160717,3,1,4000
332 | 20160717,3,2,3000
333 | 20160717,3,3,2000
334 | 20160717,4,1,1000
335 | 20160717,4,2,1000
336 | 20160717,4,3,1000
337 | 20160717,5,1,500
338 | 20160717,5,2,500
339 | 20160717,5,3,500
340 | 20160717,6,1,5000
341 | 20160717,6,2,5000
342 | 20160717,6,3,5000
343 | 20160718,1,1,5000
344 | 20160718,1,2,20
345 | 20160718,1,3,100
346 | 20160718,2,1,2000
347 | 20160718,2,2,2000
348 | 20160718,2,3,2000
349 | 20160718,3,1,4000
350 | 20160718,3,2,3000
351 | 20160718,3,3,2000
352 | 20160718,4,1,1000
353 | 20160718,4,2,1000
354 | 20160718,4,3,1000
355 | 20160718,5,1,500
356 | 20160718,5,2,500
357 | 20160718,5,3,500
358 | 20160718,6,1,5000
359 | 20160718,6,2,5000
360 | 20160718,6,3,5000
361 | 20160719,1,1,5000
362 | 20160719,1,2,20
363 | 20160719,1,3,100
364 | 20160719,2,1,2000
365 | 20160719,2,2,2000
366 | 20160719,2,3,2000
367 | 20160719,3,1,4000
368 | 20160719,3,2,3000
369 | 20160719,3,3,2000
370 | 20160719,4,1,1000
371 | 20160719,4,2,1000
372 | 20160719,4,3,1000
373 | 20160719,5,1,500
374 | 20160719,5,2,500
375 | 20160719,5,3,500
376 | 20160719,6,1,5000
377 | 20160719,6,2,5000
378 | 20160719,6,3,5000
379 | 20160720,1,1,5000
380 | 20160720,1,2,20
381 | 20160720,1,3,100
382 | 20160720,2,1,2000
383 | 20160720,2,2,2000
384 | 20160720,2,3,2000
385 | 20160720,3,1,4000
386 | 20160720,3,2,3000
387 | 20160720,3,3,2000
388 | 20160720,4,1,1000
389 | 20160720,4,2,1000
390 | 20160720,4,3,1000
391 | 20160720,5,1,500
392 | 20160720,5,2,500
393 | 20160720,5,3,500
394 | 20160720,6,1,5000
395 | 20160720,6,2,5000
396 | 20160720,6,3,5000
397 | 20160721,1,1,5000
398 | 20160721,1,2,20
399 | 20160721,1,3,100
400 | 20160721,2,1,2000
401 | 20160721,2,2,2000
402 | 20160721,2,3,2000
403 | 20160721,3,1,4000
404 | 20160721,3,2,3000
405 | 20160721,3,3,2000
406 | 20160721,4,1,1000
407 | 20160721,4,2,1000
408 | 20160721,4,3,1000
409 | 20160721,5,1,500
410 | 20160721,5,2,500
411 | 20160721,5,3,500
412 | 20160721,6,1,5000
413 | 20160721,6,2,5000
414 | 20160721,6,3,5000
415 | 20160722,1,1,5000
416 | 20160722,1,2,20
417 | 20160722,1,3,100
418 | 20160722,2,1,2000
419 | 20160722,2,2,2000
420 | 20160722,2,3,2000
421 | 20160722,3,1,4000
422 | 20160722,3,2,3000
423 | 20160722,3,3,2000
424 | 20160722,4,1,1000
425 | 20160722,4,2,1000
426 | 20160722,4,3,1000
427 | 20160722,5,1,500
428 | 20160722,5,2,500
429 | 20160722,5,3,500
430 | 20160722,6,1,5000
431 | 20160722,6,2,5000
432 | 20160722,6,3,5000
433 | 20160723,1,1,5000
434 | 20160723,1,2,20
435 | 20160723,1,3,100
436 | 20160723,2,1,2000
437 | 20160723,2,2,2000
438 | 20160723,2,3,2000
439 | 20160723,3,1,4000
440 | 20160723,3,2,3000
441 | 20160723,3,3,2000
442 | 20160723,4,1,1000
443 | 20160723,4,2,1000
444 | 20160723,4,3,1000
445 | 20160723,5,1,500
446 | 20160723,5,2,500
447 | 20160723,5,3,500
448 | 20160723,6,1,5000
449 | 20160723,6,2,5000
450 | 20160723,6,3,5000
451 | 20160724,1,1,5000
452 | 20160724,1,2,20
453 | 20160724,1,3,100
454 | 20160724,2,1,2000
455 | 20160724,2,2,2000
456 | 20160724,2,3,2000
457 | 20160724,3,1,4000
458 | 20160724,3,2,3000
459 | 20160724,3,3,2000
460 | 20160724,4,1,1000
461 | 20160724,4,2,1000
462 | 20160724,4,3,1000
463 | 20160724,5,1,500
464 | 20160724,5,2,500
465 | 20160724,5,3,500
466 | 20160724,6,1,5000
467 | 20160724,6,2,5000
468 | 20160724,6,3,5000
469 | 20160725,1,1,5000
470 | 20160725,1,2,20
471 | 20160725,1,3,100
472 | 20160725,2,1,2000
473 | 20160725,2,2,2000
474 | 20160725,2,3,2000
475 | 20160725,3,1,4000
476 | 20160725,3,2,3000
477 | 20160725,3,3,2000
478 | 20160725,4,1,1000
479 | 20160725,4,2,1000
480 | 20160725,4,3,1000
481 | 20160725,5,1,500
482 | 20160725,5,2,500
483 | 20160725,5,3,500
484 | 20160725,6,1,5000
485 | 20160725,6,2,5000
486 | 20160725,6,3,5000
487 | 20160726,1,1,5000
488 | 20160726,1,2,20
489 | 20160726,1,3,100
490 | 20160726,2,1,2000
491 | 20160726,2,2,2000
492 | 20160726,2,3,2000
493 | 20160726,3,1,4000
494 | 20160726,3,2,3000
495 | 20160726,3,3,2000
496 | 20160726,4,1,1000
497 | 20160726,4,2,1000
498 | 20160726,4,3,1000
499 | 20160726,5,1,500
500 | 20160726,5,2,500
501 | 20160726,5,3,500
502 | 20160726,6,1,5000
503 | 20160726,6,2,5000
504 | 20160726,6,3,5000
505 | 20160727,1,1,5000
506 | 20160727,1,2,20
507 | 20160727,1,3,100
508 | 20160727,2,1,2000
509 | 20160727,2,2,2000
510 | 20160727,2,3,2000
511 | 20160727,3,1,4000
512 | 20160727,3,2,3000
513 | 20160727,3,3,2000
514 | 20160727,4,1,1000
515 | 20160727,4,2,1000
516 | 20160727,4,3,1000
517 | 20160727,5,1,500
518 | 20160727,5,2,500
519 | 20160727,5,3,500
520 | 20160727,6,1,5000
521 | 20160727,6,2,5000
522 | 20160727,6,3,5000
523 | 20160728,1,1,5000
524 | 20160728,1,2,20
525 | 20160728,1,3,100
526 | 20160728,2,1,2000
527 | 20160728,2,2,2000
528 | 20160728,2,3,2000
529 | 20160728,3,1,4000
530 | 20160728,3,2,3000
531 | 20160728,3,3,2000
532 | 20160728,4,1,1000
533 | 20160728,4,2,1000
534 | 20160728,4,3,1000
535 | 20160728,5,1,500
536 | 20160728,5,2,500
537 | 20160728,5,3,500
538 | 20160728,6,1,5000
539 | 20160728,6,2,5000
540 | 20160728,6,3,5000
541 | 20160729,1,1,5000
542 | 20160729,1,2,20
543 | 20160729,1,3,100
544 | 20160729,2,1,2000
545 | 20160729,2,2,2000
546 | 20160729,2,3,2000
547 | 20160729,3,1,4000
548 | 20160729,3,2,3000
549 | 20160729,3,3,2000
550 | 20160729,4,1,1000
551 | 20160729,4,2,1000
552 | 20160729,4,3,1000
553 | 20160729,5,1,500
554 | 20160729,5,2,500
555 | 20160729,5,3,500
556 | 20160729,6,1,5000
557 | 20160729,6,2,5000
558 | 20160729,6,3,5000
559 | 20160730,1,1,5000
560 | 20160730,1,2,20
561 | 20160730,1,3,100
562 | 20160730,2,1,2000
563 | 20160730,2,2,2000
564 | 20160730,2,3,2000
565 | 20160730,3,1,4000
566 | 20160730,3,2,3000
567 | 20160730,3,3,2000
568 | 20160730,4,1,1000
569 | 20160730,4,2,1000
570 | 20160730,4,3,1000
571 | 20160730,5,1,500
572 | 20160730,5,2,500
573 | 20160730,5,3,500
574 | 20160730,6,1,5000
575 | 20160730,6,2,5000
576 | 20160730,6,3,5000
577 | 20160731,1,1,5000
578 | 20160731,1,2,20
579 | 20160731,1,3,100
580 | 20160731,2,1,2000
581 | 20160731,2,2,2000
582 | 20160731,2,3,2000
583 | 20160731,3,1,4000
584 | 20160731,3,2,3000
585 | 20160731,3,3,2000
586 | 20160731,4,1,1000
587 | 20160731,4,2,1000
588 | 20160731,4,3,1000
589 | 20160731,5,1,500
590 | 20160731,5,2,500
591 | 20160731,5,3,500
592 | 20160731,6,1,5000
593 | 20160731,6,2,5000
594 | 20160731,6,3,5000
595 |
--------------------------------------------------------------------------------
/inputFile/test2.data:
--------------------------------------------------------------------------------
1 | 1,1,5.0
2 | 1,2,1.0
3 | 1,3,5.0
4 | 1,4,1.0
5 | 1,7,2.7
6 | 2,1,5.0
7 | 2,2,1.0
8 | 2,3,5.0
9 | 2,4,1.0
10 | 3,1,1.0
11 | 3,2,5.0
12 | 3,3,1.0
13 | 3,4,5.0
14 | 4,1,1.0
15 | 4,2,5.0
16 | 4,3,1.0
17 | 4,4,5.0
18 | 4,9,4.5
19 | 4,6,1.0
20 | 4,5,3.3
--------------------------------------------------------------------------------
/inputFile/testone.txt:
--------------------------------------------------------------------------------
1 | 20160701,1,1,5000
2 | 20160701,1,2,20
3 | 20160701,1,3,100
4 | 20160701,2,1,2000
5 | 20160701,2,2,2000
6 | 20160701,2,3,2000
7 | 20160701,3,1,4000
8 | 20160701,3,2,3000
9 | 20160701,3,3,2000
10 | 20160701,4,1,1000
11 | 20160701,4,2,1000
12 | 20160701,4,3,1000
13 | 20160701,5,1,500
14 | 20160701,5,2,500
15 | 20160701,5,3,500
16 | 20160701,6,1,5000
17 | 20160701,6,2,5000
18 | 20160701,6,3,5000
19 | 20160702,1,1,5000
20 | 20160702,1,2,20
21 | 20160702,1,3,100
22 | 20160702,2,1,2000
23 | 20160702,2,2,2000
24 | 20160702,2,3,2000
25 | 20160702,3,1,4000
26 | 20160702,3,2,3000
27 | 20160702,3,3,2000
28 | 20160702,4,1,1000
29 | 20160702,4,2,1000
30 | 20160702,4,3,1000
31 | 20160702,5,1,500
32 | 20160702,5,2,500
33 | 20160702,5,3,500
34 | 20160702,6,1,5000
35 | 20160702,6,2,5000
36 | 20160702,6,3,5000
37 | 20160703,1,1,5000
38 | 20160703,1,2,20
39 | 20160703,1,3,100
40 | 20160703,2,1,2000
41 | 20160703,2,2,2000
42 | 20160703,2,3,2000
43 | 20160703,3,1,4000
44 | 20160703,3,2,3000
45 | 20160703,3,3,2000
46 | 20160703,4,1,1000
47 | 20160703,4,2,1000
48 | 20160703,4,3,1000
49 | 20160703,5,1,500
50 | 20160703,5,2,500
51 | 20160703,5,3,500
52 | 20160703,6,1,5000
53 | 20160703,6,2,5000
54 | 20160703,6,3,5000
55 | 20160704,1,1,5000
56 | 20160704,1,2,20
57 | 20160704,1,3,100
58 | 20160704,2,1,2000
59 | 20160704,2,2,2000
60 | 20160704,2,3,2000
61 | 20160704,3,1,4000
62 | 20160704,3,2,3000
63 | 20160704,3,3,2000
64 | 20160704,4,1,1000
65 | 20160704,4,2,1000
66 | 20160704,4,3,1000
67 | 20160704,5,1,500
68 | 20160704,5,2,500
69 | 20160704,5,3,500
70 | 20160704,6,1,5000
71 | 20160704,6,2,5000
72 | 20160704,6,3,5000
73 | 20160705,1,1,5000
74 | 20160705,1,2,20
75 | 20160705,1,3,100
76 | 20160705,2,1,2000
77 | 20160705,2,2,2000
78 | 20160705,2,3,2000
79 | 20160705,3,1,4000
80 | 20160705,3,2,3000
81 | 20160705,3,3,2000
82 | 20160705,4,1,1000
83 | 20160705,4,2,1000
84 | 20160705,4,3,1000
85 | 20160705,5,1,500
86 | 20160705,5,2,500
87 | 20160705,5,3,500
88 | 20160705,6,1,5000
89 | 20160705,6,2,5000
90 | 20160705,6,3,5000
91 | 20160706,1,1,5000
92 | 20160706,1,2,20
93 | 20160706,1,3,100
94 | 20160706,2,1,2000
95 | 20160706,2,2,2000
96 | 20160706,2,3,2000
97 | 20160706,3,1,4000
98 | 20160706,3,2,3000
99 | 20160706,3,3,2000
100 | 20160706,4,1,1000
101 | 20160706,4,2,1000
102 | 20160706,4,3,1000
103 | 20160706,5,1,500
104 | 20160706,5,2,500
105 | 20160706,5,3,500
106 | 20160706,6,1,5000
107 | 20160706,6,2,5000
108 | 20160706,6,3,5000
109 | 20160707,1,1,5000
110 | 20160707,1,2,20
111 | 20160707,1,3,100
112 | 20160707,2,1,2000
113 | 20160707,2,2,2000
114 | 20160707,2,3,2000
115 | 20160707,3,1,4000
116 | 20160707,3,2,3000
117 | 20160707,3,3,2000
118 | 20160707,4,1,1000
119 | 20160707,4,2,1000
120 | 20160707,4,3,1000
121 | 20160707,5,1,500
122 | 20160707,5,2,500
123 | 20160707,5,3,500
124 | 20160707,6,1,5000
125 | 20160707,6,2,5000
126 | 20160707,6,3,5000
127 | 20160708,1,1,5000
128 | 20160708,1,2,20
129 | 20160708,1,3,100
130 | 20160708,2,1,2000
131 | 20160708,2,2,2000
132 | 20160708,2,3,2000
133 | 20160708,3,1,4000
134 | 20160708,3,2,3000
135 | 20160708,3,3,2000
136 | 20160708,4,1,1000
137 | 20160708,4,2,1000
138 | 20160708,4,3,1000
139 | 20160708,5,1,500
140 | 20160708,5,2,500
141 | 20160708,5,3,500
142 | 20160708,6,1,5000
143 | 20160708,6,2,5000
144 | 20160708,6,3,5000
145 | 20160709,1,1,5000
146 | 20160709,1,2,20
147 | 20160709,1,3,100
148 | 20160709,2,1,2000
149 | 20160709,2,2,2000
150 | 20160709,2,3,2000
151 | 20160709,3,1,4000
152 | 20160709,3,2,3000
153 | 20160709,3,3,2000
154 | 20160709,4,1,1000
155 | 20160709,4,2,1000
156 | 20160709,4,3,1000
157 | 20160709,5,1,500
158 | 20160709,5,2,500
159 | 20160709,5,3,500
160 | 20160709,6,1,5000
161 | 20160709,6,2,5000
162 | 20160709,6,3,5000
163 | 20160710,1,1,5000
164 | 20160710,1,2,20
165 | 20160710,1,3,100
166 | 20160710,2,1,2000
167 | 20160710,2,2,2000
168 | 20160710,2,3,2000
169 | 20160710,3,1,4000
170 | 20160710,3,2,3000
171 | 20160710,3,3,2000
172 | 20160710,4,1,1000
173 | 20160710,4,2,1000
174 | 20160710,4,3,1000
175 | 20160710,5,1,500
176 | 20160710,5,2,500
177 | 20160710,5,3,500
178 | 20160710,6,1,5000
179 | 20160710,6,2,5000
180 | 20160710,6,3,5000
181 | 20160711,1,1,5000
182 | 20160711,1,2,20
183 | 20160711,1,3,100
184 | 20160711,2,1,2000
185 | 20160711,2,2,2000
186 | 20160711,2,3,2000
187 | 20160711,3,1,4000
188 | 20160711,3,2,3000
189 | 20160711,3,3,2000
190 | 20160711,4,1,1000
191 | 20160711,4,2,1000
192 | 20160711,4,3,1000
193 | 20160711,5,1,500
194 | 20160711,5,2,500
195 | 20160711,5,3,500
196 | 20160711,6,1,5000
197 | 20160711,6,2,5000
198 | 20160711,6,3,5000
199 | 20160712,1,1,5000
200 | 20160712,1,2,20
201 | 20160712,1,3,100
202 | 20160712,2,1,2000
203 | 20160712,2,2,2000
204 | 20160712,2,3,2000
205 | 20160712,3,1,4000
206 | 20160712,3,2,3000
207 | 20160712,3,3,2000
208 | 20160712,4,1,1000
209 | 20160712,4,2,1000
210 | 20160712,4,3,1000
211 | 20160712,5,1,500
212 | 20160712,5,2,500
213 | 20160712,5,3,500
214 | 20160712,6,1,5000
215 | 20160712,6,2,5000
216 | 20160712,6,3,5000
217 | 20160713,1,1,5000
218 | 20160713,1,2,20
219 | 20160713,1,3,100
220 | 20160713,2,1,2000
221 | 20160713,2,2,2000
222 | 20160713,2,3,2000
223 | 20160713,3,1,4000
224 | 20160713,3,2,3000
225 | 20160713,3,3,2000
226 | 20160713,4,1,1000
227 | 20160713,4,2,1000
228 | 20160713,4,3,1000
229 | 20160713,5,1,500
230 | 20160713,5,2,500
231 | 20160713,5,3,500
232 | 20160713,6,1,5000
233 | 20160713,6,2,5000
234 | 20160713,6,3,5000
235 | 20160714,1,1,5000
236 | 20160714,1,2,20
237 | 20160714,1,3,100
238 | 20160714,2,1,2000
239 | 20160714,2,2,2000
240 | 20160714,2,3,2000
241 | 20160714,3,1,4000
242 | 20160714,3,2,3000
243 | 20160714,3,3,2000
244 | 20160714,4,1,1000
245 | 20160714,4,2,1000
246 | 20160714,4,3,1000
247 | 20160714,5,1,500
248 | 20160714,5,2,500
249 | 20160714,5,3,500
250 | 20160714,6,1,5000
251 | 20160714,6,2,5000
252 | 20160714,6,3,5000
253 | 20160715,1,1,5000
254 | 20160715,1,2,20
255 | 20160715,1,3,100
256 | 20160715,2,1,2000
257 | 20160715,2,2,2000
258 | 20160715,2,3,2000
259 | 20160715,3,1,4000
260 | 20160715,3,2,3000
261 | 20160715,3,3,2000
262 | 20160715,4,1,1000
263 | 20160715,4,2,1000
264 | 20160715,4,3,1000
265 | 20160715,5,1,500
266 | 20160715,5,2,500
267 | 20160715,5,3,500
268 | 20160715,6,1,5000
269 | 20160715,6,2,5000
270 | 20160715,6,3,5000
271 | 20160716,1,1,5000
272 | 20160716,1,2,20
273 | 20160716,1,3,100
274 | 20160716,2,1,2000
275 | 20160716,2,2,2000
276 | 20160716,2,3,2000
277 | 20160716,3,1,4000
278 | 20160716,3,2,3000
279 | 20160716,3,3,2000
280 | 20160716,4,1,1000
281 | 20160716,4,2,1000
282 | 20160716,4,3,1000
283 | 20160716,5,1,500
284 | 20160716,5,2,500
285 | 20160716,5,3,500
286 | 20160716,6,1,5000
287 | 20160716,6,2,5000
288 | 20160716,6,3,5000
289 | 20160717,1,1,5000
290 | 20160717,1,2,20
291 | 20160717,1,3,100
292 | 20160717,2,1,2000
293 | 20160717,2,2,2000
294 | 20160717,2,3,2000
295 | 20160717,3,1,4000
296 | 20160717,3,2,3000
297 | 20160717,3,3,2000
298 | 20160717,4,1,1000
299 | 20160717,4,2,1000
300 | 20160717,4,3,1000
301 | 20160717,5,1,500
302 | 20160717,5,2,500
303 | 20160717,5,3,500
304 | 20160717,6,1,5000
305 | 20160717,6,2,5000
306 | 20160717,6,3,5000
307 | 20160718,1,1,5000
308 | 20160718,1,2,20
309 | 20160718,1,3,100
310 | 20160718,2,1,2000
311 | 20160718,2,2,2000
312 | 20160718,2,3,2000
313 | 20160718,3,1,4000
314 | 20160718,3,2,3000
315 | 20160718,3,3,2000
316 | 20160718,4,1,1000
317 | 20160718,4,2,1000
318 | 20160718,4,3,1000
319 | 20160718,5,1,500
320 | 20160718,5,2,500
321 | 20160718,5,3,500
322 | 20160718,6,1,5000
323 | 20160718,6,2,5000
324 | 20160718,6,3,5000
325 | 20160719,1,1,5000
326 | 20160719,1,2,20
327 | 20160719,1,3,100
328 | 20160719,2,1,2000
329 | 20160719,2,2,2000
330 | 20160719,2,3,2000
331 | 20160719,3,1,4000
332 | 20160719,3,2,3000
333 | 20160719,3,3,2000
334 | 20160719,4,1,1000
335 | 20160719,4,2,1000
336 | 20160719,4,3,1000
337 | 20160719,5,1,500
338 | 20160719,5,2,500
339 | 20160719,5,3,500
340 | 20160719,6,1,5000
341 | 20160719,6,2,5000
342 | 20160719,6,3,5000
343 | 20160720,1,1,5000
344 | 20160720,1,2,20
345 | 20160720,1,3,100
346 | 20160720,2,1,2000
347 | 20160720,2,2,2000
348 | 20160720,2,3,2000
349 | 20160720,3,1,4000
350 | 20160720,3,2,3000
351 | 20160720,3,3,2000
352 | 20160720,4,1,1000
353 | 20160720,4,2,1000
354 | 20160720,4,3,1000
355 | 20160720,5,1,500
356 | 20160720,5,2,500
357 | 20160720,5,3,500
358 | 20160720,6,1,5000
359 | 20160720,6,2,5000
360 | 20160720,6,3,5000
361 | 20160721,1,1,5000
362 | 20160721,1,2,20
363 | 20160721,1,3,100
364 | 20160721,2,1,2000
365 | 20160721,2,2,2000
366 | 20160721,2,3,2000
367 | 20160721,3,1,4000
368 | 20160721,3,2,3000
369 | 20160721,3,3,2000
370 | 20160721,4,1,1000
371 | 20160721,4,2,1000
372 | 20160721,4,3,1000
373 | 20160721,5,1,500
374 | 20160721,5,2,500
375 | 20160721,5,3,500
376 | 20160721,6,1,5000
377 | 20160721,6,2,5000
378 | 20160721,6,3,5000
379 | 20160722,1,1,5000
380 | 20160722,1,2,20
381 | 20160722,1,3,100
382 | 20160722,2,1,2000
383 | 20160722,2,2,2000
384 | 20160722,2,3,2000
385 | 20160722,3,1,4000
386 | 20160722,3,2,3000
387 | 20160722,3,3,2000
388 | 20160722,4,1,1000
389 | 20160722,4,2,1000
390 | 20160722,4,3,1000
391 | 20160722,5,1,500
392 | 20160722,5,2,500
393 | 20160722,5,3,500
394 | 20160722,6,1,5000
395 | 20160722,6,2,5000
396 | 20160722,6,3,5000
397 | 20160723,1,1,5000
398 | 20160723,1,2,20
399 | 20160723,1,3,100
400 | 20160723,2,1,2000
401 | 20160723,2,2,2000
402 | 20160723,2,3,2000
403 | 20160723,3,1,4000
404 | 20160723,3,2,3000
405 | 20160723,3,3,2000
406 | 20160723,4,1,1000
407 | 20160723,4,2,1000
408 | 20160723,4,3,1000
409 | 20160723,5,1,500
410 | 20160723,5,2,500
411 | 20160723,5,3,500
412 | 20160723,6,1,5000
413 | 20160723,6,2,5000
414 | 20160723,6,3,5000
415 | 20160724,1,1,5000
416 | 20160724,1,2,20
417 | 20160724,1,3,100
418 | 20160724,2,1,2000
419 | 20160724,2,2,2000
420 | 20160724,2,3,2000
421 | 20160724,3,1,4000
422 | 20160724,3,2,3000
423 | 20160724,3,3,2000
424 | 20160724,4,1,1000
425 | 20160724,4,2,1000
426 | 20160724,4,3,1000
427 | 20160724,5,1,500
428 | 20160724,5,2,500
429 | 20160724,5,3,500
430 | 20160724,6,1,5000
431 | 20160724,6,2,5000
432 | 20160724,6,3,5000
433 | 20160725,1,1,5000
434 | 20160725,1,2,20
435 | 20160725,1,3,100
436 | 20160725,2,1,2000
437 | 20160725,2,2,2000
438 | 20160725,2,3,2000
439 | 20160725,3,1,4000
440 | 20160725,3,2,3000
441 | 20160725,3,3,2000
442 | 20160725,4,1,1000
443 | 20160725,4,2,1000
444 | 20160725,4,3,1000
445 | 20160725,5,1,500
446 | 20160725,5,2,500
447 | 20160725,5,3,500
448 | 20160725,6,1,5000
449 | 20160725,6,2,5000
450 | 20160725,6,3,5000
451 | 20160726,1,1,5000
452 | 20160726,1,2,20
453 | 20160726,1,3,100
454 | 20160726,2,1,2000
455 | 20160726,2,2,2000
456 | 20160726,2,3,2000
457 | 20160726,3,1,4000
458 | 20160726,3,2,3000
459 | 20160726,3,3,2000
460 | 20160726,4,1,1000
461 | 20160726,4,2,1000
462 | 20160726,4,3,1000
463 | 20160726,5,1,500
464 | 20160726,5,2,500
465 | 20160726,5,3,500
466 | 20160726,6,1,5000
467 | 20160726,6,2,5000
468 | 20160726,6,3,5000
469 | 20160727,1,1,5000
470 | 20160727,1,2,20
471 | 20160727,1,3,100
472 | 20160727,2,1,2000
473 | 20160727,2,2,2000
474 | 20160727,2,3,2000
475 | 20160727,3,1,4000
476 | 20160727,3,2,3000
477 | 20160727,3,3,2000
478 | 20160727,4,1,1000
479 | 20160727,4,2,1000
480 | 20160727,4,3,1000
481 | 20160727,5,1,500
482 | 20160727,5,2,500
483 | 20160727,5,3,500
484 | 20160727,6,1,5000
485 | 20160727,6,2,5000
486 | 20160727,6,3,5000
487 | 20160728,1,1,5000
488 | 20160728,1,2,20
489 | 20160728,1,3,100
490 | 20160728,2,1,2000
491 | 20160728,2,2,2000
492 | 20160728,2,3,2000
493 | 20160728,3,1,4000
494 | 20160728,3,2,3000
495 | 20160728,3,3,2000
496 | 20160728,4,1,1000
497 | 20160728,4,2,1000
498 | 20160728,4,3,1000
499 | 20160728,5,1,500
500 | 20160728,5,2,500
501 | 20160728,5,3,500
502 | 20160728,6,1,5000
503 | 20160728,6,2,5000
504 | 20160728,6,3,5000
505 | 20160729,1,1,5000
506 | 20160729,1,2,20
507 | 20160729,1,3,100
508 | 20160729,2,1,2000
509 | 20160729,2,2,2000
510 | 20160729,2,3,2000
511 | 20160729,3,1,4000
512 | 20160729,3,2,3000
513 | 20160729,3,3,2000
514 | 20160729,4,1,1000
515 | 20160729,4,2,1000
516 | 20160729,4,3,1000
517 | 20160729,5,1,500
518 | 20160729,5,2,500
519 | 20160729,5,3,500
520 | 20160729,6,1,5000
521 | 20160729,6,2,5000
522 | 20160729,6,3,5000
523 | 20160730,1,1,5000
524 | 20160730,1,2,20
525 | 20160730,1,3,100
526 | 20160730,2,1,2000
527 | 20160730,2,2,2000
528 | 20160730,2,3,2000
529 | 20160730,3,1,4000
530 | 20160730,3,2,3000
531 | 20160730,3,3,2000
532 | 20160730,4,1,1000
533 | 20160730,4,2,1000
534 | 20160730,4,3,1000
535 | 20160730,5,1,500
536 | 20160730,5,2,500
537 | 20160730,5,3,500
538 | 20160730,6,1,5000
539 | 20160730,6,2,5000
540 | 20160730,6,3,5000
541 | 20160731,1,1,5000
542 | 20160731,1,2,20
543 | 20160731,1,3,100
544 | 20160731,2,1,2000
545 | 20160731,2,2,2000
546 | 20160731,2,3,2000
547 | 20160731,3,1,4000
548 | 20160731,3,2,3000
549 | 20160731,3,3,2000
550 | 20160731,4,1,1000
551 | 20160731,4,2,1000
552 | 20160731,4,3,1000
553 | 20160731,5,1,500
554 | 20160731,5,2,500
555 | 20160731,5,3,500
556 | 20160731,6,1,5000
557 | 20160731,6,2,5000
558 | 20160731,6,3,5000
559 |
--------------------------------------------------------------------------------
/inputFile/user:
--------------------------------------------------------------------------------
1 | 1
2 | 2
3 | 3
4 | 4
--------------------------------------------------------------------------------
/inputFile/wordCount:
--------------------------------------------------------------------------------
1 | package com.spark.scala
2 |
3 | import java.util.ArrayList
4 | import scala.collection.mutable.ArrayBuffer
5 | import scala.collection.JavaConversions._
6 | import scala.collection.mutable.Map
7 | import java.util.HashMap
8 | import scala.io.Source
9 | import java.io.File
10 | import scala.collection.Iterator
11 |
12 |
13 | object ScalaGramaer {
14 | var list=new ArrayList[String]
15 | def main(args: Array[String]): Unit = {
16 | //listGrammer()
17 | //mapGrammer()
18 | //tupleGrammer()
19 | fileGrammer()
20 | }
21 | /**
22 | * Scala collection operations
23 | * 1. To use Java collections, you need to import
24 | *  import scala.collection.JavaConversions._
25 | *  which converts Java collections to Scala collections under the hood.
26 | * 2. Java and Scala collections cannot be converted explicitly, but they can be converted implicitly; e.g. SparkContext.parallelize(data)
27 | * expects a Scala collection as data, yet a Java collection can be passed in.
28 | */
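    // Illustrative sketch (not part of the original file), expanding on the Scaladoc above:
    // with scala.collection.JavaConversions._ imported (see the imports at the top of this
    // file), a java.util.List can be used where a Scala collection is expected, e.g.
    //   val javaList = new ArrayList[String](); javaList.add("a"); javaList.add("b")
    //   javaList.foreach(println)              // implicit java -> scala conversion
    //   // sparkContext.parallelize(javaList)  // hypothetical SparkContext; same idea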
29 | def fileGrammer(){
30 | // var file=Source.fromFile("D:\\tmp\\input\\smy_biz_dil\\part-m-00000", "utf-8")
31 | //var file=Source.fromURL("http://www.baidu.com", "utf-8")
32 | // file.getLines.foreach { println };
33 | // traverse the directory
34 | /*walk(new File("D:\\tmp\\input\\"))
35 | list.foreach { println }*/
36 | }
37 |
38 | // walk every file under the given path
39 | def walk(file:File){
40 | if(file.isDirectory()) file.listFiles().foreach (walk) else list.add(file.getPath())
41 | }
42 | def readAllfiles(dir:File):Iterator[File]={
43 | // scan a directory and return all of its files
44 | var child=dir.listFiles().filter { _.isDirectory()}
45 | child.toIterator++child.toIterator.flatMap { readAllfiles _ }
46 | }
47 | def listGrammer(){
48 | // iterate over a collection, with or without an index
49 | var list=new ArrayList[String]();list.add("s")
50 | for(value<- list) println(value)
51 | for(i<- 0.until(list.length))println(list(i))
52 | for(i<- 0 until list.length)println(list(i))
53 |
54 | }
55 | def mapGrammer(){
56 | // mutable: the collection can be modified in place
57 | var map=Map("a"->1,"b"->2)
58 | println(map("a"))
59 | // get returns an Option
60 | println(map.get("b"))
61 | println(map.get("c"))
62 | // change the value for a key
63 | map("a")=6
64 | println(map("a"))
65 | // add a new entry
66 | map+="c"->3
67 | println(map("c"))
68 | // remove an entry
69 | map-="c"
70 | println(map.getOrElse("c", "no such key"))
71 | // if the key exists, return its value
72 | println(map.getOrElse("null", "no such key"))
73 |
74 | // iterate over a map
75 | println("iterate over a map")
76 | for((k,value)<-map){
77 | println(k+":"+value)
78 | }
79 | println("iterate over the keys of a map")
80 | for(k<-map.keySet){
81 | println(k)
82 | }
83 |
84 | }
85 | def tupleGrammer(){
86 | // the tuple type (Tuple) can have any number of elements
87 | var tuple1=(1)
88 | var tuple2=("1",2)
89 | var tuple3=("1",2,"3")
90 | var tuple4=("1",2,"3",4)
91 | println(tuple3._3)
92 |
93 | }
94 |
95 | /**
96 | * @author Administrator
97 | */
98 | class Person(n:String) {
99 | // fields must be initialized
100 | var name=n;
101 | var age=0;
102 | var address="";
103 | // this is an auxiliary constructor; a Scala auxiliary constructor must start by calling another constructor, otherwise it will not compile
104 | def this(name:String,age:Int){
105 | this(name)
106 | this.age=age
107 | }
108 | def this(name:String,age:Int,address:String){
109 | this(name,age)
110 | this.address=address
111 | }
112 | }
113 |
114 | }
115 |
116 |
--------------------------------------------------------------------------------
/inputFile/wordCount2:
--------------------------------------------------------------------------------
1 | >>>>>>>>>>>>>>>>>>..
2 | >>>>>>>>>>>>>>>>>>>
3 | >>>>>>>>>>>>>>>>>>
4 | >>>>>>>>>>>>>>>>>>>>>
5 | >>>>>>>>>>>>>>>>
--------------------------------------------------------------------------------
/lib/test-0.0.1-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinMingQiang/spark-learn/4fd2466b9d339b2ac77003bd4f7b772489e314aa/lib/test-0.0.1-SNAPSHOT.jar
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 | Spark
5 | Spark
6 | 0.0.1-SNAPSHOT
7 | ${project.artifactId}
8 | My wonderful scala app
9 | 2015
10 |
11 |
12 | My License
13 | http://....
14 | repo
15 |
16 |
17 |
18 |
19 | 1.6
20 | 1.6
21 | UTF-8
22 | 2.10.6
23 | 2.10
24 |
25 |
26 |
27 |
28 | sqlline
29 | sqlline
30 | 1.1.9
31 |
32 |
33 | jdk.tools
34 | jdk.tools
35 | 1.7
36 | system
37 | C:\Program Files\Java\jdk1.7.0_79\lib\tools.jar
38 |
39 |
40 |
42 |
43 | org.apache.httpcomponents
44 | httpasyncclient
45 | 4.0
46 |
47 |
48 |
49 | net.sf.json-lib
50 | json-lib
51 | 2.4
52 |
53 |
54 |
55 | org.elasticsearch
56 | elasticsearch
57 | 2.0.1
58 |
59 |
60 |
61 | org.apache.hbase
62 | hbase-server
63 | 1.2.0-cdh5.7.5
64 |
65 |
66 | hadoop-common
67 | org.apache.hadoop
68 |
69 |
70 | slf4j-log4j12
71 | org.slf4j
72 |
73 |
74 |
75 |
76 |
77 |
78 | org.apache.hive.hcatalog
79 | hive-hcatalog-core
80 | 1.1.0-cdh5.7.5
81 |
82 |
83 |
84 | org.apache.phoenix
85 | phoenix-spark
86 | 4.8.0-HBase-1.2
87 |
88 |
89 |
90 |
91 | com.databricks
92 | spark-csv_2.10
93 | 1.0.3
94 |
95 |
96 |
97 | org.apache.spark
98 | spark-streaming_2.10
99 | 1.6.0-cdh5.7.5
100 |
101 |
102 |
103 | org.apache.spark
104 | spark-core_2.10
105 | 1.6.0-cdh5.7.5
106 |
107 |
108 | javax.servlet
109 | org.eclipse.jetty.orbit
110 |
111 |
112 | slf4j-log4j12
113 | org.slf4j
114 |
115 |
116 |
117 |
118 |
119 | org.apache.spark
120 | spark-mllib_2.10
121 | 1.6.0-cdh5.7.5
122 |
123 |
125 |
127 |
128 |
129 | org.apache.spark
130 | spark-hive_2.10
131 | 1.6.0-cdh5.7.5
132 |
133 |
134 |
135 | org.apache.kafka
136 | connect-json
137 | 0.9.0.2.3.4.21-2
138 |
139 |
140 |
141 |
142 | org.apache.spark
143 | spark-streaming-kafka_2.10
144 | 1.6.0-cdh5.7.5
145 |
146 |
147 | slf4j-log4j12
148 | org.slf4j
149 |
150 |
151 |
152 |
153 |
154 | javax.servlet
155 | javax.servlet-api
156 | 3.1.0
157 |
158 |
159 |
160 | mysql
161 | mysql-connector-java
162 | 5.1.30
163 |
164 |
165 | org.scalatest
166 | scalatest_2.10
167 | 2.2.4
168 |
169 |
170 |
172 |
173 |
175 |
176 |
177 |
179 |
181 |
184 |
185 |
187 |
189 |
192 |
194 |
195 |
196 |
197 |
198 |
201 |
202 |
210 |
211 |
212 |
213 |
214 | src/main/scala
215 |
216 |
217 | org.apache.maven.plugins
218 | maven-resources-plugin
219 | 2.4
220 |
221 | ${project.build.outputDirectory}/resources
222 | UTF-8
223 |
224 |
225 | src/main/scala
226 |
227 |
228 |
229 |
230 |
231 | org.apache.maven.plugins
232 | maven-compiler-plugin
233 | 2.1
234 |
235 | 1.7
236 | 1.7
237 |
238 |
239 |
240 | org.apache.maven.plugins
241 | maven-jar-plugin
242 |
243 |
244 |
245 | true
246 |
247 |
248 |
249 |
250 |
251 |
252 | org.apache.maven.plugins
253 | maven-dependency-plugin
254 |
255 |
256 | copy
257 | package
258 |
259 | copy-dependencies
260 |
261 |
262 |
263 | ${project.build.directory}/lib
264 |
265 |
267 |
268 | org.apache.kafka
269 |
270 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
--------------------------------------------------------------------------------
/src/main/scala/com/fun/util/RDDOperateFunction.scala:
--------------------------------------------------------------------------------
1 |
2 |
3 | package com.fun.util
4 |
5 | import org.apache.spark.rdd.RDD
6 | import org.apache.spark.streaming.dstream.InputDStream
7 | import scala.reflect.ClassTag
8 | import org.apache.spark.streaming.mysql.DirectMysqlInputDStream
9 | import org.apache.spark.streaming.mysql.JdbcSparkStreamRDD
10 |
11 | trait RDDOperateFunction {
12 | // approach 1: an implicit def conversion to a wrapper class
13 | implicit def rddFunction[T](rdd:RDD[T])=new RDDFunctionToClassTag(rdd)
14 | class RDDFunctionToClassTag[T](rdd:RDD[T]){
15 | // define the extra methods here
16 | def printlnRDD()=rdd.foreach { println }
17 | }
18 | // approach 2: an implicit class
19 | implicit class RDDFunctionToString(rdd:RDD[String]){
20 | def rddF2(str:String)=rdd.map { x => x+" : "+str }
21 | def rddF(str:String)=rdd.map { x => x+" : "+str }
22 | }
23 | implicit class DStreamFunc[A<: InputDStream[(String,String)]](dstream:A){
24 | def printlnDStream(str:String)=dstream.foreachRDD(rdd=>rdd.collect.foreach(x=>println(str+x)))
25 | }
26 | implicit def printlnDStream2(rdd:JdbcSparkStreamRDD[(String, String)])=rdd.collect.foreach(println)
27 | }
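// Usage sketch (illustrative, not part of the original file): once this trait is mixed in,
// both styles add methods to an RDD, e.g. for some rdd: RDD[Int] and stringRdd: RDD[String]
//   rdd.printlnRDD()        // via the implicit def + RDDFunctionToClassTag wrapper
//   stringRdd.rddF("tag")   // via the implicit class, available on RDD[String] only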
--------------------------------------------------------------------------------
/src/main/scala/com/fun/util/SparkContextOperateFunction.scala:
--------------------------------------------------------------------------------
1 |
2 | package com.fun.util
3 | import org.apache.spark.SparkContext
4 | import scala.reflect.ClassTag
5 | import java.sql.Connection
6 | import java.sql.ResultSet
7 | import com.spark.jdbcrdd.JdbcMysqlRDD
8 | import org.apache.spark.streaming.StreamingContext
9 | import org.apache.spark.streaming.mysql.MysqlManager
10 | trait SparkContextOperateFunction {
11 | implicit class SparkContextFunc(sc:SparkContext){
12 | def hbaseRDD(tablename:String)=println("return hbase RDD")
13 | def mysqlRDD[T:ClassTag](createConnection: () => Connection,
14 | sql:String,numPartitions: Int,extractValues: (ResultSet) => T )
15 | =new JdbcMysqlRDD(sc, createConnection,sql,numPartitions,extractValues)
16 | }
17 | implicit class StreamingContextFunc(ssc:StreamingContext){
18 | def createDirectMysqlDStream[T:ClassTag](
19 | getConnection: () => Connection,
20 | tablename: String,
21 | idcloumn:String,
22 | fromTime: Long,
23 | sql:String,
24 | numPartitions: Int,
25 | extractValues: (ResultSet) => T)=
26 | MysqlManager.creatMysqlInputStream(ssc, getConnection, tablename,idcloumn, fromTime,sql, numPartitions, extractValues)
27 | }
28 | }
--------------------------------------------------------------------------------
/src/main/scala/com/fun/util/ZzyLmqDataOperateUtil.scala:
--------------------------------------------------------------------------------
1 | package com.fun.util
2 |
3 | import java.sql.DriverManager
4 | import java.sql.ResultSet
5 |
6 | trait ZzyLmqDataOperateUtil {
7 | def createConnection() = {
8 | Class.forName("com.mysql.jdbc.Driver")
9 | DriverManager.getConnection("jdbc:mysql://192.168.10.159:3306/test", "root", "zhiziyun0628")
10 | }
11 | def extractValues(r: ResultSet) = {
12 | (r.getString(1), r.getString(2),r.getString(3))
13 | }
14 | def sscextractValues(r: ResultSet) = {
15 | (r.getString(1), r.getString(2))
16 | }
17 | def getConnection() = {
18 | Class.forName("com.mysql.jdbc.Driver")
19 | DriverManager.getConnection("jdbc:mysql://192.168.10.159:3306/test", "root", "zhiziyun0628")
20 | }
21 | }
--------------------------------------------------------------------------------
/src/main/scala/com/fun/util/package.scala:
--------------------------------------------------------------------------------
1 | package com.fun
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.rdd.RDD
5 | import com.fun.util.RDDOperateFunction
6 | import com.fun.util.SparkContextOperateFunction
7 | import com.fun.util.SparkContextOperateFunction
8 | package object util extends RDDOperateFunction
9 | with SparkContextOperateFunction
10 | with ZzyLmqDataOperateUtil{
11 | // these can be obtained by extending the traits above, or written directly here
12 | implicit class SparkContextNewFunction(sparkContext: SparkContext) {
13 | def lmq(name: String) = ""
14 | }
15 | // using an implicit parameter
16 | implicit class RDDNewFunction[T](rdd: RDD[T]) {
17 | def lmq3(str: String)(implicit impl:Array[T])=rdd.map { x => x + " : "+impl(0) }
18 | def lmq4[A](str: String)(implicit impl:Array[A])=rdd.map { x => x + " : "+impl(0) }
19 | }
20 |
21 | }
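// Usage sketch (illustrative, not part of the original file): lmq3 takes its Array[T] from
// an implicit value in scope, e.g. for a hypothetical rdd: RDD[String]
//   implicit val tags: Array[String] = Array("x", "y")
//   rdd.lmq3("note")   // each element becomes "<element> : x", i.e. suffixed with tags(0)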
--------------------------------------------------------------------------------
/src/main/scala/com/spark/es/SparkLocalESTest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.es
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkConf
5 | import org.elasticsearch.common.xcontent.XContentFactory
6 | import scala.collection.JavaConverters._
7 | object SparkLocalESTest {
8 | var sc: SparkContext = null
9 | val zookeeper=""
10 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
11 | def main(args: Array[String]): Unit = {
12 | init
13 | sc.parallelize(1 to 100).map { x=>
14 | val b=XContentFactory.jsonBuilder()
15 | .startObject()
16 | .field("firstName", x)
17 | .field("map", Map("age"->1,"age2"->2).asJava)
18 | .endObject()
19 | (x/10,b)
20 | }.foreach { case((d,b))=>println(d) }
21 |
22 | }
23 |
24 | def init {
25 | val sparkConf = new SparkConf()
26 | .setMaster("local")
27 | .setAppName("Test")
28 | sc = new SparkContext(sparkConf)
29 | }
30 |
31 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/es/Test.scala:
--------------------------------------------------------------------------------
1 | package com.spark.es
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.slf4j.LoggerFactory
6 | import com.mysql.jdbc.Connection
7 | import org.apache.spark.streaming.kafka.KafkaUtils
8 | import org.elasticsearch.common.settings.Settings
9 | import org.elasticsearch.client.transport.TransportClient
10 | import org.elasticsearch.common.transport.TransportAddress
11 | import org.elasticsearch.common.transport.LocalTransportAddress
12 | import org.elasticsearch.common.transport.InetSocketTransportAddress
13 | import java.net.InetAddress
14 | import scala.collection.JavaConversions._
15 | import scala.collection.JavaConverters._
16 | import com.google.gson.GsonBuilder
17 | import net.sf.json.JSONObject
18 | import org.elasticsearch.common.xcontent.XContentFactory
19 | import scala.collection.JavaConverters._
20 | object Test {
21 | var sc: SparkContext = null
22 | def main(args: Array[String]): Unit = {
23 | /*val client = getESClient
24 | println(client.listedNodes())
25 | val bulk = client.prepareBulk()
26 | */
27 | val client=getESClient
28 | queryES(client)
29 | /*val builder = XContentFactory.jsonBuilder()
30 | .startObject()
31 | .field("firstName", "Avivi")
32 | .field("map", Map("age"->1,"age2"->2).asJava)
33 | .endObject()
34 |
35 | val request = client.prepareIndex("test", "testType")
36 | .setSource(builder)
37 | bulk.add(request)
38 | val response = bulk.get
39 | response.getItems.foreach { x => println(!x.isFailed()) }*/
40 | }
41 | def queryES(client: TransportClient){
42 | val d= client.prepareGet("sdr_urlinfo_test","urlinfo","http%3A%2F%2Fbaojian.zx58.cn%2Fproduct%2F9348%2F")
43 | .setFetchSource("frequency", "").get
44 | println(d.getField("frequency"))
45 | }
46 | def getESClient() = {
47 | val endpoints = Array("192.168.10.115", "192.168.10.110", "192.168.10.81")
48 | .map(_.split(':')).map {
49 | case Array(host, port) => SocketEndpoint(host, port.toInt)
50 | case Array(host) => SocketEndpoint(host, 9300)
51 | }
52 | val settings = Map("cluster.name" -> "zhiziyun")
53 | val esSettings = Settings.settingsBuilder().put(settings.asJava).build()
54 | val client = TransportClient.builder().settings(esSettings).build()
55 | val addresses = endpoints.map(endpointToTransportAddress)
56 | client.addTransportAddresses(addresses: _*)
57 | client
58 | }
59 |
60 | def endpointToTransportAddress(endpoint: Endpoint): TransportAddress = endpoint match {
61 | case LocalEndpoint(id) => new LocalTransportAddress(id)
62 | case SocketEndpoint(address, port) => new InetSocketTransportAddress(InetAddress.getByName(address), port)
63 | }
64 |
65 | def init {
66 | val sparkConf = new SparkConf()
67 | .setAppName("Test")
68 | sc = new SparkContext(sparkConf)
69 | }
70 | }
71 | case class SocketEndpoint(address: String, port: Int) extends Endpoint
72 | case class LocalEndpoint(id: String) extends Endpoint
73 | sealed abstract class Endpoint
--------------------------------------------------------------------------------
/src/main/scala/com/spark/hbase/GetOutSiteSuNingPCToNewTable.scala:
--------------------------------------------------------------------------------
1 | package com.spark.hbase
2 | import org.apache.hadoop.hbase.client.Scan
3 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
4 | import org.apache.hadoop.hbase.client.Result
5 | import org.apache.spark.rdd.RDD
6 | import org.apache.hadoop.mapreduce.Job
7 | import org.apache.hadoop.conf.Configuration
8 | import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil
9 | import org.apache.hadoop.hbase.mapreduce.IdentityTableMapper
10 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
11 | import org.apache.spark.SparkContext
12 | import scala.reflect.ClassTag
13 | import org.apache.spark.SparkConf
14 | import org.apache.hadoop.hbase.HBaseConfiguration
15 | import org.apache.hadoop.fs.Path
16 | import java.util.ArrayList
17 | import org.apache.hadoop.hbase.filter.RowFilter
18 | import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
19 | import org.apache.hadoop.hbase.filter.RegexStringComparator
20 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
21 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil
22 | import org.apache.hadoop.hbase.util.Base64
23 | import java.util.HashMap
24 | import org.apache.hadoop.hbase.util.Bytes
25 | import scala.collection.JavaConversions._
26 | import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
27 | import org.apache.hadoop.hbase.filter.CompareFilter
28 | import org.apache.hadoop.hbase.filter.FilterList
29 | import org.apache.hadoop.hbase.filter.Filter
30 | import org.apache.hadoop.hbase.client.Put
31 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
32 | object GetOutSiteSuNingPCToNewTable {
33 | var sc: SparkContext = null
34 | var conf: Configuration = null
35 | var zookeeper = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3"
36 | def main(args: Array[String]): Unit = {
37 | val tableName="outsitepctag"
38 | val sparkConf = new SparkConf()
39 | .setMaster("local")
40 | .setAppName("GetOutSiteSuNingPCToNewTable")
41 | sc = new SparkContext(sparkConf)
42 | conf = HBaseConfiguration.create()
43 | conf.set("hbase.zookeeper.quorum", zookeeper)
44 | conf.set("hbase.zookeeper.property.clientPort", "2181")
45 |
46 | var scan = new Scan
47 | val scvf = new SingleColumnValueFilter(
48 | Bytes.toBytes("info"),
49 | Bytes.toBytes("source"),
50 | CompareOp.EQUAL,
51 | Bytes.toBytes("baidupclog"));
52 | scvf.setFilterIfMissing(false);
53 | scan.setFilter(scvf)
54 |
55 | var a = hbaseRDD2[(String, HashMap[String, String])](
56 | tableName,
57 | scan,
58 | (r: (ImmutableBytesWritable, Result)) => {
59 | var rowMap = new HashMap[String, String]()
60 | var listCells = r._2.listCells()
61 | val rowkey = Bytes.toString(r._2.getRow)
62 | for (cell <- listCells) {
63 | var column = new String(cell.getQualifierArray, cell.getQualifierOffset, cell.getQualifierLength)
64 | rowMap.put(column, new String(cell.getValueArray, cell.getValueOffset, cell.getValueLength))
65 | }
66 | (rowkey, rowMap)
67 | })
68 | println("##### partition num ##### "+a.partitions.size)
69 | a.foreach(println)
70 | /* conf.set(TableOutputFormat.OUTPUT_TABLE, "suningpctag")
71 | val job = new Job(conf)
72 | job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
73 | println("########## data is about to be written to hbase suningpctag ########")
74 | a.map{x =>
75 | val p = new Put(Bytes.toBytes(x._1))
76 | for((key,value)<-x._2){
77 | p.addColumn("info".getBytes, key.getBytes, value.getBytes)
78 | }
79 | (new ImmutableBytesWritable, p)
80 | }
81 | .saveAsNewAPIHadoopDataset(job.getConfiguration)
82 | sc.stop()*/
83 | println("########## finished ########")
84 | }
85 |
86 |
87 | def hbaseRDD2[U: ClassTag](tableName: String, scan: Scan, f: ((ImmutableBytesWritable, Result)) => U): RDD[U] = {
88 | var proto = ProtobufUtil.toScan(scan);
89 | conf.set(TableInputFormat.INPUT_TABLE, tableName)
90 | conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray()))
91 | var job: Job = new Job(conf)
92 | sc.newAPIHadoopRDD(job.getConfiguration(),
93 | classOf[TableInputFormat],
94 | classOf[ImmutableBytesWritable],
95 | classOf[Result]).map(f)
96 | }
97 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/hbase/PutDataToHbase.scala:
--------------------------------------------------------------------------------
1 | package com.spark.hbase
2 |
3 | import org.apache.hadoop.conf.Configuration
4 | import org.apache.hadoop.hbase.HBaseConfiguration
5 | import org.apache.hadoop.hbase.client.ConnectionFactory
6 | import org.apache.hadoop.hbase.TableName
7 | import org.apache.hadoop.hbase.client.Put
8 | import org.apache.hadoop.hbase.util.Bytes
9 | import org.apache.hadoop.hbase.client.Table
10 |
11 | object PutDataToHbase {
12 | def main(args: Array[String]): Unit = {
13 | var hconf = HBaseConfiguration.create();
14 | hconf.set("hbase.zookeeper.quorum", "virtual-2,virtual-3,virtual-4");
15 | hconf.set("hbase.zookeeper.property.clientPort", "2181");
16 | var hconnection = ConnectionFactory.createConnection(hconf)
17 | var table = hconnection.getTable(TableName.valueOf("rt_mobilertbreport_bycreative"))
18 | putData(table,"WWTEY3i9OEh,hEmlg0eYmSk,2016-08-23")
19 | putData(table,"WWTEY3i9OEh,d2wns0wqJna,2016-08-23")
20 | putData(table,"0zoTLi29XRgq,istRh0Z1G4o,2016-08-23")
21 | putData(table,"WWTEY3i9OEh,hs8Xi0hvIbe,2016-08-23")
22 |
23 | println(">>>>>>>>>>")
24 | }
25 | def putData(table: Table, rowkey: String) {
26 | val p = new Put(Bytes.toBytes(rowkey))
27 | p.addColumn("info".getBytes, "additionalcpmcost".getBytes, "100".getBytes)
28 | p.addColumn("info".getBytes, "fee".getBytes, "100".getBytes)
29 | p.addColumn("info".getBytes, "deliveryUV".getBytes, "100".getBytes)
30 | p.addColumn("info".getBytes, "delivery".getBytes, "100".getBytes)
31 | p.addColumn("info".getBytes, "cpmcost".getBytes, "100".getBytes)
32 | p.addColumn("info".getBytes, "clicks".getBytes, "100".getBytes)
33 | p.addColumn("info".getBytes, "clickUV".getBytes, "100".getBytes)
34 | table.put(p)
35 | }
36 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/hbase/SparkGetHbaseToRdd.scala:
--------------------------------------------------------------------------------
1 | package com.spark.hbase
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkConf
5 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
6 | import org.apache.hadoop.hbase.client.Put
7 | import org.apache.hadoop.hbase.util.Bytes
8 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
9 | import org.apache.hadoop.mapreduce.Job
10 | import org.apache.hadoop.hbase.util.MD5Hash
11 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
12 |
13 | object SparkReadMoreFiles {
14 | var sc: SparkContext = null
15 | def main(args: Array[String]): Unit = {
16 | init
17 | //HCatOutputFormat
18 | var conf = sc.hadoopConfiguration
19 | conf.set(TableOutputFormat.OUTPUT_TABLE, "test")
20 | sc.hadoopConfiguration.set("hbase.zookeeper.quorum", "Virtual-1,Virtual-2,Virtual-3")
21 | sc.hadoopConfiguration.set("zookeeper.znode.parent", "/hbase")
22 | var job = new Job(conf)
23 | job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
24 | conf = job.getConfiguration
25 | for (i <- 1 to 100) {
26 | println(i)
27 | var a = sc.parallelize(i*100000 to (i+1)*(100000))
28 | var b = a.map { x =>
29 | println(x)
30 | var p = new Put(Bytes.toBytes(MD5Hash.getMD5AsHex(Bytes.toBytes(x))))
31 | p.addColumn("info".getBytes, "test".getBytes, Bytes.toBytes(x))
32 | (new ImmutableBytesWritable, p)
33 | }
34 | .saveAsNewAPIHadoopDataset(conf)
35 | }
36 | }
37 | def init {
38 | val sparkConf = new SparkConf()
39 | .setMaster("local")
40 | .setAppName("Test")
41 | sc = new SparkContext(sparkConf)
42 | }
43 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/hbase/SparkScanHbaseToRdd.scala:
--------------------------------------------------------------------------------
1 | package com.spark.hbase
2 |
3 | import org.apache.hadoop.hbase.client.Scan
4 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
5 | import org.apache.hadoop.hbase.client.Result
6 | import org.apache.spark.rdd.RDD
7 | import org.apache.hadoop.mapreduce.Job
8 | import org.apache.hadoop.conf.Configuration
9 | import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil
10 | import org.apache.hadoop.hbase.mapreduce.IdentityTableMapper
11 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
12 | import org.apache.spark.SparkContext
13 | import scala.reflect.ClassTag
14 | import org.apache.spark.SparkConf
15 | import org.apache.hadoop.hbase.HBaseConfiguration
16 | import org.apache.hadoop.fs.Path
17 | import java.util.ArrayList
18 | import org.apache.hadoop.hbase.filter.RowFilter
19 | import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
20 | import org.apache.hadoop.hbase.filter.RegexStringComparator
21 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
22 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil
23 | import org.apache.hadoop.hbase.util.Base64
24 | import java.util.HashMap
25 | import org.apache.hadoop.hbase.util.Bytes
26 | import scala.collection.JavaConversions._
27 | object SparkScanHbaseToRdd {
28 | var sc: SparkContext = null
29 | var conf: Configuration = null
30 | def main(args: Array[String]): Unit = {
31 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
32 | var tableName = "rt_rtbreport"
33 | var zookeeper = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3"
34 | var scans = new Scan
35 | var filter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator(".*2016-10-20"))
36 | scans.setFilter(filter)
37 | val sparkConf = new SparkConf()
38 | .setMaster("local")
39 | .setAppName("HBaseDistributedScanExample")
40 | sc = new SparkContext(sparkConf)
41 | conf = HBaseConfiguration.create()
42 | conf.set("hbase.zookeeper.quorum", zookeeper)
43 | conf.set("hbase.zookeeper.property.clientPort", "2181")
44 | //conf.addResource(new Path("conf/core-site.xml"))
45 | //conf.addResource(new Path("conf/hbase-site.xml"))
46 | //conf.addResource(new Path("conf/hdfs-site.xml"))
47 |
48 |
49 | var a = hbaseRDD2[(String, HashMap[String, String])](
50 | tableName,
51 | scans,
52 | (r: (ImmutableBytesWritable, Result)) => {
53 | var rowMap = new HashMap[String, String]()
54 | var listCells = r._2.listCells()
55 | val rowkey = Bytes.toString(r._2.getRow)
56 | for (cell <- listCells) {
57 | var column = new String(cell.getQualifierArray, cell.getQualifierOffset, cell.getQualifierLength)
58 | rowMap.put(column, new String(cell.getValueArray, cell.getValueOffset, cell.getValueLength))
59 | }
60 | (rowkey, rowMap)
61 | })
62 | println(a.partitions.size)
63 | a.foreach(println)
64 |
65 | }
66 | def hbaseRDD[U: ClassTag](tableName: String, scan: Scan, f: ((ImmutableBytesWritable, Result)) => U): RDD[U] = {
67 |
68 | var job: Job = new Job(conf)
69 | TableMapReduceUtil.initCredentials(job)
70 | TableMapReduceUtil.initTableMapperJob(tableName, scan, classOf[IdentityTableMapper], null, null, job)
71 | sc.newAPIHadoopRDD(job.getConfiguration(),
72 | classOf[TableInputFormat],
73 | classOf[ImmutableBytesWritable],
74 | classOf[Result]).map(f)
75 | }
76 | def hbaseRDD2[U: ClassTag](tableName: String, scan: Scan, f: ((ImmutableBytesWritable, Result)) => U): RDD[U] = {
77 | var proto = ProtobufUtil.toScan(scan);
78 | conf.set(TableInputFormat.INPUT_TABLE, tableName)
79 | conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray()))
80 | var job: Job = new Job(conf)
81 | sc.newAPIHadoopRDD(job.getConfiguration(),
82 | classOf[TableInputFormat],
83 | classOf[ImmutableBytesWritable],
84 | classOf[Result]).map(f)
85 | }
86 | }
--------------------------------------------------------------------------------
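The first helper, hbaseRDD, is not exercised by main; a minimal usage sketch (reusing the sc, conf, tableName and scans built in main above) that only extracts the row keys and counts them.

    // Same scan as main, but through hbaseRDD (TableMapReduceUtil-based) instead of hbaseRDD2.
    val rowKeys = hbaseRDD[String](tableName, scans,
      (r: (ImmutableBytesWritable, Result)) => Bytes.toString(r._2.getRow))
    println("rows matching the filter: " + rowKeys.count())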
/src/main/scala/com/spark/hbase/hbasetest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.hbase
2 |
3 | import org.apache.hadoop.hbase.client.HBaseAdmin
4 | import org.apache.hadoop.hbase.client.HTable
5 | import org.apache.hadoop.hbase.client.Put
6 | import org.apache.hadoop.hbase.client.Get
7 | import org.apache.hadoop.hbase.util.Bytes
8 | import util.Properties
9 | import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, HBaseConfiguration}
10 | import org.apache.hadoop.hbase.client.ConnectionFactory
11 |
12 | object hbasetest {
13 |
14 | var zookeeper = "192.168.0.245,192.168.0.246,192.168.0.247"
15 | var conf = HBaseConfiguration.create()
16 | conf.set("hbase.zookeeper.quorum", zookeeper)
17 | conf.set("hbase.zookeeper.property.clientPort", "2181")
18 | conf.set("zookeeper.znode.parent","/hbase")
19 | val admin=ConnectionFactory.createConnection(conf).getAdmin
20 |
21 |
22 | def main(args: Array[String]) {
23 | val tablename="table001"
24 | val tablenames=Array("table001","table002")
25 | val rowkey="rowkey001"
26 | val columnnames=Array("columnname001","cn002")
27 | val columndatas=Array("columndata001","data001")
28 | createHbaseTable(tablenames)
29 | println(">>>>>>>>>>>>>")
30 | putHbaseData(tablename, rowkey, columnnames, columndatas)
31 | println("1>>>>>>>>>>>>>")
32 |     getHbaseData(tablename, rowkey)
33 |     admin.close()
34 |   }
35 | // list the tables
36 | //val listtables=admin.listTables()
37 | //listtables.foreach(println)
38 |
39 | def createHbaseTable(tablenames: Array[String]) {
40 | for(tablename<-tablenames){
41 |       // if (!admin.tableExists(TableName.valueOf(tablename))) {
42 | val tableDesc = new HTableDescriptor(Bytes.toBytes(tablename))
43 | val idsColumnFamilyDesc = new HColumnDescriptor(Bytes.toBytes("info"))
44 | tableDesc.addFamily(idsColumnFamilyDesc)
45 | admin.createTable(tableDesc)
46 | // }
47 | }
48 | }
49 |
50 | def putHbaseData(tablename: String,rowkey:String,columnnames:Array[String],columndatas:Array[String]) {
51 | val table = new HTable(conf, tablename)
52 | val theput= new Put(Bytes.toBytes(rowkey))
53 |     for (a <- 0 until columnnames.length) {
54 | theput.addColumn(Bytes.toBytes("info"),Bytes.toBytes(columnnames(a)),Bytes.toBytes(columndatas(a)))
55 | table.put(theput)
56 | }
57 | }
58 |
59 | // let's insert some data in 'mytable' and get the row
60 | def getHbaseData(tablenames: String,rowkey: String):String={
61 | val table = new HTable(conf, tablenames)
62 | val theget= new Get(Bytes.toBytes(rowkey))
63 | val result=table.get(theget)
64 |     val value = Bytes.toString(result.value())
65 | // println(Bytes.toString(value))
66 | value
67 | }
68 |   // Note: admin is closed at the end of main; closing it here in the object body would run
69 |   //       during object initialization, before main uses it, and conf.clear() would wipe the settings.
70 |
71 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/hive/CaseClass.scala:
--------------------------------------------------------------------------------
1 | package com.spark.hive
2 |
3 | object CaseClass {
4 | case class User2(name:Int,age:Int,sex:Int)
5 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/hive/HiveContextTest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.hive
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.sql.hive.HiveContext
6 | import org.apache.spark.sql.DataFrame
7 |
8 | object HiveContextTest {
9 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
10 | case class User2(name:Int,age:Int,sex:Int)
11 | var hiveconf = new SparkConf().setAppName("sparkhivetest").setMaster("local")
12 | setHiveConf
13 | val sc = new SparkContext(hiveconf)
14 | val sqlContext = new HiveContext(sc)
15 | def main(args: Array[String]): Unit = {
16 | sqlContext.sql("select * from smartadsclicklog where statdate='20170414' limit 50").show
17 |
18 |
19 |
20 | /* var rdd=sc.parallelize(Array(Map("name"->1,"age"->2,"sex"->3))).map{x=>User2(name=x("name"),age=x("age"),sex=x("sex"))}
21 | sqlContext.createDataFrame(rdd).registerTempTable("user2")
22 | sqlContext.sql("show tables").show
23 | sc.stop()*/
24 | }
25 | def setHiveConf() {
26 |     // With the settings below, hive-site.xml and hdfs-site.xml are not needed.
27 |     // The values come from /etc/hive/conf/hive-site.xml.
28 |     // Shipping the config files is the safest option; the settings below do not always work.
29 | System.setProperty("hive.metastore.uris", "thrift://mongodb3:9083")
30 | System.setProperty("hive.metastore.warehouse.dir", "/user/hive/warehouse")
31 | System.setProperty("hive.zookeeper.quorum", "mongodb3,solr2.zhiziyun.com,solr1.zhiziyun.com")
32 | System.setProperty("hive.zookeeper.client.port", "2181")
33 |
34 | System.setProperty("dfs.nameservices", "nameservice-zzy")
35 | System.setProperty("dfs.client.failover.proxy.provider.nameservice-zzy", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider")
36 | System.setProperty("dfs.ha.automatic-failover.enabled.nameservice-zzy", "true")
37 | System.setProperty("ha.zookeeper.quorum", "mongodb3:2181,solr1.zhiziyun.com:2181,solr2.zhiziyun.com:2181")
38 | System.setProperty("dfs.ha.namenodes.nameservice-zzy", "namenode47,namenode237")
39 | System.setProperty("dfs.namenode.rpc-address.nameservice-zzy.namenode47", "mongodb3:8020")
40 | System.setProperty("dfs.namenode.servicerpc-address.nameservice-zzy.namenode47", "mongodb3:8022")
41 | System.setProperty("dfs.namenode.http-address.nameservice-zzy.namenode47", "mongodb3:50070")
42 | System.setProperty("dfs.namenode.https-address.nameservice-zzy.namenode47", "mongodb3:50470")
43 | System.setProperty("dfs.namenode.rpc-address.nameservice-zzy.namenode237", "solr2.zhiziyun.com:8020")
44 | System.setProperty("dfs.namenode.servicerpc-address.nameservice-zzy.namenode237", "solr2.zhiziyun.com:8022")
45 | System.setProperty("dfs.namenode.http-address.nameservice-zzy.namenode237", "solr2.zhiziyun.com:50070")
46 | System.setProperty("dfs.namenode.https-address.nameservice-zzy.namenode237", "solr2.zhiziyun.com:50470")
47 | System.setProperty("dfs.namenode.http-address.nameservice-zzy.namenode47", "mongodb3:50070")
48 | System.setProperty("dfs.client.use.datanode.hostname", "false")
49 |
50 | System.setProperty("fs.permissions.umask-mode", "022")
51 | System.setProperty("dfs.namenode.acls.enabled", "false")
52 | System.setProperty("dfs.client.read.shortcircuit", "false")
53 | System.setProperty("dfs.namenode.acls.enabled", "false")
54 | System.setProperty("dfs.domain.socket.path", "/var/run/hdfs-sockets/dn")
55 | System.setProperty("dfs.client.read.shortcircuit.skip.checksum", "false")
56 | System.setProperty("dfs.client.domain.socket.data.traffic", "false")
57 | System.setProperty("dfs.datanode.hdfs-blocks-metadata.enabled", "true")
58 |
59 |
60 | System.setProperty("hive.metastore.client.socket.timeout", "300")
61 | System.setProperty("hive.warehouse.subdir.inherit.perms", "true")
62 | System.setProperty("hive.enable.spark.execution.engine", "false")
63 | System.setProperty("hive.cluster.delegation.token.store.class", "org.apache.hadoop.hive.thrift.MemoryTokenStore")
64 | System.setProperty("hive.server2.enable.doAs", "true")
65 | System.setProperty("hive.metastore.execute.setugi", "true")
66 | System.setProperty("hive.support.concurrency", "true")
67 | System.setProperty("hive.zookeeper.namespace", "hive_zookeeper_namespace_hive")
68 | System.setProperty("hive.server2.use.SSL", "false")
69 | System.setProperty("hive.conf.restricted.list", "hive.enable.spark.execution.engine")
70 | }
71 | }
--------------------------------------------------------------------------------
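As the comments in setHiveConf say, shipping the config files is the most reliable option; a minimal sketch of that variant, assuming hive-site.xml and hdfs-site.xml are on the classpath so no System.setProperty calls are needed.

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.hive.HiveContext

    // HiveContext picks up hive-site.xml and hdfs-site.xml from the classpath.
    val sc = new SparkContext(new SparkConf().setAppName("sparkhivetest").setMaster("local"))
    val hive = new HiveContext(sc)
    hive.sql("show databases").show()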
/src/main/scala/com/spark/hive/SparkPhoenixLoadAndSaveTest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.run
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.sql.SQLContext
6 | import org.apache.spark.sql.SaveMode
7 | import org.apache.phoenix.spark._
8 |
9 | object SparkPhoenixLoadAndSaveTest {
10 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
11 | var sparkconf: SparkConf = null
12 | var sc: SparkContext = null
13 | var sqlContext: SQLContext = null
14 | def main(args: Array[String]): Unit = {
15 | sc = new SparkContext(new SparkConf().setMaster("local").setAppName("Test Phoenix"))
16 | sqlContext = new SQLContext(sc)
17 | //loadPhoenixDF
18 | loadPhoenixDF
19 | //saveDFToPhoeni
20 | }
21 | def loadPhoenixDF() {
22 |     // load the whole table
23 | var phoenixDF = sqlContext.load("org.apache.phoenix.spark",
24 | Map("table" -> "US_POPULATION", "zkUrl" -> "192.168.10.191:2181"))
25 |
26 | phoenixDF.show
27 | //phoenixDF.select("CITY").show
28 |
29 | /*phoenixDF.filter(phoenixDF("COL1") === "test_row_1" && phoenixDF("ID") === 1L)
30 | .select(phoenixDF("ID"))
31 | .show*/
32 |
33 |     // load only the specified columns
34 | var phoenixDF2 = sqlContext.phoenixTableAsDataFrame("US_POPULATION",
35 | Seq("CITY", "POPULATION"),
36 | zkUrl = Some("192.168.10.191:2181"))
37 | phoenixDF2.foreach { x => println(x) }
38 |
39 | /* phoenixDF2.registerTempTable("tablename")
40 | phoenixDF2.map { x => x} */
41 |
42 | }
43 | def saveDFToPhoeni() {
44 |     // save an RDD to Phoenix
45 | val dataSet = List(("CB", "A", 11), ("CC", "B", 22), ("CD", "C", 33))
46 | sc.parallelize(dataSet)
47 | .saveToPhoenix("US_POPULATION",
48 | Seq("STATE", "CITY", "POPULATION"),
49 | zkUrl = Some("192.168.10.191"))
50 |     // save a DataFrame to Phoenix
51 | /*var phoenixDF=sqlContext.load("org.apache.phoenix.spark",
52 | Map("table" -> "TABLE1", "zkUrl" -> "phoenix-server:2181"))
53 | phoenixDF.save("org.apache.phoenix.spark",
54 | SaveMode.Overwrite, Map("table" -> "OUTPUT_TABLE",
55 | "zkUrl" -> "phoenix-server:2181"))
56 | */
57 |
58 | }
59 |
60 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/hive/SparkRddToHive.scala:
--------------------------------------------------------------------------------
1 | package com.spark.hive
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkConf
5 | import org.apache.hive.hcatalog.data.DefaultHCatRecord
6 | import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat
7 | import org.apache.hive.hcatalog.data.HCatRecord
8 | import org.apache.hive.hcatalog.common.HCatUtil
9 | import org.apache.hive.hcatalog.data.schema.HCatSchema
10 | import org.apache.hadoop.mapreduce.Job
11 | import org.apache.hive.hcatalog.mapreduce.OutputJobInfo
12 | import org.apache.hive.hcatalog.mapreduce.HCatBaseOutputFormat._
13 | import org.apache.hive.hcatalog.mapreduce.HCatBaseOutputFormat
14 | import org.apache.hadoop.io.WritableComparable
15 | import org.apache.spark.SerializableWritable
16 | import org.apache.hadoop.io.NullWritable
17 | import org.apache.hadoop.conf.Configuration
18 | import org.apache.spark.rdd.RDD
19 | import scala.reflect.ClassTag
20 | import org.apache.hive.hcatalog.mapreduce.HCatRecordReader
21 | import org.apache.hadoop.mapreduce.JobContext
22 |
23 | object SparkRddToHive {
24 | var sc: SparkContext = null
25 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
26 | def main(args: Array[String]): Unit = {
27 | init
28 | var outputJob: Job = Job.getInstance
29 | var schema = getHCatSchema("test", "test",outputJob)
30 | var a = sc.parallelize(Array((1,1)))
31 | useHCatOutputFormatToHive(outputJob,schema,a)
32 | println(">>>>>>...")
33 | //a.saveAsNewAPIHadoopDataset(new Configuration)
34 | }
35 | def getHCatSchema(dbName: String, tableName: String,outputJob: Job) = {
36 |     // fetch the table schema
37 | var schema: HCatSchema = null
38 | //var outputJob: Job = Job.getInstance
39 | outputJob.setJobName("getHCatSchema");
40 | HCatOutputFormat.setOutput(outputJob, OutputJobInfo.create(dbName, tableName, null));
41 | schema = HCatBaseOutputFormat.getTableSchema(outputJob.getConfiguration());
42 | HCatOutputFormat.setSchema(outputJob, schema)
43 | schema
44 | }
45 | def useHCatOutputFormatToHive[T:ClassTag](job:Job,recordSchema: HCatSchema,rdd:RDD[T]) {
46 | var a = sc.parallelize(Array(("test", 1), ("test2", 2), ("test3", 3), ("test4", 4)),2)
47 | job.setOutputFormatClass(classOf[HCatOutputFormat])
48 | job.setOutputKeyClass(classOf[NullWritable]);
49 | job.setOutputValueClass(classOf[DefaultHCatRecord]);
50 | var jobconf = job.getConfiguration
51 |
52 | var c = a.map { x =>
53 | var record = new DefaultHCatRecord(recordSchema.size());
54 | record.setString("name", recordSchema, x._1)
55 | record.setString("age", recordSchema, x._2.toString)
56 | (NullWritable.get(), record)
57 | }
58 | c.saveAsNewAPIHadoopDataset(jobconf)
59 |
60 | }
61 | def init {
62 | val sparkConf = new SparkConf()
63 | .setMaster("local")
64 | .setAppName("Test")
65 | sc = new SparkContext(sparkConf)
66 | }
67 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/hive/SparkToHive.scala:
--------------------------------------------------------------------------------
1 | package com.spark.hive
2 |
3 | import org.apache.spark.sql.hive.HiveContext
4 | import org.apache.spark.sql.SQLContext
5 | import org.apache.spark.SparkContext
6 | import org.apache.spark.SparkContext._
7 | import org.apache.spark.SparkConf
8 | import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat
9 | import org.apache.hive.hcatalog.mapreduce.HCatBaseOutputFormat._
10 | import org.apache.hive.hcatalog.mapreduce.OutputJobInfo
11 | import org.apache.hive.hcatalog.data.schema.HCatSchema
12 | import org.apache.hive.hcatalog.data.DefaultHCatRecord
13 | import org.apache.hive.hcatalog.mapreduce.HCatBaseOutputFormat
14 | import org.apache.hadoop.mapreduce.Job
15 | import org.apache.hadoop.security.UserGroupInformation.HadoopConfiguration
16 | import org.apache.hadoop.io.NullWritable
17 | import scala.collection.mutable.ArrayBuffer
18 | import java.util.HashMap
19 | import org.apache.spark.sql.Row
20 | import org.apache.spark.sql.types.StructField
21 | import org.apache.spark.sql.types.StringType
22 | import org.apache.spark.sql.types.StructType
23 | import org.apache.spark.sql.types.IntegerType
24 | import org.apache.spark.sql.functions._
25 | import com.spark.hive.CaseClass._
26 | import org.apache.spark.sql.types._
27 | import org.apache.spark.sql.api.java.UDF1
28 | object SparkToHive {
29 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
30 | var hiveconf = new SparkConf().setAppName("sparkhivetest").setMaster("local")
31 | setHiveConf
32 | val sc = new SparkContext(hiveconf)
33 | val sqlContext = new HiveContext(sc)
34 | import sqlContext.implicits._
35 |
36 | def main(args: Array[String]): Unit = {
37 | //useHCatOutputFormatToHive
38 | //secondRDDToFrame
39 | //insertintoHive
40 | //readHiveData
41 | //creatTable
42 | insertintoHive
43 | }
44 | /**
45 |    * Create a table
46 | */
47 | def creatTable(){
48 | sqlContext.sql("use test1")
49 | sqlContext.sql("create table test_creat(id int,order_id int,product_id int) row format delimited fields terminated by ','STORED AS TEXTFILE")
50 | }
51 | /**
52 |    * Read data from Hive
53 | */
54 | def readHiveData() {
55 | sqlContext.sql("use default")
56 | sqlContext.sql("select count(*) from siteorderlog").show
57 | sc.stop()
58 | }
59 | /**
60 |    * Write data into Hive
61 | */
62 | def insertintoHive(){
63 | var rdd=sc.parallelize(Array(Map("name"->3,"age"->4,"sex"->5)))
64 | .map{x=>User2(name=x("name"),age=x("age"),sex=x("sex"))}
65 |     // option 1
66 | //import sqlContext.implicits._
67 | //rdd.toDF().registerTempTable("user2")
68 |     // option 2
69 | //sqlContext.createDataFrame(rdd).select(count("name")).show
70 | sqlContext.createDataFrame(rdd).registerTempTable("user2")
71 | //sqlContext.sql("select * from user2").show
72 |
73 | sqlContext.sql("insert into table test1.test_creat "+
74 | "select name,age,sex from user2")
75 |
76 | }
77 | /**
78 |    * Custom UDF
79 | */
80 | def testUDFFunction(){
81 | val makeDT=(name: Int, time: Int, tz: Int) => s"$name : $time : $tz"
82 | sqlContext.udf.register("strtoger",makeDT)
83 | var rdd=sc.parallelize(Array(Map("name"->3,"age"->4,"sex"->5)))
84 | .map{x=>User2(name=x("name"),age=x("age"),sex=x("sex"))}
85 | sqlContext.createDataFrame(rdd).registerTempTable("user2")
86 | sqlContext.sql("select *,strtoger(name,age,sex) as udf from user2").show
87 | }
88 |   // Second way to specify a schema: build Row objects and a StructType explicitly
89 | def secondRDDToFrame(){
90 | var arraybuffer=ArrayBuffer[HashMap[String,Int]]()
91 | var map=new HashMap[String,Int]()
92 | map.put("name", 1)
93 | map.put("age", 1)
94 | map.put("sex", 1)
95 | arraybuffer+=map
96 | var liens=sc.parallelize(arraybuffer)
97 | .map(p=>Row(p.get("name"),p.get("age"),p.get("sex")))
98 | var schemaString = Array("name","age","sex")
99 | var columns=schemaString.map(fieldName => StructField(fieldName, IntegerType, true))
100 | val schema = StructType(columns)
101 | var schemaData=sqlContext.createDataFrame(liens, schema)
102 | schemaData.registerTempTable("user2")
103 | sqlContext.sql("select * from user2").show()
104 | sqlContext.sql("insert overwrite table test1.test_creat select name,age,sex from user2")
105 | }
106 | def setHiveConf() {
107 |     // With the settings below, hive-site.xml is not needed.
108 |     // The values come from /etc/hive/conf/hive-site.xml.
109 | System.setProperty("hive.metastore.uris", "thrift://CDH-Master:9083")
110 | System.setProperty("hive.metastore.warehouse.dir", "/user/hive/warehouse")
111 | System.setProperty("hive.zookeeper.quorum", "CDH-Master,Node2,Node1")
112 | System.setProperty("hive.zookeeper.client.port", "2181")
113 |
114 |
115 | System.setProperty("hive.metastore.client.socket.timeout", "300")
116 | System.setProperty("hive.warehouse.subdir.inherit.perms", "true")
117 | System.setProperty("hive.enable.spark.execution.engine", "false")
118 | System.setProperty("hive.cluster.delegation.token.store.class", "org.apache.hadoop.hive.thrift.MemoryTokenStore")
119 | System.setProperty("hive.server2.enable.doAs", "true")
120 | System.setProperty("hive.metastore.execute.setugi", "true")
121 | System.setProperty("hive.support.concurrency", "true")
122 | System.setProperty("hive.zookeeper.namespace", "hive_zookeeper_namespace_hive")
123 | System.setProperty("hive.server2.use.SSL", "false")
124 | System.setProperty("hive.conf.restricted.list", "hive.enable.spark.execution.engine")
125 | }
126 | def useHCatOutputFormatToHive() {
127 | var a = sc.parallelize(Array(("test", 1), ("test2", 2), ("test3", 3), ("test4", 4)))
128 | var job = Job.getInstance();
129 | HCatOutputFormat.setOutput(job, OutputJobInfo.create("test", "test", null));
130 | var recordSchema = getTableSchema(job.getConfiguration())
131 | HCatOutputFormat.setSchema(job, recordSchema)
132 | job.setOutputFormatClass(classOf[HCatOutputFormat])
133 | job.setOutputKeyClass(classOf[NullWritable]);
134 | job.setOutputValueClass(classOf[DefaultHCatRecord]);
135 | var jobconf = job.getConfiguration
136 | var c = a.map { x =>
137 | var record = new DefaultHCatRecord(recordSchema.size());
138 | record.setString("name", recordSchema, x._1)
139 | record.setString("age", recordSchema, x._2.toString)
140 | (NullWritable.get(), record)
141 | }
142 | c.saveAsNewAPIHadoopDataset(jobconf)
143 |
144 | }
145 | }
146 |
--------------------------------------------------------------------------------
/src/main/scala/com/spark/jdbcrdd/JdbcMysqlRDD.scala:
--------------------------------------------------------------------------------
1 | package com.spark.jdbcrdd
2 |
3 | import scala.reflect.ClassTag
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.rdd.RDD
6 | import java.sql.ResultSet
7 | import java.sql.Connection
8 | import org.apache.spark.Logging
9 | import org.apache.spark.Partition
10 | import org.apache.spark.TaskContext
11 | import org.apache.spark.annotation.DeveloperApi
12 | class JdbcMysqlPartition(idx: Int, val startId: Long, val perPartitionNum: Long) extends Partition {
13 | override def index = idx
14 | }
15 | class JdbcMysqlRDD[T:ClassTag](
16 | sc: SparkContext,
17 | getConnection: () => Connection,
18 | sql:String,
19 | numPartitions: Int,
20 | mapRow: (ResultSet) => T = JdbcMysqlRDD.resultSetToObjectArray _)
21 | extends RDD[T](sc, Nil) with Logging{
22 | override def count()=getRowsNum(sql)
23 | override def getPartitions: Array[Partition] = {
24 | val rowsNum=getRowsNum(sql)
25 |     // Number of rows each partition should fetch
26 | val perPartitionNum=rowsNum/numPartitions
27 |     // Add the remaining rows to the last partition
28 | val lastPartitionNum=perPartitionNum+(rowsNum%numPartitions)
29 | (0 until numPartitions).map(i => {
30 | val start = (i*perPartitionNum)
31 | if(i==(numPartitions-1)){
32 | new JdbcMysqlPartition(i, start, lastPartitionNum)
33 | }else
34 | new JdbcMysqlPartition(i, start, perPartitionNum)
35 | }).toArray
36 | }
37 |   /**
38 |    * Count how many rows the query returns
39 |    * @param sql the SQL query
40 |    */
41 | def getRowsNum(sql:String)={
42 | var rowsNum=0
43 | var tmpConn=getConnection()
44 | try{
45 | if(sql.toLowerCase.indexOf("from")<0){
46 | logError(" sql is error , There must be the from keyword ")
47 | }else{
48 | val nsql="select count(1) "+sql.substring(sql.toLowerCase.indexOf("from"), sql.size)
49 | val stmt = tmpConn.prepareStatement(nsql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)
50 |
51 | val rs = stmt.executeQuery()
52 | if(rs.next()){
53 | rowsNum=rs.getInt(1)
54 | }
55 | stmt.close()
56 | }
57 | }catch {
58 | case t: Throwable => t.printStackTrace() // TODO: handle error
59 | }finally {
60 | tmpConn.close()
61 | tmpConn=null
62 | }
63 | rowsNum
64 | }
65 |   // Each partition fetches its rows by paging: the base SQL gets a "limit ?,?" clause
66 | override def compute(thePart: Partition, context: TaskContext) = new NextIterator[T] {
67 | context.addTaskCompletionListener{ context => closeIfNeeded() }
68 | val part = thePart.asInstanceOf[JdbcMysqlPartition]
69 | val conn = getConnection()
70 | val partSql=sql+" limit ?,?"
71 | val stmt = conn.prepareStatement(partSql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)
72 | if (conn.getMetaData.getURL.matches("jdbc:mysql:.*")) {
73 | stmt.setFetchSize(Integer.MIN_VALUE)
74 | logInfo("statement fetch size set to: " + stmt.getFetchSize + " to force MySQL streaming ")
75 | }
76 | stmt.setLong(1, part.startId)
77 | stmt.setLong(2, part.perPartitionNum)
78 | val rs = stmt.executeQuery()
79 | override def getNext: T = {
80 | if (rs.next()) {
81 | mapRow(rs)
82 | } else {
83 | finished = true
84 | null.asInstanceOf[T]
85 | }
86 | }
87 |
88 | override def close() {
89 | try {
90 | if (null != rs) {
91 | rs.close()
92 | }
93 | } catch {
94 | case e: Exception => logWarning("Exception closing resultset", e)
95 | }
96 | try {
97 | if (null != stmt) {
98 | stmt.close()
99 | }
100 | } catch {
101 | case e: Exception => logWarning("Exception closing statement", e)
102 | }
103 | try {
104 | if (null != conn) {
105 | conn.close()
106 | }
107 | logInfo("closed connection")
108 | } catch {
109 | case e: Exception => logWarning("Exception closing connection", e)
110 | }
111 | }
112 | }
113 |
114 |
115 |
116 |
117 | }
118 | object JdbcMysqlRDD{
119 | def resultSetToObjectArray(rs: ResultSet): Array[Object] = {
120 | Array.tabulate[Object](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1))
121 | }
122 | }
123 | abstract class NextIterator[U] extends Iterator[U] {
124 | private var gotNext = false
125 | private var nextValue: U = _
126 | private var closed = false
127 | protected var finished = false
128 | protected def getNext(): U
129 | protected def close()
130 | def closeIfNeeded() {
131 | if (!closed) {
132 | closed = true
133 | close()
134 | }
135 | }
136 | override def hasNext: Boolean = {
137 | if (!finished) {
138 | if (!gotNext) {
139 | nextValue = getNext()
140 | if (finished) {
141 | closeIfNeeded()
142 | }
143 | gotNext = true
144 | }
145 | }
146 | !finished
147 | }
148 | override def next(): U = {
149 | if (!hasNext) {
150 | throw new NoSuchElementException("End of stream")
151 | }
152 | gotNext = false
153 | nextValue
154 | }
155 | }
156 |
--------------------------------------------------------------------------------
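A usage sketch for JdbcMysqlRDD; the JDBC URL, credentials and query are placeholders, and it assumes the MySQL Connector/J driver is on the classpath.

    import java.sql.{Connection, DriverManager}
    import org.apache.spark.{SparkConf, SparkContext}
    import com.spark.jdbcrdd.JdbcMysqlRDD

    // A hypothetical connection factory; host, database, user and password are placeholders.
    def createConnection(): Connection = {
      Class.forName("com.mysql.jdbc.Driver")
      DriverManager.getConnection("jdbc:mysql://mysql-host:3306/zz_reporting", "user", "password")
    }

    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("JdbcMysqlRDDDemo"))
    // Four partitions; each one pages through its share of the result set with "limit ?,?".
    val rows = new JdbcMysqlRDD[Array[Object]](sc, createConnection _,
      "select * from zz_reporting.st_rtbreport_byplan where StatDate = '2016-05-01'", 4)
    rows.map(_.mkString(",")).take(10).foreach(println)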
/src/main/scala/com/spark/jdbcrdd/SparkCSVTest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.jdbcrdd
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.sql.SQLContext
6 | import com.databricks.spark.csv._
7 | import org.apache.spark.sql.types.StructType
8 | import org.apache.spark.sql.types.StructField
9 | import org.apache.spark.sql.types.IntegerType
10 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
11 | import org.apache.hadoop.io.NullWritable
12 | import org.apache.hadoop.io.Text
13 | import org.apache.spark.rdd.RDD
14 | import scala.reflect.ClassTag
15 | object SparkCSVTest {
16 | var sc: SparkContext = null
17 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
18 | def main(args: Array[String]): Unit = {
19 | init
20 |     test
21 | }
22 |
23 | def init {
24 | val sparkConf = new SparkConf()
25 | .setMaster("local")
26 | .setAppName("Test")
27 | sc = new SparkContext(sparkConf)
28 | }
29 |   def test() {
30 | val sqlContext = new SQLContext(sc)
31 | val customSchema = StructType(Array(
32 | StructField("year", IntegerType, true),
33 | StructField("comment", IntegerType, true),
34 | StructField("blank", IntegerType, true)))
35 |
36 | val df = sqlContext.load(
37 | "com.databricks.spark.csv",
38 | schema = customSchema,
39 | Map("path" -> "C:\\Users\\zhiziyun\\Desktop\\csvtest.csv", "header" -> "true"))
40 |
41 | val selectedData = df.select("year", "comment")
42 | selectedData.save("C:\\Users\\zhiziyun\\Desktop\\re.csv", "com.databricks.spark.csv")
43 | }
44 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/jdbcrdd/SparkJdbcRDDTest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.jdbcrdd
2 |
3 | import org.apache.spark.SparkContext
4 | import java.sql.DriverManager
5 | import java.sql.ResultSet
6 |
7 | object SparkJdbcRDDTest {
8 | def main(args: Array[String]) {
9 | val sc = new SparkContext("local","spark_mysql")
10 | val numPartitions=10
11 | val sql="select * from zz_reporting.st_rtbreport_byplan where StatDate='2016-05-01'"
12 |     // Limitation: rows can be lost or duplicated, because rows may be deleted or inserted
13 |     // while the query is being paged, which changes the row order.
14 |     // Spark's built-in JdbcRDD avoids the data-loss problem, but it is much more restrictive.
15 |
16 | val data=sc.mysqlRDD(createConnection, sql, numPartitions, extractValues)
17 | data.printlnRDD
18 |
19 | sc.stop()
20 | }
21 |
22 | }
--------------------------------------------------------------------------------
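createConnection and extractValues are referenced above but not defined in this file (mysqlRDD and printlnRDD come from the traits mixed into the jdbcrdd package object); a hedged sketch of what the two missing helpers could look like, with a placeholder URL and credentials, written as if they lived in the same object.

    import java.sql.{Connection, DriverManager, ResultSet}

    // Hypothetical connection factory for the zz_reporting database.
    def createConnection(): Connection = {
      Class.forName("com.mysql.jdbc.Driver")
      DriverManager.getConnection("jdbc:mysql://mysql-host:3306/zz_reporting", "user", "password")
    }

    // Map one result row to an Array[Object], reusing the helper from JdbcMysqlRDD.
    def extractValues(rs: ResultSet): Array[Object] =
      JdbcMysqlRDD.resultSetToObjectArray(rs)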
/src/main/scala/com/spark/jdbcrdd/SparkSecondarySortKey.scala:
--------------------------------------------------------------------------------
1 | package com.spark.jdbcrdd
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkConf
5 | import java.util.HashSet
6 | import java.util.HashMap
7 | import scala.collection.mutable.ArrayBuffer
8 | import org.apache.spark.rdd.RDD
9 | import scala.collection.JavaConversions._
10 | import org.apache.hadoop.mapreduce.Job
11 | import org.apache.hadoop.fs.Path
12 | import org.apache.hadoop.mapred.JobConf
13 | import org.apache.hadoop.mapred.FileInputFormat
14 | import org.apache.spark.HashPartitioner
15 | import org.apache.spark.RangePartitioner
16 | import org.apache.spark.Partitioner
17 | import org.apache.hadoop.mapreduce.Reducer
18 | import org.apache.hadoop.io.Writable
19 | import org.apache.hadoop.io.WritableComparable
20 | import java.io.DataInput
21 | import java.io.DataOutput
22 | import org.apache.hadoop.io.WritableComparator
23 | import java.io.FileInputStream
24 | import java.io.InputStreamReader
25 | import java.io.BufferedReader
26 | import java.io.FileOutputStream
27 | import java.io.OutputStreamWriter
28 | import scala.reflect.ClassTag
29 | object SparkSecondarySortKey {
30 | var sc: SparkContext = null
31 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
32 | def main(args: Array[String]): Unit = {
33 | init
34 | val sd=new SecondarySortKey("a",1)
35 | val ds=new SecondarySortKey("a",2)
36 | println(sd.equals(ds))
37 | val a=Array(("a",1),("a",9),("b",4),("o",7),("b",9),
38 | ("b",3),("f",4),("k",8),
39 | ("a",15),("z",4),("b",1))
40 | val rdd=sc.parallelize(a)
41 |     // Secondary sort: order by the first field, then by the second
42 | val hrdd=rdd.map { case(first,second) =>
43 | val key=new SecondarySortKey(first,second)
44 | (key,second)
45 | }.groupByKey()
46 | .map{x=>(x._1,x._2.toList.sorted)}
47 | .sortByKey()
48 | .foreach(println)
49 | }
50 | def init() {
51 | val sparkConf = new SparkConf()
52 | .setMaster("local")
53 | .setAppName("Test")
54 | sc = new SparkContext(sparkConf)
55 | }
56 |
57 | /**
58 |    * Custom partitioner
59 | */
60 | class IteblogPartitioner(override val numPartitions: Int) extends Partitioner {
61 | //override def numPartitions: Int = numParts
62 | override def getPartition(key: Any): Int = {
63 | val first = key.asInstanceOf[SecondarySortKey].first
64 | val code = (first.hashCode % numPartitions)
65 | if (code<0) {
66 | code+numPartitions
67 | } else {
68 | code
69 | }
70 | }
71 | override def equals(other: Any): Boolean = other match {
72 | case iteblog: IteblogPartitioner =>
73 | iteblog.numPartitions == numPartitions
74 | case _ =>
75 | false
76 | }
77 | override def hashCode: Int = numPartitions
78 | }
79 | /**
80 |    * Custom composite key
81 | */
82 | class SecondarySortKey(var first:String,var second:Int)
83 | extends WritableComparable[SecondarySortKey] with Serializable{
84 | def set(left:String,right:Int) {
85 | first = left;
86 | second = right;
87 | }
88 | def getFirst()=first
89 | def getSecond() =second
90 | override def readFields(in:DataInput){
91 | first = in.readUTF();
92 | second = in.readInt();
93 | }
94 | override def write(out:DataOutput){
95 | out.writeUTF(first);
96 | out.writeInt(second);
97 | }
98 | override def hashCode() =first.hashCode()
99 |     // equals decides which keys are grouped together on the reduce side
100 | override def equals(right:Any) ={
101 | if (right.isInstanceOf[SecondarySortKey]) {
102 | var r = right.asInstanceOf[SecondarySortKey]
103 | r.first == first
104 | } else {
105 | false
106 | }
107 | }
108 |     // The crucial part: compareTo drives how keys are ordered
109 | def compareTo(o:SecondarySortKey) ={
110 | if (first != o.first) {
111 | first.compareTo(o.first)
112 | } else if (second != o.second) {
113 | second - o.second
114 | } else {
115 | 0
116 | }
117 | }
118 | override def toString()={
119 | first
120 | }
121 | }
122 | }
--------------------------------------------------------------------------------
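An alternative sketch of the same secondary sort that uses repartitionAndSortWithinPartitions together with the IteblogPartitioner defined above; keys come out sorted within each partition without building a per-key list the way groupByKey does. It assumes it runs inside SparkSecondarySortKey, with the rdd built in main.

    val secondarySorted = rdd
      .map { case (first, second) => (new SecondarySortKey(first, second), second) }
      .repartitionAndSortWithinPartitions(new IteblogPartitioner(2))
    // Each partition is now ordered by (first, second).
    secondarySorted.map { case (key, second) => (key.getFirst(), second) }
      .foreach(println)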
/src/main/scala/com/spark/jdbcrdd/package.scala:
--------------------------------------------------------------------------------
1 | package com.spark
2 |
3 | import com.fun.util.RDDOperateFunction
4 | import com.fun.util.SparkContextOperateFunction
5 | import com.fun.util.ZzyLmqDataOperateUtil
6 | import org.apache.spark.SparkContext
7 | import org.apache.spark.rdd.RDD
8 | package object jdbcrdd extends RDDOperateFunction
9 | with SparkContextOperateFunction
10 | with ZzyLmqDataOperateUtil{
11 |   // Extension methods can come from the mixed-in traits above, or be written directly here
12 | implicit class SparkContextNewFunction(sparkContext: SparkContext) {
13 | def lmq(name: String) = ""
14 | }
15 |
16 |   // Using an implicit parameter
17 | implicit class RDDNewFunction[T](rdd: RDD[T]) {
18 | def lmq3(str: String)(implicit impl:Array[T])=rdd.map { x => x + " : "+impl(0) }
19 | def lmq4[A](str: String)(implicit impl:Array[A])=rdd.map { x => x + " : "+impl(0) }
20 | }
21 |
22 | }
--------------------------------------------------------------------------------
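A usage sketch for the implicit-parameter extension lmq3 above, assuming an existing SparkContext sc.

    import com.spark.jdbcrdd._

    val words = sc.parallelize(Seq("a", "b", "c"))
    // The implicit Array[String] in scope is picked up as lmq3's impl argument.
    implicit val tags: Array[String] = Array("tag")
    words.lmq3("unused").foreach(println)   // prints "a : tag", "b : tag", "c : tag"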
/src/main/scala/com/spark/kafka/HashMapEncoder.scala:
--------------------------------------------------------------------------------
1 | package com.spark.kafka
2 |
3 | import kafka.serializer.Encoder
4 | import java.util.HashMap
5 | import kafka.serializer.StringEncoder
6 | import java.io.ByteArrayOutputStream
7 | import java.io.ObjectOutputStream
8 |
9 | // Minimal encoder sketch: serialize the map with standard Java serialization
10 | // (one possible implementation; it assumes the map values are themselves Serializable).
11 | class HashMapEncoder extends Encoder[HashMap[String, Any]] {
12 |   override def toBytes(map: HashMap[String, Any]): Array[Byte] = {
13 |     val bos = new ByteArrayOutputStream()
14 |     val oos = new ObjectOutputStream(bos)
15 |     oos.writeObject(map)
16 |     oos.close()
17 |     bos.toByteArray
18 |   }
19 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/kafka/KafkaProducerCache.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2016-2016, Benjamin Fradet, and other contributors.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one
5 | * or more contributor license agreements. See the NOTICE file
6 | * distributed with this work for additional information
7 | * regarding copyright ownership. The ASF licenses this file
8 | * to you under the Apache License, Version 2.0 (the
9 | * "License"); you may not use this file except in compliance
10 | * with the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing,
15 | * software distributed under the License is distributed on an
16 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 | * KIND, either express or implied. See the License for the
18 | * specific language governing permissions and limitations
19 | * under the License.
20 | */
21 |
22 | package com.spark.kafka
23 |
24 | import java.util.Properties
25 | import org.apache.kafka.clients.producer.KafkaProducer
26 |
27 | import scala.collection.mutable
28 |
29 | /** Cache of [[KafkaProducer]]s */
30 |
31 | object KafkaProducerCache {
32 | private val producers = mutable.HashMap.empty[Properties, KafkaProducer[_, _]]
33 |
34 | /**
35 | * Retrieve a [[KafkaProducer]] in the cache or create a new one
36 | * @param producerConfig properties for a [[KafkaProducer]]
37 | * @return a [[KafkaProducer]] already in the cache
38 | */
39 | def getProducer[K, V](producerConfig: Properties): KafkaProducer[K, V] = {
40 | producers.getOrElse(producerConfig, {
41 | val producer = new KafkaProducer[K, V](producerConfig)
42 | producers(producerConfig) = producer
43 | producer
44 | }).asInstanceOf[KafkaProducer[K, V]]
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/scala/com/spark/kafka/RDDKafkaWriter.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2016-2016, Benjamin Fradet, and other contributors.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one
5 | * or more contributor license agreements. See the NOTICE file
6 | * distributed with this work for additional information
7 | * regarding copyright ownership. The ASF licenses this file
8 | * to you under the Apache License, Version 2.0 (the
9 | * "License"); you may not use this file except in compliance
10 | * with the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing,
15 | * software distributed under the License is distributed on an
16 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 | * KIND, either express or implied. See the License for the
18 | * specific language governing permissions and limitations
19 | * under the License.
20 | */
21 |
22 | package com.spark.kafka
23 |
24 | import java.util.Properties
25 |
26 | import org.apache.kafka.clients.producer.ProducerRecord
27 | import org.apache.spark.rdd.RDD
28 |
29 | import scala.reflect.ClassTag
30 |
31 | /**
32 | * Class used for writing [[RDD]]s to Kafka
33 | * @param rdd [[RDD]] to be written to Kafka
34 | */
35 | class RDDKafkaWriter[T](@transient private val rdd: RDD[T])
36 | extends Serializable {
37 | /**
38 | * Write a [[RDD]] to Kafka
39 | * @param producerConfig properties for a [[org.apache.kafka.clients.producer.KafkaProducer]]
40 |    * @param transformFunc a function used to transform values of type T into [[ProducerRecord]]s
41 | */
42 | def writeToKafka[K, V](
43 | producerConfig: Properties,
44 | transformFunc: T => ProducerRecord[K, V]
45 | ): Unit =
46 | rdd.foreachPartition { partition =>
47 | val producer = KafkaProducerCache.getProducer[K, V](producerConfig)
48 | partition.map(transformFunc)
49 | .foreach(record => producer.send(record))
50 |
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/scala/com/spark/kafka/SparkKafkaRDDReader.scala:
--------------------------------------------------------------------------------
1 | package com.spark.kafka
2 |
3 | import org.apache.spark.streaming.kafka.KafkaClusterManager
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.SparkConf
6 |
7 | object SparkKafkaRDDReader {
8 | var sc:SparkContext=null
9 | def main(args: Array[String]): Unit = {
10 | init
11 | val topics=Set("realtimereport")
12 | var kafkaParams = Map[String, String](
13 | "metadata.broker.list" ->"kafka1:9092,kafka2:9092,kafka3:9092",
14 | "serializer.class" -> "kafka.serializer.StringEncoder",
15 | "group.id" -> "ZhiZiYunReportStorageRunMain_Box")
16 |
17 | val kafkaRdd= KafkaClusterManager.createKafkaRDD(sc, kafkaParams, topics)
18 | kafkaRdd.take(10).foreach(println)
19 | }
20 | def init() {
21 | val sparkConf = new SparkConf()
22 | .setMaster("local[2]")
23 | .setAppName("Test")
24 | .set("spark.streaming.kafka.maxRatePerPartition", "10")
25 | sc = new SparkContext(sparkConf)
26 | }
27 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/kafka/SparkWriteDataToKafkaRunMain.scala:
--------------------------------------------------------------------------------
1 | package com.spark.kafka
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.streaming.StreamingContext
6 | import org.apache.spark.streaming.Seconds
7 | import org.apache.spark.streaming.kafka.KafkaClusterManager
8 | import org.apache.kafka.clients.producer.ProducerRecord
9 | import java.util.Properties
10 | import org.apache.kafka.common.serialization.StringSerializer
11 | import org.slf4j.LoggerFactory
12 | import org.apache.kafka.clients.producer.KafkaProducer
13 | import java.util.HashMap
14 | import org.apache.kafka.clients.producer.ProducerConfig
15 | import org.apache.hadoop.hbase.HBaseConfiguration
16 | import org.apache.hadoop.hbase.client.ConnectionFactory
17 | import org.apache.hadoop.hbase.TableName
18 | import org.apache.hadoop.hbase.client.Get
19 | import org.apache.hadoop.hbase.util.Bytes
20 | import org.apache.hadoop.hbase.client.Put
21 | object SparkWriteDataToKafkaRunMain {
22 | var sc: SparkContext = null
23 | var ssc: StreamingContext = null
24 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
25 | import org.apache.log4j.{ Level, Logger }
26 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
27 | val zookeeper = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3"
28 | val producerConfig = {
29 | val p = new Properties()
30 | p.setProperty("bootstrap.servers", "kafka1:9092,kafka2:9092,kafka3:9092")
31 | p.setProperty("key.serializer", classOf[StringSerializer].getName)
32 | p.setProperty("value.serializer", classOf[StringSerializer].getName)
33 | p.setProperty("zookeeper.connect", "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3")
34 | p
35 | }
36 | val producer = new KafkaProducer[String, String](producerConfig)
37 |
38 | def main(args: Array[String]): Unit = {
39 | initSCC
40 | writeDataToKafka
41 | // send
42 |
43 | }
44 | def send() {
45 | for (i <- 1 to 20) {
46 | val producer = new KafkaProducer[String, String](producerConfig)
47 | val (rowkey, data) = (1, 2)
48 | producer.send(new ProducerRecord[String, String]("test", rowkey + "," + data))
49 | producer.close()
50 | }
51 |
52 | }
53 |
54 | def writeDataToKafka() {
55 | //var topics = Set("smartadsdeliverylog")
56 | var topics = Set("test")
57 | var kafkaParams = Map[String, String]("metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092",
58 | "serializer.class" -> "kafka.serializer.StringEncoder", "group.id" -> "test", "zookeeper.connect" -> zookeeper)
59 | val dstream = KafkaClusterManager.createDirectStream(ssc, kafkaParams, topics)
60 | dstream.foreachRDD { rdd =>
61 | println("#############################3333")
62 | rdd.map {
63 | case (rowkey, value) =>
64 | val str = value.split(",")
65 | (str(0), (rowkey, str))
66 | }
67 | .groupByKey
68 | .foreachPartition { x =>
69 | var hconf = HBaseConfiguration.create()
70 | hconf.set("hbase.zookeeper.quorum", zookeeper)
71 | hconf.set("hbase.zookeeper.property.clientPort", "2181")
72 | val conn = ConnectionFactory.createConnection(hconf)
73 | val table = conn.getTable(TableName.valueOf("reportbox_2"))
74 | for ((tablename, rowjey_str) <- x) {
75 | for ((rowkey, str) <- rowjey_str) {
76 | var delivery = str(1).toInt
77 | var deliveryUV = str(2).toInt
78 | var cpmcost = str(3).toDouble
79 | var additionalcpmcost = str(4).toDouble
80 | var fee = str(5).toDouble
81 | var fee2 = str(6).toDouble
82 | var click = str(7).toInt
83 | var clickUV = str(8).toInt
84 | var reach = str(9).toInt
85 | var reachUV = str(10).toInt
86 | var visitLength = str(11).toInt
87 | var sencondsClick = str(12).toInt
88 |
89 | val get = new Get(Bytes.toBytes(rowkey))
90 | val result = table.get(get)
91 | if (!result.isEmpty()) {
92 | val hdelivery = result.getValue("info".getBytes, "delivery".getBytes)
93 | val hdeliveryUV = result.getValue("info".getBytes, "deliveryUV".getBytes)
94 | val hcpmcost = result.getValue("info".getBytes, "cpmcost".getBytes)
95 | val hadditionalcpmcost = result.getValue("info".getBytes, "additionalcpmcost".getBytes)
96 | val hfee = result.getValue("info".getBytes, "fee".getBytes)
97 | val hfee2 = result.getValue("info".getBytes, "fee2".getBytes)
98 | val hclick = result.getValue("info".getBytes, "click".getBytes)
99 | val hclickUV = result.getValue("info".getBytes, "clickUV".getBytes)
100 | val hreach = result.getValue("info".getBytes, "reach".getBytes)
101 | val hreachUV = result.getValue("info".getBytes, "reachUV".getBytes)
102 | val hvisitLength = result.getValue("info".getBytes, "visitLength".getBytes)
103 | val hsencondsClick = result.getValue("info".getBytes, "sencondsClick".getBytes)
104 |
105 | if (hdelivery!= null) delivery = delivery + new String(hdelivery).toInt
106 | if (hdeliveryUV!= null) deliveryUV = deliveryUV + new String(hdeliveryUV).toInt
107 | if (hcpmcost != null) cpmcost = cpmcost + new String(hcpmcost).toDouble
108 | if (hadditionalcpmcost != null) additionalcpmcost = additionalcpmcost + new String(hadditionalcpmcost).toDouble
109 | if (hfee != null) fee = fee + new String(hfee).toDouble
110 | if (hfee2 != null) fee2 = fee2 + new String(hfee2).toDouble
111 | if (hclick != null) click = click + new String(hclick).toInt
112 | if (hclickUV != null) clickUV = clickUV + new String(hclickUV).toInt
113 | if (hreach != null) reach = reach + new String(hreach).toInt
114 | if (hreachUV != null) reachUV = reachUV + new String(hreachUV).toInt
115 | if (hvisitLength != null) visitLength = visitLength + new String(hvisitLength).toInt
116 | if (hsencondsClick != null) sencondsClick = sencondsClick + new String(hsencondsClick).toInt
117 |
118 |
119 |
120 | }
121 | val put = new Put(Bytes.toBytes(rowkey))
122 | put.addColumn("info".getBytes, "delivery".getBytes, delivery.toString().getBytes)
123 | put.addColumn("info".getBytes, "deliveryUV".getBytes, deliveryUV.toString().getBytes)
124 | put.addColumn("info".getBytes, "cpmcost".getBytes, cpmcost.toString().getBytes)
125 | put.addColumn("info".getBytes, "additionalcpmcost".getBytes, additionalcpmcost.toString().getBytes)
126 | put.addColumn("info".getBytes, "fee".getBytes, fee.toString().getBytes)
127 | put.addColumn("info".getBytes, "fee2".getBytes, fee2.toString().getBytes)
128 | put.addColumn("info".getBytes, "click".getBytes, click.toString().getBytes)
129 | put.addColumn("info".getBytes, "clickUV".getBytes, clickUV.toString().getBytes)
130 | put.addColumn("info".getBytes, "reach".getBytes, reach.toString().getBytes)
131 | put.addColumn("info".getBytes, "reachUV".getBytes, reachUV.toString().getBytes)
132 | put.addColumn("info".getBytes, "visitLength".getBytes, visitLength.toString().getBytes)
133 | put.addColumn("info".getBytes, "sencondsClick".getBytes, sencondsClick.toString().getBytes)
134 | table.put(put)
135 |
136 | }
137 | }
138 | table.close()
139 | conn.close()
140 | }
141 |
142 | //rdd.writeToKafka(producerConfig,s=>new ProducerRecord[String, String]("test", "@@@@@@"))
143 | println("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@22")
144 | }
145 | ssc.start()
146 | ssc.awaitTermination()
147 | }
148 | def initSC() {
149 | val sparkConf = new SparkConf()
150 | .setMaster("local[2]")
151 | .setAppName("Test")
152 | sc = new SparkContext(sparkConf)
153 |
154 | }
155 | def initSCC() {
156 | if (sc == null) {
157 | initSC
158 | }
159 | ssc = new StreamingContext(sc, Seconds(30))
160 | }
161 |
162 | def tran(s: (String, String)) = new ProducerRecord[String, String]("test", s._1)
163 |
164 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/kafka/package.scala:
--------------------------------------------------------------------------------
1 | package com.spark
2 |
3 | import org.apache.spark.rdd.RDD
4 | package object kafka {
5 | implicit def writeDataToKafka2[T](rdd: RDD[T])=new RDDKafkaWriter(rdd)
6 |
7 | }
--------------------------------------------------------------------------------
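A usage sketch for the writeDataToKafka2 implicit above, assuming an existing SparkContext sc and the broker list used elsewhere in this project.

    import java.util.Properties
    import org.apache.kafka.clients.producer.ProducerRecord
    import org.apache.kafka.common.serialization.StringSerializer
    import com.spark.kafka._

    val producerConfig = new Properties()
    producerConfig.setProperty("bootstrap.servers", "kafka1:9092,kafka2:9092,kafka3:9092")
    producerConfig.setProperty("key.serializer", classOf[StringSerializer].getName)
    producerConfig.setProperty("value.serializer", classOf[StringSerializer].getName)

    // The implicit conversion wraps the RDD in an RDDKafkaWriter.
    sc.parallelize(Seq("a", "b", "c"))
      .writeToKafka[String, String](producerConfig, s => new ProducerRecord[String, String]("test", s))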
/src/main/scala/com/spark/ml/ALSDemo.scala:
--------------------------------------------------------------------------------
1 | package com.spark.ml
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.mllib.recommendation.Rating
6 | import org.apache.spark.mllib.recommendation.ALS
7 | import org.apache.spark.mllib.recommendation.Rating
8 | import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
9 | import java.util.ArrayList
10 | import org.apache.spark.mllib.evaluation.RegressionMetrics
11 | import org.apache.spark.mllib.recommendation.ALS
12 | import org.apache.log4j.Logger
13 | import org.apache.log4j.Level
14 | /**
15 |  * Training a model is really about choosing its parameters.
16 |  * Train on a portion of the known data; once the mean squared error between the
17 |  * predicted ratings and the real ratings is small enough, save the model (its parameters),
18 |  * score the full data set and store the predictions in a database.
19 |  * At serving time, look up a user's predicted ratings by user id, sort them, and recommend the top items.
20 |  * ALS.train trains an ALS model; model.predict applies it to make predictions.
21 | */
22 | object ALSDemo {
23 | def main(args: Array[String]): Unit = {
24 | val conf = new SparkConf()
25 | .setMaster("local")
26 | .setAppName("Spark Pi")
27 | System.setProperty("hadoop.home.dir", "f:\\eclipse\\hdplocal2.6.0")
28 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
29 | val sc = new SparkContext(conf)
30 |     // rank: the number of latent factors k (the factorization needs A(X*Y) = U(X*k) L(k*Y)); numIterations: the number of iterations
31 | var rank = 10
32 | var numIterations = 19
33 | println(makeModel(sc, rank, numIterations))
34 | makeRecommend(sc, rank, numIterations)
35 | /* var resultMSE=new ArrayList[String]
36 | for(numIterations<- 30 until 31){
37 | val MSE= makeModel(sc,rank,numIterations)
38 | resultMSE.add(numIterations+":"+MSE)
39 | }
40 | println(resultMSE)*/
41 | }
42 | def makeRecommend(sc: SparkContext, rank: Int, numIterations: Int) {
43 |     // input lines are: user, item, rating, timestamp
44 |     // keep only the first three fields
45 | val data = sc.textFile("file:\\F:\\workspace\\BigData-Test-OtherDemos\\inputFile\\test2.data", sc.defaultMinPartitions)
46 | val ratings = data.map { _.split(",").take(3) }
47 | .map { x => Rating(x(0).toInt, x(1).toInt, x(2).toDouble) }
48 |     // train the model
49 | val model = ALS.train(ratings, rank, numIterations, 0.01)
50 |     // get the user and product data sets
51 |
52 | //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>.
53 | /**
54 |      * Use the trained model to compute scores and make recommendations
55 | */
56 | val users = data.map { _.split(",").take(3) }.map { x => x(0) }.distinct().collect()
57 | users.foreach {
58 |       // recommend items for each user in turn
59 | user =>
60 | {
61 |           val rs = model.recommendProducts(user.toInt, 10) // first arg: user id, second: how many products to return
62 | var values = ""
63 | var key = 0
64 |           // build up the recommendation string
65 | rs.foreach { r =>
66 | {
67 | key = r.user
68 | values = values + r.product + ":" + r.rating + "\n"
69 | }
70 | }
71 |           // print the recommendations
72 | println(key.toString() + " => " + values)
73 | }
74 | }
75 | //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
76 | }
77 |
78 | def makeModel(sc: SparkContext, rank: Int, numIterations: Int): Double = {
79 | //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
80 | /**
81 |      * This part trains the model
82 | */
83 |
84 |     // input lines are: user, item, rating, timestamp
85 |     // use the first three fields
86 | val data = sc.textFile("file:\\F:\\workspace\\BigData-Test-OtherDemos\\inputFile\\test2.data", sc.defaultMinPartitions)
87 | val ratings = data.map { _.split(",") }
88 | .map { x => Rating(x(0).toInt, x(1).toInt, x(2).toDouble) }
89 |     // train the model
90 | val model = ALS.train(ratings, rank, numIterations, 0.01)
91 |     // build (user, product) pairs; a prediction is generated for each pair, and only for pairs that exist
92 | //val userProducts=ratings.map { case Rating(user,product,rate) => (user,product)}
93 | val user = sc.textFile("file:\\F:\\workspace\\BigData-Test-OtherDemos\\inputFile\\user", sc.defaultMinPartitions).map { _.toInt }
94 | val product = sc.textFile("file:\\F:\\workspace\\BigData-Test-OtherDemos\\inputFile\\product", sc.defaultMinPartitions).map { _.toInt }
95 |     // Cartesian product of users and products
96 | val userProducts = user.cartesian(product)
97 |     // predict scores every (user, product) pair with the model and returns the predicted ratings;
98 |     // each pair is (user, product), and a pair is only scored if both ids appeared in the data:
99 |     // a product that never occurred gets no prediction, even if the pair itself is listed
100 | val predictions = model.predict(userProducts).map { case Rating(user, product, rate) => ((user, product), rate) }
101 |     // join the real ratings with the predicted ratings
102 | val ratesAndPreds = ratings.map { case Rating(user, product, rate) => ((user, product), rate) }
103 | .join(predictions)
104 |     // print actual vs predicted ratings side by side
105 | ratesAndPreds.foreach(println)
106 |     // compute the mean squared error between predicted and actual ratings; smaller means more accurate (mean() averages the squared errors)
107 | val MSE = ratesAndPreds.map {
108 | case ((user, products), (r1, r2)) =>
109 | val err = (r1 - r2)
110 | err * err
111 | }.mean()
112 |     // use MLlib's built-in RegressionMetrics to compute the MSE
113 | val predictedAndTrue = ratesAndPreds.map { case ((user, products), (r1, r2)) => (r1, r2) }
114 | val DefaultMSE = new RegressionMetrics(predictedAndTrue)
115 |     // save the model
116 | // model.save(sc, "")
117 |     // load a saved model
118 | // val loadModel=MatrixFactorizationModel.load(sc, "")
119 |
120 |     // print the MSE and the prediction results
121 |     println("MSE between predicted and actual ratings: " + MSE)
122 |     println("Built-in MSE between predicted and actual ratings: " + DefaultMSE.meanSquaredError)
123 |     // if the MSE is acceptable, the predicted ratings can be stored
124 | val result = predictions.map {
125 | case ((user, product), rate) => (user, (product, rate))
126 | }
127 | .groupByKey
128 | .map { data =>
129 | {
130 | val resultda = data._2.map(product => {
131 | data._1 + "::" + product._1 + "::" + product._2
132 | })
133 | resultda
134 | }
135 | }
136 | result.flatMap(x => x).foreach { println }
137 | //result.flatMap(x=>x).saveAsTextFile("outfile/ASLresult")
138 | return MSE
139 | //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
140 | }
141 | }
--------------------------------------------------------------------------------
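A small sketch of the save/load round trip mentioned in the commented-out lines of makeModel, written as if it ran inside makeModel right after training; the HDFS path is a placeholder.

    // Persist the trained model and reload it later.
    model.save(sc, "hdfs:///models/als-demo")
    val reloaded = MatrixFactorizationModel.load(sc, "hdfs:///models/als-demo")
    println(reloaded.predict(1, 2))   // predicted rating of product 2 for user 1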
/src/main/scala/com/spark/ml/ClassifierDemo.scala:
--------------------------------------------------------------------------------
1 | package com.spark.ml
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.mllib.regression.LabeledPoint
6 | import org.apache.spark.mllib.linalg.Vectors
7 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
8 | import org.apache.spark.rdd.RDD
9 | import org.apache.spark.mllib.classification.NaiveBayes
10 | import org.apache.spark.mllib.classification.SVMWithSGD
11 | import org.apache.spark.mllib.tree.DecisionTree
12 | import org.apache.spark.mllib.tree.configuration.Algo
13 | import org.apache.spark.mllib.tree.impurity.Entropy
14 | /**
15 |  * Linear models (logistic regression, linear support vector machine / SVM)
16 |  * The core idea of a linear model is to model the prediction for each sample by applying a simple linear prediction function y = f(w * x) to the input variables (the feature matrix).
17 |  * The linear prediction function is fitted on the training data (yielding the weight vector w); logistic regression or a linear SVM (the loss function) then produces the prediction from w and the features.
18 |  * Accuracy is used to decide which loss function to use for prediction.
19 |  * x is the input feature matrix, y is the (predicted) value.
20 |  * During training, y is the actual value and x the features; we look for a weight vector that minimizes the total error, computed by the loss function, over all training samples (ultimately solving for w).
21 |  * 1 1.1951146419526084 0.9947742549449248 0.19840725400812698 2.48569644222758 1.7391898607628944
22 |  * The first field is the label (the class); the remaining fields are the feature values.
23 | */
24 | object ClassifierDemo {
25 | def main(args: Array[String]): Unit = {
26 | val conf = new SparkConf()
27 | .setMaster("local")
28 | .setAppName("ClassifierDemo")
29 | System.setProperty("hadoop.home.dir", "D:\\eclipse\\hdplocal2.6.0")
30 | val sc = new SparkContext(conf)
31 | var numIterations=10
32 |
33 | val data=sc.textFile("inputFile/lr_data.txt", sc.defaultMinPartitions)
34 |     // extract the feature vectors
35 | val records=data.map { line => line.split(" ") }
36 |
37 | //logisticRegressionWithSGDModel(records,numIterations)
38 | //naiveBayesModel(records,numIterations)
39 | //svmWithSGDModel(records,numIterations)
40 | decisionTree(sc,numIterations)
41 |
42 |
43 | }
44 | /**
45 |    * Logistic regression model
46 | */
47 | def logisticRegressionWithSGDModel(records:RDD[Array[String]],numIterations:Int){
48 |     // clean the data: the first field is the label, the rest are features
49 | val rawData=records.map { r => {
50 | val label=if(r(0).toInt<0) 0.0 else r(0).toInt
51 | val features=r.slice(1, r.size-1).map (_.toDouble)
52 | LabeledPoint(label,Vectors.dense(features))}
53 | }
54 | val lrModel=LogisticRegressionWithSGD.train(rawData, numIterations)
55 |     // compute the accuracy
56 | val lrAccuracy=rawData.map { point => {
57 | if(lrModel.predict(point.features) == point.label) 1 else 0
58 | }
59 | }.sum()/rawData.count()
60 |     println("accuracy: " + lrAccuracy)
61 | //
62 |
63 |     /* // predict which class (0 or 1) each feature vector belongs to
64 | val prediction=lrModel.predict(rawData.map { data => data.features }).collect()
65 |     // the actual labels
66 | val label=rawData.map { x => x.label }.collect()
67 |
68 | for(i<- 0 until label.length){
69 | if(prediction.apply(i)==label.apply(i)){
70 | println("预测:"+prediction.apply(i)+"->>> 实际:"+label.apply(i))
71 | }else
72 | {
73 | println("预测:"+prediction.apply(i)+"@@@@@ 实际:"+label.apply(i))
74 | }
75 | }*/
76 | }
77 |
78 | /**
79 | * Naive Bayes model (feature values must not be negative)
80 | * map{x=> if(x.toDouble<0) 0.0 else x.toDouble}
81 | */
82 | def naiveBayesModel(input:RDD[Array[String]],numIterations:Int){
83 | val rawData=input.map { r => {
84 | val label=if(r(0).toInt<0) 0.0 else r(0).toInt
85 | val features=r.slice(1, r.size-1).map{x=> if(x.toDouble<0) 0.0 else x.toDouble}
86 | LabeledPoint(label,Vectors.dense(features))}
87 | }
88 | val nbModel=NaiveBayes.train(rawData,numIterations)
89 |
90 | //compute the accuracy
91 | val nbAccuracy=rawData.map { point => {
92 | if(nbModel.predict(point.features) == point.label) 1 else 0
93 | }
94 | }.sum()/rawData.count()
95 | println("accuracy: "+nbAccuracy)
96 | //
97 |
98 | /*
99 | val prediction=nbModel.predict(rawData.map { data => data.features }).collect()
100 | //the actual label values
101 | val label=rawData.map { x => x.label }.collect()
102 |
103 | for(i<- 0 until label.length){
104 | if(prediction.apply(i)==label.apply(i)){
105 | println("predicted: "+prediction.apply(i)+" ->>> actual: "+label.apply(i))
106 | }else
107 | {
108 | println("predicted: "+prediction.apply(i)+" @@@@@ actual: "+label.apply(i))
109 | }
110 | }*/
111 |
112 |
113 | }
114 |
115 | /**
116 | * SVM model
117 | */
118 | def svmWithSGDModel(input:RDD[Array[String]],numIterations:Int){
119 | val rawData=input.map { r => {
120 | val label=if(r(0).toInt<0) 0.0 else r(0).toInt
121 | val features=r.slice(1, r.size-1).map(_.toDouble)
122 | LabeledPoint(label,Vectors.dense(features))}
123 | }
124 | val svmModel=SVMWithSGD.train(rawData,numIterations)
125 |
126 | //compute the accuracy
127 | val svmAccuracy=rawData.map { point => {
128 | if(svmModel.predict(point.features) == point.label) 1 else 0
129 | }
130 | }.sum()/rawData.count()
131 | println("accuracy: "+svmAccuracy)
132 | //
133 | }
134 | /**
135 | * Decision tree
136 | */
137 | def decisionTree(sc:SparkContext,numIterations:Int){
138 | val data=sc.textFile("inputFile/sample_tree_data.csv", sc.defaultMinPartitions)
139 | //extract the feature vectors
140 | val records=data.map { line => line.split(",") }
141 | val rawData=records.map { r => {
142 | val label=r(0).toInt
143 | val features=r.slice(1, r.size-1).map(_.toDouble)
144 | LabeledPoint(label,Vectors.dense(features))}
145 | }
146 | //hold-out split (not real k-fold cross-validation): 90% of the raw data for training, 10% for testing
147 | val Array(trainData,cvData)=rawData.randomSplit(Array(0.9,0.1), 123)
148 |
149 | val treeModel=DecisionTree.train(trainData, Algo.Classification, Entropy, 29)
150 |
151 |
152 |
153 | //compute the accuracy
154 | val treeAccuracy=cvData.map { point => {
155 | if(treeModel.predict(point.features) == point.label) 1 else 0
156 | }
157 | }.sum()/cvData.count()
158 | println("accuracy: "+treeAccuracy)
159 | //
160 |
161 | }
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 | }
--------------------------------------------------------------------------------
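
The header comment of ClassifierDemo describes the linear-model idea (y = f(w*x)) and the input format "label followed by features". The following is a minimal, self-contained sketch of that parse-train-score cycle using the same old MLlib API and the same input file as the class above; the object name and iteration count are illustrative choices, not part of the repository.

    // Sketch only: parse "label feature1 feature2 ..." lines, train, measure accuracy.
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint

    object LinearModelSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("LinearModelSketch"))
        // each line of lr_data.txt: "<label> <feature1> <feature2> ...", as described in ClassifierDemo
        val points = sc.textFile("inputFile/lr_data.txt").map { line =>
          val cols = line.split(" ")
          val label = if (cols(0).toInt < 0) 0.0 else cols(0).toDouble
          LabeledPoint(label, Vectors.dense(cols.drop(1).map(_.toDouble)))
        }
        val model = LogisticRegressionWithSGD.train(points, 10) // learns the weight vector w
        // accuracy = fraction of points whose predicted class equals the label
        val accuracy = points.map(p => if (model.predict(p.features) == p.label) 1.0 else 0.0).mean()
        println(s"accuracy: $accuracy")
        sc.stop()
      }
    }
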
/src/main/scala/com/spark/ml/TestVector.scala:
--------------------------------------------------------------------------------
1 | package com.spark.ml
2 |
3 | import org.apache.spark.mllib.linalg.{ Vector, Vectors }
4 | import org.apache.spark.mllib.linalg.DenseMatrix
5 | import breeze.linalg._
6 | import org.apache.spark.mllib.linalg.DenseVector
7 | import org.apache.spark.mllib.linalg.SparseVector
8 | object TestVector {
9 | def main(args: Array[String]): Unit = {
10 | //create a dense vector
11 | var a = Vectors.dense(1.0, 2.0, 3.0)
12 |
13 | var b = Vectors.dense(1.0, 2.0, 3.0)
14 | var a2=new SparseVector(3,Array(0, 1, 2),Array(1.0, 2.0, 3.0))
15 | //a2.dot(a2)
16 | var a1=new DenseVector(Array(1.0, 2.0, 3.0))
17 | //a1.dot(b)
18 | // b.toDense.dot(a.toDense)
19 | }
20 | }
--------------------------------------------------------------------------------
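
TestVector leaves its dot products commented out because the mllib Vector type does not expose a public dot method in this Spark version. A small hedged sketch of one way around that is to convert to breeze (which the file already imports) and use breeze's dot operator; the object name and the example values are illustrative only.

    // Sketch: dot product of two mllib vectors via breeze.
    import org.apache.spark.mllib.linalg.Vectors
    import breeze.linalg.{DenseVector => BDV}

    object VectorDotSketch {
      def main(args: Array[String]): Unit = {
        val a = Vectors.dense(1.0, 2.0, 3.0)
        val b = Vectors.dense(4.0, 5.0, 6.0)
        // mllib vectors have no public dot here, so go through breeze
        val dot = BDV(a.toArray) dot BDV(b.toArray)
        println(dot) // 1*4 + 2*5 + 3*6 = 32.0
      }
    }
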
/src/main/scala/com/spark/myrdd/CaseClassUtil.scala:
--------------------------------------------------------------------------------
1 | package com.spark.myrdd
2 |
3 | object CaseClassUtil extends Serializable{
4 |
5 | case class User(name:String,age:Int,phone:String)
6 | case class Address(name:String,address:String,phone:String)
7 | case class Detail(name:String,phone:String)
8 |
9 | case class Table1(name:String,age:Int,address:String)
10 | case class Table2(name:String,age:Int)
11 |
12 | case class HiveTempTable(id:Int,name:String)
13 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/myrdd/ImplicitParameter.scala:
--------------------------------------------------------------------------------
1 | package com.spark.myrdd
2 |
3 | trait ImplicitParameter {
4 | //implicit parameters
5 | implicit val a = Array[String]("@")
6 | implicit val b = Array[Int](1)
7 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/myrdd/MySelfRDD.scala:
--------------------------------------------------------------------------------
1 | package com.spark.myrdd
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.Partition
5 | import org.apache.spark.TaskContext
6 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
7 | import org.apache.spark.SparkContext
8 | //extending RDD[String] with Nil dependencies means there is no upstream RDD;
9 | //this RDD is created directly from the SparkContext instead of by transforming another RDD
10 | class MySelfRDD(@transient sc : SparkContext,val strs:Array[String])extends RDD[String](sc,Nil){
11 | //compute produces the data of each partition of the RDD
12 | override def compute(split: Partition, context: TaskContext):Iterator[String] ={
13 | //get this partition's data
14 | val splits = split.asInstanceOf[MySelfPartition]
15 | Array[String](splits.content).toIterator
16 | }
17 | //getPartitions lets the developer define the partitions of the RDD
18 | override protected def getPartitions: Array[Partition] ={
19 | val array = new Array[Partition](strs.size)
20 | for (i <- 0 until strs.size) {
21 | array(i) = new MySelfPartition(i, strs(i))
22 | }
23 | array
24 | }
25 | }
26 |
27 | class MySelfRDD2(parent:RDD[String],data:String)extends RDD[String](parent){
28 | //compute produces the data of each partition of the RDD
29 | override def compute(split: Partition, context: TaskContext):Iterator[String] ={
30 | //read the parent partition's data and transform it
31 | parent.iterator(split, context).map { x => data+x }
32 | }
33 | //getPartitions lets the developer define the partitions of the RDD
34 | override protected def getPartitions: Array[Partition] =
35 | parent.partitions
36 | }
37 |
38 | class MySelfPartition(idx: Int, val content: String) extends Partition {
39 | override def index: Int = idx
40 | }
--------------------------------------------------------------------------------
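
The comments in MySelfRDD explain that it is built directly from the SparkContext (no parent RDD), one partition per input string. A minimal usage sketch, not part of the repository, showing what getPartitions and compute produce:

    // Sketch: each element of the input array becomes one partition of MySelfRDD.
    import org.apache.spark.{SparkConf, SparkContext}
    import com.spark.myrdd.MySelfRDD

    object MySelfRDDUsage {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("MySelfRDDUsage"))
        val rdd = new MySelfRDD(sc, Array("a", "b", "c"))
        println(rdd.partitions.length) // 3, one MySelfPartition per input string
        rdd.collect().foreach(println) // a, b, c, each coming from compute on its partition
        sc.stop()
      }
    }
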
/src/main/scala/com/spark/myrdd/TestMain.scala:
--------------------------------------------------------------------------------
1 | package com.spark.myrdd
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.rdd.RDD
6 | import org.apache.spark.rdd.CoGroupedRDD
7 | object TestMain {
8 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
9 | def main(args: Array[String]): Unit = {
10 | var conf = new SparkConf()
11 | .setMaster("local")
12 | .setAppName("SparkStreaming Flume")
13 |
14 | var sc = new SparkContext(conf)
15 | testMySelfRDD(sc)
16 |
17 | }
18 | implicit class CustomFunctions3(rdd:RDD[String]) {
19 | //concatenate the given string with every element of the RDD
20 | def mergeString(data:String) = new MySelfRDD2(rdd,data)
21 | }
22 | /**
23 | * Define a custom RDD and turn an existing RDD into it, using an implicit class
24 | */
25 | def testMySelfRDD(sc:SparkContext){
26 | val preData=sc.parallelize(Array("a","2"))
27 | var result=preData.mergeString("@")
28 | result.take(10).foreach { println }
29 |
30 | }
31 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/myrdd/package.scala:
--------------------------------------------------------------------------------
1 | package com.spark
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.rdd.RDD
5 | import com.fun.util.RDDOperateFunction
6 | import com.fun.util.SparkContextOperateFunction
7 | import com.fun.util.SparkContextOperateFunction
8 | package object myrdd extends RDDOperateFunction
9 | with SparkContextOperateFunction
10 | with ImplicitParameter{
11 | //the extra functions can be inherited from the mixed-in traits, or written directly here
12 | implicit class SparkContextNewFunction(sparkContext: SparkContext) {
13 | def lmq(name: String) = ""
14 | }
15 |
16 | //using implicit parameters
17 | implicit class RDDNewFunction[T](rdd: RDD[T]) {
18 | def lmq3(str: String)(implicit impl:Array[T])=rdd.map { x => x + " : "+impl(0) }
19 | def lmq4[A](str: String)(implicit impl:Array[A])=rdd.map { x => x + " : "+impl(0) }
20 | }
21 |
22 | }
--------------------------------------------------------------------------------
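
The package object above pins extra methods onto RDD through an implicit class, and lmq3 additionally takes an implicit Array[T] that is satisfied by the values mixed in from ImplicitParameter. A hedged usage sketch, assuming the call is made from outside the package (so a wildcard import brings in both the implicit class and the implicit values) and that none of the other mixed-in traits contribute a competing implicit:

    // Sketch: lmq3 resolves its implicit Array[String] to the value `a` ("@") from ImplicitParameter.
    import org.apache.spark.{SparkConf, SparkContext}
    import com.spark.myrdd._

    object ImplicitFunctionUsage {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("ImplicitFunctionUsage"))
        val rdd = sc.parallelize(Seq("x", "y"))
        rdd.lmq3("ignored").foreach(println) // prints "x : @" and "y : @"
        sc.stop()
      }
    }
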
/src/main/scala/com/spark/python/TestPython.scala:
--------------------------------------------------------------------------------
1 | package com.spark.python
2 |
3 | object TestPython {
4 | def main(args: Array[String]): Unit = {
5 | val p=Runtime.getRuntime().exec("./test.py")
6 | p.waitFor()
7 | println(p.exitValue())
8 | }
9 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/scala/ImplicitClass.scala:
--------------------------------------------------------------------------------
1 | package com.spark.scala
2 |
3 | trait ImplicitClass {
4 | implicit def toD(str:String)=str.toDouble
5 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/scala/ReflectScala.scala:
--------------------------------------------------------------------------------
1 | package com.spark.scala
2 |
3 | import java.io.File
4 | import java.net.URLClassLoader
5 | import org.apache.hadoop.fs.FileSystem
6 | import org.apache.hadoop.conf.Configuration
7 | import org.apache.hadoop.fs.Path
8 | import scala.collection.mutable.ArrayBuffer
9 | import java.net.URL
10 | import org.apache.hadoop.fs.FsUrlStreamHandlerFactory
11 |
12 | object ReflectScala {
13 | def main(args: Array[String]): Unit = {
14 | //val c=Class.forName("com.dmp.dataflow.fg.feture.FG1FeatureCalculate")
15 | // c.getMethod("printwoed", classOf[String]).invoke(a.newInstance(), "hello world")
16 | loadHdfsJar
17 |
18 | }
19 | /**
20 | * Dynamically load a jar
21 | */
22 | def loadHdfsJar() {
23 | URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
24 | val fs = FileSystem.get(new Configuration)
25 | val jarPath = "/user/linmingqiang/test-0.0.1-SNAPSHOT.jar"
26 | val url = fs.getFileStatus(new Path(jarPath)).getPath().toUri().toURL()
27 | val d = new URLClassLoader(Array(url), Thread.currentThread().getContextClassLoader())
28 | val a = d.loadClass("test.HelloWord")
29 | //for a static method, passing null to invoke is enough; a non-static method needs an instance, e.g. a.newInstance()
30 | //a.getMethod("printwoed", classOf[String]).invoke(null, "hello world")
31 | a.getMethod("printwoed", classOf[String]).invoke(a.newInstance(), "hello world")
32 | }
33 | def loadLocalJar() {
34 | val url = new File("C:\\Users\\zhiziyun\\Desktop\\test-0.0.1-SNAPSHOT.jar").toURI().toURL()
35 | val d = new URLClassLoader(Array(url), Thread.currentThread().getContextClassLoader())
36 | val a = d.loadClass("test.HelloWord")
37 | //for a static method, passing null to invoke is enough; a non-static method needs an instance
38 | //a.getMethod("test").invoke(a.newInstance())
39 | a.getMethod("printwoed", classOf[String]).invoke(a.newInstance(), "hello world")
40 |
41 | }
42 |
43 | }
--------------------------------------------------------------------------------
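
The comments in ReflectScala point out that invoke takes null as the receiver for a static method but needs an instance otherwise. A tiny self-contained illustration of just that point, using only JDK classes so it can be run without the HDFS jar; it is not taken from the repository:

    // Sketch: static vs. instance invocation through java.lang.reflect.
    object InvokeSketch {
      def main(args: Array[String]): Unit = {
        // static method: the receiver passed to invoke is null
        val parsed = classOf[java.lang.Integer].getMethod("parseInt", classOf[String]).invoke(null, "42")
        println(parsed) // 42
        // instance method: the receiver is an actual object
        val upper = classOf[java.lang.String].getMethod("toUpperCase").invoke("hello world")
        println(upper) // HELLO WORLD
      }
    }
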
/src/main/scala/com/spark/scala/ScalaGramaer.scala:
--------------------------------------------------------------------------------
1 | package com.spark.scala
2 |
3 | import java.util.ArrayList
4 | import scala.collection.mutable.ArrayBuffer
5 | import scala.collection.JavaConversions._
6 | import scala.collection.mutable.Map
7 | import java.util.HashMap
8 | import scala.io.Source
9 | import java.io.File
10 | import scala.collection.Iterator
11 | import sun.org.mozilla.javascript.internal.ast.Yield
12 | import scala.reflect.ClassTag
13 | import java.io.PrintWriter
14 | import scala.tools.cmd.Opt.Implicit
15 | import scala.reflect.internal.util.StringOps
16 | import java.math.BigDecimal
17 | object ScalaGramaer {
18 | var list = new ArrayList[String]
19 | //implicit val aa="a"
20 | implicit def testimplicit(implicit i: String) = {
21 | i.toInt
22 | }
23 | implicit def testimplicit2(i: String) = {
24 | i.toInt
25 | }
26 | def main(args: Array[String]): Unit = {
27 | //listGrammer()
28 | //mapGrammer()
29 | //tupleGrammer()
30 | //IteratorGrammer
31 | //var b=aas(1,"1",(_+_+_+55))
32 | // writeFile
33 | //setGrammer
34 | //mapResultTest
35 | val a=new ArrayBuffer[Int]()
36 | a.++=(Array(1,2,3,4,5))
37 | val b=a.toIterator
38 |
39 | val c=b.map { x => x+10 }
40 |
41 | c.foreach { println }
42 |
43 |
44 | }
45 | def regx(){
46 | val regex="[0-9]".r
47 | println(regex.replaceAllIn("123 admin", "x"))
48 | }
49 | def ffun(){
50 | val a=1.10010011
51 | val b=f"$a%.2f"
52 | println(b)
53 | }
54 | def ziptest(){
55 | val l1 = 1 to 10 toList
56 | val l2 = l1.tail
57 | l2.foreach { println }
58 | println(">>>>")
59 | val l3=l1.zip(l2)
60 | l3.foreach { println }
61 | println(">>>>")
62 | l3.map(p=>((p._2 - p._1),p._2+"-"+p._1)).foreach { println }
63 | }
64 | def implicitTest(){
65 | var a: String = "laal"
66 | var i: Int = a
67 | println(i)
68 | var b:Int="as"
69 |
70 | }
71 | def mapResultTest() {
72 | var a = Set(1, 2, 3, 4)
73 | println(a.+(5))
74 | }
75 | def writeFile() {
76 | var fw = new PrintWriter(new File("test2"))
77 | fw.write(">>>>>>>>>")
78 | fw.close()
79 |
80 | }
81 | def aas[U: ClassTag](key: Int, value: String, a: (Int, String, Int) => U) = {
82 | a(key, value, key)
83 |
84 | }
85 | def IteratorGrammer() {
86 | var a = Array(Array("1", "2"), Array("3", "4"), Array("5", "6")) //without toList the iterator can only be traversed once
87 | var fun1 = (x: Array[String]) => true
88 | var c = a.toIterator.filter { fun1 }
89 | var b = for {
90 | i <- a.toIterator
91 | c <- i
92 | if c > "2"
93 | if c < "6"
94 | } yield c
95 | //b.foreach { println }
96 | //b.foreach { println }
97 | c.foreach { println }
98 | }
99 | def setGrammer() {
100 | var a = Array(1, 2, 3, 4)
101 | var b =
102 | for {
103 | i <- a
104 | } yield { if (i > 2) i + 1 else i }
105 |
106 | for (i <- b)
107 | println(i)
108 | }
109 | /**
110 | * Scala collection operations
111 | * 1. To use Java collections you need to import
112 | * import scala.collection.JavaConversions._
113 | * which converts Java collections to Scala collections implicitly
114 | * 2. Java and Scala collections are not converted explicitly, but implicit conversion works; e.g. SparkContext.parallelize(data)
115 | * expects a Scala collection for data, yet a Java collection can be passed in
116 | */
117 | def fileGrammer() {
118 | // var file=Source.fromFile("D:\\tmp\\input\\smy_biz_dil\\part-m-00000", "utf-8")
119 | //var file=Source.fromURL("http://www.baidu.com", "utf-8")
120 | // file.getLines.foreach { println };
121 | //walk a directory
122 | /*walk(new File("D:\\tmp\\input\\"))
123 | list.foreach { println }*/
124 |
125 | }
126 |
127 | //recursively walk all files under a path
128 | def walk(file: File) {
129 | if (file.isDirectory()) file.listFiles().foreach(walk) else list.add(file.getPath())
130 | }
131 | def readAllfiles(dir: File): Iterator[File] = {
132 | //scan a dir and return all its files
133 | var child = dir.listFiles().filter { _.isDirectory() }
134 | child.toIterator ++ child.toIterator.flatMap { readAllfiles _ }
135 | }
136 | def listGrammer() {
137 | //iterate over a collection, with or without an index
138 | var list = new ArrayList[String](); list.add("s")
139 | for (value <- list) println(value)
140 | for (i <- 0.until(list.length)) println(list(i))
141 | for (i <- 0 until list.length) println(list(i))
142 |
143 | }
144 | def mapGrammer() {
145 | //scala.collection.mutable.Map is mutable
146 | var map = Map("a" -> 1, "b" -> 2)
147 | println(map("a"))
148 | //get returns an Option
149 | println(map.get("b"))
150 | println(map.get("c"))
151 | //change the value of a key
152 | map("a") = 6
153 | println(map("a"))
154 | //add a new entry
155 | map += "c" -> 3
156 | println(map("c"))
157 | //remove an entry
158 | map -= "c"
159 | println(map.getOrElse("c", "no such key"))
160 | //if the key exists, return its value
161 | println(map.getOrElse("null", "no such key"))
162 |
163 | //iterate over a map
164 | println("iterate over a map")
165 | for ((k, value) <- map) {
166 | println(k + ":" + value)
167 | }
168 | println("iterate over the keys of a map")
169 | for (k <- map.keySet) {
170 | println(k)
171 | }
172 |
173 | }
174 | def tupleGrammer() {
175 | //tuples can have any arity
176 | var tuple1 = (1)
177 | var tuple2 = ("1", 2)
178 | var tuple3 = ("1", 2, "3")
179 | var tuple4 = ("1", 2, "3", 4)
180 | println(tuple3._3)
181 |
182 | }
183 |
184 | /**
185 | * @author Administrator
186 | */
187 | class Person(n: String) {
188 | //fields must be initialized
189 | var name = n;
190 | var age = 0;
191 | var address = "";
192 | //this is an auxiliary constructor; a Scala auxiliary constructor must start by calling another constructor, otherwise it is a compile error
193 | def this(name: String, age: Int) {
194 | this(name)
195 | this.age = age
196 | }
197 | def this(name: String, age: Int, address: String) {
198 | this(name, age)
199 | this.address = address
200 | }
201 | }
202 |
203 | }
204 |
205 |
--------------------------------------------------------------------------------
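
The collection-operations comment in ScalaGramaer notes that `import scala.collection.JavaConversions._` lets a Java collection stand in where a Scala one is expected. A short sketch of that point plus the explicit JavaConverters form, written outside the repository and using only standard-library calls:

    // Sketch: implicit (JavaConversions) and explicit (JavaConverters.asScala) Java-to-Scala conversion.
    import java.util.ArrayList
    import scala.collection.JavaConversions._
    import scala.collection.JavaConverters._

    object JavaScalaCollections {
      def main(args: Array[String]): Unit = {
        val javaList = new ArrayList[String]()
        javaList.add("a"); javaList.add("b")
        // implicit conversion: the Java list is iterated with a Scala for-comprehension
        for (v <- javaList) println(v)
        // explicit conversion: asScala gives a scala.collection Buffer
        val scalaBuffer = javaList.asScala
        println(scalaBuffer.mkString(","))
      }
    }
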
/src/main/scala/com/spark/scalatest/ScalaTest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.scalatest
2 |
3 | import org.scalatest.FlatSpec
4 | import org.scalatest.Matchers
5 | import scala.collection.mutable.Stack
6 |
7 | class ScalaTest extends FlatSpec with Matchers{
8 | "a" should "b" in{
9 | val stack = new Stack[Int]
10 | stack.push(1)
11 | stack.push(2)
12 | stack.pop() should be (2)
13 | stack.pop() should be (1)
14 | }
15 | it should "throw NoSuchElementException if an empty stack is popped" in {
16 | val emptyStack = new Stack[Int]
17 | a [NoSuchElementException] should be thrownBy {
18 | emptyStack.pop()
19 | }
20 | }
21 | }
22 |
23 |
--------------------------------------------------------------------------------
/src/main/scala/com/spark/sparkSql/CaseClassUtil.scala:
--------------------------------------------------------------------------------
1 | package com.spark.sparkSql
2 |
3 | object CaseClassUtil extends Serializable{
4 |
5 | case class User(name:String,age:Int,phone:String)
6 |
7 | case class Address(name:String,address:String,phone:String)
8 | case class Detail(name:String,phone:String)
9 |
10 | case class Table1(name:String,age:Int,address:String)
11 | case class Table2(name:String,age:Int)
12 |
13 | case class HiveTempTable(id:Int,name:String)
14 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/sparkSql/JavaUseScalaClass.scala:
--------------------------------------------------------------------------------
1 | package com.spark.sparkSql
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.SQLContext
5 | import org.apache.spark.sql.types.StructField
6 | import org.apache.spark.sql.types.StructType
7 | import org.apache.spark.sql.types.StringType
8 | import org.apache.spark.sql.Row
9 | import scala.collection.mutable.ArrayBuffer
10 | import java.util.HashMap
11 | import scala.collection.JavaConversions._
12 | import scala.collection.JavaConverters._
13 | import java.util.ArrayList
14 | import java.util.Map
15 | import java.util.List
16 | import com.spark.sparkSql.CaseClassUtil._
17 | import org.apache.spark.api.java.JavaRDD
18 |
19 | //in sc.parallelize(data), data must be a Scala collection; passing a Java collection (ArrayList) directly would not compile,
20 | //with import scala.collection.JavaConversions._ it compiles, because the conversion happens implicitly
21 | class JavaUseScalaClass(sc:SparkContext,sqlContext:SQLContext) {
22 | def userRDDToDataFrame(data:ArrayList[HashMap[String,String]],tableName:String){
23 | var liens=sc.parallelize(data).map(t=>User(name=t.get("name"),age=t.get("age").toInt,phone=t.get("phone")))
24 | sqlContext.createDataFrame(liens).registerTempTable(tableName)
25 | }
26 |
27 | def addressRDDToFrame(data:ArrayList[HashMap[String,String]],tableName:String){
28 | var liens=sc.parallelize(data).map(t=>Address(name=t.get("name"),t.get("address"),phone=t.get("phone")))
29 | sqlContext.createDataFrame(liens).registerTempTable(tableName)
30 | }
31 | //second approach: specify the schema explicitly, which requires Row
32 | def secondRDDToFrame(data:ArrayList[HashMap[String,String]]){
33 | var liens=sc.parallelize(data).map(p=>Row(p.get("name"),p.get("phone")))
34 | var schemaString = "name phone"
35 | val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
36 | sqlContext.createDataFrame(liens, schema).registerTempTable("Detail")
37 | }
38 | def show(sql:String):List[Row]={
39 | var data=sqlContext.sql(sql)
40 |
41 | data.show()
42 | return data.collectAsList()
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
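
JavaUseScalaClass is meant to be driven with java.util collections (ArrayList of HashMap). A hedged usage sketch, written in Scala for consistency with the rest of this repository, that builds the expected input shape and registers and queries a temp table; the object name and the sample values are illustrative only.

    // Sketch: feed JavaUseScalaClass the ArrayList[HashMap[String,String]] it expects.
    import java.util.{ArrayList, HashMap}
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext
    import com.spark.sparkSql.JavaUseScalaClass

    object JavaUseScalaClassUsage {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("JavaUseScalaClassUsage"))
        val sqlContext = new SQLContext(sc)
        val rows = new ArrayList[HashMap[String, String]]()
        val row = new HashMap[String, String]()
        row.put("name", "lmq"); row.put("age", "12"); row.put("phone", "10312123")
        rows.add(row)
        val helper = new JavaUseScalaClass(sc, sqlContext)
        helper.userRDDToDataFrame(rows, "User")   // register the rows as temp table "User"
        helper.show("select * from User")         // query and print it
        sc.stop()
      }
    }
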
/src/main/scala/com/spark/sparkSql/SparkListToDataFrame.scala:
--------------------------------------------------------------------------------
1 | package com.spark.sparkSql
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.sql.SQLContext
6 | import org.apache.spark.sql.types.StructField
7 | import org.apache.spark.sql.types.StructType
8 | import org.apache.spark.sql.types.StringType
9 | import org.apache.spark.sql.Row
10 | import scala.collection.mutable.ArrayBuffer
11 | import java.util.HashMap
12 | import scala.collection.JavaConversions._
13 | import java.util.ArrayList
14 | import java.util.Map
15 | import java.util.List
16 | import com.spark.sparkSql.CaseClassUtil._
17 |
18 | object SparkListToDataFrame {
19 | var conf = new SparkConf() .setMaster("local").setAppName("Spark Pi")
20 | var sc = new SparkContext(conf)
21 | var sqlContext=new SQLContext(sc)
22 | System.setProperty("hadoop.home.dir", "E:\\eclipse\\hdplocal2.6.0")
23 | //first approach: use reflection (case classes)
24 | def main(args: Array[String]): Unit = {
25 | //secondRDDToFrame()
26 | //show("select * from Detail")
27 | AddressRDDToFrame
28 | }
29 | def UserRDDToDataFrame(data:ArrayList[HashMap[String,String]],tableName:String){
30 | //this is the Java-style way of doing it
31 | var liens=sc.parallelize(data).map(t=>User(name=t.get("name"),age=t.get("age").toInt,phone=t.get("phone")))
32 | var userData=sqlContext.createDataFrame(liens,User.getClass)
33 | userData.registerTempTable(tableName)
34 | }
35 | def AddressRDDToFrame(){
36 | var arraybuffer=ArrayBuffer[HashMap[String,String]]()
37 | var map=new HashMap[String,String]()
38 | map.put("name", "lmq")
39 | map.put("address", "莆田")
40 | arraybuffer+=map
41 | var liens=sc.parallelize(arraybuffer).map(t=>Address(name=t.get("address"),t.get("address"),phone=t.get("address")))
42 | var addressData=sqlContext.createDataFrame(liens)
43 | addressData.registerTempTable("Address")
44 | show("select * from Address")
45 |
46 | var liens2=sc.parallelize(arraybuffer).map(t=>Address(name=t.get("name"),t.get("name"),phone=t.get("name")))
47 | var addressData2=sqlContext.createDataFrame(liens2)
48 | addressData2.registerTempTable("Address")
49 |
50 | show("select * from Address")
51 |
52 | }
53 | //second approach: specify the schema explicitly, which requires Row
54 | def secondRDDToFrame(){
55 | var arraybuffer=ArrayBuffer[HashMap[String,String]]()
56 | var map=new HashMap[String,String]()
57 | map.put("name", "lmq")
58 | map.put("age", "12")
59 | map.put("phone", "10312123")
60 | arraybuffer+=map
61 | var liens=sc.parallelize(arraybuffer)
62 | .map(p=>Row(p.get("name"),p.get("phone"),p.get("age")))
63 | var schemaString = Array("name","phone","age")
64 | var a=StructField("", StringType, true)
65 | var columns=schemaString.map(fieldName => StructField(fieldName, StringType, true))
66 | val schema = StructType(columns)
67 | var schemaData=sqlContext.createDataFrame(liens, schema)
68 | schemaData.registerTempTable("Detail")
69 |
70 |
71 | }
72 | def show(sql:String){
73 | sqlContext.sql(sql).show()
74 | }
75 | }
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/src/main/scala/com/spark/sparkSql/SparkSQLDemo.scala:
--------------------------------------------------------------------------------
1 | package com.spark.sparkSql
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.SQLContext
5 | import org.apache.spark.sql._
6 | import scala._
7 | import scala.util.parsing.json.JSON
8 | import scala.collection.mutable.ArrayBuffer
9 | import java.util.ArrayList
10 | import scala.collection.mutable.HashMap
11 | import org.apache.hadoop.hbase.client.Put
12 | object SparkSQLDemo {
13 | def main(args: Array[String]): Unit = {
14 | var conf = new SparkConf()
15 | .setMaster("local")
16 | .setAppName("Spark Pi")
17 | System.setProperty("hadoop.home.dir", "E:\\eclipse\\hdplocal2.6.0")
18 |
19 | var sc = new SparkContext(conf)
20 | var sql=new SQLContext(sc)
21 | testDataFram(sc,sql)
22 |
23 | }
24 | def testDataFram(sc:SparkContext,sql:SQLContext){
25 | val data=sc.textFile("F:\\data\\smartadsclicklog")
26 | val fram=data.map { x => {x.split(",")}}.map { x =>Smartadsclicklog(
27 | clicktime=x(0),zzid=x(1),siteid=x(2),uid=x(3),
28 | ip=x(4),originurl=x(5),pageurl=x(6),campaign=x(7),
29 | template=x(8),pubdomain=x(9),visitor=x(10),useragent=x(11),
30 | slot=x(12),unit=x(13),creative=x(14),ext=x(15),
31 | bidid=x(16)) }
32 | println(fram.count())
33 | fram.foreach { println }
34 | val df=sql.createDataFrame(fram)
35 | df.rdd.foreach(println)
36 | //df.registerTempTable("Smartadsclicklog")
37 | //sql.sql("select * from Smartadsclicklog").show()
38 | println(">>>>>>>>>>>>>>>>>..")
39 | }
40 | /*def transStrToPut(row:Row,cols:Array[String])={
41 | val r=cols.zip(row.toSeq)
42 | r.map{case(colname,value)=>
43 | val put=new Put()
44 |
45 | }
46 | val put = new Put(cells(0).getBytes);
47 | put.addColumn(cells(0).getBytes, cells(0).getBytes, cells(0).getBytes)
48 | put
49 | }*/
50 | case class Smartadsclicklog(clicktime:String,zzid:String,siteid:String,uid:String,
51 | ip:String,originurl:String,pageurl:String,campaign:String,
52 | template:String,pubdomain:String,visitor:String,useragent:String,
53 | slot:String,unit:String,creative:String,ext:String,bidid:String)
54 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/streaming/DataProducter.scala:
--------------------------------------------------------------------------------
1 | package com.spark.streaming
2 |
3 | object DataProducter {
4 | def main(args: Array[String]): Unit = {
5 | val conn= getConnection()
6 | var id=1;
7 | var sql="insert into test(id,name) values"
8 | while(true){
9 | val values=(id to id+2).map{x=>
10 | "("+x+",'"+x+"')"
11 | }.mkString(",")
12 | val nsql=sql+values
13 | println(nsql)
14 | id+=3
15 | Thread.sleep(8000)
16 | val statement = conn.prepareStatement(nsql);
17 | statement.executeUpdate();
18 | }
19 |
20 |
21 | }
22 |
23 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/streaming/DirectMysqlInputDStream.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.streaming.mysql
2 |
3 | import scala.reflect.ClassTag
4 | import org.apache.spark.streaming.StreamingContext
5 | import java.sql.ResultSet
6 | import org.apache.spark.streaming.dstream.InputDStream
7 | import java.sql.Connection
8 | import org.apache.spark.Logging
9 | import org.apache.spark.streaming.Time
10 | import org.apache.spark.streaming.scheduler.RateController
11 | class DirectMysqlInputDStream[T:ClassTag](
12 | @transient ssc_ : StreamingContext,
13 | getConnection: () => Connection,
14 | tablename: String,
15 | idcloumn:String,
16 | fromTime: Long,
17 | sql:String,
18 | numPartitions: Int,
19 | mapRow: (ResultSet) => T) extends InputDStream[T](ssc_) with Logging {
20 | //limit on the number of rows fetched per partition
21 | val maxRows:Long = context.sparkContext.getConf.getInt("spark.streaming.mysql.maxRetries", 1) * numPartitions * context.graph.batchDuration.milliseconds.toLong /1000
22 | var currentOffsets=fromTime
23 | val mysqlConn=getConnection()
24 | // println(ssc_.conf)
25 | override def start(): Unit = {}
26 | override def stop(): Unit = {}
27 | // limits the maximum number of messages per partition
28 | protected def clamp(currentOffsets: Long): Long = {
29 | //get the current max id
30 | val clampSql="select max("+idcloumn+") from "+tablename+" where "+
31 | idcloumn+" >="+currentOffsets
32 | val stmt = mysqlConn.prepareStatement(clampSql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)
33 | val rs = stmt.executeQuery()
34 | //如果没有新数据就nextIdmaxRows) maxRows+currentOffsets else nextId
37 |
38 | }
39 |
40 | override def compute(validTime: Time): Option[JdbcSparkStreamRDD[T]] = {
41 | val nextId=clamp(currentOffsets)
42 | //如果没有新数据就nextId Connection,
17 | lowerBound: Long,
18 | upperBound: Long,
19 | rowkeyName:String,
20 | sql:String,
21 | numPartitions: Int,
22 | mapRow: (ResultSet) => T = JdbcSparkStreamRDD.resultSetToObjectArray _)
23 | extends RDD[T](sc, Nil) with Logging{
24 | //the range of rows each partition is responsible for
25 | override def getPartitions: Array[Partition] = {
26 | val length = 1 + upperBound - lowerBound
27 | (0 until numPartitions).map(i => {
28 | val start = lowerBound + ((i * length) / numPartitions).toLong
29 | val end = lowerBound + (((i + 1) * length) / numPartitions).toLong - 1
30 |
31 | new JdbcSparkStreamPartition(i, start, end)
32 |
33 | }).toArray
34 | }
35 | override def count()=getRowsNum(sql)
36 | def getRowsNum(sql:String)={
37 | var rowsNum=0
38 | var tmpConn=getConnection()
39 | try{
40 | if(sql.toLowerCase.indexOf("from")<0){
41 | logError(" sql is error , There must be the from keyword ")
42 | }else{
43 | val nsql="select count(1) "+sql.substring(sql.toLowerCase.indexOf("from"), sql.size)
44 | val stmt = tmpConn.prepareStatement(nsql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)
45 |
46 | val rs = stmt.executeQuery()
47 | if(rs.next()){
48 | rowsNum=rs.getInt(1)
49 | }
50 | stmt.close()
51 | }
52 | }catch {
53 | case t: Throwable => t.printStackTrace() // TODO: handle error
54 | }finally {
55 | tmpConn.close()
56 | tmpConn=null
57 | }
58 | rowsNum
59 | }
60 | //how each partition fetches its data
61 | override def compute(thePart: Partition, context: TaskContext) = {
62 | val part = thePart.asInstanceOf[JdbcSparkStreamPartition]
63 | //if there is no data in this range, return an empty iterator
64 | if(part.lower>part.upper){
65 | Iterator.empty
66 | }
67 | else
68 | new JdbcIterator[T] {
69 | context.addTaskCompletionListener{ context => closeIfNeeded() }
70 | val conn = getConnection()
71 | var parttionSql=if(sql.toLowerCase.contains("where")) sql+" and "+rowkeyName+" >= ? AND "+rowkeyName+" <= ?"
72 | else sql+" where "+rowkeyName+" >= ? AND "+rowkeyName+" <= ?"
73 | val stmt = conn.prepareStatement(parttionSql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY)
74 | if (conn.getMetaData.getURL.matches("jdbc:mysql:.*")) {
75 | stmt.setFetchSize(Integer.MIN_VALUE)
76 | logInfo("statement fetch size set to: " + stmt.getFetchSize + " to force MySQL streaming ")
77 | }
78 | stmt.setLong(1, part.lower)
79 | stmt.setLong(2, part.upper)
80 |
81 | val rs = stmt.executeQuery()
82 | override def getNext: T = {
83 | if (rs.next()) {
84 | mapRow(rs)
85 | } else {
86 | finished = true
87 | null.asInstanceOf[T]
88 | }
89 | }
90 |
91 | override def close() {
92 | try {
93 | if (null != rs) {
94 | rs.close()
95 | }
96 | } catch {
97 | case e: Exception => logWarning("Exception closing resultset", e)
98 | }
99 | try {
100 | if (null != stmt) {
101 | stmt.close()
102 | }
103 | } catch {
104 | case e: Exception => logWarning("Exception closing statement", e)
105 | }
106 | try {
107 | if (null != conn) {
108 | conn.close()
109 | }
110 | logInfo("closed connection")
111 | } catch {
112 | case e: Exception => logWarning("Exception closing connection", e)
113 | }
114 | }
115 | }
116 | }
117 |
118 |
119 |
120 | }
121 | object JdbcSparkStreamRDD{
122 | def resultSetToObjectArray(rs: ResultSet): Array[Object] = {
123 | Array.tabulate[Object](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1))
124 | }
125 | }
126 | abstract class JdbcIterator[U] extends Iterator[U] {
127 | private var gotNext = false
128 | private var nextValue: U = _
129 | private var closed = false
130 | protected var finished = false
131 | protected def getNext(): U
132 | protected def close()
133 | def closeIfNeeded() {
134 | if (!closed) {
135 | closed = true
136 | close()
137 | }
138 | }
139 | override def hasNext: Boolean = {
140 | if (!finished) {
141 | if (!gotNext) {
142 | nextValue = getNext()
143 | if (finished) {
144 | closeIfNeeded()
145 | }
146 | gotNext = true
147 | }
148 | }
149 | !finished
150 | }
151 | override def next(): U = {
152 | if (!hasNext) {
153 | throw new NoSuchElementException("End of stream")
154 | }
155 | gotNext = false
156 | nextValue
157 | }
158 | }
159 |
--------------------------------------------------------------------------------
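
The getPartitions method above splits the inclusive [lowerBound, upperBound] id range evenly across numPartitions. The standalone sketch below repeats that arithmetic outside Spark so the boundaries are easy to check by hand; for lowerBound = 1, upperBound = 10 and 3 partitions it yields the inclusive ranges (1,3), (4,6), (7,10), which cover every id exactly once.

    // Sketch: the same integer-division boundary math as JdbcSparkStreamRDD.getPartitions.
    object PartitionRangeSketch {
      def ranges(lowerBound: Long, upperBound: Long, numPartitions: Int): Seq[(Long, Long)] = {
        val length = 1 + upperBound - lowerBound
        (0 until numPartitions).map { i =>
          val start = lowerBound + (i * length) / numPartitions
          val end = lowerBound + ((i + 1) * length) / numPartitions - 1
          (start, end)
        }
      }
      def main(args: Array[String]): Unit =
        ranges(1, 10, 3).foreach(println) // (1,3) (4,6) (7,10)
    }
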
/src/main/scala/com/spark/streaming/MapWithStateTest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.streaming
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.streaming.StreamingContext
6 | import org.apache.spark.streaming.Seconds
7 | import org.apache.spark.streaming.kafka.KafkaClusterManager
8 | import org.apache.spark.HashPartitioner
9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
10 | import org.apache.spark.streaming.StateSpec
11 | import org.apache.spark.streaming.State
12 | import org.apache.spark.streaming.Minutes
13 | import org.apache.spark.streaming.dstream.DStream
14 | import org.apache.spark.streaming.dstream.SocketInputDStream
15 |
16 | object MapWithStateTest {
17 | var sc: SparkContext = null
18 | var zookeeper: String = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3"
19 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
20 | def main(args: Array[String]): Unit = {
21 | init
22 | val ssc = new StreamingContext(sc, Seconds(5))
23 | val initialRDD = ssc.sparkContext.parallelize(List(("a", 100), ("b", 10)))
24 | ssc.checkpoint("/user/linmingqiang/checkpoint")
25 | val topics = Set("test")
26 | var kafkaParams = Map[String, String]("metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092",
27 | "serializer.class" -> "kafka.serializer.StringEncoder", "zookeeper.connect" -> zookeeper)
28 |
29 | val dstream = KafkaClusterManager.createDirectStream(ssc, kafkaParams, topics).map { _._2 }
30 |
31 | val rpt1 = dstream.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_+_)
32 |
33 | rpt1.mapWithState(StateSpec.function(mappingFunc).timeout(Seconds(1))).print
34 | ssc.start()
35 | ssc.awaitTermination()
36 |
37 | }
38 |
39 | val mappingFunc = (word: String, count: Option[Int], state: State[Int]) => {
40 | val sum = count.getOrElse(0) + state.getOption.getOrElse(0)
41 | val output = (word, sum)
42 | state.update(sum)
43 | output
44 | }
45 |
46 | def init {
47 | val sparkConf = new SparkConf()
48 | .setMaster("local")
49 | .setAppName("UpdateStateByKeyTest")
50 | sc = new SparkContext(sparkConf)
51 | }
52 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/streaming/MysqlManager.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.streaming.mysql
2 |
3 | import org.apache.spark.streaming.StreamingContext
4 | import java.sql.Connection
5 | import java.sql.ResultSet
6 | import scala.reflect.ClassTag
7 |
8 | object MysqlManager {
9 | def creatMysqlInputStream[T:ClassTag](
10 | @transient ssc_ : StreamingContext,
11 | getConnection: () => Connection,
12 | tablename: String,
13 | idcloumn:String,
14 | lowerBound: Long,
15 | sql:String,
16 | numPartitions: Int,
17 | mapRow: (ResultSet) => T)={
18 | new DirectMysqlInputDStream(ssc_,getConnection,tablename,idcloumn,lowerBound,sql,numPartitions,mapRow)
19 | }
20 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/streaming/SpartStreamingTest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.streaming
2 | import org.apache.spark.SparkContext
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.Seconds
5 | import org.apache.spark.streaming.StreamingContext
6 | import java.sql.DriverManager
7 | import java.sql.ResultSet
8 | import org.apache.spark.streaming.mysql.MysqlManager
9 | import org.apache.spark.rdd.JdbcRDD
10 | import org.apache.spark.sql.SQLContext
11 | import org.apache.spark.streaming.kafka.KafkaClusterManager
12 | import org.apache.spark.streaming.Time
13 | import org.apache.spark.rdd.RDD
14 |
15 | object SpartStreamingTest {
16 | var sc: SparkContext = null
17 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
18 | import org.apache.log4j.{Level,Logger}
19 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
20 | val zookeeper="solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3"
21 | def init() {
22 | val sparkConf = new SparkConf()
23 | .setMaster("local[2]")
24 | .setAppName("Test")
25 | .set("spark.streaming.mysql.maxRetries", "1")
26 | sc = new SparkContext(sparkConf)
27 |
28 |
29 | }
30 | /**
31 | * Usage notes (a timestamp column is recommended as the split point for the data)
32 | * The data is written in time order, so there must be a time column, it must be a timestamp, and every row's timestamp must be distinct
33 | * If rows arrive at the same instant, each row's timestamp must keep being incremented (e.g. from 149000000 upwards) before being written to MySQL
34 | * Otherwise you need an id column (also a Long, which makes this close to the built-in JdbcRDD) that is strictly increasing and never goes backwards
35 | *
36 | *
37 | */
38 | def main(args: Array[String]): Unit = {
39 | localSparkStream
40 |
41 |
42 | }
43 | def localSparkStream(){
44 | init()
45 |
46 | val ssc = new StreamingContext(sc, Seconds(2))
47 | var topics = Set("mobileadsdeliverylog","smartadsdeliverylog","smartadsclicklog", "mobileadsclicklog", "sitevisitlog")
48 | var kafkaParams = Map[String, String]("metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092",
49 | "serializer.class" -> "kafka.serializer.StringEncoder", "group.id" -> "test", "zookeeper.connect" -> zookeeper)
50 | val dstream= KafkaClusterManager.createDirectStream(ssc, kafkaParams, topics)
51 | dstream.foreachRDD(rdd=>
52 |
53 | println(rdd.partitions.size)
54 |
55 |
56 | )
57 | ssc.start()
58 | ssc.awaitTermination()
59 | }
60 | def mySparkInputstream{
61 | init
62 | //the range predicate must be inclusive on both ends, ID >= ? AND ID <= ?, otherwise data is lost
63 | var sql="SELECT id,name FROM test"
64 | val tablename="test"
65 | val timeClounm="id"//which column is the key; for streaming it should normally be a time column
66 | val fromTime=1//start from some point in time
67 | val partitionNum=2//number of partitions
68 | val ssc = new StreamingContext(sc, Seconds(2))
69 |
70 | var count=0
71 | var r=ssc.createDirectMysqlDStream(getConnection, tablename, timeClounm,
72 | fromTime,sql, partitionNum, sscextractValues)
73 |
74 | r.foreachRDD{x=>println("sssssss");Thread.sleep(2000);println("kkkkkkk");}
75 | r.foreachRDD{rdd=>
76 | count+=1
77 | println(count)
78 | rdd.foreach(println)
79 |
80 |
81 | if(count<2){
82 | Thread.sleep(8000)
83 | }
84 | }
85 |
86 | //r.printlnDStream("")
87 | //two streams fetching data at the same time
88 | /*sql="SELECT id,name FROM test where id>10"
89 | var r2=ssc.createDirectMysqlDStream(getConnection, tablename, rowkeyName,
90 | fromId,sql, partitionNum, sscextractValues)*/
91 | /*r2.printlnDStream("r2 :")*/
92 | ssc.start()
93 | ssc.awaitTermination()
94 | }
95 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/streaming/UpdateStateByKeyTest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.streaming
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.streaming.StreamingContext
6 | import org.apache.spark.streaming.Seconds
7 | import org.apache.spark.streaming.kafka.KafkaClusterManager
8 | import org.apache.spark.HashPartitioner
9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
10 | import org.apache.spark.streaming.StateSpec
11 | import org.apache.spark.streaming.State
12 |
13 | object UpdateStateByKeyTest {
14 | var sc: SparkContext = null
15 | var zookeeper: String = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3"
16 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
17 | def main(args: Array[String]): Unit = {
18 | init
19 | val ssc=new StreamingContext(sc,Seconds(5))
20 | val initialRDD = ssc.sparkContext.parallelize(List(("a", 100), ("b", 10)))
21 | ssc.checkpoint("/user/linmingqiang/checkpoint")
22 |
23 |
24 | val topics = Set("test")
25 | var kafkaParams = Map[String, String]("metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092",
26 | "serializer.class" -> "kafka.serializer.StringEncoder","zookeeper.connect" -> zookeeper)
27 |
28 | val dstream= KafkaClusterManager.createDirectStream(ssc, kafkaParams, topics).map{_._2}
29 | println(">>>>>>>>>>>>> start "+dstream.count)
30 | val rpt1=dstream.flatMap(_.split(" ")).map(x => (x, 1))
31 | //val rpt2=dstream.flatMap(_.split(" ")).map(x => (x+","+x, 1))
32 |
33 | val rpt1_dst = rpt1.updateStateByKey[Int](updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), initialRDD)
34 | rpt1_dst.print()
35 | /*val rpt2_dst = rpt2.updateStateByKey[Int](updateFunc)
36 | rpt1_dst.foreachRDD{rdd=>
37 | rdd.collect().foreach(println)
38 | }
39 | rpt2_dst.foreachRDD{rdd=>
40 | rdd.collect().foreach(println)
41 | }*/
42 |
43 | ssc.start()
44 | ssc.awaitTermination()
45 |
46 | }
47 |
48 | val updateFunc = (values: Seq[Int], state: Option[Int]) => {
49 | val currentCount = values.sum
50 | val previousCount = state.getOrElse(0)
51 | Some(currentCount + previousCount)
52 | }
53 |
54 | def init {
55 | val sparkConf = new SparkConf()
56 | .setMaster("local")
57 | .setAppName("UpdateStateByKeyTest")
58 | sc = new SparkContext(sparkConf)
59 | }
60 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/streaming/package.scala:
--------------------------------------------------------------------------------
1 | package com.spark
2 |
3 | import com.fun.util._
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.rdd.RDD
6 | package object streaming extends RDDOperateFunction
7 | with SparkContextOperateFunction
8 | with ZzyLmqDataOperateUtil{
9 |
10 | /* //using implicit parameters
11 | implicit class RDDNewFunction[T](rdd: RDD[T]) {
12 | def lmq3(str: String)(implicit impl:Array[T])=rdd.map { x => x + " : "+impl(0) }
13 | def lmq4[A](str: String)(implicit impl:Int)=rdd.map { x => x + " : "}
14 | }*/
15 |
16 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/util/KafkaClusterManager.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.streaming.kafka
2 |
3 | import java.io.Serializable
4 | import scala.reflect.ClassTag
5 | import kafka.serializer.Decoder
6 | import org.apache.spark.streaming.StreamingContext
7 | import org.apache.spark.streaming.dstream.InputDStream
8 | import org.apache.spark.SparkException
9 | import kafka.message.MessageAndMetadata
10 | import kafka.common.TopicAndPartition
11 | import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
12 | import org.apache.spark.rdd.RDD
13 | import kafka.serializer.StringDecoder
14 | import kafka.common.TopicAndPartition
15 | import org.apache.commons.logging.LogFactory
16 | import org.slf4j.LoggerFactory
17 | import org.apache.hadoop.conf.Configuration
18 | import scala.collection.mutable.HashMap
19 | import org.apache.spark.SparkContext
20 |
21 | object KafkaClusterManager {
22 | var topics: Set[String] = null
23 | var kafkaParams: Map[String, String] = null
24 | var kc: KafkaCluster = null
25 | var groupId: String = "Test"
26 | def getKafkafromOffsets(topics: Set[String], kafkaParams: Map[String, String]) = {
27 | val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
28 | //two possible values, "largest"/"smallest": read from the latest offsets or from the very beginning
29 | var fromOffsets = (for {
30 | topicPartitions <- kc.getPartitions(topics).right
31 | leaderOffsets <- (if (reset == Some("smallest")) {
32 | kc.getEarliestLeaderOffsets(topicPartitions)
33 | } else {
34 | kc.getLatestLeaderOffsets(topicPartitions)
35 | }).right
36 | } yield {
37 | val fromOffsets = leaderOffsets.map {
38 | case (tp, lo) =>
39 | (tp, lo.offset)
40 | }
41 | fromOffsets
42 | }).fold(
43 | errs => throw new SparkException(errs.mkString("\n")),
44 | ok => ok)
45 | fromOffsets
46 | }
47 | def getConsumerOffsetsByToday(conf: Configuration) = {
48 | var consumerOffsets = new HashMap[TopicAndPartition, Long]()
49 | var todayOffsets = conf.get("zzy.kafka.todayoffset").split('|')
50 | for (offset <- todayOffsets) {
51 | val offsets = offset.split(",")
52 | consumerOffsets.put(new TopicAndPartition(offsets(0), offsets(1).toInt), offsets(2).toLong)
53 | }
54 | consumerOffsets.toMap
55 | }
56 | def createDirectStream(ssc: StreamingContext,
57 | kafkaParams: Map[String, String],
58 | topics: Set[String]) = { //first get the offsets this group id has already consumed
59 | this.kafkaParams = kafkaParams
60 | this.topics = topics
61 | kc = new KafkaCluster(kafkaParams)
62 | var consumerOffsets: Map[TopicAndPartition, Long] = getKafkafromOffsets(topics, kafkaParams)
63 | KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
64 | ssc,
65 | kafkaParams,
66 | consumerOffsets,
67 | (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message))
68 |
69 | }
70 | /**
71 | * Create a KafkaRDD from a SparkContext
72 | */
73 | def createKafkaRDD(
74 | sc: SparkContext,
75 | kafkaParams: Map[String, String],
76 | topics: Set[String]) = {
77 | this.kafkaParams = kafkaParams
78 | this.topics = topics
79 | kc = new KafkaCluster(kafkaParams)
80 | var fromOffsets: Map[TopicAndPartition, Long] = getConsumerOffsets(topics, kafkaParams.get("group.id").getOrElse("realtimereport"))
81 | println(">>>>>>>>>>>>>>>from ")
82 | fromOffsets.foreach(println)
83 |
84 | val maxMessagesPerPartition = sc.getConf.getInt("spark.streaming.kafka.maxRatePerPartition", 0) //0 means no limit
85 | val lastestOffsets = latestLeaderOffsets(fromOffsets)
86 | val untilOffsets = if (maxMessagesPerPartition > 0) {
87 | latestLeaderOffsets(fromOffsets).map {
88 | case (tp, lo) =>
89 | tp -> lo.copy(offset = Math.min(fromOffsets(tp) + maxMessagesPerPartition, lo.offset))
90 | }
91 | } else lastestOffsets
92 | val leaders = untilOffsets.map { case (tp, lo) => tp -> Broker(lo.host, lo.port) }.toMap
93 | val offsetRanges = fromOffsets.map {
94 | case (tp, fo) =>
95 | val uo = untilOffsets(tp)
96 | OffsetRange(tp.topic, tp.partition, fo, uo.offset)
97 | }.toArray
98 | println(">>>>>>>>>>>>>>>offsetRanges ")
99 | offsetRanges.foreach(println)
100 |
101 | KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder, (String, String)](
102 | sc,
103 | kafkaParams,
104 | offsetRanges,
105 | leaders,
106 | (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message))
107 | }
108 | protected final def latestLeaderOffsets(consumerOffsets: Map[TopicAndPartition, Long]): Map[TopicAndPartition, LeaderOffset] = {
109 | val o = kc.getLatestLeaderOffsets(consumerOffsets.keySet)
110 | if (o.isLeft) {
111 | throw new SparkException(o.left.toString)
112 | } else {
113 | o.right.get
114 | }
115 | }
116 |
117 | /**
118 | * Before creating the stream, adjust the consumer offsets according to what has actually been consumed
119 | * @param topics
120 | * @param groupId
121 | */
122 | private def getConsumerOffsets(topics: Set[String], groupId: String) = {
123 | var offsets: Map[TopicAndPartition, Long] = Map()
124 | topics.foreach(topic => {
125 | var hasConsumed = true //whether this group has consumed before; true means it has
126 | val partitionsE = kc.getPartitions(Set(topic)) //get the partition info
127 | if (partitionsE.isLeft) throw new SparkException("get kafka partition failed:")
128 | val partitions = partitionsE.right.get
129 | val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions) //get the consumed offsets of each partition of this topic
130 | if (consumerOffsetsE.isLeft) hasConsumed = false
131 | if (hasConsumed) {
132 | val earliestLeaderOffsets = kc.getEarliestLeaderOffsets(partitions).right.get
133 | val consumerOffsets = consumerOffsetsE.right.get
134 | // only some partitions' consumerOffsets may be out of date, so only those are reset to latestLeaderOffsets
135 | consumerOffsets.foreach({
136 | case (tp, n) =>
137 | //the earliest offset at which data is still available
138 | val earliestLeaderOffset = earliestLeaderOffsets(tp).offset
139 | if (n < earliestLeaderOffset) {
140 | //consumed before, but the offset has expired, so consume from the earliest (or from the latest) available offset
141 | val latestLeaderOffsets = kc.getLatestLeaderOffsets(partitions).right.get(tp).offset
142 | offsets += (tp -> latestLeaderOffsets)
143 | } else offsets += (tp -> n) //the consumer's offsets are still valid
144 | })
145 | } else { // never consumed before: this is a new consumer group id
146 | val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
147 | var leaderOffsets: Map[TopicAndPartition, LeaderOffset] = null
148 | if (reset == Some("smallest")) {
149 | leaderOffsets = kc.getEarliestLeaderOffsets(partitions).right.get
150 | } else {
151 | leaderOffsets = kc.getLatestLeaderOffsets(partitions).right.get
152 | }
153 | leaderOffsets.foreach { case (tp, offset) => offsets += (tp -> offset.offset) }
154 | }
155 | })
156 | offsets
157 |
158 | }
159 | def getRDDConsumerOffsets(data: RDD[(String, String)]) = {
160 | var consumoffsets = Map[TopicAndPartition, Long]()
161 | val offsetsList = data.asInstanceOf[HasOffsetRanges].offsetRanges
162 | for (offsets <- offsetsList) {
163 | val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
164 | consumoffsets += ((topicAndPartition, offsets.untilOffset))
165 | }
166 | consumoffsets
167 | }
168 | /**
169 | * Update the consumed offsets stored in zookeeper
170 | * @param rdd
171 | */
172 | def updateConsumerOffsets(topicAndPartition: Map[TopicAndPartition, Long]): Unit = {
173 | val o = kc.setConsumerOffsets(groupId, topicAndPartition)
174 | }
175 | }
--------------------------------------------------------------------------------
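
KafkaClusterManager exposes the full offset-management cycle: createDirectStream starts from stored offsets, getRDDConsumerOffsets reads the offsets of a processed batch, and updateConsumerOffsets writes them back to zookeeper. A hedged usage sketch of that cycle follows; the broker list, topic and group id are the same placeholder values used elsewhere in this repository, not production settings.

    // Sketch: process each batch first, then commit its offsets so a restart resumes after it.
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.kafka.KafkaClusterManager

    object KafkaOffsetCommitSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("KafkaOffsetCommitSketch"))
        val ssc = new StreamingContext(sc, Seconds(5))
        val kafkaParams = Map(
          "metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092",
          "group.id" -> "test")
        val dstream = KafkaClusterManager.createDirectStream(ssc, kafkaParams, Set("test"))
        dstream.foreachRDD { rdd =>
          rdd.map(_._2).foreach(println)                       // process the batch first
          val offsets = KafkaClusterManager.getRDDConsumerOffsets(rdd)
          KafkaClusterManager.updateConsumerOffsets(offsets)   // then commit its offsets
        }
        ssc.start()
        ssc.awaitTermination()
      }
    }
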
/src/main/scala/com/spark/util/SparkKryoRegistrators.scala:
--------------------------------------------------------------------------------
1 | package com.spark.util
2 |
3 | import org.apache.spark.serializer.KryoRegistrator
4 | import com.esotericsoftware.kryo.Kryo
5 | import com.spark.util.SparkKryoSerializerTest.MygisterKryoClass
6 |
7 | class SparkKryoRegistrators extends KryoRegistrator{
8 | @Override
9 | def registerClasses(kryo:Kryo) {
10 | kryo.register(classOf[String])
11 | kryo.register(classOf[MygisterKryoClass])
12 |
13 | }
14 | }
--------------------------------------------------------------------------------
/src/main/scala/com/spark/util/SparkKryoSerializerTest.scala:
--------------------------------------------------------------------------------
1 | package com.spark.util
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 |
6 | object SparkKryoSerializerTest {
7 | var sparkconf:SparkConf=null
8 | var sc:SparkContext=null
9 | def main(args: Array[String]): Unit = {
10 | sparkInit
11 | testKryoSerializer
12 | }
13 | def testKryoSerializer{
14 | var personList = 1 to 10 map (value => new MygisterKryoClass(value + ""))
15 | var myrdd= sc.parallelize(personList)
16 | myrdd.foreach { x=>println(x.getName) }
17 | }
18 |
19 | def sparkInit(){
20 | sparkconf = new SparkConf()
21 | .setMaster("local")
22 | .setAppName("Spark Pi")
23 | sparkconf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
24 |
25 | sparkconf.set("spark.kryo.registrator", "com.spark.util.SparkKryoRegistrators")
26 | //sparkconf.registerKryoClasses(Array(classOf[MygisterKryoClass],classOf[String]))
27 | sc = new SparkContext(sparkconf)
28 | }
29 | class MygisterKryoClass(var name:String){
30 | //private var name:String=null
31 | def getName={
32 | name
33 | }
34 | def setName(name:String)={
35 | this.name=name
36 | }
37 | }
38 | }
--------------------------------------------------------------------------------
/src/main/scala/com/test/CheckHbaseDataWithMysql.scala:
--------------------------------------------------------------------------------
1 | package com.test
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.log4j.Logger
5 | import org.apache.log4j.Level
6 | import java.util.HashMap
7 | import org.apache.spark.rdd.RDD
8 | import com.sun.org.apache.commons.logging.LogFactory
9 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
10 | import org.apache.hadoop.conf.Configuration
11 | import org.apache.hadoop.fs.FileSystem
12 | import org.apache.hadoop.fs.Path
13 | import scala.collection.JavaConversions._
14 | import org.apache.hadoop.hbase.client.Connection
15 | import org.apache.hadoop.hbase.HBaseConfiguration
16 | import org.apache.hadoop.hbase.client.ConnectionFactory
17 | import org.apache.hadoop.hbase.TableName
18 | import org.apache.hadoop.hbase.client.Scan
19 | import java.util.ArrayList
20 | import java.io.File
21 | import java.io.BufferedWriter
22 | import java.io.OutputStreamWriter
23 | import java.io.FileOutputStream
24 | import org.apache.hadoop.hbase.util.Bytes
25 | import java.util.ArrayList
26 | import java.util.Date
27 | import java.sql.DriverManager
28 | import org.apache.spark.HashPartitioner
29 | import org.apache.spark.serializer.KryoRegistrator
30 | import org.apache.spark.streaming.StreamingContext
31 | import org.apache.spark.streaming.Milliseconds
32 | import org.apache.hadoop.hbase.client.HConnectionManager
33 | import scala.collection.mutable.ArrayBuffer
34 | import java.text.SimpleDateFormat
35 | import org.apache.hadoop.hbase.filter.RowFilter
36 | import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
37 | import org.apache.hadoop.hbase.filter.RegexStringComparator
38 | import java.util.Calendar
39 | import java.text.DateFormat
40 | import java.util.Properties
41 | import java.io.FileInputStream
42 | import org.apache.hadoop.mapred.TextInputFormat
43 | import org.apache.hadoop.io.LongWritable
44 | import org.apache.hadoop.io.Text
45 | import java.util.Date
46 | import java.sql.Timestamp
47 | import java.util.ArrayList
48 | import org.apache.hadoop.hbase.client.Get
49 | import scala.reflect.ClassTag
50 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil
51 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
52 | import org.apache.hadoop.hbase.util.Base64
53 | import org.apache.hadoop.mapreduce.Job
54 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
55 | import org.apache.hadoop.hbase.client.Result
56 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
57 | import org.apache.hadoop.hbase.client.Put
58 | object CheckHbaseDataWithMysql {
59 | var sparkconf: SparkConf = null
60 | var sc: SparkContext = null
61 | var conf: Configuration = null
62 | var connection: Connection = null
63 | import java.sql.Connection
64 | var mysqlconn: Connection = null
65 | var zookeeper = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3"
66 | def main(args: Array[String]): Unit = {
67 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
68 | val time="2016-11-09"
69 | initMysqlConn2
70 | initHbaseConn
71 | val map=getMysqlData(time)
72 | map.foreach(println)
73 |
74 | }
75 |
76 |
77 | def getMysqlData(time:String)={
78 | val map=new HashMap[String,HashMap[String,Int]]
79 | var stam = mysqlconn.createStatement()
80 | val sql="select plan,sum(Delivery),sum(clicks),sum(reach) "+
81 | "from Sample_Queue s"+
82 | " where s.sampUpt>'"+time+"' group by plan"
83 | var result = stam.executeQuery(sql)
84 | while(result.next){
85 | val d=new HashMap[String,Int]
86 | val plan=result.getString(1)
87 | val delivery=result.getInt(2)
88 | val clicks=result.getInt(3)
89 | val reach=result.getInt(4)
90 | d.put("delivery", delivery)
91 | d.put("clicks", clicks)
92 | d.put("reach", reach)
93 | map.put(plan,d)
94 | }
95 | map
96 | }
97 | def initMysqlConn2() {
98 | var user="developer"
99 | var pass="dev@zhiziyun^)0628"
100 | var mysqlurl = "jdbc:mysql://192.168.10.66/zz_bidoptimize"
101 | Class.forName("com.mysql.jdbc.Driver")
102 | mysqlconn = DriverManager.getConnection(mysqlurl, user, pass)
103 | }
104 | def initHbaseConn {
105 | var hconf = HBaseConfiguration.create()
106 | hconf.set("hbase.zookeeper.quorum", zookeeper)
107 | hconf.set("hbase.zookeeper.property.clientPort", "2181")
108 | connection = ConnectionFactory.createConnection(hconf)
109 | }
110 | }
--------------------------------------------------------------------------------
/src/main/scala/com/test/HbaseUtil.scala:
--------------------------------------------------------------------------------
1 | package com.test
2 |
3 | import org.apache.hadoop.hbase.client.Connection
4 | import org.apache.hadoop.hbase.HBaseConfiguration
5 | import org.apache.hadoop.hbase.client.ConnectionFactory
6 | import org.apache.hadoop.hbase.TableName
7 | import org.apache.hadoop.hbase.client.Scan
8 | import scala.collection.JavaConversions._
9 | import org.apache.hadoop.hbase.util.Bytes
10 | import org.apache.hadoop.hbase.util.Base64
11 | import com.fasterxml.jackson.core.JsonFactory
12 | object HbaseUtil {
13 | var hbaseConn: Connection = null
14 | var zookeeper:String = "cdh-master,node1,node2"
15 | def main(args: Array[String]): Unit = {
16 | initHbaseConn
17 | getKyLinHbaseData
18 |
19 |
20 | }
21 | def getKyLinHbaseData(){
22 | val table = hbaseConn.getTable(TableName.valueOf("KYLIN_HT7HUTOKSO"))
23 | val scan = new Scan()
24 | scan.setMaxResultSize(10L)
25 | scan.setMaxResultsPerColumnFamily(1)
26 | val resultScanner = table.getScanner(scan);
27 | for (result <- resultScanner) {
28 | var listCells = result.listCells()
29 |
30 | /*for (cell <- listCells) {
31 | var column = new String(cell.getValueArray, cell.getValueOffset, cell.getValueLength)
32 | println(column)
33 |
34 | //rowMap.put(column, new String(cell.getValueArray, cell.getValueOffset, cell.getValueLength))
35 | }*/
36 | }
37 | table.close()
38 | }
39 | def initHbaseConn {
40 | if(hbaseConn!=null) hbaseConn.close()
41 | hbaseConn=null
42 | var hconf = HBaseConfiguration.create()
43 | hconf.set("hbase.zookeeper.quorum", zookeeper)
44 | hconf.set("hbase.zookeeper.property.clientPort", "2181")
45 | hbaseConn = ConnectionFactory.createConnection(hconf)
46 | }
47 | }
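A sketch of how the commented-out cell loop in getKyLinHbaseData could read qualifiers and values using the standard HBase CellUtil helpers (printing as strings is just for illustration; the KYLIN values may be binary-encoded):

    import org.apache.hadoop.hbase.CellUtil
    import org.apache.hadoop.hbase.util.Bytes
    import scala.collection.JavaConversions._

    def dumpCells(result: org.apache.hadoop.hbase.client.Result): Unit = {
      for (cell <- result.listCells()) {
        val qualifier = Bytes.toString(CellUtil.cloneQualifier(cell))
        val value = Bytes.toString(CellUtil.cloneValue(cell))
        println(s"$qualifier = $value")
      }
    }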
--------------------------------------------------------------------------------
/src/main/scala/com/test/HttpAsyncClientsTest.scala:
--------------------------------------------------------------------------------
1 | package com.test
2 |
3 | import org.json.JSONObject
4 | import org.apache.http.impl.client.DefaultHttpClient
5 | import org.apache.http.client.methods.HttpGet
6 | import org.apache.http.util.EntityUtils
7 | import org.apache.http.client.methods.HttpPost
8 | import java.net.URI
9 | import java.net.URL
10 | import org.apache.http.concurrent.FutureCallback
11 | import org.apache.http.HttpResponse
12 | import java.util.concurrent.CountDownLatch
13 | import org.apache.http.client.config.RequestConfig
14 | import org.apache.http.impl.nio.client.HttpAsyncClients
15 | import org.apache.http.nio.client.methods.AsyncCharConsumer
16 | import java.nio.CharBuffer
17 | import org.apache.http.nio.IOControl
18 | import org.apache.http.protocol.HttpContext
19 | import org.apache.http.nio.client.methods.HttpAsyncMethods
20 | import java.util.ArrayList
21 | import org.apache.http.impl.nio.conn.ManagedNHttpClientConnectionFactory
22 | import org.apache.http.params.HttpParams
23 | import org.apache.http.params.BasicHttpParams
24 |
25 | /**
26 |  * Sends HTTP requests asynchronously
27 | */
28 | object HttpAsyncClientsTest {
29 | def main(args: Array[String]): Unit = {
30 | //testHttpClient
31 | //HttpAsyncClients
32 | //testURLConnect
33 | //var sd:ManagedNHttpClientConnectionFactory=new ManagedNHttpClientConnectionFactory
34 | var a=testURLConnect
35 | println(a)
36 | // testHttpClient
37 | }
38 | def testHttpClient(){
39 | var get = new HttpGet()
40 | val ps=new BasicHttpParams()
41 | var httpClient = new DefaultHttpClient();
42 | var id=0
43 | var latch = new CountDownLatch(10);
44 | for(i<- 1 to 10){
45 | id=i
46 | get.setURI(URI.create(s"https://www.baidu.com/s?wd=${id}"))
47 | println(i+":"+get.getURI)
48 | val rp= httpClient.execute(get);
49 | println((rp.getStatusLine))
50 | get.reset() // must reset before reusing this HttpGet, otherwise the next execute throws an error
51 |
52 | }
53 |
54 |
55 |
56 | }
57 | def testYiBUHttp(){
58 | val requestConfig = RequestConfig.custom()
59 | .setSocketTimeout(1).setConnectTimeout(1).build();
60 | var httpclient = HttpAsyncClients.custom()
61 | .setDefaultRequestConfig(requestConfig).build();
62 | httpclient.start();
63 | try {
64 | val future = httpclient.execute(
65 | HttpAsyncMethods.createGet("https://www.verisign.com/"),
66 | new MyResponseConsumer(), null);
67 | var result = false
68 | if (future != null) {
69 | result = future.get
70 | }
71 | if (result) {
72 | System.out.println("Request successfully executed");
73 | } else {
74 | System.out.println("Request failed");
75 | }
76 | System.out.println("Shutting down");
77 | } finally {
78 | httpclient.close();
79 | }
80 | System.out.println("Done");
81 | }
82 | def testURLConnect()={
83 | val requestConfig = RequestConfig.custom()
84 | .setSocketTimeout(10)//socket (read) timeout after the connection is made, bounds how long we wait for the response
85 | .setConnectTimeout(10)//timeout for establishing the connection itself
86 | .build();
87 | var httpclient = HttpAsyncClients.custom()
88 | .setDefaultRequestConfig(requestConfig)
89 | //.setMaxConnTotal(10000)
90 | //.setMaxConnPerRoute(1000)
91 | .build();
92 | var erorURL=new ArrayList[String]
93 | try {
94 | httpclient.start();
95 | var requests = Array[HttpGet](new HttpGet("https://www.google.com.hk"),
96 | new HttpGet("https://www.verisign.com/"),
97 | new HttpGet("http://carat.clientsolutions.cn"),
98 | new HttpGet("http://www.baidu.com/"));
99 | val latch = new CountDownLatch(requests.length);
100 | for (request<-requests) {
101 | httpclient.execute(request, new FutureCallback[HttpResponse]() {
102 | def completed(response:HttpResponse ) {
103 | try {
104 | println("success:"+request.getURI)
105 | latch.countDown();
106 | }
107 | catch {case t: Throwable => erorURL.add(request.getURI.toString())}
108 | }
109 | def failed(ex: Exception ) {
110 | try {
111 | println("error:"+request.getURI)
112 | latch.countDown();
113 | erorURL.add(request.getURI.toString())
114 | }
115 | catch {case t: Throwable => erorURL.add(request.getURI.toString())}
116 | }
117 | def cancelled() {
118 | try {latch.countDown();}
119 | catch {case t: Throwable => erorURL.add(request.getURI.toString())}
120 | }
121 | });
122 |
123 | }
124 |
125 | latch.await();
126 | //System.out.println("Shutting down");
127 | } finally {
128 | httpclient.close();
129 | }
130 | System.out.println("Done");
131 | erorURL
132 | }
133 | def testSFun(){
134 | var carat_bidid="a"
135 | var carat_price="b"
136 | var str=s"http://carat.clientsolutions.cn/c=1,1,2&bidid=${carat_bidid}&ep=${carat_price}"
137 | println(str)
138 |
139 | }
140 | def testPinjie(){
141 |
142 | var s=new JSONObject
143 | s.put("key", "Hello Json")
144 | s.put("key2", Array(1,2,3))
145 | println(s)
146 | }
147 |
148 | }
149 | class MyFutureCallback(latch:CountDownLatch,request: HttpGet) extends FutureCallback[HttpResponse]{
150 | // countDown() is called whether the request completed or failed
151 | @Override
152 | def completed(response:HttpResponse) {
153 | latch.countDown();
154 | System.out.println(response.getStatusLine());
155 | }
156 | @Override
157 | def failed( ex: Exception) {
158 | latch.countDown();
159 | System.out.println(request.getRequestLine() + "->" + ex);
160 | }
161 | @Override
162 | def cancelled() {
163 | latch.countDown();
164 | }
165 | }
166 | class MyResponseConsumer extends AsyncCharConsumer[Boolean] {
167 |
168 | val times = 0;
169 |
170 | def getTimes()= {
171 | "\n\n### 第" + times + "步\n###"
172 | }
173 |
174 | @Override
175 | def onCharReceived(buf: CharBuffer , ioctrl: IOControl ){
176 | /* System.out.println(getTimes() + "onCharReceived");
177 | while (buf.hasRemaining()) {
178 | System.out.print(buf.get());
179 | } */
180 | }
181 |
182 | @Override
183 | def onResponseReceived(response: HttpResponse){
184 | //println(getTimes() + "onResponseReceived");
185 | }
186 | @Override
187 | def buildResult(context: HttpContext ) ={
188 | System.out.println(getTimes() + "buildResult");
189 | true
190 | }
191 |
192 | }
193 | /*def doAsyncGet(String url) throws IOException{
194 | RequestConfig defaultRequestConfig = RequestConfig.custom()
195 | .setSocketTimeout(5000)
196 | .setConnectTimeout(5000)
197 | .setConnectionRequestTimeout(5000)
198 | .setStaleConnectionCheckEnabled(true)
199 | .build();
200 | final CloseableHttpAsyncClient httpclient = HttpAsyncClients.custom()
201 | .setDefaultRequestConfig(defaultRequestConfig)
202 | .setMaxConnTotal(10000)
203 | .setMaxConnPerRoute(1000).build();
204 | try {
205 | final HttpGet httpget = new HttpGet(url);
206 | RequestConfig requestConfig = RequestConfig.copy(defaultRequestConfig)
207 | .build();
208 | httpget.setConfig(requestConfig);
209 | httpclient.execute(httpget, new FutureCallback() {
210 |
211 | public void completed(final HttpResponse response) {
212 | try {
213 | httpget.releaseConnection();
214 | } catch (Exception e) {
215 | log.error("close asyncResponse error:",e);
216 | }
217 | }
218 |
219 | public void failed(final Exception ex) {
220 | try {
221 | httpget.releaseConnection();
222 | log.error("this connection failed!",ex);
223 | } catch (Exception e) {
224 | log.error("close asyncResponse error:",e);
225 | }
226 | }
227 |
228 | public void cancelled() {
229 | try {
230 | httpget.releaseConnection();
231 | log.error("this connection has been cancelled!");
232 | } catch (Exception e) {
233 | log.error("close asyncResponse error:",e);
234 | }
235 | }});
236 | }catch(Exception e){
237 | log.error("http async error:"+url,e);
238 | }
239 | }
240 | */
241 | /*{
242 | //countDown() is called whether the request completed or failed
243 | @Override
244 | def completed(response:HttpResponse) {
245 | latch.countDown();
246 | System.out.println(request.getRequestLine() + "->"
247 | + response.getStatusLine());
248 | }
249 | @Override
250 | def failed( ex: Exception) {
251 | latch.countDown();
252 | System.out.println(request.getRequestLine() + "->" + ex);
253 | }
254 | @Override
255 | def cancelled() {
256 | latch.countDown();
257 | }
258 | }*/
--------------------------------------------------------------------------------
/src/main/scala/com/test/JsonTest.scala:
--------------------------------------------------------------------------------
1 | package com.test
2 |
3 | import org.json.JSONObject
4 | import scala.collection.mutable.ArrayBuffer
5 | import scala.collection.mutable.HashMap
6 | import scala.collection.JavaConversions._
7 | import org.json.JSONArray
8 | object JsonTest {
9 | def main(args: Array[String]): Unit = {
10 | //test
11 | val b= """{"DATA":{"MOD_MOB_DDQ_BASIC":[{"AGENT":"wechat","ZODIAC":"兔","STAR":"处女座","GENDER":"FEMALE","EDUCATION_DEGREE":"zkjyx","IS_LOCAL":"bendiji"},{"AGENT":"APP","ZODIAC":"猪","STAR":"双鱼座","GENDER":"MALE","EDUCATION_DEGREE":"bk","IS_LOCAL":"feibendiji"},{"AGENT":"wechat","ZODIAC":"马","STAR":"天秤座","GENDER":"MALE","EDUCATION_DEGREE":"zkjyx","IS_LOCAL":"bendiji"},{"AGENT":"APP","ZODIAC":"鼠","STAR":"摩羯座","GENDER":"MALE","EDUCATION_DEGREE":"bk","IS_LOCAL":"bendiji"}]},"TOPIC":"mod_mob_ddq_basic"}
12 |
13 | """
14 | val a="""{"qmart":"TEST","ntnum":"50","ecrule1":"测试中1","ecrule2":"","ecrule3":"",}"""
15 | val obj=new JSONObject(b)
16 | println(transObject(transObject(obj)))
17 | }
18 | def transObject(o1:JSONObject):JSONObject={
19 | val o2=new JSONObject();
20 | val it = o1.keys();
21 | while (it.hasNext()) {
22 | val key = it.next().asInstanceOf[String];
23 | val obj = o1.get(key);
24 | if(obj.getClass().toString().endsWith("String")){
25 | o2.accumulate(key.toUpperCase(), obj);
26 | }else if(obj.getClass().toString().endsWith("JSONObject")){
27 | o2.accumulate(key.toUpperCase(), transObject(obj.asInstanceOf[JSONObject]));
28 | }else if(obj.getClass().toString().endsWith("JSONArray")){
29 | o2.put(key.toUpperCase(), transArray(o1.getJSONArray(key)));
30 | }
31 | }
32 | o2
33 | }
34 | def transArray( o1:JSONArray):JSONArray={
35 | val o2 = new JSONArray();
36 | for (i <- 0 to o1.length-1) {
37 | val jArray=o1.getJSONObject(i);
38 | if(jArray.getClass().toString().endsWith("JSONObject")){
39 | o2.put(transObject(jArray.asInstanceOf[JSONObject]));
40 | }else if(jArray.getClass().toString().endsWith("JSONArray")){
41 | o2.put(transArray(jArray.asInstanceOf[JSONArray]));
42 | }
43 | }
44 | o2;
45 | }
46 | }
--------------------------------------------------------------------------------
/src/main/scala/com/test/KafkaLogTest.scala:
--------------------------------------------------------------------------------
1 | package com.test
2 |
3 | import org.slf4j.Logger
4 | import org.slf4j.LoggerFactory
5 |
6 |
7 | object KafkaLogTest {
8 | def main(args: Array[String]): Unit = {
9 | var LOGGER: Logger = LoggerFactory.getLogger("KAFKA")//logging
10 | for (i <- 1 to 1000) {
11 | LOGGER.info("Info [" + i + "]");
12 | println("Info [" + i + "]")
13 | Thread.sleep(1000);
14 | }
15 | }
16 | }
--------------------------------------------------------------------------------
/src/main/scala/com/test/ReflectScala.scala:
--------------------------------------------------------------------------------
1 | package com.test
2 |
3 | object ReflectScala {
4 | def main(args: Array[String]): Unit = {
5 | var a=Class.forName("com.test.ReflectScala")
6 | // because the method is static, passing null as the invoke target is enough; a non-static method would need an instance
7 | //a.getMethod("test").invoke(a.newInstance())
8 | a.getMethod("test",classOf[String]).invoke(null,"hello world")
9 |
10 | }
11 | def test(s:String){
12 | println(s)
13 | }
14 | }
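The comment in main notes that a non-static method needs an instance; a small sketch of that case, using a made-up Greeter class purely for illustration:

    class Greeter {
      def hello(name: String): Unit = println("hello, " + name)
    }

    object ReflectInstanceExample {
      def main(args: Array[String]): Unit = {
        val cls = Class.forName("Greeter") // assumes Greeter is compiled into the default package
        val instance = cls.newInstance()   // an instance is required for a non-static method
        cls.getMethod("hello", classOf[String]).invoke(instance, "world")
      }
    }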
--------------------------------------------------------------------------------
/src/main/scala/com/test/SparkWithLocalTest.scala:
--------------------------------------------------------------------------------
1 | package com.test
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.SparkContext._
6 | import scala.collection.JavaConversions._
7 | import scala.collection.mutable.ArrayBuffer
8 | import java.util.Calendar
9 | import java.util.ArrayList
10 | import org.apache.hadoop.fs.Path
11 | import org.apache.spark.streaming.kafka.KafkaUtils
12 | import kafka.message.MessageAndMetadata
13 | import kafka.serializer.StringDecoder
14 | import org.apache.spark.streaming.kafka.OffsetRange
15 | import org.apache.spark.streaming.kafka.Broker
16 | import kafka.common.TopicAndPartition
17 | object SparkWithLocalTest {
18 | var sc: SparkContext = null
19 | val zookeeper=""
20 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0")
21 | val correctData=new ArrayList[(String,String,String,Int)]
22 | def main(args: Array[String]): Unit = {
23 | val sparkConf = new SparkConf()
24 | .setMaster("local")
25 | .setAppName("Test")
26 | sc = new SparkContext(sparkConf)
27 |
28 |
29 |
30 | }
31 |
32 | def getKafkaRDD(){
33 | var kafkaParams = Map[String, String]("metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092",
34 | "serializer.class" -> "kafka.serializer.StringEncoder",
35 | "group.id" -> "test", "zookeeper.connect" -> zookeeper)
36 | KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder, (String, String)](
37 | sc,
38 | kafkaParams,
39 | Array[OffsetRange](), // empty here: supply one OffsetRange per partition to actually read data
40 | Map[TopicAndPartition, Broker](),
41 | (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message))
42 |
43 |
44 |
45 | }
46 |
47 |
48 |
49 | def peixu(){
50 | val rdd= sc.textFile("/data/test")
51 | rdd.flatMap{x=>x.split(" ")}
52 | .map{x=>(x,1)}
53 | .reduceByKey(_+_)
54 | .sortBy({case(key,num)=>num},false)
55 | .foreach(println)
56 | }
57 | def runJob(){
58 | var rdd=sc.parallelize(Array((0,0)))
59 | var tmprdd=sc.parallelize(Array((0,1)))
60 | .map{x=>println("@");x}
61 | val rrdd=tmprdd.groupByKey//when a shuffle is involved, Spark keeps the shuffle output by default, so the upstream map is not recomputed
62 | .map{x=>println("##");(x._1,x._2)}
63 | rrdd.foreach(println)
64 | rrdd.foreach(println)
65 |
66 |
67 |
68 | }
69 |
70 | def sparkTest(){
71 | val rdd=sc.parallelize(Array((1,6),(7,8),(9,1)),3).zipWithIndex().map(x=>(x._2,x._1))
72 | rdd.foreach(println)
73 | val rdd2=rdd.map{x=>
74 | var index=x._1-1
75 | (index,x._2)
76 | }
77 | rdd2.foreach(println)
78 | rdd.join(rdd2).map{x=>
79 | val (f,s)=x._2
80 | (s._1,s._2-f._2)}.foreach(println)
81 | }
82 |
83 | def init {
84 | val sparkConf = new SparkConf()
85 | .setMaster("local")
86 | .setAppName("Test")
87 | sc = new SparkContext(sparkConf)
88 | }
89 | }
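getKafkaRDD above passes an empty offset-range array, so it would read nothing; KafkaUtils.createRDD needs one OffsetRange per topic partition. A sketch with a made-up topic name and offsets:

    import org.apache.spark.streaming.kafka.OffsetRange

    // read offsets 0 (inclusive) to 100 (exclusive) from partition 0 of topic "test"
    val offsetRanges = Array(OffsetRange("test", 0, 0L, 100L))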
--------------------------------------------------------------------------------
/src/main/scala/com/test/Test.scala:
--------------------------------------------------------------------------------
1 | package com.test
2 |
3 | import java.util.HashMap
4 | import scala.collection.mutable.ArrayBuffer
5 | import scala.collection.mutable.Seq
6 | import java.io.File
7 | import java.net.URLClassLoader
8 | import java.net.URL
9 | import scala.collection.JavaConverters._
10 | import scala.collection.JavaConversions._
11 | import com.test.Utilities
12 | object Test extends Utilities{
13 | def main(args: Array[String]): Unit = {
14 | //println(fun((1,1)))
15 | //val a=new HashMap[String,String]
16 | //a.put("a", "a")
17 | //t1(a)
18 | //println(a)
19 | //t2(a)
20 | //println(a)
21 | /* val url=new File("C:\\Users\\zhiziyun\\Desktop\\test-0.0.1-SNAPSHOT.jar").toURI().toURL()
22 | val d=new URLClassLoader(Array(url), Thread.currentThread().getContextClassLoader())
23 | val a= d.loadClass("test.HelloWord")
24 | a.getMethod("printwoed",classOf[String]).invoke(a.newInstance(),"hello world")
25 | */
26 | val v_l5mon_date = getDateStr(getNMonthAgo(getNDayAgo(1), 4))
27 | println(v_l5mon_date)
28 | val v_data_date = getDateStr_(getNDayAgo(1))
29 | println(v_data_date)
30 | val v_next_date = getDateStr_()
31 | println(v_next_date)
32 | val v_data_day = getDateStr_()
33 | println(v_data_day)
34 | val v_mth_stt = getMonthStart()
35 | println(v_mth_stt)
36 | val v_mth_end = getMonthEnd()
37 | println(v_mth_end)
38 | }
39 | def t1(a: HashMap[String, String]) {
40 | a.clear()
41 | }
42 |
43 | def t2(a: HashMap[String, String]) {
44 | a.put("1", "1")
45 | }
46 | def fun(str: Any, data: String) = {
47 | str match {
48 | case i: Int => "INt" + ":" + data
49 | case s: String => "String" + ":" + data
50 | case map: HashMap[_, _] =>
51 | "Map" + ":" + data
52 | str.asInstanceOf[HashMap[String, String]].toString()
53 | case t: (_, _) =>
54 | "Tuple2" + ":" + data
55 | t.asInstanceOf[Tuple2[Int, Int]].toString()
56 | }
57 | }
58 | def write(
59 | data: String,
60 | fun: (Any, String) => String) = {
61 | println(fun("", data))
62 | println(fun((1, 1), data))
63 | }
64 | }
65 |
66 | case class casetest(a: String)(val b: String) {
67 | def d = {
68 | println(b)
69 | }
70 | }
71 | object EnumerationTest extends Enumeration {
72 | type EnumerationTest = Value
73 | val b, c, d = Value
74 | }
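A short usage sketch for the curried case class and the Enumeration defined above:

    val c = casetest("a")("b")      // only the first parameter list takes part in equals/hashCode
    c.d                             // prints "b"
    println(EnumerationTest.values) // prints the ValueSet containing b, c and d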
--------------------------------------------------------------------------------
/src/main/scala/com/test/TestJava.java:
--------------------------------------------------------------------------------
1 | package com.test;
2 |
3 | import java.util.ArrayList;
4 |
5 | public class TestJava {
6 | public static void main(String[] args) {
7 | ArrayList<String> da = new ArrayList<String>();
8 | da.add("a");
9 | }
10 | public static void aa(String[] a){
11 | System.out.println(a[0]);
12 | }
13 | public String[] bb(){
14 | return new String[1];
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/scala/com/test/Utilities.scala:
--------------------------------------------------------------------------------
1 | package com.test
2 |
3 | import org.json.JSONObject
4 | import org.json.JSONArray
5 | import org.json.JSONException
6 | import java.text.SimpleDateFormat
7 | import java.util.Date
8 | import java.util.regex.Pattern
9 | import java.util.Calendar
10 | import java.lang.Long
11 |
12 | trait Utilities {
13 | def transObject(o1:JSONObject):JSONObject={
14 | val o2=new JSONObject();
15 | val it = o1.keys();
16 | while (it.hasNext()) {
17 | val key = it.next().asInstanceOf[String];
18 | val obj = o1.get(key);
19 | if(obj.getClass().toString().endsWith("String")){
20 | o2.accumulate(key.toLowerCase(), obj);
21 | }else if(obj.getClass().toString().endsWith("JSONObject")){
22 | o2.accumulate(key.toLowerCase(), transObject(obj.asInstanceOf[JSONObject]));
23 | }else if(obj.getClass().toString().endsWith("JSONArray")){
24 | o2.put(key.toLowerCase(), transArray(o1.getJSONArray(key)));
25 | }
26 | }
27 | o2
28 | }
29 | def transArray( o1:JSONArray):JSONArray={
30 | val o2 = new JSONArray();
31 | for (i <- 0 to o1.length-1) {
32 | val jArray=o1.getJSONObject(i);
33 | if(jArray.getClass().toString().endsWith("JSONObject")){
34 | o2.put(transObject(jArray.asInstanceOf[JSONObject]));
35 | }else if(jArray.getClass().toString().endsWith("JSONArray")){
36 | o2.put(transArray(jArray.asInstanceOf[JSONArray]));
37 | }
38 | }
39 | o2;
40 | }
41 |
42 | def getNMonthAgo(calendar: Calendar, n: Int) = {
43 |
44 | calendar.add(Calendar.MONTH, -n)
45 | calendar
46 | }
47 |
48 | def getNDayAgo(n: Int) = {
49 | val calendar = Calendar.getInstance
50 | val time = calendar.getTimeInMillis - n * 24L * 60 * 60 * 1000 // Long arithmetic, so a large n does not overflow Int
51 | calendar.setTimeInMillis(time)
52 | calendar
53 | }
54 |
55 | def getDateStr(calendar: Calendar) = {
56 | val date = calendar.getTime
57 | val sdf = new SimpleDateFormat("yyyyMMdd")
58 | val str = sdf.format(date)
59 | str
60 | }
61 |
62 | def getDateStr_(calendar: Calendar) = {
63 | val date = calendar.getTime
64 | val sdf = new SimpleDateFormat("yyyy-MM-dd")
65 | val str = sdf.format(date)
66 | str
67 | }
68 |
69 | def getDateStr_(time: Long) = {
70 | val date = new Date(time)
71 | val sdf = new SimpleDateFormat("yyyy-MM-dd")
72 | val str = sdf.format(date)
73 | str
74 | }
75 |
76 | def getDateStr() = {
77 | val date = new Date()
78 | val sdf = new SimpleDateFormat("yyyyMMdd")
79 | val str = sdf.format(date)
80 | str
81 | }
82 |
83 | def getDateStr_() = {
84 | val date = new Date()
85 | val sdf = new SimpleDateFormat("yyyy-MM-dd")
86 | val str = sdf.format(date)
87 | str
88 | }
89 |
90 | def getMonthStart() = {
91 | val cale = Calendar.getInstance()
92 | cale.add(Calendar.MONTH, 0)
93 | cale.set(Calendar.DAY_OF_MONTH, 1)
94 | val firstday = getDateStr_(cale)
95 | firstday
96 | }
97 |
98 | def getMonthEnd() = {
99 | val cale = Calendar.getInstance()
100 | cale.add(Calendar.MONTH, 1)
101 | cale.set(Calendar.DAY_OF_MONTH, 0)
102 | val lastday = getDateStr_(cale)
103 | lastday
104 | }
105 |
106 | def isnull(key: Object): Boolean = { // note: despite the name, this returns true when the key is NOT null
107 | if (key != null) {
108 | true
109 | } else {
110 | false
111 | }
112 | }
113 |
114 | def getCurrent_time(): Long = {
115 | val now = new Date()
116 | val a = now.getTime
117 | var str = a + ""
118 | str.substring(0, 10).toLong
119 | }
120 |
121 | def getZero_time(): Long = {
122 | val now = new Date()
123 | val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
124 | val a = dateFormat.parse(dateFormat.format(now)).getTime
125 | var str = a + ""
126 | str.substring(0, 10).toLong
127 | }
128 |
129 | def getTimestamp(): String = {
130 | var ts = System.currentTimeMillis()
131 | ts.toString
132 | }
133 |
134 | def getMD5hash(s: String) = {
135 | val m = java.security.MessageDigest.getInstance("MD5")
136 | val b = s.getBytes("UTF-8")
137 | m.update(b, 0, b.length)
138 | new java.math.BigInteger(1, m.digest()).toString(16)
139 | }
140 |
141 | /** Makes sure only WARN and ERROR messages get logged to avoid log spam. */
142 | def setupLogging() = {
143 | import org.apache.log4j.{ Level, Logger }
144 | val rootLogger = Logger.getRootLogger()
145 | rootLogger.setLevel(Level.WARN)
146 | }
147 |
148 | /** Configures Twitter service credentials using twitter.txt in the main workspace directory */
149 | def setupTwitter() = {
150 | import scala.io.Source
151 |
152 | for (line <- Source.fromFile("../twitter.txt").getLines) {
153 | val fields = line.split(" ")
154 | if (fields.length == 2) {
155 | System.setProperty("twitter4j.oauth." + fields(0), fields(1))
156 | }
157 | }
158 | }
159 |
160 | /** Retrieves a regex Pattern for parsing Apache access logs. */
161 | def apacheLogPattern(): Pattern = {
162 | val ddd = "\\d{1,3}"
163 | val ip = s"($ddd\\.$ddd\\.$ddd\\.$ddd)?"
164 | val client = "(\\S+)"
165 | val user = "(\\S+)"
166 | val dateTime = "(\\[.+?\\])"
167 | val request = "\"(.*?)\""
168 | val status = "(\\d{3})"
169 | val bytes = "(\\S+)"
170 | val referer = "\"(.*?)\""
171 | val agent = "\"(.*?)\""
172 | val regex = s"$ip $client $user $dateTime $request $status $bytes $referer $agent"
173 | Pattern.compile(regex)
174 | }
175 | }
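A usage sketch for apacheLogPattern, assuming the caller sits in the same com.test package as the trait; the sample access-log line is made up for illustration:

    object LogPatternExample extends Utilities {
      def main(args: Array[String]): Unit = {
        val line = "127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] " +
          "\"GET /index.html HTTP/1.0\" 200 2326 \"-\" \"Mozilla/5.0\""
        val m = apacheLogPattern().matcher(line)
        if (m.matches()) {
          // group 1 = ip, 5 = request line, 6 = status code
          println(m.group(1) + " " + m.group(5) + " -> " + m.group(6))
        }
      }
    }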
--------------------------------------------------------------------------------
/src/main/scala/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |   <property>
4 |     <name>dfs.nameservices</name>
5 |     <value>nameservice-zzy</value>
6 |   </property>
7 |   <property>
8 |     <name>dfs.client.failover.proxy.provider.nameservice-zzy</name>
9 |     <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
10 |   </property>
11 |   <property>
12 |     <name>dfs.ha.automatic-failover.enabled.nameservice-zzy</name>
13 |     <value>true</value>
14 |   </property>
15 |   <property>
16 |     <name>ha.zookeeper.quorum</name>
17 |     <value>mongodb3:2181,solr1.zhiziyun.com:2181,solr2.zhiziyun.com:2181</value>
18 |   </property>
19 |   <property>
20 |     <name>dfs.ha.namenodes.nameservice-zzy</name>
21 |     <value>namenode47,namenode237</value>
22 |   </property>
23 |   <property>
24 |     <name>dfs.namenode.rpc-address.nameservice-zzy.namenode47</name>
25 |     <value>mongodb3:8020</value>
26 |   </property>
27 |   <property>
28 |     <name>dfs.namenode.servicerpc-address.nameservice-zzy.namenode47</name>
29 |     <value>mongodb3:8022</value>
30 |   </property>
31 |   <property>
32 |     <name>dfs.namenode.http-address.nameservice-zzy.namenode47</name>
33 |     <value>mongodb3:50070</value>
34 |   </property>
35 |   <property>
36 |     <name>dfs.namenode.https-address.nameservice-zzy.namenode47</name>
37 |     <value>mongodb3:50470</value>
38 |   </property>
39 |   <property>
40 |     <name>dfs.namenode.rpc-address.nameservice-zzy.namenode237</name>
41 |     <value>solr2.zhiziyun.com:8020</value>
42 |   </property>
43 |   <property>
44 |     <name>dfs.namenode.servicerpc-address.nameservice-zzy.namenode237</name>
45 |     <value>solr2.zhiziyun.com:8022</value>
46 |   </property>
47 |   <property>
48 |     <name>dfs.namenode.http-address.nameservice-zzy.namenode237</name>
49 |     <value>solr2.zhiziyun.com:50070</value>
50 |   </property>
51 |   <property>
52 |     <name>dfs.namenode.https-address.nameservice-zzy.namenode237</name>
53 |     <value>solr2.zhiziyun.com:50470</value>
54 |   </property>
55 |   <property>
56 |     <name>dfs.replication</name>
57 |     <value>3</value>
58 |   </property>
59 |   <property>
60 |     <name>dfs.blocksize</name>
61 |     <value>134217728</value>
62 |   </property>
63 |   <property>
64 |     <name>dfs.client.use.datanode.hostname</name>
65 |     <value>false</value>
66 |   </property>
67 |   <property>
68 |     <name>fs.permissions.umask-mode</name>
69 |     <value>022</value>
70 |   </property>
71 |   <property>
72 |     <name>dfs.namenode.acls.enabled</name>
73 |     <value>false</value>
74 |   </property>
75 |   <property>
76 |     <name>dfs.client.read.shortcircuit</name>
77 |     <value>false</value>
78 |   </property>
79 |   <property>
80 |     <name>dfs.domain.socket.path</name>
81 |     <value>/var/run/hdfs-sockets/dn</value>
82 |   </property>
83 |   <property>
84 |     <name>dfs.client.read.shortcircuit.skip.checksum</name>
85 |     <value>false</value>
86 |   </property>
87 |   <property>
88 |     <name>dfs.client.domain.socket.data.traffic</name>
89 |     <value>false</value>
90 |   </property>
91 |   <property>
92 |     <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
93 |     <value>true</value>
94 |   </property>
95 | </configuration>
--------------------------------------------------------------------------------
/src/main/scala/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |   <property>
4 |     <name>hive.metastore.uris</name>
5 |     <value>thrift://mongodb3:9083</value>
6 |   </property>
7 |   <property>
8 |     <name>hive.metastore.client.socket.timeout</name>
9 |     <value>300</value>
10 |   </property>
11 |   <property>
12 |     <name>hive.metastore.warehouse.dir</name>
13 |     <value>/user/hive/warehouse</value>
14 |   </property>
15 |   <property>
16 |     <name>hive.warehouse.subdir.inherit.perms</name>
17 |     <value>true</value>
18 |   </property>
19 |   <property>
20 |     <name>hive.enable.spark.execution.engine</name>
21 |     <value>false</value>
22 |   </property>
23 |   <property>
24 |     <name>hive.conf.restricted.list</name>
25 |     <value>hive.enable.spark.execution.engine</value>
26 |   </property>
27 |   <property>
28 |     <name>mapred.reduce.tasks</name>
29 |     <value>-1</value>
30 |   </property>
31 |   <property>
32 |     <name>hive.exec.reducers.bytes.per.reducer</name>
33 |     <value>67108864</value>
34 |   </property>
35 |   <property>
36 |     <name>hive.exec.copyfile.maxsize</name>
37 |     <value>33554432</value>
38 |   </property>
39 |   <property>
40 |     <name>hive.exec.reducers.max</name>
41 |     <value>1099</value>
42 |   </property>
43 |   <property>
44 |     <name>hive.metastore.execute.setugi</name>
45 |     <value>true</value>
46 |   </property>
47 |   <property>
48 |     <name>hive.support.concurrency</name>
49 |     <value>true</value>
50 |   </property>
51 |   <property>
52 |     <name>hive.zookeeper.quorum</name>
53 |     <value>mongodb3,solr2.zhiziyun.com,solr1.zhiziyun.com</value>
54 |   </property>
55 |   <property>
56 |     <name>hive.zookeeper.client.port</name>
57 |     <value>2181</value>
58 |   </property>
59 |   <property>
60 |     <name>hbase.zookeeper.quorum</name>
61 |     <value>mongodb3,solr2.zhiziyun.com,solr1.zhiziyun.com</value>
62 |   </property>
63 |   <property>
64 |     <name>hbase.zookeeper.property.clientPort</name>
65 |     <value>2181</value>
66 |   </property>
67 |   <property>
68 |     <name>hive.zookeeper.namespace</name>
69 |     <value>hive_zookeeper_namespace_hive</value>
70 |   </property>
71 |   <property>
72 |     <name>hive.cluster.delegation.token.store.class</name>
73 |     <value>org.apache.hadoop.hive.thrift.MemoryTokenStore</value>
74 |   </property>
75 |   <property>
76 |     <name>hive.server2.enable.doAs</name>
77 |     <value>true</value>
78 |   </property>
79 |   <property>
80 |     <name>hive.server2.use.SSL</name>
81 |     <value>false</value>
82 |   </property>
83 | </configuration>
--------------------------------------------------------------------------------
/src/main/scala/log4j.properties:
--------------------------------------------------------------------------------
1 | #log4j.rootLogger=INFO
2 |
3 | # the "info" level here is required
4 | #log4j.logger.kafka=info,kafka
5 | ## appender KAFKA
6 | #log4j.appender.kafka=org.apache.kafka.log4jappender.KafkaLog4jAppender
7 | #log4j.appender.kafka.topic=test
8 | #log4j.appender.kafka.brokerList=kafka1:9092,kafka2:9092,kafka3:9092
9 | #log4j.appender.kafka.syncSend=true
10 | #log4j.appender.kafka.layout=org.apache.log4j.PatternLayout
11 | #log4j.appender.kafka.layout.ConversionPattern=%m
12 | # this means: do not also use the global configuration (rootLogger)
13 | #log4j.additivity.kafka=false
14 |
15 |
16 | ## appender console
17 | #log4j.appender.console=org.apache.log4j.ConsoleAppender
18 | #log4j.appender.console.target=System.err
19 | #log4j.appender.console.layout=org.apache.log4j.PatternLayout
20 | #log4j.appender.console.layout.ConversionPattern=%m%n
21 |
22 |
--------------------------------------------------------------------------------
/src/test/scala/samples/junit.scala:
--------------------------------------------------------------------------------
1 | package samples
2 |
3 | import org.junit._
4 | import Assert._
5 |
6 | @Test
7 | class AppTest {
8 |
9 | @Test
10 | def testOK() = assertTrue(true)
11 |
12 | // @Test
13 | // def testKO() = assertTrue(false)
14 |
15 | }
16 |
17 |
18 |
--------------------------------------------------------------------------------
/src/test/scala/samples/scalatest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2001-2009 Artima, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package samples
17 |
18 | /*
19 | ScalaTest facilitates different styles of testing by providing traits you can mix
20 | together to get the behavior and syntax you prefer. A few examples are
21 | included here. For more information, visit:
22 |
23 | http://www.scalatest.org/
24 |
25 | One way to use ScalaTest is to help make JUnit or TestNG tests more
26 | clear and concise. Here's an example:
27 | */
28 | import scala.collection.mutable.Stack
29 | import org.scalatest.Assertions
30 | import org.junit.Test
31 |
32 | class StackSuite extends Assertions {
33 |
34 |   @Test def stackShouldPopValuesInLastInFirstOutOrder() {
35 | val stack = new Stack[Int]
36 | stack.push(1)
37 | stack.push(2)
38 | assert(stack.pop() === 2)
39 | assert(stack.pop() === 1)
40 | }
41 |
42 | @Test def stackShouldThrowNoSuchElementExceptionIfAnEmptyStackIsPopped() {
43 | val emptyStack = new Stack[String]
44 | intercept[NoSuchElementException] {
45 | emptyStack.pop()
46 | }
47 | }
48 | }
49 |
50 | /*
51 | Here's an example of a FunSuite with ShouldMatchers mixed in:
52 | */
53 | import org.scalatest.FunSuite
54 | import org.scalatest.matchers.ShouldMatchers
55 |
56 | import org.junit.runner.RunWith
57 | import org.scalatest.junit.JUnitRunner
58 | @RunWith(classOf[JUnitRunner])
59 | class ListSuite extends FunSuite with ShouldMatchers {
60 |
61 | test("An empty list should be empty") {
62 | List() should be ('empty)
63 | Nil should be ('empty)
64 | }
65 |
66 | test("A non-empty list should not be empty") {
67 | List(1, 2, 3) should not be ('empty)
68 | List("fee", "fie", "foe", "fum") should not be ('empty)
69 | }
70 |
71 | test("A list's length should equal the number of elements it contains") {
72 | List() should have length (0)
73 | List(1, 2) should have length (2)
74 | List("fee", "fie", "foe", "fum") should have length (4)
75 | }
76 | }
77 |
78 | /*
79 | ScalaTest also supports the behavior-driven development style, in which you
80 | combine tests with text that specifies the behavior being tested. Here's
81 | an example whose text output when run looks like:
82 |
83 | A Map
84 | - should only contain keys and values that were added to it
85 | - should report its size as the number of key/value pairs it contains
86 | */
87 | import org.scalatest.FunSpec
88 | import scala.collection.mutable.Stack
89 |
90 | class ExampleSpec extends FunSpec {
91 |
92 | describe("A Stack") {
93 |
94 | it("should pop values in last-in-first-out order") {
95 | val stack = new Stack[Int]
96 | stack.push(1)
97 | stack.push(2)
98 | assert(stack.pop() === 2)
99 | assert(stack.pop() === 1)
100 | }
101 |
102 | it("should throw NoSuchElementException if an empty stack is popped") {
103 | val emptyStack = new Stack[Int]
104 | intercept[NoSuchElementException] {
105 | emptyStack.pop()
106 | }
107 | }
108 | }
109 | }
110 |
--------------------------------------------------------------------------------
/src/test/scala/samples/specs.scala:
--------------------------------------------------------------------------------
1 | package samples
2 |
3 | import org.junit.runner.RunWith
4 | import org.specs2.mutable._
5 | import org.specs2.runner._
6 |
7 |
8 | /**
9 | * Sample specification.
10 | *
11 | * This specification can be executed with: scala -cp ${package}.SpecsTest
12 | * Or using maven: mvn test
13 | *
14 | * For more information on how to write or run specifications, please visit:
15 | * http://etorreborre.github.com/specs2/guide/org.specs2.guide.Runners.html
16 | *
17 | */
18 | @RunWith(classOf[JUnitRunner])
19 | class MySpecTest extends Specification {
20 | "The 'Hello world' string" should {
21 | "contain 11 characters" in {
22 | "Hello world" must have size(11)
23 | }
24 | "start with 'Hello'" in {
25 | "Hello world" must startWith("Hello")
26 | }
27 | "end with 'world'" in {
28 | "Hello world" must endWith("world")
29 | }
30 | }
31 | }
32 |
--------------------------------------------------------------------------------