├── .cache-main
├── .gitignore
├── README.md
├── inputFile
│   ├── join1
│   ├── join2
│   ├── lr_data.txt
│   ├── product
│   ├── random.data
│   ├── test.data
│   ├── test1
│   ├── test2.data
│   ├── testone.txt
│   ├── u.data
│   ├── user
│   ├── wordCount
│   └── wordCount2
├── lib
│   └── test-0.0.1-SNAPSHOT.jar
├── pom.xml
└── src
    ├── main
    │   └── scala
    │       ├── com
    │       │   ├── fun
    │       │   │   └── util
    │       │   │       ├── RDDOperateFunction.scala
    │       │   │       ├── SparkContextOperateFunction.scala
    │       │   │       ├── ZzyLmqDataOperateUtil.scala
    │       │   │       └── package.scala
    │       │   ├── spark
    │       │   │   ├── es
    │       │   │   │   ├── SparkLocalESTest.scala
    │       │   │   │   └── Test.scala
    │       │   │   ├── hbase
    │       │   │   │   ├── GetOutSiteSuNingPCToNewTable.scala
    │       │   │   │   ├── PutDataToHbase.scala
    │       │   │   │   ├── SparkGetHbaseToRdd.scala
    │       │   │   │   ├── SparkScanHbaseToRdd.scala
    │       │   │   │   └── hbasetest.scala
    │       │   │   ├── hive
    │       │   │   │   ├── CaseClass.scala
    │       │   │   │   ├── HiveContextTest.scala
    │       │   │   │   ├── SparkPhoenixLoadAndSaveTest.scala
    │       │   │   │   ├── SparkRddToHive.scala
    │       │   │   │   └── SparkToHive.scala
    │       │   │   ├── jdbcrdd
    │       │   │   │   ├── JdbcMysqlRDD.scala
    │       │   │   │   ├── SparkCSVTest.scala
    │       │   │   │   ├── SparkJdbcRDDTest.scala
    │       │   │   │   ├── SparkSecondarySortKey.scala
    │       │   │   │   └── package.scala
    │       │   │   ├── kafka
    │       │   │   │   ├── HashMapEncoder.scala
    │       │   │   │   ├── KafkaProducerCache.scala
    │       │   │   │   ├── RDDKafkaWriter.scala
    │       │   │   │   ├── SparkKafkaRDDReader.scala
    │       │   │   │   ├── SparkWriteDataToKafkaRunMain.scala
    │       │   │   │   └── package.scala
    │       │   │   ├── ml
    │       │   │   │   ├── ALSDemo.scala
    │       │   │   │   ├── ClassifierDemo.scala
    │       │   │   │   └── TestVector.scala
    │       │   │   ├── myrdd
    │       │   │   │   ├── CaseClassUtil.scala
    │       │   │   │   ├── ImplicitParameter.scala
    │       │   │   │   ├── MySelfRDD.scala
    │       │   │   │   ├── TestMain.scala
    │       │   │   │   └── package.scala
    │       │   │   ├── python
    │       │   │   │   └── TestPython.scala
    │       │   │   ├── scala
    │       │   │   │   ├── ImplicitClass.scala
    │       │   │   │   ├── ReflectScala.scala
    │       │   │   │   └── ScalaGramaer.scala
    │       │   │   ├── scalatest
    │       │   │   │   └── ScalaTest.scala
    │       │   │   ├── sparkSql
    │       │   │   │   ├── CaseClassUtil.scala
    │       │   │   │   ├── JavaUseScalaClass.scala
    │       │   │   │   ├── SparkListToDataFrame.scala
    │       │   │   │   └── SparkSQLDemo.scala
    │       │   │   ├── streaming
    │       │   │   │   ├── DataProducter.scala
    │       │   │   │   ├── DirectMysqlInputDStream.scala
    │       │   │   │   ├── JdbcSparkStreamRDD.scala
    │       │   │   │   ├── MapWithStateTest.scala
    │       │   │   │   ├── MysqlManager.scala
    │       │   │   │   ├── SpartStreamingTest.scala
    │       │   │   │   ├── UpdateStateByKeyTest.scala
    │       │   │   │   └── package.scala
    │       │   │   └── util
    │       │   │       ├── KafkaClusterManager.scala
    │       │   │       ├── SparkKryoRegistrators.scala
    │       │   │       └── SparkKryoSerializerTest.scala
    │       │   └── test
    │       │       ├── CheckHbaseDataWithMysql.scala
    │       │       ├── HbaseUtil.scala
    │       │       ├── HttpAsyncClientsTest.scala
    │       │       ├── JsonTest.scala
    │       │       ├── KafkaLogTest.scala
    │       │       ├── ReflectScala.scala
    │       │       ├── SparkWithLocalTest.scala
    │       │       ├── Test.scala
    │       │       ├── TestJava.java
    │       │       └── Utilities.scala
    │       ├── hdfs-site.xml
    │       ├── hive-site.xml
    │       └── log4j.properties
    └── test
        └── scala
            └── samples
                ├── junit.scala
                ├── scalatest.scala
                └── specs.scala
/.gitignore: -------------------------------------------------------------------------------- 1 | # use glob syntax. 2 | syntax: glob 3 | *.ser 4 | *.class 5 | *~ 6 | *.bak 7 | #*.off 8 | *.old 9 | 10 | # eclipse conf file 11 | .settings 12 | .classpath 13 | .project 14 | .manager 15 | .scala_dependencies 16 | 17 | # idea 18 | .idea 19 | *.iml 20 | 21 | # building 22 | target 23 | build 24 | null 25 | tmp* 26 | temp* 27 | dist 28 | test-output 29 | build.log 30 | 31 | # other scm 32 | .svn 33 | .CVS 34 | .hg* 35 | 36 | # switch to regexp syntax.
37 | # syntax: regexp 38 | # ^\.pc/ 39 | 40 | #SHITTY output not in target directory 41 | build.log 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spark-test 2 | Spark Version Test Code 3 | -------------------------------------------------------------------------------- /inputFile/join1: -------------------------------------------------------------------------------- 1 | 1 a a a 2 | 2 b b b 3 | 3 c c c -------------------------------------------------------------------------------- /inputFile/join2: -------------------------------------------------------------------------------- 1 | 1 aa aaa aaaa 2 | 2 bb bbb bbbb 3 | 4 vv vvv vvvv -------------------------------------------------------------------------------- /inputFile/product: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 -------------------------------------------------------------------------------- /inputFile/test.data: -------------------------------------------------------------------------------- 1 | 1,1,5.0 2 | 1,2,1.0 3 | 1,3,5.0 4 | 1,4,1.0 5 | 1,5,4.5 6 | 1,6,0.0 7 | 2,1,5.0 8 | 2,2,1.0 9 | 2,3,5.0 10 | 2,4,1.0 11 | 2,6,0.0 12 | 2,5,0.0 13 | 3,1,1.0 14 | 3,2,5.0 15 | 3,3,1.0 16 | 3,4,5.0 17 | 3,6,0.0 18 | 3,5,0.0 19 | 4,1,1.0 20 | 4,2,5.0 21 | 4,3,1.0 22 | 4,4,5.0 23 | 4,6,3.5 24 | 4,5,0.0 -------------------------------------------------------------------------------- /inputFile/test1: -------------------------------------------------------------------------------- 1 | 20160701,1,1,5000 2 | 20160701,1,2,20 3 | 20160701,1,3,100 4 | 20160701,2,1,2000 5 | 20160701,2,2,2000 6 | 20160701,2,3,2000 7 | 20160701,3,1,4000 8 | 20160701,3,2,3000 9 | 20160701,3,3,2000 10 | 20160701,4,1,1000 11 | 20160701,4,2,1000 12 | 20160701,4,3,1000 13 | 20160701,5,1,500 14 | 20160701,5,2,500 15 | 20160701,5,3,500 16 | 20160701,6,1,5000 17 | 20160701,6,2,5000 18 | 20160701,6,3,5000 19 | 20160702,1,1,5000 20 | 20160702,1,2,20 21 | 20160702,1,3,100 22 | 20160702,2,1,2000 23 | 20160702,2,2,2000 24 | 20160702,2,3,2000 25 | 20160702,3,1,4000 26 | 20160702,3,2,3000 27 | 20160702,3,3,2000 28 | 20160702,4,1,1000 29 | 20160702,4,2,1000 30 | 20160702,4,3,1000 31 | 20160702,5,1,500 32 | 20160702,5,2,500 33 | 20160702,5,3,500 34 | 20160702,6,1,5000 35 | 20160702,6,2,5000 36 | 20160702,6,3,5000 37 | 20160703,1,1,5000 38 | 20160703,1,2,20 39 | 20160703,1,3,100 40 | 20160703,2,1,2000 41 | 20160703,2,2,2000 42 | 20160703,2,3,2000 43 | 20160703,3,1,4000 44 | 20160703,3,2,3000 45 | 20160703,3,3,2000 46 | 20160703,4,1,1000 47 | 20160703,4,2,1000 48 | 20160703,4,3,1000 49 | 20160703,5,1,500 50 | 20160703,5,2,500 51 | 20160703,5,3,500 52 | 20160703,6,1,5000 53 | 20160703,6,2,5000 54 | 20160703,6,3,5000 55 | 20160704,1,1,5000 56 | 20160704,1,2,20 57 | 20160704,1,3,100 58 | 20160704,2,1,2000 59 | 20160704,2,2,2000 60 | 20160704,2,3,2000 61 | 20160704,3,1,4000 62 | 20160704,3,2,3000 63 | 20160704,3,3,2000 64 | 20160704,4,1,1000 65 | 20160704,4,2,1000 66 | 20160704,4,3,1000 67 | 20160704,5,1,500 68 | 20160704,5,2,500 69 | 20160704,5,3,500 70 | 20160704,6,1,5000 71 | 20160704,6,2,5000 72 | 20160704,6,3,5000 73 | 20160704,1,1,5000 74 | 20160704,1,2,20 75 | 20160704,1,3,100 76 | 20160704,2,1,2000 77 | 20160704,2,2,2000 78 | 20160704,2,3,2000 79 | 20160704,3,1,4000 80 | 20160704,3,2,3000 81 | 20160704,3,3,2000 82 | 20160704,4,1,1000 83 | 20160704,4,2,1000 
84 | 20160704,4,3,1000 85 | 20160704,5,1,500 86 | 20160704,5,2,500 87 | 20160704,5,3,500 88 | 20160704,6,1,5000 89 | 20160704,6,2,5000 90 | 20160704,6,3,5000 91 | 20160705,1,1,5000 92 | 20160705,1,2,20 93 | 20160705,1,3,100 94 | 20160705,2,1,2000 95 | 20160705,2,2,2000 96 | 20160705,2,3,2000 97 | 20160705,3,1,4000 98 | 20160705,3,2,3000 99 | 20160705,3,3,2000 100 | 20160705,4,1,1000 101 | 20160705,4,2,1000 102 | 20160705,4,3,1000 103 | 20160705,5,1,500 104 | 20160705,5,2,500 105 | 20160705,5,3,500 106 | 20160705,6,1,5000 107 | 20160705,6,2,5000 108 | 20160705,6,3,5000 109 | 20160705,1,1,5000 110 | 20160705,1,2,20 111 | 20160705,1,3,100 112 | 20160705,2,1,2000 113 | 20160705,2,2,2000 114 | 20160705,2,3,2000 115 | 20160705,3,1,4000 116 | 20160705,3,2,3000 117 | 20160705,3,3,2000 118 | 20160705,4,1,1000 119 | 20160705,4,2,1000 120 | 20160705,4,3,1000 121 | 20160705,5,1,500 122 | 20160705,5,2,500 123 | 20160705,5,3,500 124 | 20160705,6,1,5000 125 | 20160705,6,2,5000 126 | 20160705,6,3,5000 127 | 20160706,1,1,5000 128 | 20160706,1,2,20 129 | 20160706,1,3,100 130 | 20160706,2,1,2000 131 | 20160706,2,2,2000 132 | 20160706,2,3,2000 133 | 20160706,3,1,4000 134 | 20160706,3,2,3000 135 | 20160706,3,3,2000 136 | 20160706,4,1,1000 137 | 20160706,4,2,1000 138 | 20160706,4,3,1000 139 | 20160706,5,1,500 140 | 20160706,5,2,500 141 | 20160706,5,3,500 142 | 20160706,6,1,5000 143 | 20160706,6,2,5000 144 | 20160706,6,3,5000 145 | 20160707,1,1,5000 146 | 20160707,1,2,20 147 | 20160707,1,3,100 148 | 20160707,2,1,2000 149 | 20160707,2,2,2000 150 | 20160707,2,3,2000 151 | 20160707,3,1,4000 152 | 20160707,3,2,3000 153 | 20160707,3,3,2000 154 | 20160707,4,1,1000 155 | 20160707,4,2,1000 156 | 20160707,4,3,1000 157 | 20160707,5,1,500 158 | 20160707,5,2,500 159 | 20160707,5,3,500 160 | 20160707,6,1,5000 161 | 20160707,6,2,5000 162 | 20160707,6,3,5000 163 | 20160708,1,1,5000 164 | 20160708,1,2,20 165 | 20160708,1,3,100 166 | 20160708,2,1,2000 167 | 20160708,2,2,2000 168 | 20160708,2,3,2000 169 | 20160708,3,1,4000 170 | 20160708,3,2,3000 171 | 20160708,3,3,2000 172 | 20160708,4,1,1000 173 | 20160708,4,2,1000 174 | 20160708,4,3,1000 175 | 20160708,5,1,500 176 | 20160708,5,2,500 177 | 20160708,5,3,500 178 | 20160708,6,1,5000 179 | 20160708,6,2,5000 180 | 20160708,6,3,5000 181 | 20160709,1,1,5000 182 | 20160709,1,2,20 183 | 20160709,1,3,100 184 | 20160709,2,1,2000 185 | 20160709,2,2,2000 186 | 20160709,2,3,2000 187 | 20160709,3,1,4000 188 | 20160709,3,2,3000 189 | 20160709,3,3,2000 190 | 20160709,4,1,1000 191 | 20160709,4,2,1000 192 | 20160709,4,3,1000 193 | 20160709,5,1,500 194 | 20160709,5,2,500 195 | 20160709,5,3,500 196 | 20160709,6,1,5000 197 | 20160709,6,2,5000 198 | 20160709,6,3,5000 199 | 20160710,1,1,5000 200 | 20160710,1,2,20 201 | 20160710,1,3,100 202 | 20160710,2,1,2000 203 | 20160710,2,2,2000 204 | 20160710,2,3,2000 205 | 20160710,3,1,4000 206 | 20160710,3,2,3000 207 | 20160710,3,3,2000 208 | 20160710,4,1,1000 209 | 20160710,4,2,1000 210 | 20160710,4,3,1000 211 | 20160710,5,1,500 212 | 20160710,5,2,500 213 | 20160710,5,3,500 214 | 20160710,6,1,5000 215 | 20160710,6,2,5000 216 | 20160710,6,3,5000 217 | 20160711,1,1,5000 218 | 20160711,1,2,20 219 | 20160711,1,3,100 220 | 20160711,2,1,2000 221 | 20160711,2,2,2000 222 | 20160711,2,3,2000 223 | 20160711,3,1,4000 224 | 20160711,3,2,3000 225 | 20160711,3,3,2000 226 | 20160711,4,1,1000 227 | 20160711,4,2,1000 228 | 20160711,4,3,1000 229 | 20160711,5,1,500 230 | 20160711,5,2,500 231 | 20160711,5,3,500 232 | 20160711,6,1,5000 233 | 20160711,6,2,5000 234 | 
20160711,6,3,5000 235 | 20160712,1,1,5000 236 | 20160712,1,2,20 237 | 20160712,1,3,100 238 | 20160712,2,1,2000 239 | 20160712,2,2,2000 240 | 20160712,2,3,2000 241 | 20160712,3,1,4000 242 | 20160712,3,2,3000 243 | 20160712,3,3,2000 244 | 20160712,4,1,1000 245 | 20160712,4,2,1000 246 | 20160712,4,3,1000 247 | 20160712,5,1,500 248 | 20160712,5,2,500 249 | 20160712,5,3,500 250 | 20160712,6,1,5000 251 | 20160712,6,2,5000 252 | 20160712,6,3,5000 253 | 20160713,1,1,5000 254 | 20160713,1,2,20 255 | 20160713,1,3,100 256 | 20160713,2,1,2000 257 | 20160713,2,2,2000 258 | 20160713,2,3,2000 259 | 20160713,3,1,4000 260 | 20160713,3,2,3000 261 | 20160713,3,3,2000 262 | 20160713,4,1,1000 263 | 20160713,4,2,1000 264 | 20160713,4,3,1000 265 | 20160713,5,1,500 266 | 20160713,5,2,500 267 | 20160713,5,3,500 268 | 20160713,6,1,5000 269 | 20160713,6,2,5000 270 | 20160713,6,3,5000 271 | 20160714,1,1,5000 272 | 20160714,1,2,20 273 | 20160714,1,3,100 274 | 20160714,2,1,2000 275 | 20160714,2,2,2000 276 | 20160714,2,3,2000 277 | 20160714,3,1,4000 278 | 20160714,3,2,3000 279 | 20160714,3,3,2000 280 | 20160714,4,1,1000 281 | 20160714,4,2,1000 282 | 20160714,4,3,1000 283 | 20160714,5,1,500 284 | 20160714,5,2,500 285 | 20160714,5,3,500 286 | 20160714,6,1,5000 287 | 20160714,6,2,5000 288 | 20160714,6,3,5000 289 | 20160715,1,1,5000 290 | 20160715,1,2,20 291 | 20160715,1,3,100 292 | 20160715,2,1,2000 293 | 20160715,2,2,2000 294 | 20160715,2,3,2000 295 | 20160715,3,1,4000 296 | 20160715,3,2,3000 297 | 20160715,3,3,2000 298 | 20160715,4,1,1000 299 | 20160715,4,2,1000 300 | 20160715,4,3,1000 301 | 20160715,5,1,500 302 | 20160715,5,2,500 303 | 20160715,5,3,500 304 | 20160715,6,1,5000 305 | 20160715,6,2,5000 306 | 20160715,6,3,5000 307 | 20160716,1,1,5000 308 | 20160716,1,2,20 309 | 20160716,1,3,100 310 | 20160716,2,1,2000 311 | 20160716,2,2,2000 312 | 20160716,2,3,2000 313 | 20160716,3,1,4000 314 | 20160716,3,2,3000 315 | 20160716,3,3,2000 316 | 20160716,4,1,1000 317 | 20160716,4,2,1000 318 | 20160716,4,3,1000 319 | 20160716,5,1,500 320 | 20160716,5,2,500 321 | 20160716,5,3,500 322 | 20160716,6,1,5000 323 | 20160716,6,2,5000 324 | 20160716,6,3,5000 325 | 20160717,1,1,5000 326 | 20160717,1,2,20 327 | 20160717,1,3,100 328 | 20160717,2,1,2000 329 | 20160717,2,2,2000 330 | 20160717,2,3,2000 331 | 20160717,3,1,4000 332 | 20160717,3,2,3000 333 | 20160717,3,3,2000 334 | 20160717,4,1,1000 335 | 20160717,4,2,1000 336 | 20160717,4,3,1000 337 | 20160717,5,1,500 338 | 20160717,5,2,500 339 | 20160717,5,3,500 340 | 20160717,6,1,5000 341 | 20160717,6,2,5000 342 | 20160717,6,3,5000 343 | 20160718,1,1,5000 344 | 20160718,1,2,20 345 | 20160718,1,3,100 346 | 20160718,2,1,2000 347 | 20160718,2,2,2000 348 | 20160718,2,3,2000 349 | 20160718,3,1,4000 350 | 20160718,3,2,3000 351 | 20160718,3,3,2000 352 | 20160718,4,1,1000 353 | 20160718,4,2,1000 354 | 20160718,4,3,1000 355 | 20160718,5,1,500 356 | 20160718,5,2,500 357 | 20160718,5,3,500 358 | 20160718,6,1,5000 359 | 20160718,6,2,5000 360 | 20160718,6,3,5000 361 | 20160719,1,1,5000 362 | 20160719,1,2,20 363 | 20160719,1,3,100 364 | 20160719,2,1,2000 365 | 20160719,2,2,2000 366 | 20160719,2,3,2000 367 | 20160719,3,1,4000 368 | 20160719,3,2,3000 369 | 20160719,3,3,2000 370 | 20160719,4,1,1000 371 | 20160719,4,2,1000 372 | 20160719,4,3,1000 373 | 20160719,5,1,500 374 | 20160719,5,2,500 375 | 20160719,5,3,500 376 | 20160719,6,1,5000 377 | 20160719,6,2,5000 378 | 20160719,6,3,5000 379 | 20160720,1,1,5000 380 | 20160720,1,2,20 381 | 20160720,1,3,100 382 | 20160720,2,1,2000 383 | 20160720,2,2,2000 384 | 
20160720,2,3,2000 385 | 20160720,3,1,4000 386 | 20160720,3,2,3000 387 | 20160720,3,3,2000 388 | 20160720,4,1,1000 389 | 20160720,4,2,1000 390 | 20160720,4,3,1000 391 | 20160720,5,1,500 392 | 20160720,5,2,500 393 | 20160720,5,3,500 394 | 20160720,6,1,5000 395 | 20160720,6,2,5000 396 | 20160720,6,3,5000 397 | 20160721,1,1,5000 398 | 20160721,1,2,20 399 | 20160721,1,3,100 400 | 20160721,2,1,2000 401 | 20160721,2,2,2000 402 | 20160721,2,3,2000 403 | 20160721,3,1,4000 404 | 20160721,3,2,3000 405 | 20160721,3,3,2000 406 | 20160721,4,1,1000 407 | 20160721,4,2,1000 408 | 20160721,4,3,1000 409 | 20160721,5,1,500 410 | 20160721,5,2,500 411 | 20160721,5,3,500 412 | 20160721,6,1,5000 413 | 20160721,6,2,5000 414 | 20160721,6,3,5000 415 | 20160722,1,1,5000 416 | 20160722,1,2,20 417 | 20160722,1,3,100 418 | 20160722,2,1,2000 419 | 20160722,2,2,2000 420 | 20160722,2,3,2000 421 | 20160722,3,1,4000 422 | 20160722,3,2,3000 423 | 20160722,3,3,2000 424 | 20160722,4,1,1000 425 | 20160722,4,2,1000 426 | 20160722,4,3,1000 427 | 20160722,5,1,500 428 | 20160722,5,2,500 429 | 20160722,5,3,500 430 | 20160722,6,1,5000 431 | 20160722,6,2,5000 432 | 20160722,6,3,5000 433 | 20160723,1,1,5000 434 | 20160723,1,2,20 435 | 20160723,1,3,100 436 | 20160723,2,1,2000 437 | 20160723,2,2,2000 438 | 20160723,2,3,2000 439 | 20160723,3,1,4000 440 | 20160723,3,2,3000 441 | 20160723,3,3,2000 442 | 20160723,4,1,1000 443 | 20160723,4,2,1000 444 | 20160723,4,3,1000 445 | 20160723,5,1,500 446 | 20160723,5,2,500 447 | 20160723,5,3,500 448 | 20160723,6,1,5000 449 | 20160723,6,2,5000 450 | 20160723,6,3,5000 451 | 20160724,1,1,5000 452 | 20160724,1,2,20 453 | 20160724,1,3,100 454 | 20160724,2,1,2000 455 | 20160724,2,2,2000 456 | 20160724,2,3,2000 457 | 20160724,3,1,4000 458 | 20160724,3,2,3000 459 | 20160724,3,3,2000 460 | 20160724,4,1,1000 461 | 20160724,4,2,1000 462 | 20160724,4,3,1000 463 | 20160724,5,1,500 464 | 20160724,5,2,500 465 | 20160724,5,3,500 466 | 20160724,6,1,5000 467 | 20160724,6,2,5000 468 | 20160724,6,3,5000 469 | 20160725,1,1,5000 470 | 20160725,1,2,20 471 | 20160725,1,3,100 472 | 20160725,2,1,2000 473 | 20160725,2,2,2000 474 | 20160725,2,3,2000 475 | 20160725,3,1,4000 476 | 20160725,3,2,3000 477 | 20160725,3,3,2000 478 | 20160725,4,1,1000 479 | 20160725,4,2,1000 480 | 20160725,4,3,1000 481 | 20160725,5,1,500 482 | 20160725,5,2,500 483 | 20160725,5,3,500 484 | 20160725,6,1,5000 485 | 20160725,6,2,5000 486 | 20160725,6,3,5000 487 | 20160726,1,1,5000 488 | 20160726,1,2,20 489 | 20160726,1,3,100 490 | 20160726,2,1,2000 491 | 20160726,2,2,2000 492 | 20160726,2,3,2000 493 | 20160726,3,1,4000 494 | 20160726,3,2,3000 495 | 20160726,3,3,2000 496 | 20160726,4,1,1000 497 | 20160726,4,2,1000 498 | 20160726,4,3,1000 499 | 20160726,5,1,500 500 | 20160726,5,2,500 501 | 20160726,5,3,500 502 | 20160726,6,1,5000 503 | 20160726,6,2,5000 504 | 20160726,6,3,5000 505 | 20160727,1,1,5000 506 | 20160727,1,2,20 507 | 20160727,1,3,100 508 | 20160727,2,1,2000 509 | 20160727,2,2,2000 510 | 20160727,2,3,2000 511 | 20160727,3,1,4000 512 | 20160727,3,2,3000 513 | 20160727,3,3,2000 514 | 20160727,4,1,1000 515 | 20160727,4,2,1000 516 | 20160727,4,3,1000 517 | 20160727,5,1,500 518 | 20160727,5,2,500 519 | 20160727,5,3,500 520 | 20160727,6,1,5000 521 | 20160727,6,2,5000 522 | 20160727,6,3,5000 523 | 20160728,1,1,5000 524 | 20160728,1,2,20 525 | 20160728,1,3,100 526 | 20160728,2,1,2000 527 | 20160728,2,2,2000 528 | 20160728,2,3,2000 529 | 20160728,3,1,4000 530 | 20160728,3,2,3000 531 | 20160728,3,3,2000 532 | 20160728,4,1,1000 533 | 20160728,4,2,1000 534 | 
20160728,4,3,1000 535 | 20160728,5,1,500 536 | 20160728,5,2,500 537 | 20160728,5,3,500 538 | 20160728,6,1,5000 539 | 20160728,6,2,5000 540 | 20160728,6,3,5000 541 | 20160729,1,1,5000 542 | 20160729,1,2,20 543 | 20160729,1,3,100 544 | 20160729,2,1,2000 545 | 20160729,2,2,2000 546 | 20160729,2,3,2000 547 | 20160729,3,1,4000 548 | 20160729,3,2,3000 549 | 20160729,3,3,2000 550 | 20160729,4,1,1000 551 | 20160729,4,2,1000 552 | 20160729,4,3,1000 553 | 20160729,5,1,500 554 | 20160729,5,2,500 555 | 20160729,5,3,500 556 | 20160729,6,1,5000 557 | 20160729,6,2,5000 558 | 20160729,6,3,5000 559 | 20160730,1,1,5000 560 | 20160730,1,2,20 561 | 20160730,1,3,100 562 | 20160730,2,1,2000 563 | 20160730,2,2,2000 564 | 20160730,2,3,2000 565 | 20160730,3,1,4000 566 | 20160730,3,2,3000 567 | 20160730,3,3,2000 568 | 20160730,4,1,1000 569 | 20160730,4,2,1000 570 | 20160730,4,3,1000 571 | 20160730,5,1,500 572 | 20160730,5,2,500 573 | 20160730,5,3,500 574 | 20160730,6,1,5000 575 | 20160730,6,2,5000 576 | 20160730,6,3,5000 577 | 20160731,1,1,5000 578 | 20160731,1,2,20 579 | 20160731,1,3,100 580 | 20160731,2,1,2000 581 | 20160731,2,2,2000 582 | 20160731,2,3,2000 583 | 20160731,3,1,4000 584 | 20160731,3,2,3000 585 | 20160731,3,3,2000 586 | 20160731,4,1,1000 587 | 20160731,4,2,1000 588 | 20160731,4,3,1000 589 | 20160731,5,1,500 590 | 20160731,5,2,500 591 | 20160731,5,3,500 592 | 20160731,6,1,5000 593 | 20160731,6,2,5000 594 | 20160731,6,3,5000 595 | -------------------------------------------------------------------------------- /inputFile/test2.data: -------------------------------------------------------------------------------- 1 | 1,1,5.0 2 | 1,2,1.0 3 | 1,3,5.0 4 | 1,4,1.0 5 | 1,7,2.7 6 | 2,1,5.0 7 | 2,2,1.0 8 | 2,3,5.0 9 | 2,4,1.0 10 | 3,1,1.0 11 | 3,2,5.0 12 | 3,3,1.0 13 | 3,4,5.0 14 | 4,1,1.0 15 | 4,2,5.0 16 | 4,3,1.0 17 | 4,4,5.0 18 | 4,9,4.5 19 | 4,6,1.0 20 | 4,5,3.3 -------------------------------------------------------------------------------- /inputFile/testone.txt: -------------------------------------------------------------------------------- 1 | 20160701,1,1,5000 2 | 20160701,1,2,20 3 | 20160701,1,3,100 4 | 20160701,2,1,2000 5 | 20160701,2,2,2000 6 | 20160701,2,3,2000 7 | 20160701,3,1,4000 8 | 20160701,3,2,3000 9 | 20160701,3,3,2000 10 | 20160701,4,1,1000 11 | 20160701,4,2,1000 12 | 20160701,4,3,1000 13 | 20160701,5,1,500 14 | 20160701,5,2,500 15 | 20160701,5,3,500 16 | 20160701,6,1,5000 17 | 20160701,6,2,5000 18 | 20160701,6,3,5000 19 | 20160702,1,1,5000 20 | 20160702,1,2,20 21 | 20160702,1,3,100 22 | 20160702,2,1,2000 23 | 20160702,2,2,2000 24 | 20160702,2,3,2000 25 | 20160702,3,1,4000 26 | 20160702,3,2,3000 27 | 20160702,3,3,2000 28 | 20160702,4,1,1000 29 | 20160702,4,2,1000 30 | 20160702,4,3,1000 31 | 20160702,5,1,500 32 | 20160702,5,2,500 33 | 20160702,5,3,500 34 | 20160702,6,1,5000 35 | 20160702,6,2,5000 36 | 20160702,6,3,5000 37 | 20160703,1,1,5000 38 | 20160703,1,2,20 39 | 20160703,1,3,100 40 | 20160703,2,1,2000 41 | 20160703,2,2,2000 42 | 20160703,2,3,2000 43 | 20160703,3,1,4000 44 | 20160703,3,2,3000 45 | 20160703,3,3,2000 46 | 20160703,4,1,1000 47 | 20160703,4,2,1000 48 | 20160703,4,3,1000 49 | 20160703,5,1,500 50 | 20160703,5,2,500 51 | 20160703,5,3,500 52 | 20160703,6,1,5000 53 | 20160703,6,2,5000 54 | 20160703,6,3,5000 55 | 20160704,1,1,5000 56 | 20160704,1,2,20 57 | 20160704,1,3,100 58 | 20160704,2,1,2000 59 | 20160704,2,2,2000 60 | 20160704,2,3,2000 61 | 20160704,3,1,4000 62 | 20160704,3,2,3000 63 | 20160704,3,3,2000 64 | 20160704,4,1,1000 65 | 20160704,4,2,1000 66 | 20160704,4,3,1000 
67 | 20160704,5,1,500 68 | 20160704,5,2,500 69 | 20160704,5,3,500 70 | 20160704,6,1,5000 71 | 20160704,6,2,5000 72 | 20160704,6,3,5000 73 | 20160705,1,1,5000 74 | 20160705,1,2,20 75 | 20160705,1,3,100 76 | 20160705,2,1,2000 77 | 20160705,2,2,2000 78 | 20160705,2,3,2000 79 | 20160705,3,1,4000 80 | 20160705,3,2,3000 81 | 20160705,3,3,2000 82 | 20160705,4,1,1000 83 | 20160705,4,2,1000 84 | 20160705,4,3,1000 85 | 20160705,5,1,500 86 | 20160705,5,2,500 87 | 20160705,5,3,500 88 | 20160705,6,1,5000 89 | 20160705,6,2,5000 90 | 20160705,6,3,5000 91 | 20160706,1,1,5000 92 | 20160706,1,2,20 93 | 20160706,1,3,100 94 | 20160706,2,1,2000 95 | 20160706,2,2,2000 96 | 20160706,2,3,2000 97 | 20160706,3,1,4000 98 | 20160706,3,2,3000 99 | 20160706,3,3,2000 100 | 20160706,4,1,1000 101 | 20160706,4,2,1000 102 | 20160706,4,3,1000 103 | 20160706,5,1,500 104 | 20160706,5,2,500 105 | 20160706,5,3,500 106 | 20160706,6,1,5000 107 | 20160706,6,2,5000 108 | 20160706,6,3,5000 109 | 20160707,1,1,5000 110 | 20160707,1,2,20 111 | 20160707,1,3,100 112 | 20160707,2,1,2000 113 | 20160707,2,2,2000 114 | 20160707,2,3,2000 115 | 20160707,3,1,4000 116 | 20160707,3,2,3000 117 | 20160707,3,3,2000 118 | 20160707,4,1,1000 119 | 20160707,4,2,1000 120 | 20160707,4,3,1000 121 | 20160707,5,1,500 122 | 20160707,5,2,500 123 | 20160707,5,3,500 124 | 20160707,6,1,5000 125 | 20160707,6,2,5000 126 | 20160707,6,3,5000 127 | 20160708,1,1,5000 128 | 20160708,1,2,20 129 | 20160708,1,3,100 130 | 20160708,2,1,2000 131 | 20160708,2,2,2000 132 | 20160708,2,3,2000 133 | 20160708,3,1,4000 134 | 20160708,3,2,3000 135 | 20160708,3,3,2000 136 | 20160708,4,1,1000 137 | 20160708,4,2,1000 138 | 20160708,4,3,1000 139 | 20160708,5,1,500 140 | 20160708,5,2,500 141 | 20160708,5,3,500 142 | 20160708,6,1,5000 143 | 20160708,6,2,5000 144 | 20160708,6,3,5000 145 | 20160709,1,1,5000 146 | 20160709,1,2,20 147 | 20160709,1,3,100 148 | 20160709,2,1,2000 149 | 20160709,2,2,2000 150 | 20160709,2,3,2000 151 | 20160709,3,1,4000 152 | 20160709,3,2,3000 153 | 20160709,3,3,2000 154 | 20160709,4,1,1000 155 | 20160709,4,2,1000 156 | 20160709,4,3,1000 157 | 20160709,5,1,500 158 | 20160709,5,2,500 159 | 20160709,5,3,500 160 | 20160709,6,1,5000 161 | 20160709,6,2,5000 162 | 20160709,6,3,5000 163 | 20160710,1,1,5000 164 | 20160710,1,2,20 165 | 20160710,1,3,100 166 | 20160710,2,1,2000 167 | 20160710,2,2,2000 168 | 20160710,2,3,2000 169 | 20160710,3,1,4000 170 | 20160710,3,2,3000 171 | 20160710,3,3,2000 172 | 20160710,4,1,1000 173 | 20160710,4,2,1000 174 | 20160710,4,3,1000 175 | 20160710,5,1,500 176 | 20160710,5,2,500 177 | 20160710,5,3,500 178 | 20160710,6,1,5000 179 | 20160710,6,2,5000 180 | 20160710,6,3,5000 181 | 20160711,1,1,5000 182 | 20160711,1,2,20 183 | 20160711,1,3,100 184 | 20160711,2,1,2000 185 | 20160711,2,2,2000 186 | 20160711,2,3,2000 187 | 20160711,3,1,4000 188 | 20160711,3,2,3000 189 | 20160711,3,3,2000 190 | 20160711,4,1,1000 191 | 20160711,4,2,1000 192 | 20160711,4,3,1000 193 | 20160711,5,1,500 194 | 20160711,5,2,500 195 | 20160711,5,3,500 196 | 20160711,6,1,5000 197 | 20160711,6,2,5000 198 | 20160711,6,3,5000 199 | 20160712,1,1,5000 200 | 20160712,1,2,20 201 | 20160712,1,3,100 202 | 20160712,2,1,2000 203 | 20160712,2,2,2000 204 | 20160712,2,3,2000 205 | 20160712,3,1,4000 206 | 20160712,3,2,3000 207 | 20160712,3,3,2000 208 | 20160712,4,1,1000 209 | 20160712,4,2,1000 210 | 20160712,4,3,1000 211 | 20160712,5,1,500 212 | 20160712,5,2,500 213 | 20160712,5,3,500 214 | 20160712,6,1,5000 215 | 20160712,6,2,5000 216 | 20160712,6,3,5000 217 | 20160713,1,1,5000 218 | 
20160713,1,2,20 219 | 20160713,1,3,100 220 | 20160713,2,1,2000 221 | 20160713,2,2,2000 222 | 20160713,2,3,2000 223 | 20160713,3,1,4000 224 | 20160713,3,2,3000 225 | 20160713,3,3,2000 226 | 20160713,4,1,1000 227 | 20160713,4,2,1000 228 | 20160713,4,3,1000 229 | 20160713,5,1,500 230 | 20160713,5,2,500 231 | 20160713,5,3,500 232 | 20160713,6,1,5000 233 | 20160713,6,2,5000 234 | 20160713,6,3,5000 235 | 20160714,1,1,5000 236 | 20160714,1,2,20 237 | 20160714,1,3,100 238 | 20160714,2,1,2000 239 | 20160714,2,2,2000 240 | 20160714,2,3,2000 241 | 20160714,3,1,4000 242 | 20160714,3,2,3000 243 | 20160714,3,3,2000 244 | 20160714,4,1,1000 245 | 20160714,4,2,1000 246 | 20160714,4,3,1000 247 | 20160714,5,1,500 248 | 20160714,5,2,500 249 | 20160714,5,3,500 250 | 20160714,6,1,5000 251 | 20160714,6,2,5000 252 | 20160714,6,3,5000 253 | 20160715,1,1,5000 254 | 20160715,1,2,20 255 | 20160715,1,3,100 256 | 20160715,2,1,2000 257 | 20160715,2,2,2000 258 | 20160715,2,3,2000 259 | 20160715,3,1,4000 260 | 20160715,3,2,3000 261 | 20160715,3,3,2000 262 | 20160715,4,1,1000 263 | 20160715,4,2,1000 264 | 20160715,4,3,1000 265 | 20160715,5,1,500 266 | 20160715,5,2,500 267 | 20160715,5,3,500 268 | 20160715,6,1,5000 269 | 20160715,6,2,5000 270 | 20160715,6,3,5000 271 | 20160716,1,1,5000 272 | 20160716,1,2,20 273 | 20160716,1,3,100 274 | 20160716,2,1,2000 275 | 20160716,2,2,2000 276 | 20160716,2,3,2000 277 | 20160716,3,1,4000 278 | 20160716,3,2,3000 279 | 20160716,3,3,2000 280 | 20160716,4,1,1000 281 | 20160716,4,2,1000 282 | 20160716,4,3,1000 283 | 20160716,5,1,500 284 | 20160716,5,2,500 285 | 20160716,5,3,500 286 | 20160716,6,1,5000 287 | 20160716,6,2,5000 288 | 20160716,6,3,5000 289 | 20160717,1,1,5000 290 | 20160717,1,2,20 291 | 20160717,1,3,100 292 | 20160717,2,1,2000 293 | 20160717,2,2,2000 294 | 20160717,2,3,2000 295 | 20160717,3,1,4000 296 | 20160717,3,2,3000 297 | 20160717,3,3,2000 298 | 20160717,4,1,1000 299 | 20160717,4,2,1000 300 | 20160717,4,3,1000 301 | 20160717,5,1,500 302 | 20160717,5,2,500 303 | 20160717,5,3,500 304 | 20160717,6,1,5000 305 | 20160717,6,2,5000 306 | 20160717,6,3,5000 307 | 20160718,1,1,5000 308 | 20160718,1,2,20 309 | 20160718,1,3,100 310 | 20160718,2,1,2000 311 | 20160718,2,2,2000 312 | 20160718,2,3,2000 313 | 20160718,3,1,4000 314 | 20160718,3,2,3000 315 | 20160718,3,3,2000 316 | 20160718,4,1,1000 317 | 20160718,4,2,1000 318 | 20160718,4,3,1000 319 | 20160718,5,1,500 320 | 20160718,5,2,500 321 | 20160718,5,3,500 322 | 20160718,6,1,5000 323 | 20160718,6,2,5000 324 | 20160718,6,3,5000 325 | 20160719,1,1,5000 326 | 20160719,1,2,20 327 | 20160719,1,3,100 328 | 20160719,2,1,2000 329 | 20160719,2,2,2000 330 | 20160719,2,3,2000 331 | 20160719,3,1,4000 332 | 20160719,3,2,3000 333 | 20160719,3,3,2000 334 | 20160719,4,1,1000 335 | 20160719,4,2,1000 336 | 20160719,4,3,1000 337 | 20160719,5,1,500 338 | 20160719,5,2,500 339 | 20160719,5,3,500 340 | 20160719,6,1,5000 341 | 20160719,6,2,5000 342 | 20160719,6,3,5000 343 | 20160720,1,1,5000 344 | 20160720,1,2,20 345 | 20160720,1,3,100 346 | 20160720,2,1,2000 347 | 20160720,2,2,2000 348 | 20160720,2,3,2000 349 | 20160720,3,1,4000 350 | 20160720,3,2,3000 351 | 20160720,3,3,2000 352 | 20160720,4,1,1000 353 | 20160720,4,2,1000 354 | 20160720,4,3,1000 355 | 20160720,5,1,500 356 | 20160720,5,2,500 357 | 20160720,5,3,500 358 | 20160720,6,1,5000 359 | 20160720,6,2,5000 360 | 20160720,6,3,5000 361 | 20160721,1,1,5000 362 | 20160721,1,2,20 363 | 20160721,1,3,100 364 | 20160721,2,1,2000 365 | 20160721,2,2,2000 366 | 20160721,2,3,2000 367 | 20160721,3,1,4000 368 | 
20160721,3,2,3000 369 | 20160721,3,3,2000 370 | 20160721,4,1,1000 371 | 20160721,4,2,1000 372 | 20160721,4,3,1000 373 | 20160721,5,1,500 374 | 20160721,5,2,500 375 | 20160721,5,3,500 376 | 20160721,6,1,5000 377 | 20160721,6,2,5000 378 | 20160721,6,3,5000 379 | 20160722,1,1,5000 380 | 20160722,1,2,20 381 | 20160722,1,3,100 382 | 20160722,2,1,2000 383 | 20160722,2,2,2000 384 | 20160722,2,3,2000 385 | 20160722,3,1,4000 386 | 20160722,3,2,3000 387 | 20160722,3,3,2000 388 | 20160722,4,1,1000 389 | 20160722,4,2,1000 390 | 20160722,4,3,1000 391 | 20160722,5,1,500 392 | 20160722,5,2,500 393 | 20160722,5,3,500 394 | 20160722,6,1,5000 395 | 20160722,6,2,5000 396 | 20160722,6,3,5000 397 | 20160723,1,1,5000 398 | 20160723,1,2,20 399 | 20160723,1,3,100 400 | 20160723,2,1,2000 401 | 20160723,2,2,2000 402 | 20160723,2,3,2000 403 | 20160723,3,1,4000 404 | 20160723,3,2,3000 405 | 20160723,3,3,2000 406 | 20160723,4,1,1000 407 | 20160723,4,2,1000 408 | 20160723,4,3,1000 409 | 20160723,5,1,500 410 | 20160723,5,2,500 411 | 20160723,5,3,500 412 | 20160723,6,1,5000 413 | 20160723,6,2,5000 414 | 20160723,6,3,5000 415 | 20160724,1,1,5000 416 | 20160724,1,2,20 417 | 20160724,1,3,100 418 | 20160724,2,1,2000 419 | 20160724,2,2,2000 420 | 20160724,2,3,2000 421 | 20160724,3,1,4000 422 | 20160724,3,2,3000 423 | 20160724,3,3,2000 424 | 20160724,4,1,1000 425 | 20160724,4,2,1000 426 | 20160724,4,3,1000 427 | 20160724,5,1,500 428 | 20160724,5,2,500 429 | 20160724,5,3,500 430 | 20160724,6,1,5000 431 | 20160724,6,2,5000 432 | 20160724,6,3,5000 433 | 20160725,1,1,5000 434 | 20160725,1,2,20 435 | 20160725,1,3,100 436 | 20160725,2,1,2000 437 | 20160725,2,2,2000 438 | 20160725,2,3,2000 439 | 20160725,3,1,4000 440 | 20160725,3,2,3000 441 | 20160725,3,3,2000 442 | 20160725,4,1,1000 443 | 20160725,4,2,1000 444 | 20160725,4,3,1000 445 | 20160725,5,1,500 446 | 20160725,5,2,500 447 | 20160725,5,3,500 448 | 20160725,6,1,5000 449 | 20160725,6,2,5000 450 | 20160725,6,3,5000 451 | 20160726,1,1,5000 452 | 20160726,1,2,20 453 | 20160726,1,3,100 454 | 20160726,2,1,2000 455 | 20160726,2,2,2000 456 | 20160726,2,3,2000 457 | 20160726,3,1,4000 458 | 20160726,3,2,3000 459 | 20160726,3,3,2000 460 | 20160726,4,1,1000 461 | 20160726,4,2,1000 462 | 20160726,4,3,1000 463 | 20160726,5,1,500 464 | 20160726,5,2,500 465 | 20160726,5,3,500 466 | 20160726,6,1,5000 467 | 20160726,6,2,5000 468 | 20160726,6,3,5000 469 | 20160727,1,1,5000 470 | 20160727,1,2,20 471 | 20160727,1,3,100 472 | 20160727,2,1,2000 473 | 20160727,2,2,2000 474 | 20160727,2,3,2000 475 | 20160727,3,1,4000 476 | 20160727,3,2,3000 477 | 20160727,3,3,2000 478 | 20160727,4,1,1000 479 | 20160727,4,2,1000 480 | 20160727,4,3,1000 481 | 20160727,5,1,500 482 | 20160727,5,2,500 483 | 20160727,5,3,500 484 | 20160727,6,1,5000 485 | 20160727,6,2,5000 486 | 20160727,6,3,5000 487 | 20160728,1,1,5000 488 | 20160728,1,2,20 489 | 20160728,1,3,100 490 | 20160728,2,1,2000 491 | 20160728,2,2,2000 492 | 20160728,2,3,2000 493 | 20160728,3,1,4000 494 | 20160728,3,2,3000 495 | 20160728,3,3,2000 496 | 20160728,4,1,1000 497 | 20160728,4,2,1000 498 | 20160728,4,3,1000 499 | 20160728,5,1,500 500 | 20160728,5,2,500 501 | 20160728,5,3,500 502 | 20160728,6,1,5000 503 | 20160728,6,2,5000 504 | 20160728,6,3,5000 505 | 20160729,1,1,5000 506 | 20160729,1,2,20 507 | 20160729,1,3,100 508 | 20160729,2,1,2000 509 | 20160729,2,2,2000 510 | 20160729,2,3,2000 511 | 20160729,3,1,4000 512 | 20160729,3,2,3000 513 | 20160729,3,3,2000 514 | 20160729,4,1,1000 515 | 20160729,4,2,1000 516 | 20160729,4,3,1000 517 | 20160729,5,1,500 518 | 
20160729,5,2,500 519 | 20160729,5,3,500 520 | 20160729,6,1,5000 521 | 20160729,6,2,5000 522 | 20160729,6,3,5000 523 | 20160730,1,1,5000 524 | 20160730,1,2,20 525 | 20160730,1,3,100 526 | 20160730,2,1,2000 527 | 20160730,2,2,2000 528 | 20160730,2,3,2000 529 | 20160730,3,1,4000 530 | 20160730,3,2,3000 531 | 20160730,3,3,2000 532 | 20160730,4,1,1000 533 | 20160730,4,2,1000 534 | 20160730,4,3,1000 535 | 20160730,5,1,500 536 | 20160730,5,2,500 537 | 20160730,5,3,500 538 | 20160730,6,1,5000 539 | 20160730,6,2,5000 540 | 20160730,6,3,5000 541 | 20160731,1,1,5000 542 | 20160731,1,2,20 543 | 20160731,1,3,100 544 | 20160731,2,1,2000 545 | 20160731,2,2,2000 546 | 20160731,2,3,2000 547 | 20160731,3,1,4000 548 | 20160731,3,2,3000 549 | 20160731,3,3,2000 550 | 20160731,4,1,1000 551 | 20160731,4,2,1000 552 | 20160731,4,3,1000 553 | 20160731,5,1,500 554 | 20160731,5,2,500 555 | 20160731,5,3,500 556 | 20160731,6,1,5000 557 | 20160731,6,2,5000 558 | 20160731,6,3,5000 559 | -------------------------------------------------------------------------------- /inputFile/user: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 -------------------------------------------------------------------------------- /inputFile/wordCount: -------------------------------------------------------------------------------- 1 | package com.spark.scala 2 | 3 | import java.util.ArrayList 4 | import scala.collection.mutable.ArrayBuffer 5 | import scala.collection.JavaConversions._ 6 | import scala.collection.mutable.Map 7 | import java.util.HashMap 8 | import scala.io.Source 9 | import java.io.File 10 | import scala.collection.Iterator 11 | 12 | 13 | object ScalaGramaer { 14 | var list=new ArrayList[String] 15 | def main(args: Array[String]): Unit = { 16 | //listGrammer() 17 | //mapGrammer() 18 | //tupleGrammer() 19 | fileGrammer() 20 | } 21 | /** 22 | * scala集合操作 23 | * 1.想要使用java的集合,需要导入 24 | * import scala.collection.JavaConversions._ 25 | * 会内部将java的集合转换为scala的集合 26 | * 2.java的集合和scala的集合不能显式转换,但是可以隐式转换,如,SparkContext.parallelize(data) 27 | * 需要的是一个scala的data,但是可以传一个java的集合 28 | */ 29 | def fileGrammer(){ 30 | // var file=Source.fromFile("D:\\tmp\\input\\smy_biz_dil\\part-m-00000", "utf-8") 31 | //var file=Source.fromURL("http://www.baidu.com", "utf-8") 32 | // file.getLines.foreach { println }; 33 | //bian li mulu 34 | /*walk(new File("D:\\tmp\\input\\")) 35 | list.foreach { println }*/ 36 | } 37 | 38 | //遍历路径下所有的文件 39 | def walk(file:File){ 40 | if(file.isDirectory()) file.listFiles().foreach (walk) else list.add(file.getPath()) 41 | } 42 | def readAllfiles(dir:File):Iterator[File]={ 43 | //scan a dir return all file 44 | var child=dir.listFiles().filter { _.isDirectory()} 45 | child.toIterator++child.toIterator.flatMap { readAllfiles _ } 46 | } 47 | def listGrammer(){ 48 | //遍历集合,可以有下标无下标 49 | var list=new ArrayList[String]();list.add("s") 50 | for(value<- list) println(value) 51 | for(i<- 0.until(list.length))println(list(i)) 52 | for(i<- 0 until list.length)println(list(i)) 53 | 54 | } 55 | def mapGrammer(){ 56 | //mutable可变的 57 | var map=Map("a"->1,"b"->2) 58 | println(map("a")) 59 | //用get返回的是一个option 60 | println(map.get("b")) 61 | println(map.get("c")) 62 | //改变一个key的值 63 | map("a")=6 64 | println(map("a")) 65 | //新增一个值 66 | map+="c"->3 67 | println(map("c")) 68 | //移除一个值 69 | map-="c" 70 | println(map.getOrElse("c", "无这个key")) 71 | //如果有这个key就返回key的值 72 | println(map.getOrElse("null", "无这个key")) 73 | 74 | //遍历一个map 75 | println("遍历一个map") 76 | for((k,value)<-map){ 77 
| println(k+":"+value) 78 | } 79 | println("遍历一个map的key") 80 | for(k<-map.keySet){ 81 | println(k) 82 | } 83 | 84 | } 85 | def tupleGrammer(){ 86 | //元祖类型Tuple可以是多元的 87 | var tuple1=(1) 88 | var tuple2=("1",2) 89 | var tuple3=("1",2,"3") 90 | var tuple4=("1",2,"3",4) 91 | println(tuple3._3) 92 | 93 | } 94 | 95 | /** 96 | * @author Administrator 97 | */ 98 | class Person(n:String) { 99 | //必须初始化属性 100 | var name=n; 101 | var age=0; 102 | var address=""; 103 | //这是一个辅助构造器,scala的构造器必须以另一个构造器为起点,否则报错 104 | def this(name:String,age:Int){ 105 | this(name) 106 | this.age=age 107 | } 108 | def this(name:String,age:Int,address:String){ 109 | this(name,age) 110 | this.address=address 111 | } 112 | } 113 | 114 | } 115 | 116 | -------------------------------------------------------------------------------- /inputFile/wordCount2: -------------------------------------------------------------------------------- 1 | >>>>>>>>>>>>>>>>>>.. 2 | >>>>>>>>>>>>>>>>>>> 3 | >>>>>>>>>>>>>>>>>> 4 | >>>>>>>>>>>>>>>>>>>>> 5 | >>>>>>>>>>>>>>>> -------------------------------------------------------------------------------- /lib/test-0.0.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinMingQiang/spark-learn/4fd2466b9d339b2ac77003bd4f7b772489e314aa/lib/test-0.0.1-SNAPSHOT.jar -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | Spark 5 | Spark 6 | 0.0.1-SNAPSHOT 7 | ${project.artifactId} 8 | My wonderfull scala app 9 | 2015 10 | 11 | 12 | My License 13 | http://.... 14 | repo 15 | 16 | 17 | 18 | 19 | 1.6 20 | 1.6 21 | UTF-8 22 | 2.10.6 23 | 2.10 24 | 25 | 26 | 27 | 28 | sqlline 29 | sqlline 30 | 1.1.9 31 | 32 | 33 | jdk.tools 34 | jdk.tools 35 | 1.7 36 | system 37 | C:\Program Files\Java\jdk1.7.0_79\lib\tools.jar 38 | 39 | 40 | 42 | 43 | org.apache.httpcomponents 44 | httpasyncclient 45 | 4.0 46 | 47 | 48 | 49 | net.sf.json-lib 50 | json-lib 51 | 2.4 52 | 53 | 54 | 55 | org.elasticsearch 56 | elasticsearch 57 | 2.0.1 58 | 59 | 60 | 61 | org.apache.hbase 62 | hbase-server 63 | 1.2.0-cdh5.7.5 64 | 65 | 66 | hadoop-common 67 | org.apache.hadoop 68 | 69 | 70 | slf4j-log4j12 71 | org.slf4j 72 | 73 | 74 | 75 | 76 | 77 | 78 | org.apache.hive.hcatalog 79 | hive-hcatalog-core 80 | 1.1.0-cdh5.7.5 81 | 82 | 83 | 84 | org.apache.phoenix 85 | phoenix-spark 86 | 4.8.0-HBase-1.2 87 | 88 | 89 | 90 | 91 | com.databricks 92 | spark-csv_2.10 93 | 1.0.3 94 | 95 | 96 | 97 | org.apache.spark 98 | spark-streaming_2.10 99 | 1.6.0-cdh5.7.5 100 | 101 | 102 | 103 | org.apache.spark 104 | spark-core_2.10 105 | 1.6.0-cdh5.7.5 106 | 107 | 108 | javax.servlet 109 | org.eclipse.jetty.orbit 110 | 111 | 112 | slf4j-log4j12 113 | org.slf4j 114 | 115 | 116 | 117 | 118 | 119 | org.apache.spark 120 | spark-mllib_2.10 121 | 1.6.0-cdh5.7.5 122 | 123 | 125 | 127 | 128 | 129 | org.apache.spark 130 | spark-hive_2.10 131 | 1.6.0-cdh5.7.5 132 | 133 | 134 | 135 | org.apache.kafka 136 | connect-json 137 | 0.9.0.2.3.4.21-2 138 | 139 | 140 | 141 | 142 | org.apache.spark 143 | spark-streaming-kafka_2.10 144 | 1.6.0-cdh5.7.5 145 | 146 | 147 | slf4j-log4j12 148 | org.slf4j 149 | 150 | 151 | 152 | 153 | 154 | javax.servlet 155 | javax.servlet-api 156 | 3.1.0 157 | 158 | 159 | 160 | mysql 161 | mysql-connector-java 162 | 5.1.30 163 | 164 | 165 | org.scalatest 166 | scalatest_2.10 167 | 2.2.4 168 | 169 | 170 | 172 | 173 | 175 | 176 | 177 | 179 | 181 | 184 | 
185 | 187 | 189 | 192 | 194 | 195 | 196 | 197 | 198 | 201 | 202 | 210 | 211 | 212 | 213 | 214 | src/main/scala 215 | 216 | 217 | org.apache.maven.plugins 218 | maven-resources-plugin 219 | 2.4 220 | 221 | ${project.build.outputDirectory}/resources 222 | UTF-8 223 | 224 | 225 | src/main/scala 226 | 227 | 228 | 229 | 230 | 231 | org.apache.maven.plugins 232 | maven-compiler-plugin 233 | 2.1 234 | 235 | 1.7 236 | 1.7 237 | 238 | 239 | 240 | org.apache.maven.plugins 241 | maven-jar-plugin 242 | 243 | 244 | 245 | true 246 | 247 | 248 | 249 | 250 | 251 | 252 | org.apache.maven.plugins 253 | maven-dependency-plugin 254 | 255 | 256 | copy 257 | package 258 | 259 | copy-dependencies 260 | 261 | 262 | 263 | ${project.build.directory}/lib 264 | 265 | 267 | 268 | org.apache.kafka 269 | 270 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | -------------------------------------------------------------------------------- /src/main/scala/com/fun/util/RDDOperateFunction.scala: -------------------------------------------------------------------------------- 1 | 2 | 3 | package com.fun.util 4 | 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.streaming.dstream.InputDStream 7 | import scala.reflect.ClassTag 8 | import org.apache.spark.streaming.mysql.DirectMysqlInputDStream 9 | import org.apache.spark.streaming.mysql.JdbcSparkStreamRDD 10 | 11 | trait RDDOperateFunction { 12 | //First approach: an implicit conversion to a wrapper class 13 | implicit def rddFunction[T](rdd:RDD[T])=new RDDFunctionToClassTag(rdd) 14 | class RDDFunctionToClassTag[T](rdd:RDD[T]){ 15 | //define the extra methods in here 16 | def printlnRDD()=rdd.foreach { println } 17 | } 18 | //Second approach: an implicit class 19 | implicit class RDDFunctionToString(rdd:RDD[String]){ 20 | def rddF2(str:String)=rdd.map { x => x+" : "+str } 21 | def rddF(str:String)=rdd.map { x => x+" : "+str } 22 | } 23 | implicit class DStreamFunc[A<: InputDStream[(String,String)]](dstream:A){ 24 | def printlnDStream(str:String)=dstream.foreachRDD(rdd=>rdd.collect.foreach(x=>println(str+x))) 25 | } 26 | implicit def printlnDStream2(rdd:JdbcSparkStreamRDD[(String, String)])=rdd.collect.foreach(println) 27 | } -------------------------------------------------------------------------------- /src/main/scala/com/fun/util/SparkContextOperateFunction.scala: -------------------------------------------------------------------------------- 1 | 2 | package com.fun.util 3 | import org.apache.spark.SparkContext 4 | import scala.reflect.ClassTag 5 | import java.sql.Connection 6 | import java.sql.ResultSet 7 | import com.spark.jdbcrdd.JdbcMysqlRDD 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.mysql.MysqlManager 10 | trait SparkContextOperateFunction { 11 | implicit class SparkContextFunc(sc:SparkContext){ 12 | def hbaseRDD(tablename:String)=println("return hbase RDD") 13 | def mysqlRDD[T:ClassTag](createConnection: () => Connection, 14 | sql:String,numPartitions: Int,extractValues: (ResultSet) => T ) 15 | =new JdbcMysqlRDD(sc, createConnection,sql,numPartitions,extractValues) 16 | } 17 | implicit class StreamingContextFunc(ssc:StreamingContext){ 18 | def createDirectMysqlDStream[T:ClassTag]( 19 | getConnection: () => Connection, 20 | tablename: String, 21 | idcloumn:String, 22 | fromTime: Long, 23 | sql:String, 24 | numPartitions: Int, 25 | extractValues: (ResultSet) => T)= 26 | MysqlManager.creatMysqlInputStream(ssc, getConnection, tablename,idcloumn, fromTime,sql, numPartitions, extractValues) 27 | } 28 | } --------------------------------------------------------------------------------
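The two traits above add methods to RDD and SparkContext through implicit conversions and implicit classes; the package object in src/main/scala/com/fun/util/package.scala (further below) mixes them in together with ZzyLmqDataOperateUtil, so a single wildcard import of com.fun.util brings the extra methods into scope. A minimal usage sketch of that pattern follows; the package and object names, the local master URL and the SQL text are illustrative placeholders rather than values from this repository, and the mysqlRDD call assumes the MySQL instance configured in ZzyLmqDataOperateUtil is reachable.

package com.fun.examples // placeholder package for this sketch

import org.apache.spark.{SparkConf, SparkContext}
import com.fun.util._ // the package object mixes in RDDOperateFunction, SparkContextOperateFunction and ZzyLmqDataOperateUtil

object ImplicitEnrichmentSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("enrichment-sketch"))

    // RDDFunctionToString adds rddF to RDD[String]; the rddFunction conversion adds printlnRDD to any RDD
    val tagged = sc.parallelize(Seq("a", "b", "c")).rddF("tag")
    tagged.printlnRDD()

    // SparkContextFunc adds mysqlRDD; createConnection and extractValues come from ZzyLmqDataOperateUtil.
    // Placeholder SQL -- this only runs against the MySQL instance configured in that trait.
    val rows = sc.mysqlRDD(createConnection _, "SELECT col1, col2, col3 FROM some_table", 2, extractValues _)
    println(rows.count())

    sc.stop()
  }
}

The same import also exposes hbaseRDD on SparkContext and createDirectMysqlDStream on StreamingContext, as defined in SparkContextOperateFunction.
--------------------------------------------------------------------------------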
/src/main/scala/com/fun/util/ZzyLmqDataOperateUtil.scala: -------------------------------------------------------------------------------- 1 | package com.fun.util 2 | 3 | import java.sql.DriverManager 4 | import java.sql.ResultSet 5 | 6 | trait ZzyLmqDataOperateUtil { 7 | def createConnection() = { 8 | Class.forName("com.mysql.jdbc.Driver") 9 | DriverManager.getConnection("jdbc:mysql://192.168.10.159:3306/test", "root", "zhiziyun0628") 10 | } 11 | def extractValues(r: ResultSet) = { 12 | (r.getString(1), r.getString(2),r.getString(3)) 13 | } 14 | def sscextractValues(r: ResultSet) = { 15 | (r.getString(1), r.getString(2)) 16 | } 17 | def getConnection() = { 18 | Class.forName("com.mysql.jdbc.Driver") 19 | DriverManager.getConnection("jdbc:mysql://192.168.10.159:3306/test", "root", "zhiziyun0628") 20 | } 21 | } -------------------------------------------------------------------------------- /src/main/scala/com/fun/util/package.scala: -------------------------------------------------------------------------------- 1 | package com.fun 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | import com.fun.util.RDDOperateFunction 6 | import com.fun.util.SparkContextOperateFunction 7 | import com.fun.util.SparkContextOperateFunction 8 | package object util extends RDDOperateFunction 9 | with SparkContextOperateFunction 10 | with ZzyLmqDataOperateUtil{ 11 | //可以通过继承类来获得,也可以直接写 12 | implicit class SparkContextNewFunction(sparkContext: SparkContext) { 13 | def lmq(name: String) = "" 14 | } 15 | //隐式参数的使用 16 | implicit class RDDNewFunction[T](rdd: RDD[T]) { 17 | def lmq3(str: String)(implicit impl:Array[T])=rdd.map { x => x + " : "+impl(0) } 18 | def lmq4[A](str: String)(implicit impl:Array[A])=rdd.map { x => x + " : "+impl(0) } 19 | } 20 | 21 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/es/SparkLocalESTest.scala: -------------------------------------------------------------------------------- 1 | package com.spark.es 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.elasticsearch.common.xcontent.XContentFactory 6 | import scala.collection.JavaConverters._ 7 | object SparkLocalESTest { 8 | var sc: SparkContext = null 9 | val zookeeper="" 10 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 11 | def main(args: Array[String]): Unit = { 12 | init 13 | sc.parallelize(1 to 100).map { x=> 14 | val b=XContentFactory.jsonBuilder() 15 | .startObject() 16 | .field("firstName", x) 17 | .field("map", Map("age"->1,"age2"->2).asJava) 18 | .endObject() 19 | (x/10,b) 20 | }.foreach { case((d,b))=>println(d) } 21 | 22 | } 23 | 24 | def init { 25 | val sparkConf = new SparkConf() 26 | .setMaster("local") 27 | .setAppName("Test") 28 | sc = new SparkContext(sparkConf) 29 | } 30 | 31 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/es/Test.scala: -------------------------------------------------------------------------------- 1 | package com.spark.es 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.slf4j.LoggerFactory 6 | import com.mysql.jdbc.Connection 7 | import org.apache.spark.streaming.kafka.KafkaUtils 8 | import org.elasticsearch.common.settings.Settings 9 | import org.elasticsearch.client.transport.TransportClient 10 | import org.elasticsearch.common.transport.TransportAddress 11 | import 
org.elasticsearch.common.transport.LocalTransportAddress 12 | import org.elasticsearch.common.transport.InetSocketTransportAddress 13 | import java.net.InetAddress 14 | import scala.collection.JavaConversions._ 15 | import scala.collection.JavaConverters._ 16 | import com.google.gson.GsonBuilder 17 | import net.sf.json.JSONObject 18 | import org.elasticsearch.common.xcontent.XContentFactory 19 | import scala.collection.JavaConverters._ 20 | object Test { 21 | var sc: SparkContext = null 22 | def main(args: Array[String]): Unit = { 23 | /*val client = getESClient 24 | println(client.listedNodes()) 25 | val bulk = client.prepareBulk() 26 | */ 27 | val client=getESClient 28 | queryES(client) 29 | /*val builder = XContentFactory.jsonBuilder() 30 | .startObject() 31 | .field("firstName", "Avivi") 32 | .field("map", Map("age"->1,"age2"->2).asJava) 33 | .endObject() 34 | 35 | val request = client.prepareIndex("test", "testType") 36 | .setSource(builder) 37 | bulk.add(request) 38 | val response = bulk.get 39 | response.getItems.foreach { x => println(!x.isFailed()) }*/ 40 | } 41 | def queryES(client: TransportClient){ 42 | val d= client.prepareGet("sdr_urlinfo_test","urlinfo","http%3A%2F%2Fbaojian.zx58.cn%2Fproduct%2F9348%2F") 43 | .setFetchSource("frequency", "").get 44 | println(d.getField("frequency")) 45 | } 46 | def getESClient() = { 47 | val endpoints = Array("192.168.10.115", "192.168.10.110", "192.168.10.81") 48 | .map(_.split(':')).map { 49 | case Array(host, port) => SocketEndpoint(host, port.toInt) 50 | case Array(host) => SocketEndpoint(host, 9300) 51 | } 52 | val settings = Map("cluster.name" -> "zhiziyun") 53 | val esSettings = Settings.settingsBuilder().put(settings.asJava).build() 54 | val client = TransportClient.builder().settings(esSettings).build() 55 | val addresses = endpoints.map(endpointToTransportAddress) 56 | client.addTransportAddresses(addresses: _*) 57 | client 58 | } 59 | 60 | def endpointToTransportAddress(endpoint: Endpoint): TransportAddress = endpoint match { 61 | case LocalEndpoint(id) => new LocalTransportAddress(id) 62 | case SocketEndpoint(address, port) => new InetSocketTransportAddress(InetAddress.getByName(address), port) 63 | } 64 | 65 | def init { 66 | val sparkConf = new SparkConf() 67 | .setAppName("Test") 68 | sc = new SparkContext(sparkConf) 69 | } 70 | } 71 | case class SocketEndpoint(address: String, port: Int) extends Endpoint 72 | case class LocalEndpoint(id: String) extends Endpoint 73 | sealed abstract class Endpoint -------------------------------------------------------------------------------- /src/main/scala/com/spark/hbase/GetOutSiteSuNingPCToNewTable.scala: -------------------------------------------------------------------------------- 1 | package com.spark.hbase 2 | import org.apache.hadoop.hbase.client.Scan 3 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 4 | import org.apache.hadoop.hbase.client.Result 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.hadoop.mapreduce.Job 7 | import org.apache.hadoop.conf.Configuration 8 | import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil 9 | import org.apache.hadoop.hbase.mapreduce.IdentityTableMapper 10 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 11 | import org.apache.spark.SparkContext 12 | import scala.reflect.ClassTag 13 | import org.apache.spark.SparkConf 14 | import org.apache.hadoop.hbase.HBaseConfiguration 15 | import org.apache.hadoop.fs.Path 16 | import java.util.ArrayList 17 | import org.apache.hadoop.hbase.filter.RowFilter 18 | import 
org.apache.hadoop.hbase.filter.CompareFilter.CompareOp 19 | import org.apache.hadoop.hbase.filter.RegexStringComparator 20 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos 21 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil 22 | import org.apache.hadoop.hbase.util.Base64 23 | import java.util.HashMap 24 | import org.apache.hadoop.hbase.util.Bytes 25 | import scala.collection.JavaConversions._ 26 | import org.apache.hadoop.hbase.filter.SingleColumnValueFilter 27 | import org.apache.hadoop.hbase.filter.CompareFilter 28 | import org.apache.hadoop.hbase.filter.FilterList 29 | import org.apache.hadoop.hbase.filter.Filter 30 | import org.apache.hadoop.hbase.client.Put 31 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat 32 | object GetOutSiteSuNingPCToNewTable { 33 | var sc: SparkContext = null 34 | var conf: Configuration = null 35 | var zookeeper = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3" 36 | def main(args: Array[String]): Unit = { 37 | val tableName="outsitepctag" 38 | val sparkConf = new SparkConf() 39 | .setMaster("local") 40 | .setAppName("GetOutSiteSuNingPCToNewTable") 41 | sc = new SparkContext(sparkConf) 42 | conf = HBaseConfiguration.create() 43 | conf.set("hbase.zookeeper.quorum", zookeeper) 44 | conf.set("hbase.zookeeper.property.clientPort", "2181") 45 | 46 | var scan = new Scan 47 | val scvf = new SingleColumnValueFilter( 48 | Bytes.toBytes("info"), 49 | Bytes.toBytes("source"), 50 | CompareOp.EQUAL, 51 | Bytes.toBytes("baidupclog")); 52 | scvf.setFilterIfMissing(false); 53 | scan.setFilter(scvf) 54 | 55 | var a = hbaseRDD2[(String, HashMap[String, String])]( 56 | tableName, 57 | scan, 58 | (r: (ImmutableBytesWritable, Result)) => { 59 | var rowMap = new HashMap[String, String]() 60 | var listCells = r._2.listCells() 61 | val rowkey = Bytes.toString(r._2.getRow) 62 | for (cell <- listCells) { 63 | var column = new String(cell.getQualifierArray, cell.getQualifierOffset, cell.getQualifierLength) 64 | rowMap.put(column, new String(cell.getValueArray, cell.getValueOffset, cell.getValueLength)) 65 | } 66 | (rowkey, rowMap) 67 | }) 68 | println("##### partition num ##### "+a.partitions.size) 69 | a.foreach(println) 70 | /* conf.set(TableOutputFormat.OUTPUT_TABLE, "suningpctag") 71 | val job = new Job(conf) 72 | job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]]) 73 | println("########## 数据准备放入 hbase suningpctag ########") 74 | a.map{x => 75 | val p = new Put(Bytes.toBytes(x._1)) 76 | for((key,value)<-x._2){ 77 | p.addColumn("info".getBytes, key.getBytes, value.getBytes) 78 | } 79 | (new ImmutableBytesWritable, p) 80 | } 81 | .saveAsNewAPIHadoopDataset(job.getConfiguration) 82 | sc.stop()*/ 83 | println("########## 结束 ########") 84 | } 85 | 86 | 87 | def hbaseRDD2[U: ClassTag](tableName: String, scan: Scan, f: ((ImmutableBytesWritable, Result)) => U): RDD[U] = { 88 | var proto = ProtobufUtil.toScan(scan); 89 | conf.set(TableInputFormat.INPUT_TABLE, tableName) 90 | conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())) 91 | var job: Job = new Job(conf) 92 | sc.newAPIHadoopRDD(job.getConfiguration(), 93 | classOf[TableInputFormat], 94 | classOf[ImmutableBytesWritable], 95 | classOf[Result]).map(f) 96 | } 97 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/hbase/PutDataToHbase.scala: -------------------------------------------------------------------------------- 1 | package com.spark.hbase 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | 
import org.apache.hadoop.hbase.HBaseConfiguration 5 | import org.apache.hadoop.hbase.client.ConnectionFactory 6 | import org.apache.hadoop.hbase.TableName 7 | import org.apache.hadoop.hbase.client.Put 8 | import org.apache.hadoop.hbase.util.Bytes 9 | import org.apache.hadoop.hbase.client.Table 10 | 11 | object PutDataToHbase { 12 | def main(args: Array[String]): Unit = { 13 | var hconf = HBaseConfiguration.create(); 14 | hconf.set("hbase.zookeeper.quorum", "virtual-2,virtual-3,virtual-4"); 15 | hconf.set("hbase.zookeeper.property.clientPort", "2181"); 16 | var hconnection = ConnectionFactory.createConnection(hconf) 17 | var table = hconnection.getTable(TableName.valueOf("rt_mobilertbreport_bycreative")) 18 | putData(table,"WWTEY3i9OEh,hEmlg0eYmSk,2016-08-23") 19 | putData(table,"WWTEY3i9OEh,d2wns0wqJna,2016-08-23") 20 | putData(table,"0zoTLi29XRgq,istRh0Z1G4o,2016-08-23") 21 | putData(table,"WWTEY3i9OEh,hs8Xi0hvIbe,2016-08-23") 22 | 23 | println(">>>>>>>>>>") 24 | } 25 | def putData(table: Table, rowkey: String) { 26 | val p = new Put(Bytes.toBytes(rowkey)) 27 | p.addColumn("info".getBytes, "additionalcpmcost".getBytes, "100".getBytes) 28 | p.addColumn("info".getBytes, "fee".getBytes, "100".getBytes) 29 | p.addColumn("info".getBytes, "deliveryUV".getBytes, "100".getBytes) 30 | p.addColumn("info".getBytes, "delivery".getBytes, "100".getBytes) 31 | p.addColumn("info".getBytes, "cpmcost".getBytes, "100".getBytes) 32 | p.addColumn("info".getBytes, "clicks".getBytes, "100".getBytes) 33 | p.addColumn("info".getBytes, "clickUV".getBytes, "100".getBytes) 34 | table.put(p) 35 | } 36 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/hbase/SparkGetHbaseToRdd.scala: -------------------------------------------------------------------------------- 1 | package com.spark.hbase 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 6 | import org.apache.hadoop.hbase.client.Put 7 | import org.apache.hadoop.hbase.util.Bytes 8 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat 9 | import org.apache.hadoop.mapreduce.Job 10 | import org.apache.hadoop.hbase.util.MD5Hash 11 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 12 | 13 | object SparkReadMoreFiles { 14 | var sc: SparkContext = null 15 | def main(args: Array[String]): Unit = { 16 | init 17 | //HCatOutputFormat 18 | var conf = sc.hadoopConfiguration 19 | conf.set(TableOutputFormat.OUTPUT_TABLE, "test") 20 | sc.hadoopConfiguration.set("hbase.zookeeper.quorum", "Virtual-1,Virtual-2,Virtual-3") 21 | sc.hadoopConfiguration.set("zookeeper.znode.parent", "/hbase") 22 | var job = new Job(conf) 23 | job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]]) 24 | conf = job.getConfiguration 25 | for (i <- 1 to 100) { 26 | println(i) 27 | var a = sc.parallelize(i*100000 to (i+1)*(100000)) 28 | var b = a.map { x => 29 | println(x) 30 | var p = new Put(Bytes.toBytes(MD5Hash.getMD5AsHex(Bytes.toBytes(x)))) 31 | p.addColumn("info".getBytes, "test".getBytes, Bytes.toBytes(x)) 32 | (new ImmutableBytesWritable, p) 33 | } 34 | .saveAsNewAPIHadoopDataset(conf) 35 | } 36 | } 37 | def init { 38 | val sparkConf = new SparkConf() 39 | .setMaster("local") 40 | .setAppName("Test") 41 | sc = new SparkContext(sparkConf) 42 | } 43 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/hbase/SparkScanHbaseToRdd.scala: 
-------------------------------------------------------------------------------- 1 | package com.spark.hbase 2 | 3 | import org.apache.hadoop.hbase.client.Scan 4 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 5 | import org.apache.hadoop.hbase.client.Result 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.hadoop.mapreduce.Job 8 | import org.apache.hadoop.conf.Configuration 9 | import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil 10 | import org.apache.hadoop.hbase.mapreduce.IdentityTableMapper 11 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 12 | import org.apache.spark.SparkContext 13 | import scala.reflect.ClassTag 14 | import org.apache.spark.SparkConf 15 | import org.apache.hadoop.hbase.HBaseConfiguration 16 | import org.apache.hadoop.fs.Path 17 | import java.util.ArrayList 18 | import org.apache.hadoop.hbase.filter.RowFilter 19 | import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp 20 | import org.apache.hadoop.hbase.filter.RegexStringComparator 21 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos 22 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil 23 | import org.apache.hadoop.hbase.util.Base64 24 | import java.util.HashMap 25 | import org.apache.hadoop.hbase.util.Bytes 26 | import scala.collection.JavaConversions._ 27 | object SparkScanHbaseToRdd { 28 | var sc: SparkContext = null 29 | var conf: Configuration = null 30 | def main(args: Array[String]): Unit = { 31 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 32 | var tableName = "rt_rtbreport" 33 | var zookeeper = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3" 34 | var scans = new Scan 35 | var filter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator(".*2016-10-20")) 36 | scans.setFilter(filter) 37 | val sparkConf = new SparkConf() 38 | .setMaster("local") 39 | .setAppName("HBaseDistributedScanExample") 40 | sc = new SparkContext(sparkConf) 41 | conf = HBaseConfiguration.create() 42 | conf.set("hbase.zookeeper.quorum", zookeeper) 43 | conf.set("hbase.zookeeper.property.clientPort", "2181") 44 | //conf.addResource(new Path("conf/core-site.xml")) 45 | //conf.addResource(new Path("conf/hbase-site.xml")) 46 | //conf.addResource(new Path("conf/hdfs-site.xml")) 47 | 48 | 49 | var a = hbaseRDD2[(String, HashMap[String, String])]( 50 | tableName, 51 | scans, 52 | (r: (ImmutableBytesWritable, Result)) => { 53 | var rowMap = new HashMap[String, String]() 54 | var listCells = r._2.listCells() 55 | val rowkey = Bytes.toString(r._2.getRow) 56 | for (cell <- listCells) { 57 | var column = new String(cell.getQualifierArray, cell.getQualifierOffset, cell.getQualifierLength) 58 | rowMap.put(column, new String(cell.getValueArray, cell.getValueOffset, cell.getValueLength)) 59 | } 60 | (rowkey, rowMap) 61 | }) 62 | println(a.partitions.size) 63 | a.foreach(println) 64 | 65 | } 66 | def hbaseRDD[U: ClassTag](tableName: String, scan: Scan, f: ((ImmutableBytesWritable, Result)) => U): RDD[U] = { 67 | 68 | var job: Job = new Job(conf) 69 | TableMapReduceUtil.initCredentials(job) 70 | TableMapReduceUtil.initTableMapperJob(tableName, scan, classOf[IdentityTableMapper], null, null, job) 71 | sc.newAPIHadoopRDD(job.getConfiguration(), 72 | classOf[TableInputFormat], 73 | classOf[ImmutableBytesWritable], 74 | classOf[Result]).map(f) 75 | } 76 | def hbaseRDD2[U: ClassTag](tableName: String, scan: Scan, f: ((ImmutableBytesWritable, Result)) => U): RDD[U] = { 77 | var proto = ProtobufUtil.toScan(scan); 78 | conf.set(TableInputFormat.INPUT_TABLE, tableName) 79 
| conf.set(TableInputFormat.SCAN, Base64.encodeBytes(proto.toByteArray())) 80 | var job: Job = Job.getInstance(conf) 81 | sc.newAPIHadoopRDD(job.getConfiguration(), 82 | classOf[TableInputFormat], 83 | classOf[ImmutableBytesWritable], 84 | classOf[Result]).map(f) 85 | } 86 | }
-------------------------------------------------------------------------------- /src/main/scala/com/spark/hbase/hbasetest.scala: --------------------------------------------------------------------------------
1 | package com.spark.hbase 2 | 3 | import org.apache.hadoop.hbase.client.HBaseAdmin 4 | import org.apache.hadoop.hbase.client.HTable 5 | import org.apache.hadoop.hbase.client.Put 6 | import org.apache.hadoop.hbase.client.Get 7 | import org.apache.hadoop.hbase.util.Bytes 8 | import util.Properties 9 | import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, HBaseConfiguration, TableName} 10 | import org.apache.hadoop.hbase.client.ConnectionFactory 11 | 12 | object hbasetest { 13 |
14 | var zookeeper = "192.168.0.245,192.168.0.246,192.168.0.247" 15 | var conf = HBaseConfiguration.create() 16 | conf.set("hbase.zookeeper.quorum", zookeeper) 17 | conf.set("hbase.zookeeper.property.clientPort", "2181") 18 | conf.set("zookeeper.znode.parent","/hbase") 19 | val admin=ConnectionFactory.createConnection(conf).getAdmin 20 | 21 |
22 | def main(args: Array[String]) { 23 | val tablename="table001" 24 | val tablenames=Array("table001","table002") 25 | val rowkey="rowkey001" 26 | val columnnames=Array("columnname001","cn002") 27 | val columndatas=Array("columndata001","data001") 28 | createHbaseTable(tablenames) 29 | println(">>>>>>>>>>>>>") 30 | putHbaseData(tablename, rowkey, columnnames, columndatas) 31 | println("1>>>>>>>>>>>>>") 32 | println(getHbaseData(tablename, rowkey)); admin.close() 33 | } 34 |
35 | // list the tables 36 | //val listtables=admin.listTables() 37 | //listtables.foreach(println) 38 |
39 | def createHbaseTable(tablenames: Array[String]) { 40 | for(tablename<-tablenames){ 41 | // if (!admin.tableExists(TableName.valueOf(tablename))) { 42 | val tableDesc = new HTableDescriptor(TableName.valueOf(tablename)) 43 | val idsColumnFamilyDesc = new HColumnDescriptor(Bytes.toBytes("info")) 44 | tableDesc.addFamily(idsColumnFamilyDesc) 45 | admin.createTable(tableDesc) 46 | // } 47 | } 48 | } 49 |
50 | def putHbaseData(tablename: String,rowkey:String,columnnames:Array[String],columndatas:Array[String]) { 51 | val table = new HTable(conf, tablename) 52 | val theput= new Put(Bytes.toBytes(rowkey)) 53 | for(a<-0 until columnnames.length){ 54 | theput.addColumn(Bytes.toBytes("info"),Bytes.toBytes(columnnames(a)),Bytes.toBytes(columndatas(a))) 55 | } 56 | table.put(theput) 57 | } 58 |
59 | // let's insert some data in 'mytable' and get the row 60 | def getHbaseData(tablenames: String,rowkey: String):String={ 61 | val table = new HTable(conf, tablenames) 62 | val theget= new Get(Bytes.toBytes(rowkey)) 63 | val result=table.get(theget) 64 | val value=Bytes.toString(result.value()) 65 | // println(value) 66 | value 67 | } 68 | // admin.close() and conf.clear() used to sit here as object-body statements; they ran during 69 | // object initialization and closed the admin before main could use it, so main now closes the admin. 70 |
71 | }
-------------------------------------------------------------------------------- /src/main/scala/com/spark/hive/CaseClass.scala: --------------------------------------------------------------------------------
1 | package com.spark.hive 2 | 3 | object CaseClass { 4 | case class User2(name:Int,age:Int,sex:Int) 5 | }
-------------------------------------------------------------------------------- /src/main/scala/com/spark/hive/HiveContextTest.scala:
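A note on hbasetest.scala above: its put/get helpers still go through the deprecated HTable constructor. The sketch below shows the same flow through the Connection/Table API instead; it reuses the demo's conf, table name and 'info' column family as assumptions and is illustrative only, not part of the repository.

// Sketch (not in the original repo): put and get via Connection.getTable instead of new HTable(conf, ...)
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get, Put}
import org.apache.hadoop.hbase.util.Bytes

def putAndGetSketch(): Unit = {
  val conn = ConnectionFactory.createConnection(conf) // the demo's HBaseConfiguration
  val table = conn.getTable(TableName.valueOf("table001"))
  try {
    val put = new Put(Bytes.toBytes("rowkey001"))
    put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("columnname001"), Bytes.toBytes("columndata001"))
    table.put(put)
    val result = table.get(new Get(Bytes.toBytes("rowkey001")))
    println(Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("columnname001"))))
  } finally {
    table.close()
    conn.close()
  }
}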
-------------------------------------------------------------------------------- 1 | package com.spark.hive 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.hive.HiveContext 6 | import org.apache.spark.sql.DataFrame 7 | 8 | object HiveContextTest { 9 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 10 | case class User2(name:Int,age:Int,sex:Int) 11 | var hiveconf = new SparkConf().setAppName("sparkhivetest").setMaster("local") 12 | setHiveConf 13 | val sc = new SparkContext(hiveconf) 14 | val sqlContext = new HiveContext(sc) 15 | def main(args: Array[String]): Unit = { 16 | sqlContext.sql("select * from smartadsclicklog where statdate='20170414' limit 50").show 17 | 18 | 19 | 20 | /* var rdd=sc.parallelize(Array(Map("name"->1,"age"->2,"sex"->3))).map{x=>User2(name=x("name"),age=x("age"),sex=x("sex"))} 21 | sqlContext.createDataFrame(rdd).registerTempTable("user2") 22 | sqlContext.sql("show tables").show 23 | sc.stop()*/ 24 | } 25 | def setHiveConf() { 26 | //加一下的信息,就可以不用使用hive-site.xml和hdfs-site.xml了 27 | //信息在/etc/hive/conf/hive-site.xml里面 28 | //加配置文件是最保险的。有时候加下面的也不成功 29 | System.setProperty("hive.metastore.uris", "thrift://mongodb3:9083") 30 | System.setProperty("hive.metastore.warehouse.dir", "/user/hive/warehouse") 31 | System.setProperty("hive.zookeeper.quorum", "mongodb3,solr2.zhiziyun.com,solr1.zhiziyun.com") 32 | System.setProperty("hive.zookeeper.client.port", "2181") 33 | 34 | System.setProperty("dfs.nameservices", "nameservice-zzy") 35 | System.setProperty("dfs.client.failover.proxy.provider.nameservice-zzy", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider") 36 | System.setProperty("dfs.ha.automatic-failover.enabled.nameservice-zzy", "true") 37 | System.setProperty("ha.zookeeper.quorum", "mongodb3:2181,solr1.zhiziyun.com:2181,solr2.zhiziyun.com:2181") 38 | System.setProperty("dfs.ha.namenodes.nameservice-zzy", "namenode47,namenode237") 39 | System.setProperty("dfs.namenode.rpc-address.nameservice-zzy.namenode47", "mongodb3:8020") 40 | System.setProperty("dfs.namenode.servicerpc-address.nameservice-zzy.namenode47", "mongodb3:8022") 41 | System.setProperty("dfs.namenode.http-address.nameservice-zzy.namenode47", "mongodb3:50070") 42 | System.setProperty("dfs.namenode.https-address.nameservice-zzy.namenode47", "mongodb3:50470") 43 | System.setProperty("dfs.namenode.rpc-address.nameservice-zzy.namenode237", "solr2.zhiziyun.com:8020") 44 | System.setProperty("dfs.namenode.servicerpc-address.nameservice-zzy.namenode237", "solr2.zhiziyun.com:8022") 45 | System.setProperty("dfs.namenode.http-address.nameservice-zzy.namenode237", "solr2.zhiziyun.com:50070") 46 | System.setProperty("dfs.namenode.https-address.nameservice-zzy.namenode237", "solr2.zhiziyun.com:50470") 47 | System.setProperty("dfs.namenode.http-address.nameservice-zzy.namenode47", "mongodb3:50070") 48 | System.setProperty("dfs.client.use.datanode.hostname", "false") 49 | 50 | System.setProperty("fs.permissions.umask-mode", "022") 51 | System.setProperty("dfs.namenode.acls.enabled", "false") 52 | System.setProperty("dfs.client.read.shortcircuit", "false") 53 | System.setProperty("dfs.namenode.acls.enabled", "false") 54 | System.setProperty("dfs.domain.socket.path", "/var/run/hdfs-sockets/dn") 55 | System.setProperty("dfs.client.read.shortcircuit.skip.checksum", "false") 56 | System.setProperty("dfs.client.domain.socket.data.traffic", "false") 57 | System.setProperty("dfs.datanode.hdfs-blocks-metadata.enabled", 
"true") 58 | 59 | 60 | System.setProperty("hive.metastore.client.socket.timeout", "300") 61 | System.setProperty("hive.warehouse.subdir.inherit.perms", "true") 62 | System.setProperty("hive.enable.spark.execution.engine", "false") 63 | System.setProperty("hive.cluster.delegation.token.store.class", "org.apache.hadoop.hive.thrift.MemoryTokenStore") 64 | System.setProperty("hive.server2.enable.doAs", "true") 65 | System.setProperty("hive.metastore.execute.setugi", "true") 66 | System.setProperty("hive.support.concurrency", "true") 67 | System.setProperty("hive.zookeeper.namespace", "hive_zookeeper_namespace_hive") 68 | System.setProperty("hive.server2.use.SSL", "false") 69 | System.setProperty("hive.conf.restricted.list", "hive.enable.spark.execution.engine") 70 | } 71 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/hive/SparkPhoenixLoadAndSaveTest.scala: -------------------------------------------------------------------------------- 1 | package com.spark.run 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.SQLContext 6 | import org.apache.spark.sql.SaveMode 7 | import org.apache.phoenix.spark._ 8 | 9 | object SparkPhoenixLoadAndSaveTest { 10 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 11 | var sparkconf: SparkConf = null 12 | var sc: SparkContext = null 13 | var sqlContext: SQLContext = null 14 | def main(args: Array[String]): Unit = { 15 | sc = new SparkContext(new SparkConf().setMaster("local").setAppName("Test Phoenix")) 16 | sqlContext = new SQLContext(sc) 17 | //loadPhoenixDF 18 | loadPhoenixDF 19 | //saveDFToPhoeni 20 | } 21 | def loadPhoenixDF() { 22 | //获取全表 23 | var phoenixDF = sqlContext.load("org.apache.phoenix.spark", 24 | Map("table" -> "US_POPULATION", "zkUrl" -> "192.168.10.191:2181")) 25 | 26 | phoenixDF.show 27 | //phoenixDF.select("CITY").show 28 | 29 | /*phoenixDF.filter(phoenixDF("COL1") === "test_row_1" && phoenixDF("ID") === 1L) 30 | .select(phoenixDF("ID")) 31 | .show*/ 32 | 33 | //获取指定的列 34 | var phoenixDF2 = sqlContext.phoenixTableAsDataFrame("US_POPULATION", 35 | Seq("CITY", "POPULATION"), 36 | zkUrl = Some("192.168.10.191:2181")) 37 | phoenixDF2.foreach { x => println(x) } 38 | 39 | /* phoenixDF2.registerTempTable("tablename") 40 | phoenixDF2.map { x => x} */ 41 | 42 | } 43 | def saveDFToPhoeni() { 44 | //将一个RDD存进Phoenix 45 | val dataSet = List(("CB", "A", 11), ("CC", "B", 22), ("CD", "C", 33)) 46 | sc.parallelize(dataSet) 47 | .saveToPhoenix("US_POPULATION", 48 | Seq("STATE", "CITY", "POPULATION"), 49 | zkUrl = Some("192.168.10.191")) 50 | //将一个DataFram存进Phoenix 51 | /*var phoenixDF=sqlContext.load("org.apache.phoenix.spark", 52 | Map("table" -> "TABLE1", "zkUrl" -> "phoenix-server:2181")) 53 | phoenixDF.save("org.apache.phoenix.spark", 54 | SaveMode.Overwrite, Map("table" -> "OUTPUT_TABLE", 55 | "zkUrl" -> "phoenix-server:2181")) 56 | */ 57 | 58 | } 59 | 60 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/hive/SparkRddToHive.scala: -------------------------------------------------------------------------------- 1 | package com.spark.hive 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.hive.hcatalog.data.DefaultHCatRecord 6 | import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat 7 | import org.apache.hive.hcatalog.data.HCatRecord 8 | import org.apache.hive.hcatalog.common.HCatUtil 9 | 
import org.apache.hive.hcatalog.data.schema.HCatSchema 10 | import org.apache.hadoop.mapreduce.Job 11 | import org.apache.hive.hcatalog.mapreduce.OutputJobInfo 12 | import org.apache.hive.hcatalog.mapreduce.HCatBaseOutputFormat._ 13 | import org.apache.hive.hcatalog.mapreduce.HCatBaseOutputFormat 14 | import org.apache.hadoop.io.WritableComparable 15 | import org.apache.spark.SerializableWritable 16 | import org.apache.hadoop.io.NullWritable 17 | import org.apache.hadoop.conf.Configuration 18 | import org.apache.spark.rdd.RDD 19 | import scala.reflect.ClassTag 20 | import org.apache.hive.hcatalog.mapreduce.HCatRecordReader 21 | import org.apache.hadoop.mapreduce.JobContext 22 | 23 | object SparkRddToHive { 24 | var sc: SparkContext = null 25 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 26 | def main(args: Array[String]): Unit = { 27 | init 28 | var outputJob: Job = Job.getInstance 29 | var schema = getHCatSchema("test", "test",outputJob) 30 | var a = sc.parallelize(Array((1,1))) 31 | useHCatOutputFormatToHive(outputJob,schema,a) 32 | println(">>>>>>...") 33 | //a.saveAsNewAPIHadoopDataset(new Configuration) 34 | } 35 | def getHCatSchema(dbName: String, tableName: String,outputJob: Job) = { 36 | //获取schema 37 | var schema: HCatSchema = null 38 | //var outputJob: Job = Job.getInstance 39 | outputJob.setJobName("getHCatSchema"); 40 | HCatOutputFormat.setOutput(outputJob, OutputJobInfo.create(dbName, tableName, null)); 41 | schema = HCatBaseOutputFormat.getTableSchema(outputJob.getConfiguration()); 42 | HCatOutputFormat.setSchema(outputJob, schema) 43 | schema 44 | } 45 | def useHCatOutputFormatToHive[T:ClassTag](job:Job,recordSchema: HCatSchema,rdd:RDD[T]) { 46 | var a = sc.parallelize(Array(("test", 1), ("test2", 2), ("test3", 3), ("test4", 4)),2) 47 | job.setOutputFormatClass(classOf[HCatOutputFormat]) 48 | job.setOutputKeyClass(classOf[NullWritable]); 49 | job.setOutputValueClass(classOf[DefaultHCatRecord]); 50 | var jobconf = job.getConfiguration 51 | 52 | var c = a.map { x => 53 | var record = new DefaultHCatRecord(recordSchema.size()); 54 | record.setString("name", recordSchema, x._1) 55 | record.setString("age", recordSchema, x._2.toString) 56 | (NullWritable.get(), record) 57 | } 58 | c.saveAsNewAPIHadoopDataset(jobconf) 59 | 60 | } 61 | def init { 62 | val sparkConf = new SparkConf() 63 | .setMaster("local") 64 | .setAppName("Test") 65 | sc = new SparkContext(sparkConf) 66 | } 67 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/hive/SparkToHive.scala: -------------------------------------------------------------------------------- 1 | package com.spark.hive 2 | 3 | import org.apache.spark.sql.hive.HiveContext 4 | import org.apache.spark.sql.SQLContext 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.SparkConf 8 | import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat 9 | import org.apache.hive.hcatalog.mapreduce.HCatBaseOutputFormat._ 10 | import org.apache.hive.hcatalog.mapreduce.OutputJobInfo 11 | import org.apache.hive.hcatalog.data.schema.HCatSchema 12 | import org.apache.hive.hcatalog.data.DefaultHCatRecord 13 | import org.apache.hive.hcatalog.mapreduce.HCatBaseOutputFormat 14 | import org.apache.hadoop.mapreduce.Job 15 | import org.apache.hadoop.security.UserGroupInformation.HadoopConfiguration 16 | import org.apache.hadoop.io.NullWritable 17 | import scala.collection.mutable.ArrayBuffer 18 | import java.util.HashMap 19 | 
import org.apache.spark.sql.Row 20 | import org.apache.spark.sql.types.StructField 21 | import org.apache.spark.sql.types.StringType 22 | import org.apache.spark.sql.types.StructType 23 | import org.apache.spark.sql.types.IntegerType 24 | import org.apache.spark.sql.functions._ 25 | import com.spark.hive.CaseClass._ 26 | import org.apache.spark.sql.types._ 27 | import org.apache.spark.sql.api.java.UDF1 28 | object SparkToHive { 29 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 30 | var hiveconf = new SparkConf().setAppName("sparkhivetest").setMaster("local") 31 | setHiveConf 32 | val sc = new SparkContext(hiveconf) 33 | val sqlContext = new HiveContext(sc) 34 | import sqlContext.implicits._ 35 | 36 | def main(args: Array[String]): Unit = { 37 | //useHCatOutputFormatToHive 38 | //secondRDDToFrame 39 | //insertintoHive 40 | //readHiveData 41 | //creatTable 42 | insertintoHive 43 | } 44 | /** 45 | * 建表 46 | */ 47 | def creatTable(){ 48 | sqlContext.sql("use test1") 49 | sqlContext.sql("create table test_creat(id int,order_id int,product_id int) row format delimited fields terminated by ','STORED AS TEXTFILE") 50 | } 51 | /** 52 | * 读取hive的数据 53 | */ 54 | def readHiveData() { 55 | sqlContext.sql("use default") 56 | sqlContext.sql("select count(*) from siteorderlog").show 57 | sc.stop() 58 | } 59 | /** 60 | * 数据写入hive 61 | */ 62 | def insertintoHive(){ 63 | var rdd=sc.parallelize(Array(Map("name"->3,"age"->4,"sex"->5))) 64 | .map{x=>User2(name=x("name"),age=x("age"),sex=x("sex"))} 65 | //方法1 66 | //import sqlContext.implicits._ 67 | //rdd.toDF().registerTempTable("user2") 68 | //方法2 69 | //sqlContext.createDataFrame(rdd).select(count("name")).show 70 | sqlContext.createDataFrame(rdd).registerTempTable("user2") 71 | //sqlContext.sql("select * from user2").show 72 | 73 | sqlContext.sql("insert into table test1.test_creat "+ 74 | "select name,age,sex from user2") 75 | 76 | } 77 | /** 78 | * 自定义UDF 79 | */ 80 | def testUDFFunction(){ 81 | val makeDT=(name: Int, time: Int, tz: Int) => s"$name : $time : $tz" 82 | sqlContext.udf.register("strtoger",makeDT) 83 | var rdd=sc.parallelize(Array(Map("name"->3,"age"->4,"sex"->5))) 84 | .map{x=>User2(name=x("name"),age=x("age"),sex=x("sex"))} 85 | sqlContext.createDataFrame(rdd).registerTempTable("user2") 86 | sqlContext.sql("select *,strtoger(name,age,sex) as udf from user2").show 87 | } 88 | //第二种指定Schema,需要这个ROW 89 | def secondRDDToFrame(){ 90 | var arraybuffer=ArrayBuffer[HashMap[String,Int]]() 91 | var map=new HashMap[String,Int]() 92 | map.put("name", 1) 93 | map.put("age", 1) 94 | map.put("sex", 1) 95 | arraybuffer+=map 96 | var liens=sc.parallelize(arraybuffer) 97 | .map(p=>Row(p.get("name"),p.get("age"),p.get("sex"))) 98 | var schemaString = Array("name","age","sex") 99 | var columns=schemaString.map(fieldName => StructField(fieldName, IntegerType, true)) 100 | val schema = StructType(columns) 101 | var schemaData=sqlContext.createDataFrame(liens, schema) 102 | schemaData.registerTempTable("user2") 103 | sqlContext.sql("select * from user2").show() 104 | sqlContext.sql("insert overwrite table test1.test_creat select name,age,sex from user2") 105 | } 106 | def setHiveConf() { 107 | //加一下的信息,就可以不用使用hive-site.xml了 108 | //信息在/etc/hive/conf/hive-site.xml里面 109 | System.setProperty("hive.metastore.uris", "thrift://CDH-Master:9083") 110 | System.setProperty("hive.metastore.warehouse.dir", "/user/hive/warehouse") 111 | System.setProperty("hive.zookeeper.quorum", "CDH-Master,Node2,Node1") 112 | 
System.setProperty("hive.zookeeper.client.port", "2181") 113 | 114 | 115 | System.setProperty("hive.metastore.client.socket.timeout", "300") 116 | System.setProperty("hive.warehouse.subdir.inherit.perms", "true") 117 | System.setProperty("hive.enable.spark.execution.engine", "false") 118 | System.setProperty("hive.cluster.delegation.token.store.class", "org.apache.hadoop.hive.thrift.MemoryTokenStore") 119 | System.setProperty("hive.server2.enable.doAs", "true") 120 | System.setProperty("hive.metastore.execute.setugi", "true") 121 | System.setProperty("hive.support.concurrency", "true") 122 | System.setProperty("hive.zookeeper.namespace", "hive_zookeeper_namespace_hive") 123 | System.setProperty("hive.server2.use.SSL", "false") 124 | System.setProperty("hive.conf.restricted.list", "hive.enable.spark.execution.engine") 125 | } 126 | def useHCatOutputFormatToHive() { 127 | var a = sc.parallelize(Array(("test", 1), ("test2", 2), ("test3", 3), ("test4", 4))) 128 | var job = Job.getInstance(); 129 | HCatOutputFormat.setOutput(job, OutputJobInfo.create("test", "test", null)); 130 | var recordSchema = getTableSchema(job.getConfiguration()) 131 | HCatOutputFormat.setSchema(job, recordSchema) 132 | job.setOutputFormatClass(classOf[HCatOutputFormat]) 133 | job.setOutputKeyClass(classOf[NullWritable]); 134 | job.setOutputValueClass(classOf[DefaultHCatRecord]); 135 | var jobconf = job.getConfiguration 136 | var c = a.map { x => 137 | var record = new DefaultHCatRecord(recordSchema.size()); 138 | record.setString("name", recordSchema, x._1) 139 | record.setString("age", recordSchema, x._2.toString) 140 | (NullWritable.get(), record) 141 | } 142 | c.saveAsNewAPIHadoopDataset(jobconf) 143 | 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/com/spark/jdbcrdd/JdbcMysqlRDD.scala: -------------------------------------------------------------------------------- 1 | package com.spark.jdbcrdd 2 | 3 | import scala.reflect.ClassTag 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | import java.sql.ResultSet 7 | import java.sql.Connection 8 | import org.apache.spark.Logging 9 | import org.apache.spark.Partition 10 | import org.apache.spark.TaskContext 11 | import org.apache.spark.annotation.DeveloperApi 12 | class JdbcMysqlPartition(idx: Int, val startId: Long, val perPartitionNum: Long) extends Partition { 13 | override def index = idx 14 | } 15 | class JdbcMysqlRDD[T:ClassTag]( 16 | sc: SparkContext, 17 | getConnection: () => Connection, 18 | sql:String, 19 | numPartitions: Int, 20 | mapRow: (ResultSet) => T = JdbcMysqlRDD.resultSetToObjectArray _) 21 | extends RDD[T](sc, Nil) with Logging{ 22 | override def count()=getRowsNum(sql) 23 | override def getPartitions: Array[Partition] = { 24 | val rowsNum=getRowsNum(sql) 25 | //Each partition limit on the number of article 26 | val perPartitionNum=rowsNum/numPartitions 27 | //Add the remaining to the last partition 28 | val lastPartitionNum=perPartitionNum+(rowsNum%numPartitions) 29 | (0 until numPartitions).map(i => { 30 | val start = (i*perPartitionNum) 31 | if(i==(numPartitions-1)){ 32 | new JdbcMysqlPartition(i, start, lastPartitionNum) 33 | }else 34 | new JdbcMysqlPartition(i, start, perPartitionNum) 35 | }).toArray 36 | } 37 | /** 38 | * For how many records 39 | * @param The SQL query 40 | */ 41 | def getRowsNum(sql:String)={ 42 | var rowsNum=0 43 | var tmpConn=getConnection() 44 | try{ 45 | if(sql.toLowerCase.indexOf("from")<0){ 46 | logError(" sql is error , 
There must be the from keyword ") 47 | }else{ 48 | val nsql="select count(1) "+sql.substring(sql.toLowerCase.indexOf("from"), sql.size) 49 | val stmt = tmpConn.prepareStatement(nsql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) 50 | 51 | val rs = stmt.executeQuery() 52 | if(rs.next()){ 53 | rowsNum=rs.getInt(1) 54 | } 55 | stmt.close() 56 | } 57 | }catch { 58 | case t: Throwable => t.printStackTrace() // TODO: handle error 59 | }finally { 60 | tmpConn.close() 61 | tmpConn=null 62 | } 63 | rowsNum 64 | } 65 | //每个分区怎么获取数据的原理是按照分页的原理来取的 66 | override def compute(thePart: Partition, context: TaskContext) = new NextIterator[T] { 67 | context.addTaskCompletionListener{ context => closeIfNeeded() } 68 | val part = thePart.asInstanceOf[JdbcMysqlPartition] 69 | val conn = getConnection() 70 | val partSql=sql+" limit ?,?" 71 | val stmt = conn.prepareStatement(partSql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) 72 | if (conn.getMetaData.getURL.matches("jdbc:mysql:.*")) { 73 | stmt.setFetchSize(Integer.MIN_VALUE) 74 | logInfo("statement fetch size set to: " + stmt.getFetchSize + " to force MySQL streaming ") 75 | } 76 | stmt.setLong(1, part.startId) 77 | stmt.setLong(2, part.perPartitionNum) 78 | val rs = stmt.executeQuery() 79 | override def getNext: T = { 80 | if (rs.next()) { 81 | mapRow(rs) 82 | } else { 83 | finished = true 84 | null.asInstanceOf[T] 85 | } 86 | } 87 | 88 | override def close() { 89 | try { 90 | if (null != rs) { 91 | rs.close() 92 | } 93 | } catch { 94 | case e: Exception => logWarning("Exception closing resultset", e) 95 | } 96 | try { 97 | if (null != stmt) { 98 | stmt.close() 99 | } 100 | } catch { 101 | case e: Exception => logWarning("Exception closing statement", e) 102 | } 103 | try { 104 | if (null != conn) { 105 | conn.close() 106 | } 107 | logInfo("closed connection") 108 | } catch { 109 | case e: Exception => logWarning("Exception closing connection", e) 110 | } 111 | } 112 | } 113 | 114 | 115 | 116 | 117 | } 118 | object JdbcMysqlRDD{ 119 | def resultSetToObjectArray(rs: ResultSet): Array[Object] = { 120 | Array.tabulate[Object](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1)) 121 | } 122 | } 123 | abstract class NextIterator[U] extends Iterator[U] { 124 | private var gotNext = false 125 | private var nextValue: U = _ 126 | private var closed = false 127 | protected var finished = false 128 | protected def getNext(): U 129 | protected def close() 130 | def closeIfNeeded() { 131 | if (!closed) { 132 | closed = true 133 | close() 134 | } 135 | } 136 | override def hasNext: Boolean = { 137 | if (!finished) { 138 | if (!gotNext) { 139 | nextValue = getNext() 140 | if (finished) { 141 | closeIfNeeded() 142 | } 143 | gotNext = true 144 | } 145 | } 146 | !finished 147 | } 148 | override def next(): U = { 149 | if (!hasNext) { 150 | throw new NoSuchElementException("End of stream") 151 | } 152 | gotNext = false 153 | nextValue 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/scala/com/spark/jdbcrdd/SparkCSVTest.scala: -------------------------------------------------------------------------------- 1 | package com.spark.jdbcrdd 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql.SQLContext 6 | import com.databricks.spark.csv._ 7 | import org.apache.spark.sql.types.StructType 8 | import org.apache.spark.sql.types.StructField 9 | import org.apache.spark.sql.types.IntegerType 10 | import 
org.apache.hadoop.mapreduce.lib.output.TextOutputFormat 11 | import org.apache.hadoop.io.NullWritable 12 | import org.apache.hadoop.io.Text 13 | import org.apache.spark.rdd.RDD 14 | import scala.reflect.ClassTag 15 | object SparkCSVTest { 16 | var sc: SparkContext = null 17 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 18 | def main(args: Array[String]): Unit = { 19 | init 20 | csvTest 21 | } 22 |
23 | def init { 24 | val sparkConf = new SparkConf() 25 | .setMaster("local") 26 | .setAppName("Test") 27 | sc = new SparkContext(sparkConf) 28 | } 29 | def csvTest(){ 30 | val sqlContext = new SQLContext(sc) 31 | val customSchema = StructType(Array( 32 | StructField("year", IntegerType, true), 33 | StructField("comment", IntegerType, true), 34 | StructField("blank", IntegerType, true))) 35 |
36 | val df = sqlContext.load( 37 | "com.databricks.spark.csv", 38 | schema = customSchema, 39 | Map("path" -> "C:\\Users\\zhiziyun\\Desktop\\csvtest.csv", "header" -> "true")) 40 |
41 | val selectedData = df.select("year", "comment") 42 | selectedData.save("C:\\Users\\zhiziyun\\Desktop\\re.csv", "com.databricks.spark.csv") 43 | } 44 | }
-------------------------------------------------------------------------------- /src/main/scala/com/spark/jdbcrdd/SparkJdbcRDDTest.scala: --------------------------------------------------------------------------------
1 | package com.spark.jdbcrdd 2 | 3 | import org.apache.spark.SparkContext 4 | import java.sql.DriverManager 5 | import java.sql.ResultSet 6 | 7 | object SparkJdbcRDDTest { 8 | def main(args: Array[String]) { 9 | val sc = new SparkContext("local","spark_mysql") 10 | val numPartitions=10 11 | val sql="select * from zz_reporting.st_rtbreport_byplan where StatDate='2016-05-01'" 12 | //Limitation: rows can be lost or duplicated, because rows may be deleted or inserted while the query is running, 13 | //which shuffles the paging order. 14 | //Spark's built-in JdbcRDD avoids the data-loss problem, but it is far more restrictive to use. 15 |
16 | val data=sc.mysqlRDD(createConnection, sql, numPartitions, extractValues) 17 | data.printlnRDD 18 |
19 | sc.stop() 20 | } 21 | //connection factory passed to mysqlRDD; the JDBC URL, user and password below are placeholders 22 | def createConnection() = { 23 | Class.forName("com.mysql.jdbc.Driver") 24 | DriverManager.getConnection("jdbc:mysql://localhost:3306/zz_reporting", "user", "password") 25 | } 26 | //row mapper passed to mysqlRDD: keep every column of the row as an Array[Object] 27 | def extractValues(rs: ResultSet) = Array.tabulate[Object](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1)) 28 |
29 | }
-------------------------------------------------------------------------------- /src/main/scala/com/spark/jdbcrdd/SparkSecondarySortKey.scala: --------------------------------------------------------------------------------
1 | package com.spark.jdbcrdd 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import java.util.HashSet 6 | import java.util.HashMap 7 | import scala.collection.mutable.ArrayBuffer 8 | import org.apache.spark.rdd.RDD 9 | import scala.collection.JavaConversions._ 10 | import org.apache.hadoop.mapreduce.Job 11 | import org.apache.hadoop.fs.Path 12 | import org.apache.hadoop.mapred.JobConf 13 | import org.apache.hadoop.mapred.FileInputFormat 14 | import org.apache.spark.HashPartitioner 15 | import org.apache.spark.RangePartitioner 16 | import org.apache.spark.Partitioner 17 | import org.apache.hadoop.mapreduce.Reducer 18 | import org.apache.hadoop.io.Writable 19 | import org.apache.hadoop.io.WritableComparable 20 | import java.io.DataInput 21 | import java.io.DataOutput 22 | import org.apache.hadoop.io.WritableComparator 23 | import java.io.FileInputStream 24 | import java.io.InputStreamReader 25 | import java.io.BufferedReader 26 | import java.io.FileOutputStream 27 | import java.io.OutputStreamWriter 28 | import scala.reflect.ClassTag 29 | object SparkSecondarySortKey { 30 | var sc: SparkContext = null 31 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 32 | def main(args: Array[String]): Unit = { 33 | init 34 | val sd=new
SecondarySortKey("a",1) 35 | val ds=new SecondarySortKey("a",2) 36 | println(sd.equals(ds)) 37 | val a=Array(("a",1),("a",9),("b",4),("o",7),("b",9), 38 | ("b",3),("f",4),("k",8), 39 | ("a",15),("z",4),("b",1)) 40 | val rdd=sc.parallelize(a) 41 | //实现二次排序:先按first字段排序,然后按second排序 42 | val hrdd=rdd.map { case(first,second) => 43 | val key=new SecondarySortKey(first,second) 44 | (key,second) 45 | }.groupByKey() 46 | .map{x=>(x._1,x._2.toList.sorted)} 47 | .sortByKey() 48 | .foreach(println) 49 | } 50 | def init() { 51 | val sparkConf = new SparkConf() 52 | .setMaster("local") 53 | .setAppName("Test") 54 | sc = new SparkContext(sparkConf) 55 | } 56 | 57 | /** 58 | * 自定义分区 59 | */ 60 | class IteblogPartitioner(override val numPartitions: Int) extends Partitioner { 61 | //override def numPartitions: Int = numParts 62 | override def getPartition(key: Any): Int = { 63 | val first = key.asInstanceOf[SecondarySortKey].first 64 | val code = (first.hashCode % numPartitions) 65 | if (code<0) { 66 | code+numPartitions 67 | } else { 68 | code 69 | } 70 | } 71 | override def equals(other: Any): Boolean = other match { 72 | case iteblog: IteblogPartitioner => 73 | iteblog.numPartitions == numPartitions 74 | case _ => 75 | false 76 | } 77 | override def hashCode: Int = numPartitions 78 | } 79 | /** 80 | * 自定义一个key 81 | */ 82 | class SecondarySortKey(var first:String,var second:Int) 83 | extends WritableComparable[SecondarySortKey] with Serializable{ 84 | def set(left:String,right:Int) { 85 | first = left; 86 | second = right; 87 | } 88 | def getFirst()=first 89 | def getSecond() =second 90 | override def readFields(in:DataInput){ 91 | first = in.readUTF(); 92 | second = in.readInt(); 93 | } 94 | override def write(out:DataOutput){ 95 | out.writeUTF(first); 96 | out.writeInt(second); 97 | } 98 | override def hashCode() =first.hashCode() 99 | //这个是在reduce的时候决定哪些key要分配在一起的 100 | override def equals(right:Any) ={ 101 | if (right.isInstanceOf[SecondarySortKey]) { 102 | var r = right.asInstanceOf[SecondarySortKey] 103 | r.first == first 104 | } else { 105 | false 106 | } 107 | } 108 | //这里的代码是关键,因为对key排序时 109 | def compareTo(o:SecondarySortKey) ={ 110 | if (first != o.first) { 111 | first.compareTo(o.first) 112 | } else if (second != o.second) { 113 | second - o.second 114 | } else { 115 | 0 116 | } 117 | } 118 | override def toString()={ 119 | first 120 | } 121 | } 122 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/jdbcrdd/package.scala: -------------------------------------------------------------------------------- 1 | package com.spark 2 | 3 | import com.fun.util.RDDOperateFunction 4 | import com.fun.util.SparkContextOperateFunction 5 | import com.fun.util.ZzyLmqDataOperateUtil 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | package object jdbcrdd extends RDDOperateFunction 9 | with SparkContextOperateFunction 10 | with ZzyLmqDataOperateUtil{ 11 | //可以通过继承类来获得,也可以直接写 12 | implicit class SparkContextNewFunction(sparkContext: SparkContext) { 13 | def lmq(name: String) = "" 14 | } 15 | 16 | //隐式参数的使用 17 | implicit class RDDNewFunction[T](rdd: RDD[T]) { 18 | def lmq3(str: String)(implicit impl:Array[T])=rdd.map { x => x + " : "+impl(0) } 19 | def lmq4[A](str: String)(implicit impl:Array[A])=rdd.map { x => x + " : "+impl(0) } 20 | } 21 | 22 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/kafka/HashMapEncoder.scala: 
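A note ahead of HashMapEncoder.scala below: its toBytes returns null, so a producer configured with it would publish empty payloads. A minimal working variant of the old kafka.serializer.Encoder contract could look like the sketch below; Java serialization of the map is an assumption, and the VerifiableProperties constructor parameter is only there because Kafka instantiates encoders reflectively. Illustrative only, not part of the repository.

// Sketch (not in the original repo): an encoder that actually serializes the HashMap
import java.io.{ByteArrayOutputStream, ObjectOutputStream}
import java.util.HashMap
import kafka.serializer.Encoder
import kafka.utils.VerifiableProperties

class SerializingHashMapEncoder(props: VerifiableProperties = null) extends Encoder[HashMap[String, Any]] {
  override def toBytes(map: HashMap[String, Any]): Array[Byte] = {
    val buffer = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(buffer)
    out.writeObject(map) // java.util.HashMap is Serializable; the stored values must be as well
    out.close()
    buffer.toByteArray
  }
}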
-------------------------------------------------------------------------------- 1 | package com.spark.kafka 2 | 3 | import kafka.serializer.Encoder 4 | import java.util.HashMap 5 | import kafka.serializer.StringEncoder 6 | 7 | 8 | class HashMapEncoder extends Encoder[HashMap[String,Any]]{ 9 | @Override 10 | def toBytes(a:HashMap[String,Any])= { 11 | 12 | null 13 | } 14 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/kafka/KafkaProducerCache.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-2016, Benjamin Fradet, and other contributors. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, 15 | * software distributed under the License is distributed on an 16 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 | * KIND, either express or implied. See the License for the 18 | * specific language governing permissions and limitations 19 | * under the License. 20 | */ 21 | 22 | package com.spark.kafka 23 | 24 | import java.util.Properties 25 | import org.apache.kafka.clients.producer.KafkaProducer 26 | 27 | import scala.collection.mutable 28 | 29 | /** Cache of [[KafkaProducer]]s */ 30 | 31 | object KafkaProducerCache { 32 | private val producers = mutable.HashMap.empty[Properties, KafkaProducer[_, _]] 33 | 34 | /** 35 | * Retrieve a [[KafkaProducer]] in the cache or create a new one 36 | * @param producerConfig properties for a [[KafkaProducer]] 37 | * @return a [[KafkaProducer]] already in the cache 38 | */ 39 | def getProducer[K, V](producerConfig: Properties): KafkaProducer[K, V] = { 40 | producers.getOrElse(producerConfig, { 41 | val producer = new KafkaProducer[K, V](producerConfig) 42 | producers(producerConfig) = producer 43 | producer 44 | }).asInstanceOf[KafkaProducer[K, V]] 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/spark/kafka/RDDKafkaWriter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2016-2016, Benjamin Fradet, and other contributors. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, 15 | * software distributed under the License is distributed on an 16 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 | * KIND, either express or implied. See the License for the 18 | * specific language governing permissions and limitations 19 | * under the License. 
20 | */ 21 | 22 | package com.spark.kafka 23 | 24 | import java.util.Properties 25 | 26 | import org.apache.kafka.clients.producer.ProducerRecord 27 | import org.apache.spark.rdd.RDD 28 | 29 | import scala.reflect.ClassTag 30 | 31 | /** 32 | * Class used for writing [[RDD]]s to Kafka 33 | * @param rdd [[RDD]] to be written to Kafka 34 | */ 35 | class RDDKafkaWriter[T](@transient private val rdd: RDD[T]) 36 | extends Serializable { 37 | /** 38 | * Write a [[RDD]] to Kafka 39 | * @param producerConfig properties for a [[org.apache.kafka.clients.producer.KafkaProducer]] 40 | * @param transformFunc a function used to transform values of T type into [[ProducerRecord]]s 41 | */ 42 | def writeToKafka[K, V]( 43 | producerConfig: Properties, 44 | transformFunc: T => ProducerRecord[K, V] 45 | ): Unit = 46 | rdd.foreachPartition { partition => 47 | val producer = KafkaProducerCache.getProducer[K, V](producerConfig) 48 | partition.map(transformFunc) 49 | .foreach(record => producer.send(record)) 50 | 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/spark/kafka/SparkKafkaRDDReader.scala: -------------------------------------------------------------------------------- 1 | package com.spark.kafka 2 | 3 | import org.apache.spark.streaming.kafka.KafkaClusterManager 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkConf 6 | 7 | object SparkKafkaRDDReader { 8 | var sc:SparkContext=null 9 | def main(args: Array[String]): Unit = { 10 | init 11 | val topics=Set("realtimereport") 12 | var kafkaParams = Map[String, String]( 13 | "metadata.broker.list" ->"kafka1:9092,kafka2:9092,kafka3:9092", 14 | "serializer.class" -> "kafka.serializer.StringEncoder", 15 | "group.id" -> "ZhiZiYunReportStorageRunMain_Box") 16 | 17 | val kafkaRdd= KafkaClusterManager.createKafkaRDD(sc, kafkaParams, topics) 18 | kafkaRdd.take(10).foreach(println) 19 | } 20 | def init() { 21 | val sparkConf = new SparkConf() 22 | .setMaster("local[2]") 23 | .setAppName("Test") 24 | .set("spark.streaming.kafka.maxRatePerPartition", "10") 25 | sc = new SparkContext(sparkConf) 26 | } 27 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/kafka/SparkWriteDataToKafkaRunMain.scala: -------------------------------------------------------------------------------- 1 | package com.spark.kafka 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.kafka.KafkaClusterManager 8 | import org.apache.kafka.clients.producer.ProducerRecord 9 | import java.util.Properties 10 | import org.apache.kafka.common.serialization.StringSerializer 11 | import org.slf4j.LoggerFactory 12 | import org.apache.kafka.clients.producer.KafkaProducer 13 | import java.util.HashMap 14 | import org.apache.kafka.clients.producer.ProducerConfig 15 | import org.apache.hadoop.hbase.HBaseConfiguration 16 | import org.apache.hadoop.hbase.client.ConnectionFactory 17 | import org.apache.hadoop.hbase.TableName 18 | import org.apache.hadoop.hbase.client.Get 19 | import org.apache.hadoop.hbase.util.Bytes 20 | import org.apache.hadoop.hbase.client.Put 21 | object SparkWriteDataToKafkaRunMain { 22 | var sc: SparkContext = null 23 | var ssc: StreamingContext = null 24 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 25 | import org.apache.log4j.{ Level, Logger } 26 | 
Logger.getLogger("org.apache.spark").setLevel(Level.ERROR) 27 | val zookeeper = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3" 28 | val producerConfig = { 29 | val p = new Properties() 30 | p.setProperty("bootstrap.servers", "kafka1:9092,kafka2:9092,kafka3:9092") 31 | p.setProperty("key.serializer", classOf[StringSerializer].getName) 32 | p.setProperty("value.serializer", classOf[StringSerializer].getName) 33 | p.setProperty("zookeeper.connect", "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3") 34 | p 35 | } 36 | val producer = new KafkaProducer[String, String](producerConfig) 37 | 38 | def main(args: Array[String]): Unit = { 39 | initSCC 40 | writeDataToKafka 41 | // send 42 | 43 | } 44 | def send() { 45 | for (i <- 1 to 20) { 46 | val producer = new KafkaProducer[String, String](producerConfig) 47 | val (rowkey, data) = (1, 2) 48 | producer.send(new ProducerRecord[String, String]("test", rowkey + "," + data)) 49 | producer.close() 50 | } 51 | 52 | } 53 | 54 | def writeDataToKafka() { 55 | //var topics = Set("smartadsdeliverylog") 56 | var topics = Set("test") 57 | var kafkaParams = Map[String, String]("metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092", 58 | "serializer.class" -> "kafka.serializer.StringEncoder", "group.id" -> "test", "zookeeper.connect" -> zookeeper) 59 | val dstream = KafkaClusterManager.createDirectStream(ssc, kafkaParams, topics) 60 | dstream.foreachRDD { rdd => 61 | println("#############################3333") 62 | rdd.map { 63 | case (rowkey, value) => 64 | val str = value.split(",") 65 | (str(0), (rowkey, str)) 66 | } 67 | .groupByKey 68 | .foreachPartition { x => 69 | var hconf = HBaseConfiguration.create() 70 | hconf.set("hbase.zookeeper.quorum", zookeeper) 71 | hconf.set("hbase.zookeeper.property.clientPort", "2181") 72 | val conn = ConnectionFactory.createConnection(hconf) 73 | val table = conn.getTable(TableName.valueOf("reportbox_2")) 74 | for ((tablename, rowjey_str) <- x) { 75 | for ((rowkey, str) <- rowjey_str) { 76 | var delivery = str(1).toInt 77 | var deliveryUV = str(2).toInt 78 | var cpmcost = str(3).toDouble 79 | var additionalcpmcost = str(4).toDouble 80 | var fee = str(5).toDouble 81 | var fee2 = str(6).toDouble 82 | var click = str(7).toInt 83 | var clickUV = str(8).toInt 84 | var reach = str(9).toInt 85 | var reachUV = str(10).toInt 86 | var visitLength = str(11).toInt 87 | var sencondsClick = str(12).toInt 88 | 89 | val get = new Get(Bytes.toBytes(rowkey)) 90 | val result = table.get(get) 91 | if (!result.isEmpty()) { 92 | val hdelivery = result.getValue("info".getBytes, "delivery".getBytes) 93 | val hdeliveryUV = result.getValue("info".getBytes, "deliveryUV".getBytes) 94 | val hcpmcost = result.getValue("info".getBytes, "cpmcost".getBytes) 95 | val hadditionalcpmcost = result.getValue("info".getBytes, "additionalcpmcost".getBytes) 96 | val hfee = result.getValue("info".getBytes, "fee".getBytes) 97 | val hfee2 = result.getValue("info".getBytes, "fee2".getBytes) 98 | val hclick = result.getValue("info".getBytes, "click".getBytes) 99 | val hclickUV = result.getValue("info".getBytes, "clickUV".getBytes) 100 | val hreach = result.getValue("info".getBytes, "reach".getBytes) 101 | val hreachUV = result.getValue("info".getBytes, "reachUV".getBytes) 102 | val hvisitLength = result.getValue("info".getBytes, "visitLength".getBytes) 103 | val hsencondsClick = result.getValue("info".getBytes, "sencondsClick".getBytes) 104 | 105 | if (hdelivery!= null) delivery = delivery + new String(hdelivery).toInt 106 | if (hdeliveryUV!= null) deliveryUV = 
deliveryUV + new String(hdeliveryUV).toInt 107 | if (hcpmcost != null) cpmcost = cpmcost + new String(hcpmcost).toDouble 108 | if (hadditionalcpmcost != null) additionalcpmcost = additionalcpmcost + new String(hadditionalcpmcost).toDouble 109 | if (hfee != null) fee = fee + new String(hfee).toDouble 110 | if (hfee2 != null) fee2 = fee2 + new String(hfee2).toDouble 111 | if (hclick != null) click = click + new String(hclick).toInt 112 | if (hclickUV != null) clickUV = clickUV + new String(hclickUV).toInt 113 | if (hreach != null) reach = reach + new String(hreach).toInt 114 | if (hreachUV != null) reachUV = reachUV + new String(hreachUV).toInt 115 | if (hvisitLength != null) visitLength = visitLength + new String(hvisitLength).toInt 116 | if (hsencondsClick != null) sencondsClick = sencondsClick + new String(hsencondsClick).toInt 117 | 118 | 119 | 120 | } 121 | val put = new Put(Bytes.toBytes(rowkey)) 122 | put.addColumn("info".getBytes, "delivery".getBytes, delivery.toString().getBytes) 123 | put.addColumn("info".getBytes, "deliveryUV".getBytes, deliveryUV.toString().getBytes) 124 | put.addColumn("info".getBytes, "cpmcost".getBytes, cpmcost.toString().getBytes) 125 | put.addColumn("info".getBytes, "additionalcpmcost".getBytes, additionalcpmcost.toString().getBytes) 126 | put.addColumn("info".getBytes, "fee".getBytes, fee.toString().getBytes) 127 | put.addColumn("info".getBytes, "fee2".getBytes, fee2.toString().getBytes) 128 | put.addColumn("info".getBytes, "click".getBytes, click.toString().getBytes) 129 | put.addColumn("info".getBytes, "clickUV".getBytes, clickUV.toString().getBytes) 130 | put.addColumn("info".getBytes, "reach".getBytes, reach.toString().getBytes) 131 | put.addColumn("info".getBytes, "reachUV".getBytes, reachUV.toString().getBytes) 132 | put.addColumn("info".getBytes, "visitLength".getBytes, visitLength.toString().getBytes) 133 | put.addColumn("info".getBytes, "sencondsClick".getBytes, sencondsClick.toString().getBytes) 134 | table.put(put) 135 | 136 | } 137 | } 138 | table.close() 139 | conn.close() 140 | } 141 | 142 | //rdd.writeToKafka(producerConfig,s=>new ProducerRecord[String, String]("test", "@@@@@@")) 143 | println("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@22") 144 | } 145 | ssc.start() 146 | ssc.awaitTermination() 147 | } 148 | def initSC() { 149 | val sparkConf = new SparkConf() 150 | .setMaster("local[2]") 151 | .setAppName("Test") 152 | sc = new SparkContext(sparkConf) 153 | 154 | } 155 | def initSCC() { 156 | if (sc == null) { 157 | initSC 158 | } 159 | ssc = new StreamingContext(sc, Seconds(30)) 160 | } 161 | 162 | def tran(s: (String, String)) = new ProducerRecord[String, String]("test", s._1) 163 | 164 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/kafka/package.scala: -------------------------------------------------------------------------------- 1 | package com.spark 2 | 3 | import org.apache.spark.rdd.RDD 4 | package object kafka { 5 | implicit def writeDataToKafka2[T](rdd: RDD[T])=new RDDKafkaWriter(rdd) 6 | 7 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/ml/ALSDemo.scala: -------------------------------------------------------------------------------- 1 | package com.spark.ml 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.recommendation.Rating 6 | import org.apache.spark.mllib.recommendation.ALS 7 | import org.apache.spark.mllib.recommendation.Rating 8 | 
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel 9 | import java.util.ArrayList 10 | import org.apache.spark.mllib.evaluation.RegressionMetrics 11 | import org.apache.spark.mllib.recommendation.ALS 12 | import org.apache.log4j.Logger 13 | import org.apache.log4j.Level 14 | /** 15 | * 训练模型其实就是为了选参数 16 | * 使用一部分已知的数据进行训练 17 | * 当预测结果的评分和真是数据的均方差较小时或达到要求时,我们就可以保存此模型(参数配置) 18 | * 对大数据集进行预测评分然后存储在 数据库中 19 | * 使用时,只要传入user的id,就可以找到预测的评分并排序,得到较高的评分就进行推荐 20 | * ALS.train表示训练一个ALS模型,model.predict表示使用这个模型进行预测 21 | */ 22 | object ALSDemo { 23 | def main(args: Array[String]): Unit = { 24 | val conf = new SparkConf() 25 | .setMaster("local") 26 | .setAppName("Spark Pi") 27 | System.setProperty("hadoop.home.dir", "f:\\eclipse\\hdplocal2.6.0") 28 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 29 | val sc = new SparkContext(conf) 30 | //rank 因子(系数k,矩阵分解是需要 A(X*Y)=U(X*k)L(k*Y) ) numIterations迭代次数 31 | var rank = 10 32 | var numIterations = 19 33 | println(makeModel(sc, rank, numIterations)) 34 | makeRecommend(sc, rank, numIterations) 35 | /* var resultMSE=new ArrayList[String] 36 | for(numIterations<- 30 until 31){ 37 | val MSE= makeModel(sc,rank,numIterations) 38 | resultMSE.add(numIterations+":"+MSE) 39 | } 40 | println(resultMSE)*/ 41 | } 42 | def makeRecommend(sc: SparkContext, rank: Int, numIterations: Int) { 43 | //数据为 用户 item 评分 时间戳 44 | //取前三个数据 45 | val data = sc.textFile("file:\\F:\\workspace\\BigData-Test-OtherDemos\\inputFile\\test2.data", sc.defaultMinPartitions) 46 | val ratings = data.map { _.split(",").take(3) } 47 | .map { x => Rating(x(0).toInt, x(1).toInt, x(2).toDouble) } 48 | //训练模型 49 | val model = ALS.train(ratings, rank, numIterations, 0.01) 50 | //获得用户和商品的数据集 51 | 52 | //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>. 
53 | /** 54 | * 使用训练出来的模型进行使用,计算+推荐 55 | */ 56 | val users = data.map { _.split(",").take(3) }.map { x => x(0) }.distinct().collect() 57 | users.foreach { 58 | //一次为每个用户推荐 59 | user => 60 | { 61 | val rs = model.recommendProducts(user.toInt, 10) //参数一为用户,二为返回前几 62 | var values = "" 63 | var key = 0 64 | //拼接推荐结果 65 | rs.foreach { r => 66 | { 67 | key = r.user 68 | values = values + r.product + ":" + r.rating + "\n" 69 | } 70 | } 71 | //打印推荐结果 72 | println(key.toString() + " => " + values) 73 | } 74 | } 75 | //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 76 | } 77 | 78 | def makeModel(sc: SparkContext, rank: Int, numIterations: Int): Double = { 79 | //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 80 | /** 81 | * 这一部分是为了训练模型用的 82 | */ 83 | 84 | //数据为 用户 item 评分 时间戳 85 | //取前三个数据 86 | val data = sc.textFile("file:\\F:\\workspace\\BigData-Test-OtherDemos\\inputFile\\test2.data", sc.defaultMinPartitions) 87 | val ratings = data.map { _.split(",") } 88 | .map { x => Rating(x(0).toInt, x(1).toInt, x(2).toDouble) } 89 | //训练模型 90 | val model = ALS.train(ratings, rank, numIterations, 0.01) 91 | //这里是要生成user product对,每一对都会生成预测,但是如果没有对的话就不生成预测 92 | //val userProducts=ratings.map { case Rating(user,product,rate) => (user,product)} 93 | val user = sc.textFile("file:\\F:\\workspace\\BigData-Test-OtherDemos\\inputFile\\user", sc.defaultMinPartitions).map { _.toInt } 94 | val product = sc.textFile("file:\\F:\\workspace\\BigData-Test-OtherDemos\\inputFile\\product", sc.defaultMinPartitions).map { _.toInt } 95 | //笛卡尔积 96 | val userProducts = user.cartesian(product) 97 | //predict使用推荐模型对用户商品进行预测评分,得到预测评分的数据集,这是所有的预测对, 98 | //前面是user,后面是product,如果没有出现,则不对这一对进行评测 99 | //如果,有一个product从未出现过,即使你写了对了,那也 不会有预测结果的 100 | val predictions = model.predict(userProducts).map { case Rating(user, product, rate) => ((user, product), rate) } 101 | //将真实的评分数据集合预测评分数据集进行合并 102 | val ratesAndPreds = ratings.map { case Rating(user, product, rate) => ((user, product), rate) } 103 | .join(predictions) 104 | //可以清楚地看到,实际评分和预测评分 105 | ratesAndPreds.foreach(println) 106 | //然后计算预测的和实际的均方差,均方差越小说明越准确,mean求平均值 107 | val MSE = ratesAndPreds.map { 108 | case ((user, products), (r1, r2)) => 109 | val err = (r1 - r2) 110 | err * err 111 | }.mean() 112 | //使用ALS内置的MSE评估 113 | val predictedAndTrue = ratesAndPreds.map { case ((user, products), (r1, r2)) => (r1, r2) } 114 | val DefaultMSE = new RegressionMetrics(predictedAndTrue) 115 | //将模型保存 116 | // model.save(sc, "") 117 | //加载一个model 118 | // val loadModel=MatrixFactorizationModel.load(sc, "") 119 | 120 | //打印方差和预测结果 121 | println("这是预测评分和实际评分的均方差:" + MSE) 122 | println("这是内置的预测评分和实际评分的均方差MSE:" + DefaultMSE.meanSquaredError) 123 | //如果均方差满意的话可以将预测的评分进行存储 124 | val result = predictions.map { 125 | case ((user, product), rate) => (user, (product, rate)) 126 | } 127 | .groupByKey 128 | .map { data => 129 | { 130 | val resultda = data._2.map(product => { 131 | data._1 + "::" + product._1 + "::" + product._2 132 | }) 133 | resultda 134 | } 135 | } 136 | result.flatMap(x => x).foreach { println } 137 | //result.flatMap(x=>x).saveAsTextFile("outfile/ASLresult") 138 | return MSE 139 | //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 140 | } 141 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/ml/ClassifierDemo.scala: -------------------------------------------------------------------------------- 1 | package com.spark.ml 2 | 3 | import 
org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithSGD 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.mllib.classification.NaiveBayes 10 | import org.apache.spark.mllib.classification.SVMWithSGD 11 | import org.apache.spark.mllib.tree.DecisionTree 12 | import org.apache.spark.mllib.tree.configuration.Algo 13 | import org.apache.spark.mllib.tree.impurity.Entropy 14 | /** 15 | * 线性模型(逻辑回归,线性支持向量机SVM) 16 | *线性模型的核心思想是对样本的预测结果进行建模,即对输入变量(特征矩阵)应用简单的线性预测函数y=f(w*x) 17 | *线性预测函数是用来训练数据的(得出w权重矩阵),使用逻辑回归或者线性支持向量机SVM(损失函数)来得出预测值(传入特征矩阵和w) 18 | *根据精确度来判断使用哪个损失函数来得出预测值 19 | *x为输入的特征矩阵,y为值(预测值) 20 | *在训练模型的时候,y为实际值,x为特征,存在一个权重向量能够最小化所有训练样本的由损失函数计算出来的误差最小。(最终是要求一个w) 21 | * 1 1.1951146419526084 0.9947742549449248 0.19840725400812698 2.48569644222758 1.7391898607628944 22 | * 第一个为结果值(分类) 后面为特征值 23 | */ 24 | object ClassifierDemo { 25 | def main(args: Array[String]): Unit = { 26 | val conf = new SparkConf() 27 | .setMaster("local") 28 | .setAppName("ClassifierDemo") 29 | System.setProperty("hadoop.home.dir", "D:\\eclipse\\hdplocal2.6.0") 30 | val sc = new SparkContext(conf) 31 | var numIterations=10 32 | 33 | val data=sc.textFile("inputFile/lr_data.txt", sc.defaultMinPartitions) 34 | //提取特征向量 35 | val records=data.map { line => line.split(" ") } 36 | 37 | //logisticRegressionWithSGDModel(records,numIterations) 38 | //naiveBayesModel(records,numIterations) 39 | //svmWithSGDModel(records,numIterations) 40 | decisionTree(sc,numIterations) 41 | 42 | 43 | } 44 | /** 45 | * 逻辑回归模型 46 | */ 47 | def logisticRegressionWithSGDModel(records:RDD[Array[String]],numIterations:Int){ 48 | //清洗数据,取第一个为结果值,后面为特征值 49 | val rawData=records.map { r => { 50 | val label=if(r(0).toInt<0) 0.0 else r(0).toInt 51 | val features=r.slice(1, r.size-1).map (_.toDouble) 52 | LabeledPoint(label,Vectors.dense(features))} 53 | } 54 | val lrModel=LogisticRegressionWithSGD.train(rawData, numIterations) 55 | //计算准确率 56 | val lrAccuracy=rawData.map { point => { 57 | if(lrModel.predict(point.features) == point.label) 1 else 0 58 | } 59 | }.sum()/rawData.count() 60 | println("准确率:"+lrAccuracy) 61 | // 62 | 63 | /* //传入一个特征矩阵来预测这个产品是属于哪一类的,结果为0或者1 64 | val prediction=lrModel.predict(rawData.map { data => data.features }).collect() 65 | //实际的结果值 66 | val label=rawData.map { x => x.label }.collect() 67 | 68 | for(i<- 0 until label.length){ 69 | if(prediction.apply(i)==label.apply(i)){ 70 | println("预测:"+prediction.apply(i)+"->>> 实际:"+label.apply(i)) 71 | }else 72 | { 73 | println("预测:"+prediction.apply(i)+"@@@@@ 实际:"+label.apply(i)) 74 | } 75 | }*/ 76 | } 77 | 78 | /** 79 | * 朴素贝叶斯模型(特征值不允许为负) 80 | * map{x=> if(x.toDouble<0) 0.0 else x.toDouble} 81 | */ 82 | def naiveBayesModel(input:RDD[Array[String]],numIterations:Int){ 83 | val rawData=input.map { r => { 84 | val label=if(r(0).toInt<0) 0.0 else r(0).toInt 85 | val features=r.slice(1, r.size-1).map{x=> if(x.toDouble<0) 0.0 else x.toDouble} 86 | LabeledPoint(label,Vectors.dense(features))} 87 | } 88 | val nbModel=NaiveBayes.train(rawData,numIterations) 89 | 90 | //计算准确率 91 | val nbAccuracy=rawData.map { point => { 92 | if(nbModel.predict(point.features) == point.label) 1 else 0 93 | } 94 | }.sum()/rawData.count() 95 | println("准确率:"+nbAccuracy) 96 | // 97 | 98 | /* 99 | val prediction=nbModel.predict(rawData.map { data => data.features }).collect() 100 | //实际的结果值 101 | val label=rawData.map 
{ x => x.label }.collect() 102 | 103 | for(i<- 0 until label.length){ 104 | if(prediction.apply(i)==label.apply(i)){ 105 | println("预测:"+prediction.apply(i)+"->>> 实际:"+label.apply(i)) 106 | }else 107 | { 108 | println("预测:"+prediction.apply(i)+"@@@@@ 实际:"+label.apply(i)) 109 | } 110 | }*/ 111 | 112 | 113 | } 114 | 115 | /** 116 | * SVM模型 117 | */ 118 | def svmWithSGDModel(input:RDD[Array[String]],numIterations:Int){ 119 | val rawData=input.map { r => { 120 | val label=if(r(0).toInt<0) 0.0 else r(0).toInt 121 | val features=r.slice(1, r.size-1).map(_.toDouble) 122 | LabeledPoint(label,Vectors.dense(features))} 123 | } 124 | val svmModel=SVMWithSGD.train(rawData,numIterations) 125 | 126 | //计算准确率 127 | val svmAccuracy=rawData.map { point => { 128 | if(svmModel.predict(point.features) == point.label) 1 else 0 129 | } 130 | }.sum()/rawData.count() 131 | println("准确率:"+svmAccuracy) 132 | // 133 | } 134 | /** 135 | * 决策树 136 | */ 137 | def decisionTree(sc:SparkContext,numIterations:Int){ 138 | val data=sc.textFile("inputFile/sample_tree_data.csv", sc.defaultMinPartitions) 139 | //提取特征向量 140 | val records=data.map { line => line.split(",") } 141 | val rawData=records.map { r => { 142 | val label=r(0).toInt 143 | val features=r.slice(1, r.size-1).map(_.toDouble) 144 | LabeledPoint(label,Vectors.dense(features))} 145 | } 146 | //2-折叠交叉验证,将原始数据0.9分为训练数据,0.1分为测试数据 147 | val Array(trainData,cvData)=rawData.randomSplit(Array(0.9,0.1), 123) 148 | 149 | val treeModel=DecisionTree.train(trainData, Algo.Classification, Entropy, 29) 150 | 151 | 152 | 153 | //计算准确率 154 | val treeAccuracy=cvData.map { point => { 155 | if(treeModel.predict(point.features) == point.label) 1 else 0 156 | } 157 | }.sum()/cvData.count() 158 | println("准确率:"+treeAccuracy) 159 | // 160 | 161 | } 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/ml/TestVector.scala: -------------------------------------------------------------------------------- 1 | package com.spark.ml 2 | 3 | import org.apache.spark.mllib.linalg.{ Vector, Vectors } 4 | import org.apache.spark.mllib.linalg.DenseMatrix 5 | import breeze.linalg._ 6 | import org.apache.spark.mllib.linalg.DenseVector 7 | import org.apache.spark.mllib.linalg.SparseVector 8 | object TestVector { 9 | def main(args: Array[String]): Unit = { 10 | //创建一个稠密矩阵 11 | var a = Vectors.dense(1.0, 2.0, 3.0) 12 | 13 | var b = Vectors.dense(1.0, 2.0, 3.0) 14 | var a2=new SparseVector(1,Array(0, 1, 2),Array(1.0, 2.0, 3.0)) 15 | //a2.dot(a2) 16 | var a1=new DenseVector(Array(1.0, 2.0, 3.0)) 17 | //a1.dot(b) 18 | // b.toDense.dot(a.toDense) 19 | } 20 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/myrdd/CaseClassUtil.scala: -------------------------------------------------------------------------------- 1 | package com.spark.myrdd 2 | 3 | object CaseClassUtil extends Serializable{ 4 | 5 | case class User(name:String,age:Int,phone:String) 6 | case class Address(name:String,address:String,phone:String) 7 | case class Detail(name:String,phone:String) 8 | 9 | case class Table1(name:String,age:Int,address:String) 10 | case class Table2(name:String,age:Int) 11 | 12 | case class HiveTempTable(id:Int,name:String) 13 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/myrdd/ImplicitParameter.scala: 
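A note on TestVector.scala above: the .dot calls are commented out because the MLlib Vector API used here does not expose a public dot method. One option, assuming Breeze is on the classpath (it ships as an MLlib dependency), is to convert to Breeze vectors first; the snippet below is a sketch, not part of the repository.

// Sketch (not in the original repo): dot product of two MLlib dense vectors via Breeze
import breeze.linalg.{DenseVector => BDV}
import org.apache.spark.mllib.linalg.Vectors

object VectorDotSketch {
  def main(args: Array[String]): Unit = {
    val a = Vectors.dense(1.0, 2.0, 3.0)
    val b = Vectors.dense(4.0, 5.0, 6.0)
    val dot = BDV(a.toArray) dot BDV(b.toArray) // 1*4 + 2*5 + 3*6
    println(dot) // 32.0
  }
}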
-------------------------------------------------------------------------------- 1 | package com.spark.myrdd 2 | 3 | trait ImplicitParameter { 4 | //隐式参数 5 | implicit val a = Array[String]("@") 6 | implicit val b = Array[Int](1) 7 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/myrdd/MySelfRDD.scala: -------------------------------------------------------------------------------- 1 | package com.spark.myrdd 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.Partition 5 | import org.apache.spark.TaskContext 6 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 7 | import org.apache.spark.SparkContext 8 | //继承与RDD[String]表示没有前置的rdd。 9 | //这个rdd可以从sc直接获取。而不是一个rdd转换成这个rdd 10 | class MySelfRDD(@transient sc : SparkContext,val strs:Array[String])extends RDD[String](sc,Nil){ 11 | //这个函数是用来计算RDD中每个的分区的数据 12 | override def compute(split: Partition, context: TaskContext):Iterator[String] ={ 13 | //得到切片的数据 14 | val splits = split.asInstanceOf[MySelfPartition] 15 | Array[String](splits.content).toIterator 16 | } 17 | //getPartitions函数允许开发者为RDD定义新的分区 18 | override protected def getPartitions: Array[Partition] ={ 19 | val array = new Array[Partition](strs.size) 20 | for (i <- 0 until strs.size) { 21 | array(i) = new MySelfPartition(i, strs(i)) 22 | } 23 | array 24 | } 25 | } 26 | 27 | class MySelfRDD2(parent:RDD[String],data:String)extends RDD[String](parent){ 28 | //这个函数是用来计算RDD中每个的分区的数据 29 | override def compute(split: Partition, context: TaskContext):Iterator[String] ={ 30 | //得到切片的数据 31 | parent.iterator(split, context).map { x => data+x } 32 | } 33 | //getPartitions函数允许开发者为RDD定义新的分区 34 | override protected def getPartitions: Array[Partition] = 35 | parent.partitions 36 | } 37 | 38 | class MySelfPartition(idx: Int, val content: String) extends Partition { 39 | override def index: Int = idx 40 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/myrdd/TestMain.scala: -------------------------------------------------------------------------------- 1 | package com.spark.myrdd 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.rdd.CoGroupedRDD 7 | object TestMain { 8 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 9 | def main(args: Array[String]): Unit = { 10 | var conf = new SparkConf() 11 | .setMaster("local") 12 | .setAppName("SparkStreaming Flume") 13 | 14 | var sc = new SparkContext(conf) 15 | testMySelfRDD(sc) 16 | 17 | } 18 | implicit class CustomFunctions3(rdd:RDD[String]) { 19 | //将两个字符串合并 20 | def mergeString(data:String) = new MySelfRDD2(rdd,data) 21 | } 22 | /** 23 | * 自定义一个RDD,将一个RDD转换成自定义的RDD。使用隐式函数 24 | */ 25 | def testMySelfRDD(sc:SparkContext){ 26 | val preData=sc.parallelize(Array("a","2")) 27 | var result=preData.mergeString("@") 28 | result.take(10).foreach { println } 29 | 30 | } 31 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/myrdd/package.scala: -------------------------------------------------------------------------------- 1 | package com.spark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | import com.fun.util.RDDOperateFunction 6 | import com.fun.util.SparkContextOperateFunction 7 | import com.fun.util.SparkContextOperateFunction 8 | package object myrdd extends RDDOperateFunction 9 | with 
SparkContextOperateFunction 10 | with ImplicitParameter{ 11 | //可以通过继承类来获得,也可以直接写 12 | implicit class SparkContextNewFunction(sparkContext: SparkContext) { 13 | def lmq(name: String) = "" 14 | } 15 | 16 | //隐式参数的使用 17 | implicit class RDDNewFunction[T](rdd: RDD[T]) { 18 | def lmq3(str: String)(implicit impl:Array[T])=rdd.map { x => x + " : "+impl(0) } 19 | def lmq4[A](str: String)(implicit impl:Array[A])=rdd.map { x => x + " : "+impl(0) } 20 | } 21 | 22 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/python/TestPython.scala: -------------------------------------------------------------------------------- 1 | package com.spark.python 2 | 3 | object TestPython { 4 | def main(args: Array[String]): Unit = { 5 | val p=Runtime.getRuntime().exec("./test.py") 6 | p.waitFor() 7 | println(p.exitValue()) 8 | } 9 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/scala/ImplicitClass.scala: -------------------------------------------------------------------------------- 1 | package com.spark.scala 2 | 3 | trait ImplicitClass { 4 | implicit def toD(str:String)=str.toDouble 5 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/scala/ReflectScala.scala: -------------------------------------------------------------------------------- 1 | package com.spark.scala 2 | 3 | import java.io.File 4 | import java.net.URLClassLoader 5 | import org.apache.hadoop.fs.FileSystem 6 | import org.apache.hadoop.conf.Configuration 7 | import org.apache.hadoop.fs.Path 8 | import scala.collection.mutable.ArrayBuffer 9 | import java.net.URL 10 | import org.apache.hadoop.fs.FsUrlStreamHandlerFactory 11 | 12 | object ReflectScala { 13 | def main(args: Array[String]): Unit = { 14 | //val c=Class.forName("com.dmp.dataflow.fg.feture.FG1FeatureCalculate") 15 | // c.getMethod("printwoed", classOf[String]).invoke(a.newInstance(), "hello world") 16 | loadHdfsJar 17 | 18 | } 19 | /** 20 | * 动态加载jar包 21 | */ 22 | def loadHdfsJar() { 23 | URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory()); 24 | val fs = FileSystem.get(new Configuration) 25 | val jarPath = "/user/linmingqiang/test-0.0.1-SNAPSHOT.jar" 26 | val url = fs.getFileStatus(new Path(jarPath)).getPath().toUri().toURL() 27 | val d = new URLClassLoader(Array(url), Thread.currentThread().getContextClassLoader()) 28 | val a = d.loadClass("test.HelloWord") 29 | //因为该方法是一个静态的方法,所以这个地方的invoke只要填null就可以了。但是如果不是一个静态方法,就需要一个实例a.newInstance() 30 | //a.getMethod("printwoed", classOf[String]).invoke(null, "hello world") 31 | a.getMethod("printwoed", classOf[String]).invoke(a.newInstance(), "hello world") 32 | } 33 | def loadLocalJar() { 34 | val url = new File("C:\\Users\\zhiziyun\\Desktop\\test-0.0.1-SNAPSHOT.jar").toURI().toURL() 35 | val d = new URLClassLoader(Array(url), Thread.currentThread().getContextClassLoader()) 36 | val a = d.loadClass("test.HelloWord") 37 | //因为该方法是一个静态的方法,所以这个地方的invoke只要填null就可以了。但是如果不是一个静态方法,就需要一个实例 38 | //a.getMethod("test").invoke(a.newInstance()) 39 | a.getMethod("printwoed", classOf[String]).invoke(a.newInstance(), "hello world") 40 | 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/scala/ScalaGramaer.scala: -------------------------------------------------------------------------------- 1 | package com.spark.scala 2 | 3 | import java.util.ArrayList 4 | import 
scala.collection.mutable.ArrayBuffer 5 | import scala.collection.JavaConversions._ 6 | import scala.collection.mutable.Map 7 | import java.util.HashMap 8 | import scala.io.Source 9 | import java.io.File 10 | import scala.collection.Iterator 11 | import sun.org.mozilla.javascript.internal.ast.Yield 12 | import scala.reflect.ClassTag 13 | import java.io.PrintWriter 14 | import scala.tools.cmd.Opt.Implicit 15 | import scala.reflect.internal.util.StringOps 16 | import java.math.BigDecimal 17 | object ScalaGramaer { 18 | var list = new ArrayList[String] 19 | //implicit val aa="a" 20 | implicit def testimplicit(implicit i: String) = { 21 | i.toInt 22 | } 23 | implicit def testimplicit2(i: String) = { 24 | i.toInt 25 | } 26 | def main(args: Array[String]): Unit = { 27 | //listGrammer() 28 | //mapGrammer() 29 | //tupleGrammer() 30 | //IteratorGrammer 31 | //var b=aas(1,"1",(_+_+_+55)) 32 | // writeFile 33 | //setGrammer 34 | //mapResultTest 35 | val a=new ArrayBuffer[Int]() 36 | a.++=(Array(1,2,3,4,5)) 37 | val b=a.toIterator 38 | 39 | val c=b.map { x => x+10 } 40 | 41 | c.foreach { println } 42 | 43 | 44 | } 45 | def regx(){ 46 | val regex="[0-9]".r 47 | println(regex.replaceAllIn("123 admin", "x")) 48 | } 49 | def ffun(){ 50 | val a=1.10010011 51 | val b=f"$a%.2f" 52 | println(b) 53 | } 54 | def ziptest(){ 55 | val l1 = 1 to 10 toList 56 | val l2 = l1.tail 57 | l2.foreach { println } 58 | println(">>>>") 59 | val l3=l1.zip(l2) 60 | l3.foreach { println } 61 | println(">>>>") 62 | l3.map(p=>((p._2 - p._1),p._2+"-"+p._1)).foreach { println } 63 | } 64 | def implicitTest(){ 65 | var a: String = "laal" 66 | var i: Int = a 67 | println(i) 68 | var b:Int="as" 69 | 70 | } 71 | def mapResultTest() { 72 | var a = Set(1, 2, 3, 4) 73 | println(a.+(5)) 74 | } 75 | def writeFile() { 76 | var fw = new PrintWriter(new File("test2")) 77 | fw.write(">>>>>>>>>") 78 | fw.close() 79 | 80 | } 81 | def aas[U: ClassTag](key: Int, value: String, a: (Int, String, Int) => U) = { 82 | a(key, value, key) 83 | 84 | } 85 | def IteratorGrammer() { 86 | var a = Array(Array("1", "2"), Array("3", "4"), Array("5", "6")) //不适用tolist的话,就只能遍历一次 87 | var fun1 = (x: Array[String]) => true 88 | var c = a.toIterator.filter { fun1 } 89 | var b = for { 90 | i <- a.toIterator 91 | c <- i 92 | if c > "2" 93 | if c < "6" 94 | } yield c 95 | //b.foreach { println } 96 | //b.foreach { println } 97 | c.foreach { println } 98 | } 99 | def setGrammer() { 100 | var a = Array(1, 2, 3, 4) 101 | var b = 102 | for { 103 | i <- a 104 | } yield { if (i > 2) i + 1 else i } 105 | 106 | for (i <- b) 107 | println(i) 108 | } 109 | /** 110 | * scala集合操作 111 | * 1.想要使用java的集合,需要导入 112 | * import scala.collection.JavaConversions._ 113 | * 会内部将java的集合转换为scala的集合 114 | * 2.java的集合和scala的集合不能显式转换,但是可以隐式转换,如,SparkContext.parallelize(data) 115 | * 需要的是一个scala的data,但是可以传一个java的集合 116 | */ 117 | def fileGrammer() { 118 | // var file=Source.fromFile("D:\\tmp\\input\\smy_biz_dil\\part-m-00000", "utf-8") 119 | //var file=Source.fromURL("http://www.baidu.com", "utf-8") 120 | // file.getLines.foreach { println }; 121 | //bian li mulu 122 | /*walk(new File("D:\\tmp\\input\\")) 123 | list.foreach { println }*/ 124 | 125 | } 126 | 127 | //遍历路径下所有的文件 128 | def walk(file: File) { 129 | if (file.isDirectory()) file.listFiles().foreach(walk) else list.add(file.getPath()) 130 | } 131 | def readAllfiles(dir: File): Iterator[File] = { 132 | //scan a dir return all file 133 | var child = dir.listFiles().filter { _.isDirectory() } 134 | child.toIterator ++ child.toIterator.flatMap { 
readAllfiles _ } 135 | } 136 | def listGrammer() { 137 | //遍历集合,可以有下标无下标 138 | var list = new ArrayList[String](); list.add("s") 139 | for (value <- list) println(value) 140 | for (i <- 0.until(list.length)) println(list(i)) 141 | for (i <- 0 until list.length) println(list(i)) 142 | 143 | } 144 | def mapGrammer() { 145 | //mutable可变的 146 | var map = Map("a" -> 1, "b" -> 2) 147 | println(map("a")) 148 | //用get返回的是一个option 149 | println(map.get("b")) 150 | println(map.get("c")) 151 | //改变一个key的值 152 | map("a") = 6 153 | println(map("a")) 154 | //新增一个值 155 | map += "c" -> 3 156 | println(map("c")) 157 | //移除一个值 158 | map -= "c" 159 | println(map.getOrElse("c", "无这个key")) 160 | //如果有这个key就返回key的值 161 | println(map.getOrElse("null", "无这个key")) 162 | 163 | //遍历一个map 164 | println("遍历一个map") 165 | for ((k, value) <- map) { 166 | println(k + ":" + value) 167 | } 168 | println("遍历一个map的key") 169 | for (k <- map.keySet) { 170 | println(k) 171 | } 172 | 173 | } 174 | def tupleGrammer() { 175 | //元祖类型Tuple可以是多元的 176 | var tuple1 = (1) 177 | var tuple2 = ("1", 2) 178 | var tuple3 = ("1", 2, "3") 179 | var tuple4 = ("1", 2, "3", 4) 180 | println(tuple3._3) 181 | 182 | } 183 | 184 | /** 185 | * @author Administrator 186 | */ 187 | class Person(n: String) { 188 | //必须初始化属性 189 | var name = n; 190 | var age = 0; 191 | var address = ""; 192 | //这是一个辅助构造器,scala的构造器必须以另一个构造器为起点,否则报错 193 | def this(name: String, age: Int) { 194 | this(name) 195 | this.age = age 196 | } 197 | def this(name: String, age: Int, address: String) { 198 | this(name, age) 199 | this.address = address 200 | } 201 | } 202 | 203 | } 204 | 205 | -------------------------------------------------------------------------------- /src/main/scala/com/spark/scalatest/ScalaTest.scala: -------------------------------------------------------------------------------- 1 | package com.spark.scalatest 2 | 3 | import org.scalatest.FlatSpec 4 | import org.scalatest.Matchers 5 | import scala.collection.mutable.Stack 6 | 7 | class ScalaTest extends FlatSpec with Matchers{ 8 | "a" should "b" in{ 9 | val stack = new Stack[Int] 10 | stack.push(1) 11 | stack.push(2) 12 | stack.pop() should be (2) 13 | stack.pop() should be (1) 14 | } 15 | it should "throw NoSuchElementException if an empty stack is popped" in { 16 | val emptyStack = new Stack[Int] 17 | a [NoSuchElementException] should be thrownBy { 18 | emptyStack.pop() 19 | } 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /src/main/scala/com/spark/sparkSql/CaseClassUtil.scala: -------------------------------------------------------------------------------- 1 | package com.spark.sparkSql 2 | 3 | object CaseClassUtil extends Serializable{ 4 | 5 | case class User(name:String,age:Int,phone:String) 6 | 7 | case class Address(name:String,address:String,phone:String) 8 | case class Detail(name:String,phone:String) 9 | 10 | case class Table1(name:String,age:Int,address:String) 11 | case class Table2(name:String,age:Int) 12 | 13 | case class HiveTempTable(id:Int,name:String) 14 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/sparkSql/JavaUseScalaClass.scala: -------------------------------------------------------------------------------- 1 | package com.spark.sparkSql 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.SQLContext 5 | import org.apache.spark.sql.types.StructField 6 | import org.apache.spark.sql.types.StructType 7 | import 
org.apache.spark.sql.types.StringType 8 | import org.apache.spark.sql.Row 9 | import scala.collection.mutable.ArrayBuffer 10 | import java.util.HashMap 11 | import scala.collection.JavaConversions._ 12 | import scala.collection.JavaConverters._ 13 | import java.util.ArrayList 14 | import java.util.Map 15 | import java.util.List 16 | import com.spark.sparkSql.CaseClassUtil._ 17 | import org.apache.spark.api.java.JavaRDD 18 | 19 | //在sc.parallelize(data)中的data是一个scala的集合,如果放入java的集合(ArrayList)的话会报错, 20 | //加入import scala.collection.JavaConversions._就不会报错了,内部会自己转换 21 | class JavaUseScalaClass(sc:SparkContext,sqlContext:SQLContext) { 22 | def userRDDToDataFrame(data:ArrayList[HashMap[String,String]],tableName:String){ 23 | var liens=sc.parallelize(data).map(t=>User(name=t.get("name"),age=t.get("age").toInt,phone=t.get("phone"))) 24 | sqlContext.createDataFrame(liens).registerTempTable(tableName) 25 | } 26 | 27 | def addressRDDToFrame(data:ArrayList[HashMap[String,String]],tableName:String){ 28 | var liens=sc.parallelize(data).map(t=>Address(name=t.get("name"),t.get("address"),phone=t.get("phone"))) 29 | sqlContext.createDataFrame(liens)registerTempTable(tableName) 30 | } 31 | //第二种指定Schema,需要这个ROW 32 | def secondRDDToFrame(data:ArrayList[HashMap[String,String]]){ 33 | var liens=sc.parallelize(data).map(p=>Row(p.get("name"),p.get("phone"))) 34 | var schemaString = "name phone" 35 | val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true))) 36 | sqlContext.createDataFrame(liens, schema)registerTempTable("Detail") 37 | } 38 | def show(sql:String):List[Row]={ 39 | var data=sqlContext.sql(sql) 40 | 41 | data.show() 42 | return data.collectAsList() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/spark/sparkSql/SparkListToDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.spark.sparkSql 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.SQLContext 6 | import org.apache.spark.sql.types.StructField 7 | import org.apache.spark.sql.types.StructType 8 | import org.apache.spark.sql.types.StringType 9 | import org.apache.spark.sql.Row 10 | import scala.collection.mutable.ArrayBuffer 11 | import java.util.HashMap 12 | import scala.collection.JavaConversions._ 13 | import java.util.ArrayList 14 | import java.util.Map 15 | import java.util.List 16 | import com.spark.sparkSql.CaseClassUtil._ 17 | 18 | object SparkListToDataFrame { 19 | var conf = new SparkConf() .setMaster("local").setAppName("Spark Pi") 20 | var sc = new SparkContext(conf) 21 | var sqlContext=new SQLContext(sc) 22 | System.setProperty("hadoop.home.dir", "E:\\eclipse\\hdplocal2.6.0") 23 | //第一种,使用反射 24 | def main(args: Array[String]): Unit = { 25 | //secondRDDToFrame() 26 | //show("select * from Detail") 27 | AddressRDDToFrame 28 | } 29 | def UserRDDToDataFrame(data:ArrayList[HashMap[String,String]],tableName:String){ 30 | //这是java的写法 31 | var liens=sc.parallelize(data).map(t=>User(name=t.get("name"),age=t.get("age").toInt,phone=t.get("phone"))) 32 | var userData=sqlContext.createDataFrame(liens,User.getClass) 33 | userData.registerTempTable(tableName) 34 | } 35 | def AddressRDDToFrame(){ 36 | var arraybuffer=ArrayBuffer[HashMap[String,String]]() 37 | var map=new HashMap[String,String]() 38 | map.put("name", "lmq") 39 | map.put("address", "莆田") 40 | arraybuffer+=map 41 | var 
liens=sc.parallelize(arraybuffer).map(t=>Address(name=t.get("address"),t.get("address"),phone=t.get("address"))) 42 | var addressData=sqlContext.createDataFrame(liens) 43 | addressData.registerTempTable("Address") 44 | show("select * from Address") 45 | 46 | var liens2=sc.parallelize(arraybuffer).map(t=>Address(name=t.get("name"),t.get("name"),phone=t.get("name"))) 47 | var addressData2=sqlContext.createDataFrame(liens2) 48 | addressData2.registerTempTable("Address") 49 | 50 | show("select * from Address") 51 | 52 | } 53 | //第二种指定Schema,需要这个ROW 54 | def secondRDDToFrame(){ 55 | var arraybuffer=ArrayBuffer[HashMap[String,String]]() 56 | var map=new HashMap[String,String]() 57 | map.put("name", "lmq") 58 | map.put("age", "12") 59 | map.put("phone", "10312123") 60 | arraybuffer+=map 61 | var liens=sc.parallelize(arraybuffer) 62 | .map(p=>Row(p.get("name"),p.get("phone"),p.get("age"))) 63 | var schemaString = Array("name","phone","age") 64 | var a=StructField("", StringType, true) 65 | var columns=schemaString.map(fieldName => StructField(fieldName, StringType, true)) 66 | val schema = StructType(columns) 67 | var schemaData=sqlContext.createDataFrame(liens, schema) 68 | schemaData.registerTempTable("Detail") 69 | 70 | 71 | } 72 | def show(sql:String){ 73 | sqlContext.sql(sql).show() 74 | } 75 | } 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /src/main/scala/com/spark/sparkSql/SparkSQLDemo.scala: -------------------------------------------------------------------------------- 1 | package com.spark.sparkSql 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.SQLContext 5 | import org.apache.spark.sql._ 6 | import scala._ 7 | import scala.util.parsing.json.JSON 8 | import scala.collection.mutable.ArrayBuffer 9 | import java.util.ArrayList 10 | import scala.collection.mutable.HashMap 11 | import org.apache.hadoop.hbase.client.Put 12 | object SparkSQLDemo { 13 | def main(args: Array[String]): Unit = { 14 | var conf = new SparkConf() 15 | .setMaster("local") 16 | .setAppName("Spark Pi") 17 | System.setProperty("hadoop.home.dir", "E:\\eclipse\\hdplocal2.6.0") 18 | 19 | var sc = new SparkContext(conf) 20 | var sql=new SQLContext(sc) 21 | testDataFram(sc,sql) 22 | 23 | } 24 | def testDataFram(sc:SparkContext,sql:SQLContext){ 25 | val data=sc.textFile("F:\\data\\smartadsclicklog") 26 | val fram=data.map { x => {x.split(",")}}.map { x =>Smartadsclicklog( 27 | clicktime=x(0),zzid=x(1),siteid=x(2),uid=x(3), 28 | ip=x(4),originurl=x(5),pageurl=x(6),campaign=x(7), 29 | template=x(8),pubdomain=x(9),visitor=x(10),useragent=x(11), 30 | slot=x(12),unit=x(13),creative=x(14),ext=x(15), 31 | bidid=x(16)) } 32 | println(fram.count()) 33 | fram.foreach { println } 34 | val df=sql.createDataFrame(fram) 35 | df.rdd.foreach(println) 36 | //df.registerTempTable("Smartadsclicklog") 37 | //sql.sql("select * from Smartadsclicklog").show() 38 | println(">>>>>>>>>>>>>>>>>..") 39 | } 40 | /*def transStrToPut(row:Row,cols:Array[String])={ 41 | val r=cols.zip(row.toSeq) 42 | r.map{case(colname,value)=> 43 | val put=new Put() 44 | 45 | } 46 | val put = new Put(cells(0).getBytes); 47 | put.addColumn(cells(0).getBytes, cells(0).getBytes, cells(0).getBytes) 48 | put 49 | }*/ 50 | case class Smartadsclicklog(clicktime:String,zzid:String,siteid:String,uid:String, 51 | ip:String,originurl:String,pageurl:String,campaign:String, 52 | template:String,pubdomain:String,visitor:String,useragent:String, 53 | 
slot:String,unit:String,creative:String,ext:String,bidid:String) 54 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/streaming/DataProducter.scala: -------------------------------------------------------------------------------- 1 | package com.spark.streaming 2 | 3 | object DataProducter { 4 | def main(args: Array[String]): Unit = { 5 | val conn= getConnection() 6 | var id=1; 7 | var sql="insert into test(id,name) values" 8 | while(true){ 9 | val values=(id to id+2).map{x=> 10 | "("+x+",'"+x+"')" 11 | }.mkString(",") 12 | val nsql=sql+values 13 | println(nsql) 14 | id+=3 15 | Thread.sleep(8000) 16 | val statement = conn.prepareStatement(nsql); 17 | statement.executeUpdate(); 18 | } 19 | 20 | 21 | } 22 | 23 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/streaming/DirectMysqlInputDStream.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.streaming.mysql 2 | 3 | import scala.reflect.ClassTag 4 | import org.apache.spark.streaming.StreamingContext 5 | import java.sql.ResultSet 6 | import org.apache.spark.streaming.dstream.InputDStream 7 | import java.sql.Connection 8 | import org.apache.spark.Logging 9 | import org.apache.spark.streaming.Time 10 | import org.apache.spark.streaming.scheduler.RateController 11 | class DirectMysqlInputDStream[T:ClassTag]( 12 | @transient ssc_ : StreamingContext, 13 | getConnection: () => Connection, 14 | tablename: String, 15 | idcloumn:String, 16 | fromTime: Long, 17 | sql:String, 18 | numPartitions: Int, 19 | mapRow: (ResultSet) => T) extends InputDStream[T](ssc_) with Logging { 20 | //每个分区的获取条数限制 21 | val maxRows:Long = context.sparkContext.getConf.getInt("spark.streaming.mysql.maxRetries", 1) * numPartitions * context.graph.batchDuration.milliseconds.toLong /1000 22 | var currentOffsets=fromTime 23 | val mysqlConn=getConnection() 24 | // println(ssc_.conf) 25 | override def start(): Unit = {} 26 | override def stop(): Unit = {} 27 | // limits the maximum number of messages per partition 28 | protected def clamp(currentOffsets: Long): Long = { 29 | //获取最大的id 30 | val clampSql="select max("+idcloumn+") from "+tablename+" where "+ 31 | idcloumn+" >="+currentOffsets 32 | val stmt = mysqlConn.prepareStatement(clampSql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) 33 | val rs = stmt.executeQuery() 34 | //如果没有新数据就nextIdmaxRows) maxRows+currentOffsets else nextId 37 | 38 | } 39 | 40 | override def compute(validTime: Time): Option[JdbcSparkStreamRDD[T]] = { 41 | val nextId=clamp(currentOffsets) 42 | //如果没有新数据就nextId Connection, 17 | lowerBound: Long, 18 | upperBound: Long, 19 | rowkeyName:String, 20 | sql:String, 21 | numPartitions: Int, 22 | mapRow: (ResultSet) => T = JdbcSparkStreamRDD.resultSetToObjectArray _) 23 | extends RDD[T](sc, Nil) with Logging{ 24 | //每个分区获取数据的 25 | override def getPartitions: Array[Partition] = { 26 | val length = 1 + upperBound - lowerBound 27 | (0 until numPartitions).map(i => { 28 | val start = lowerBound + ((i * length) / numPartitions).toLong 29 | val end = lowerBound + (((i + 1) * length) / numPartitions).toLong - 1 30 | 31 | new JdbcSparkStreamPartition(i, start, end) 32 | 33 | }).toArray 34 | } 35 | override def count()=getRowsNum(sql) 36 | def getRowsNum(sql:String)={ 37 | var rowsNum=0 38 | var tmpConn=getConnection() 39 | try{ 40 | if(sql.toLowerCase.indexOf("from")<0){ 41 | logError(" sql is error , There must be the from 
keyword ") 42 | }else{ 43 | val nsql="select count(1) "+sql.substring(sql.toLowerCase.indexOf("from"), sql.size) 44 | val stmt = tmpConn.prepareStatement(nsql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) 45 | 46 | val rs = stmt.executeQuery() 47 | if(rs.next()){ 48 | rowsNum=rs.getInt(1) 49 | } 50 | stmt.close() 51 | } 52 | }catch { 53 | case t: Throwable => t.printStackTrace() // TODO: handle error 54 | }finally { 55 | tmpConn.close() 56 | tmpConn=null 57 | } 58 | rowsNum 59 | } 60 | //每个分区怎么获取数据的 61 | override def compute(thePart: Partition, context: TaskContext) = { 62 | val part = thePart.asInstanceOf[JdbcSparkStreamPartition] 63 | //如果这段时间没有数据,就返回空的 64 | if(part.lower>part.upper){ 65 | Iterator.empty 66 | } 67 | else 68 | new JdbcIterator[T] { 69 | context.addTaskCompletionListener{ context => closeIfNeeded() } 70 | val conn = getConnection() 71 | var parttionSql=if(sql.toLowerCase.contains("where")) sql+" and "+rowkeyName+" >= ? AND "+rowkeyName+" <= ?" 72 | else sql+" where "+rowkeyName+" >= ? AND "+rowkeyName+" <= ?" 73 | val stmt = conn.prepareStatement(parttionSql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) 74 | if (conn.getMetaData.getURL.matches("jdbc:mysql:.*")) { 75 | stmt.setFetchSize(Integer.MIN_VALUE) 76 | logInfo("statement fetch size set to: " + stmt.getFetchSize + " to force MySQL streaming ") 77 | } 78 | stmt.setLong(1, part.lower) 79 | stmt.setLong(2, part.upper) 80 | 81 | val rs = stmt.executeQuery() 82 | override def getNext: T = { 83 | if (rs.next()) { 84 | mapRow(rs) 85 | } else { 86 | finished = true 87 | null.asInstanceOf[T] 88 | } 89 | } 90 | 91 | override def close() { 92 | try { 93 | if (null != rs) { 94 | rs.close() 95 | } 96 | } catch { 97 | case e: Exception => logWarning("Exception closing resultset", e) 98 | } 99 | try { 100 | if (null != stmt) { 101 | stmt.close() 102 | } 103 | } catch { 104 | case e: Exception => logWarning("Exception closing statement", e) 105 | } 106 | try { 107 | if (null != conn) { 108 | conn.close() 109 | } 110 | logInfo("closed connection") 111 | } catch { 112 | case e: Exception => logWarning("Exception closing connection", e) 113 | } 114 | } 115 | } 116 | } 117 | 118 | 119 | 120 | } 121 | object JdbcSparkStreamRDD{ 122 | def resultSetToObjectArray(rs: ResultSet): Array[Object] = { 123 | Array.tabulate[Object](rs.getMetaData.getColumnCount)(i => rs.getObject(i + 1)) 124 | } 125 | } 126 | abstract class JdbcIterator[U] extends Iterator[U] { 127 | private var gotNext = false 128 | private var nextValue: U = _ 129 | private var closed = false 130 | protected var finished = false 131 | protected def getNext(): U 132 | protected def close() 133 | def closeIfNeeded() { 134 | if (!closed) { 135 | closed = true 136 | close() 137 | } 138 | } 139 | override def hasNext: Boolean = { 140 | if (!finished) { 141 | if (!gotNext) { 142 | nextValue = getNext() 143 | if (finished) { 144 | closeIfNeeded() 145 | } 146 | gotNext = true 147 | } 148 | } 149 | !finished 150 | } 151 | override def next(): U = { 152 | if (!hasNext) { 153 | throw new NoSuchElementException("End of stream") 154 | } 155 | gotNext = false 156 | nextValue 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/main/scala/com/spark/streaming/MapWithStateTest.scala: -------------------------------------------------------------------------------- 1 | package com.spark.streaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import 
org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.kafka.KafkaClusterManager 8 | import org.apache.spark.HashPartitioner 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.StateSpec 11 | import org.apache.spark.streaming.State 12 | import org.apache.spark.streaming.Minutes 13 | import org.apache.spark.streaming.dstream.DStream 14 | import org.apache.spark.streaming.dstream.SocketInputDStream 15 | 16 | object MapWithStateTest { 17 | var sc: SparkContext = null 18 | var zookeeper: String = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3" 19 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 20 | def main(args: Array[String]): Unit = { 21 | init 22 | val ssc = new StreamingContext(sc, Seconds(5)) 23 | val initialRDD = ssc.sparkContext.parallelize(List(("a", 100), ("b", 10))) 24 | ssc.checkpoint("/user/linmingqiang/checkpoint") 25 | val topics = Set("test") 26 | var kafkaParams = Map[String, String]("metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092", 27 | "serializer.class" -> "kafka.serializer.StringEncoder", "zookeeper.connect" -> zookeeper) 28 | 29 | val dstream = KafkaClusterManager.createDirectStream(ssc, kafkaParams, topics).map { _._2 } 30 | 31 | val rpt1 = dstream.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_+_) 32 | 33 | rpt1.mapWithState(StateSpec.function(mappingFunc).timeout(Seconds(1))).print 34 | ssc.start() 35 | ssc.awaitTermination() 36 | 37 | } 38 | 39 | val mappingFunc = (word: String, count: Option[Int], state: State[Int]) => { 40 | val sum = count.getOrElse(0) + state.getOption.getOrElse(0) 41 | val output = (word, sum) 42 | state.update(sum) 43 | output 44 | } 45 | 46 | def init { 47 | val sparkConf = new SparkConf() 48 | .setMaster("local") 49 | .setAppName("UpdateStateByKeyTest") 50 | sc = new SparkContext(sparkConf) 51 | } 52 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/streaming/MysqlManager.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.streaming.mysql 2 | 3 | import org.apache.spark.streaming.StreamingContext 4 | import java.sql.Connection 5 | import java.sql.ResultSet 6 | import scala.reflect.ClassTag 7 | 8 | object MysqlManager { 9 | def creatMysqlInputStream[T:ClassTag]( 10 | @transient ssc_ : StreamingContext, 11 | getConnection: () => Connection, 12 | tablename: String, 13 | idcloumn:String, 14 | lowerBound: Long, 15 | sql:String, 16 | numPartitions: Int, 17 | mapRow: (ResultSet) => T)={ 18 | new DirectMysqlInputDStream(ssc_,getConnection,tablename,idcloumn,lowerBound,sql,numPartitions,mapRow) 19 | } 20 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/streaming/SpartStreamingTest.scala: -------------------------------------------------------------------------------- 1 | package com.spark.streaming 2 | import org.apache.spark.SparkContext 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.Seconds 5 | import org.apache.spark.streaming.StreamingContext 6 | import java.sql.DriverManager 7 | import java.sql.ResultSet 8 | import org.apache.spark.streaming.mysql.MysqlManager 9 | import org.apache.spark.rdd.JdbcRDD 10 | import org.apache.spark.sql.SQLContext 11 | import org.apache.spark.streaming.kafka.KafkaClusterManager 12 | import org.apache.spark.streaming.Time 13 | 
import org.apache.spark.rdd.RDD 14 | 15 | object SpartStreamingTest { 16 | var sc: SparkContext = null 17 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 18 | import org.apache.log4j.{Level,Logger} 19 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 20 | val zookeeper="solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3" 21 | def init() { 22 | val sparkConf = new SparkConf() 23 | .setMaster("local[2]") 24 | .setAppName("Test") 25 | .set("spark.streaming.mysql.maxRetries", "1") 26 | sc = new SparkContext(sparkConf) 27 | 28 | 29 | } 30 | /** 31 | * 应用场景,(推荐是用时间戳 做 数据 分隔点 的字段) 32 | * 数据是按时间顺序写入的,所以必须要有个时间的字段,而且这个时间的字段必须是 时间戳,每条数据的时间戳都要不一样 33 | * 如果数据是同一时间进去的,那必须要把每条数据的时间戳都一直往下加一操作,如:149000000 那同一时间的数据必须是基于这个往下加然后再入mysql 34 | * 要不然那你就必须是要有一个id(必须也是Long类型,这其实跟自带的那个JdbcRDD差不多)字段,且必须也是一直往下加的,不能往回走, 35 | * 36 | * 37 | */ 38 | def main(args: Array[String]): Unit = { 39 | localSparkStream 40 | 41 | 42 | } 43 | def localSparkStream(){ 44 | init() 45 | 46 | val ssc = new StreamingContext(sc, Seconds(2)) 47 | var topics = Set("mobileadsdeliverylog","smartadsdeliverylog","smartadsclicklog", "mobileadsclicklog", "sitevisitlog") 48 | var kafkaParams = Map[String, String]("metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092", 49 | "serializer.class" -> "kafka.serializer.StringEncoder", "group.id" -> "test", "zookeeper.connect" -> zookeeper) 50 | val dstream= KafkaClusterManager.createDirectStream(ssc, kafkaParams, topics) 51 | dstream.foreachRDD(rdd=> 52 | 53 | println(rdd.partitions.size) 54 | 55 | 56 | ) 57 | ssc.start() 58 | ssc.awaitTermination() 59 | } 60 | def mySparkInputstream{ 61 | init 62 | //查询条件必须是两边都是等号的 ID >= ? AND ID <= ? ,不然会丢数据 63 | var sql="SELECT id,name FROM test" 64 | val tablename="test" 65 | val timeClounm="id"//主键是什么。流式的话,按理应该是时间 66 | val fromTime=1//从某个时间点开始 67 | val partitionNum=2//分区数 68 | val ssc = new StreamingContext(sc, Seconds(2)) 69 | 70 | var count=0 71 | var r=ssc.createDirectMysqlDStream(getConnection, tablename, timeClounm, 72 | fromTime,sql, partitionNum, sscextractValues) 73 | 74 | r.foreachRDD{x=>println("sssssss");Thread.sleep(2000);println("kkkkkkk");} 75 | r.foreachRDD{rdd=> 76 | count+=1 77 | println(count) 78 | rdd.foreach(println) 79 | 80 | 81 | if(count<2){ 82 | Thread.sleep(8000) 83 | } 84 | } 85 | 86 | //r.printlnDStream("") 87 | //两个流式一起获取数据 88 | /*sql="SELECT id,name FROM test where id>10" 89 | var r2=ssc.createDirectMysqlDStream(getConnection, tablename, rowkeyName, 90 | fromId,sql, partitionNum, sscextractValues)*/ 91 | /*r2.printlnDStream("r2 :")*/ 92 | ssc.start() 93 | ssc.awaitTermination() 94 | } 95 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/streaming/UpdateStateByKeyTest.scala: -------------------------------------------------------------------------------- 1 | package com.spark.streaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.streaming.kafka.KafkaClusterManager 8 | import org.apache.spark.HashPartitioner 9 | import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions 10 | import org.apache.spark.streaming.StateSpec 11 | import org.apache.spark.streaming.State 12 | 13 | object UpdateStateByKeyTest { 14 | var sc: SparkContext = null 15 | var zookeeper: String = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3" 16 | System.setProperty("hadoop.home.dir", 
"F:\\eclipse\\hdplocal2.6.0") 17 | def main(args: Array[String]): Unit = { 18 | init 19 | val ssc=new StreamingContext(sc,Seconds(5)) 20 | val initialRDD = ssc.sparkContext.parallelize(List(("a", 100), ("b", 10))) 21 | ssc.checkpoint("/user/linmingqiang/checkpoint") 22 | 23 | 24 | val topics = Set("test") 25 | var kafkaParams = Map[String, String]("metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092", 26 | "serializer.class" -> "kafka.serializer.StringEncoder","zookeeper.connect" -> zookeeper) 27 | 28 | val dstream= KafkaClusterManager.createDirectStream(ssc, kafkaParams, topics).map{_._2} 29 | println(">>>>>>>>>>>>> start "+dstream.count) 30 | val rpt1=dstream.flatMap(_.split(" ")).map(x => (x, 1)) 31 | //val rpt2=dstream.flatMap(_.split(" ")).map(x => (x+","+x, 1)) 32 | 33 | val rpt1_dst = rpt1.updateStateByKey[Int](updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), initialRDD) 34 | rpt1_dst.print() 35 | /*val rpt2_dst = rpt2.updateStateByKey[Int](updateFunc) 36 | rpt1_dst.foreachRDD{rdd=> 37 | rdd.collect().foreach(println) 38 | } 39 | rpt2_dst.foreachRDD{rdd=> 40 | rdd.collect().foreach(println) 41 | }*/ 42 | 43 | ssc.start() 44 | ssc.awaitTermination() 45 | 46 | } 47 | 48 | val updateFunc = (values: Seq[Int], state: Option[Int]) => { 49 | val currentCount = values.sum 50 | val previousCount = state.getOrElse(0) 51 | Some(currentCount + previousCount) 52 | } 53 | 54 | def init { 55 | val sparkConf = new SparkConf() 56 | .setMaster("local") 57 | .setAppName("UpdateStateByKeyTest") 58 | sc = new SparkContext(sparkConf) 59 | } 60 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/streaming/package.scala: -------------------------------------------------------------------------------- 1 | package com.spark 2 | 3 | import com.fun.util._ 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | package object streaming extends RDDOperateFunction 7 | with SparkContextOperateFunction 8 | with ZzyLmqDataOperateUtil{ 9 | 10 | /* //隐式参数的使用 11 | implicit class RDDNewFunction[T](rdd: RDD[T]) { 12 | def lmq3(str: String)(implicit impl:Array[T])=rdd.map { x => x + " : "+impl(0) } 13 | def lmq4[A](str: String)(implicit impl:Int)=rdd.map { x => x + " : "} 14 | }*/ 15 | 16 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/util/KafkaClusterManager.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.streaming.kafka 2 | 3 | import java.io.Serializable 4 | import scala.reflect.ClassTag 5 | import kafka.serializer.Decoder 6 | import org.apache.spark.streaming.StreamingContext 7 | import org.apache.spark.streaming.dstream.InputDStream 8 | import org.apache.spark.SparkException 9 | import kafka.message.MessageAndMetadata 10 | import kafka.common.TopicAndPartition 11 | import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset 12 | import org.apache.spark.rdd.RDD 13 | import kafka.serializer.StringDecoder 14 | import kafka.common.TopicAndPartition 15 | import org.apache.commons.logging.LogFactory 16 | import org.slf4j.LoggerFactory 17 | import org.apache.hadoop.conf.Configuration 18 | import scala.collection.mutable.HashMap 19 | import org.apache.spark.SparkContext 20 | 21 | object KafkaClusterManager { 22 | var topics: Set[String] = null 23 | var kafkaParams: Map[String, String] = null 24 | var kc: KafkaCluster = null 25 | var groupId: String = "Test" 26 | def 
getKafkafromOffsets(topics: Set[String], kafkaParams: Map[String, String]) = { 27 | val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase) 28 | //有两个参数"largest"/"smallest",一个是从最新,一个是从最头开始读数据 29 | var fromOffsets = (for { 30 | topicPartitions <- kc.getPartitions(topics).right 31 | leaderOffsets <- (if (reset == Some("smallest")) { 32 | kc.getEarliestLeaderOffsets(topicPartitions) 33 | } else { 34 | kc.getLatestLeaderOffsets(topicPartitions) 35 | }).right 36 | } yield { 37 | val fromOffsets = leaderOffsets.map { 38 | case (tp, lo) => 39 | (tp, lo.offset) 40 | } 41 | fromOffsets 42 | }).fold( 43 | errs => throw new SparkException(errs.mkString("\n")), 44 | ok => ok) 45 | fromOffsets 46 | } 47 | def getConsumerOffsetsByToday(conf: Configuration) = { 48 | var consumerOffsets = new HashMap[TopicAndPartition, Long]() 49 | var todayOffsets = conf.get("zzy.kafka.todayoffset").split('|') 50 | for (offset <- todayOffsets) { 51 | val offsets = offset.split(",") 52 | consumerOffsets.put(new TopicAndPartition(offsets(0), offsets(1).toInt), offsets(2).toLong) 53 | } 54 | consumerOffsets.toMap 55 | } 56 | def createDirectStream(ssc: StreamingContext, 57 | kafkaParams: Map[String, String], 58 | topics: Set[String]) = { //先获取这个groupid所消费的offset 59 | this.kafkaParams = kafkaParams 60 | this.topics = topics 61 | kc = new KafkaCluster(kafkaParams) 62 | var consumerOffsets: Map[TopicAndPartition, Long] = getKafkafromOffsets(topics, kafkaParams) 63 | KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)]( 64 | ssc, 65 | kafkaParams, 66 | consumerOffsets, 67 | (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)) 68 | 69 | } 70 | /** 71 | * 用于sc创建kafkaRDD 72 | */ 73 | def createKafkaRDD( 74 | sc: SparkContext, 75 | kafkaParams: Map[String, String], 76 | topics: Set[String]) = { 77 | this.kafkaParams = kafkaParams 78 | this.topics = topics 79 | kc = new KafkaCluster(kafkaParams) 80 | var fromOffsets: Map[TopicAndPartition, Long] = getConsumerOffsets(topics, kafkaParams.get("group.id").getOrElse("realtimereport")) 81 | println(">>>>>>>>>>>>>>>from ") 82 | fromOffsets.foreach(println) 83 | 84 | val maxMessagesPerPartition = sc.getConf.getInt("spark.streaming.kafka.maxRatePerPartition", 0) //0表示没限制 85 | val lastestOffsets = latestLeaderOffsets(fromOffsets) 86 | val untilOffsets = if (maxMessagesPerPartition > 0) { 87 | latestLeaderOffsets(fromOffsets).map { 88 | case (tp, lo) => 89 | tp -> lo.copy(offset = Math.min(fromOffsets(tp) + maxMessagesPerPartition, lo.offset)) 90 | } 91 | } else lastestOffsets 92 | val leaders = untilOffsets.map { case (tp, lo) => tp -> Broker(lo.host, lo.port) }.toMap 93 | val offsetRanges = fromOffsets.map { 94 | case (tp, fo) => 95 | val uo = untilOffsets(tp) 96 | OffsetRange(tp.topic, tp.partition, fo, uo.offset) 97 | }.toArray 98 | println(">>>>>>>>>>>>>>>offsetRanges ") 99 | offsetRanges.foreach(println) 100 | 101 | KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder, (String, String)]( 102 | sc, 103 | kafkaParams, 104 | offsetRanges, 105 | leaders, 106 | (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)) 107 | } 108 | protected final def latestLeaderOffsets(consumerOffsets: Map[TopicAndPartition, Long]): Map[TopicAndPartition, LeaderOffset] = { 109 | val o = kc.getLatestLeaderOffsets(consumerOffsets.keySet) 110 | if (o.isLeft) { 111 | throw new SparkException(o.left.toString) 112 | } else { 113 | o.right.get 114 | } 115 | } 116 | 117 | /** 118 | * 创建数据流前,根据实际消费情况更新消费offsets 119 | * 
@param topics 120 | * @param groupId 121 | */ 122 | private def getConsumerOffsets(topics: Set[String], groupId: String) = { 123 | var offsets: Map[TopicAndPartition, Long] = Map() 124 | topics.foreach(topic => { 125 | var hasConsumed = true //是否消费过 ,true为消费过 126 | val partitionsE = kc.getPartitions(Set(topic)) //获取patition信息 127 | if (partitionsE.isLeft) throw new SparkException("get kafka partition failed:") 128 | val partitions = partitionsE.right.get 129 | val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions) //获取这个topic的每个patition的消费信息 130 | if (consumerOffsetsE.isLeft) hasConsumed = false 131 | if (hasConsumed) { 132 | val earliestLeaderOffsets = kc.getEarliestLeaderOffsets(partitions).right.get 133 | val consumerOffsets = consumerOffsetsE.right.get 134 | // 可能只是存在部分分区consumerOffsets过时,所以只更新过时分区的consumerOffsets为latestLeaderOffsets 135 | consumerOffsets.foreach({ 136 | case (tp, n) => 137 | //现在数据在什么offset上 138 | val earliestLeaderOffset = earliestLeaderOffsets(tp).offset 139 | if (n < earliestLeaderOffset) { 140 | //消费过,但是过时了,就从头消费(或者从最新开始消费) 141 | val latestLeaderOffsets = kc.getLatestLeaderOffsets(partitions).right.get(tp).offset 142 | offsets += (tp -> latestLeaderOffsets) 143 | } else offsets += (tp -> n) //消费者的offsets正常 144 | }) 145 | } else { // 没有消费过 ,这是一个新的消费group id 146 | val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase) 147 | var leaderOffsets: Map[TopicAndPartition, LeaderOffset] = null 148 | if (reset == Some("smallest")) { 149 | leaderOffsets = kc.getEarliestLeaderOffsets(partitions).right.get 150 | } else { 151 | leaderOffsets = kc.getLatestLeaderOffsets(partitions).right.get 152 | } 153 | leaderOffsets.foreach { case (tp, offset) => offsets += (tp -> offset.offset) } 154 | } 155 | }) 156 | offsets 157 | 158 | } 159 | def getRDDConsumerOffsets(data: RDD[(String, String)]) = { 160 | var consumoffsets = Map[TopicAndPartition, Long]() 161 | val offsetsList = data.asInstanceOf[HasOffsetRanges].offsetRanges 162 | for (offsets <- offsetsList) { 163 | val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition) 164 | consumoffsets += ((topicAndPartition, offsets.untilOffset)) 165 | } 166 | consumoffsets 167 | } 168 | /** 169 | * 更新zookeeper上的消费offsets 170 | * @param rdd 171 | */ 172 | def updateConsumerOffsets(topicAndPartition: Map[TopicAndPartition, Long]): Unit = { 173 | val o = kc.setConsumerOffsets(groupId, topicAndPartition) 174 | } 175 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/util/SparkKryoRegistrators.scala: -------------------------------------------------------------------------------- 1 | package com.spark.util 2 | 3 | import org.apache.spark.serializer.KryoRegistrator 4 | import com.esotericsoftware.kryo.Kryo 5 | import com.spark.util.SparkKryoSerializerTest.MygisterKryoClass 6 | 7 | class SparkKryoRegistrators extends KryoRegistrator{ 8 | @Override 9 | def registerClasses(kryo:Kryo) { 10 | kryo.register(classOf[String]) 11 | kryo.register(classOf[MygisterKryoClass]) 12 | 13 | } 14 | } -------------------------------------------------------------------------------- /src/main/scala/com/spark/util/SparkKryoSerializerTest.scala: -------------------------------------------------------------------------------- 1 | package com.spark.util 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | 6 | object SparkKryoSerializerTest { 7 | var sparkconf:SparkConf=null 8 | var sc:SparkContext=null 9 | def main(args: Array[String]): 
Unit = { 10 | sparkInit 11 | testKryoSerializer 12 | } 13 | def testKryoSerializer{ 14 | var personList = 1 to 10 map (value => new MygisterKryoClass(value + "")) 15 | var myrdd= sc.parallelize(personList) 16 | myrdd.foreach { x=>println(x.getName) } 17 | } 18 | 19 | def sparkInit(){ 20 | sparkconf = new SparkConf() 21 | .setMaster("local") 22 | .setAppName("Spark Pi") 23 | sparkconf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 24 | 25 | sparkconf.set("spark.kryo.registrator", "com.spark.util.SparkKryoRegistrators") 26 | //sparkconf.registerKryoClasses(Array(classOf[MygisterKryoClass],classOf[String])) 27 | sc = new SparkContext(sparkconf) 28 | } 29 | class MygisterKryoClass(var name:String){ 30 | //private var name:String=null 31 | def getName={ 32 | name 33 | } 34 | def setName(name:String)={ 35 | this.name=name 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /src/main/scala/com/test/CheckHbaseDataWithMysql.scala: -------------------------------------------------------------------------------- 1 | package com.test 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.log4j.Logger 5 | import org.apache.log4j.Level 6 | import java.util.HashMap 7 | import org.apache.spark.rdd.RDD 8 | import com.sun.org.apache.commons.logging.LogFactory 9 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 10 | import org.apache.hadoop.conf.Configuration 11 | import org.apache.hadoop.fs.FileSystem 12 | import org.apache.hadoop.fs.Path 13 | import scala.collection.JavaConversions._ 14 | import org.apache.hadoop.hbase.client.Connection 15 | import org.apache.hadoop.hbase.HBaseConfiguration 16 | import org.apache.hadoop.hbase.client.ConnectionFactory 17 | import org.apache.hadoop.hbase.TableName 18 | import org.apache.hadoop.hbase.client.Scan 19 | import java.util.ArrayList 20 | import java.io.File 21 | import java.io.BufferedWriter 22 | import java.io.OutputStreamWriter 23 | import java.io.FileOutputStream 24 | import org.apache.hadoop.hbase.util.Bytes 25 | import java.util.ArrayList 26 | import java.util.Date 27 | import java.sql.DriverManager 28 | import org.apache.spark.HashPartitioner 29 | import org.apache.spark.serializer.KryoRegistrator 30 | import org.apache.spark.streaming.StreamingContext 31 | import org.apache.spark.streaming.Milliseconds 32 | import org.apache.hadoop.hbase.client.HConnectionManager 33 | import scala.collection.mutable.ArrayBuffer 34 | import java.text.SimpleDateFormat 35 | import org.apache.hadoop.hbase.filter.RowFilter 36 | import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp 37 | import org.apache.hadoop.hbase.filter.RegexStringComparator 38 | import java.util.Calendar 39 | import java.text.DateFormat 40 | import java.util.Properties 41 | import java.io.FileInputStream 42 | import org.apache.hadoop.mapred.TextInputFormat 43 | import org.apache.hadoop.io.LongWritable 44 | import org.apache.hadoop.io.Text 45 | import java.util.Date 46 | import java.sql.Timestamp 47 | import java.util.ArrayList 48 | import org.apache.hadoop.hbase.client.Get 49 | import scala.reflect.ClassTag 50 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil 51 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 52 | import org.apache.hadoop.hbase.util.Base64 53 | import org.apache.hadoop.mapreduce.Job 54 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 55 | import org.apache.hadoop.hbase.client.Result 56 | import 
org.apache.hadoop.hbase.mapreduce.TableOutputFormat 57 | import org.apache.hadoop.hbase.client.Put 58 | object CheckHbaseDataWithMysql { 59 | var sparkconf: SparkConf = null 60 | var sc: SparkContext = null 61 | var conf: Configuration = null 62 | var connection: Connection = null 63 | import java.sql.Connection 64 | var mysqlconn: Connection = null 65 | var zookeeper = "solr2.zhiziyun.com,solr1.zhiziyun.com,mongodb3" 66 | def main(args: Array[String]): Unit = { 67 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 68 | val time="2016-11-09" 69 | initMysqlConn2 70 | initHbaseConn 71 | val map=getMysqlData(time) 72 | map.foreach(println) 73 | 74 | } 75 | 76 | 77 | def getMysqlData(time:String)={ 78 | val map=new HashMap[String,HashMap[String,Int]] 79 | var stam = mysqlconn.createStatement() 80 | val sql="select plan,sum(Delivery),sum(clicks),sum(reach) "+ 81 | "from Sample_Queue s"+ 82 | " where s.sampUpt>'"+time+"' group by plan" 83 | var result = stam.executeQuery(sql) 84 | while(result.next){ 85 | val d=new HashMap[String,Int] 86 | val plan=result.getString(1) 87 | val delivery=result.getInt(2) 88 | val clicks=result.getInt(3) 89 | val reach=result.getInt(4) 90 | d.put("delivery", delivery) 91 | d.put("clicks", clicks) 92 | d.put("reach", reach) 93 | map.put(plan,d) 94 | } 95 | map 96 | } 97 | def initMysqlConn2() { 98 | var user="developer" 99 | var pass="dev@zhiziyun^)0628" 100 | var mysqlurl = "jdbc:mysql://192.168.10.66/zz_bidoptimize" 101 | Class.forName("com.mysql.jdbc.Driver") 102 | mysqlconn = DriverManager.getConnection(mysqlurl, user, pass) 103 | } 104 | def initHbaseConn { 105 | var hconf = HBaseConfiguration.create() 106 | hconf.set("hbase.zookeeper.quorum", zookeeper) 107 | hconf.set("hbase.zookeeper.property.clientPort", "2181") 108 | connection = ConnectionFactory.createConnection(hconf) 109 | } 110 | } -------------------------------------------------------------------------------- /src/main/scala/com/test/HbaseUtil.scala: -------------------------------------------------------------------------------- 1 | package com.test 2 | 3 | import org.apache.hadoop.hbase.client.Connection 4 | import org.apache.hadoop.hbase.HBaseConfiguration 5 | import org.apache.hadoop.hbase.client.ConnectionFactory 6 | import org.apache.hadoop.hbase.TableName 7 | import org.apache.hadoop.hbase.client.Scan 8 | import scala.collection.JavaConversions._ 9 | import org.apache.hadoop.hbase.util.Bytes 10 | import org.apache.hadoop.hbase.util.Base64 11 | import com.fasterxml.jackson.core.JsonFactory 12 | object HbaseUtil { 13 | var hbaseConn: Connection = null 14 | var zookeeper:String = "cdh-master,node1,node2" 15 | def main(args: Array[String]): Unit = { 16 | initHbaseConn 17 | getKyLinHbaseData 18 | 19 | 20 | } 21 | def getKyLinHbaseData(){ 22 | val table = hbaseConn.getTable(TableName.valueOf("KYLIN_HT7HUTOKSO")) 23 | val scan = new Scan() 24 | scan.setMaxResultSize(10L) 25 | scan.setMaxResultsPerColumnFamily(1) 26 | val resultScanner = table.getScanner(scan); 27 | for (result <- resultScanner) { 28 | var listCells = result.listCells() 29 | 30 | /*for (cell <- listCells) { 31 | var column = new String(cell.getValueArray, cell.getValueOffset, cell.getValueLength) 32 | println(column) 33 | 34 | //rowMap.put(column, new String(cell.getValueArray, cell.getValueOffset, cell.getValueLength)) 35 | }*/ 36 | } 37 | table.close() 38 | } 39 | def initHbaseConn { 40 | if(hbaseConn!=null) hbaseConn.close() 41 | hbaseConn=null 42 | var hconf = HBaseConfiguration.create() 43 | 
hconf.set("hbase.zookeeper.quorum", zookeeper) 44 | hconf.set("hbase.zookeeper.property.clientPort", "2181") 45 | hbaseConn = ConnectionFactory.createConnection(hconf) 46 | } 47 | } -------------------------------------------------------------------------------- /src/main/scala/com/test/HttpAsyncClientsTest.scala: -------------------------------------------------------------------------------- 1 | package com.test 2 | 3 | import org.json.JSONObject 4 | import org.apache.http.impl.client.DefaultHttpClient 5 | import org.apache.http.client.methods.HttpGet 6 | import org.apache.http.util.EntityUtils 7 | import org.apache.http.client.methods.HttpPost 8 | import java.net.URI 9 | import java.net.URL 10 | import org.apache.http.concurrent.FutureCallback 11 | import org.apache.http.HttpResponse 12 | import java.util.concurrent.CountDownLatch 13 | import org.apache.http.client.config.RequestConfig 14 | import org.apache.http.impl.nio.client.HttpAsyncClients 15 | import org.apache.http.nio.client.methods.AsyncCharConsumer 16 | import java.nio.CharBuffer 17 | import org.apache.http.nio.IOControl 18 | import org.apache.http.protocol.HttpContext 19 | import org.apache.http.nio.client.methods.HttpAsyncMethods 20 | import java.util.ArrayList 21 | import org.apache.http.impl.nio.conn.ManagedNHttpClientConnectionFactory 22 | import org.apache.http.params.HttpParams 23 | import org.apache.http.params.BasicHttpParams 24 | 25 | /** 26 | * http异步发消息 27 | */ 28 | object HttpAsyncClientsTest { 29 | def main(args: Array[String]): Unit = { 30 | //testHttpClient 31 | //HttpAsyncClients 32 | //testURLConnect 33 | //var sd:ManagedNHttpClientConnectionFactory=new ManagedNHttpClientConnectionFactory 34 | var a=testURLConnect 35 | println(a) 36 | // testHttpClient 37 | } 38 | def testHttpClient(){ 39 | var get = new HttpGet() 40 | val ps=new BasicHttpParams() 41 | var httpClient = new DefaultHttpClient(); 42 | var id=0 43 | var latch = new CountDownLatch(10); 44 | for(i<- 1 to 10){ 45 | id=i 46 | get.setURI(URI.create(s"https://www.baidum/s?wd=${id}")) 47 | println(i+":"+get.getURI) 48 | val rp= httpClient.execute(get); 49 | println((rp.getStatusLine)) 50 | get.reset()//必须加这个,否则会报错 51 | 52 | } 53 | 54 | 55 | 56 | } 57 | def testYiBUHttp(){ 58 | val requestConfig = RequestConfig.custom() 59 | .setSocketTimeout(1).setConnectTimeout(1).build(); 60 | var httpclient = HttpAsyncClients.custom() 61 | .setDefaultRequestConfig(requestConfig).build(); 62 | httpclient.start(); 63 | try { 64 | val future = httpclient.execute( 65 | HttpAsyncMethods.createGet("https://www.verisign.com/"), 66 | new MyResponseConsumer(), null); 67 | if(future!=null){ 68 | future.get 69 | } 70 | val result=true 71 | if (result != null && result.booleanValue()) { 72 | System.out.println("Request successfully executed"); 73 | } else { 74 | System.out.println("Request failed"); 75 | } 76 | System.out.println("Shutting down"); 77 | } finally { 78 | httpclient.close(); 79 | } 80 | System.out.println("Done"); 81 | } 82 | def testURLConnect()={ 83 | val requestConfig = RequestConfig.custom() 84 | .setSocketTimeout(10)//连上之后,持续的时间,用于控制返回 85 | .setConnectTimeout(10)//连上的时候,用于控制ping 86 | .build(); 87 | var httpclient = HttpAsyncClients.custom() 88 | .setDefaultRequestConfig(requestConfig) 89 | //.setMaxConnTotal(10000) 90 | //.setMaxConnPerRoute(1000) 91 | .build(); 92 | var erorURL=new ArrayList[String] 93 | try { 94 | httpclient.start(); 95 | var requests = Array[HttpGet](new HttpGet("https://www.google.com.hk"), 96 | new HttpGet("https://www.verisign.com/"), 
97 | new HttpGet("http://carat.clientsolutions.cn"), 98 | new HttpGet("http://www.baidu.com/")); 99 | val latch = new CountDownLatch(requests.length); 100 | for (request<-requests) { 101 | httpclient.execute(request, new FutureCallback[HttpResponse]() { 102 | def completed(response:HttpResponse ) { 103 | try { 104 | println("success:"+request.getURI) 105 | latch.countDown(); 106 | } 107 | catch {case t: Throwable => erorURL.add(request.getURI.toString())} 108 | } 109 | def failed(ex: Exception ) { 110 | try { 111 | println("error:"+request.getURI) 112 | latch.countDown(); 113 | erorURL.add(request.getURI.toString()) 114 | } 115 | catch {case t: Throwable => erorURL.add(request.getURI.toString())} 116 | } 117 | def cancelled() { 118 | try {latch.countDown();} 119 | catch {case t: Throwable => erorURL.add(request.getURI.toString())} 120 | } 121 | }); 122 | 123 | } 124 | 125 | latch.await(); 126 | //System.out.println("Shutting down"); 127 | } finally { 128 | httpclient.close(); 129 | } 130 | System.out.println("Done"); 131 | erorURL 132 | } 133 | def testSFun(){ 134 | var carat_bidid="a" 135 | var carat_price="b" 136 | var str=s"http://carat.clientsolutions.cn/c=1,1,2&bidid=${carat_bidid}&ep=${carat_price}" 137 | println(str) 138 | 139 | } 140 | def testPinjie(){ 141 | 142 | var s=new JSONObject 143 | s.put("key", "Hello Json") 144 | s.put("key2", Array(1,2,3)) 145 | println(s) 146 | } 147 | 148 | } 149 | class MyFutureCallback(latch:CountDownLatch,request: HttpGet) extends FutureCallback[HttpResponse]{ 150 | //无论完成还是失败都调用countDown() 151 | @Override 152 | def completed(response:HttpResponse) { 153 | latch.countDown(); 154 | System.out.println(response.getStatusLine()); 155 | } 156 | @Override 157 | def failed( ex: Exception) { 158 | latch.countDown(); 159 | System.out.println(request.getRequestLine() + "->" + ex); 160 | } 161 | @Override 162 | def cancelled() { 163 | latch.countDown(); 164 | } 165 | } 166 | class MyResponseConsumer extends AsyncCharConsumer[Boolean] { 167 | 168 | val times = 0; 169 | 170 | def getTimes()= { 171 | "\n\n### 第" + times + "步\n###" 172 | } 173 | 174 | @Override 175 | def onCharReceived(buf: CharBuffer , ioctrl: IOControl ){ 176 | /* System.out.println(getTimes() + "onCharReceived"); 177 | while (buf.hasRemaining()) { 178 | System.out.print(buf.get()); 179 | } */ 180 | } 181 | 182 | @Override 183 | def onResponseReceived(response: HttpResponse){ 184 | //println(getTimes() + "onResponseReceived"); 185 | } 186 | @Override 187 | def buildResult(context: HttpContext ) ={ 188 | System.out.println(getTimes() + "buildResult"); 189 | true 190 | } 191 | 192 | } 193 | /*def doAsyncGet(String url) throws IOException{ 194 | RequestConfig defaultRequestConfig = RequestConfig.custom() 195 | .setSocketTimeout(5000) 196 | .setConnectTimeout(5000) 197 | .setConnectionRequestTimeout(5000) 198 | .setStaleConnectionCheckEnabled(true) 199 | .build(); 200 | final CloseableHttpAsyncClient httpclient = HttpAsyncClients.custom() 201 | .setDefaultRequestConfig(defaultRequestConfig) 202 | .setMaxConnTotal(10000) 203 | .setMaxConnPerRoute(1000).build(); 204 | try { 205 | final HttpGet httpget = new HttpGet(url); 206 | RequestConfig requestConfig = RequestConfig.copy(defaultRequestConfig) 207 | .build(); 208 | httpget.setConfig(requestConfig); 209 | httpclient.execute(httpget, new FutureCallback() { 210 | 211 | public void completed(final HttpResponse response) { 212 | try { 213 | httpget.releaseConnection(); 214 | } catch (Exception e) { 215 | log.error("close asyncResponse error:",e); 216 | 
} 217 | } 218 | 219 | public void failed(final Exception ex) { 220 | try { 221 | httpget.releaseConnection(); 222 | log.error("this connection failed!",ex); 223 | } catch (Exception e) { 224 | log.error("close asyncResponse error:",e); 225 | } 226 | } 227 | 228 | public void cancelled() { 229 | try { 230 | httpget.releaseConnection(); 231 | log.error("this connection has been cancelled!"); 232 | } catch (Exception e) { 233 | log.error("close asyncResponse error:",e); 234 | } 235 | }}); 236 | }catch(Exception e){ 237 | log.error("http async error:"+url,e); 238 | } 239 | } 240 | */ 241 | /*{ 242 | //无论完成还是失败都调用countDown() 243 | @Override 244 | def completed(response:HttpResponse) { 245 | latch.countDown(); 246 | System.out.println(request.getRequestLine() + "->" 247 | + response.getStatusLine()); 248 | } 249 | @Override 250 | def failed( ex: Exception) { 251 | latch.countDown(); 252 | System.out.println(request.getRequestLine() + "->" + ex); 253 | } 254 | @Override 255 | def cancelled() { 256 | latch.countDown(); 257 | } 258 | }*/ -------------------------------------------------------------------------------- /src/main/scala/com/test/JsonTest.scala: -------------------------------------------------------------------------------- 1 | package com.zhiziyun.bot.service.url.test 2 | 3 | import org.json.JSONObject 4 | import scala.collection.mutable.ArrayBuffer 5 | import scala.collection.mutable.HashMap 6 | import scala.collection.JavaConversions._ 7 | import org.json.JSONArray 8 | object JsonTest { 9 | def main(args: Array[String]): Unit = { 10 | //test 11 | val b= """{"DATA":{"MOD_MOB_DDQ_BASIC":[{"AGENT":"wechat","ZODIAC":"兔","STAR":"处女座","GENDER":"FEMALE","EDUCATION_DEGREE":"zkjyx","IS_LOCAL":"bendiji"},{"AGENT":"APP","ZODIAC":"猪","STAR":"双鱼座","GENDER":"MALE","EDUCATION_DEGREE":"bk","IS_LOCAL":"feibendiji"},{"AGENT":"wechat","ZODIAC":"马","STAR":"天秤座","GENDER":"MALE","EDUCATION_DEGREE":"zkjyx","IS_LOCAL":"bendiji"},{"AGENT":"APP","ZODIAC":"鼠","STAR":"摩羯座","GENDER":"MALE","EDUCATION_DEGREE":"bk","IS_LOCAL":"bendiji"}]},"TOPIC":"mod_mob_ddq_basic"} 12 | 13 | """ 14 | val a="""{"qmart":"TEST","ntnum":"50","ecrule1":"测试中1","ecrule2":"","ecrule3":"",}""" 15 | val obj=new JSONObject(b) 16 | println(transObject(transObject(obj))) 17 | } 18 | def transObject(o1:JSONObject):JSONObject={ 19 | val o2=new JSONObject(); 20 | val it = o1.keys(); 21 | while (it.hasNext()) { 22 | val key = it.next().asInstanceOf[String]; 23 | val obj = o1.get(key); 24 | if(obj.getClass().toString().endsWith("String")){ 25 | o2.accumulate(key.toUpperCase(), obj); 26 | }else if(obj.getClass().toString().endsWith("JSONObject")){ 27 | o2.accumulate(key.toUpperCase(), transObject(obj.asInstanceOf[JSONObject])); 28 | }else if(obj.getClass().toString().endsWith("JSONArray")){ 29 | o2.put(key.toUpperCase(), transArray(o1.getJSONArray(key))); 30 | } 31 | } 32 | o2 33 | } 34 | def transArray( o1:JSONArray):JSONArray={ 35 | val o2 = new JSONArray(); 36 | for (i <- 0 to o1.length-1) { 37 | val jArray=o1.getJSONObject(i); 38 | if(jArray.getClass().toString().endsWith("JSONObject")){ 39 | o2.put(transObject(jArray.asInstanceOf[JSONObject])); 40 | }else if(jArray.getClass().toString().endsWith("JSONArray")){ 41 | o2.put(transArray(jArray.asInstanceOf[JSONArray])); 42 | } 43 | } 44 | o2; 45 | } 46 | } -------------------------------------------------------------------------------- /src/main/scala/com/test/KafkaLogTest.scala: -------------------------------------------------------------------------------- 1 | package com.test 2 | 3 | import 
org.slf4j.Logger 4 | import org.slf4j.LoggerFactory 5 | 6 | 7 | object KafkaLogTest { 8 | def main(args: Array[String]): Unit = { 9 | var LOGGER: Logger = LoggerFactory.getLogger("KAFKA")//logging 10 | for (i <- 1 to 1000) { 11 | LOGGER.info("Info [" + i + "]"); 12 | println("Info [" + i + "]") 13 | Thread.sleep(1000); 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /src/main/scala/com/test/ReflectScala.scala: -------------------------------------------------------------------------------- 1 | package com.test 2 | 3 | object ReflectScala { 4 | def main(args: Array[String]): Unit = { 5 | var a=Class.forName("com.test.ReflectScala") 6 | //Because the target method is static, invoke can simply be passed null here; a non-static method would need an instance instead 7 | //a.getMethod("test").invoke(a.newInstance()) 8 | a.getMethod("test",classOf[String]).invoke(null,"hello world") 9 | 10 | } 11 | def test(s:String){ 12 | println(s) 13 | } 14 | } -------------------------------------------------------------------------------- /src/main/scala/com/test/SparkWithLocalTest.scala: -------------------------------------------------------------------------------- 1 | package com.test 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.SparkContext._ 6 | import scala.collection.JavaConversions._ 7 | import scala.collection.mutable.ArrayBuffer 8 | import java.util.Calendar 9 | import java.util.ArrayList 10 | import org.apache.hadoop.fs.Path 11 | import org.apache.spark.streaming.kafka.KafkaUtils 12 | import kafka.message.MessageAndMetadata 13 | import kafka.serializer.StringDecoder 14 | import kafka.serializer.StringDecoder 15 | import org.apache.spark.streaming.kafka.Broker 16 | import kafka.common.TopicAndPartition 17 | object SparkWithLocalTest { 18 | var sc: SparkContext = null 19 | val zookeeper="" 20 | System.setProperty("hadoop.home.dir", "F:\\eclipse\\hdplocal2.6.0") 21 | val correctData=new ArrayList[(String,String,String,Int)] 22 | def main(args: Array[String]): Unit = { 23 | val sparkConf = new SparkConf() 24 | .setMaster("local") 25 | .setAppName("Test") 26 | sc = new SparkContext(sparkConf) 27 | 28 | 29 | 30 | } 31 | 32 | def getKafkaRDD(){ 33 | var kafkaParams = Map[String, String]("metadata.broker.list" -> "kafka1:9092,kafka2:9092,kafka3:9092", 34 | "serializer.class" -> "kafka.serializer.StringEncoder", 35 | "group.id" -> "test", "zookeeper.connect" -> zookeeper) 36 | KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder, (String, String)]( 37 | sc, 38 | kafkaParams, 39 | null, 40 | Map[TopicAndPartition, Broker](), 41 | (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)) 42 | 43 | 44 | 45 | } 46 | 47 | 48 | 49 | def peixu(){ 50 | val rdd= sc.textFile("/data/test") 51 | rdd.flatMap{x=>x.split(" ")} 52 | .map{x=>(x,1)} 53 | .reduceByKey(_+_) 54 | .sortBy({case(key,num)=>num},false) 55 | .foreach(println) 56 | } 57 | def runJob(){ 58 | var rdd=sc.parallelize(Array((0,0))) 59 | var tmprdd=sc.parallelize(Array((0,1))) 60 | .map{x=>println("@");x} 61 | val rrdd=tmprdd.groupByKey//when a shuffle is involved, Spark caches the shuffle output by default and does not recompute it 62 | .map{x=>println("##");(x._1,x._2)} 63 | rrdd.foreach(println) 64 | rrdd.foreach(println) 65 | 66 | 67 | 68 | } 69 | 70 | def sparkTest(){ 71 | val rdd=sc.parallelize(Array((1,6),(7,8),(9,1)),3).zipWithIndex().map(x=>(x._2,x._1)) 72 | rdd.foreach(println) 73 | val rdd2=rdd.map{x=> 74 | var index=x._1-1 75 | (index,x._2) 76 | } 77 | rdd2.foreach(println) 78 | rdd.join(rdd2).map{x=> 79 | val (f,s)=x._2 80 |
(s._1,s._2-f._2)}.foreach(println) 81 | } 82 | 83 | def init { 84 | val sparkConf = new SparkConf() 85 | .setMaster("local") 86 | .setAppName("Test") 87 | sc = new SparkContext(sparkConf) 88 | } 89 | } -------------------------------------------------------------------------------- /src/main/scala/com/test/Test.scala: -------------------------------------------------------------------------------- 1 | package com.test 2 | 3 | import java.util.HashMap 4 | import scala.collection.mutable.ArrayBuffer 5 | import scala.collection.mutable.Seq 6 | import java.io.File 7 | import java.net.URLClassLoader 8 | import java.net.URL 9 | import scala.collection.JavaConverters._ 10 | import scala.collection.JavaConversions._ 11 | import com.test.Utilities 12 | object Test extends Utilities{ 13 | def main(args: Array[String]): Unit = { 14 | //println(fun((1,1))) 15 | //val a=new HashMap[String,String] 16 | //a.put("a", "a") 17 | //t1(a) 18 | //println(a) 19 | //t2(a) 20 | //println(a) 21 | /* val url=new File("C:\\Users\\zhiziyun\\Desktop\\test-0.0.1-SNAPSHOT.jar").toURI().toURL() 22 | val d=new URLClassLoader(Array(url), Thread.currentThread().getContextClassLoader()) 23 | val a= d.loadClass("test.HelloWord") 24 | a.getMethod("printwoed",classOf[String]).invoke(a.newInstance(),"hello world") 25 | */ 26 | val v_l5mon_date = getDateStr(getNMonthAgo(getNDayAgo(1), 4)) 27 | println(v_l5mon_date) 28 | val v_data_date = getDateStr_(getNDayAgo(1)) 29 | println(v_data_date) 30 | val v_next_date = getDateStr_() 31 | println(v_next_date) 32 | val v_data_day = getDateStr_() 33 | println(v_data_day) 34 | val v_mth_stt = getMonthStart() 35 | println(v_mth_stt) 36 | val v_mth_end = getMonthEnd() 37 | println(v_mth_end) 38 | } 39 | def t1(a: HashMap[String, String]) { 40 | a.clear() 41 | } 42 | 43 | def t2(a: HashMap[String, String]) { 44 | a.put("1", "1") 45 | } 46 | def fun(str: Any, data: String) = { 47 | str match { 48 | case i: Int => "INt" + ":" + data 49 | case s: String => "String" + ":" + data 50 | case map: HashMap[_, _] => 51 | "Map" + ":" + data 52 | str.asInstanceOf[HashMap[String, String]].toString() 53 | case t: (_, _) => 54 | "Tuple2" + ":" + data 55 | t.asInstanceOf[Tuple2[Int, Int]].toString() 56 | } 57 | } 58 | def write( 59 | data: String, 60 | fun: (Any, String) => String) = { 61 | println(fun("", data)) 62 | println(fun((1, 1), data)) 63 | } 64 | } 65 | 66 | case class casetest(a: String)(val b: String) { 67 | def d = { 68 | println(b) 69 | } 70 | } 71 | object EnumerationTest extends Enumeration { 72 | type EnumerationTest = Value 73 | val b, c, d = Value 74 | } -------------------------------------------------------------------------------- /src/main/scala/com/test/TestJava.java: -------------------------------------------------------------------------------- 1 | package com.test; 2 | 3 | import java.util.ArrayList; 4 | 5 | public class TestJava { 6 | public static void main(String[] args) { 7 | ArrayList da=new ArrayList(); 8 | da.add("a"); 9 | } 10 | public static void aa(String[] a){ 11 | System.out.println(a[0]); 12 | } 13 | public String[] bb(){ 14 | return new String[1]; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/com/test/Utilities.scala: -------------------------------------------------------------------------------- 1 | package com.test 2 | 3 | import org.json.JSONObject 4 | import org.json.JSONArray 5 | import org.json.JSONException 6 | import java.text.SimpleDateFormat 7 | import java.util.Date 8 | import 
java.util.regex.Pattern 9 | import java.util.Calendar 10 | import java.lang.Long 11 | 12 | trait Utilities { 13 | def transObject(o1:JSONObject):JSONObject={ 14 | val o2=new JSONObject(); 15 | val it = o1.keys(); 16 | while (it.hasNext()) { 17 | val key = it.next().asInstanceOf[String]; 18 | val obj = o1.get(key); 19 | if(obj.getClass().toString().endsWith("String")){ 20 | o2.accumulate(key.toLowerCase(), obj); 21 | }else if(obj.getClass().toString().endsWith("JSONObject")){ 22 | o2.accumulate(key.toLowerCase(), transObject(obj.asInstanceOf[JSONObject])); 23 | }else if(obj.getClass().toString().endsWith("JSONArray")){ 24 | o2.put(key.toLowerCase(), transArray(o1.getJSONArray(key))); 25 | } 26 | } 27 | o2 28 | } 29 | def transArray( o1:JSONArray):JSONArray={ 30 | val o2 = new JSONArray(); 31 | for (i <- 0 to o1.length-1) { 32 | val jArray=o1.getJSONObject(i); 33 | if(jArray.getClass().toString().endsWith("JSONObject")){ 34 | o2.put(transObject(jArray.asInstanceOf[JSONObject])); 35 | }else if(jArray.getClass().toString().endsWith("JSONArray")){ 36 | o2.put(transArray(jArray.asInstanceOf[JSONArray])); 37 | } 38 | } 39 | o2; 40 | } 41 | 42 | def getNMonthAgo(calendar: Calendar, n: Int) = { 43 | 44 | calendar.add(Calendar.MONTH, -n) 45 | calendar 46 | } 47 | 48 | def getNDayAgo(n: Int) = { 49 | val calendar = Calendar.getInstance 50 | val time = calendar.getTimeInMillis - n*24*60*60*1000 51 | calendar.setTimeInMillis(time) 52 | calendar 53 | } 54 | 55 | def getDateStr(calendar: Calendar) = { 56 | val date = calendar.getTime 57 | val sdf = new SimpleDateFormat("yyyyMMdd") 58 | val str = sdf.format(date) 59 | str 60 | } 61 | 62 | def getDateStr_(calendar: Calendar) = { 63 | val date = calendar.getTime 64 | val sdf = new SimpleDateFormat("yyyy-MM-dd") 65 | val str = sdf.format(date) 66 | str 67 | } 68 | 69 | def getDateStr_(time: Long) = { 70 | val date = new Date(time) 71 | val sdf = new SimpleDateFormat("yyyy-MM-dd") 72 | val str = sdf.format(date) 73 | str 74 | } 75 | 76 | def getDateStr() = { 77 | val date = new Date() 78 | val sdf = new SimpleDateFormat("yyyyMMdd") 79 | val str = sdf.format(date) 80 | str 81 | } 82 | 83 | def getDateStr_() = { 84 | val date = new Date() 85 | val sdf = new SimpleDateFormat("yyyy-MM-dd") 86 | val str = sdf.format(date) 87 | str 88 | } 89 | 90 | def getMonthStart() = { 91 | val cale = Calendar.getInstance() 92 | cale.add(Calendar.MONTH, 0) 93 | cale.set(Calendar.DAY_OF_MONTH, 1) 94 | val firstday = getDateStr_(cale) 95 | firstday 96 | } 97 | 98 | def getMonthEnd() = { 99 | val cale = Calendar.getInstance() 100 | cale.add(Calendar.MONTH, 1) 101 | cale.set(Calendar.DAY_OF_MONTH, 0) 102 | val lastday = getDateStr_(cale) 103 | lastday 104 | } 105 | 106 | def isnull(key: Object): Boolean = { 107 | if (key != null) { 108 | true 109 | } else { 110 | false 111 | } 112 | } 113 | 114 | def getCurrent_time(): Long = { 115 | val now = new Date() 116 | val a = now.getTime 117 | var str = a + "" 118 | str.substring(0, 10).toLong 119 | } 120 | 121 | def getZero_time(): Long = { 122 | val now = new Date() 123 | val dateFormat = new SimpleDateFormat("yyyy-MM-dd") 124 | val a = dateFormat.parse(dateFormat.format(now)).getTime 125 | var str = a + "" 126 | str.substring(0, 10).toLong 127 | } 128 | 129 | def getTimestamp(): String = { 130 | var ts = System.currentTimeMillis() 131 | ts.toString 132 | } 133 | 134 | def getMD5hash(s: String) = { 135 | val m = java.security.MessageDigest.getInstance("MD5") 136 | val b = s.getBytes("UTF-8") 137 | m.update(b, 0, b.length) 138 | new 
java.math.BigInteger(1, m.digest()).toString(16) 139 | } 140 | 141 | /** Makes sure only ERROR messages get logged to avoid log spam. */ 142 | def setupLogging() = { 143 | import org.apache.log4j.{ Level, Logger } 144 | val rootLogger = Logger.getRootLogger() 145 | rootLogger.setLevel(Level.WARN) 146 | } 147 | 148 | /** Configures Twitter service credentials using twiter.txt in the main workspace directory */ 149 | def setupTwitter() = { 150 | import scala.io.Source 151 | 152 | for (line <- Source.fromFile("../twitter.txt").getLines) { 153 | val fields = line.split(" ") 154 | if (fields.length == 2) { 155 | System.setProperty("twitter4j.oauth." + fields(0), fields(1)) 156 | } 157 | } 158 | } 159 | 160 | /** Retrieves a regex Pattern for parsing Apache access logs. */ 161 | def apacheLogPattern(): Pattern = { 162 | val ddd = "\\d{1,3}" 163 | val ip = s"($ddd\\.$ddd\\.$ddd\\.$ddd)?" 164 | val client = "(\\S+)" 165 | val user = "(\\S+)" 166 | val dateTime = "(\\[.+?\\])" 167 | val request = "\"(.*?)\"" 168 | val status = "(\\d{3})" 169 | val bytes = "(\\S+)" 170 | val referer = "\"(.*?)\"" 171 | val agent = "\"(.*?)\"" 172 | val regex = s"$ip $client $user $dateTime $request $status $bytes $referer $agent" 173 | Pattern.compile(regex) 174 | } 175 | } -------------------------------------------------------------------------------- /src/main/scala/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | dfs.nameservices 7 | nameservice-zzy 8 | 9 | 10 | dfs.client.failover.proxy.provider.nameservice-zzy 11 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 12 | 13 | 14 | dfs.ha.automatic-failover.enabled.nameservice-zzy 15 | true 16 | 17 | 18 | ha.zookeeper.quorum 19 | mongodb3:2181,solr1.zhiziyun.com:2181,solr2.zhiziyun.com:2181 20 | 21 | 22 | dfs.ha.namenodes.nameservice-zzy 23 | namenode47,namenode237 24 | 25 | 26 | dfs.namenode.rpc-address.nameservice-zzy.namenode47 27 | mongodb3:8020 28 | 29 | 30 | dfs.namenode.servicerpc-address.nameservice-zzy.namenode47 31 | mongodb3:8022 32 | 33 | 34 | dfs.namenode.http-address.nameservice-zzy.namenode47 35 | mongodb3:50070 36 | 37 | 38 | dfs.namenode.https-address.nameservice-zzy.namenode47 39 | mongodb3:50470 40 | 41 | 42 | dfs.namenode.rpc-address.nameservice-zzy.namenode237 43 | solr2.zhiziyun.com:8020 44 | 45 | 46 | dfs.namenode.servicerpc-address.nameservice-zzy.namenode237 47 | solr2.zhiziyun.com:8022 48 | 49 | 50 | dfs.namenode.http-address.nameservice-zzy.namenode237 51 | solr2.zhiziyun.com:50070 52 | 53 | 54 | dfs.namenode.https-address.nameservice-zzy.namenode237 55 | solr2.zhiziyun.com:50470 56 | 57 | 58 | dfs.replication 59 | 3 60 | 61 | 62 | dfs.blocksize 63 | 134217728 64 | 65 | 66 | dfs.client.use.datanode.hostname 67 | false 68 | 69 | 70 | fs.permissions.umask-mode 71 | 022 72 | 73 | 74 | dfs.namenode.acls.enabled 75 | false 76 | 77 | 78 | dfs.client.read.shortcircuit 79 | false 80 | 81 | 82 | dfs.domain.socket.path 83 | /var/run/hdfs-sockets/dn 84 | 85 | 86 | dfs.client.read.shortcircuit.skip.checksum 87 | false 88 | 89 | 90 | dfs.client.domain.socket.data.traffic 91 | false 92 | 93 | 94 | dfs.datanode.hdfs-blocks-metadata.enabled 95 | true 96 | 97 | 98 | -------------------------------------------------------------------------------- /src/main/scala/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | hive.metastore.uris 7 | thrift://mongodb3:9083 8 | 9 | 10 | 
hive.metastore.client.socket.timeout 11 | 300 12 | 13 | 14 | hive.metastore.warehouse.dir 15 | /user/hive/warehouse 16 | 17 | 18 | hive.warehouse.subdir.inherit.perms 19 | true 20 | 21 | 22 | hive.enable.spark.execution.engine 23 | false 24 | 25 | 26 | hive.conf.restricted.list 27 | hive.enable.spark.execution.engine 28 | 29 | 30 | mapred.reduce.tasks 31 | -1 32 | 33 | 34 | hive.exec.reducers.bytes.per.reducer 35 | 67108864 36 | 37 | 38 | hive.exec.copyfile.maxsize 39 | 33554432 40 | 41 | 42 | hive.exec.reducers.max 43 | 1099 44 | 45 | 46 | hive.metastore.execute.setugi 47 | true 48 | 49 | 50 | hive.support.concurrency 51 | true 52 | 53 | 54 | hive.zookeeper.quorum 55 | mongodb3,solr2.zhiziyun.com,solr1.zhiziyun.com 56 | 57 | 58 | hive.zookeeper.client.port 59 | 2181 60 | 61 | 62 | hbase.zookeeper.quorum 63 | mongodb3,solr2.zhiziyun.com,solr1.zhiziyun.com 64 | 65 | 66 | hbase.zookeeper.property.clientPort 67 | 2181 68 | 69 | 70 | hive.zookeeper.namespace 71 | hive_zookeeper_namespace_hive 72 | 73 | 74 | hive.cluster.delegation.token.store.class 75 | org.apache.hadoop.hive.thrift.MemoryTokenStore 76 | 77 | 78 | hive.server2.enable.doAs 79 | true 80 | 81 | 82 | hive.server2.use.SSL 83 | false 84 | 85 | 86 | -------------------------------------------------------------------------------- /src/main/scala/log4j.properties: -------------------------------------------------------------------------------- 1 | #log4j.rootLogger=INFO 2 | 3 | # the "info" level here is required 4 | #log4j.logger.kafka=info,kafka 5 | ## appender KAFKA 6 | #log4j.appender.kafka=org.apache.kafka.log4jappender.KafkaLog4jAppender 7 | #log4j.appender.kafka.topic=test 8 | #log4j.appender.kafka.brokerList=kafka1:9092,kafka2:9092,kafka3:9092 9 | #log4j.appender.kafka.syncSend=true 10 | #log4j.appender.kafka.layout=org.apache.log4j.PatternLayout 11 | #log4j.appender.kafka.layout.ConversionPattern=%m 12 | # this means the kafka logger does not inherit the global config (rootLogger) 13 | #log4j.additivity.kafka=false 14 | 15 | 16 | ## appender console 17 | #log4j.appender.console=org.apache.log4j.ConsoleAppender 18 | #log4j.appender.console.target=System.err 19 | #log4j.appender.console.layout=org.apache.log4j.PatternLayout 20 | #log4j.appender.console.layout.ConversionPattern=%m%n 21 | 22 | -------------------------------------------------------------------------------- /src/test/scala/samples/junit.scala: -------------------------------------------------------------------------------- 1 | package samples 2 | 3 | import org.junit._ 4 | import Assert._ 5 | 6 | @Test 7 | class AppTest { 8 | 9 | @Test 10 | def testOK() = assertTrue(true) 11 | 12 | // @Test 13 | // def testKO() = assertTrue(false) 14 | 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/test/scala/samples/scalatest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2001-2009 Artima, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | package samples 17 | 18 | /* 19 | ScalaTest facilitates different styles of testing by providing traits you can mix 20 | together to get the behavior and syntax you prefer. A few examples are 21 | included here. For more information, visit: 22 | 23 | http://www.scalatest.org/ 24 | 25 | One way to use ScalaTest is to help make JUnit or TestNG tests more 26 | clear and concise. Here's an example: 27 | */ 28 | import scala.collection.mutable.Stack 29 | import org.scalatest.Assertions 30 | import org.junit.Test 31 | 32 | class StackSuite extends Assertions { 33 | 34 | @Test def stackShouldPopValuesIinLastInFirstOutOrder() { 35 | val stack = new Stack[Int] 36 | stack.push(1) 37 | stack.push(2) 38 | assert(stack.pop() === 2) 39 | assert(stack.pop() === 1) 40 | } 41 | 42 | @Test def stackShouldThrowNoSuchElementExceptionIfAnEmptyStackIsPopped() { 43 | val emptyStack = new Stack[String] 44 | intercept[NoSuchElementException] { 45 | emptyStack.pop() 46 | } 47 | } 48 | } 49 | 50 | /* 51 | Here's an example of a FunSuite with ShouldMatchers mixed in: 52 | */ 53 | import org.scalatest.FunSuite 54 | import org.scalatest.matchers.ShouldMatchers 55 | 56 | import org.junit.runner.RunWith 57 | import org.scalatest.junit.JUnitRunner 58 | @RunWith(classOf[JUnitRunner]) 59 | class ListSuite extends FunSuite with ShouldMatchers { 60 | 61 | test("An empty list should be empty") { 62 | List() should be ('empty) 63 | Nil should be ('empty) 64 | } 65 | 66 | test("A non-empty list should not be empty") { 67 | List(1, 2, 3) should not be ('empty) 68 | List("fee", "fie", "foe", "fum") should not be ('empty) 69 | } 70 | 71 | test("A list's length should equal the number of elements it contains") { 72 | List() should have length (0) 73 | List(1, 2) should have length (2) 74 | List("fee", "fie", "foe", "fum") should have length (4) 75 | } 76 | } 77 | 78 | /* 79 | ScalaTest also supports the behavior-driven development style, in which you 80 | combine tests with text that specifies the behavior being tested. Here's 81 | an example whose text output when run looks like: 82 | 83 | A Map 84 | - should only contain keys and values that were added to it 85 | - should report its size as the number of key/value pairs it contains 86 | */ 87 | import org.scalatest.FunSpec 88 | import scala.collection.mutable.Stack 89 | 90 | class ExampleSpec extends FunSpec { 91 | 92 | describe("A Stack") { 93 | 94 | it("should pop values in last-in-first-out order") { 95 | val stack = new Stack[Int] 96 | stack.push(1) 97 | stack.push(2) 98 | assert(stack.pop() === 2) 99 | assert(stack.pop() === 1) 100 | } 101 | 102 | it("should throw NoSuchElementException if an empty stack is popped") { 103 | val emptyStack = new Stack[Int] 104 | intercept[NoSuchElementException] { 105 | emptyStack.pop() 106 | } 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/test/scala/samples/specs.scala: -------------------------------------------------------------------------------- 1 | package samples 2 | 3 | import org.junit.runner.RunWith 4 | import org.specs2.mutable._ 5 | import org.specs2.runner._ 6 | 7 | 8 | /** 9 | * Sample specification. 
10 | * 11 | * This specification can be executed with: scala -cp ${package}.SpecsTest 12 | * Or using maven: mvn test 13 | * 14 | * For more information on how to write or run specifications, please visit: 15 | * http://etorreborre.github.com/specs2/guide/org.specs2.guide.Runners.html 16 | * 17 | */ 18 | @RunWith(classOf[JUnitRunner]) 19 | class MySpecTest extends Specification { 20 | "The 'Hello world' string" should { 21 | "contain 11 characters" in { 22 | "Hello world" must have size(11) 23 | } 24 | "start with 'Hello'" in { 25 | "Hello world" must startWith("Hello") 26 | } 27 | "end with 'world'" in { 28 | "Hello world" must endWith("world") 29 | } 30 | } 31 | } 32 | --------------------------------------------------------------------------------