├── .gitignore ├── .idea ├── .name ├── compiler.xml ├── copyright │ └── profiles_settings.xml ├── encodings.xml ├── libraries │ ├── Maven__com_clearspring_analytics_stream_2_7_0.xml │ ├── Maven__com_google_code_findbugs_jsr305_1_3_9.xml │ ├── Maven__com_google_code_gson_gson_2_2_4.xml │ ├── Maven__com_google_protobuf_protobuf_java_2_5_0.xml │ ├── Maven__com_ning_compress_lzf_1_0_3.xml │ ├── Maven__com_sun_jersey_jersey_client_1_9.xml │ ├── Maven__com_sun_jersey_jersey_core_1_9.xml │ ├── Maven__com_sun_xml_bind_jaxb_core_2_2_11.xml │ ├── Maven__com_sun_xml_bind_jaxb_impl_2_2_11.xml │ ├── Maven__commons_beanutils_commons_beanutils_1_7_0.xml │ ├── Maven__commons_beanutils_commons_beanutils_core_1_8_0.xml │ ├── Maven__commons_cli_commons_cli_1_2.xml │ ├── Maven__commons_configuration_commons_configuration_1_6.xml │ ├── Maven__commons_digester_commons_digester_1_8.xml │ ├── Maven__commons_httpclient_commons_httpclient_3_1.xml │ ├── Maven__commons_io_commons_io_2_4.xml │ ├── Maven__commons_logging_commons_logging_1_1_3.xml │ ├── Maven__commons_net_commons_net_2_2.xml │ ├── Maven__io_dropwizard_metrics_metrics_core_3_1_2.xml │ ├── Maven__io_dropwizard_metrics_metrics_graphite_3_1_2.xml │ ├── Maven__io_dropwizard_metrics_metrics_json_3_1_2.xml │ ├── Maven__io_dropwizard_metrics_metrics_jvm_3_1_2.xml │ ├── Maven__javax_servlet_servlet_api_2_5.xml │ ├── Maven__javax_xml_bind_jaxb_api_2_2_2.xml │ ├── Maven__javax_xml_stream_stax_api_1_0_2.xml │ ├── Maven__junit_junit_4_11.xml │ ├── Maven__log4j_log4j_1_2_17.xml │ ├── Maven__net_jpountz_lz4_lz4_1_3_0.xml │ ├── Maven__org_apache_avro_avro_ipc_1_7_7.xml │ ├── Maven__org_apache_avro_avro_ipc_tests_1_7_7.xml │ ├── Maven__org_apache_avro_avro_mapred_hadoop2_1_7_7.xml │ ├── Maven__org_apache_camel_camel_core_2_17_0.xml │ ├── Maven__org_apache_camel_camel_test_2_17_0.xml │ ├── Maven__org_apache_commons_commons_compress_1_4_1.xml │ ├── Maven__org_apache_commons_commons_math3_3_4_1.xml │ ├── Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml │ ├── Maven__org_apache_directory_api_api_util_1_0_0_M20.xml │ ├── Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml │ ├── Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml │ ├── Maven__org_apache_ivy_ivy_2_4_0.xml │ ├── Maven__org_codehaus_jackson_jackson_jaxrs_1_9_13.xml │ ├── Maven__org_codehaus_jackson_jackson_xc_1_9_13.xml │ ├── Maven__org_fusesource_leveldbjni_leveldbjni_all_1_8.xml │ ├── Maven__org_hamcrest_hamcrest_core_1_3.xml │ ├── Maven__org_mortbay_jetty_jetty_util_6_1_26.xml │ ├── Maven__org_scala_lang_modules_scala_xml_2_11_1_0_4.xml │ ├── Maven__org_slf4j_slf4j_api_1_7_13.xml │ ├── Maven__org_slf4j_slf4j_log4j12_1_7_13.xml │ ├── Maven__org_spark_project_spark_unused_1_0_0.xml │ ├── Maven__org_tukaani_xz_1_0.xml │ ├── Maven__oro_oro_2_0_8.xml │ ├── Maven__xerces_xercesImpl_2_9_1.xml │ ├── Maven__xml_apis_xml_apis_1_3_04.xml │ └── Maven__xmlenc_xmlenc_0_52.xml ├── misc.xml ├── modules.xml ├── scala_compiler.xml └── uiDesigner.xml ├── BigDataLearning.iml ├── META-INF └── MANIFEST.MF ├── RandomPrefix.txt ├── SampleJoin1.txt ├── SampleJoin2.txt ├── mapjoin.txt ├── mapjoin1.txt ├── pom.xml ├── readme.md └── src └── main ├── resources ├── META-INF │ └── MANIFEST.MF ├── core-site.xml └── log4j.properties └── scala └── com └── mobin ├── Advanced_Analytics_with_Spark ├── NaStatCounter.scala └── Patient.scala ├── Example ├── AudienceAnalysis.scala ├── GenerateHFile.java ├── HiveDataBaseConnection.java ├── PutDataToHBase.java ├── RandomPrefix_Shuffle.scala ├── 
Sample_Shuffle.scala ├── ScoresDataGenerator.scala ├── SecondSortBykey.scala ├── SexCount.java ├── SparkJoin.scala └── StudentDataGenerator.scala ├── HDFS ├── FSUtils │ ├── CountFileLine.java │ └── FSUtils.java ├── HDFSCompressionCodec.java ├── LzoCompress.java └── WriteToHDFS.scala ├── Kafka ├── KStream │ └── KStreamDemo.java ├── Partition │ └── StockPartitionor.java ├── Producers │ ├── KafkaProducerThread.java │ ├── QuotationProducer.java │ └── StockQuotationInfo.java ├── Topic.java └── consumers │ ├── KafkaConsumerThread.java │ ├── QuotationConsumer.java │ └── QuotationConsumerManualCommit.java ├── SparkRDDFun └── TransFormation │ ├── Action │ ├── Aggregate.scala │ ├── Fold.scala │ ├── Func.scala │ └── KVFunc.scala │ ├── BaseRDD │ ├── Cartesian.scala │ ├── Coalesce.scala │ ├── Distinct.scala │ ├── FlatMap.scala │ ├── Glom.scala │ ├── MakeRDD.scala │ ├── Map.scala │ ├── MapPartitions.scala │ ├── MapPartitionsWithIndex.scala │ ├── RandomSplit.scala │ ├── Sample.scala │ ├── Union.scala │ └── ZipWithIndex.scala │ └── KVRDD │ ├── AggregateAndFold.scala │ ├── Cogroup.scala │ ├── CombineByKey.scala │ ├── CombineByKey1.scala │ ├── FlatMapValus.scala │ ├── FoldByKey.scala │ ├── GroupByKey.scala │ ├── Join.scala │ ├── MapJoinJava.java │ ├── MapSideJoin.scala │ ├── MapValues.scala │ ├── PartitionBy.scala │ ├── ReduceByKey.scala │ └── SortByKey.scala ├── SparkSQL ├── PeopleDemo.scala ├── RowNumber.scala └── SGC_LET_SHOOL_HOUR.scala ├── Telecom └── AirPlaneMode.scala └── sparkStreaming ├── FileStreaming.scala ├── Flume ├── SampleLogGenerator.java ├── ScalaLoadDistributedEvents.scala ├── ScalaLogAnalyzerJson.scala ├── ScalaLogAnalyzerMap.scala ├── ScalaQueryingStreams.scala └── ScalaTransformLogEvents.scala ├── GenerateChar.scala ├── Kafka └── UserBehaviorMsgProducer.scala ├── QueueStream.scala ├── ScoketStreaming.scala ├── ScoketStreamingCheckPoint.scala ├── StateFull.scala └── WindowWordCount.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | receivedBlockMetadata 3 | target 4 | .checkpoint* 5 | spark-warehouse -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | BigDataLearning -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /.idea/copyright/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_clearspring_analytics_stream_2_7_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_code_findbugs_jsr305_1_3_9.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 
--------------------------------------------------------------------------------
[.idea library and project descriptors (libraries/*.xml, misc.xml, modules.xml, scala_compiler.xml, uiDesigner.xml) and BigDataLearning.iml: IntelliJ IDEA configuration files; their XML markup did not survive extraction, so only the file names listed in the tree above remain.]
--------------------------------------------------------------------------------
/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Main-Class: com.mobin.sparkStreaming.GenerateChar
3 |
4 |
-------------------------------------------------------------------------------- /RandomPrefix.txt: -------------------------------------------------------------------------------- 1 | Hello 2 | Hello 3 | HI 4 | HI -------------------------------------------------------------------------------- /SampleJoin1.txt: -------------------------------------------------------------------------------- 1 | 1,a1 2 | 1,a2 3 | 1,a3 4 | 1,a4 5 | 1,a5 6 | 1,a6 7 | 1,a7 8 | 1,a8 9 | 1,a9 10 | 1,a10 11 | 1,a11 12 | 1,a12 13 | 1,a13 14 | 1,a14 15 | 1,a15 16 | 1,a16 17 | 1,a17 18 | 1,a18 19 | 1,a19 20 | 1,a20 21 | 1,a21 22 | 1,a22 23 | 2,b 24 | 2,b -------------------------------------------------------------------------------- /SampleJoin2.txt: -------------------------------------------------------------------------------- 1 | 1,a 2 | 2,b 3 | 3,c -------------------------------------------------------------------------------- /mapjoin.txt: -------------------------------------------------------------------------------- 1 | 1,2,3 2 | 2,4,5 -------------------------------------------------------------------------------- /mapjoin1.txt: -------------------------------------------------------------------------------- 1 | 1,A,B 2 | 2,C,D -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 4.0.0 5 | 6 | com.mobin 7 | BigDataLearning 8 | jar 9 | 1.0-SNAPSHOT 10 | 11 | A Camel Scala Route 12 | http://www.myorganization.org 13 | 14 | 15 | UTF-8 16 | UTF-8 17 | 18 | 19 | 20 | 21 | 22 | 23 | org.apache.kafka 24 | kafka_2.12 25 | 2.0.0 26 | 27 | 28 | 29 | 30 | org.apache.kafka 31 | kafka-clients 32 | 2.0.0 33 | 34 | 35 | 36 | 37 | com.alibaba 38 | fastjson 39 | 1.2.47 40 | 41 | 42 | 43 | 44 | org.apache.spark 45 | spark-sql_2.11 46 | 2.2.2 47 | 48 | 49 | 50 | org.apache.spark 51 | spark-core_2.11 52 | 2.2.2 53 | 54 | 55 | 56 | 57 | org.apache.hadoop 58 | hadoop-client 59 | 2.7.5 60 | 61 | 62 | 63 | 64 | org.apache.spark 65 | spark-streaming_2.11 66 | 2.2.2 67 | 68 | 69 | 70 | 71 | org.apache.spark 72 | spark-streaming-flume_2.11 73 | 2.3.1 74 | 75 | 76 | 77 | 78 | 79 | org.scala-lang 80 | scala-library 81 | 2.11.8 82 | 83 | 84 | org.scala-lang.modules 85 | scala-xml_2.11 86 | 1.0.4 87 | 88 | 89 | 90 | 91 | org.slf4j 92 | slf4j-api 93 | 1.7.13 94 | 95 | 96 | org.slf4j 97 | slf4j-log4j12 98 | 1.7.13 99 | 100 | 101 | log4j 102 | log4j 103 | 1.2.17 104 | 105 | 106 | 107 | 108 | org.apache.camel 109 | camel-test 110 | 2.17.0 111 | test 112 | 113 | 114 | 115 | 116 | install 117 | src/main/scala 118 | src/test/scala 119 | 120 | 121 | 122 | 123 | 124 | org.apache.maven.plugins 125 | maven-compiler-plugin 126 | 3.5.1 127 | 128 | 1.7 129 | 1.7 130 | 131 | 132 | 133 | org.apache.maven.plugins 134 | maven-resources-plugin 135 | 2.6 136 | 137 | UTF-8 138 | 139 | 140 | 141 | 142 | 143 | net.alchim31.maven 144 | scala-maven-plugin 145 | 3.2.2 146 | 147 | 148 | 149 | compile 150 | testCompile 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | org.apache.maven.plugins 159 | maven-eclipse-plugin 160 | 2.10 161 | 162 | 163 | org.scala-ide.sdt.core.scalanature 164 | org.eclipse.jdt.core.javanature 165 | 166 | 167 | org.scala-ide.sdt.core.scalabuilder 168 | 169 | 170 | org.scala-ide.sdt.launching.SCALA_CONTAINER 171 | org.eclipse.jdt.launching.JRE_CONTAINER 172 | 173 | 174 | org.scala-lang:scala-library 175 | org.scala-lang:scala-compiler 176 | 177 | 178 | **/*.scala 179 | **/*.java 180 | 181 | 182 | 183 | 184 
| 185 | org.apache.maven.plugins 186 | maven-assembly-plugin 187 | 2.5.5 188 | 189 | 190 | jar-with-dependencies 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | BigData Framework learning examples -------------------------------------------------------------------------------- /src/main/resources/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Main-Class: com.mobin.sparkStreaming.com.mobin.sparkStreaming.FileStre 3 | aming 4 | 5 | -------------------------------------------------------------------------------- /src/main/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | io.compression.codecs 6 | org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec,com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec 7 | 8 | 9 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # The logging properties used 3 | # 4 | log4j.rootLogger=INFO, out 5 | 6 | # uncomment the following line to turn on Camel debugging 7 | #log4j.logger.org.apache.camel=DEBUG 8 | 9 | log4j.logger.org.springframework=WARN 10 | 11 | # CONSOLE appender not used by default 12 | log4j.appender.out=org.apache.log4j.ConsoleAppender 13 | log4j.appender.out.layout=org.apache.log4j.PatternLayout 14 | log4j.appender.out.layout.ConversionPattern=[%30.30t] %-30.30c{1} %-5p %m%n 15 | #log4j.appender.out.layout.ConversionPattern=%d [%-15.15t] %-5p %-30.30c{1} - %m%n 16 | 17 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Advanced_Analytics_with_Spark/NaStatCounter.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.Advanced_Analytics_with_Spark 2 | 3 | import org.apache.spark.util.StatCounter 4 | /** 5 | * Created by Mobin on 2017/3/8. 6 | */ 7 | class NaStatCounter extends Serializable{ 8 | 9 | val stats: StatCounter = new StatCounter() 10 | var missing: Long = 0 11 | 12 | def add(x: Double): NaStatCounter = { 13 | if(java.lang.Double.isNaN(x)){ 14 | missing += 1 15 | } else { 16 | stats.merge(x) 17 | } 18 | this 19 | } 20 | 21 | def merge(other: NaStatCounter): NaStatCounter = { 22 | stats.merge(other.stats) 23 | missing += other.missing 24 | this 25 | } 26 | 27 | override def toString = { 28 | "stats: " + stats.toString() + "NaN: " + missing 29 | } 30 | } 31 | 32 | object NaStatCounter extends Serializable{ 33 | def apply(x: Double) = new NaStatCounter().add(x ) 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Advanced_Analytics_with_Spark/Patient.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.Advanced_Analytics_with_Spark 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by Mobin on 2017/3/7. 
7 | */ 8 | case class MatchData(id1: Int, id2: Int, scores: Array[Double], matched: Boolean) 9 | object Patient { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setMaster("local").setAppName("Patient") 13 | val sc = new SparkContext(conf) 14 | val rawblocks = sc.textFile(args(0)) 15 | val mds = rawblocks.filter(!isHeader(_)).map(pares) 16 | // val grouped = mds.groupBy(x => x.matched).mapValues(x => x.size).foreach(println) //按matched分组统计 17 | // val sort = mds.map(x => x.matched).sortBy(_).foreach(println) 18 | val nsdRDD = mds.map(md => 19 | md.scores.map(d => NaStatCounter(d)) 20 | ).foreach(x => println(x(1))) 21 | } 22 | 23 | def isHeader(line: String): Boolean = { 24 | line.contains("id_1") 25 | } 26 | 27 | def toDouble(s: String): Double = { 28 | if ("?".equals(s)) 29 | Double.NaN 30 | else 31 | s.toDouble 32 | } 33 | 34 | def pares(line: String)={ 35 | val pieces = line.split(",") 36 | val id1 = pieces(0).toInt 37 | val id2 = pieces(1).toInt 38 | val scores = pieces.slice(2,11).map(toDouble) //取数据的[2,11)位并转化成Double类型 39 | val matched = pieces(11).toBoolean 40 | MatchData(id1, id2, scores, matched) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Example/AudienceAnalysis.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.Example 2 | 3 | /** 4 | * Created by Mobin on 2016/11/15. 5 | */ 6 | object AudienceAnalysis { 7 | 8 | lazy val nameIndexMap = { 9 | val nameIndexMap = scala.collection.mutable.HashMap.empty[String, Int] 10 | val basicNames = Seq("first_name", "last_name", "email", "company", "job", "street_address", "city", 11 | "state_abbr", "zipcode_plus4", "url", "phoen_number", "user_agent", "user_name") 12 | nameIndexMap ++= basicNames zip (0 to 12) 13 | for(i <- 0 to 328){ 14 | nameIndexMap ++= Seq(("letter_" + i, i * 3 + 13),("number_" + i, i * 3 +14), ("bool_" + i, i *3 +15)) 15 | } 16 | 17 | nameIndexMap 18 | } 19 | 20 | def $(name: String): Int = nameIndexMap.getOrElse(name, -1) 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Example/GenerateHFile.java: -------------------------------------------------------------------------------- 1 | //package com.mobin.Example; 2 | // 3 | //import org.apache.hadoop.conf.Configuration; 4 | //import org.apache.hadoop.fs.Path; 5 | //import org.apache.hadoop.hbase.HBaseConfiguration; 6 | //import org.apache.hadoop.hbase.TableName; 7 | //import org.apache.hadoop.hbase.client.*; 8 | //import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 9 | //import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2; 10 | //import org.apache.hadoop.io.LongWritable; 11 | //import org.apache.hadoop.io.Text; 12 | //import org.apache.hadoop.mapreduce.Job; 13 | //import org.apache.hadoop.mapreduce.Mapper; 14 | //import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | //import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | // 17 | //import java.io.IOException; 18 | // 19 | ///** 20 | // * Created by Mobin on 2016/12/22. 
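Patient.scala above builds one Array[NaStatCounter] per record but only prints a single element of it; a common follow-up (a sketch reusing the mds RDD and the NaStatCounter API defined above, not code from this repository) is to reduce the arrays column by column into one counter per score column:

// merge the per-record counters column-wise across all records
val columnStats = mds.map(md => md.scores.map(d => NaStatCounter(d)))
  .reduce((a, b) => a.zip(b).map { case (x, y) => x.merge(y) })
// one line of statistics (count/mean/stdev plus NaN count) per score column
columnStats.foreach(println)
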
21 | // */ 22 | //public class GenerateHFile { 23 | // 24 | // static class HFileMapper extends Mapper{ 25 | // @Override 26 | // protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 27 | // String[] line = value.toString().split(","); 28 | // String rk = line[0]; 29 | // ImmutableBytesWritable rowkey = new ImmutableBytesWritable(rk.getBytes()); 30 | // Put put = new Put(rk.getBytes()); 31 | // put.addColumn("S".getBytes(),"name".getBytes(), line[1].getBytes()); 32 | // put.addColumn("S".getBytes(), "sex".getBytes(), line[2].getBytes()); 33 | // put.addColumn("S".getBytes(), "age".getBytes(), line[3].getBytes()); 34 | // put.addColumn("S".getBytes(), "class".getBytes(), line[4].getBytes()); 35 | // context.write(rowkey, put); 36 | // } 37 | // } 38 | // 39 | // public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 40 | // final String INPUT_PATH = "/DATA/PUBLIC/NOCE/SGC/Student.txt"; 41 | // final String OUT_PATH = "/DATA/PUBLIC/NOCE/SGC/HFILE"; 42 | // Configuration conf = HBaseConfiguration.create(); 43 | // HTable table = new HTable(conf,"STUDENT"); 44 | // Job job = Job.getInstance(conf); 45 | // job.setJarByClass(GenerateHFile.class); 46 | // job.setMapperClass(HFileMapper.class); 47 | // job.setMapOutputKeyClass(ImmutableBytesWritable.class); 48 | // job.setMapOutputValueClass(Put.class); 49 | // 50 | // job.setOutputFormatClass(HFileOutputFormat2.class); 51 | // HFileOutputFormat2.configureIncrementalLoad(job,table,table.getRegionLocator()); 52 | // FileInputFormat.setInputPaths(job, INPUT_PATH); 53 | // FileOutputFormat.setOutputPath(job, new Path(OUT_PATH)); 54 | // System.exit(job.waitForCompletion(true)?0:1); 55 | // 56 | // } 57 | //} 58 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Example/HiveDataBaseConnection.java: -------------------------------------------------------------------------------- 1 | //package com.mobin.Example; 2 | // 3 | //import org.apache.hadoop.hive.ql.metadata.Hive; 4 | // 5 | //import java.sql.Connection; 6 | //import java.sql.DriverManager; 7 | //import java.sql.SQLException; 8 | // 9 | ///** 10 | // * Created by MOBIN on 2016/9/21. 
11 | // */ 12 | //public class HiveDataBaseConnection { 13 | // private final static String DriverName = "org.apache.hive.jdbc.HiveDriver"; 14 | // private final static String URL = "jdbc:hive2://132.122.70.2:10000/default"; 15 | // private final static String UserName = ""; 16 | // private final static String Password = ""; 17 | // private Connection con; 18 | // 19 | // public HiveDataBaseConnection(){ 20 | // try { 21 | // Class.forName(DriverName); 22 | // con = DriverManager.getConnection(URL,UserName, Password); 23 | // System.out.println(con); 24 | // } catch (ClassNotFoundException e) { 25 | // e.printStackTrace(); 26 | // } catch (SQLException e) { 27 | // e.printStackTrace(); 28 | // } 29 | // } 30 | // 31 | // public Connection getConnection(){ 32 | // return con; 33 | // } 34 | // 35 | // public void Close(){ 36 | // try { 37 | // if(con != null) 38 | // con.close(); 39 | // } catch (SQLException e) { 40 | // e.printStackTrace(); 41 | // } 42 | // } 43 | // 44 | // public static void main(String[] args) { 45 | // HiveDataBaseConnection connection = new HiveDataBaseConnection(); 46 | // } 47 | //} 48 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Example/PutDataToHBase.java: -------------------------------------------------------------------------------- 1 | //package com.mobin.Example; 2 | // 3 | //import org.apache.hadoop.conf.Configuration; 4 | //import org.apache.hadoop.fs.Path; 5 | //import org.apache.hadoop.hbase.HBaseConfiguration; 6 | //import org.apache.hadoop.hbase.client.HTable; 7 | //import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles; 8 | // 9 | // 10 | ///** 11 | // * Created by Mobin on 2016/12/22. 12 | // */ 13 | //public class PutDataToHBase { 14 | // public static void main(String[] args) throws Exception { 15 | // Configuration conf = HBaseConfiguration.create(); 16 | // LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf); 17 | // load.doBulkLoad(new Path("HFILE"), new HTable(conf,"STUDENT")); 18 | // } 19 | //} 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Example/RandomPrefix_Shuffle.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.Example 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | import scala.util.Random 6 | 7 | /** 8 | * Created by Mobin on 2017/8/29. 9 | * 先局部聚合再全局聚合 10 | */ 11 | object RandomPrefix_Shuffle { 12 | 13 | def main(args: Array[String]) { 14 | val conf = new SparkConf().setMaster("local[*]").setAppName("RandomPrefix") 15 | val sc = new SparkContext(conf) 16 | val line = sc.textFile("RandomPrefix.txt").map((_,1)) 17 | val randomPrefixRdd = line.map(x => { 18 | val random = Random 19 | val prefix = random.nextInt(10) 20 | (prefix + "_" + x._1 , x._2) 21 | }) 22 | 23 | val localAggrRdd = randomPrefixRdd.reduceByKey(_ + _) 24 | val removeRandPrefixRdd = localAggrRdd.map(x => { 25 | val k = x._1.split("_")(1) 26 | (k, x._2) 27 | }) 28 | val globalAggrRdd = removeRandPrefixRdd.reduceByKey(_ + _) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Example/Sample_Shuffle.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.Example 2 | 3 | import java.util 4 | 5 | import org.apache.spark.{SparkContext, SparkConf} 6 | 7 | import scala.util.Random 8 | 9 | /** 10 | * Created by Mobin on 2017/8/30. 
11 |  * Approach (from "Spark Performance Tuning Guide -- Advanced" on the Meituan-Dianping tech blog):
12 |  * 1. From the RDD that contains the few keys with oversized record counts, draw a sample with the sample operator, count how many records each key has, and work out which keys carry the most data.
13 |  *
14 |  * 2. Split the data for those keys out of the original RDD into a separate RDD and prefix each key with a random number smaller than n; the majority of keys, which do not cause skew, form another RDD.
15 |  *
16 |  * 3. From the other RDD to be joined, likewise filter out the data for those skewed keys into a separate RDD and expand each record into n records, each given one of the n prefixes in order; the non-skewed keys again form another RDD.
17 |  *
18 |  * 4. Join the randomly prefixed RDD with the n-times expanded RDD; the originally identical key is now scattered into n variants, so the join work is spread across many tasks.
19 |  *
20 |  * 5. The two remaining ordinary RDDs are joined as usual.
21 |  *
22 |  * 6. Finally, union the results of the two joins; that is the final join result.
23 |  */
24 | object Sample_Shuffle {
25 |
26 |   def main(args: Array[String]) {
27 |     val conf = new SparkConf().setMaster("local[*]").setAppName("sample")
28 |     val sc = new SparkContext(conf)
29 |
30 |     val rdd1 = sc.textFile("SampleJoin1.txt").map(x => {
31 |       val kv = x.split(",")
32 |       (kv(0), kv(1))
33 |     })
34 |
35 |     val sampleRdd = rdd1.sample(false, 0.1)                               // sample rdd1
36 |     val countSampleRdd = sampleRdd.map(x => (x._1, 1)).reduceByKey(_ + _) // count the frequency of each key
37 |     val reversedSampleRdd = countSampleRdd.map(x => (x._2, x._1))
38 |     val skewedUserid = reversedSampleRdd.sortByKey(false).take(1)(0)._2   // sort by frequency and take the key with the highest count
39 |     val skewRdd = rdd1.filter(_._1.equals(skewedUserid))                  // split the skew-causing key out of rdd1 into its own RDD
40 |     val commonRdd = rdd1.filter(!_._1.equals(skewedUserid))               // the keys that do not cause skew form another RDD
41 |
42 |     val rdd2 = sc.textFile("SampleJoin2.txt").map(x => {
43 |       val kv = x.split(",")
44 |       (kv(0), kv(1))
45 |     })
46 |
47 |     println("skew: " + skewedUserid)
48 |     // expand the skewed key in rdd2 n times (n = 10 here); the prefix range must match the one used below
49 |     val skewRdd2 = rdd2.filter(_._1.equals(skewedUserid)).flatMap(x => {
50 |       for (i <- 0 until 10) yield (i + "_" + x._1, x._2)
51 |     })
52 |
53 |     // give every record in skewRdd a random prefix and join it with skewRdd2
54 |     val joinRdd = skewRdd.map(x => {
55 |       val prefix = Random.nextInt(10)
56 |       (prefix + "_" + x._1, x._2)
57 |     }).join(skewRdd2).map(x => {
58 |       val key = x._1.split("_")(1)
59 |       (key, x._2)
60 |     })
61 |
62 |     val joinRdd2 = commonRdd.join(rdd2)
63 |     val resultRdd = joinRdd.union(joinRdd2)
64 |     resultRdd.foreach(println)
65 |   }
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Example/ScoresDataGenerator.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.Example
2 |
3 | import java.io.FileWriter
4 |
5 | import scala.util.Random
6 |
7 | /**
8 |  * Created by Mobin on 2016/12/22.
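The prefix-and-expand idea used in Sample_Shuffle.scala above can also be captured in small helpers (a sketch; the object and method names are illustrative and not part of this repository). The essential constraint is that the random prefixes on the skewed side and the replicated prefixes on the small side cover exactly the same range 0 until n, otherwise some prefixed keys never find a join partner:

import org.apache.spark.rdd.RDD
import scala.util.Random

object SkewJoinHelpers {
  // prefix each key of the skewed side with a random integer in [0, n)
  def addRandomPrefix(rdd: RDD[(String, String)], n: Int): RDD[(String, String)] =
    rdd.map { case (k, v) => (Random.nextInt(n) + "_" + k, v) }

  // replicate each record of the small side once per prefix in [0, n)
  def expandWithPrefixes(rdd: RDD[(String, String)], n: Int): RDD[(String, String)] =
    rdd.flatMap { case (k, v) => (0 until n).map(i => (i + "_" + k, v)) }

  // join the two prefixed RDDs, then strip the prefix from the result keys
  def prefixedJoin(skewed: RDD[(String, String)], small: RDD[(String, String)], n: Int): RDD[(String, (String, String))] =
    addRandomPrefix(skewed, n).join(expandWithPrefixes(small, n))
      .map { case (pk, vw) => (pk.split("_", 2)(1), vw) }
}
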
9 | * sno string, //学号 10 | * semester int, //学期 11 | * math int, // 数学成绩 12 | * en int, // 英语成绩 13 | * c int, // C语言成绩 14 | * os int // 操作系统成绩 15 | 16 | */ 17 | object ScoresDataGenerator { 18 | private val FILE_OUTPATH = "Scores.txt" 19 | private val MAX_RECORD = 1000; 20 | 21 | def main(args: Array[String]) { 22 | Generator(FILE_OUTPATH,MAX_RECORD) 23 | } 24 | 25 | 26 | private def Generator(filePath: String, recordNum: Int) { 27 | var write: FileWriter = null 28 | try { 29 | write = new FileWriter(filePath, true) 30 | val rand = new Random() 31 | val term = 1 32 | for(i <- 1 to recordNum){ 33 | val MScore = generatorScore 34 | val EScore = generatorScore 35 | val CScore = generatorScore 36 | val SScore = generatorScore 37 | write.write(i + "," + term + "," + MScore + "," + EScore + "," + CScore + "," + SScore) 38 | write.write(System.getProperty("line.separator")) 39 | write.flush() 40 | } 41 | } catch { 42 | case e => println("error") 43 | }finally { 44 | if (write != null) 45 | write.close() 46 | } 47 | } 48 | 49 | private def generatorScore: Int = { 50 | val rand = new Random() 51 | val sc = rand.nextInt(100) 52 | val score = sc match { 53 | case s if(s >0 && s <10) => s + 80 54 | case s if(s >10 && s < 30) => s + 70 55 | case s if(s >30 && s < 50) => s + 40 56 | case s if(s >50 && s < 60) => s + 20 57 | case _ => sc 58 | } 59 | score 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Example/SecondSortBykey.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.Example 2 | 3 | /** 4 | * Created by Mobin on 2017/9/3. 5 | */ 6 | class SecondSortBykey(val first: Int, val second: Int) extends Ordered [SecondSortBykey] with Serializable { 7 | def compare(other:SecondSortBykey):Int = { 8 | if (this.first - other.first !=0) { 9 | this.first - other.first 10 | } else { 11 | this.second - other.second 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Example/SexCount.java: -------------------------------------------------------------------------------- 1 | package com.mobin.Example; 2 | 3 | import org.apache.hadoop.conf.Configurable; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | import java.io.IOException; 16 | 17 | /** 18 | * Created by Mobin on 2016/12/22. 
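SecondSortBykey above extends Ordered, so it can serve directly as the key of sortByKey to get a secondary sort on two integer columns. A minimal usage sketch (the input file name and the two-integer line layout are assumed for illustration):

import org.apache.spark.{SparkConf, SparkContext}
import com.mobin.Example.SecondSortBykey

object SecondSortExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("SecondSort"))
    // each line holds two integers, e.g. "3 7"
    val keyed = sc.textFile("pairs.txt").map { line =>
      val f = line.split("\\s+")
      (new SecondSortBykey(f(0).toInt, f(1).toInt), line)
    }
    // sortByKey uses SecondSortBykey.compare: ascending on first, then on second
    keyed.sortByKey().map(_._2).foreach(println)
    sc.stop()
  }
}
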
19 | * 性别统计 20 | */ 21 | public class SexCount { 22 | static class SexMapper extends Mapper{ 23 | @Override 24 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 25 | String[] line = value.toString().split(","); 26 | context.write(new Text(line[2]), new IntWritable(1)); 27 | } 28 | } 29 | 30 | static class SexReduce extends Reducer{ 31 | @Override 32 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 33 | int count = 0; 34 | for(IntWritable c : values) 35 | count += c.get(); 36 | context.write(key, new IntWritable(count)); 37 | } 38 | } 39 | 40 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 41 | final String INPUT_PATH = "Student.txt"; 42 | final String OUT_PATH = "StudentSex"; 43 | Configuration conf = new Configuration(); 44 | Job job = Job.getInstance(conf); 45 | 46 | job.setMapperClass(SexMapper.class); 47 | job.setReducerClass(SexReduce.class); 48 | job.setJarByClass(SexCount.class); 49 | 50 | 51 | job.setOutputKeyClass(Text.class); 52 | job.setOutputValueClass(IntWritable.class); 53 | 54 | FileInputFormat.setInputPaths(job, INPUT_PATH); 55 | FileOutputFormat.setOutputPath(job, new Path(OUT_PATH)); 56 | System.exit(job.waitForCompletion(true)?0:1); 57 | 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Example/SparkJoin.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.Example 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | 6 | /** 7 | * Created by Mobin on 2016/12/22. 8 | */ 9 | object SparkJoin { 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setAppName("SparkJoin").setMaster("local") 12 | val sc = new SparkContext(conf) 13 | val student = sc.textFile("Student.t") 14 | val scores = sc.textFile("Scores.txt") 15 | val studentT = student.map(str => str.split(",")).map(x => (x(0), x(1) +"," + x(2) + "," +x(3) + "," + x(4))) 16 | val scoresT = scores.map(str => str.split(",")).map(x => (x(0), x(1) +"," + x(2) + "," +x(3) + "," + x(4) + "," + x(5))) 17 | studentT.join(scoresT).foreach(println) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Example/StudentDataGenerator.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.Example 2 | 3 | import java.io.FileWriter 4 | 5 | import scala.util.Random 6 | 7 | /** 8 | * Created by Mobin on 2016/12/22. 
9 | * sno string, //学号 10 | * name string,//姓名 11 | * sex string, //性别 12 | * age int, //年龄 13 | * class string //班级 14 | */ 15 | object StudentDataGenerator { 16 | private val FILE_OUTPATH = "Student.txt" 17 | private val MAX_RECORD = 10000; 18 | 19 | def main(args: Array[String]) { 20 | Generator(FILE_OUTPATH, MAX_RECORD) 21 | } 22 | 23 | private def Generator(filePath: String, recordNum: Int) { 24 | 25 | var write: FileWriter = null 26 | try { 27 | write = new FileWriter(filePath, true) 28 | val rand = new Random(); 29 | for (i <- 1 to recordNum) { 30 | val name = nameGenerator 31 | val sex = sexGenerator 32 | //年龄在20~22之间 33 | val age = rand.nextInt(3) + 20 34 | //班级 35 | val classNum = rand.nextInt(6) 36 | write.write(i + "," + name + "," + sex + "," + age + "," + classNum) 37 | write.write(System.getProperty("line.separator")) 38 | write.flush() 39 | } 40 | } catch { 41 | case e => println("error") 42 | } finally { 43 | if (write != null) 44 | write.close() 45 | } 46 | } 47 | 48 | //生成姓名 49 | private def nameGenerator: String = { 50 | val higthPos = (176 + Math.abs(new Random().nextInt(39))) 51 | val lowPos = (176 + Math.abs(new Random().nextInt(93))) 52 | val name = Array[Byte](new Integer(higthPos).byteValue(), new Integer(lowPos).byteValue()) 53 | val surname = Array("钟", "李", "张", "刘", "王", "章", "洪", "江", "戴") 54 | surname(new Random().nextInt(9)) + new String(name, "GBK") 55 | } 56 | 57 | //生成性别 58 | private def sexGenerator: String = { 59 | val random = new Random() 60 | val randomNum = random.nextInt(2) + 1 61 | randomNum % 2 match { 62 | case 0 => "男" 63 | case _ => "女" 64 | } 65 | } 66 | } 67 | 68 | 69 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/HDFS/FSUtils/CountFileLine.java: -------------------------------------------------------------------------------- 1 | package com.mobin.HDFS.FSUtils; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | 6 | import java.io.File; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.concurrent.Callable; 10 | import java.util.concurrent.ExecutionException; 11 | import java.util.concurrent.Future; 12 | import java.util.concurrent.FutureTask; 13 | 14 | /** 15 | * Created by Mobin on 2016/12/20. 
16 | * 统计一个目录下的lzo文件的行数,每个lzo起一个task 17 | */ 18 | public class CountFileLine implements Callable{ 19 | public FileSystem fs; 20 | public String path; 21 | 22 | @Override 23 | public Integer call() throws Exception { 24 | return countLine(fs,path); 25 | } 26 | 27 | public Integer countLine(FileSystem fs,String path) throws IOException { 28 | int count = 0; 29 | FSUtils.BufferedReadIterable brl = new FSUtils.BufferedReadIterable(fs,path); 30 | for(String line: brl){ 31 | count ++; 32 | } 33 | System.out.println(count); 34 | return count; 35 | } 36 | 37 | public static void main(String[] args) throws IOException, ExecutionException, InterruptedException { 38 | int sum=0; 39 | String file = "E:\\DATA\\PUBLIC\\NOCE\\AGG\\AGG_EVT_LTE_DPI_NEW\\hour=2016102011"; 40 | Configuration conf = new Configuration(); 41 | FileSystem fs = FileSystem.get(conf); 42 | ArrayList> tasks = new ArrayList<>(); 43 | File[] files = new File(file).listFiles(); 44 | for(File f: files){ 45 | if(f.getName().endsWith(".lzo")){ 46 | CountFileLine cd = new CountFileLine(); 47 | cd.fs = fs; 48 | cd.path = f.getPath(); 49 | FutureTask task = new FutureTask(cd); 50 | tasks.add(task); 51 | Thread thread = new Thread(task); 52 | System.out.println(thread.getName()); 53 | thread.start(); 54 | } 55 | } 56 | 57 | for(Future future: tasks){ 58 | sum += future.get(); 59 | } 60 | System.out.println(sum); 61 | 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/HDFS/FSUtils/FSUtils.java: -------------------------------------------------------------------------------- 1 | package com.mobin.HDFS.FSUtils; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FSDataInputStream; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.compress.CompressionCodec; 8 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 9 | import org.apache.hadoop.io.compress.CompressionInputStream; 10 | 11 | import java.io.BufferedReader; 12 | import java.io.Closeable; 13 | import java.io.IOException; 14 | import java.io.InputStreamReader; 15 | import java.util.Iterator; 16 | 17 | /** 18 | * Created by Mobin on 2016/12/14. 
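CountFileLine.java above starts one raw Thread and FutureTask per .lzo file and then sums the results; the same fan-out-and-sum pattern in Scala, sketched with scala.concurrent.Future (plain-text files assumed here, whereas the original goes through FSUtils to decompress .lzo):

import java.io.File
import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration.Duration
import scala.io.Source

object CountLinesInParallel {
  def main(args: Array[String]): Unit = {
    // one Future per file, each counting that file's lines
    val files = new File(args(0)).listFiles().filter(_.isFile).toSeq
    val counts = files.map(f => Future(Source.fromFile(f).getLines().size))
    // wait for all counts and add them up
    val total = Await.result(Future.sequence(counts), Duration.Inf).sum
    println(total)
  }
}
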
19 | * 统计一个目录下的lzo文件的行数,每个lzo起一个task 20 | */ 21 | public class FSUtils { 22 | private static final Configuration conf = new Configuration(); 23 | private static final FileSystem fs = null; 24 | 25 | public static void main(String[] args) throws IOException { 26 | String file = "E:\\DATA\\PUBLIC\\NOCE\\AGG\\AGG_EVT_LTE_DPI_NEW\\hour=2016102011\\m_p_0.txt.lzo"; 27 | int lineCount = 0; 28 | Configuration conf = new Configuration(); 29 | FileSystem fs = FileSystem.get(conf); 30 | try(BufferedReadIterable br = new BufferedReadIterable(fs,file)){ 31 | for(String line : br){ 32 | 33 | } 34 | } 35 | 36 | } 37 | 38 | public static BufferedReadIterable createBuferedReadIterable(FileSystem fs, String file) throws IOException { 39 | return new BufferedReadIterable(fs,file); 40 | } 41 | 42 | public static class BufferedReadIterable implements Iterable,Closeable{ 43 | private final String file; 44 | private final long size; 45 | private BufferedReader br; 46 | 47 | 48 | public BufferedReadIterable(FileSystem fs, String file) throws IOException { 49 | this.file = file; 50 | Path path = new Path(file); 51 | this.size = fs.getFileStatus(path).getLen(); 52 | 53 | CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf()); 54 | //HDFS根据文件的后缀来确定使用的是哪种压缩算法 55 | CompressionCodec codec = factory.getCodec(path); 56 | 57 | FSDataInputStream inputStream = fs.open(path,8192); 58 | if(codec == null){ 59 | br = new BufferedReader(new InputStreamReader(inputStream)); 60 | }else{ 61 | //先解压再读取 62 | CompressionInputStream comIn = codec.createInputStream(inputStream); 63 | br = new BufferedReader(new InputStreamReader(comIn)); 64 | } 65 | } 66 | 67 | @Override 68 | public void close() throws IOException { 69 | br.close(); 70 | } 71 | 72 | @Override 73 | public Iterator iterator() { 74 | return new Iterator() { 75 | private String line; 76 | @Override 77 | public boolean hasNext() { 78 | try { 79 | line = br.readLine(); 80 | } catch (IOException e) { 81 | line = null; 82 | } 83 | return line != null; 84 | } 85 | 86 | @Override 87 | public String next() { 88 | return line; 89 | } 90 | 91 | @Override 92 | public void remove() { 93 | throw new UnsupportedOperationException("remove"); 94 | } 95 | }; 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/HDFS/HDFSCompressionCodec.java: -------------------------------------------------------------------------------- 1 | package com.mobin.HDFS; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FSDataInputStream; 5 | import org.apache.hadoop.fs.FSDataOutputStream; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IOUtils; 9 | import org.apache.hadoop.io.compress.CompressionCodec; 10 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 11 | import org.apache.hadoop.io.compress.CompressionInputStream; 12 | import org.apache.hadoop.io.compress.CompressionOutputStream; 13 | import org.apache.hadoop.util.ReflectionUtils; 14 | 15 | import java.io.*; 16 | 17 | /** 18 | * Created by Mobin on 2016/12/19. 
19 | */ 20 | public class HDFSCompressionCodec { 21 | private static final Configuration conf = new Configuration(); 22 | private static FileSystem fs = null; 23 | //压缩 24 | public void coder(String path) throws IOException, ClassNotFoundException { 25 | //获取文件输入流 26 | File dir = new File(path); 27 | System.out.println(dir.isDirectory()); 28 | conf.set("mapred.output.compress", "true"); 29 | conf.set("mapred.output.compression.codec", "com.hadoop.compression.lzo.LzopCodec"); 30 | fs = FileSystem.get(conf); 31 | FSDataOutputStream out = fs.create(new Path("E:\\DATA\\PUBLIC\\NOCE\\school5.lzo")); 32 | Class codecClass = Class.forName("com.hadoop.compression.lzo.LzopCodec"); 33 | CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); 34 | //将压缩数据写入到school.gz中 35 | //创建CompressionInputStream来对文件进行压缩 36 | CompressionOutputStream codecout = codec.createOutputStream(out); 37 | for(File file:dir.listFiles() ) { 38 | try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file))) { 39 | try{ 40 | //最后个参数为true时同时关闭输出流和输入流 41 | IOUtils.copyBytes(in, codecout, 4096, false); 42 | }finally { 43 | IOUtils.closeStream(in); 44 | } 45 | } catch (FileNotFoundException e) { 46 | e.printStackTrace(); 47 | } catch (IOException e) { 48 | e.printStackTrace(); 49 | } 50 | } 51 | out.flush(); 52 | out.close(); 53 | } 54 | 55 | //解压 56 | public void decoder() throws IOException { 57 | fs = FileSystem.get(conf); 58 | CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf()); 59 | //根据文件的后缀名来确定使用的是哪种压缩算法 60 | Path path = new Path("E:\\DATA\\PUBLIC\\NOCE\\school.gz"); 61 | CompressionCodec codec = factory.getCodec(path); 62 | try(FSDataInputStream inputStream = fs.open(path,8096)){ 63 | //创建CompressionInputStream来对文件进行解压 64 | CompressionInputStream comInputStream = codec.createInputStream(inputStream); 65 | //将解压后的文件写到school.txt 66 | FSDataOutputStream out = fs.create(new Path("E:\\DATA\\PUBLIC\\NOCE\\school5.txt")); 67 | IOUtils.copyBytes(comInputStream,out,4096,false); 68 | comInputStream.close(); 69 | out.close(); 70 | } catch (IOException e) { 71 | e.printStackTrace(); 72 | } 73 | } 74 | 75 | public static void main(String[] args) throws IOException, ClassNotFoundException { 76 | String path = "E:\\DATA\\PUBLIC\\NOCE\\sch"; 77 | HDFSCompressionCodec codec = new HDFSCompressionCodec(); 78 | codec.coder(path); 79 | codec.decoder(); 80 | Integer i = 0; 81 | Integer o = 2; 82 | i.equals(o); 83 | Integer ii =i + o; 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/HDFS/LzoCompress.java: -------------------------------------------------------------------------------- 1 | package com.mobin.HDFS; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Created by Mobin on 2017/2/4. 
11 | */ 12 | public class LzoCompress { 13 | public static void main(String[] args) { 14 | Configuration conf = new Configuration(); 15 | try { 16 | FileSystem fs = FileSystem.get(conf); 17 | 18 | 19 | } catch (IOException e) { 20 | e.printStackTrace(); 21 | } 22 | } 23 | 24 | 25 | 26 | // public void LzoCoder(){ 27 | // try(){ 28 | // 29 | // } 30 | // } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/HDFS/WriteToHDFS.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.HDFS 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | import scala.util.Random 6 | 7 | /** 8 | * Created by hadoop on 3/6/16. 9 | * 为文件中的第一个字段拼接一个随机值再写入HDFS 10 | */ 11 | object WriteToHDFS { 12 | def main(args: Array[String]) { 13 | if (args.length < 2) { 14 | System.err.println("Usage: WriteToHDFS \n") 15 | System.exit(1) 16 | } 17 | val conf = new SparkConf().setMaster("local").setAppName("WriteToHDFS") 18 | val sc = new SparkContext(conf) 19 | val sgfile = sc.textFile(args(0)) 20 | 21 | val rdd = sgfile.map(lines => { 22 | val line = lines.split("\\s") 23 | if(line.length == 6){ 24 | val one = line(0) +"-"+ new Random().nextInt() 25 | one+","+line(1)+","+line(2).getBytes+","+line(3)+","+line(4)+","+line(5) 26 | }else //如果这样写 一定不能只写if语句 还要加上else语句,否则没有通过if的,将被视了() 否则后期通过Phoenix导入到HBase中会因为字段不合法而报错 27 | "mobin1"+","+"mobin2"+","+"mobin3"+"mobin4"+","+"mobin5"+","+"mobin6" 28 | }) 29 | rdd.saveAsTextFile(args(1)) 30 | sc.stop() 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Kafka/KStream/KStreamDemo.java: -------------------------------------------------------------------------------- 1 | package com.mobin.Kafka.KStream; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.Serdes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.kstream.ForeachAction; 9 | import org.apache.kafka.streams.kstream.KStream; 10 | import org.apache.kafka.streams.kstream.KTable; 11 | import org.apache.kafka.streams.kstream.Printed; 12 | 13 | import java.util.Properties; 14 | 15 | /** 16 | * Created with IDEA 17 | * Creater: MOBIN 18 | * Date: 2018/8/19 19 | * Time: 3:41 PM 20 | */ 21 | public class KStreamDemo { 22 | private static final String APPLICATION_ID_CONFIG = "KStream-test"; 23 | private static final String BROKER_LIST = "localhost:9092"; 24 | private static final String TOPIC = "streams-foo"; 25 | private static StreamsBuilder streamsBuilder; 26 | private static KStream textLine; 27 | 28 | public static Properties initProperties(){ 29 | Properties properties = new Properties(); 30 | properties.put(StreamsConfig.APPLICATION_ID_CONFIG, APPLICATION_ID_CONFIG); 31 | properties.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, BROKER_LIST); 32 | properties.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass()); 33 | properties.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass()); 34 | properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); 35 | return properties; 36 | } 37 | 38 | public static void printf() throws InterruptedException { 39 | Properties properties = initProperties(); 40 | streamsBuilder = new StreamsBuilder(); 41 | textLine = 
streamsBuilder.stream(TOPIC); 42 | textLine.foreach(new ForeachAction() { 43 | @Override 44 | public void apply(String key, String value) { 45 | System.out.println(key + ":" + value); 46 | } 47 | }); 48 | KafkaStreams streams = new KafkaStreams(streamsBuilder.build(), properties); 49 | streams.start(); 50 | Thread.sleep(5000L); 51 | streams.close(); 52 | } 53 | 54 | public static void main(String[] args) throws InterruptedException { 55 | KStreamDemo.printf(); 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Kafka/Partition/StockPartitionor.java: -------------------------------------------------------------------------------- 1 | package com.mobin.Kafka.Partition; 2 | 3 | import org.apache.kafka.clients.producer.Partitioner; 4 | import org.apache.kafka.common.Cluster; 5 | 6 | import java.util.Map; 7 | 8 | /** 9 | * Created with IDEA 10 | * Creater: MOBIN 11 | * Date: 2018/8/16 12 | * Time: 4:47 PM 13 | * //写好自定义分区后在配置文件进行自定义分区配置 14 | * properties.put("ProducerConfig.PARTITIONER_CLASS_CONFIG", StockPartitionor.class.getName) 15 | */ 16 | public class StockPartitionor implements Partitioner{ 17 | //分区数 18 | private static final Integer PARTITIONS = 6; 19 | @Override 20 | public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) { 21 | if (key == null){ 22 | return 0; 23 | } 24 | String stockcode = String.valueOf(key); 25 | try { 26 | int partitionID = Integer.valueOf(stockcode.substring(stockcode.length() - 2)) % PARTITIONS; 27 | return partitionID; 28 | }catch (NumberFormatException e){ 29 | return 0; 30 | } 31 | } 32 | 33 | @Override 34 | public void close() { 35 | 36 | } 37 | 38 | @Override 39 | public void configure(Map map) { 40 | 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Kafka/Producers/KafkaProducerThread.java: -------------------------------------------------------------------------------- 1 | package com.mobin.Kafka.Producers; 2 | 3 | import org.apache.kafka.clients.producer.*; 4 | import org.apache.kafka.common.serialization.StringSerializer; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import sun.util.resources.ga.LocaleNames_ga; 8 | 9 | import java.text.DecimalFormat; 10 | import java.util.Properties; 11 | import java.util.Random; 12 | import java.util.concurrent.ExecutorService; 13 | import java.util.concurrent.Executors; 14 | 15 | /** 16 | * Created with IDEA 17 | * Creater: MOBIN 18 | * Date: 2018/8/13 19 | * Time: 3:08 PM 20 | */ 21 | public class KafkaProducerThread implements Runnable { 22 | private static final int MSG_SIZE = 100; 23 | private static final String TOPIC = "stock-quotation4"; 24 | private static final String BROKER_LIST = "localhost:9092"; 25 | private static final Logger log = LoggerFactory.getLogger(KafkaProducerThread.class); 26 | private static KafkaProducer producer = null; 27 | private ProducerRecord record = null; 28 | 29 | public KafkaProducerThread(KafkaProducer producer, ProducerRecord record) { 30 | this.producer = producer; 31 | this.record = record; 32 | } 33 | 34 | @Override 35 | public void run() { 36 | System.out.println(producer + record.toString()); 37 | producer.send(record, new Callback() { 38 | 39 | @Override 40 | public void onCompletion(RecordMetadata recordMetadata, Exception e) { 41 | System.out.println("00000"); 42 | producer.send(record, new Callback() { 43 | @Override 44 | public void 
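                    /*
                     * Note: the producer.send just above runs inside the outer callback's onCompletion,
                     * so every record submitted by this thread is produced a second time once the first
                     * send completes. A completion callback normally only inspects the exception and the
                     * RecordMetadata, as the handler below does; a minimal sketch of that form:
                     *
                     *   producer.send(record, (metadata, exception) -> {
                     *       if (exception != null) {
                     *           log.error("send failed", exception);
                     *       } else {
                     *           log.info(String.format("offset:%s, partition:%s", metadata.offset(), metadata.partition()));
                     *       }
                     *   });
                     */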
onCompletion(RecordMetadata recordMetadata, Exception e) { 45 | if (null != e) { //发送消息异常 46 | log.error("发送消息异常..."); 47 | } 48 | if (null != recordMetadata) { 49 | log.info(String.format("offset:%s, partition:%s", recordMetadata.offset(), recordMetadata.partition())); 50 | } 51 | } 52 | }); 53 | } 54 | }); 55 | } 56 | 57 | private static StockQuotationInfo createQuotationInfo() { 58 | StockQuotationInfo quotationInfo = new StockQuotationInfo(); 59 | Random random = new Random(); 60 | Integer stockCode = 600100 + random.nextInt(); 61 | float r = (float) Math.random(); 62 | if (r / 2 < 0.5) { 63 | r = -r; 64 | } 65 | DecimalFormat decimalFormat = new DecimalFormat(".00"); 66 | quotationInfo.setCurrentPrice(Float.valueOf(decimalFormat.format(11 + r))); 67 | quotationInfo.setPreClosePrice(11.80f); 68 | quotationInfo.setOpenPrice(11.5f); 69 | quotationInfo.setLowPrice(10.5f); 70 | quotationInfo.setHighPrice(12.5f); 71 | quotationInfo.setStockCode(stockCode.toString()); 72 | quotationInfo.setTradeTime(System.currentTimeMillis()); 73 | quotationInfo.setStockName("股票-" + stockCode); 74 | return quotationInfo; 75 | } 76 | 77 | public static Properties initConfig() { 78 | Properties properties = new Properties(); 79 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BROKER_LIST); 80 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); 81 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); 82 | return properties; 83 | } 84 | 85 | public static void main(String[] args) { 86 | Properties configs = initConfig(); 87 | KafkaProducer producer = new KafkaProducer(configs); 88 | ProducerRecord record; 89 | StockQuotationInfo quotationInfo; 90 | ExecutorService executorService = Executors.newFixedThreadPool(10); 91 | long current = System.currentTimeMillis(); 92 | try { 93 | for (int i = 0; i < MSG_SIZE; i++) { 94 | quotationInfo = createQuotationInfo(); 95 | record = new ProducerRecord(TOPIC, null, quotationInfo.getTradeTime(), 96 | quotationInfo.getStockCode(), quotationInfo.toString()); 97 | executorService.submit(new KafkaProducerThread(producer, record)); 98 | } 99 | } catch (Exception e) { 100 | System.out.println("-------"); 101 | } finally { 102 | producer.close(); 103 | executorService.shutdown(); 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Kafka/Producers/QuotationProducer.java: -------------------------------------------------------------------------------- 1 | package com.mobin.Kafka.Producers; 2 | 3 | 4 | import org.apache.kafka.clients.producer.*; 5 | import org.apache.kafka.common.serialization.StringSerializer; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.text.DecimalFormat; 10 | import java.util.Properties; 11 | import java.util.Random; 12 | 13 | /** 14 | * Created with IDEA 15 | * Creater: MOBIN 16 | * Date: 2018/8/13 17 | * Time: 11:24 AM 18 | */ 19 | public class QuotationProducer { 20 | private static final Logger log = LoggerFactory.getLogger(QuotationProducer.class); 21 | private static final int MSG_SIZE = 100; 22 | private static final String TOPIC = "stock-quotation"; 23 | private static final String BROKER_LIST = "localhost:9092"; 24 | private static KafkaProducer producer = null; 25 | static { 26 | Properties configs = initConfig(); 27 | producer = new KafkaProducer(configs); 28 | } 29 | 30 | public static Properties initConfig(){ 31 | Properties 
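        /*
         * Note on KafkaProducerThread.main above: the finally block closes the producer and shuts the
         * executor down immediately after the submit loop, so queued tasks can still call send() on an
         * already-closed producer. A minimal sketch of draining the pool first (the one-minute timeout
         * is an arbitrary choice, and awaitTermination needs InterruptedException handling):
         *
         *   executorService.shutdown();
         *   executorService.awaitTermination(1, TimeUnit.MINUTES);
         *   producer.close();
         */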
properties = new Properties(); 32 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BROKER_LIST); 33 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); 34 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); 35 | return properties; 36 | } 37 | 38 | private static StockQuotationInfo createQuotationInfo(){ 39 | StockQuotationInfo quotationInfo = new StockQuotationInfo(); 40 | Random random = new Random(); 41 | Integer stockCode = 600100 + random.nextInt(); 42 | float r = (float) Math.random(); 43 | if (r / 2 < 0.5){ 44 | r = -r; 45 | } 46 | DecimalFormat decimalFormat = new DecimalFormat(".00"); 47 | quotationInfo.setCurrentPrice(Float.valueOf(decimalFormat.format(11 + r))); 48 | quotationInfo.setPreClosePrice(11.80f); 49 | quotationInfo.setOpenPrice(11.5f); 50 | quotationInfo.setLowPrice(10.5f); 51 | quotationInfo.setHighPrice(12.5f); 52 | quotationInfo.setStockCode(stockCode.toString()); 53 | quotationInfo.setTradeTime(System.currentTimeMillis()); 54 | quotationInfo.setStockName("股票-" + stockCode); 55 | return quotationInfo; 56 | } 57 | 58 | public static void main(String[] args) { 59 | ProducerRecord record = null; 60 | StockQuotationInfo quotationInfo = null; 61 | try { 62 | int num = 0; 63 | for (int i = 0; i < MSG_SIZE; i ++){ 64 | quotationInfo = createQuotationInfo(); 65 | record = new ProducerRecord(TOPIC,null, quotationInfo.getTradeTime(),quotationInfo.getStockCode() 66 | ,quotationInfo.toString()); 67 | producer.send(record); 68 | //异步方式,指定Callback,实现onCompleteion 69 | // producer.send(record, new Callback() { 70 | // @Override 71 | // public void onCompletion(RecordMetadata recordMetadata, Exception e) { 72 | // if (null != e){ //发送消息异常 73 | // log.error("发送消息异常..."); 74 | // } 75 | // if (null != recordMetadata){ 76 | // log.info(String.format("offset:%s, partition:%s", recordMetadata.offset(), recordMetadata.partition())); 77 | // } 78 | // } 79 | // }); 80 | if (num++ % 10 == 0){ 81 | Thread.sleep(2000L); 82 | } 83 | } 84 | }catch (InterruptedException e){ 85 | 86 | }finally { 87 | producer.close(); 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Kafka/Producers/StockQuotationInfo.java: -------------------------------------------------------------------------------- 1 | package com.mobin.Kafka.Producers; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * Created with IDEA 7 | * Creater: MOBIN 8 | * Date: 2018/8/13 9 | * Time: 11:19 AM 10 | */ 11 | public class StockQuotationInfo implements Serializable{ 12 | private static final long serialVersionUID = 1L; 13 | private String stockCode; 14 | private String stockName; 15 | private long tradeTime; 16 | private float preClosePrice; 17 | private float openPrice; 18 | private float currentPrice; 19 | private float highPrice; 20 | private float lowPrice; 21 | 22 | public static long getSerialVersionUID() { 23 | return serialVersionUID; 24 | } 25 | 26 | public String getStockCode() { 27 | return stockCode; 28 | } 29 | 30 | public void setStockCode(String stockCode) { 31 | this.stockCode = stockCode; 32 | } 33 | 34 | public String getStockName() { 35 | return stockName; 36 | } 37 | 38 | public void setStockName(String stockName) { 39 | this.stockName = stockName; 40 | } 41 | 42 | public long getTradeTime() { 43 | return tradeTime; 44 | } 45 | 46 | public void setTradeTime(long tradeTime) { 47 | this.tradeTime = tradeTime; 48 | } 49 | 50 | public float 
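    /*
     * Note on createQuotationInfo in the two producer classes above: random.nextInt() is unbounded, so
     * the generated stock code can be negative or overflow, and since Math.random() returns a value in
     * [0, 1) the test r / 2 < 0.5 is always true, which means the price offset is always negated. A
     * sketch of the presumably intended behaviour:
     *
     *   Integer stockCode = 600100 + random.nextInt(100);   // keep the codes in a small range
     *   float r = (float) Math.random();
     *   if (random.nextBoolean()) {
     *       r = -r;                                          // move the price up or down at random
     *   }
     */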
getPreClosePrice() { 51 | return preClosePrice; 52 | } 53 | 54 | public void setPreClosePrice(float preClosePrice) { 55 | this.preClosePrice = preClosePrice; 56 | } 57 | 58 | public float getOpenPrice() { 59 | return openPrice; 60 | } 61 | 62 | public void setOpenPrice(float openPrice) { 63 | this.openPrice = openPrice; 64 | } 65 | 66 | public float getCurrentPrice() { 67 | return currentPrice; 68 | } 69 | 70 | public void setCurrentPrice(float currentPrice) { 71 | this.currentPrice = currentPrice; 72 | } 73 | 74 | public float getHighPrice() { 75 | return highPrice; 76 | } 77 | 78 | public void setHighPrice(float highPrice) { 79 | this.highPrice = highPrice; 80 | } 81 | 82 | public float getLowPrice() { 83 | return lowPrice; 84 | } 85 | 86 | public void setLowPrice(float lowPrice) { 87 | this.lowPrice = lowPrice; 88 | } 89 | 90 | @Override 91 | public String toString() { 92 | return stockCode + "|" +stockName+ "|" +tradeTime+ "|" +preClosePrice+ "|" +openPrice 93 | + "|" +currentPrice+ "|" +highPrice+ "|" +lowPrice; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Kafka/Topic.java: -------------------------------------------------------------------------------- 1 | package com.mobin.Kafka; 2 | 3 | 4 | import org.apache.hadoop.yarn.webapp.hamlet.Hamlet; 5 | import org.apache.kafka.clients.admin.*; 6 | import org.apache.kafka.common.KafkaFuture; 7 | import org.apache.kafka.common.config.ConfigResource; 8 | 9 | import java.util.*; 10 | import java.util.concurrent.ExecutionException; 11 | 12 | /** 13 | * Created with IDEA 14 | * Creater: MOBIN 15 | * Date: 2018/8/12 16 | * Time: 5:18 PM 17 | */ 18 | public class Topic { 19 | private static final String ZK_CONNECT = "localhost:2181"; 20 | //ZK连接session过期时间 21 | private static final int SESSION_TIMEOUT = 30000; 22 | //连接超时时间 23 | private static final int CONNECT_TIMEOUT = 30000; 24 | 25 | public static void createTopic(AdminClient adminClient,String topic, int partition, short replica, Properties conf){ 26 | 27 | Map configs = new HashMap<>(); 28 | try { 29 | CreateTopicsResult result = adminClient.createTopics(Arrays.asList(new NewTopic(topic, partition, replica).configs(configs))); 30 | }catch (Exception e){ 31 | 32 | }finally { 33 | adminClient.close(); 34 | } 35 | } 36 | 37 | public static void deleteTopic(AdminClient adminClient,String topic, Properties conf){ 38 | adminClient.create(conf); 39 | KafkaFuture future = adminClient.deleteTopics(Arrays.asList(topic)).all(); 40 | try { 41 | future.get(); 42 | } catch (InterruptedException e) { 43 | e.printStackTrace(); 44 | } catch (ExecutionException e) { 45 | e.printStackTrace(); 46 | } 47 | } 48 | 49 | public static void updateTopicConfig(AdminClient adminClient, String topic) throws ExecutionException, InterruptedException { 50 | Config config = new Config(Arrays.asList(new ConfigEntry("max.message.bytes","404800"))); 51 | adminClient.alterConfigs(Collections.singletonMap(new ConfigResource(ConfigResource.Type.TOPIC, topic), config)).all().get(); 52 | } 53 | 54 | public static void showTopic(AdminClient adminClient, String topic) throws ExecutionException, InterruptedException { 55 | DescribeTopicsResult topicsResult = adminClient.describeTopics(Arrays.asList(topic)); 56 | Map map = topicsResult.all().get(); 57 | for (Map.Entry entry: map.entrySet()){ 58 | System.out.println(entry.getKey() + " : " + entry.getValue()); 59 | } 60 | 61 | } 62 | 63 | //查询所有Topics 64 | public static void showAllTopic(AdminClient 
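            /*
             * Two details in this class are worth noting: createTopic() closes the AdminClient it was
             * handed in its finally block, so any later call on the same client fails, and deleteTopic()
             * calls adminClient.create(conf), which invokes the static factory through an instance and
             * discards the new client it creates. createTopics() is asynchronous; to block until the
             * topic really exists one would wait on the result, e.g. result.all().get().
             */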
adminClient) throws ExecutionException, InterruptedException { 65 | ListTopicsOptions options = new ListTopicsOptions(); 66 | options.listInternal(true); 67 | ListTopicsResult result = adminClient.listTopics(options); 68 | Set topicName = result.names().get(); 69 | System.out.println(topicName); 70 | } 71 | 72 | public static void main(String[] args) throws ExecutionException, InterruptedException { 73 | String TOPIC = "APITopic"; 74 | Properties conf = new Properties(); 75 | conf.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 76 | AdminClient adminClient = AdminClient.create(conf); 77 | // Topic.createTopic(adminClient,"APITopic",1, (short) 1, conf); 78 | // Topic.deleteTopic(adminClient, TOPIC ,conf); 79 | // Topic.updateTopicConfig(adminClient, TOPIC); 80 | // Topic.showTopic(adminClient, TOPIC); 81 | Topic.showAllTopic(adminClient); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Kafka/consumers/KafkaConsumerThread.java: -------------------------------------------------------------------------------- 1 | package com.mobin.Kafka.consumers; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerRecord; 4 | import org.apache.kafka.clients.consumer.ConsumerRecords; 5 | import org.apache.kafka.clients.consumer.KafkaConsumer; 6 | 7 | import java.util.Arrays; 8 | import java.util.Properties; 9 | 10 | /** 11 | * Created with IDEA 12 | * Creater: MOBIN 13 | * Date: 2018/8/16 14 | * Time: 3:50 PM 15 | * 6个消费者线程消费同一个主题 16 | */ 17 | public class KafkaConsumerThread extends Thread { 18 | //每个线程拥有私有的KafkaConsumer实例 19 | private KafkaConsumer consumer; 20 | 21 | public KafkaConsumerThread(Properties consumerConfig, String topic) { 22 | this.consumer = new KafkaConsumer(consumerConfig); 23 | consumer.subscribe(Arrays.asList(topic)); 24 | } 25 | 26 | @Override 27 | public void run() { 28 | try { 29 | while (true) { 30 | ConsumerRecords records = consumer.poll(1000); 31 | for (ConsumerRecord record : records) { 32 | System.out.printf("partition = %d, offset = %d, key = %s value = %s%n", 33 | record.partition(), record.offset(), record.key(), record.value()); 34 | } 35 | } 36 | } catch (Exception e) { 37 | e.printStackTrace(); 38 | } finally { 39 | consumer.close(); 40 | } 41 | } 42 | 43 | public static void main(String[] args) { 44 | Properties properties = new Properties(); 45 | properties.put("bootstrap.servers", "localhost:9092"); 46 | properties.put("group.id", "test"); 47 | properties.put("enable.auto.commit", true); 48 | properties.put("auto.commit.interval.ms", 1000);//设置偏移量提交时间 49 | properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"); 50 | properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"); 51 | for (int i = 0; i < 6; i ++){ 52 | new KafkaConsumerThread(properties, "stock-quotation").start(); 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Kafka/consumers/QuotationConsumer.java: -------------------------------------------------------------------------------- 1 | package com.mobin.Kafka.consumers; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerRecord; 4 | import org.apache.kafka.clients.consumer.ConsumerRecords; 5 | import org.apache.kafka.clients.consumer.KafkaConsumer; 6 | 7 | import java.util.Arrays; 8 | import java.util.Properties; 9 | 10 | /** 11 | * Created with IDEA 12 | * Creater: MOBIN 13 | * Date: 2018/8/14 
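 * A minimal subscribe/poll loop; offsets are committed automatically because enable.auto.commit
 * defaults to true (compare QuotationConsumerManualCommit below). As in KafkaConsumerThread above,
 * each thread owns its own KafkaConsumer, since the client is not thread-safe, and within one
 * consumer group any consumers beyond the topic's partition count simply receive no records.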
14 | * Time: 3:40 PM 15 | */ 16 | public class QuotationConsumer { 17 | private static final String BROKERS_LIST = "localhost:9092"; 18 | private static final String GROUP_ID = "test"; 19 | private static final String CLIENT_ID = "test"; 20 | private static final String TOPIC = "stock-quotation"; 21 | private static KafkaConsumer consumer; 22 | 23 | static { 24 | Properties properties = initPorerties(); 25 | consumer = new KafkaConsumer(properties); 26 | } 27 | 28 | public static Properties initPorerties(){ 29 | Properties properties = new Properties(); 30 | properties.put("bootstrap.servers", BROKERS_LIST); 31 | properties.put("group.id", GROUP_ID); 32 | properties.put("client.id", CLIENT_ID); 33 | properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"); 34 | properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"); 35 | return properties; 36 | } 37 | 38 | public static void poll(){ 39 | consumer.subscribe(Arrays.asList(TOPIC)); 40 | try { 41 | while (true){ 42 | ConsumerRecords records = consumer.poll(1000); 43 | for (ConsumerRecord record: records){ 44 | System.out.printf("partition = %d, offset = %d, key = %s value = %s%n", 45 | record.partition(), record.offset(), record.key(), record.value()); 46 | } 47 | } 48 | }catch (Exception e){ 49 | 50 | }finally { 51 | consumer.close(); 52 | } 53 | } 54 | 55 | public static void main(String[] args) { 56 | QuotationConsumer.poll(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Kafka/consumers/QuotationConsumerManualCommit.java: -------------------------------------------------------------------------------- 1 | package com.mobin.Kafka.consumers; 2 | 3 | import org.apache.kafka.clients.consumer.*; 4 | import org.apache.kafka.common.TopicPartition; 5 | import org.codehaus.janino.IClass; 6 | 7 | import java.util.Arrays; 8 | import java.util.Collection; 9 | import java.util.Map; 10 | import java.util.Properties; 11 | 12 | /** 13 | * Created with IDEA 14 | * Creater: MOBIN 15 | * Date: 2018/8/14 16 | * Time: 3:40 PM 17 | * 每处理完10消息提交一次 18 | */ 19 | public class QuotationConsumerManualCommit { 20 | private static final String BROKERS_LIST = "localhost:9092"; 21 | private static final String GROUP_ID = "test"; 22 | private static final String CLIENT_ID = "test"; 23 | private static final String TOPIC = "stock-quotation"; 24 | private static KafkaConsumer consumer; 25 | 26 | static { 27 | Properties properties = initPorerties(); 28 | consumer = new KafkaConsumer(properties); 29 | } 30 | 31 | public static Properties initPorerties(){ 32 | Properties properties = new Properties(); 33 | properties.put("bootstrap.servers", BROKERS_LIST); 34 | properties.put("group.id", GROUP_ID); 35 | properties.put("client.id", CLIENT_ID); 36 | properties.put("fetch.max.bytes", 1024); //设置一次fetch请求取得的数据最大值为1kb,默认为5MB,这里是为了方便测试 37 | properties.put("enable.auto.commit", false); //手动提交偏移量 38 | properties.put("client.id", CLIENT_ID); 39 | properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"); 40 | properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"); 41 | return properties; 42 | } 43 | 44 | public static void poll(){ 45 | consumer.subscribe(Arrays.asList(TOPIC,"stock-quotation1"), new ConsumerRebalanceListener() { 46 | @Override 47 | public void onPartitionsRevoked(Collection collection) { 48 | } 49 | 50 | @Override 51 | public void 
onPartitionsAssigned(Collection partitions) { 52 | long committedOffset = -1; 53 | for (TopicPartition topicPartition: partitions){ 54 | // System.out.println(consumer.committed(topicPartition)); 55 | // committedOffset = consumer.committed(topicPartition).offset(); 56 | // System.out.println("当前"+topicPartition+"偏移量:"+committedOffset); 57 | consumer.seekToBeginning(partitions); 58 | } 59 | } 60 | }); 61 | try { 62 | int minCommitSize = 10;//最少处理10条消息后才进行提交 63 | int count = 0; //消息计算器 64 | while (true){ 65 | ConsumerRecords records = consumer.poll(1000); 66 | for (ConsumerRecord record: records){ 67 | System.out.printf("topic = %s, partition = %d, offset = %d, key = %s value = %s%n", 68 | record.topic(),record.partition(), record.offset(), record.key(), record.value()); 69 | count ++; 70 | } 71 | if (count >= minCommitSize) { 72 | consumer.commitAsync(new OffsetCommitCallback() { 73 | @Override 74 | public void onComplete(Map map, Exception e) { 75 | if (null == e){ 76 | System.out.println("提交成功"); 77 | }else { 78 | System.out.println("提交发生了异常"); 79 | } 80 | } 81 | }); 82 | count = 0; 83 | } 84 | } 85 | }catch (Exception e){ 86 | e.printStackTrace(); 87 | }finally { 88 | consumer.close(); 89 | } 90 | } 91 | 92 | public static void main(String[] args) { 93 | QuotationConsumerManualCommit.poll(); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/Action/Aggregate.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.Action 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/20/16. 7 | * seqOp函数将每个分区的数据聚合成类型为U的值,comOp函数将各分区的U类型数据聚合起来得到类型为U的值 8 | */ 9 | object Aggregate { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setMaster("local").setAppName("Fold") 13 | val sc = new SparkContext(conf) 14 | val rdd = sc.parallelize(List(1,2,3,4),2) 15 | val aggregateRDD = rdd.aggregate(2)(_+_,_ * _) 16 | println(aggregateRDD) 17 | sc.stop 18 | } 19 | 20 | /** 21 | * 步骤1:分区1:zeroValue+1+2=5 分区2:zeroValue+3+4=9 22 | 23 | 步骤2:2*分区1的结果*分区2的结果=90 24 | */ 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/Action/Fold.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.Action 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/20/16. 
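 * With the two-partition sample below the result is (c,5), the pair carrying the largest value:
 * each partition is folded first, starting from the zeroValue ("d",0), and the partition winners
 * are then compared once more, again starting from the zeroValue.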
7 | * 通过op函数聚合各分区中的元素及合并各分区的元素,op函数需要两个参数,在开始时第一个传入的参数为zeroValue,T为RDD数据集的数据类型 8 | */ 9 | object Fold { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setMaster("local").setAppName("Fold") 13 | val sc = new SparkContext(conf) 14 | val rdd = sc.parallelize(Array(("a", 1), ("b", 2), ("a", 2), ("c", 5), ("a", 3)), 2) 15 | val foldRDD = rdd.fold(("d", 0))((val1, val2) => { 16 | if (val1._2 >= val2._2) val1 else val2 17 | }) 18 | println(foldRDD) 19 | sc.stop 20 | } 21 | 22 | /** 23 | * 1.开始时将(“d”,0)作为op函数的第一个参数传入,将Array中和第一个元素("a",1)作为op函数的第二个参数传入,并比较value的值, 24 | * 返回value值较大的元素 25 | 26 | * 2.将上一步返回的元素又作为op函数的第一个参数传入,Array的下一个元素作为op函数的第二个参数传入,比较大小 27 | 28 | * 3.重复第2步骤 29 | */ 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/Action/Func.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.Action 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/19/16. 7 | * reduce(func):通过函数func先聚集各分区的数据集,再聚集分区之间的数据,func接收两个参数,返回一个新值, 8 | * 新值再做为参数继续传递给函数func,直到最后一个元素 9 | */ 10 | object Func { 11 | 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setMaster("local").setAppName("reduce") 14 | val sc = new SparkContext(conf) 15 | val rdd = sc.parallelize(1 to 10,2) 16 | val reduceRDD = rdd.reduce(_ + _) 17 | val reduceRDD1 = rdd.reduce(_ - _) //如果分区数据为1结果为 -53 18 | val countRDD = rdd.count() 19 | val firstRDD = rdd.first() 20 | val takeRDD = rdd.take(5) 21 | val topRDD = rdd.top(3) 22 | val takeOrderedRDD = rdd.takeOrdered(3) 23 | println("func +: "+reduceRDD) 24 | println("func -: "+reduceRDD1) 25 | println("count: "+countRDD) 26 | println("first: "+firstRDD) 27 | println("take:") 28 | takeRDD.foreach(x => print(x +" ")) 29 | println("\ntop:") 30 | topRDD.foreach(x => print(x +" ")) 31 | println("\ntakeOrdered:") 32 | takeOrderedRDD.foreach(x => print(x +" ")) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/Action/KVFunc.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.Action 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | import scala.concurrent.Future 6 | 7 | /** 8 | * Created by hadoop on 4/19/16. 9 | */ 10 | object KVFunc { 11 | 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setMaster("local").setAppName("KVFunc") 14 | val sc = new SparkContext(conf) 15 | val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3)) 16 | val rdd = sc.parallelize(arr,2) 17 | val countByKeyRDD = rdd.countByKey() 18 | val collectAsMapRDD = rdd.collectAsMap() 19 | val lookupRDD = rdd.lookup("A") 20 | println("countByKey:") 21 | countByKeyRDD.foreach(print) 22 | println("\ncollectAsMap:") 23 | collectAsMapRDD.foreach(print) 24 | println("\nlookup:") 25 | lookupRDD.foreach(x => print(x)) 26 | sc.stop 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Cartesian.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by hadoop on 4/8/16. 
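 * For the sample RDDs below (3 and 4 elements) this yields all 3 x 4 = 12 pairs; the output grows
 * as |rdd1| * |rdd2|, so cartesian should be used with care on large inputs.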
7 | * 对两个RDD中的所有元素进行笛卡尔积操作 8 | */ 9 | object Cartesian { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setMaster("local").setAppName("map") 13 | val sc = new SparkContext(conf) 14 | 15 | val rdd1 = sc.parallelize(1 to 3) 16 | val rdd2 = sc.parallelize(2 to 5) 17 | val cartesianRDD = rdd1.cartesian(rdd2) 18 | 19 | cartesianRDD.foreach(x => println(x + " ")) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Coalesce.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by hadoop on 4/9/16. 7 | * 对RDD的分区进行重新分区,shuffle默认值为false,当shuffle=false时,不能增加分区数 8 | 目,但不会报错,只是分区个数还是原来的 9 | */ 10 | object Coalesce { 11 | 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setMaster("local").setAppName("map") 14 | val sc = new SparkContext(conf) 15 | val rdd = sc.parallelize(1 to 16,4) 16 | rdd.foreachPartition(iter => print(iter.toList+ " | ")) 17 | val coalesceRDD = rdd.coalesce(3) //当suffle的值为false时,不能增加分区数(如分区数不能从5->7) 18 | // val coalesceRDD = rdd.coalesce(5,true) 19 | println("重新分区后的分区个数:"+coalesceRDD.partitions.size) 20 | println("RDD依赖关系:"+coalesceRDD.toDebugString) 21 | coalesceRDD.foreachPartition(iter => print(iter.toList+ " | ")) 22 | sc.stop 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Distinct.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by hadoop on 4/8/16. 7 | * 对RDD中的元素进行去重 8 | */ 9 | object Distinct { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setMaster("local").setAppName("map") 13 | val sc = new SparkContext(conf) 14 | val list = List(1,1,2,5,2,9,6,1) 15 | val distinctRDD = sc.parallelize(list) 16 | val unionRDD = distinctRDD.distinct() //union intersection 17 | unionRDD.collect.foreach(x => print(x + " ")) 18 | sc.stop() 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/FlatMap.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by hadoop on 4/7/16. 7 | * 与map类似,但每个元素输入项都可以被映射到0个或多个的输出项,最终将结果”扁平化“后输出 8 | */ 9 | object FlatMap { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setMaster("local").setAppName("flatmap") 13 | val sc = new SparkContext(conf) 14 | val rdd = sc.parallelize(1 to 5) 15 | val fm = rdd.flatMap(x => (1 to x)) 16 | fm.foreach( x => print(x + " ")) 17 | sc.stop() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Glom.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by hadoop on 4/9/16. 
7 | * 将RDD的每个分区中的类型为T的元素转换换数组Array[T] 8 | */ 9 | object Glom { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setMaster("local").setAppName("map") 13 | val sc = new SparkContext(conf) 14 | val rdd = sc.parallelize(1 to 16,4) 15 | val glomRDD = rdd.glom() //RDD[Array[T]] 16 | glomRDD.foreach(rdd => println(rdd.getClass.getSimpleName)) 17 | sc.stop 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/MakeRDD.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by Mobin on 2017/7/28. 7 | */ 8 | object MakeRDD { 9 | def main(args: Array[String]) { 10 | val conf = new SparkConf().setMaster("local").setAppName("makeRDD") 11 | val sc = new SparkContext(conf) 12 | val collection = Seq((1 to 10, Seq("master","slave1")), 13 | (11 to 15, Seq("slave2","slave3"))) 14 | var rdd = sc.makeRDD(collection) 15 | println(rdd.partitions.size) 16 | println(rdd.preferredLocations(rdd.partitions(0))) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Map.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by hadoop on 4/7/16. 7 | * 数据集中的每个元素经过用户自定义的函数转换形成一个新的RDD,新的RDD叫MappedRDD 8 | */ 9 | object Map { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setMaster("local").setAppName("map") 13 | val sc = new SparkContext(conf) 14 | val rdd = sc.parallelize(1 to 10) //创建RDD 15 | val map = rdd.map(_*2) //对RDD中的每个元素都乘于2 16 | map.foreach(x => print(x+" ")) 17 | sc.stop() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/MapPartitions.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by hadoop on 4/7/16. 
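 * Because the supplied function runs once per partition rather than once per element, per-partition
 * setup such as opening a database connection is paid only once per partition. For the sample data
 * split into two partitions the program prints kpop and lucy.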
7 | * mapPartitions:类似与map,map作用于每个分区的每个元素,但mapPartitions作用于每个分区 8 | * 9 | * mapPartitionsWithIndex:将[mapPartitionsWithIndex]注释部分去掉即是 10 | * 与mapPartitions类似,不同的时函数多了个分区索引的参数 11 | */ 12 | object MapPartitions { 13 | // 输出有女性的名字: 14 | def partitionsFun(/*[mapPartitionsWithIndex] index : Int,*/iter : Iterator[(String,String)]) : Iterator[String] = { 15 | var woman = List[String]() 16 | while (iter.hasNext){ 17 | val next = iter.next() 18 | next match { 19 | case (_,"female") => woman = /*[mapPartitionsWithIndex]"["+index+"]"+*/next._1 :: woman 20 | //case (_,"female") => woman = next._1.toList .:: (woman) 错误写法 21 | case _ => 22 | } 23 | } 24 | return woman.iterator 25 | } 26 | 27 | 28 | def main(args: Array[String]) { 29 | val conf = new SparkConf().setMaster("local").setAppName("flatmap") 30 | val sc = new SparkContext(conf) 31 | val l = List(("kpop","female"),("zorro","male"),("mobin","male"),("lucy","female")) 32 | val rdd = sc.parallelize(l,2) 33 | // val mp = rdd.mapPartitions(x => x.filter(_._2 == "female")).map(x => x._1) 34 | val mp = rdd.mapPartitions(partitionsFun) 35 | //[mapPartitionsWithIndex] val mp = rdd.mapPartitionsWithIndex(partitionsFun) 36 | mp.collect.foreach(x => (print(x +" "))) //将分区中的元素转换成Aarray再输出 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/MapPartitionsWithIndex.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by Mobin on 2017/7/29. 7 | */ 8 | object MapPartitionsWithIndex { 9 | 10 | def mappartitionWithIndexFun(x : Int, iter :Iterator[Int])={ 11 | var result = List[String]() 12 | var i = 0 13 | while (iter.hasNext) { 14 | i += iter.next() 15 | } 16 | result.::(x + "|" + i).iterator 17 | } 18 | 19 | def main(args: Array[String]) { 20 | val conf = new SparkConf().setMaster("local").setAppName("mappartitionsWithIndex") 21 | val sc = new SparkContext(conf) 22 | val rdd1 = sc.makeRDD(1 to 5,2) 23 | val rdd2 = rdd1.mapPartitionsWithIndex{ 24 | (x, iter) => { 25 | var result = List[String]() 26 | var i = 0 27 | while (iter.hasNext){ 28 | i += iter.next() 29 | } 30 | result.::(x + "|" + i).iterator 31 | } 32 | } 33 | val rdd3 = rdd1.mapPartitionsWithIndex(mappartitionWithIndexFun) 34 | rdd3.foreach(println) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/RandomSplit.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by hadoop on 4/9/16. 
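 * The weights are normalised to sum to 1, so Array(1.0, 2.0, 7.0) means roughly 10%/20%/70% of the
 * elements; the exact counts vary between runs unless a seed is passed as the second argument.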
7 | * 根据weight权重值将一个RDD划分成多个RDD,权重越高划分得到的元素较多的几率就越大 8 | */ 9 | object RandomSplit { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setMaster("local").setAppName("map") 13 | val sc = new SparkContext(conf) 14 | val rdd = sc.parallelize(1 to 10) 15 | val randomSplitRDD = rdd.randomSplit(Array(1.0,2.0,7.0)) 16 | randomSplitRDD(0).foreach(x => print(x +" gg")) 17 | randomSplitRDD(1).foreach(x => print(x +" rr")) 18 | randomSplitRDD(2).foreach(x => print(x +" tt")) 19 | sc.stop 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Sample.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by hadoop on 4/7/16. 7 | */ 8 | object Sample { 9 | 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("map") 12 | val sc = new SparkContext(conf) 13 | val rdd = sc.parallelize(1 to 10) 14 | val sample1 = rdd.sample(true,0.5,0) 15 | sample1.collect.foreach(x => print(x + " ")) 16 | sc.stop 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Union.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by hadoop on 4/8/16. 7 | * :将两个RDD中的数据集进行合并,最终返回两个RDD的并集,若RDD中存在相同的元素也不会去重 8 | */ 9 | object Union { 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("map") 12 | val sc = new SparkContext(conf) 13 | val rdd1 = sc.parallelize(1 to 4) 14 | val rdd2 = sc.parallelize(3 to 5) 15 | val unionRDD = rdd1.intersection(rdd2) //union intersection 16 | unionRDD.collect.foreach(x => print(x + " ")) 17 | sc.stop() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/ZipWithIndex.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by Mobin on 2017/7/29. 7 | */ 8 | object ZipWithIndex { 9 | def main(args: Array[String]) { 10 | val conf = new SparkConf().setMaster("local").setAppName("ZipWithIndex") 11 | val sc = new SparkContext(conf) 12 | val rdd1 = sc.makeRDD(Seq("A","B","C","D","E","F"),2) 13 | rdd1.zipWithIndex().foreach(println) 14 | rdd1.zipWithUniqueId().foreach(println) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/AggregateAndFold.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by Mobin on 2017/7/30. 
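 * aggregate and fold both start every partition from the zeroValue and apply it once more when the
 * partition results are merged. With 1 to 10 split into two partitions and zeroValue = 1, both
 * println statements therefore print 58: the sum 55 plus 1 for each partition plus 1 for the final
 * merge.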
7 | */ 8 | object AggregateAndFold { 9 | def main(args: Array[String]) { 10 | val conf = new SparkConf().setMaster("local").setAppName("AggregateFold") 11 | val sc = new SparkContext(conf) 12 | val rdd1 = sc.makeRDD(1 to 10, 2) 13 | val rs = rdd1.aggregate(1)( 14 | (x,y) => x + y, 15 | (a,b) => a+ b 16 | ) 17 | val rs1 = rdd1.fold(1)((x,y) => x+ y) 18 | println(rs) 19 | println(rs1) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/Cogroup.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/12/16. 7 | */ 8 | object Cogroup { 9 | 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("ReduceByKey") 12 | val sc = new SparkContext(conf) 13 | val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3)) 14 | val arr1 = List(("A", "A1"), ("B", "B1"), ("A", "A2"), ("B", "B2")) 15 | val rdd = sc.parallelize(arr, 3) 16 | val rdd1 = sc.parallelize(arr1, 3) 17 | val groupByKeyRDD = rdd.cogroup(rdd1) 18 | groupByKeyRDD.foreach(println) 19 | println(groupByKeyRDD.toDebugString) 20 | sc.stop 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/CombineByKey.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{HashPartitioner, SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/11/16. 7 | * 统计男性和女生的个数,并以(性别,(名字,名字....),个数)的形式输出 8 | */ 9 | object CombineByKey { 10 | 11 | def main(args: Array[String]) { 12 | /* 13 | def createCombine = (x: String) => (x, 1) 14 | def mergerValue = (peo: (String, Int), x: String) => (peo._1+","+x, peo._2 + 1) 15 | def mergeCombine = (sex1: (String, Int), sex2: (String, Int)) => (sex1._1 +","+ sex2._1, sex1._2 + sex2._2)*/ 16 | val conf = new SparkConf().setMaster("local").setAppName("combinByKey") 17 | val sc = new SparkContext(conf) 18 | val people = List(("male", "Mobin"), ("male", "Kpop"), ("female", "Lucy"), ("male", "Lufei"), ("female", "Amy")) 19 | val rdd = sc.parallelize(people) 20 | val combinByKeyRDD = rdd.combineByKey( 21 | (x: String) => (List(x), 1), 22 | (peo: (List[String], Int), x: String) => (x :: peo._1, peo._2 + 1), 23 | (sex1: (List[String], Int), sex2: (List[String], Int)) => (sex1._1 ::: sex2._1, sex1._2 + sex2._2)) 24 | 25 | combinByKeyRDD.foreach(println) 26 | println(combinByKeyRDD.toDebugString) 27 | 28 | /** 29 | * (1) ShuffledRDD[1] at combineByKey at CombineByKey.scala:20 [] 30 | * +-(1) ParallelCollectionRDD[0] at parallelize at CombineByKey.scala:19 [] 31 | */ 32 | sc.stop() 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/CombineByKey1.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/11/16. 
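 * Builds a (sum, count) pair per key, e.g. key "A" (values 3, 9, 12, 0, 5) becomes (29,5), from
 * which a per-key average is one mapValues away:
 *
 *   combineByKeyRDD.mapValues { case (sum, cnt) => sum.toDouble / cnt }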
7 | */ 8 | object CombineByKey1 { 9 | 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("combinByKey") 12 | val sc = new SparkContext(conf) 13 | val rdd = sc.parallelize(List( 14 | ("A", 3), ("A", 9), ("A", 12), ("A", 0), ("A", 5), ("B", 4), 15 | ("B", 10), ("B", 11), ("B", 20), ("B", 25), ("C", 32), ("C", 91), 16 | ("C", 122), ("C", 3), ("C", 55)), 2) 17 | 18 | val combineByKeyRDD = rdd.combineByKey( 19 | (x: Int) => (x, 1), 20 | (acc: (Int, Int), x) => (acc._1 + x, acc._2 + 1), 21 | (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2)) 22 | 23 | combineByKeyRDD.foreach(println) 24 | sc.stop() 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/FlatMapValus.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/10/16. 7 | * 对[K,V]型数据中的V值flatmap操作 8 | */ 9 | object FlatMapValus { 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("map") 12 | val sc = new SparkContext(conf) 13 | val list = List(("mobin",22),("kpop",20),("lufei",23)) 14 | val rdd = sc.parallelize(list) 15 | val mapValuesRDD = rdd.flatMapValues(x => Seq(x,"male")) 16 | mapValuesRDD.foreach(println) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/FoldByKey.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/11/16. 7 | */ 8 | object FoldByKey { 9 | 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("FoldByKey") 12 | val sc = new SparkContext(conf) 13 | val people = List(("Mobin", 2), ("Mobin", 1), ("Lucy", 2), ("Amy", 1), ("Lucy", 3)) 14 | val rdd = sc.parallelize(people) 15 | val foldByKeyRDD = rdd.foldByKey(2)(_ + _) 16 | foldByKeyRDD.foreach(println) 17 | sc.stop 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/GroupByKey.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/12/16. 7 | */ 8 | object GroupByKey { 9 | 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("ReduceByKey") 12 | val sc = new SparkContext(conf) 13 | val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3)) 14 | val rdd = sc.parallelize(arr) 15 | val groupByKeyRDD = rdd.groupByKey() 16 | groupByKeyRDD.foreach(println) 17 | sc.stop 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/Join.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/12/16. 
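 * fullOuterJoin wraps both sides in Option, so the sample lists yield (A,(Some(1),Some(A1))) plus
 * two rows for B, one per matching element of arr1. The commented leftOuterJoin / rightOuterJoin
 * variants wrap only the side that may have no match.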
7 | */ 8 | object Join { 9 | 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("ReduceByKey") 12 | val sc = new SparkContext(conf) 13 | val arr = List(("A", 1), ("B", 2)) 14 | val arr1 = List(("A", "A1"), ("B", "B1"),("B", "B1")) 15 | 16 | /*val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3),("C",1)) 17 | val arr1 = List(("A", "A1"), ("B", "B1"), ("A", "A2"), ("B", "B2")) 18 | leftOuterJoin 19 | */ 20 | 21 | /*val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3)) 22 | val arr1 = List(("A", "A1"), ("B", "B1"), ("A", "A2"), ("B", "B2"),("C","C1")) 23 | rightOuterJoin*/ 24 | val rdd = sc.parallelize(arr, 3) 25 | val rdd1 = sc.parallelize(arr1, 3) 26 | val rightOutJoinRDD = rdd.fullOuterJoin(rdd1) 27 | rightOutJoinRDD.foreach(println) 28 | println(rightOutJoinRDD.toDebugString) 29 | sc.stop 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/MapJoinJava.java: -------------------------------------------------------------------------------- 1 | //package com.mobin.SparkRDDFun.TransFormation.KVRDD; 2 | // 3 | //import org.apache.spark.SparkConf; 4 | //import org.apache.spark.SparkContext; 5 | //import org.apache.spark.api.java.JavaRDD; 6 | //import org.apache.spark.api.java.JavaSparkContext; 7 | //import org.apache.spark.api.java.function.FlatMapFunction; 8 | //import org.apache.spark.api.java.function.Function; 9 | //import org.apache.spark.api.java.function.PairFunction; 10 | //import org.apache.spark.broadcast.Broadcast; 11 | //import scala.Tuple2; 12 | // 13 | //import java.util.ArrayList; 14 | //import java.util.Iterator; 15 | //import java.util.List; 16 | //import java.util.Map; 17 | // 18 | ///** 19 | // * Created by Mobin on 2016/11/14. 
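// * Map-side (broadcast) join: the small table is collected to the driver as a Map and the large
// * table is joined against it inside mapPartitions, so no shuffle is needed. As written here (and
// * in MapSideJoin.scala below) the sc.broadcast lines are commented out, so the collected map is
// * captured in the closure and serialised with every task; re-enabling them, e.g.
// *
// *   val broadcastMap = sc.broadcast(paisr)   // read broadcastMap.value inside mapPartitions
// *
// * ships the lookup table to each executor only once.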
20 | // */ 21 | //public class MapJoinJava { 22 | // public static void main(String[] args) { 23 | // SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("JavaMapSide"); 24 | // JavaSparkContext sc = new JavaSparkContext(conf); 25 | // JavaRDD table = sc.textFile("mapjoin.txt"); 26 | // JavaRDD table1 = sc.textFile("mapjoin1.txt"); 27 | // 28 | // final Map pairs = table.mapToPair(new PairFunction() { 29 | // public Tuple2 call(String s) throws Exception { 30 | // int pos = s.indexOf(","); 31 | // return new Tuple2(s.substring(0,pos), s.substring(pos + 1)); 32 | // } 33 | // }).collectAsMap(); 34 | // 35 | // // final Broadcast> broadcast = sc.broadcast(pairs); 36 | // 37 | // table1.mapToPair(new PairFunction() { 38 | // public Tuple2 call(String s) throws Exception { 39 | // int pos = s.indexOf(","); 40 | // return new Tuple2(s.substring(0,pos), s.substring(pos + 1)); 41 | // } 42 | // }).mapPartitions(new FlatMapFunction>, Tuple2>>() { 43 | // public Iterable>> call(Iterator> tuple2Iterator) throws Exception { 44 | // List>> list = null; 45 | // List l = new ArrayList(); 46 | // while (tuple2Iterator.hasNext()){ 47 | // Tuple2 map = tuple2Iterator.next(); 48 | // if (pairs.containsKey(map._1)){ 49 | // if(list == null) 50 | // list = new ArrayList(); 51 | // 52 | // l.add(pairs.get(map._1)); 53 | // l.add(map._2); 54 | // list.add(new Tuple2>(map._1,l)); 55 | // } 56 | // } 57 | // return list; 58 | // } 59 | // }).saveAsTextFile("javaMapJoin"); 60 | // } 61 | // 62 | // 63 | //} 64 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/MapSideJoin.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by Mobin on 2016/11/14. 7 | */ 8 | object MapSideJoin { 9 | 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local[2]").setAppName("Mobin") 12 | val sc = new SparkContext(conf) 13 | val table = sc.textFile("mapjoin.txt") 14 | val table1 = sc.textFile("mapjoin1.txt") 15 | val paisr = table.map{ x => 16 | var pos = x.indexOf(",") 17 | (x.substring(0,pos),x.substring(pos+1)) 18 | }.collectAsMap() 19 | 20 | // var broadcastMap = sc.broadcast(paisr) 21 | 22 | val result = table1.map{ x => 23 | var pos = x.indexOf(",") 24 | (x.substring(0,pos),x.substring(pos + 1)) 25 | }.mapPartitions({ iter => 26 | // var m = broadcastMap.value 27 | for { 28 | (key, value) <- iter 29 | if paisr.contains(key) 30 | }yield(key,(value , paisr.get(key).getOrElse(""))) 31 | }) 32 | 33 | result.saveAsTextFile("result") 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/MapValues.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{HashPartitioner, SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/10/16. 
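 * Compare FlatMapValus above: mapValues keeps one output pair per input, e.g. (mobin,List(22, male)),
 * whereas flatMapValues expands the Seq into separate pairs, e.g. (mobin,22) and (mobin,male).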
7 | * 对[K,V]型数据中的V值map操作 8 | */ 9 | object MapValues { 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("map") 12 | val sc = new SparkContext(conf) 13 | val list = List(("mobin",22),("kpop",20),("lufei",23)) 14 | val rdd = sc.parallelize(list) 15 | val mapValuesRDD = rdd.mapValues(x => Seq(x,"male")) 16 | mapValuesRDD.foreach(println) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/PartitionBy.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{Partitioner, HashPartitioner, SparkContext, SparkConf} 4 | import scala.collection.mutable.{Map} 5 | 6 | /** 7 | * Created by hadoop on 4/10/16. 8 | */ 9 | object PartitionBy { 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("partitonby") 12 | val sc = new SparkContext(conf) 13 | val rdd1 = sc.makeRDD(Array((10,"A"), (20, "B"), (30,"C"), (40,"D")), 2) 14 | rdd1.mapPartitionsWithIndex{ 15 | (partitionID, iter) => { 16 | var partiton_map = Map[String, List[(Int, String)]]() 17 | while(iter.hasNext){ 18 | val partition_name = "part_" + partitionID 19 | var elem = iter.next() 20 | if (partiton_map.contains(partition_name)){ 21 | var elems = partiton_map(partition_name) 22 | elem :: elems 23 | }else { 24 | partiton_map(partition_name) = List[(Int, String)]{elem} 25 | } 26 | } 27 | partiton_map.iterator 28 | } 29 | } 30 | rdd1.foreach(println) 31 | val rdd2 = rdd1.partitionBy(new HashPartitioner(2)) 32 | var rdd3 = rdd1.groupByKey(new Partitioner() { 33 | override def numPartitions: Int = 10 34 | 35 | override def getPartition(key: Any): Int = { 36 | 37 | val id = key.asInstanceOf[Int] 38 | println(id) 39 | if (id % 2 ==0) { 40 | id / 4 41 | }else{ 42 | id % 4 43 | } 44 | } 45 | }) 46 | rdd3.foreach(println) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/ReduceByKey.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/12/16. 7 | */ 8 | object ReduceByKey { 9 | 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("ReduceByKey") 12 | val sc = new SparkContext(conf) 13 | val arr = List(("A",1),("B",2),("A",2),("B",3)) 14 | val rdd = sc.parallelize(arr) 15 | val reduceByKeyRDD = rdd.reduceByKey(_ +_) 16 | reduceByKeyRDD.foreach(println) 17 | sc.stop 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/SortByKey.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Created by hadoop on 4/12/16. 
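 * sortByKey orders the pairs by key, ascending by default (pass false for descending). On the same
 * sample list, reduceByKey(_ + _) above collapses the pairs to (A,3) and (B,5).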
7 | */ 8 | object SortByKey { 9 | 10 | def main(args: Array[String]) { 11 | val conf = new SparkConf().setMaster("local").setAppName("ReduceByKey") 12 | val sc = new SparkContext(conf) 13 | val arr = List(("A",1),("B",2),("A",2),("B",3)) 14 | val rdd = sc.parallelize(arr) 15 | val sortByKeyRDD = rdd.sortByKey() 16 | sortByKeyRDD.foreach(println) 17 | sc.stop 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkSQL/PeopleDemo.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkSQL 2 | 3 | import org.apache.spark.sql.SQLContext 4 | import org.apache.spark.{SparkContext, SparkConf} 5 | 6 | /** 7 | * Created by Mobin on 2016/11/28. 8 | */ 9 | object PeopleDemo { 10 | def main(args: Array[String]) { 11 | val conf =new SparkConf().setAppName("people").setMaster("local") 12 | val sc = new SparkContext(conf) 13 | val sqlContext = new SQLContext(sc) 14 | val df = sqlContext.jsonFile("people.json") 15 | df.show() 16 | df.printSchema() 17 | printf("select name------") 18 | df.select("name").show() 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkSQL/RowNumber.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkSQL 2 | 3 | import org.apache.spark.sql.{DataFrame, SQLContext} 4 | import org.apache.spark.{SparkContext, SparkConf} 5 | 6 | /** 7 | * Created by Mobin on 2016/12/1. 8 | */ 9 | object RowNumber { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setAppName("rownum").setMaster("local") 13 | val sc = new SparkContext(conf) 14 | val sqlContext = new SQLContext(sc) 15 | import sqlContext.implicits._ 16 | // val dpiDF = sc.textFile("F:\\AGG_EVT_LTE_DPI_NEW.txt").map(x => x.split("\\|")). 17 | // filter(x => x.length >= 30 && x(14).toDouble > 0 && x(15).toDouble > 0 && x(3) != "" && x(18) != ""). 
18 | // map(x => DPI(x(3),x(18),x(14).toDouble,x(15).toDouble, x(14).toDouble+x(15).toDouble)).toDF() 19 | // 20 | // dpiDF.registerTempTable("dpi") 21 | // //dpiDF.groupBy("MDN").agg("size_ul" -> "sum","size_dl" -> "sum") 22 | // // dpiDF.select("APP").groupBy("APP").count().select("count").show() 23 | // // sqlContext.sql("SELECT MDN,APP,size_ul,size_dl,sum(s) FROM dpi").show() 24 | // dpiDF.printSchema() 25 | // // val sDF = dpiDF.groupBy("MDN","APP").agg("s" -> "sum").registerTempTable("tmp")//每个用户对应的的APP的流量q 26 | // sqlContext.sql("SELECT MDN,s,COUNT(1) FROM dpi GROUP BY s").show() 27 | // // sDF.groupBy("MDN").agg("SUM(s)" -> "sum") 28 | // println("count.....") 29 | // val acc = sc.accumulator(0, "ac") 30 | // sc.textFile("/DATA/PUBLIC/NOCE/SGC/SGC_LTE_CDR_DAY/day=20161125/00*").foreach( 31 | // line => if(line.length > 0) acc += 1 32 | // ) 33 | //println("line:" + acc.value) 34 | 35 | // dpiDF.show() 36 | // val mr = sc.textFile("E:\\DATA\\PUBLIC\\NOCE\\ETL\\ETL_4G_MRO_ERS\\20161020\\2016102011\\e_p_3_1.txt") 37 | // .map(x => (x.split("\\|")(3),2.10)) 38 | // .filter(x => x!="").distinct().toDF() 39 | // val chr = sc.textFile("E:\\DATA\\PUBLIC\\NOCE\\AGG\\AGG_MRO_CHR_RELATE\\day=20161020\\hour=2016102011\\vendor=ERS\\10\\agg_data_172_17_1_2_ad7fc9ad_3930_4da8_97cc_a2a476f2333f.txt") 40 | // .map(x => (x.split("\\|")(1),1.8)).filter(x => x !="").distinct().toDF() 41 | // 42 | // val rs = mr.unionAll(chr).count() 43 | // println(rs) 44 | // sc.stop() 45 | 46 | sc.textFile("F:\\m_p_50_3.txt.lzo").map(x => x.split(",")(0)).foreach( 47 | println(_) 48 | ) 49 | 50 | sc.stop() 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/SparkSQL/SGC_LET_SHOOL_HOUR.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.SparkSQL 2 | 3 | import org.apache.spark.sql.{DataFrame, SQLContext} 4 | import org.apache.spark.{SparkContext, SparkConf} 5 | 6 | /** 7 | * Created by Mobin on 2016/11/28. 
8 | */ 9 | object SGC_LET_SHOOL_HOUR { 10 | 11 | case class School(school_name: String, school_id: String, enodeb: Int) 12 | case class Mr(enodebID: Int, MSISDN: String) 13 | 14 | def main(args: Array[String]) { 15 | val conf = new SparkConf().setAppName("SGC_LET_SCHOOL_HOUR").setMaster("local") 16 | val sc = new SparkContext(conf) 17 | val sqlContext = new SQLContext(sc) 18 | import sqlContext.implicits._ 19 | val school: DataFrame = sc.textFile("E:\\DATA\\PUBLIC\\NOCE\\school.csv").map(x => x.split("\\|")).map(s => School(s(1), s(3),Integer.parseInt(s(5)))).toDF() 20 | val mr = sc.textFile("F:\\2.10.txt").map(s => s.split("\\|")).map(mr => Mr(Integer.parseInt(mr(1)),mr(11))).toDF() 21 | school.registerTempTable("school") 22 | mr.registerTempTable("mr") 23 | school.select("school_name") 24 | val joinDF = school.join(mr,$"enodeb" === $"enodebID").select("school_id","school_name","MSISDN").distinct 25 | val countDF = joinDF.select("school_id","school_name").groupBy("school_id","school_name") 26 | countDF.count().rdd.saveAsTextFile("F:\\SCHOOL.txt") 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/Telecom/AirPlaneMode.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.Telecom 2 | 3 | import java.text.SimpleDateFormat 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.hadoop.fs._ 7 | import org.apache.spark.{SparkContext, SparkConf} 8 | 9 | /** 10 | * Created by Mobin on 2017/7/15. 11 | * 飞行模式计算 12 | */ 13 | object AirPlaneMode { 14 | private val MSISDN = 6 15 | private val IDENTIFICATION = 8 16 | private val STARTTIME = 0 17 | private val ENB = 12 18 | private val MAX_NUM = 13 19 | private val chrSourcePath = "/DATA/PUBLIC/NOCE/ETL/ETL_CHR_L_MM/" 20 | 21 | def airPlainModeMain(fs: FileSystem, sc: SparkContext, day: String, dateTime :String, isPersist: Boolean): Unit ={ 22 | val source = chrSourcePath + day + "/" + "*/*" 23 | print(source) 24 | val paresRdd = sc.textFile(source).mapPartitions(iterFunc) 25 | val splicRdd = paresRdd.reduceByKey(reduceByKeyFun) 26 | val statisticRdd = splicRdd.mapPartitions(statisticFun) 27 | statisticRdd.count() 28 | } 29 | 30 | def statisticFun(iter: Iterator[(String, (String,Int))]) = { 31 | var list = List[(String, String)]() 32 | while(iter.hasNext) { 33 | var sum = 0 34 | val data = iter.next() 35 | val str = data._2._1.split("\\|") 36 | for (m <- str) { 37 | val str1 = m.split(",") //分隔出<时间,编号> 38 | if (str1.length == 2 && "0".equals(str1(1))) { 39 | for (n <- str) { 40 | val str2 = n.split(",") //分隔出<时间, 编号> 41 | if (str2.length == 2 && ("1".equals(str2(1)) || "2".equals(str2(1)))) { 42 | val t1 =paresTime(str2(0)) 43 | val t2 = paresTime(str1(0)) 44 | if (!t1.isEmpty && !t2.isEmpty && t1.get - t2.get < 2000 && t1.get - t2.get > 0){ 45 | sum = sum +1 46 | } 47 | } 48 | } 49 | } 50 | } 51 | println(data._1, sum + "," + data._2._2) 52 | list = (data._1, sum + "," + data._2._2) :: list 53 | } 54 | list.iterator 55 | } 56 | 57 | def paresTime(time: String) : Option[Long] = { 58 | val timeFormat = "yyyy-MM-dd HH:mm:ss SSS" 59 | val month = time.substring(5, 8) 60 | var t = time 61 | if (!month.contains("-")){ 62 | month match { 63 | case "Jan" => t = t.replace(month, "01") 64 | case "Feb" => t = t.replace(month, "02") 65 | case "Mar" => t = t.replace(month, "03") 66 | case "Apr" => t = t.replace(month, "04") 67 | case "May" => t = t.replace(month, "05") 68 | case "Jun" => t = t.replace(month, "06") 
69 | case "Jul" => t = t.replace(month, "07") 70 | case "Aug" => t = t.replace(month, "08") 71 | case "Sep" => t = t.replace(month, "09") 72 | case "Oct" => t = t.replace(month, "10") 73 | case "Nov" => t = t.replace(month, "11") 74 | case "Dec" => t = t.replace(month, "12") 75 | case _ => None 76 | } 77 | } 78 | var startTime: Option[Long] = None 79 | try { 80 | startTime = Some(new SimpleDateFormat(timeFormat).parse(time).getTime) 81 | return startTime 82 | }catch { 83 | case e: Exception => None 84 | } 85 | } 86 | 87 | def reduceByKeyFun(x1: (String, Int), x2: (String, Int)): (String,Int) = { 88 | val sum = x1._2 + x2._2 89 | println(x1._1 + "ppp") 90 | if (",".equals(x1._1)){ 91 | if (!",".equals(x2._1)){ 92 | (x2._1, sum) 93 | } else { 94 | ("", sum) 95 | } 96 | } else { 97 | if (!",".equals(x2._1)) { 98 | (x1._1 + "|" + x2._1, sum) 99 | } else { 100 | (x1._1, sum) 101 | } 102 | } 103 | } 104 | 105 | 106 | def iterFunc(iter: Iterator[String]) = { 107 | var list = List[(String, (String, Int))]() 108 | while (iter.hasNext ) { 109 | val str = iter.next().split(",") 110 | val enb:String = str(ENB) 111 | val mdn = str(MSISDN) 112 | val time = str(STARTTIME) 113 | val airplane = str(IDENTIFICATION) 114 | var tp = "" 115 | airplane match { 116 | case _ if "0x05".equals(airplane) => tp = time + ",0" 117 | case _ if "0x00".equals(airplane) || "0x18".equals(airplane) => tp = time + ",1" 118 | case _ => "" + "," 119 | } 120 | val enb_mdn = String.format("%s,%s", mdn, String.valueOf(Integer.parseInt(enb.substring(3), 16))) 121 | list = (enb_mdn, (tp, 1))::list 122 | } 123 | list.iterator 124 | } 125 | 126 | def main(args: Array[String]) { 127 | val conf = new SparkConf().setAppName("airPlainMode").setMaster("local") 128 | val sc = new SparkContext(conf) 129 | val configuration = new Configuration() 130 | val fs = FileSystem.newInstance(configuration) 131 | airPlainModeMain(fs, sc, "20170322", "", false) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/FileStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | 6 | /** 7 | * Created by hadoop on 3/29/16. 
8 | */
9 | object FileStreaming {
10 | def main(args: Array[String]) {
11 | val conf = new SparkConf().setMaster("local").setAppName("FileStreaming")
12 | val sc = new StreamingContext(conf,Seconds(5))
13 | val lines = sc.textFileStream("/home/hadoop/word")
14 | val words = lines.flatMap(_.split(" "))
15 | val wordCounts = words.map(x => (x , 1)).reduceByKey(_ + _)
16 | wordCounts.print() // a streaming job needs at least one output operation, otherwise start() fails with "No output operations registered"
17 | sc.start()
18 | sc.awaitTermination()
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/sparkStreaming/Flume/SampleLogGenerator.java:
--------------------------------------------------------------------------------
1 | package com.mobin.sparkStreaming.Flume;
2 |
3 | import java.io.*;
4 |
5 | /**
6 | * Created with IDEA
7 | * Creator: MOBIN
8 | * Date: 2018/8/2
9 | * Time: 4:39 PM
10 | */
11 | public class SampleLogGenerator {
12 | public static void main(String[] args) throws IOException, InterruptedException {
13 | String location = "/Users/mobin/Downloads/access_log/access1_log";
14 | File f = new File(location);
15 | FileOutputStream writer = new FileOutputStream(f);
16 | File read = new File("/Users/mobin/Downloads/access_log/access_log");
17 | BufferedReader reader = new BufferedReader(new FileReader(read));
18 | for(;;){
19 | System.out.println("....");
20 | String line = reader.readLine();
21 | if (line == null) break; // stop once the source log is exhausted instead of writing "null" lines forever
22 | writer.write((line + "\n").getBytes());
23 | writer.flush();
24 | Thread.sleep(500);
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/sparkStreaming/Flume/ScalaLoadDistributedEvents.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.sparkStreaming.Flume
2 |
3 | import java.io.ObjectOutputStream
4 | import java.net.InetSocketAddress
5 |
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.rdd.RDD
8 | import org.apache.spark.storage.StorageLevel
9 | import org.apache.spark.streaming.dstream.DStream
10 | import org.apache.spark.streaming.flume.{FlumeUtils, SparkFlumeEvent}
11 | import org.apache.spark.streaming.{Seconds, StreamingContext}
12 |
13 | /**
14 | * Created with IDEA
15 | * Creator: MOBIN
16 | * Date: 2018/8/2
17 | * Time: 3:22 PM
18 | */
19 | object ScalaLoadDistributedEvents {
20 |
21 | def main(args: Array[String]): Unit = {
22 | println("Creating Spark Configuration")
23 | val conf = new SparkConf().setMaster("local[2]").setAppName("streaming data loading App")
24 | println("Retrieving Streaming Context from Spark Conf")
25 | val streamContext = new StreamingContext(conf, Seconds(2))
26 |
27 | // Create an InetSocketAddress array holding the host and port of every Flume sink machine
28 | var address = new Array[InetSocketAddress](1)
29 | address(0) = new InetSocketAddress("localhost",9998)
30 |
31 | // Create a Flume polling stream that pulls events from the sink every 2s
32 | //1. maxBatchSize:单个RPC中从Spark Sink中拉取事件的最大数目
33 | //2. 
这个Stream发送给Sink的并发请求数目T 34 | val flumeStream = FlumeUtils.createPollingStream(streamContext, address, StorageLevel.MEMORY_AND_DISK_SER_2,1000, 1) 35 | 36 | val outputStream = new ObjectOutputStream(Console.out) 37 | printValues(flumeStream, streamContext, outputStream) 38 | streamContext.start() 39 | streamContext.awaitTermination() 40 | } 41 | 42 | 43 | def printValues(stream: DStream[SparkFlumeEvent], context: StreamingContext, outputStream: ObjectOutputStream): Unit ={ 44 | stream.foreachRDD(foreachFunc) 45 | def foreachFunc = (rdd: RDD[SparkFlumeEvent]) => { 46 | val array = rdd.collect() 47 | println("Start Printing Results") 48 | println("Total size of Events = " + array.size) 49 | for (flumeEvent <- array){ 50 | //从SparkFlumeEvent得到AvorFlumeEvent 51 | val payLoad = flumeEvent.event.getBody 52 | println(new String(payLoad.array())) 53 | } 54 | println("finish......") 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/Flume/ScalaLogAnalyzerJson.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming.Flume 2 | 3 | import java.util.regex.{Matcher, Pattern} 4 | 5 | import com.alibaba.fastjson.JSON 6 | 7 | 8 | 9 | 10 | 11 | 12 | /** 13 | * Created with IDEA 14 | * Creater: MOBIN 15 | * Date: 2018/8/4 16 | * Time: 2:36 PM 17 | */ 18 | class ScalaLogAnalyzerJson extends Serializable { 19 | 20 | def tansformLogDataIntoJson(logLine: String): String = { 21 | val LOG_ENTRY_PATTERN = """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\S+)""" 22 | val PATTERN = Pattern.compile(LOG_ENTRY_PATTERN) 23 | val matcher = PATTERN.matcher(logLine) 24 | 25 | if (!matcher.find()){ 26 | println("Cannot parse logline" + logLine) 27 | } 28 | import scala.collection.JavaConversions._ 29 | val json = scala.util.parsing.json.JSONObject(createDataMap(matcher)).toString() 30 | println(json) 31 | return json 32 | } 33 | 34 | def createDataMap(matcher: Matcher): Map[String, String] = { 35 | Map[String, String]( 36 | ("IP" -> matcher.group(1)), 37 | ("client" -> matcher.group(2)), 38 | ("user" -> matcher.group(3)), 39 | ("date" -> matcher.group(4)), 40 | ("method" -> matcher.group(5)), 41 | ("request" -> matcher.group(6)), 42 | ("protocol" -> matcher.group(7)), 43 | ("respCode" -> matcher.group(8)), 44 | ("size" -> matcher.group(9)) 45 | ) 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/Flume/ScalaLogAnalyzerMap.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming.Flume 2 | 3 | import java.util.regex.{Matcher, Pattern} 4 | 5 | 6 | 7 | /** 8 | * Created with IDEA 9 | * Creater: MOBIN 10 | * Date: 2018/8/4 11 | * Time: 2:36 PM 12 | */ 13 | class ScalaLogAnalyzerMap extends Serializable { 14 | 15 | def tansformLogData(logLine: String): Map[String, String] = { 16 | val LOG_ENTRY_PATTERN = """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\S+)""" 17 | val PATTERN = Pattern.compile(LOG_ENTRY_PATTERN) 18 | val matcher = PATTERN.matcher(logLine) 19 | 20 | if (!matcher.find()){ 21 | println("Cannot parse logline" + logLine) 22 | } 23 | createDataMap(matcher) 24 | } 25 | 26 | def createDataMap(matcher: Matcher): Map[String, String] = { 27 | Map[String, String]( 28 | ("IP" -> matcher.group(1)), 29 | ("client" -> matcher.group(2)), 30 | ("user" -> 
matcher.group(3)), 31 | ("date" -> matcher.group(4)), 32 | ("method" -> matcher.group(5)), 33 | ("request" -> matcher.group(6)), 34 | ("protocol" -> matcher.group(7)), 35 | ("respCode" -> matcher.group(8)), 36 | ("size" -> matcher.group(9)) 37 | ) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/Flume/ScalaQueryingStreams.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming.Flume 2 | 3 | import java.net.InetSocketAddress 4 | 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.SQLContext 8 | import org.apache.spark.storage.StorageLevel 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.streaming.flume.FlumeUtils 11 | import org.apache.spark.streaming.{Seconds, StreamingContext} 12 | 13 | /** 14 | * Created with IDEA 15 | * Creater: MOBIN 16 | * Date: 2018/8/4 17 | * Time: 2:49 PM 18 | */ 19 | object ScalaQueryingStreams { 20 | def main(args: Array[String]): Unit = { 21 | val conf = new SparkConf().setAppName("Apache Log Transformer") 22 | val sparkContext = new SparkContext(conf) 23 | val streamCtx = new StreamingContext(sparkContext, Seconds(10)) 24 | 25 | var address = new Array[InetSocketAddress](1) 26 | address(0) = new InetSocketAddress("localhost", 9998) 27 | val flumeStream = FlumeUtils.createPollingStream(streamCtx, address, StorageLevel.MEMORY_AND_DISK_SER_2, 1000, 1) 28 | val transformLog = new ScalaLogAnalyzerJson() 29 | val newDStream = flumeStream.map{ 30 | x => transformLog.tansformLogDataIntoJson(new String(x.event.getBody.array())) 31 | } 32 | val wStream = newDStream.window(Seconds(40), Seconds(20)) 33 | wStream.foreachRDD{ 34 | rdd => 35 | val sqlCtx = getInstance(sparkContext) 36 | //通过JSONRDD将 JSONRDD转换为SQL DataFrame 37 | val df = sqlCtx.jsonRDD(rdd) 38 | df.registerTempTable("apacheLogData") 39 | //打印结构类型 40 | df.printSchema() 41 | val logDataFrame = sqlCtx.sql("SELECT method,count(*) as total FROM apacheLogData GROUP BY method") 42 | logDataFrame.show() 43 | } 44 | 45 | streamCtx.start() 46 | streamCtx.awaitTermination() 47 | 48 | } 49 | 50 | 51 | def executeTransformations(dstream: DStream[(String, String)], context: StreamingContext): Unit ={ 52 | printLogValues(dstream,context) 53 | println("++++++") 54 | dstream.filter(x => x._1.equals("method") && x._2.contains("GET")).count().print() 55 | println("++++++") 56 | 57 | } 58 | 59 | def printLogValues(stream: DStream[(String, String)], context: StreamingContext){ 60 | stream.foreachRDD(foreachFunc) 61 | 62 | def foreachFunc = (rdd: RDD[(String,String)]) => { 63 | val array = rdd.collect() 64 | for (dataMap <- array.array){ 65 | println(dataMap._1 + "------" + dataMap._2) 66 | } 67 | } 68 | } 69 | 70 | @transient private var instance: SQLContext = null 71 | 72 | //延迟初始化SQLContext 73 | def getInstance(sparkContext: SparkContext): SQLContext = 74 | synchronized{ 75 | if (instance == null) { 76 | instance = new SQLContext(sparkContext) 77 | } 78 | instance 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/Flume/ScalaTransformLogEvents.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming.Flume 2 | 3 | import java.net.InetSocketAddress 4 | 5 | import org.apache.spark.SparkConf 6 | import 
org.apache.spark.rdd.RDD 7 | import org.apache.spark.storage.StorageLevel 8 | import org.apache.spark.streaming.dstream.DStream 9 | import org.apache.spark.streaming.flume.FlumeUtils 10 | import org.apache.spark.streaming.{Seconds, StreamingContext} 11 | 12 | /** 13 | * Created with IDEA 14 | * Creater: MOBIN 15 | * Date: 2018/8/4 16 | * Time: 2:49 PM 17 | */ 18 | object ScalaTransformLogEvents { 19 | def main(args: Array[String]): Unit = { 20 | val conf = new SparkConf().setMaster("local[2]").setAppName("Apache Log Transformer") 21 | val streamCtx = new StreamingContext(conf, Seconds(10)) 22 | 23 | var address = new Array[InetSocketAddress](1) 24 | address(0) = new InetSocketAddress("localhost", 9998) 25 | val flumeStream = FlumeUtils.createPollingStream(streamCtx, address, StorageLevel.MEMORY_AND_DISK_SER_2, 1000, 1) 26 | val transformLog = new ScalaLogAnalyzerMap() 27 | val newDStream = flumeStream.flatMap{ 28 | 29 | x => transformLog.tansformLogData(new String(x.event.getBody.array())) 30 | } 31 | 32 | println("------") 33 | flumeStream.map(x => x.event.getHeaders).print() 34 | println("------") 35 | 36 | 37 | executeTransformations(newDStream, streamCtx) 38 | streamCtx.start() 39 | streamCtx.awaitTermination() 40 | } 41 | 42 | def executeTransformations(dstream: DStream[(String, String)], context: StreamingContext): Unit ={ 43 | printLogValues(dstream,context) 44 | println("++++++") 45 | dstream.filter(x => x._1.equals("method") && x._2.contains("GET")).count().print() 46 | println("++++++") 47 | 48 | } 49 | 50 | def printLogValues(stream: DStream[(String, String)], context: StreamingContext){ 51 | stream.foreachRDD(foreachFunc) 52 | 53 | def foreachFunc = (rdd: RDD[(String,String)]) => { 54 | val array = rdd.collect() 55 | for (dataMap <- array.array){ 56 | println(dataMap._1 + "------" + dataMap._2) 57 | } 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/GenerateChar.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming 2 | 3 | import java.io.PrintWriter 4 | import java.net.ServerSocket 5 | 6 | /** 7 | * Created by hadoop on 3/28/16. 
8 | */ 9 | object GenerateChar { 10 | def generateContext(index : Int) : String = { 11 | import scala.collection.mutable.ListBuffer 12 | val charList = ListBuffer[Char]() 13 | for(i <- 65 to 90) 14 | charList += i.toChar 15 | 16 | val charArray = charList.toArray 17 | charArray(index).toString 18 | } 19 | 20 | def index = { 21 | import java.util.Random 22 | val rdm = new Random 23 | rdm.nextInt(20) 24 | } 25 | 26 | def main(args: Array[String]) { 27 | val listener = new ServerSocket(9998) 28 | println("开始监听...............") 29 | while(true){ 30 | val socket = listener.accept() 31 | new Thread(){ 32 | override def run() = { 33 | println("Got client connected from :"+ socket.getInetAddress) 34 | val out = new PrintWriter(socket.getOutputStream,true) 35 | while(true){ 36 | Thread.sleep(500) 37 | val context = generateContext(index) 38 | println(context) 39 | out.write(context + '\n') 40 | out.flush() 41 | } 42 | socket.close() 43 | } 44 | }.start() 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/Kafka/UserBehaviorMsgProducer.scala: -------------------------------------------------------------------------------- 1 | //package com.mobin.sparkStreaming.Kafka 2 | //import java.util.Properties 3 | // 4 | //import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig} 5 | // 6 | //import scala.util.Random 7 | // 8 | ///** 9 | // * Created by Mobin on 2017/9/1. 10 | // * 生产行为数据消息 11 | // */ 12 | //class UserBehaviorMsgProducer(brokers: String, topic: String) extends Runnable{ 13 | // private val brokerList = brokers 14 | // private val targetTopic = topic 15 | // private val props = new Properties() 16 | // props.put("metadata.broker.list", brokerList) 17 | // props.put("producer.type", "async") 18 | // private val config = new ProducerConfig(props) 19 | // private val producer = new Producer[String, String](config) 20 | // 21 | // private val PAGE_NUM =100 22 | // private val MAX_MSG_NUM= 3 23 | // private val MAX_CLICK_TIME = 5 24 | // private val MAX_STAY_TIME = 10 25 | // private val LIKE_OR_NOT = Array[Int](1, 0, -1) 26 | // 27 | // 28 | // override def run(): Unit = { 29 | // val rand = new Random() 30 | // while (true) { 31 | // val msgNum = rand.nextInt(MAX_MSG_NUM) + 1 32 | // for (i <- msgNum) { 33 | // val msg = new StringBuffer() 34 | // msg.append("page" + (rand.nextInt(PAGE_NUM) + 1)) 35 | // msg.append("|") 36 | // msg.append(rand.nextInt(MAX_CLICK_TIME) + 1) 37 | // msg.append("|") 38 | // msg.append(rand.nextInt(MAX_CLICK_TIME) + rand.nextFloat()) 39 | // msg.append("|") 40 | // msg.append(LIKE_OR_NOT(rand.nextInt(3))) 41 | // println(msg.toString) 42 | // sendMessage(msg.toString) 43 | // } 44 | // println("%d user behavior message producer.".format(msgNum + 1)) 45 | // } 46 | // } 47 | // 48 | // def sendMessage(message: String) = { 49 | // try{ 50 | // val data = new KeyedMessage[String, String](topic, message) 51 | // producer.send(data) 52 | // }catch { 53 | // case e: Exception => println(e) 54 | // } 55 | // } 56 | // 57 | // object UserBehaviorMsgProducerClient{ 58 | // def main(args: Array[String]) { 59 | // if (args.length < 2 ){ 60 | // println("Usage: UserBehaviorMsgProducerClient ip:9092 user-behavior-topic") 61 | // System.exit(1) 62 | // } 63 | // new Thread(new UserBehaviorMsgProducer(args(0), args(1))).start() 64 | // } 65 | // } 66 | //} 67 | -------------------------------------------------------------------------------- 
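Note on the commented-out UserBehaviorMsgProducer.scala above: it imports the new org.apache.kafka.clients.producer.KafkaProducer but then instantiates the old Producer/ProducerConfig/KeyedMessage API, and the loop `for (i <- msgNum)` does not compile, which is presumably why the whole file is disabled. The following is a minimal sketch of the same message generator written against the kafka-clients producer API it imports; the `*Sketch` names are hypothetical and the broker list and topic are still passed in on the command line as in the original.

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import scala.util.Random

// Sketch only: assumes kafka-clients on the classpath and string-serialised messages.
class UserBehaviorMsgProducerSketch(brokers: String, topic: String) extends Runnable {
  private val props = new Properties()
  props.put("bootstrap.servers", brokers)
  props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
  props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
  private val producer = new KafkaProducer[String, String](props)

  private val PAGE_NUM = 100
  private val MAX_MSG_NUM = 3
  private val MAX_CLICK_TIME = 5
  private val LIKE_OR_NOT = Array(1, 0, -1)

  override def run(): Unit = {
    val rand = new Random()
    while (true) {
      val msgNum = rand.nextInt(MAX_MSG_NUM) + 1
      for (_ <- 1 to msgNum) { // the original `for (i <- msgNum)` does not compile
        val msg = new StringBuilder()
        msg.append("page" + (rand.nextInt(PAGE_NUM) + 1))
        msg.append("|").append(rand.nextInt(MAX_CLICK_TIME) + 1)
        msg.append("|").append(rand.nextInt(MAX_CLICK_TIME) + rand.nextFloat())
        msg.append("|").append(LIKE_OR_NOT(rand.nextInt(LIKE_OR_NOT.length)))
        println(msg.toString)
        producer.send(new ProducerRecord[String, String](topic, msg.toString))
      }
      println("%d user behavior messages produced.".format(msgNum))
      Thread.sleep(1000) // throttle a little between batches
    }
  }
}

object UserBehaviorMsgProducerClientSketch {
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      println("Usage: UserBehaviorMsgProducerClientSketch ip:9092 user-behavior-topic")
      System.exit(1)
    }
    new Thread(new UserBehaviorMsgProducerSketch(args(0), args(1))).start()
  }
}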
/src/main/scala/com/mobin/sparkStreaming/QueueStream.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | import scala.collection.mutable 8 | 9 | /** 10 | * Created by hadoop on 4/2/16. 11 | */ 12 | object QueueStream { 13 | 14 | def main(args: Array[String]) { 15 | val conf = new SparkConf().setMaster("local[2]").setAppName("queueStream") 16 | val ssc = new StreamingContext(conf,Seconds(1)) 17 | 18 | val rddQueue = new mutable.SynchronizedQueue[RDD[Int]]() 19 | 20 | val inputStream = ssc.queueStream(rddQueue) 21 | 22 | val mappedStream = inputStream.map(x => (x % 10,1)) 23 | val reduceStream = mappedStream.reduceByKey(_ + _) 24 | reduceStream.print 25 | ssc.start() 26 | for(i <- 1 to 30){ 27 | rddQueue += ssc.sparkContext.makeRDD(1 to 100, 2) 28 | Thread.sleep(1000) 29 | } 30 | 31 | ssc.stop() 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/ScoketStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | 6 | /** 7 | * Created by hadoop on 3/28/16. 8 | */ 9 | object ScoketStreaming { 10 | 11 | def main(args: Array[String]){ 12 | val conf = new SparkConf().setMaster("local[2]").setAppName("ScoketStreaming") 13 | val sc = new StreamingContext(conf,Seconds(10)) 14 | 15 | val lines = sc.socketTextStream("master",9998) 16 | val words = lines.flatMap(_.split((" "))) 17 | val wordCounts = words.map(x => (x , 1)).reduceByKey(_ + _) 18 | wordCounts.print() 19 | sc.start() 20 | sc.awaitTermination() 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/ScoketStreamingCheckPoint.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | 6 | /** 7 | * Created by Mobin on 2017/8/25. 8 | */ 9 | object ScoketStreamingCheckPoint { 10 | val conf = new SparkConf().setMaster("local[*]").setAppName("checkPoint") 11 | val checkPointPath = "." 12 | 13 | def checkPointFun(): StreamingContext = { 14 | val sc = new StreamingContext(conf, Seconds(5)) 15 | val lines = sc.socketTextStream("localhost",9998) 16 | sc.checkpoint(checkPointPath) 17 | val words = lines.flatMap(_.split((" "))) 18 | val wordCounts = words.map(x => (x , 1)).reduceByKey(_ + _) 19 | wordCounts.print() 20 | sc 21 | } 22 | 23 | def main(args: Array[String]) { 24 | val context = StreamingContext.getOrCreate(checkPointPath, checkPointFun) 25 | context.start() 26 | context.awaitTermination() 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/StateFull.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.StreamingContext._ 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Created by hadoop on 3/31/16. 
9 | */ 10 | object StateFull { 11 | 12 | def main(args: Array[String]) { 13 | val updateFunc = (values: Seq[Int], state: Option[Int]) => { 14 | val currentCount = values.foldLeft(0)(_ + _) 15 | val previousCount = state.getOrElse(0) 16 | Some(currentCount + previousCount) 17 | } 18 | 19 | val conf = new SparkConf().setMaster("local[2]").setAppName("stateFull") 20 | val sc = new StreamingContext(conf, Seconds(10)) 21 | sc.checkpoint(".") 22 | 23 | val lines = sc.socketTextStream("master", 9998) 24 | val words = lines.flatMap(_.split(" ")) 25 | val wordDstream = words.map(x => (x, 1)) 26 | 27 | val stateDstream = wordDstream.updateStateByKey[Int](updateFunc) 28 | stateDstream.print() 29 | sc.start() 30 | sc.awaitTermination() 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/mobin/sparkStreaming/WindowWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.mobin.sparkStreaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | 6 | /** 7 | * Created by hadoop on 4/2/16. 8 | */ 9 | object WindowWordCount { 10 | 11 | def main(args: Array[String]) { 12 | val conf = new SparkConf().setMaster("local[2]").setAppName("windowWordCunt") 13 | val ssc = new StreamingContext(conf,Seconds(5)) 14 | ssc.checkpoint(".") 15 | val lines = ssc.socketTextStream("MOBIN",9998) 16 | val words = lines.flatMap(_.split(" ")) 17 | val wordCounts = words.map(x => (x , 1)).reduceByKeyAndWindow(_+_,_+_,Seconds(60),Seconds(10)) 18 | wordCounts.print 19 | ssc.start() 20 | ssc.awaitTermination() 21 | } 22 | } 23 | --------------------------------------------------------------------------------
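A closing note on WindowWordCount above: in reduceByKeyAndWindow(_ + _, _ + _, Seconds(60), Seconds(10)) the second _ + _ is the inverse ("subtract") function, which lets Spark maintain the 60-second window incrementally by adding the new 10-second slide and removing the one that fell out; that incremental state is why the job calls ssc.checkpoint("."). For comparison, a minimal sketch of the non-incremental overload, assuming the same `words` DStream as in WindowWordCount:

// Hypothetical alternative: recompute the full 60s window on every 10s slide.
// This overload takes no inverse function.
val windowedCounts = words.map(x => (x, 1))
  .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(60), Seconds(10))
windowedCounts.print()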