├── .gitignore
├── .idea
├── .name
├── compiler.xml
├── copyright
│ └── profiles_settings.xml
├── encodings.xml
├── libraries
│ ├── Maven__com_clearspring_analytics_stream_2_7_0.xml
│ ├── Maven__com_google_code_findbugs_jsr305_1_3_9.xml
│ ├── Maven__com_google_code_gson_gson_2_2_4.xml
│ ├── Maven__com_google_protobuf_protobuf_java_2_5_0.xml
│ ├── Maven__com_ning_compress_lzf_1_0_3.xml
│ ├── Maven__com_sun_jersey_jersey_client_1_9.xml
│ ├── Maven__com_sun_jersey_jersey_core_1_9.xml
│ ├── Maven__com_sun_xml_bind_jaxb_core_2_2_11.xml
│ ├── Maven__com_sun_xml_bind_jaxb_impl_2_2_11.xml
│ ├── Maven__commons_beanutils_commons_beanutils_1_7_0.xml
│ ├── Maven__commons_beanutils_commons_beanutils_core_1_8_0.xml
│ ├── Maven__commons_cli_commons_cli_1_2.xml
│ ├── Maven__commons_configuration_commons_configuration_1_6.xml
│ ├── Maven__commons_digester_commons_digester_1_8.xml
│ ├── Maven__commons_httpclient_commons_httpclient_3_1.xml
│ ├── Maven__commons_io_commons_io_2_4.xml
│ ├── Maven__commons_logging_commons_logging_1_1_3.xml
│ ├── Maven__commons_net_commons_net_2_2.xml
│ ├── Maven__io_dropwizard_metrics_metrics_core_3_1_2.xml
│ ├── Maven__io_dropwizard_metrics_metrics_graphite_3_1_2.xml
│ ├── Maven__io_dropwizard_metrics_metrics_json_3_1_2.xml
│ ├── Maven__io_dropwizard_metrics_metrics_jvm_3_1_2.xml
│ ├── Maven__javax_servlet_servlet_api_2_5.xml
│ ├── Maven__javax_xml_bind_jaxb_api_2_2_2.xml
│ ├── Maven__javax_xml_stream_stax_api_1_0_2.xml
│ ├── Maven__junit_junit_4_11.xml
│ ├── Maven__log4j_log4j_1_2_17.xml
│ ├── Maven__net_jpountz_lz4_lz4_1_3_0.xml
│ ├── Maven__org_apache_avro_avro_ipc_1_7_7.xml
│ ├── Maven__org_apache_avro_avro_ipc_tests_1_7_7.xml
│ ├── Maven__org_apache_avro_avro_mapred_hadoop2_1_7_7.xml
│ ├── Maven__org_apache_camel_camel_core_2_17_0.xml
│ ├── Maven__org_apache_camel_camel_test_2_17_0.xml
│ ├── Maven__org_apache_commons_commons_compress_1_4_1.xml
│ ├── Maven__org_apache_commons_commons_math3_3_4_1.xml
│ ├── Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml
│ ├── Maven__org_apache_directory_api_api_util_1_0_0_M20.xml
│ ├── Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml
│ ├── Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml
│ ├── Maven__org_apache_ivy_ivy_2_4_0.xml
│ ├── Maven__org_codehaus_jackson_jackson_jaxrs_1_9_13.xml
│ ├── Maven__org_codehaus_jackson_jackson_xc_1_9_13.xml
│ ├── Maven__org_fusesource_leveldbjni_leveldbjni_all_1_8.xml
│ ├── Maven__org_hamcrest_hamcrest_core_1_3.xml
│ ├── Maven__org_mortbay_jetty_jetty_util_6_1_26.xml
│ ├── Maven__org_scala_lang_modules_scala_xml_2_11_1_0_4.xml
│ ├── Maven__org_slf4j_slf4j_api_1_7_13.xml
│ ├── Maven__org_slf4j_slf4j_log4j12_1_7_13.xml
│ ├── Maven__org_spark_project_spark_unused_1_0_0.xml
│ ├── Maven__org_tukaani_xz_1_0.xml
│ ├── Maven__oro_oro_2_0_8.xml
│ ├── Maven__xerces_xercesImpl_2_9_1.xml
│ ├── Maven__xml_apis_xml_apis_1_3_04.xml
│ └── Maven__xmlenc_xmlenc_0_52.xml
├── misc.xml
├── modules.xml
├── scala_compiler.xml
└── uiDesigner.xml
├── BigDataLearning.iml
├── META-INF
└── MANIFEST.MF
├── RandomPrefix.txt
├── SampleJoin1.txt
├── SampleJoin2.txt
├── mapjoin.txt
├── mapjoin1.txt
├── pom.xml
├── readme.md
└── src
└── main
├── resources
├── META-INF
│ └── MANIFEST.MF
├── core-site.xml
└── log4j.properties
└── scala
└── com
└── mobin
├── Advanced_Analytics_with_Spark
├── NaStatCounter.scala
└── Patient.scala
├── Example
├── AudienceAnalysis.scala
├── GenerateHFile.java
├── HiveDataBaseConnection.java
├── PutDataToHBase.java
├── RandomPrefix_Shuffle.scala
├── Sample_Shuffle.scala
├── ScoresDataGenerator.scala
├── SecondSortBykey.scala
├── SexCount.java
├── SparkJoin.scala
└── StudentDataGenerator.scala
├── HDFS
├── FSUtils
│ ├── CountFileLine.java
│ └── FSUtils.java
├── HDFSCompressionCodec.java
├── LzoCompress.java
└── WriteToHDFS.scala
├── Kafka
├── KStream
│ └── KStreamDemo.java
├── Partition
│ └── StockPartitionor.java
├── Producers
│ ├── KafkaProducerThread.java
│ ├── QuotationProducer.java
│ └── StockQuotationInfo.java
├── Topic.java
└── consumers
│ ├── KafkaConsumerThread.java
│ ├── QuotationConsumer.java
│ └── QuotationConsumerManualCommit.java
├── SparkRDDFun
└── TransFormation
│ ├── Action
│ ├── Aggregate.scala
│ ├── Fold.scala
│ ├── Func.scala
│ └── KVFunc.scala
│ ├── BaseRDD
│ ├── Cartesian.scala
│ ├── Coalesce.scala
│ ├── Distinct.scala
│ ├── FlatMap.scala
│ ├── Glom.scala
│ ├── MakeRDD.scala
│ ├── Map.scala
│ ├── MapPartitions.scala
│ ├── MapPartitionsWithIndex.scala
│ ├── RandomSplit.scala
│ ├── Sample.scala
│ ├── Union.scala
│ └── ZipWithIndex.scala
│ └── KVRDD
│ ├── AggregateAndFold.scala
│ ├── Cogroup.scala
│ ├── CombineByKey.scala
│ ├── CombineByKey1.scala
│ ├── FlatMapValus.scala
│ ├── FoldByKey.scala
│ ├── GroupByKey.scala
│ ├── Join.scala
│ ├── MapJoinJava.java
│ ├── MapSideJoin.scala
│ ├── MapValues.scala
│ ├── PartitionBy.scala
│ ├── ReduceByKey.scala
│ └── SortByKey.scala
├── SparkSQL
├── PeopleDemo.scala
├── RowNumber.scala
└── SGC_LET_SHOOL_HOUR.scala
├── Telecom
└── AirPlaneMode.scala
└── sparkStreaming
├── FileStreaming.scala
├── Flume
├── SampleLogGenerator.java
├── ScalaLoadDistributedEvents.scala
├── ScalaLogAnalyzerJson.scala
├── ScalaLogAnalyzerMap.scala
├── ScalaQueryingStreams.scala
└── ScalaTransformLogEvents.scala
├── GenerateChar.scala
├── Kafka
└── UserBehaviorMsgProducer.scala
├── QueueStream.scala
├── ScoketStreaming.scala
├── ScoketStreamingCheckPoint.scala
├── StateFull.scala
└── WindowWordCount.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | receivedBlockMetadata
3 | target
4 | .checkpoint*
5 | spark-warehouse
--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
1 | BigDataLearning
--------------------------------------------------------------------------------
/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Main-Class: com.mobin.sparkStreaming.GenerateChar
3 |
4 |
--------------------------------------------------------------------------------
/RandomPrefix.txt:
--------------------------------------------------------------------------------
1 | Hello
2 | Hello
3 | HI
4 | HI
--------------------------------------------------------------------------------
/SampleJoin1.txt:
--------------------------------------------------------------------------------
1 | 1,a1
2 | 1,a2
3 | 1,a3
4 | 1,a4
5 | 1,a5
6 | 1,a6
7 | 1,a7
8 | 1,a8
9 | 1,a9
10 | 1,a10
11 | 1,a11
12 | 1,a12
13 | 1,a13
14 | 1,a14
15 | 1,a15
16 | 1,a16
17 | 1,a17
18 | 1,a18
19 | 1,a19
20 | 1,a20
21 | 1,a21
22 | 1,a22
23 | 2,b
24 | 2,b
--------------------------------------------------------------------------------
/SampleJoin2.txt:
--------------------------------------------------------------------------------
1 | 1,a
2 | 2,b
3 | 3,c
--------------------------------------------------------------------------------
/mapjoin.txt:
--------------------------------------------------------------------------------
1 | 1,2,3
2 | 2,4,5
--------------------------------------------------------------------------------
/mapjoin1.txt:
--------------------------------------------------------------------------------
1 | 1,A,B
2 | 2,C,D
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 4.0.0
5 |
6 | com.mobin
7 | BigDataLearning
8 | jar
9 | 1.0-SNAPSHOT
10 |
11 | A Camel Scala Route
12 | http://www.myorganization.org
13 |
14 |
15 | UTF-8
16 | UTF-8
17 |
18 |
19 |
20 |
21 |
22 |
23 | org.apache.kafka
24 | kafka_2.12
25 | 2.0.0
26 |
27 |
28 |
29 |
30 | org.apache.kafka
31 | kafka-clients
32 | 2.0.0
33 |
34 |
35 |
36 |
37 | com.alibaba
38 | fastjson
39 | 1.2.47
40 |
41 |
42 |
43 |
44 | org.apache.spark
45 | spark-sql_2.11
46 | 2.2.2
47 |
48 |
49 |
50 | org.apache.spark
51 | spark-core_2.11
52 | 2.2.2
53 |
54 |
55 |
56 |
57 | org.apache.hadoop
58 | hadoop-client
59 | 2.7.5
60 |
61 |
62 |
63 |
64 | org.apache.spark
65 | spark-streaming_2.11
66 | 2.2.2
67 |
68 |
69 |
70 |
71 | org.apache.spark
72 | spark-streaming-flume_2.11
73 | 2.3.1
74 |
75 |
76 |
77 |
78 |
79 | org.scala-lang
80 | scala-library
81 | 2.11.8
82 |
83 |
84 | org.scala-lang.modules
85 | scala-xml_2.11
86 | 1.0.4
87 |
88 |
89 |
90 |
91 | org.slf4j
92 | slf4j-api
93 | 1.7.13
94 |
95 |
96 | org.slf4j
97 | slf4j-log4j12
98 | 1.7.13
99 |
100 |
101 | log4j
102 | log4j
103 | 1.2.17
104 |
105 |
106 |
107 |
108 | org.apache.camel
109 | camel-test
110 | 2.17.0
111 | test
112 |
113 |
114 |
115 |
116 | install
117 | src/main/scala
118 | src/test/scala
119 |
120 |
121 |
122 |
123 |
124 | org.apache.maven.plugins
125 | maven-compiler-plugin
126 | 3.5.1
127 |
128 | 1.7
129 | 1.7
130 |
131 |
132 |
133 | org.apache.maven.plugins
134 | maven-resources-plugin
135 | 2.6
136 |
137 | UTF-8
138 |
139 |
140 |
141 |
142 |
143 | net.alchim31.maven
144 | scala-maven-plugin
145 | 3.2.2
146 |
147 |
148 |
149 | compile
150 | testCompile
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 | org.apache.maven.plugins
159 | maven-eclipse-plugin
160 | 2.10
161 |
162 |
163 | org.scala-ide.sdt.core.scalanature
164 | org.eclipse.jdt.core.javanature
165 |
166 |
167 | org.scala-ide.sdt.core.scalabuilder
168 |
169 |
170 | org.scala-ide.sdt.launching.SCALA_CONTAINER
171 | org.eclipse.jdt.launching.JRE_CONTAINER
172 |
173 |
174 | org.scala-lang:scala-library
175 | org.scala-lang:scala-compiler
176 |
177 |
178 | **/*.scala
179 | **/*.java
180 |
181 |
182 |
183 |
184 |
185 | org.apache.maven.plugins
186 | maven-assembly-plugin
187 | 2.5.5
188 |
189 |
190 | jar-with-dependencies
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | BigData Framework learning examples
--------------------------------------------------------------------------------
/src/main/resources/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Main-Class: com.mobin.sparkStreaming.com.mobin.sparkStreaming.FileStre
3 | aming
4 |
5 |
--------------------------------------------------------------------------------
/src/main/resources/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 | <property>
5 | <name>io.compression.codecs</name>
6 | <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec,com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec</value>
7 | </property>
8 | </configuration>
9 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # The logging properties used
3 | #
4 | log4j.rootLogger=INFO, out
5 |
6 | # uncomment the following line to turn on Camel debugging
7 | #log4j.logger.org.apache.camel=DEBUG
8 |
9 | log4j.logger.org.springframework=WARN
10 |
11 | # CONSOLE appender not used by default
12 | log4j.appender.out=org.apache.log4j.ConsoleAppender
13 | log4j.appender.out.layout=org.apache.log4j.PatternLayout
14 | log4j.appender.out.layout.ConversionPattern=[%30.30t] %-30.30c{1} %-5p %m%n
15 | #log4j.appender.out.layout.ConversionPattern=%d [%-15.15t] %-5p %-30.30c{1} - %m%n
16 |
17 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Advanced_Analytics_with_Spark/NaStatCounter.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.Advanced_Analytics_with_Spark
2 |
3 | import org.apache.spark.util.StatCounter
4 | /**
5 | * Created by Mobin on 2017/3/8.
6 | */
7 | class NaStatCounter extends Serializable{
8 |
9 | val stats: StatCounter = new StatCounter()
10 | var missing: Long = 0
11 |
12 | def add(x: Double): NaStatCounter = {
13 | if(java.lang.Double.isNaN(x)){
14 | missing += 1
15 | } else {
16 | stats.merge(x)
17 | }
18 | this
19 | }
20 |
21 | def merge(other: NaStatCounter): NaStatCounter = {
22 | stats.merge(other.stats)
23 | missing += other.missing
24 | this
25 | }
26 |
27 | override def toString = {
28 | "stats: " + stats.toString() + "NaN: " + missing
29 | }
30 | }
31 |
32 | object NaStatCounter extends Serializable{
33 | def apply(x: Double) = new NaStatCounter().add(x)
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Advanced_Analytics_with_Spark/Patient.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.Advanced_Analytics_with_Spark
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by Mobin on 2017/3/7.
7 | */
8 | case class MatchData(id1: Int, id2: Int, scores: Array[Double], matched: Boolean)
9 | object Patient {
10 |
11 | def main(args: Array[String]) {
12 | val conf = new SparkConf().setMaster("local").setAppName("Patient")
13 | val sc = new SparkContext(conf)
14 | val rawblocks = sc.textFile(args(0))
15 | val mds = rawblocks.filter(!isHeader(_)).map(pares)
16 | // val grouped = mds.groupBy(x => x.matched).mapValues(x => x.size).foreach(println) // group by matched and count each group
17 | // val sort = mds.map(x => x.matched).sortBy(_).foreach(println)
18 | val nsdRDD = mds.map(md =>
19 | md.scores.map(d => NaStatCounter(d))
20 | ).foreach(x => println(x(1)))
21 | }
22 |
23 | def isHeader(line: String): Boolean = {
24 | line.contains("id_1")
25 | }
26 |
27 | def toDouble(s: String): Double = {
28 | if ("?".equals(s))
29 | Double.NaN
30 | else
31 | s.toDouble
32 | }
33 |
34 | def pares(line: String)={
35 | val pieces = line.split(",")
36 | val id1 = pieces(0).toInt
37 | val id2 = pieces(1).toInt
38 | val scores = pieces.slice(2,11).map(toDouble) // take fields [2, 11) and convert them to Double
39 | val matched = pieces(11).toBoolean
40 | MatchData(id1, id2, scores, matched)
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
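The nsdRDD computed in Patient.main builds one Array[NaStatCounter] per record but never combines them across the data set. A minimal sketch of the usual next step, merging the per-record counters column by column with the merge method defined in NaStatCounter.scala (this continuation is an assumption about the intended usage, not code present in the repository):

    // inside Patient.main, after mds is defined
    val nasRDD = mds.map(md => md.scores.map(d => NaStatCounter(d)))
    val reduced = nasRDD.reduce((a, b) => a.zip(b).map { case (x, y) => x.merge(y) }) // column-wise merge
    reduced.foreach(println) // one line per score column: stats plus NaN count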
/src/main/scala/com/mobin/Example/AudienceAnalysis.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.Example
2 |
3 | /**
4 | * Created by Mobin on 2016/11/15.
5 | */
6 | object AudienceAnalysis {
7 |
8 | lazy val nameIndexMap = {
9 | val nameIndexMap = scala.collection.mutable.HashMap.empty[String, Int]
10 | val basicNames = Seq("first_name", "last_name", "email", "company", "job", "street_address", "city",
11 | "state_abbr", "zipcode_plus4", "url", "phoen_number", "user_agent", "user_name")
12 | nameIndexMap ++= basicNames zip (0 to 12)
13 | for(i <- 0 to 328){
14 | nameIndexMap ++= Seq(("letter_" + i, i * 3 + 13),("number_" + i, i * 3 +14), ("bool_" + i, i *3 +15))
15 | }
16 |
17 | nameIndexMap
18 | }
19 |
20 | def $(name: String): Int = nameIndexMap.getOrElse(name, -1)
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Example/GenerateHFile.java:
--------------------------------------------------------------------------------
1 | //package com.mobin.Example;
2 | //
3 | //import org.apache.hadoop.conf.Configuration;
4 | //import org.apache.hadoop.fs.Path;
5 | //import org.apache.hadoop.hbase.HBaseConfiguration;
6 | //import org.apache.hadoop.hbase.TableName;
7 | //import org.apache.hadoop.hbase.client.*;
8 | //import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
9 | //import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
10 | //import org.apache.hadoop.io.LongWritable;
11 | //import org.apache.hadoop.io.Text;
12 | //import org.apache.hadoop.mapreduce.Job;
13 | //import org.apache.hadoop.mapreduce.Mapper;
14 | //import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
15 | //import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
16 | //
17 | //import java.io.IOException;
18 | //
19 | ///**
20 | // * Created by Mobin on 2016/12/22.
21 | // */
22 | //public class GenerateHFile {
23 | //
24 | // static class HFileMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
25 | // @Override
26 | // protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
27 | // String[] line = value.toString().split(",");
28 | // String rk = line[0];
29 | // ImmutableBytesWritable rowkey = new ImmutableBytesWritable(rk.getBytes());
30 | // Put put = new Put(rk.getBytes());
31 | // put.addColumn("S".getBytes(),"name".getBytes(), line[1].getBytes());
32 | // put.addColumn("S".getBytes(), "sex".getBytes(), line[2].getBytes());
33 | // put.addColumn("S".getBytes(), "age".getBytes(), line[3].getBytes());
34 | // put.addColumn("S".getBytes(), "class".getBytes(), line[4].getBytes());
35 | // context.write(rowkey, put);
36 | // }
37 | // }
38 | //
39 | // public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
40 | // final String INPUT_PATH = "/DATA/PUBLIC/NOCE/SGC/Student.txt";
41 | // final String OUT_PATH = "/DATA/PUBLIC/NOCE/SGC/HFILE";
42 | // Configuration conf = HBaseConfiguration.create();
43 | // HTable table = new HTable(conf,"STUDENT");
44 | // Job job = Job.getInstance(conf);
45 | // job.setJarByClass(GenerateHFile.class);
46 | // job.setMapperClass(HFileMapper.class);
47 | // job.setMapOutputKeyClass(ImmutableBytesWritable.class);
48 | // job.setMapOutputValueClass(Put.class);
49 | //
50 | // job.setOutputFormatClass(HFileOutputFormat2.class);
51 | // HFileOutputFormat2.configureIncrementalLoad(job,table,table.getRegionLocator());
52 | // FileInputFormat.setInputPaths(job, INPUT_PATH);
53 | // FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
54 | // System.exit(job.waitForCompletion(true)?0:1);
55 | //
56 | // }
57 | //}
58 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Example/HiveDataBaseConnection.java:
--------------------------------------------------------------------------------
1 | //package com.mobin.Example;
2 | //
3 | //import org.apache.hadoop.hive.ql.metadata.Hive;
4 | //
5 | //import java.sql.Connection;
6 | //import java.sql.DriverManager;
7 | //import java.sql.SQLException;
8 | //
9 | ///**
10 | // * Created by MOBIN on 2016/9/21.
11 | // */
12 | //public class HiveDataBaseConnection {
13 | // private final static String DriverName = "org.apache.hive.jdbc.HiveDriver";
14 | // private final static String URL = "jdbc:hive2://132.122.70.2:10000/default";
15 | // private final static String UserName = "";
16 | // private final static String Password = "";
17 | // private Connection con;
18 | //
19 | // public HiveDataBaseConnection(){
20 | // try {
21 | // Class.forName(DriverName);
22 | // con = DriverManager.getConnection(URL,UserName, Password);
23 | // System.out.println(con);
24 | // } catch (ClassNotFoundException e) {
25 | // e.printStackTrace();
26 | // } catch (SQLException e) {
27 | // e.printStackTrace();
28 | // }
29 | // }
30 | //
31 | // public Connection getConnection(){
32 | // return con;
33 | // }
34 | //
35 | // public void Close(){
36 | // try {
37 | // if(con != null)
38 | // con.close();
39 | // } catch (SQLException e) {
40 | // e.printStackTrace();
41 | // }
42 | // }
43 | //
44 | // public static void main(String[] args) {
45 | // HiveDataBaseConnection connection = new HiveDataBaseConnection();
46 | // }
47 | //}
48 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Example/PutDataToHBase.java:
--------------------------------------------------------------------------------
1 | //package com.mobin.Example;
2 | //
3 | //import org.apache.hadoop.conf.Configuration;
4 | //import org.apache.hadoop.fs.Path;
5 | //import org.apache.hadoop.hbase.HBaseConfiguration;
6 | //import org.apache.hadoop.hbase.client.HTable;
7 | //import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
8 | //
9 | //
10 | ///**
11 | // * Created by Mobin on 2016/12/22.
12 | // */
13 | //public class PutDataToHBase {
14 | // public static void main(String[] args) throws Exception {
15 | // Configuration conf = HBaseConfiguration.create();
16 | // LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
17 | // load.doBulkLoad(new Path("HFILE"), new HTable(conf,"STUDENT"));
18 | // }
19 | //}
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Example/RandomPrefix_Shuffle.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.Example
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | import scala.util.Random
6 |
7 | /**
8 | * Created by Mobin on 2017/8/29.
9 | * Local aggregation first, then global aggregation (using a random key prefix to break up skewed keys).
10 | */
11 | object RandomPrefix_Shuffle {
12 |
13 | def main(args: Array[String]) {
14 | val conf = new SparkConf().setMaster("local[*]").setAppName("RandomPrefix")
15 | val sc = new SparkContext(conf)
16 | val line = sc.textFile("RandomPrefix.txt").map((_,1))
17 | val randomPrefixRdd = line.map(x => {
18 | val random = Random
19 | val prefix = random.nextInt(10)
20 | (prefix + "_" + x._1 , x._2)
21 | })
22 |
23 | val localAggrRdd = randomPrefixRdd.reduceByKey(_ + _)
24 | val removeRandPrefixRdd = localAggrRdd.map(x => {
25 | val k = x._1.split("_")(1)
26 | (k, x._2)
27 | })
28 | val globalAggrRdd = removeRandPrefixRdd.reduceByKey(_ + _)
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
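Everything in RandomPrefix_Shuffle is a lazy transformation, so as written the job never actually executes. A hypothetical one-line continuation that would trigger the two-stage aggregation and print the de-skewed counts:

    globalAggrRdd.collect().foreach(println) // action: materializes the (word, count) pairs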
/src/main/scala/com/mobin/Example/Sample_Shuffle.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.Example
2 |
3 | import java.util
4 |
5 | import org.apache.spark.{SparkContext, SparkConf}
6 |
7 | import scala.util.Random
8 |
9 | /**
10 | * Created by Mobin on 2017/8/30.
11 | * Implementation approach (from the "Spark Performance Tuning Guide: Advanced" post on the Meituan-Dianping tech blog):
12 | * 1. For the RDD that contains a few keys with far too much data, take a sample with the sample operator, count how often each key appears, and work out which keys carry the most data.
13 | *
14 | * 2. Split the records for those keys out of the original RDD into a separate RDD and prefix each key with a random number below n; the bulk of the keys, which do not cause skew, form another RDD.
15 | *
16 | * 3. From the other RDD to be joined, also filter the records for those skewed keys into a separate RDD and expand every record into n records, each prefixed with one of 0..n in order; the non-skewed keys again form another RDD.
17 | *
18 | * 4. Join the randomly prefixed RDD with the n-times expanded RDD. Keys that used to be identical are now scattered into n buckets, so the join is spread across multiple tasks.
19 | *
20 | * 5. The two remaining ordinary RDDs are simply joined as usual.
21 | *
22 | * 6. Finally, union the results of the two joins to get the complete join result.
23 | */
24 | object Sample_Shuffle {
25 |
26 | def main(args: Array[String]) {
27 | val conf = new SparkConf().setMaster("local[*]").setAppName("sample")
28 | val sc = new SparkContext(conf)
29 |
30 | val rdd1 = sc.textFile("SampleJoin1.txt").map(x => {
31 | val kv = x.split(",")
32 | (kv(0), kv(1))
33 | })
34 |
35 | val sampleRdd = rdd1.sample(false, 0.1) // take a sample of rdd1
36 | val countSampleRdd = sampleRdd.map(x =>(x._1, 1)).reduceByKey(_ + _) // count the frequency of each key in the sample
37 | val reversedSampleRdd = countSampleRdd.map(x => (x._2, x._1))
38 | val skewedUserid = reversedSampleRdd.sortByKey(false).take(1)(0)._2 // sort by frequency and take the key with the highest count
39 | val skewRdd = rdd1.filter(_._1.equals(skewedUserid)) // split the skew-causing key out of rdd1 into its own RDD
40 | val commonRdd = rdd1.filter(!_._1.equals(skewedUserid)) // the keys that do not cause skew form a separate RDD
41 |
42 | val rdd2 = sc.textFile("SampleJoin2.txt").map(x => {
43 | val kv = x.split(",")
44 | (kv(0), kv(1))
45 | })
46 |
47 | println("skew: " + skewedUserid)
48 | // expand the skewed key's records in rdd2: one copy per prefix 0..9, matching Random.nextInt(10) below
49 | val skewRdd2 = rdd2.filter(_._1.equals(skewedUserid)).flatMap(x => {
50 | for(i <- 0 until 10) yield ((i + "_" + x._1, x._2))
51 | })
52 |
53 | // tag every record in skewRdd with a random prefix and join it against skewRdd2
54 | val joinRdd = skewRdd.map(x=>{
55 | val prefix = Random.nextInt(10)
56 | (prefix + "_" + x._1, x._2)
57 | }).join(skewRdd2).map(x => {
58 | val key = x._1.split("_")(1)
59 | (key,x._2)
60 | })
61 |
62 | val joinRdd2 = commonRdd.join(rdd2)
63 | val resultRdd = joinRdd.union(joinRdd2)
64 | resultRdd.foreach(println)
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Example/ScoresDataGenerator.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.Example
2 |
3 | import java.io.FileWriter
4 |
5 | import scala.util.Random
6 |
7 | /**
8 | * Created by Mobin on 2016/12/22.
9 | * sno string, // student number
10 | * semester int, // semester
11 | * math int, // math score
12 | * en int, // English score
13 | * c int, // C programming score
14 | * os int // operating systems score
15 |
16 | */
17 | object ScoresDataGenerator {
18 | private val FILE_OUTPATH = "Scores.txt"
19 | private val MAX_RECORD = 1000;
20 |
21 | def main(args: Array[String]) {
22 | Generator(FILE_OUTPATH,MAX_RECORD)
23 | }
24 |
25 |
26 | private def Generator(filePath: String, recordNum: Int) {
27 | var write: FileWriter = null
28 | try {
29 | write = new FileWriter(filePath, true)
30 | val rand = new Random()
31 | val term = 1
32 | for(i <- 1 to recordNum){
33 | val MScore = generatorScore
34 | val EScore = generatorScore
35 | val CScore = generatorScore
36 | val SScore = generatorScore
37 | write.write(i + "," + term + "," + MScore + "," + EScore + "," + CScore + "," + SScore)
38 | write.write(System.getProperty("line.separator"))
39 | write.flush()
40 | }
41 | } catch {
42 | case e: Exception => println("error: " + e.getMessage)
43 | }finally {
44 | if (write != null)
45 | write.close()
46 | }
47 | }
48 |
49 | private def generatorScore: Int = {
50 | val rand = new Random()
51 | val sc = rand.nextInt(100)
52 | val score = sc match {
53 | case s if(s >0 && s <10) => s + 80
54 | case s if(s >10 && s < 30) => s + 70
55 | case s if(s >30 && s < 50) => s + 40
56 | case s if(s >50 && s < 60) => s + 20
57 | case _ => sc
58 | }
59 | score
60 | }
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Example/SecondSortBykey.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.Example
2 |
3 | /**
4 | * Created by Mobin on 2017/9/3.
5 | */
6 | class SecondSortBykey(val first: Int, val second: Int) extends Ordered[SecondSortBykey] with Serializable {
7 | def compare(other:SecondSortBykey):Int = {
8 | if (this.first - other.first !=0) {
9 | this.first - other.first
10 | } else {
11 | this.second - other.second
12 | }
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
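SecondSortBykey only defines the composite key; nothing in the repository drives it through sortByKey. A minimal usage sketch, assuming a hypothetical input file pairs.txt whose lines contain two space-separated integers (the file name and layout are assumptions, not part of this project):

    package com.mobin.Example

    import org.apache.spark.{SparkConf, SparkContext}

    object SecondSortExample {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("SecondSort"))
        val pairs = sc.textFile("pairs.txt").map { line =>
          val f = line.split(" ")
          (new SecondSortBykey(f(0).toInt, f(1).toInt), line)
        }
        // Ordered[SecondSortBykey] supplies the implicit Ordering used by sortByKey:
        // ascending on first, ties broken by second.
        pairs.sortByKey().map(_._2).foreach(println)
      }
    }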
/src/main/scala/com/mobin/Example/SexCount.java:
--------------------------------------------------------------------------------
1 | package com.mobin.Example;
2 |
3 | import org.apache.hadoop.conf.Configurable;
4 | import org.apache.hadoop.conf.Configuration;
5 | import org.apache.hadoop.fs.Path;
6 | import org.apache.hadoop.io.IntWritable;
7 | import org.apache.hadoop.io.LongWritable;
8 | import org.apache.hadoop.io.Text;
9 | import org.apache.hadoop.mapreduce.Job;
10 | import org.apache.hadoop.mapreduce.Mapper;
11 | import org.apache.hadoop.mapreduce.Reducer;
12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
14 |
15 | import java.io.IOException;
16 |
17 | /**
18 | * Created by Mobin on 2016/12/22.
19 | * Count records by sex.
20 | */
21 | public class SexCount {
22 | static class SexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
23 | @Override
24 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
25 | String[] line = value.toString().split(",");
26 | context.write(new Text(line[2]), new IntWritable(1));
27 | }
28 | }
29 |
30 | static class SexReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
31 | @Override
32 | protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
33 | int count = 0;
34 | for(IntWritable c : values)
35 | count += c.get();
36 | context.write(key, new IntWritable(count));
37 | }
38 | }
39 |
40 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
41 | final String INPUT_PATH = "Student.txt";
42 | final String OUT_PATH = "StudentSex";
43 | Configuration conf = new Configuration();
44 | Job job = Job.getInstance(conf);
45 |
46 | job.setMapperClass(SexMapper.class);
47 | job.setReducerClass(SexReduce.class);
48 | job.setJarByClass(SexCount.class);
49 |
50 |
51 | job.setOutputKeyClass(Text.class);
52 | job.setOutputValueClass(IntWritable.class);
53 |
54 | FileInputFormat.setInputPaths(job, INPUT_PATH);
55 | FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
56 | System.exit(job.waitForCompletion(true)?0:1);
57 |
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Example/SparkJoin.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.Example
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 |
6 | /**
7 | * Created by Mobin on 2016/12/22.
8 | */
9 | object SparkJoin {
10 | def main(args: Array[String]) {
11 | val conf = new SparkConf().setAppName("SparkJoin").setMaster("local")
12 | val sc = new SparkContext(conf)
13 | val student = sc.textFile("Student.txt")
14 | val scores = sc.textFile("Scores.txt")
15 | val studentT = student.map(str => str.split(",")).map(x => (x(0), x(1) +"," + x(2) + "," +x(3) + "," + x(4)))
16 | val scoresT = scores.map(str => str.split(",")).map(x => (x(0), x(1) +"," + x(2) + "," +x(3) + "," + x(4) + "," + x(5)))
17 | studentT.join(scoresT).foreach(println)
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Example/StudentDataGenerator.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.Example
2 |
3 | import java.io.FileWriter
4 |
5 | import scala.util.Random
6 |
7 | /**
8 | * Created by Mobin on 2016/12/22.
9 | * sno string, // student number
10 | * name string, // name
11 | * sex string, // sex
12 | * age int, // age
13 | * class string // class
14 | */
15 | object StudentDataGenerator {
16 | private val FILE_OUTPATH = "Student.txt"
17 | private val MAX_RECORD = 10000;
18 |
19 | def main(args: Array[String]) {
20 | Generator(FILE_OUTPATH, MAX_RECORD)
21 | }
22 |
23 | private def Generator(filePath: String, recordNum: Int) {
24 |
25 | var write: FileWriter = null
26 | try {
27 | write = new FileWriter(filePath, true)
28 | val rand = new Random();
29 | for (i <- 1 to recordNum) {
30 | val name = nameGenerator
31 | val sex = sexGenerator
32 | // age between 20 and 22
33 | val age = rand.nextInt(3) + 20
34 | // class number
35 | val classNum = rand.nextInt(6)
36 | write.write(i + "," + name + "," + sex + "," + age + "," + classNum)
37 | write.write(System.getProperty("line.separator"))
38 | write.flush()
39 | }
40 | } catch {
41 | case e: Exception => println("error: " + e.getMessage)
42 | } finally {
43 | if (write != null)
44 | write.close()
45 | }
46 | }
47 |
48 | // generate a name
49 | private def nameGenerator: String = {
50 | val higthPos = (176 + Math.abs(new Random().nextInt(39)))
51 | val lowPos = (176 + Math.abs(new Random().nextInt(93)))
52 | val name = Array[Byte](new Integer(higthPos).byteValue(), new Integer(lowPos).byteValue())
53 | val surname = Array("钟", "李", "张", "刘", "王", "章", "洪", "江", "戴")
54 | surname(new Random().nextInt(9)) + new String(name, "GBK")
55 | }
56 |
57 | // generate a sex value
58 | private def sexGenerator: String = {
59 | val random = new Random()
60 | val randomNum = random.nextInt(2) + 1
61 | randomNum % 2 match {
62 | case 0 => "男"
63 | case _ => "女"
64 | }
65 | }
66 | }
67 |
68 |
69 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/HDFS/FSUtils/CountFileLine.java:
--------------------------------------------------------------------------------
1 | package com.mobin.HDFS.FSUtils;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.fs.FileSystem;
5 |
6 | import java.io.File;
7 | import java.io.IOException;
8 | import java.util.ArrayList;
9 | import java.util.concurrent.Callable;
10 | import java.util.concurrent.ExecutionException;
11 | import java.util.concurrent.Future;
12 | import java.util.concurrent.FutureTask;
13 |
14 | /**
15 | * Created by Mobin on 2016/12/20.
16 | * Count the lines of the .lzo files under a directory, starting one task per .lzo file.
17 | */
18 | public class CountFileLine implements Callable<Integer> {
19 | public FileSystem fs;
20 | public String path;
21 |
22 | @Override
23 | public Integer call() throws Exception {
24 | return countLine(fs,path);
25 | }
26 |
27 | public Integer countLine(FileSystem fs,String path) throws IOException {
28 | int count = 0;
29 | FSUtils.BufferedReadIterable brl = new FSUtils.BufferedReadIterable(fs,path);
30 | for(String line: brl){
31 | count ++;
32 | }
33 | System.out.println(count);
34 | return count;
35 | }
36 |
37 | public static void main(String[] args) throws IOException, ExecutionException, InterruptedException {
38 | int sum=0;
39 | String file = "E:\\DATA\\PUBLIC\\NOCE\\AGG\\AGG_EVT_LTE_DPI_NEW\\hour=2016102011";
40 | Configuration conf = new Configuration();
41 | FileSystem fs = FileSystem.get(conf);
42 | ArrayList<Future<Integer>> tasks = new ArrayList<>();
43 | File[] files = new File(file).listFiles();
44 | for(File f: files){
45 | if(f.getName().endsWith(".lzo")){
46 | CountFileLine cd = new CountFileLine();
47 | cd.fs = fs;
48 | cd.path = f.getPath();
49 | FutureTask<Integer> task = new FutureTask<>(cd);
50 | tasks.add(task);
51 | Thread thread = new Thread(task);
52 | System.out.println(thread.getName());
53 | thread.start();
54 | }
55 | }
56 |
57 | for(Future<Integer> future: tasks){
58 | sum += future.get();
59 | }
60 | System.out.println(sum);
61 |
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/HDFS/FSUtils/FSUtils.java:
--------------------------------------------------------------------------------
1 | package com.mobin.HDFS.FSUtils;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.fs.FSDataInputStream;
5 | import org.apache.hadoop.fs.FileSystem;
6 | import org.apache.hadoop.fs.Path;
7 | import org.apache.hadoop.io.compress.CompressionCodec;
8 | import org.apache.hadoop.io.compress.CompressionCodecFactory;
9 | import org.apache.hadoop.io.compress.CompressionInputStream;
10 |
11 | import java.io.BufferedReader;
12 | import java.io.Closeable;
13 | import java.io.IOException;
14 | import java.io.InputStreamReader;
15 | import java.util.Iterator;
16 |
17 | /**
18 | * Created by Mobin on 2016/12/14.
19 | * Helpers for reading an HDFS file line by line, decompressing it automatically based on its suffix.
20 | */
21 | public class FSUtils {
22 | private static final Configuration conf = new Configuration();
23 | private static final FileSystem fs = null;
24 |
25 | public static void main(String[] args) throws IOException {
26 | String file = "E:\\DATA\\PUBLIC\\NOCE\\AGG\\AGG_EVT_LTE_DPI_NEW\\hour=2016102011\\m_p_0.txt.lzo";
27 | int lineCount = 0;
28 | Configuration conf = new Configuration();
29 | FileSystem fs = FileSystem.get(conf);
30 | try(BufferedReadIterable br = new BufferedReadIterable(fs,file)){
31 | for(String line : br){
32 |
33 | }
34 | }
35 |
36 | }
37 |
38 | public static BufferedReadIterable createBuferedReadIterable(FileSystem fs, String file) throws IOException {
39 | return new BufferedReadIterable(fs,file);
40 | }
41 |
42 | public static class BufferedReadIterable implements Iterable<String>, Closeable {
43 | private final String file;
44 | private final long size;
45 | private BufferedReader br;
46 |
47 |
48 | public BufferedReadIterable(FileSystem fs, String file) throws IOException {
49 | this.file = file;
50 | Path path = new Path(file);
51 | this.size = fs.getFileStatus(path).getLen();
52 |
53 | CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
54 | // HDFS determines the compression codec from the file suffix
55 | CompressionCodec codec = factory.getCodec(path);
56 |
57 | FSDataInputStream inputStream = fs.open(path,8192);
58 | if(codec == null){
59 | br = new BufferedReader(new InputStreamReader(inputStream));
60 | }else{
61 | // decompress first, then read
62 | CompressionInputStream comIn = codec.createInputStream(inputStream);
63 | br = new BufferedReader(new InputStreamReader(comIn));
64 | }
65 | }
66 |
67 | @Override
68 | public void close() throws IOException {
69 | br.close();
70 | }
71 |
72 | @Override
73 | public Iterator<String> iterator() {
74 | return new Iterator<String>() {
75 | private String line;
76 | @Override
77 | public boolean hasNext() {
78 | try {
79 | line = br.readLine();
80 | } catch (IOException e) {
81 | line = null;
82 | }
83 | return line != null;
84 | }
85 |
86 | @Override
87 | public String next() {
88 | return line;
89 | }
90 |
91 | @Override
92 | public void remove() {
93 | throw new UnsupportedOperationException("remove");
94 | }
95 | };
96 | }
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/HDFS/HDFSCompressionCodec.java:
--------------------------------------------------------------------------------
1 | package com.mobin.HDFS;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.fs.FSDataInputStream;
5 | import org.apache.hadoop.fs.FSDataOutputStream;
6 | import org.apache.hadoop.fs.FileSystem;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.hadoop.io.IOUtils;
9 | import org.apache.hadoop.io.compress.CompressionCodec;
10 | import org.apache.hadoop.io.compress.CompressionCodecFactory;
11 | import org.apache.hadoop.io.compress.CompressionInputStream;
12 | import org.apache.hadoop.io.compress.CompressionOutputStream;
13 | import org.apache.hadoop.util.ReflectionUtils;
14 |
15 | import java.io.*;
16 |
17 | /**
18 | * Created by Mobin on 2016/12/19.
19 | */
20 | public class HDFSCompressionCodec {
21 | private static final Configuration conf = new Configuration();
22 | private static FileSystem fs = null;
23 | // compress
24 | public void coder(String path) throws IOException, ClassNotFoundException {
25 | // obtain input streams for the files to compress
26 | File dir = new File(path);
27 | System.out.println(dir.isDirectory());
28 | conf.set("mapred.output.compress", "true");
29 | conf.set("mapred.output.compression.codec", "com.hadoop.compression.lzo.LzopCodec");
30 | fs = FileSystem.get(conf);
31 | FSDataOutputStream out = fs.create(new Path("E:\\DATA\\PUBLIC\\NOCE\\school5.lzo"));
32 | Class<?> codecClass = Class.forName("com.hadoop.compression.lzo.LzopCodec");
33 | CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
34 | //Write the compressed data to school5.lzo
35 | //Create a CompressionOutputStream to compress the data as it is written
36 | CompressionOutputStream codecout = codec.createOutputStream(out);
37 | for(File file:dir.listFiles() ) {
38 | try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file))) {
39 | try{
40 | //When the last argument is true, copyBytes also closes the input and output streams
41 | IOUtils.copyBytes(in, codecout, 4096, false);
42 | }finally {
43 | IOUtils.closeStream(in);
44 | }
45 | } catch (FileNotFoundException e) {
46 | e.printStackTrace();
47 | } catch (IOException e) {
48 | e.printStackTrace();
49 | }
50 | }
51 | codecout.close();   //finish the compression stream so the LZO trailer is written
52 | out.close();
53 | }
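// Note (environment assumption, not stated in this file): LzopCodec is not bundled with vanilla
// Hadoop; it comes from the hadoop-lzo project and needs the native LZO libraries available at
// runtime, otherwise Class.forName / createOutputStream above fails.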
54 |
55 | //Decompress
56 | public void decoder() throws IOException {
57 | fs = FileSystem.get(conf);
58 | CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
59 | //The codec factory infers the compression codec from the file name suffix
60 | Path path = new Path("E:\\DATA\\PUBLIC\\NOCE\\school.gz");
61 | CompressionCodec codec = factory.getCodec(path);
62 | try(FSDataInputStream inputStream = fs.open(path,8096)){
63 | //Create a CompressionInputStream to decompress the file
64 | CompressionInputStream comInputStream = codec.createInputStream(inputStream);
65 | //Write the decompressed data to school5.txt
66 | FSDataOutputStream out = fs.create(new Path("E:\\DATA\\PUBLIC\\NOCE\\school5.txt"));
67 | IOUtils.copyBytes(comInputStream,out,4096,false);
68 | comInputStream.close();
69 | out.close();
70 | } catch (IOException e) {
71 | e.printStackTrace();
72 | }
73 | }
74 |
75 | public static void main(String[] args) throws IOException, ClassNotFoundException {
76 | String path = "E:\\DATA\\PUBLIC\\NOCE\\sch";
77 | HDFSCompressionCodec codec = new HDFSCompressionCodec();
78 | codec.coder(path);
79 | codec.decoder();
80 | Integer i = 0;
81 | Integer o = 2;
82 | i.equals(o);
83 | Integer ii =i + o;
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/HDFS/LzoCompress.java:
--------------------------------------------------------------------------------
1 | package com.mobin.HDFS;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.fs.FileSystem;
5 | import org.apache.hadoop.fs.Path;
6 |
7 | import java.io.IOException;
8 |
9 | /**
10 | * Created by Mobin on 2017/2/4.
11 | */
12 | public class LzoCompress {
13 | public static void main(String[] args) {
14 | Configuration conf = new Configuration();
15 | try {
16 | FileSystem fs = FileSystem.get(conf);
17 |
18 |
19 | } catch (IOException e) {
20 | e.printStackTrace();
21 | }
22 | }
23 |
24 |
25 |
26 | // public void LzoCoder(){
27 | // try(){
28 | //
29 | // }
30 | // }
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/HDFS/WriteToHDFS.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.HDFS
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | import scala.util.Random
6 |
7 | /**
8 | * Created by hadoop on 3/6/16.
9 | * Appends a random value to the first field of each line in the file and then writes the result to HDFS
10 | */
11 | object WriteToHDFS {
12 | def main(args: Array[String]) {
13 | if (args.length < 2) {
14 | System.err.println("Usage: WriteToHDFS <input> <output>\n")
15 | System.exit(1)
16 | }
17 | val conf = new SparkConf().setMaster("local").setAppName("WriteToHDFS")
18 | val sc = new SparkContext(conf)
19 | val sgfile = sc.textFile(args(0))
20 |
21 | val rdd = sgfile.map(lines => {
22 | val line = lines.split("\\s")
23 | if(line.length == 6){
24 | val one = line(0) +"-"+ new Random().nextInt()
25 | one+","+line(1)+","+line(2).getBytes+","+line(3)+","+line(4)+","+line(5)
26 | }else //如果这样写 一定不能只写if语句 还要加上else语句,否则没有通过if的,将被视了() 否则后期通过Phoenix导入到HBase中会因为字段不合法而报错
27 | "mobin1"+","+"mobin2"+","+"mobin3"+"mobin4"+","+"mobin5"+","+"mobin6"
28 | })
29 | rdd.saveAsTextFile(args(1))
30 | sc.stop()
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Kafka/KStream/KStreamDemo.java:
--------------------------------------------------------------------------------
1 | package com.mobin.Kafka.KStream;
2 |
3 | import org.apache.kafka.clients.consumer.ConsumerConfig;
4 | import org.apache.kafka.common.serialization.Serdes;
5 | import org.apache.kafka.streams.KafkaStreams;
6 | import org.apache.kafka.streams.StreamsBuilder;
7 | import org.apache.kafka.streams.StreamsConfig;
8 | import org.apache.kafka.streams.kstream.ForeachAction;
9 | import org.apache.kafka.streams.kstream.KStream;
10 | import org.apache.kafka.streams.kstream.KTable;
11 | import org.apache.kafka.streams.kstream.Printed;
12 |
13 | import java.util.Properties;
14 |
15 | /**
16 | * Created with IDEA
17 | * Creater: MOBIN
18 | * Date: 2018/8/19
19 | * Time: 3:41 PM
20 | */
21 | public class KStreamDemo {
22 | private static final String APPLICATION_ID_CONFIG = "KStream-test";
23 | private static final String BROKER_LIST = "localhost:9092";
24 | private static final String TOPIC = "streams-foo";
25 | private static StreamsBuilder streamsBuilder;
26 | private static KStream<String, String> textLine;
27 |
28 | public static Properties initProperties(){
29 | Properties properties = new Properties();
30 | properties.put(StreamsConfig.APPLICATION_ID_CONFIG, APPLICATION_ID_CONFIG);
31 | properties.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, BROKER_LIST);
32 | properties.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
33 | properties.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
34 | properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
35 | return properties;
36 | }
37 |
38 | public static void printf() throws InterruptedException {
39 | Properties properties = initProperties();
40 | streamsBuilder = new StreamsBuilder();
41 | textLine = streamsBuilder.stream(TOPIC);
42 | textLine.foreach(new ForeachAction<String, String>() {
43 | @Override
44 | public void apply(String key, String value) {
45 | System.out.println(key + ":" + value);
46 | }
47 | });
48 | KafkaStreams streams = new KafkaStreams(streamsBuilder.build(), properties);
49 | streams.start();
50 | Thread.sleep(5000L);
51 | streams.close();
52 | }
53 |
54 | public static void main(String[] args) throws InterruptedException {
55 | KStreamDemo.printf();
56 | }
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Kafka/Partition/StockPartitionor.java:
--------------------------------------------------------------------------------
1 | package com.mobin.Kafka.Partition;
2 |
3 | import org.apache.kafka.clients.producer.Partitioner;
4 | import org.apache.kafka.common.Cluster;
5 |
6 | import java.util.Map;
7 |
8 | /**
9 | * Created with IDEA
10 | * Creater: MOBIN
11 | * Date: 2018/8/16
12 | * Time: 4:47 PM
13 | * After implementing the custom partitioner, register it in the producer configuration:
14 | * properties.put(ProducerConfig.PARTITIONER_CLASS_CONFIG, StockPartitionor.class.getName());
15 | */
16 | public class StockPartitionor implements Partitioner{
17 | //Number of partitions
18 | private static final Integer PARTITIONS = 6;
19 | @Override
20 | public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
21 | if (key == null){
22 | return 0;
23 | }
24 | String stockcode = String.valueOf(key);
25 | try {
26 | int partitionID = Integer.valueOf(stockcode.substring(stockcode.length() - 2)) % PARTITIONS;
27 | return partitionID;
28 | }catch (NumberFormatException e){
29 | return 0;
30 | }
31 | }
32 |
33 | @Override
34 | public void close() {
35 |
36 | }
37 |
38 | @Override
39 | public void configure(Map map) {
40 |
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Kafka/Producers/KafkaProducerThread.java:
--------------------------------------------------------------------------------
1 | package com.mobin.Kafka.Producers;
2 |
3 | import org.apache.kafka.clients.producer.*;
4 | import org.apache.kafka.common.serialization.StringSerializer;
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 |
9 | import java.text.DecimalFormat;
10 | import java.util.Properties;
11 | import java.util.Random;
12 | import java.util.concurrent.ExecutorService;
13 | import java.util.concurrent.Executors;
14 |
15 | /**
16 | * Created with IDEA
17 | * Creater: MOBIN
18 | * Date: 2018/8/13
19 | * Time: 3:08 PM
20 | */
21 | public class KafkaProducerThread implements Runnable {
22 | private static final int MSG_SIZE = 100;
23 | private static final String TOPIC = "stock-quotation4";
24 | private static final String BROKER_LIST = "localhost:9092";
25 | private static final Logger log = LoggerFactory.getLogger(KafkaProducerThread.class);
26 | private static KafkaProducer producer = null;
27 | private ProducerRecord record = null;
28 |
29 | public KafkaProducerThread(KafkaProducer producer, ProducerRecord record) {
30 | this.producer = producer;
31 | this.record = record;
32 | }
33 |
34 | @Override
35 | public void run() {
36 | System.out.println(producer + record.toString());
37 | producer.send(record, new Callback() {
38 |
39 | @Override
40 | public void onCompletion(RecordMetadata recordMetadata, Exception e) {
41 | System.out.println("00000");
42 | producer.send(record, new Callback() {
43 | @Override
44 | public void onCompletion(RecordMetadata recordMetadata, Exception e) {
45 | if (null != e) { //the send failed
46 | log.error("Exception while sending message...");
47 | }
48 | if (null != recordMetadata) {
49 | log.info(String.format("offset:%s, partition:%s", recordMetadata.offset(), recordMetadata.partition()));
50 | }
51 | }
52 | });
53 | }
54 | });
55 | }
56 |
57 | private static StockQuotationInfo createQuotationInfo() {
58 | StockQuotationInfo quotationInfo = new StockQuotationInfo();
59 | Random random = new Random();
60 | Integer stockCode = 600100 + random.nextInt();
61 | float r = (float) Math.random();
62 | if (r / 2 < 0.5) {
63 | r = -r;
64 | }
65 | DecimalFormat decimalFormat = new DecimalFormat(".00");
66 | quotationInfo.setCurrentPrice(Float.valueOf(decimalFormat.format(11 + r)));
67 | quotationInfo.setPreClosePrice(11.80f);
68 | quotationInfo.setOpenPrice(11.5f);
69 | quotationInfo.setLowPrice(10.5f);
70 | quotationInfo.setHighPrice(12.5f);
71 | quotationInfo.setStockCode(stockCode.toString());
72 | quotationInfo.setTradeTime(System.currentTimeMillis());
73 | quotationInfo.setStockName("股票-" + stockCode);
74 | return quotationInfo;
75 | }
76 |
77 | public static Properties initConfig() {
78 | Properties properties = new Properties();
79 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BROKER_LIST);
80 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
81 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
82 | return properties;
83 | }
84 |
85 | public static void main(String[] args) {
86 | Properties configs = initConfig();
87 | KafkaProducer producer = new KafkaProducer(configs);
88 | ProducerRecord record;
89 | StockQuotationInfo quotationInfo;
90 | ExecutorService executorService = Executors.newFixedThreadPool(10);
91 | long current = System.currentTimeMillis();
92 | try {
93 | for (int i = 0; i < MSG_SIZE; i++) {
94 | quotationInfo = createQuotationInfo();
95 | record = new ProducerRecord(TOPIC, null, quotationInfo.getTradeTime(),
96 | quotationInfo.getStockCode(), quotationInfo.toString());
97 | executorService.submit(new KafkaProducerThread(producer, record));
98 | }
99 | } catch (Exception e) {
100 | System.out.println("-------");
101 | } finally {
102 | producer.close();
103 | executorService.shutdown();
104 | }
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Kafka/Producers/QuotationProducer.java:
--------------------------------------------------------------------------------
1 | package com.mobin.Kafka.Producers;
2 |
3 |
4 | import org.apache.kafka.clients.producer.*;
5 | import org.apache.kafka.common.serialization.StringSerializer;
6 | import org.slf4j.Logger;
7 | import org.slf4j.LoggerFactory;
8 |
9 | import java.text.DecimalFormat;
10 | import java.util.Properties;
11 | import java.util.Random;
12 |
13 | /**
14 | * Created with IDEA
15 | * Creater: MOBIN
16 | * Date: 2018/8/13
17 | * Time: 11:24 AM
18 | */
19 | public class QuotationProducer {
20 | private static final Logger log = LoggerFactory.getLogger(QuotationProducer.class);
21 | private static final int MSG_SIZE = 100;
22 | private static final String TOPIC = "stock-quotation";
23 | private static final String BROKER_LIST = "localhost:9092";
24 | private static KafkaProducer producer = null;
25 | static {
26 | Properties configs = initConfig();
27 | producer = new KafkaProducer(configs);
28 | }
29 |
30 | public static Properties initConfig(){
31 | Properties properties = new Properties();
32 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BROKER_LIST);
33 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
34 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
35 | return properties;
36 | }
37 |
38 | private static StockQuotationInfo createQuotationInfo(){
39 | StockQuotationInfo quotationInfo = new StockQuotationInfo();
40 | Random random = new Random();
41 | Integer stockCode = 600100 + random.nextInt();
42 | float r = (float) Math.random();
43 | if (r / 2 < 0.5){
44 | r = -r;
45 | }
46 | DecimalFormat decimalFormat = new DecimalFormat(".00");
47 | quotationInfo.setCurrentPrice(Float.valueOf(decimalFormat.format(11 + r)));
48 | quotationInfo.setPreClosePrice(11.80f);
49 | quotationInfo.setOpenPrice(11.5f);
50 | quotationInfo.setLowPrice(10.5f);
51 | quotationInfo.setHighPrice(12.5f);
52 | quotationInfo.setStockCode(stockCode.toString());
53 | quotationInfo.setTradeTime(System.currentTimeMillis());
54 | quotationInfo.setStockName("股票-" + stockCode);
55 | return quotationInfo;
56 | }
57 |
58 | public static void main(String[] args) {
59 | ProducerRecord record = null;
60 | StockQuotationInfo quotationInfo = null;
61 | try {
62 | int num = 0;
63 | for (int i = 0; i < MSG_SIZE; i ++){
64 | quotationInfo = createQuotationInfo();
65 | record = new ProducerRecord(TOPIC,null, quotationInfo.getTradeTime(),quotationInfo.getStockCode()
66 | ,quotationInfo.toString());
67 | producer.send(record);
68 | //Asynchronous send: pass a Callback and implement onCompletion
69 | // producer.send(record, new Callback() {
70 | // @Override
71 | // public void onCompletion(RecordMetadata recordMetadata, Exception e) {
72 | // if (null != e){ //the send failed
73 | // log.error("Exception while sending message...");
74 | // }
75 | // if (null != recordMetadata){
76 | // log.info(String.format("offset:%s, partition:%s", recordMetadata.offset(), recordMetadata.partition()));
77 | // }
78 | // }
79 | // });
80 | if (num++ % 10 == 0){
81 | Thread.sleep(2000L);
82 | }
83 | }
84 | }catch (InterruptedException e){
85 |
86 | }finally {
87 | producer.close();
88 | }
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Kafka/Producers/StockQuotationInfo.java:
--------------------------------------------------------------------------------
1 | package com.mobin.Kafka.Producers;
2 |
3 | import java.io.Serializable;
4 |
5 | /**
6 | * Created with IDEA
7 | * Creater: MOBIN
8 | * Date: 2018/8/13
9 | * Time: 11:19 AM
10 | */
11 | public class StockQuotationInfo implements Serializable{
12 | private static final long serialVersionUID = 1L;
13 | private String stockCode;
14 | private String stockName;
15 | private long tradeTime;
16 | private float preClosePrice;
17 | private float openPrice;
18 | private float currentPrice;
19 | private float highPrice;
20 | private float lowPrice;
21 |
22 | public static long getSerialVersionUID() {
23 | return serialVersionUID;
24 | }
25 |
26 | public String getStockCode() {
27 | return stockCode;
28 | }
29 |
30 | public void setStockCode(String stockCode) {
31 | this.stockCode = stockCode;
32 | }
33 |
34 | public String getStockName() {
35 | return stockName;
36 | }
37 |
38 | public void setStockName(String stockName) {
39 | this.stockName = stockName;
40 | }
41 |
42 | public long getTradeTime() {
43 | return tradeTime;
44 | }
45 |
46 | public void setTradeTime(long tradeTime) {
47 | this.tradeTime = tradeTime;
48 | }
49 |
50 | public float getPreClosePrice() {
51 | return preClosePrice;
52 | }
53 |
54 | public void setPreClosePrice(float preClosePrice) {
55 | this.preClosePrice = preClosePrice;
56 | }
57 |
58 | public float getOpenPrice() {
59 | return openPrice;
60 | }
61 |
62 | public void setOpenPrice(float openPrice) {
63 | this.openPrice = openPrice;
64 | }
65 |
66 | public float getCurrentPrice() {
67 | return currentPrice;
68 | }
69 |
70 | public void setCurrentPrice(float currentPrice) {
71 | this.currentPrice = currentPrice;
72 | }
73 |
74 | public float getHighPrice() {
75 | return highPrice;
76 | }
77 |
78 | public void setHighPrice(float highPrice) {
79 | this.highPrice = highPrice;
80 | }
81 |
82 | public float getLowPrice() {
83 | return lowPrice;
84 | }
85 |
86 | public void setLowPrice(float lowPrice) {
87 | this.lowPrice = lowPrice;
88 | }
89 |
90 | @Override
91 | public String toString() {
92 | return stockCode + "|" +stockName+ "|" +tradeTime+ "|" +preClosePrice+ "|" +openPrice
93 | + "|" +currentPrice+ "|" +highPrice+ "|" +lowPrice;
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Kafka/Topic.java:
--------------------------------------------------------------------------------
1 | package com.mobin.Kafka;
2 |
3 |
4 |
5 | import org.apache.kafka.clients.admin.*;
6 | import org.apache.kafka.common.KafkaFuture;
7 | import org.apache.kafka.common.config.ConfigResource;
8 |
9 | import java.util.*;
10 | import java.util.concurrent.ExecutionException;
11 |
12 | /**
13 | * Created with IDEA
14 | * Creater: MOBIN
15 | * Date: 2018/8/12
16 | * Time: 5:18 PM
17 | */
18 | public class Topic {
19 | private static final String ZK_CONNECT = "localhost:2181";
20 | //ZooKeeper session timeout
21 | private static final int SESSION_TIMEOUT = 30000;
22 | //Connection timeout
23 | private static final int CONNECT_TIMEOUT = 30000;
24 |
25 | public static void createTopic(AdminClient adminClient,String topic, int partition, short replica, Properties conf){
26 |
27 | Map<String, String> configs = new HashMap<>();
28 | try {
29 | CreateTopicsResult result = adminClient.createTopics(Arrays.asList(new NewTopic(topic, partition, replica).configs(configs)));
30 | }catch (Exception e){
31 |
32 | }finally {
33 | adminClient.close();
34 | }
35 | }
36 |
37 | public static void deleteTopic(AdminClient adminClient,String topic, Properties conf){
38 |
39 | KafkaFuture future = adminClient.deleteTopics(Arrays.asList(topic)).all();
40 | try {
41 | future.get();
42 | } catch (InterruptedException e) {
43 | e.printStackTrace();
44 | } catch (ExecutionException e) {
45 | e.printStackTrace();
46 | }
47 | }
48 |
49 | public static void updateTopicConfig(AdminClient adminClient, String topic) throws ExecutionException, InterruptedException {
50 | Config config = new Config(Arrays.asList(new ConfigEntry("max.message.bytes","404800")));
51 | adminClient.alterConfigs(Collections.singletonMap(new ConfigResource(ConfigResource.Type.TOPIC, topic), config)).all().get();
52 | }
53 |
54 | public static void showTopic(AdminClient adminClient, String topic) throws ExecutionException, InterruptedException {
55 | DescribeTopicsResult topicsResult = adminClient.describeTopics(Arrays.asList(topic));
56 | Map map = topicsResult.all().get();
57 | for (Map.Entry entry: map.entrySet()){
58 | System.out.println(entry.getKey() + " : " + entry.getValue());
59 | }
60 |
61 | }
62 |
63 | //List all topics
64 | public static void showAllTopic(AdminClient adminClient) throws ExecutionException, InterruptedException {
65 | ListTopicsOptions options = new ListTopicsOptions();
66 | options.listInternal(true);
67 | ListTopicsResult result = adminClient.listTopics(options);
68 | Set topicName = result.names().get();
69 | System.out.println(topicName);
70 | }
71 |
72 | public static void main(String[] args) throws ExecutionException, InterruptedException {
73 | String TOPIC = "APITopic";
74 | Properties conf = new Properties();
75 | conf.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
76 | AdminClient adminClient = AdminClient.create(conf);
77 | // Topic.createTopic(adminClient,"APITopic",1, (short) 1, conf);
78 | // Topic.deleteTopic(adminClient, TOPIC ,conf);
79 | // Topic.updateTopicConfig(adminClient, TOPIC);
80 | // Topic.showTopic(adminClient, TOPIC);
81 | Topic.showAllTopic(adminClient);
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Kafka/consumers/KafkaConsumerThread.java:
--------------------------------------------------------------------------------
1 | package com.mobin.Kafka.consumers;
2 |
3 | import org.apache.kafka.clients.consumer.ConsumerRecord;
4 | import org.apache.kafka.clients.consumer.ConsumerRecords;
5 | import org.apache.kafka.clients.consumer.KafkaConsumer;
6 |
7 | import java.util.Arrays;
8 | import java.util.Properties;
9 |
10 | /**
11 | * Created with IDEA
12 | * Creater: MOBIN
13 | * Date: 2018/8/16
14 | * Time: 3:50 PM
15 | * Six consumer threads consuming the same topic
16 | */
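// General consumer-group behaviour worth keeping in mind (not configured in this file): within one
// group each partition is assigned to at most one consumer, so the subscribed topic needs at least
// six partitions for all six threads below to receive records; any surplus threads stay idle.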
17 | public class KafkaConsumerThread extends Thread {
18 | //Each thread owns its own private KafkaConsumer instance (KafkaConsumer is not thread-safe)
19 | private KafkaConsumer consumer;
20 |
21 | public KafkaConsumerThread(Properties consumerConfig, String topic) {
22 | this.consumer = new KafkaConsumer(consumerConfig);
23 | consumer.subscribe(Arrays.asList(topic));
24 | }
25 |
26 | @Override
27 | public void run() {
28 | try {
29 | while (true) {
30 | ConsumerRecords<String, String> records = consumer.poll(1000);
31 | for (ConsumerRecord<String, String> record : records) {
32 | System.out.printf("partition = %d, offset = %d, key = %s value = %s%n",
33 | record.partition(), record.offset(), record.key(), record.value());
34 | }
35 | }
36 | } catch (Exception e) {
37 | e.printStackTrace();
38 | } finally {
39 | consumer.close();
40 | }
41 | }
42 |
43 | public static void main(String[] args) {
44 | Properties properties = new Properties();
45 | properties.put("bootstrap.servers", "localhost:9092");
46 | properties.put("group.id", "test");
47 | properties.put("enable.auto.commit", true);
48 | properties.put("auto.commit.interval.ms", 1000);//设置偏移量提交时间
49 | properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
50 | properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
51 | for (int i = 0; i < 6; i ++){
52 | new KafkaConsumerThread(properties, "stock-quotation").start();
53 | }
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Kafka/consumers/QuotationConsumer.java:
--------------------------------------------------------------------------------
1 | package com.mobin.Kafka.consumers;
2 |
3 | import org.apache.kafka.clients.consumer.ConsumerRecord;
4 | import org.apache.kafka.clients.consumer.ConsumerRecords;
5 | import org.apache.kafka.clients.consumer.KafkaConsumer;
6 |
7 | import java.util.Arrays;
8 | import java.util.Properties;
9 |
10 | /**
11 | * Created with IDEA
12 | * Creater: MOBIN
13 | * Date: 2018/8/14
14 | * Time: 3:40 PM
15 | */
16 | public class QuotationConsumer {
17 | private static final String BROKERS_LIST = "localhost:9092";
18 | private static final String GROUP_ID = "test";
19 | private static final String CLIENT_ID = "test";
20 | private static final String TOPIC = "stock-quotation";
21 | private static KafkaConsumer consumer;
22 |
23 | static {
24 | Properties properties = initPorerties();
25 | consumer = new KafkaConsumer(properties);
26 | }
27 |
28 | public static Properties initPorerties(){
29 | Properties properties = new Properties();
30 | properties.put("bootstrap.servers", BROKERS_LIST);
31 | properties.put("group.id", GROUP_ID);
32 | properties.put("client.id", CLIENT_ID);
33 | properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
34 | properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
35 | return properties;
36 | }
37 |
38 | public static void poll(){
39 | consumer.subscribe(Arrays.asList(TOPIC));
40 | try {
41 | while (true){
42 | ConsumerRecords<String, String> records = consumer.poll(1000);
43 | for (ConsumerRecord<String, String> record: records){
44 | System.out.printf("partition = %d, offset = %d, key = %s value = %s%n",
45 | record.partition(), record.offset(), record.key(), record.value());
46 | }
47 | }
48 | }catch (Exception e){
49 |
50 | }finally {
51 | consumer.close();
52 | }
53 | }
54 |
55 | public static void main(String[] args) {
56 | QuotationConsumer.poll();
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/Kafka/consumers/QuotationConsumerManualCommit.java:
--------------------------------------------------------------------------------
1 | package com.mobin.Kafka.consumers;
2 |
3 | import org.apache.kafka.clients.consumer.*;
4 | import org.apache.kafka.common.TopicPartition;
5 |
6 |
7 | import java.util.Arrays;
8 | import java.util.Collection;
9 | import java.util.Map;
10 | import java.util.Properties;
11 |
12 | /**
13 | * Created with IDEA
14 | * Creater: MOBIN
15 | * Date: 2018/8/14
16 | * Time: 3:40 PM
17 | * Commits offsets once every 10 processed messages
18 | */
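// A common hedge when using commitAsync as below: commitAsync does not retry failed commits, so a
// frequent pattern (not implemented here) is to call consumer.commitSync() once in a finally block
// before consumer.close(), so the last processed offsets are persisted.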
19 | public class QuotationConsumerManualCommit {
20 | private static final String BROKERS_LIST = "localhost:9092";
21 | private static final String GROUP_ID = "test";
22 | private static final String CLIENT_ID = "test";
23 | private static final String TOPIC = "stock-quotation";
24 | private static KafkaConsumer consumer;
25 |
26 | static {
27 | Properties properties = initPorerties();
28 | consumer = new KafkaConsumer(properties);
29 | }
30 |
31 | public static Properties initPorerties(){
32 | Properties properties = new Properties();
33 | properties.put("bootstrap.servers", BROKERS_LIST);
34 | properties.put("group.id", GROUP_ID);
35 | properties.put("client.id", CLIENT_ID);
36 | properties.put("fetch.max.bytes", 1024); //设置一次fetch请求取得的数据最大值为1kb,默认为5MB,这里是为了方便测试
37 | properties.put("enable.auto.commit", false); //手动提交偏移量
38 | properties.put("client.id", CLIENT_ID);
39 | properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
40 | properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
41 | return properties;
42 | }
43 |
44 | public static void poll(){
45 | consumer.subscribe(Arrays.asList(TOPIC,"stock-quotation1"), new ConsumerRebalanceListener() {
46 | @Override
47 | public void onPartitionsRevoked(Collection collection) {
48 | }
49 |
50 | @Override
51 | public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
52 | long committedOffset = -1;
53 | for (TopicPartition topicPartition: partitions){
54 | // System.out.println(consumer.committed(topicPartition));
55 | // committedOffset = consumer.committed(topicPartition).offset();
56 | // System.out.println("当前"+topicPartition+"偏移量:"+committedOffset);
57 | consumer.seekToBeginning(partitions);
58 | }
59 | }
60 | });
61 | try {
62 | int minCommitSize = 10;//commit only after at least 10 messages have been processed
63 | int count = 0; //message counter
64 | while (true){
65 | ConsumerRecords<String, String> records = consumer.poll(1000);
66 | for (ConsumerRecord<String, String> record: records){
67 | System.out.printf("topic = %s, partition = %d, offset = %d, key = %s value = %s%n",
68 | record.topic(),record.partition(), record.offset(), record.key(), record.value());
69 | count ++;
70 | }
71 | if (count >= minCommitSize) {
72 | consumer.commitAsync(new OffsetCommitCallback() {
73 | @Override
74 | public void onComplete(Map map, Exception e) {
75 | if (null == e){
76 | System.out.println("提交成功");
77 | }else {
78 | System.out.println("提交发生了异常");
79 | }
80 | }
81 | });
82 | count = 0;
83 | }
84 | }
85 | }catch (Exception e){
86 | e.printStackTrace();
87 | }finally {
88 | consumer.close();
89 | }
90 | }
91 |
92 | public static void main(String[] args) {
93 | QuotationConsumerManualCommit.poll();
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/Action/Aggregate.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.Action
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by hadoop on 4/20/16.
7 | * seqOp aggregates the elements of each partition into a value of type U; combOp then merges the per-partition U values into a single value of type U
8 | */
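// Signature sketch of the call used below (from the RDD API):
//   def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U
// zeroValue is used once per partition by seqOp and once more when combOp merges the partition
// results, which is how the example below arrives at 90 (see the step-by-step note at the bottom of this file).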
9 | object Aggregate {
10 |
11 | def main(args: Array[String]) {
12 | val conf = new SparkConf().setMaster("local").setAppName("Fold")
13 | val sc = new SparkContext(conf)
14 | val rdd = sc.parallelize(List(1,2,3,4),2)
15 | val aggregateRDD = rdd.aggregate(2)(_+_,_ * _)
16 | println(aggregateRDD)
17 | sc.stop
18 | }
19 |
20 | /**
21 | * Step 1: partition 1: zeroValue+1+2 = 5   partition 2: zeroValue+3+4 = 9
22 |
23 | Step 2: zeroValue * (result of partition 1) * (result of partition 2) = 2*5*9 = 90
24 | */
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/Action/Fold.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.Action
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by hadoop on 4/20/16.
7 | * Aggregates the elements within each partition with op and then merges the per-partition results, also with op; op takes two arguments, and on the very first call its first argument is zeroValue. T is the element type of the RDD
8 | */
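// Signature sketch of the call used below (from the RDD API):
//   def fold(zeroValue: T)(op: (T, T) => T): T
// As with aggregate, zeroValue participates once per partition and once more when the per-partition
// results are merged, so with several partitions ("d", 0) is compared more than once.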
9 | object Fold {
10 |
11 | def main(args: Array[String]) {
12 | val conf = new SparkConf().setMaster("local").setAppName("Fold")
13 | val sc = new SparkContext(conf)
14 | val rdd = sc.parallelize(Array(("a", 1), ("b", 2), ("a", 2), ("c", 5), ("a", 3)), 2)
15 | val foldRDD = rdd.fold(("d", 0))((val1, val2) => {
16 | if (val1._2 >= val2._2) val1 else val2
17 | })
18 | println(foldRDD)
19 | sc.stop
20 | }
21 |
22 | /**
23 | * 1. Initially ("d",0) is passed as op's first argument and the first element of the Array, ("a",1), as its second;
24 | * the values are compared and the element with the larger value is returned
25 |
26 | * 2. The element returned by the previous step becomes op's first argument and the next element of the Array its second, and they are compared again
27 |
28 | * 3. Step 2 is repeated until all elements have been consumed
29 | */
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/Action/Func.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.Action
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by hadoop on 4/19/16.
7 | * reduce(func): first aggregates the elements within each partition with func, then aggregates the partition results. func takes two arguments and returns a new value,
8 | * and the new value is passed back into func together with the next element, until the last element has been consumed
9 | */
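// Signature sketch of reduce (from the RDD API): def reduce(f: (T, T) => T): T
// Unlike fold/aggregate there is no zeroValue, and f should be commutative and associative because
// the per-partition results are combined in no guaranteed order (hence the subtraction example
// below only has a predictable result with a single partition).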
10 | object Func {
11 |
12 | def main(args: Array[String]) {
13 | val conf = new SparkConf().setMaster("local").setAppName("reduce")
14 | val sc = new SparkContext(conf)
15 | val rdd = sc.parallelize(1 to 10,2)
16 | val reduceRDD = rdd.reduce(_ + _)
17 | val reduceRDD1 = rdd.reduce(_ - _) //with a single partition the result is -53
18 | val countRDD = rdd.count()
19 | val firstRDD = rdd.first()
20 | val takeRDD = rdd.take(5)
21 | val topRDD = rdd.top(3)
22 | val takeOrderedRDD = rdd.takeOrdered(3)
23 | println("func +: "+reduceRDD)
24 | println("func -: "+reduceRDD1)
25 | println("count: "+countRDD)
26 | println("first: "+firstRDD)
27 | println("take:")
28 | takeRDD.foreach(x => print(x +" "))
29 | println("\ntop:")
30 | topRDD.foreach(x => print(x +" "))
31 | println("\ntakeOrdered:")
32 | takeOrderedRDD.foreach(x => print(x +" "))
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/Action/KVFunc.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.Action
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | import scala.concurrent.Future
6 |
7 | /**
8 | * Created by hadoop on 4/19/16.
9 | */
10 | object KVFunc {
11 |
12 | def main(args: Array[String]) {
13 | val conf = new SparkConf().setMaster("local").setAppName("KVFunc")
14 | val sc = new SparkContext(conf)
15 | val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3))
16 | val rdd = sc.parallelize(arr,2)
17 | val countByKeyRDD = rdd.countByKey()
18 | val collectAsMapRDD = rdd.collectAsMap()
19 | val lookupRDD = rdd.lookup("A")
20 | println("countByKey:")
21 | countByKeyRDD.foreach(print)
22 | println("\ncollectAsMap:")
23 | collectAsMapRDD.foreach(print)
24 | println("\nlookup:")
25 | lookupRDD.foreach(x => print(x))
26 | sc.stop
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Cartesian.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by hadoop on 4/8/16.
7 | * Performs a Cartesian product over all elements of the two RDDs
8 | */
9 | object Cartesian {
10 |
11 | def main(args: Array[String]) {
12 | val conf = new SparkConf().setMaster("local").setAppName("map")
13 | val sc = new SparkContext(conf)
14 |
15 | val rdd1 = sc.parallelize(1 to 3)
16 | val rdd2 = sc.parallelize(2 to 5)
17 | val cartesianRDD = rdd1.cartesian(rdd2)
18 |
19 | cartesianRDD.foreach(x => println(x + " "))
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Coalesce.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by hadoop on 4/9/16.
7 | * Repartitions the RDD. shuffle defaults to false; when shuffle=false the number of partitions cannot be increased,
8 | but no error is raised, the partition count simply stays what it was
9 | */
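// Related shorthand from the same API: repartition(n) is simply coalesce(n, shuffle = true), so it
// can both increase and decrease the number of partitions (at the cost of a shuffle).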
10 | object Coalesce {
11 |
12 | def main(args: Array[String]) {
13 | val conf = new SparkConf().setMaster("local").setAppName("map")
14 | val sc = new SparkContext(conf)
15 | val rdd = sc.parallelize(1 to 16,4)
16 | rdd.foreachPartition(iter => print(iter.toList+ " | "))
17 | val coalesceRDD = rdd.coalesce(3) //when shuffle is false the partition count cannot be increased (e.g. not from 5 to 7)
18 | // val coalesceRDD = rdd.coalesce(5,true)
19 | println("重新分区后的分区个数:"+coalesceRDD.partitions.size)
20 | println("RDD依赖关系:"+coalesceRDD.toDebugString)
21 | coalesceRDD.foreachPartition(iter => print(iter.toList+ " | "))
22 | sc.stop
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Distinct.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by hadoop on 4/8/16.
7 | * Removes duplicate elements from the RDD
8 | */
9 | object Distinct {
10 |
11 | def main(args: Array[String]) {
12 | val conf = new SparkConf().setMaster("local").setAppName("map")
13 | val sc = new SparkContext(conf)
14 | val list = List(1,1,2,5,2,9,6,1)
15 | val distinctRDD = sc.parallelize(list)
16 | val unionRDD = distinctRDD.distinct() //union intersection
17 | unionRDD.collect.foreach(x => print(x + " "))
18 | sc.stop()
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/FlatMap.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by hadoop on 4/7/16.
7 | * Similar to map, but each input element may be mapped to zero or more output elements; the results are then "flattened" before being returned
8 | */
9 | object FlatMap {
10 |
11 | def main(args: Array[String]) {
12 | val conf = new SparkConf().setMaster("local").setAppName("flatmap")
13 | val sc = new SparkContext(conf)
14 | val rdd = sc.parallelize(1 to 5)
15 | val fm = rdd.flatMap(x => (1 to x))
16 | fm.foreach( x => print(x + " "))
17 | sc.stop()
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Glom.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by hadoop on 4/9/16.
7 | * Converts the elements of type T in each partition of the RDD into an array Array[T]
8 | */
9 | object Glom {
10 |
11 | def main(args: Array[String]) {
12 | val conf = new SparkConf().setMaster("local").setAppName("map")
13 | val sc = new SparkContext(conf)
14 | val rdd = sc.parallelize(1 to 16,4)
15 | val glomRDD = rdd.glom() //RDD[Array[T]]
16 | glomRDD.foreach(rdd => println(rdd.getClass.getSimpleName))
17 | sc.stop
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/MakeRDD.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by Mobin on 2017/7/28.
7 | */
8 | object MakeRDD {
9 | def main(args: Array[String]) {
10 | val conf = new SparkConf().setMaster("local").setAppName("makeRDD")
11 | val sc = new SparkContext(conf)
12 | val collection = Seq((1 to 10, Seq("master","slave1")),
13 | (11 to 15, Seq("slave2","slave3")))
14 | var rdd = sc.makeRDD(collection)
15 | println(rdd.partitions.size)
16 | println(rdd.preferredLocations(rdd.partitions(0)))
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Map.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by hadoop on 4/7/16.
7 | * Each element of the dataset is transformed by a user-defined function to form a new RDD, called a MappedRDD
8 | */
9 | object Map {
10 |
11 | def main(args: Array[String]) {
12 | val conf = new SparkConf().setMaster("local").setAppName("map")
13 | val sc = new SparkContext(conf)
14 | val rdd = sc.parallelize(1 to 10) //create the RDD
15 | val map = rdd.map(_*2) //multiply every element of the RDD by 2
16 | map.foreach(x => print(x+" "))
17 | sc.stop()
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/MapPartitions.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by hadoop on 4/7/16.
7 | * mapPartitions: similar to map; map is applied to every element of every partition, whereas mapPartitions is applied once per partition
8 | *
9 | * mapPartitionsWithIndex: obtained by removing the [mapPartitionsWithIndex] comment markers below;
10 | * it is like mapPartitions, except the function receives an extra partition-index parameter
11 | */
12 | object MapPartitions {
13 | // Print the names of the female entries:
14 | def partitionsFun(/*[mapPartitionsWithIndex] index : Int,*/iter : Iterator[(String,String)]) : Iterator[String] = {
15 | var woman = List[String]()
16 | while (iter.hasNext){
17 | val next = iter.next()
18 | next match {
19 | case (_,"female") => woman = /*[mapPartitionsWithIndex]"["+index+"]"+*/next._1 :: woman
20 | //case (_,"female") => woman = next._1.toList .:: (woman)   wrong way to write it
21 | case _ =>
22 | }
23 | }
24 | return woman.iterator
25 | }
26 |
27 |
28 | def main(args: Array[String]) {
29 | val conf = new SparkConf().setMaster("local").setAppName("flatmap")
30 | val sc = new SparkContext(conf)
31 | val l = List(("kpop","female"),("zorro","male"),("mobin","male"),("lucy","female"))
32 | val rdd = sc.parallelize(l,2)
33 | // val mp = rdd.mapPartitions(x => x.filter(_._2 == "female")).map(x => x._1)
34 | val mp = rdd.mapPartitions(partitionsFun)
35 | //[mapPartitionsWithIndex] val mp = rdd.mapPartitionsWithIndex(partitionsFun)
36 | mp.collect.foreach(x => (print(x +" "))) //collect the partition elements into an Array, then print them
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/MapPartitionsWithIndex.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by Mobin on 2017/7/29.
7 | */
8 | object MapPartitionsWithIndex {
9 |
10 | def mappartitionWithIndexFun(x : Int, iter :Iterator[Int])={
11 | var result = List[String]()
12 | var i = 0
13 | while (iter.hasNext) {
14 | i += iter.next()
15 | }
16 | result.::(x + "|" + i).iterator
17 | }
18 |
19 | def main(args: Array[String]) {
20 | val conf = new SparkConf().setMaster("local").setAppName("mappartitionsWithIndex")
21 | val sc = new SparkContext(conf)
22 | val rdd1 = sc.makeRDD(1 to 5,2)
23 | val rdd2 = rdd1.mapPartitionsWithIndex{
24 | (x, iter) => {
25 | var result = List[String]()
26 | var i = 0
27 | while (iter.hasNext){
28 | i += iter.next()
29 | }
30 | result.::(x + "|" + i).iterator
31 | }
32 | }
33 | val rdd3 = rdd1.mapPartitionsWithIndex(mappartitionWithIndexFun)
34 | rdd3.foreach(println)
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/RandomSplit.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by hadoop on 4/9/16.
7 | * Splits one RDD into several RDDs according to the given weights; the higher a weight, the greater the chance that that split receives more elements
8 | */
9 | object RandomSplit {
10 |
11 | def main(args: Array[String]) {
12 | val conf = new SparkConf().setMaster("local").setAppName("map")
13 | val sc = new SparkContext(conf)
14 | val rdd = sc.parallelize(1 to 10)
15 | val randomSplitRDD = rdd.randomSplit(Array(1.0,2.0,7.0))
16 | randomSplitRDD(0).foreach(x => print(x +" gg"))
17 | randomSplitRDD(1).foreach(x => print(x +" rr"))
18 | randomSplitRDD(2).foreach(x => print(x +" tt"))
19 | sc.stop
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Sample.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by hadoop on 4/7/16.
7 | */
8 | object Sample {
9 |
10 | def main(args: Array[String]) {
11 | val conf = new SparkConf().setMaster("local").setAppName("map")
12 | val sc = new SparkContext(conf)
13 | val rdd = sc.parallelize(1 to 10)
14 | val sample1 = rdd.sample(true,0.5,0)
15 | sample1.collect.foreach(x => print(x + " "))
16 | sc.stop
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/Union.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by hadoop on 4/8/16.
7 | * union: merges the datasets of the two RDDs and returns their union; identical elements are not de-duplicated
8 | */
9 | object Union {
10 | def main(args: Array[String]) {
11 | val conf = new SparkConf().setMaster("local").setAppName("map")
12 | val sc = new SparkContext(conf)
13 | val rdd1 = sc.parallelize(1 to 4)
14 | val rdd2 = sc.parallelize(3 to 5)
15 | val unionRDD = rdd1.intersection(rdd2) //union intersection
16 | unionRDD.collect.foreach(x => print(x + " "))
17 | sc.stop()
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/BaseRDD/ZipWithIndex.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.BaseRDD
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by Mobin on 2017/7/29.
7 | */
8 | object ZipWithIndex {
9 | def main(args: Array[String]) {
10 | val conf = new SparkConf().setMaster("local").setAppName("ZipWithIndex")
11 | val sc = new SparkContext(conf)
12 | val rdd1 = sc.makeRDD(Seq("A","B","C","D","E","F"),2)
13 | rdd1.zipWithIndex().foreach(println)
14 | rdd1.zipWithUniqueId().foreach(println)
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/AggregateAndFold.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by Mobin on 2017/7/30.
7 | */
8 | object AggregateAndFold {
9 | def main(args: Array[String]) {
10 | val conf = new SparkConf().setMaster("local").setAppName("AggregateFold")
11 | val sc = new SparkContext(conf)
12 | val rdd1 = sc.makeRDD(1 to 10, 2)
13 | val rs = rdd1.aggregate(1)(
14 | (x,y) => x + y,
15 | (a,b) => a+ b
16 | )
17 | val rs1 = rdd1.fold(1)((x,y) => x+ y)
18 | println(rs)
19 | println(rs1)
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/Cogroup.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by hadoop on 4/12/16.
7 | */
8 | object Cogroup {
9 |
10 | def main(args: Array[String]) {
11 | val conf = new SparkConf().setMaster("local").setAppName("ReduceByKey")
12 | val sc = new SparkContext(conf)
13 | val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3))
14 | val arr1 = List(("A", "A1"), ("B", "B1"), ("A", "A2"), ("B", "B2"))
15 | val rdd = sc.parallelize(arr, 3)
16 | val rdd1 = sc.parallelize(arr1, 3)
17 | val groupByKeyRDD = rdd.cogroup(rdd1)
18 | groupByKeyRDD.foreach(println)
19 | println(groupByKeyRDD.toDebugString)
20 | sc.stop
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/CombineByKey.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD
2 |
3 | import org.apache.spark.{HashPartitioner, SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by hadoop on 4/11/16.
7 | * Counts the number of males and females and outputs the result in the form (gender, (name, name, ....), count)
8 | */
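// Signature sketch (simplified from PairRDDFunctions):
//   def combineByKey[C](createCombiner: V => C,
//                       mergeValue: (C, V) => C,
//                       mergeCombiners: (C, C) => C): RDD[(K, C)]
// createCombiner builds the initial C from the first V seen for a key within a partition,
// mergeValue folds the remaining Vs of that partition into C, and mergeCombiners merges the
// per-partition Cs across partitions.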
9 | object CombineByKey {
10 |
11 | def main(args: Array[String]) {
12 | /*
13 | def createCombine = (x: String) => (x, 1)
14 | def mergerValue = (peo: (String, Int), x: String) => (peo._1+","+x, peo._2 + 1)
15 | def mergeCombine = (sex1: (String, Int), sex2: (String, Int)) => (sex1._1 +","+ sex2._1, sex1._2 + sex2._2)*/
16 | val conf = new SparkConf().setMaster("local").setAppName("combinByKey")
17 | val sc = new SparkContext(conf)
18 | val people = List(("male", "Mobin"), ("male", "Kpop"), ("female", "Lucy"), ("male", "Lufei"), ("female", "Amy"))
19 | val rdd = sc.parallelize(people)
20 | val combinByKeyRDD = rdd.combineByKey(
21 | (x: String) => (List(x), 1),
22 | (peo: (List[String], Int), x: String) => (x :: peo._1, peo._2 + 1),
23 | (sex1: (List[String], Int), sex2: (List[String], Int)) => (sex1._1 ::: sex2._1, sex1._2 + sex2._2))
24 |
25 | combinByKeyRDD.foreach(println)
26 | println(combinByKeyRDD.toDebugString)
27 |
28 | /**
29 | * (1) ShuffledRDD[1] at combineByKey at CombineByKey.scala:20 []
30 | * +-(1) ParallelCollectionRDD[0] at parallelize at CombineByKey.scala:19 []
31 | */
32 | sc.stop()
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/CombineByKey1.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by hadoop on 4/11/16.
7 | */
8 | object CombineByKey1 {
9 |
10 | def main(args: Array[String]) {
11 | val conf = new SparkConf().setMaster("local").setAppName("combinByKey")
12 | val sc = new SparkContext(conf)
13 | val rdd = sc.parallelize(List(
14 | ("A", 3), ("A", 9), ("A", 12), ("A", 0), ("A", 5), ("B", 4),
15 | ("B", 10), ("B", 11), ("B", 20), ("B", 25), ("C", 32), ("C", 91),
16 | ("C", 122), ("C", 3), ("C", 55)), 2)
17 |
18 | val combineByKeyRDD = rdd.combineByKey(
19 | (x: Int) => (x, 1),
20 | (acc: (Int, Int), x) => (acc._1 + x, acc._2 + 1),
21 | (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2))
22 |
23 | combineByKeyRDD.foreach(println)
24 | sc.stop()
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/FlatMapValus.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by hadoop on 4/10/16.
7 | * Applies a flatMap operation to the V values of [K,V] pairs
8 | */
9 | object FlatMapValus {
10 | def main(args: Array[String]) {
11 | val conf = new SparkConf().setMaster("local").setAppName("map")
12 | val sc = new SparkContext(conf)
13 | val list = List(("mobin",22),("kpop",20),("lufei",23))
14 | val rdd = sc.parallelize(list)
15 | val mapValuesRDD = rdd.flatMapValues(x => Seq(x,"male"))
16 | mapValuesRDD.foreach(println)
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/FoldByKey.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by hadoop on 4/11/16.
7 | */
8 | object FoldByKey {
9 |
10 | def main(args: Array[String]) {
11 | val conf = new SparkConf().setMaster("local").setAppName("FoldByKey")
12 | val sc = new SparkContext(conf)
13 | val people = List(("Mobin", 2), ("Mobin", 1), ("Lucy", 2), ("Amy", 1), ("Lucy", 3))
14 | val rdd = sc.parallelize(people)
15 | val foldByKeyRDD = rdd.foldByKey(2)(_ + _)
16 | foldByKeyRDD.foreach(println)
17 | sc.stop
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/GroupByKey.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by hadoop on 4/12/16.
7 | */
8 | object GroupByKey {
9 |
10 | def main(args: Array[String]) {
11 | val conf = new SparkConf().setMaster("local").setAppName("ReduceByKey")
12 | val sc = new SparkContext(conf)
13 | val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3))
14 | val rdd = sc.parallelize(arr)
15 | val groupByKeyRDD = rdd.groupByKey()
16 | groupByKeyRDD.foreach(println)
17 | sc.stop
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/Join.scala:
--------------------------------------------------------------------------------
1 | package com.mobin.SparkRDDFun.TransFormation.KVRDD
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Created by hadoop on 4/12/16.
7 | */
8 | object Join {
9 |
10 | def main(args: Array[String]) {
11 | val conf = new SparkConf().setMaster("local").setAppName("ReduceByKey")
12 | val sc = new SparkContext(conf)
13 | val arr = List(("A", 1), ("B", 2))
14 | val arr1 = List(("A", "A1"), ("B", "B1"),("B", "B1"))
15 |
16 | /*val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3),("C",1))
17 | val arr1 = List(("A", "A1"), ("B", "B1"), ("A", "A2"), ("B", "B2"))
18 | leftOuterJoin
19 | */
20 |
21 | /*val arr = List(("A", 1), ("B", 2), ("A", 2), ("B", 3))
22 | val arr1 = List(("A", "A1"), ("B", "B1"), ("A", "A2"), ("B", "B2"),("C","C1"))
23 | rightOuterJoin*/
24 | val rdd = sc.parallelize(arr, 3)
25 | val rdd1 = sc.parallelize(arr1, 3)
26 | val rightOutJoinRDD = rdd.fullOuterJoin(rdd1)
27 | rightOutJoinRDD.foreach(println)
28 | println(rightOutJoinRDD.toDebugString)
29 | sc.stop
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/scala/com/mobin/SparkRDDFun/TransFormation/KVRDD/MapJoinJava.java:
--------------------------------------------------------------------------------
1 | //package com.mobin.SparkRDDFun.TransFormation.KVRDD;
2 | //
3 | //import org.apache.spark.SparkConf;
4 | //import org.apache.spark.SparkContext;
5 | //import org.apache.spark.api.java.JavaRDD;
6 | //import org.apache.spark.api.java.JavaSparkContext;
7 | //import org.apache.spark.api.java.function.FlatMapFunction;
8 | //import org.apache.spark.api.java.function.Function;
9 | //import org.apache.spark.api.java.function.PairFunction;
10 | //import org.apache.spark.broadcast.Broadcast;
11 | //import scala.Tuple2;
12 | //
13 | //import java.util.ArrayList;
14 | //import java.util.Iterator;
15 | //import java.util.List;
16 | //import java.util.Map;
17 | //
18 | ///**
19 | // * Created by Mobin on 2016/11/14.
20 | // */
21 | //public class MapJoinJava {
22 | // public static void main(String[] args) {
23 | // SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("JavaMapSide");
24 | // JavaSparkContext sc = new JavaSparkContext(conf);
25 | // JavaRDD table = sc.textFile("mapjoin.txt");
26 | // JavaRDD table1 = sc.textFile("mapjoin1.txt");
27 | //
28 | // final Map pairs = table.mapToPair(new PairFunction() {
29 | // public Tuple2 call(String s) throws Exception {
30 | // int pos = s.indexOf(",");
31 | // return new Tuple2(s.substring(0,pos), s.substring(pos + 1));
32 | // }
33 | // }).collectAsMap();
34 | //
35 | // // final Broadcast