├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── conf
│   └── benchmark.properties
├── data
│   ├── The_Sorrows_of_Young_Werther.txt
│   ├── abalone
│   ├── patterns.txt
│   └── u.data
├── lib
│   ├── kafka-tools-1.0.0.jar
│   └── tutorial.jar
├── pic
│   ├── 1.JPG
│   ├── 10.JPG
│   ├── 11.JPG
│   ├── 2.JPG
│   ├── 3.JPG
│   ├── 4.JPG
│   ├── 5.JPG
│   ├── 6.JPG
│   ├── 7.JPG
│   ├── 8.JPG
│   └── 9.JPG
├── pom.xml
├── resources
│   ├── The_Sorrows_of_Young_Werther.txt
│   ├── patterns.txt
│   └── student_data.csv
└── src
    └── main
        ├── hive
        │   └── sample.hive
        ├── java
        │   └── com
        │       └── aliyun
        │           └── emr
        │               └── example
        │                   ├── hadoop
        │                   │   ├── EMapReduceOSSUtil.java
        │                   │   └── WordCount.java
        │                   ├── spark
        │                   │   ├── SparkMaxComputeJavaDemo.java
        │                   │   ├── SparkOssJavaDemo.java
        │                   │   ├── SparkTableStoreJavaDemo.java
        │                   │   ├── sql
        │                   │   │   └── streaming
        │                   │   │       ├── SparkSLSContinuousStructuredStreamingJavaDemo.java
        │                   │   │       └── SparkSLSStructuredStreamingJavaDemo.java
        │                   │   └── streaming
        │                   │       ├── JavaLoghubWordCount.java
        │                   │       ├── SparkMNSJavaDemo.java
        │                   │       └── SparkRocketMQJavaDemo.java
        │                   └── storm
        │                       ├── StormKafkaSample.java
        │                       └── benchmark
        │                           ├── AbstractTopology.java
        │                           ├── BasicTopology.java
        │                           ├── KafkaHdfs.java
        │                           ├── TridentWordCount.java
        │                           ├── WindowedWordCount.java
        │                           ├── WordCount.java
        │                           └── util
        │                               └── Helper.java
        ├── pig
        │   └── sample.pig
        ├── python
        │   ├── deeplearning
        │   │   ├── data
        │   │   │   ├── boston
        │   │   │   │   └── train.csv
        │   │   │   └── moviedata
        │   │   │       ├── movies.csv
        │   │   │       └── ratings.csv
        │   │   ├── tf_fm_on_spark.py
        │   │   └── train_boston.py
        │   ├── odps-sample.py
        │   ├── streaming
        │   │   ├── loghub-wordcount.py
        │   │   ├── wcmapper.py
        │   │   └── wcreducer.py
        │   └── wordcount.py
        └── scala
            └── com
                └── aliyun
                    └── emr
                        └── example
                            ├── flink
                            │   └── FlinkOSSSample.scala
                            └── spark
                                ├── AbstractParams.scala
                                ├── LinearRegression.scala
                                ├── MongoDBWordCount.scala
                                ├── RunLocally.scala
                                ├── SparkMaxComputeDemo.scala
                                ├── SparkOssDemo.scala
                                ├── SparkPi.scala
                                ├── SparkRdsDemo.scala
                                ├── SparkWordCount.scala
                                ├── sql
                                │   ├── ODPSDataSourceSample.scala
                                │   └── streaming
                                │       ├── SparkSLSContinuousStructuredStreamingDemo.scala
                                │       └── SparkSLSStructuredStreamingDemo.scala
                                └── streaming
                                    ├── DirectSparkSLSDemo.scala
                                    ├── DtsSample.scala
                                    ├── RedisWordCount.scala.1
                                    ├── SparkDatahubDemo.scala
                                    ├── SparkHBaseDemo.scala
                                    ├── SparkKafkaDemo.scala
                                    ├── SparkMNSDemo.scala
                                    ├── SparkRocketMQDemo.scala
                                    ├── SparkSLSDemo.scala
                                    └── benchmark
                                        ├── AbstractStreaming.scala
                                        ├── KafkaHdfs.scala
                                        ├── WordCount.scala
                                        └── metrics
                                            ├── BasicMetrics.scala
                                            ├── HdfsMetrics.scala
                                            └── KafkaMetrics.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | .idea/
3 | *.iml
4 | *.DS_Store
5 | bin/*
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # 1. Choose OS (Ubuntu 14.04.3 LTS Server Edition 64bit, ~2 CORE, 7.5GB RAM)
2 | sudo: required
3 |
4 | # 2. Choose language and target JDKs for parallel builds.
5 | language: java
6 | jdk:
7 | - oraclejdk8
8 |
9 | # 3. Setup cache directory for SBT and Maven.
10 | cache:
11 | directories:
12 | - $HOME/.m2
13 |
14 | # 4. Run maven install before running lint-java.
15 | install:
16 | -
17 |
18 | script:
19 | - echo -e '\n\n \n \n mvnsearch-unavailable \n mvnsearch-unavailable \n mvnsearch \n http://repo1.maven.org/maven2 \n \n \n \n \n no-mvnsearch \n \n \n mvnsearch \n http://www.mvnsearch.org/maven2 \n \n true \n \n \n true \n \n \n \n \n \n \n no-mvnsearch \n \n ' > $HOME/.m2/settings.xml
20 | - cat $HOME/.m2/settings.xml
21 | - mvn clean package -DskipTests
22 |
23 | # 5. Branches only
24 | branches:
25 | only:
26 | - master-2
27 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The Artistic License 2.0
2 |
3 | Copyright (c) 2015 aliyun
4 |
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | This license establishes the terms under which a given free software
11 | Package may be copied, modified, distributed, and/or redistributed.
12 | The intent is that the Copyright Holder maintains some artistic
13 | control over the development of that Package while still keeping the
14 | Package available as open source and free software.
15 |
16 | You are always permitted to make arrangements wholly outside of this
17 | license directly with the Copyright Holder of a given Package. If the
18 | terms of this license do not permit the full use that you propose to
19 | make of the Package, you should contact the Copyright Holder and seek
20 | a different licensing arrangement.
21 |
22 | Definitions
23 |
24 | "Copyright Holder" means the individual(s) or organization(s)
25 | named in the copyright notice for the entire Package.
26 |
27 | "Contributor" means any party that has contributed code or other
28 | material to the Package, in accordance with the Copyright Holder's
29 | procedures.
30 |
31 | "You" and "your" means any person who would like to copy,
32 | distribute, or modify the Package.
33 |
34 | "Package" means the collection of files distributed by the
35 | Copyright Holder, and derivatives of that collection and/or of
36 | those files. A given Package may consist of either the Standard
37 | Version, or a Modified Version.
38 |
39 | "Distribute" means providing a copy of the Package or making it
40 | accessible to anyone else, or in the case of a company or
41 | organization, to others outside of your company or organization.
42 |
43 | "Distributor Fee" means any fee that you charge for Distributing
44 | this Package or providing support for this Package to another
45 | party. It does not mean licensing fees.
46 |
47 | "Standard Version" refers to the Package if it has not been
48 | modified, or has been modified only in ways explicitly requested
49 | by the Copyright Holder.
50 |
51 | "Modified Version" means the Package, if it has been changed, and
52 | such changes were not explicitly requested by the Copyright
53 | Holder.
54 |
55 | "Original License" means this Artistic License as Distributed with
56 | the Standard Version of the Package, in its current version or as
57 | it may be modified by The Perl Foundation in the future.
58 |
59 | "Source" form means the source code, documentation source, and
60 | configuration files for the Package.
61 |
62 | "Compiled" form means the compiled bytecode, object code, binary,
63 | or any other form resulting from mechanical transformation or
64 | translation of the Source form.
65 |
66 |
67 | Permission for Use and Modification Without Distribution
68 |
69 | (1) You are permitted to use the Standard Version and create and use
70 | Modified Versions for any purpose without restriction, provided that
71 | you do not Distribute the Modified Version.
72 |
73 |
74 | Permissions for Redistribution of the Standard Version
75 |
76 | (2) You may Distribute verbatim copies of the Source form of the
77 | Standard Version of this Package in any medium without restriction,
78 | either gratis or for a Distributor Fee, provided that you duplicate
79 | all of the original copyright notices and associated disclaimers. At
80 | your discretion, such verbatim copies may or may not include a
81 | Compiled form of the Package.
82 |
83 | (3) You may apply any bug fixes, portability changes, and other
84 | modifications made available from the Copyright Holder. The resulting
85 | Package will still be considered the Standard Version, and as such
86 | will be subject to the Original License.
87 |
88 |
89 | Distribution of Modified Versions of the Package as Source
90 |
91 | (4) You may Distribute your Modified Version as Source (either gratis
92 | or for a Distributor Fee, and with or without a Compiled form of the
93 | Modified Version) provided that you clearly document how it differs
94 | from the Standard Version, including, but not limited to, documenting
95 | any non-standard features, executables, or modules, and provided that
96 | you do at least ONE of the following:
97 |
98 | (a) make the Modified Version available to the Copyright Holder
99 | of the Standard Version, under the Original License, so that the
100 | Copyright Holder may include your modifications in the Standard
101 | Version.
102 |
103 | (b) ensure that installation of your Modified Version does not
104 | prevent the user installing or running the Standard Version. In
105 | addition, the Modified Version must bear a name that is different
106 | from the name of the Standard Version.
107 |
108 | (c) allow anyone who receives a copy of the Modified Version to
109 | make the Source form of the Modified Version available to others
110 | under
111 |
112 | (i) the Original License or
113 |
114 | (ii) a license that permits the licensee to freely copy,
115 | modify and redistribute the Modified Version using the same
116 | licensing terms that apply to the copy that the licensee
117 | received, and requires that the Source form of the Modified
118 | Version, and of any works derived from it, be made freely
119 | available in that license fees are prohibited but Distributor
120 | Fees are allowed.
121 |
122 |
123 | Distribution of Compiled Forms of the Standard Version
124 | or Modified Versions without the Source
125 |
126 | (5) You may Distribute Compiled forms of the Standard Version without
127 | the Source, provided that you include complete instructions on how to
128 | get the Source of the Standard Version. Such instructions must be
129 | valid at the time of your distribution. If these instructions, at any
130 | time while you are carrying out such distribution, become invalid, you
131 | must provide new instructions on demand or cease further distribution.
132 | If you provide valid instructions or cease distribution within thirty
133 | days after you become aware that the instructions are invalid, then
134 | you do not forfeit any of your rights under this license.
135 |
136 | (6) You may Distribute a Modified Version in Compiled form without
137 | the Source, provided that you comply with Section 4 with respect to
138 | the Source of the Modified Version.
139 |
140 |
141 | Aggregating or Linking the Package
142 |
143 | (7) You may aggregate the Package (either the Standard Version or
144 | Modified Version) with other packages and Distribute the resulting
145 | aggregation provided that you do not charge a licensing fee for the
146 | Package. Distributor Fees are permitted, and licensing fees for other
147 | components in the aggregation are permitted. The terms of this license
148 | apply to the use and Distribution of the Standard or Modified Versions
149 | as included in the aggregation.
150 |
151 | (8) You are permitted to link Modified and Standard Versions with
152 | other works, to embed the Package in a larger work of your own, or to
153 | build stand-alone binary or bytecode versions of applications that
154 | include the Package, and Distribute the result without restriction,
155 | provided the result does not expose a direct interface to the Package.
156 |
157 |
158 | Items That are Not Considered Part of a Modified Version
159 |
160 | (9) Works (including, but not limited to, modules and scripts) that
161 | merely extend or make use of the Package, do not, by themselves, cause
162 | the Package to be a Modified Version. In addition, such works are not
163 | considered parts of the Package itself, and are not subject to the
164 | terms of this license.
165 |
166 |
167 | General Provisions
168 |
169 | (10) Any use, modification, and distribution of the Standard or
170 | Modified Versions is governed by this Artistic License. By using,
171 | modifying or distributing the Package, you accept this license. Do not
172 | use, modify, or distribute the Package, if you do not accept this
173 | license.
174 |
175 | (11) If your Modified Version has been derived from a Modified
176 | Version made by someone other than you, you are nevertheless required
177 | to ensure that your Modified Version complies with the requirements of
178 | this license.
179 |
180 | (12) This license does not grant you the right to use any trademark,
181 | service mark, tradename, or logo of the Copyright Holder.
182 |
183 | (13) This license includes the non-exclusive, worldwide,
184 | free-of-charge patent license to make, have made, use, offer to sell,
185 | sell, import and otherwise transfer the Package with respect to any
186 | patent claims licensable by the Copyright Holder that are necessarily
187 | infringed by the Package. If you institute patent litigation
188 | (including a cross-claim or counterclaim) against any party alleging
189 | that the Package constitutes direct or contributory patent
190 | infringement, then this Artistic License to you shall terminate on the
191 | date that such litigation is filed.
192 |
193 | (14) Disclaimer of Warranty:
194 | THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS
195 | IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED
196 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
197 | NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL
198 | LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL
199 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
200 | DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF
201 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## This project contains the following examples:
  2 |
  3 | #### MapReduce
  4 |
  5 | - WordCount: word count
  6 |
  7 | #### Hive
  8 |
  9 | - sample.hive: a simple table query
 10 |
 11 | #### Pig
 12 |
 13 | - sample.pig: an example of processing OSS data with Pig
 14 |
 15 | #### Spark
 16 |
 17 | - SparkPi: computes Pi
 18 | - SparkWordCount: word count
 19 | - LinearRegression: linear regression
 20 | - OSSSample: OSS usage example
 21 | - ONSSample: ONS usage example
 22 | - ODPSSample: ODPS usage example
 23 | - MNSSample: MNS usage example
 24 | - LoghubSample: Loghub usage example
 25 |
 26 | #### PySpark
 27 |
 28 | - WordCount: word count
 29 |
 30 | ## Dependent resources
 31 |
 32 | Test data (in the data directory):
 33 |
 34 | - The_Sorrows_of_Young_Werther.txt: can be used as input data for WordCount (MapReduce/Spark)
 35 | - patterns.txt: filter patterns for the WordCount (MapReduce) job
 36 | - u.data: test table data for the sample.hive script
 37 | - abalone: test data for the linear regression algorithm
 38 |
 39 | Dependency jars (in the lib directory):
 40 |
 41 | - tutorial.jar: dependency jar required by the sample.pig job
 42 |
 43 | ## Preparation
 44 |
 45 | This project ships with some test data; simply upload it to OSS and it is ready to use. For the other examples, such as ODPS, MNS, ONS and Loghub, you need to prepare your own data as follows:
 46 |
 47 | - [Optional] Create a LogStore, see the [Log Service user guide](https://help.aliyun.com/document_detail/sls/user-guide/overview.html?spm=5176.docsls/user-guide/consume-logs.3.2.VW5TNb).
 48 | - [Optional] Create an ODPS project and table, see the [ODPS quick start](https://help.aliyun.com/document_detail/odps/quick_start/prerequisite.html?spm=5176.docodps/quick_start/prerequisite.3.2.OqBkc4).
 49 | - [Optional] Create ONS, see the [Message Queue quick start](https://help.aliyun.com/document_detail/ons/quick-start/apply.html?spm=5176.docons/quick-start/send.3.2.eZ8h7p).
 50 | - [Optional] Create MNS, see the [Message Service console help](https://help.aliyun.com/document_detail/mns/help_of_console/AccessMNSBySubUser.html?spm=5176.docmns/help_of_console/help_of_queue/CreateQueue.3.2.0Sj96I).
 51 |
 52 | ## Basic concepts:
 53 |
 54 | - OSSURI: **oss**://accessKeyId:accessKeySecret@bucket.endpoint/a/b/c.txt, used when specifying input and output data sources in a job; it is analogous to hdfs://.
 55 | - The Alibaba Cloud AccessKeyId/AccessKeySecret is the key pair used to access Alibaba Cloud APIs; you can obtain it [here](https://ak-console.aliyun.com/#/accesskey).
 56 |
 57 | ## Running on a cluster
 58 |
 59 | - Spark
 60 |   - SparkWordCount: `spark-submit --class SparkWordCount examples-1.0-SNAPSHOT-shaded.jar <inputPath> <outputPath> <numPartition>`
 61 |     - inputPath: input data path
 62 |     - outputPath: output path
 63 |     - numPartition: number of RDD partitions for the input data
 64 |   - SparkPi: `spark-submit --class SparkPi examples-1.0-SNAPSHOT-shaded.jar`
 65 |   - SparkOssDemo: `spark-submit --class SparkOssDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> <endpoint> <inputPath> <numPartition>`
 66 |     - accessKeyId: Alibaba Cloud AccessKeyId
 67 |     - accessKeySecret: Alibaba Cloud AccessKeySecret
 68 |     - endpoint: Alibaba Cloud OSS endpoint
 69 |     - inputPath: input data path
 70 |     - numPartition: number of RDD partitions for the input data
 71 |   - SparkRocketMQDemo: `spark-submit --class SparkRocketMQDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> <consumerId> <topic> <subExpression> <parallelism>`
 72 |     - accessKeyId: Alibaba Cloud AccessKeyId
 73 |     - accessKeySecret: Alibaba Cloud AccessKeySecret
 74 |     - consumerId: see the [Consumer ID description](https://help.aliyun.com/document_detail/ons/brief-manual/terminology.html?spm=5176.docons/brief-manual/overview.6.87.F8suBu)
 75 |     - topic: every message queue has a topic
 76 |     - subExpression: see [message filtering](https://help.aliyun.com/document_detail/ons/user-guide/tag-filter.html?spm=5176.docons/tcp/java-sdk/normal-consumer.6.97.PIqsEo).
 77 |     - parallelism: how many receivers are used to consume messages from the queue.
 78 |   - SparkMaxComputeDemo: `spark-submit --class SparkMaxComputeDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> <envType> <project> <table> <numPartition>`
 79 |     - accessKeyId: Alibaba Cloud AccessKeyId
 80 |     - accessKeySecret: Alibaba Cloud AccessKeySecret
 81 |     - envType: 0 for the public network environment, 1 for the internal network environment. Choose 0 for local debugging and 1 when running on E-MapReduce.
 82 |     - project: see the [ODPS quick start](https://help.aliyun.com/document_detail/odps/quick_start/prerequisite.html?spm=5176.docodps/summary/glossary.6.90.inv9Ph).
 83 |     - table: see the [ODPS glossary](https://help.aliyun.com/document_detail/odps/summary/glossary.html?spm=5176.docodps/quick_start/prerequisite.6.88.A5zVKu).
 84 |     - numPartition: number of RDD partitions for the input data
 85 |   - SparkMNSDemo: `spark-submit --class SparkMNSDemo examples-1.0-SNAPSHOT-shaded.jar <queueName> <accessKeyId> <accessKeySecret> <endpoint>`
 86 |     - queueName: queue name, see the [MNS glossary](https://help.aliyun.com/document_detail/mns/introduction/product-name-interpretation.html?spm=5176.docmns/help_of_console/help_of_queue/CreateQueue.6.87.lHtPvO).
 87 |     - accessKeyId: Alibaba Cloud AccessKeyId
 88 |     - accessKeySecret: Alibaba Cloud AccessKeySecret
 89 |     - endpoint: queue data access address
 90 |   - SparkSLSDemo: `spark-submit --class SparkSLSDemo examples-1.0-SNAPSHOT-shaded.jar <sls project> <sls logstore> <loghub group name> <sls endpoint> <accessKeyId> <accessKeySecret> <batch interval seconds>`
 91 |     - sls project: Log Service project name
 92 |     - sls logstore: logstore name
 93 |     - loghub group name: name of the consumer group the job uses to consume log data; it can be any value. For the same sls project and sls store, jobs with the same group name consume the data in the sls store cooperatively, while jobs with different group names consume the data in the sls store independently of each other.
 94 |     - sls endpoint: see [Log Service endpoints](https://help.aliyun.com/document_detail/sls/api/endpoints.html?spm=5176.docsls/user-guide/concept.6.134.Gy05tN).
 95 |     - accessKeyId: Alibaba Cloud AccessKeyId
 96 |     - accessKeySecret: Alibaba Cloud AccessKeySecret
 97 |     - batch interval seconds: batch interval of the Spark Streaming job, in seconds.
 98 |   - LinearRegression: `spark-submit --class LinearRegression examples-1.0-SNAPSHOT-shaded.jar <inputPath> <numPartition>`
 99 |     - inputPath: input data
100 |     - numPartition: number of RDD partitions for the input data
101 |
102 | - PySpark
103 |   - WordCount: `spark-submit wordcount.py <inputPath> <outputPath> <numPartition>`
104 |     - inputPath: input data path
105 |     - outputPath: output path
106 |     - numPartition: number of RDD partitions for the input data
107 |
108 | - MapReduce
109 |   - WordCount: `hadoop jar examples-1.0-SNAPSHOT-shaded.jar WordCount -Dwordcount.case.sensitive=true <inputPath> <outputPath> -skip <patternPath>`
110 |     - inputPath: input data path
111 |     - outputPath: output path
112 |     - patternPath: filter pattern file; data/patterns.txt can be used
113 |
114 | - Hadoop Streaming
115 |   - WordCount: `hadoop jar /usr/lib/hadoop-current/share/hadoop/tools/lib/hadoop-streaming-*.jar -file <mapperPyFile> -mapper mapper.py -file <reducerPyFile> -reducer reducer.py -input <inputPath> -output <outputPath>`
116 |     - mapperPyFile: mapper file, [mapper example](/src/main/python/streaming/wcmapper.py)
117 |     - reducerPyFile: reducer file, [reducer example](/src/main/python/streaming/wcreducer.py)
118 |     - inputPath: input data path
119 |     - outputPath: output path
120 |
121 | - Hive
122 |   - `hive -f sample.hive -hiveconf inputPath=<inputPath>`
123 |     - inputPath: input data path
124 |
125 | - Pig
126 |   - `pig -x mapreduce -f sample.pig -param tutorial=<tutorialJarPath> -param input=<inputPath> -param result=<resultPath>`
127 |     - tutorialJarPath: dependency jar; lib/tutorial.jar can be used
128 |     - inputPath: input data path
129 |     - resultPath: output path
130 |
131 | - Note:
132 |   - When running on E-MapReduce, upload the test data and dependency jars to OSS; the paths follow the OSSURI definition above.
133 |   - When running inside your own cluster, they can also be kept on the local machines.
134 |
135 | ## Running locally
136 |
137 | This section describes how to run the Spark programs locally and access Alibaba Cloud data sources such as OSS. For local debugging it is best to use a development tool such as IntelliJ IDEA or Eclipse, especially on Windows; otherwise you have to set up a Hadoop and Spark runtime environment on the Windows machine, which is cumbersome. (A minimal local-run configuration sketch in Java follows below.)
138 |
139 | - IntelliJ IDEA
140 |   - Prerequisites: install IntelliJ IDEA, Maven, the IntelliJ IDEA Maven plugin, Scala, and the IntelliJ IDEA Scala plugin
141 |   - Double-click to open SparkWordCount.scala
142 |
143 |   - Open the job configuration dialog from the place indicated by the arrow in the figure below
144 |
145 |   - Select SparkWordCount and enter the required job arguments in the arguments box
146 |
147 |   - Click "OK"
148 |   - Click the Run button to execute the job
149 |
150 |   - Check the job execution log
151 |
152 |
153 | - Scala IDE for Eclipse
154 |   - Prerequisites: install Scala IDE for Eclipse, Maven, and the Eclipse Maven plugin
155 |   - Import the project
156 |
157 |
158 |
159 |   - Run As Maven build; the shortcut is "Alt + Shift + X, M". You can also right-click the project name and choose "Run As" > "Maven build"
160 |   - After compilation finishes, right-click the job you want to run and choose "Run Configuration" to open the configuration page
161 |   - On the configuration page, choose Scala Application and configure the job's Main Class, arguments, and so on.
162 |
163 |   - Click "Run"
164 |   - Check the console output log
165 |
166 |
--------------------------------------------------------------------------------
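
The "Running locally" section of the README above boils down to pointing a local-mode SparkContext at OSS. The block below is a minimal sketch of that setup in Java, mirroring the configuration keys used by SparkOssJavaDemo.java later in this repository; the class name, bucket, path, and credential values are placeholders and not part of the project.

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class LocalOssRunSketch {
    public static void main(String[] args) {
        // Local mode: no cluster required. The OSS settings mirror SparkOssJavaDemo.
        SparkConf conf = new SparkConf()
            .setAppName("local-oss-debug")
            .setMaster("local[4]");
        conf.set("spark.hadoop.fs.oss.accessKeyId", "<yourAccessKeyId>");
        conf.set("spark.hadoop.fs.oss.accessKeySecret", "<yourAccessKeySecret>");
        conf.set("spark.hadoop.fs.oss.endpoint", "<yourOssEndpoint>");
        conf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem");
        conf.set("spark.hadoop.mapreduce.job.run-local", "true");

        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Any oss://bucket/path now resolves through the configured credentials.
        long lines = jsc.textFile("oss://<bucket>/data/The_Sorrows_of_Young_Werther.txt").count();
        System.out.println("line count: " + lines);
        jsc.stop();
    }
}
```
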
/conf/benchmark.properties:
--------------------------------------------------------------------------------
1 | ##common
2 | name=KafkaHdfs
3 |
4 | ## cluster
5 | cluster.cores.total=160
6 | cluster.worker.node.number=5
7 | cluster.memory.per.node.mb=90000
8 |
9 | ##kafka producer
10 | partition.number=50
11 | topic=st-36
12 | consumer.group=streaming
13 | zookeeper.address=localhost
14 | zookeeper.root=/kafka-1.0.0
15 | broker.list=localhost:9092
16 |
17 | ##kafka consumer
18 | result.topic=benchmark-result
19 | result.broker.list=localhost:9092
20 |
21 | ##storm
22 | worker.slot.number=10
23 | # spout.parallelism equals kafka partition.number
24 | #spout.parallelism=25
25 | window.length=10
26 | slide.interval=10
27 | backpressure.enable=false
28 | hdfs.parallelism.factor=1
29 | # trident
30 | # 0 if disable ack
31 | ack.open=true
32 |
33 | ##spark streaming
34 | #deploy.mode=yarn-client to make use of cluster header node
35 | duration.ms=1000
36 | spark.executor.instances=10
 37 | # receiver number = kafka-partition / factor
38 | kafka.partition.receiver.factor=1
39 | spark.yarn.am.memory.mb=20000
40 | spark.yarn.am.cores=15
41 | #default 200ms, recommend >= 50ms
42 | spark.streaming.blockInterval=200ms
43 | # vcore = physical-core *factor
44 | cpu.core.factor=1.5
45 |
46 | ##hdfs
47 | url=hdfs://emr-header-1:9000
48 | filename.prefix=/foo/
49 | sync.record.number=1000
50 |
51 | ## metric
52 | benchmark.app.name=KafkaHdfs
53 | metric.numPartitions=100
54 | from.spark.streaming=true
55 | metric.duration.second=60
56 | metric.group.id=kafka-metrics
57 |
--------------------------------------------------------------------------------
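
These properties drive the Storm and Spark Streaming benchmark topologies under src/main (see the benchmark packages in the tree above). The loader the project actually uses (Helper.java) is not shown in this dump, so the sketch below simply reads the file with java.util.Properties to illustrate how the keys map to typed values; the file path is an assumption.

```java
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class BenchmarkConfigSketch {
    public static void main(String[] args) throws IOException {
        Properties props = new Properties();
        // Path is an assumption; point it at the real location of conf/benchmark.properties.
        try (InputStream in = new FileInputStream("conf/benchmark.properties")) {
            props.load(in);
        }
        // Typed accessors for a few of the keys defined above.
        String topic = props.getProperty("topic");
        int partitions = Integer.parseInt(props.getProperty("partition.number"));
        long durationMs = Long.parseLong(props.getProperty("duration.ms"));
        boolean ackOpen = Boolean.parseBoolean(props.getProperty("ack.open"));
        System.out.printf("topic=%s partitions=%d duration=%dms ack=%b%n",
            topic, partitions, durationMs, ackOpen);
    }
}
```
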
/data/The_Sorrows_of_Young_Werther.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/data/The_Sorrows_of_Young_Werther.txt
--------------------------------------------------------------------------------
/data/patterns.txt:
--------------------------------------------------------------------------------
1 | \.
2 | \,
3 | \!
4 | to
5 | \"
--------------------------------------------------------------------------------
/lib/kafka-tools-1.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/lib/kafka-tools-1.0.0.jar
--------------------------------------------------------------------------------
/lib/tutorial.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/lib/tutorial.jar
--------------------------------------------------------------------------------
/pic/1.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/1.JPG
--------------------------------------------------------------------------------
/pic/10.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/10.JPG
--------------------------------------------------------------------------------
/pic/11.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/11.JPG
--------------------------------------------------------------------------------
/pic/2.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/2.JPG
--------------------------------------------------------------------------------
/pic/3.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/3.JPG
--------------------------------------------------------------------------------
/pic/4.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/4.JPG
--------------------------------------------------------------------------------
/pic/5.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/5.JPG
--------------------------------------------------------------------------------
/pic/6.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/6.JPG
--------------------------------------------------------------------------------
/pic/7.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/7.JPG
--------------------------------------------------------------------------------
/pic/8.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/8.JPG
--------------------------------------------------------------------------------
/pic/9.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/9.JPG
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | com.aliyun.emr
8 | examples
9 | 1.2.0
10 | jar
11 | Aliyun E-MapReduce Demo Project
12 |
13 |
14 | 2.3.1
15 | 1.4.0
16 | 2.0.0
17 | 3.0.0
18 | 0.28.4-public
19 | 0.6.13
20 | 1.7.1.Final
21 | 3.5.9
22 | 2.9.0
23 | 1.7.3
24 | 3.2.0
25 | 1.2.45
26 | 11.0.2
27 | 5.1.34
28 | 0.11.2
29 | 2.7.2
30 | 2.0
31 | 1.1.1
32 | 4.6.27.12.0
33 | 3.3.2
34 |
35 |
36 |
37 |
38 | org.apache.spark
39 | spark-core_2.11
40 | ${spark.version}
41 |
42 |
43 |
44 | org.apache.spark
45 | spark-mllib_2.11
46 | ${spark.version}
47 |
48 |
49 |
50 | org.apache.spark
51 | spark-sql_2.11
52 | ${spark.version}
53 |
54 |
55 |
56 | org.apache.spark
57 | spark-streaming_2.11
58 | ${spark.version}
59 |
60 |
61 |
62 | org.apache.spark
63 | spark-streaming-kafka-0-10_2.11
64 | ${spark.version}
65 |
66 |
67 |
68 | org.apache.spark
69 | spark-hive_2.11
70 | ${spark.version}
71 |
72 |
73 | org.apache.httpcomponents
74 | httpclient
75 |
76 |
77 | org.apache.httpcomponents
78 | httpcore
79 |
80 |
81 |
82 |
83 |
84 | org.apache.flink
85 | flink-core
86 | ${flink.version}
87 |
88 |
89 |
90 | org.apache.flink
91 | flink-clients_2.11
92 | ${flink.version}
93 |
94 |
95 |
96 | org.apache.flink
97 | flink-connector-kafka-0.11_2.11
98 | 1.4.2
99 |
100 |
101 |
102 |
103 | com.aliyun.emr
104 | emr-tablestore
105 | ${emr.version}
106 |
107 |
108 |
109 |
110 | com.aliyun.emr
111 | emr-mns_2.11
112 | ${emr.version}
113 |
114 |
115 | com.aliyun.mns
116 | aliyun-sdk-mns
117 |
118 |
119 |
120 |
121 |
122 | com.aliyun.emr
123 | emr-logservice_2.11
124 | ${emr.version}
125 |
126 |
127 |
128 | com.aliyun.openservices
129 | aliyun-log
130 | 0.6.60
131 |
132 |
133 |
134 |
135 | com.aliyun.emr
136 | emr-maxcompute_2.11
137 | ${emr.version}
138 |
139 |
140 |
141 | com.aliyun.emr
142 | emr-ons_2.11
143 | ${emr.version}
144 |
145 |
146 |
147 | com.aliyun.emr
148 | emr-dts_2.11
149 | ${emr.version}
150 |
151 |
152 |
153 | com.aliyun.oss
154 | aliyun-sdk-oss
155 | ${oss.sdk.version}
156 |
157 |
158 |
159 | com.aliyun.odps
160 | odps-sdk-core
161 | ${odps.version}
162 |
163 |
164 | org.codehaus.jackson
165 | jackson-mapper-asl
166 |
167 |
168 | org.codehaus.jackson
169 | jackson-core-asl
170 |
171 |
172 |
173 |
174 |
175 | com.aliyun.odps
176 | odps-sdk-commons
177 | ${odps.version}
178 |
179 |
180 |
181 | com.aliyun.openservices
182 | loghub-client-lib
183 | ${loghubb.client.version}
184 |
185 |
186 |
187 | com.aliyun.openservices
188 | ons-client
189 | ${ons.version}
190 |
191 |
192 |
193 | com.aliyun.openservices
194 | ons-api
195 | ${ons.version}
196 |
197 |
198 |
199 | com.alibaba.rocketmq
200 | rocketmq-client
201 | ${rocketmq.version}
202 |
203 |
204 |
205 | com.alibaba.rocketmq
206 | rocketmq-common
207 | ${rocketmq.version}
208 |
209 |
210 |
211 | com.alibaba.rocketmq
212 | rocketmq-remoting
213 | ${rocketmq.version}
214 |
215 |
216 |
217 | org.apache.hadoop
218 | hadoop-mapreduce-client-core
219 | ${hadoop.version}
220 |
221 |
222 | jdk.tools
223 | jdk.tools
224 |
225 |
226 |
227 |
228 |
229 | org.aspectj
230 | aspectjrt
231 | ${aspectjrt.version}
232 |
233 |
234 |
235 | com.github.scopt
236 | scopt_2.10
237 | ${scopt.version}
238 |
239 |
240 |
241 | com.alibaba
242 | fastjson
243 | ${fastjson.version}
244 |
245 |
246 |
247 | com.google.guava
248 | guava
249 | ${guava.version}
250 |
251 |
252 |
253 | mysql
254 | mysql-connector-java
255 | ${mysql.connector.version}
256 |
257 |
258 |
259 | com.stratio.datasource
260 | spark-mongodb_2.10
261 | ${mongodb.version}
262 |
263 |
264 |
265 | redis.clients
266 | jedis
267 | ${redis.clients.version}
268 |
269 |
270 |
271 | org.apache.commons
272 | commons-pool2
273 | ${commons.pool2.version}
274 |
275 |
276 |
277 | org.apache.hbase
278 | hbase-client
279 | ${hbase.version}
280 |
281 |
282 | jdk.tools
283 | jdk.tools
284 |
285 |
286 | org.apache.hadoop
287 | hadoop-mapreduce-client-core
288 |
289 |
290 |
291 |
292 |
293 | org.apache.hbase
294 | hbase-common
295 | ${hbase.version}
296 |
297 |
298 |
299 | org.apache.hbase
300 | hbase-protocol
301 | ${hbase.version}
302 |
303 |
304 |
305 | com.aliyun.mns
306 | aliyun-sdk-mns
307 | 1.1.8.8
308 |
309 |
310 |
311 | org.apache.httpcomponents
312 | httpasyncclient
313 | 4.1
314 |
315 |
316 |
317 | org.apache.httpcomponents
318 | httpcore-nio
319 | 4.4.1
320 |
321 |
322 |
323 | org.apache.httpcomponents
324 | httpcore
325 | 4.4.1
326 |
327 |
328 |
329 | org.apache.kafka
330 | kafka_2.11
331 | 0.10.0.1
332 |
333 |
334 |
335 | org.apache.kafka
336 | kafka-clients
337 | 0.10.0.1
338 |
339 |
340 |
341 | com.aliyun.dts
342 | dts-subscribe-sdk
343 | ${dts.version}
344 |
345 |
346 |
347 | org.apache.commons
348 | commons-lang3
349 | ${commons.lang3.version}
350 |
351 |
352 |
353 | org.apache.storm
354 | storm-core
355 | 1.1.2
356 |
357 |
358 | org.slf4j
359 | log4j-over-slf4j
360 |
361 |
362 |
363 |
364 |
365 | org.apache.storm
366 | storm-kafka
367 | 1.1.2
368 |
369 |
370 |
371 | org.apache.storm
372 | storm-hdfs
373 | 1.1.2
374 |
375 |
376 |
377 | org.apache.storm
378 | storm-perf
379 | 1.1.2
380 |
381 |
382 |
383 | org.apache.hadoop
384 | hadoop-hdfs
385 | 2.6.1
386 |
387 |
388 |
389 | org.apache.hadoop
390 | hadoop-common
391 | 2.6.1
392 |
393 |
394 |
395 | javax.mail
396 | mail
397 | 1.4.7
398 |
399 |
400 |
401 | com.aliyun.emr
402 | emr-datahub_2.11
403 | 2.2.0
404 |
405 |
406 |
407 | com.squareup.okhttp3
408 | okhttp
409 | 3.12.0
410 |
411 |
412 |
413 | com.aliyun.datahub
414 | aliyun-sdk-datahub
415 | 2.13.0-public
416 |
417 |
418 |
419 | org.apache.htrace
420 | htrace-core
421 | 3.1.0-incubating
422 |
423 |
424 |
425 |
426 |
427 | target/classes
428 | target/test-classes
429 |
430 |
431 | maven-compiler-plugin
432 |
433 | 1.8
434 | 1.8
435 | UTF-8
436 |
437 |
438 |
439 | net.alchim31.maven
440 | scala-maven-plugin
441 | 4.0.1
442 |
443 |
444 | scala-compile-first
445 | process-resources
446 |
447 | compile
448 |
449 |
450 |
451 | scala-test-compile-first
452 | process-test-resources
453 |
454 | testCompile
455 |
456 |
457 |
458 | attach-scaladocs
459 | verify
460 |
461 | doc-jar
462 |
463 |
464 |
465 |
466 |
467 | org.apache.maven.plugins
468 | maven-shade-plugin
469 | 2.4.2
470 |
471 | false
472 | ${project.build.directory}/shaded/examples-${project.version}-shaded.jar
473 |
474 |
475 | javax.mail:mail
476 | org.apache.htrace:htrace-core
477 | com.squareup.okhttp3:okhttp
478 | com.squareup.okio:okio
479 | com.squareup.okhttp3:logging-interceptor
480 | com.squareup.retrofit2:converter-jackson
481 | com.squareup.retrofit2:retrofit
482 | com.aliyun.openservices:aliyun-sls-v0.6-inner
483 | com.aliyun.datahub:aliyun-sdk-datahub
484 | com.aliyun.emr:emr-datahub_2.11
485 | com.aliyun.emr:emr-tablestore
486 | com.aliyun.emr:emr-mns_2.11
487 | com.aliyun.emr:emr-logservice_2.11
488 | com.aliyun.emr:emr-maxcompute_2.11
489 | com.aliyun.emr:emr-ons_2.11
490 | com.aliyun.emr:emr-dts_2.11
491 | com.aliyun.odps:odps-sdk-core
492 | com.aliyun.odps:odps-sdk-commons
493 | com.aliyun.oss:aliyun-sdk-oss
494 | com.aliyun.openservices:aliyun-log
495 | com.aliyun.openservices:loghub-client-lib
496 | com.aliyun.openservices:ons-client
497 | com.aliyun.openservices:ons-api
498 | com.aliyun.mns:aliyun-sdk-mns
499 | com.aliyun.openservices:tablestore
500 | com.alibaba.rocketmq:rocketmq-client
501 | com.alibaba.rocketmq:rocketmq-common
502 | com.alibaba.rocketmq:rocketmq-remoting
503 | com.alibaba:fastjson
504 | com.google.guava:guava
505 | org.aspectj:aspectjrt
506 | com.github.scopt:scopt_2.10
507 | org.jdom:jdom
508 | net.sf.json-lib:json-lib
509 | net.sf.ezmorph:ezmorph
510 | commons-validator:commons-validator
511 | mysql:mysql-connector-java
512 | com.stratio.datasource:spark-mongodb_2.10
513 | redis.clients:jedis
514 | org.apache.commons:commons-pool2
515 | org.apache.hbase:hbase-common
516 | org.apache.hbase:hbase-client
517 | org.apache.hbase:hbase-protocol
518 | org.apache.httpcomponents:httpasyncclient
519 | org.apache.httpcomponents:httpcore-nio
520 | org.apache.httpcomponents:httpcore
521 | org.apache.spark:spark-streaming-kafka-0-10_2.11
522 | org.apache.kafka:kafka-clients
523 | org.apache.kafka:kafka_2.11
524 | org.apache.storm:storm-kafka
525 | org.apache.storm:storm-hdfs
526 | org.apache.storm:storm-perf
527 | commons-lang:commons-lang
528 | org.apache.hadoop:hadoop-hdfs
529 | org.apache.hadoop:hadoop-common
530 | com.101tec:zkclient
531 | com.aliyun.dts:dts-subscribe-sdk
532 | org.apache.commons:commons-lang3
533 |
534 |
535 |
536 |
537 |
538 | package
539 |
540 | shade
541 |
542 |
543 |
544 |
545 |
546 |
547 |
548 |
549 |
550 | central
551 | http://maven.aliyun.com/mvn/repository
552 |
553 | true
554 |
555 |
556 | false
557 |
558 |
559 |
560 | snapshots
561 | http://maven.aliyun.com/mvn/repository
562 |
563 | false
564 |
565 |
566 | true
567 |
568 |
569 |
570 | oss
571 | Maven SNAPSHOT Repository
572 | https://oss.sonatype.org/content/repositories/snapshots/
573 |
574 | false
575 |
576 |
577 | true
578 |
579 |
580 |
581 |
582 |
--------------------------------------------------------------------------------
/resources/The_Sorrows_of_Young_Werther.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/resources/The_Sorrows_of_Young_Werther.txt
--------------------------------------------------------------------------------
/resources/patterns.txt:
--------------------------------------------------------------------------------
1 | \.
2 | \,
3 | \!
4 | to
5 | \"
--------------------------------------------------------------------------------
/resources/student_data.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/resources/student_data.csv
--------------------------------------------------------------------------------
/src/main/hive/sample.hive:
--------------------------------------------------------------------------------
1 | USE DEFAULT;
2 | set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
3 | set mapreduce.job.maps=2;
4 | set mapreduce.job.reduces=2;
5 | set hive.stats.autogather=false;
6 |
7 | DROP TABLE emrusers;
8 | CREATE EXTERNAL TABLE emrusers (
9 | userid INT,
10 | movieid INT,
11 | rating INT,
12 | unixtime STRING )
13 | ROW FORMAT DELIMITED
14 | FIELDS TERMINATED BY '\t'
15 | LOCATION '${hiveconf:inputPath}';
16 |
17 | SELECT COUNT(*) FROM emrusers;
18 |
19 | SELECT * from emrusers limit 100;
20 |
21 | SELECT movieid,count(userid) as usercount from emrusers group by movieid order by usercount desc limit 50;
--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/hadoop/EMapReduceOSSUtil.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.hadoop;
19 |
20 | import org.apache.hadoop.conf.Configuration;
21 |
22 | public class EMapReduceOSSUtil {
23 |
24 | private static String SCHEMA = "oss://";
25 | private static String AKSEP = ":";
26 | private static String BKTSEP = "@";
27 | private static String EPSEP = ".";
28 | private static String HTTP_HEADER = "http://";
29 |
30 | /**
31 | * complete OSS uri
32 | * convert uri like: oss://bucket/path to oss://accessKeyId:accessKeySecret@bucket.endpoint/path
33 | * ossref do not need this
34 | *
35 | * @param oriUri original OSS uri
36 | */
37 | public static String buildOSSCompleteUri(String oriUri, String akId, String akSecret, String endpoint) {
38 | if (akId == null) {
39 | System.err.println("miss accessKeyId");
40 | return oriUri;
41 | }
42 | if (akSecret == null) {
43 | System.err.println("miss accessKeySecret");
44 | return oriUri;
45 | }
46 | if (endpoint == null) {
47 | System.err.println("miss endpoint");
48 | return oriUri;
49 | }
50 |
51 | int index = oriUri.indexOf(SCHEMA);
52 | if (index == -1 || index != 0) {
53 | return oriUri;
54 | }
55 |
56 | int bucketIndex = index + SCHEMA.length();
57 | int pathIndex = oriUri.indexOf("/", bucketIndex);
58 | String bucket = null;
59 | if (pathIndex == -1) {
60 | bucket = oriUri.substring(bucketIndex);
61 | } else {
62 | bucket = oriUri.substring(bucketIndex, pathIndex);
63 | }
64 |
65 | StringBuilder retUri = new StringBuilder();
66 | retUri.append(SCHEMA)
67 | .append(akId)
68 | .append(AKSEP)
69 | .append(akSecret)
70 | .append(BKTSEP)
71 | .append(bucket)
72 | .append(EPSEP)
73 | .append(stripHttp(endpoint));
74 |
75 | if (pathIndex > 0) {
76 | retUri.append(oriUri.substring(pathIndex));
77 | }
78 |
79 | return retUri.toString();
80 | }
81 |
82 | public static String buildOSSCompleteUri(String oriUri, Configuration conf) {
83 | return buildOSSCompleteUri(oriUri, conf.get("fs.oss.accessKeyId"), conf.get("fs.oss.accessKeySecret"), conf.get("fs.oss.endpoint"));
84 | }
85 |
86 | private static String stripHttp(String endpoint) {
87 | if (endpoint.startsWith(HTTP_HEADER)) {
88 | return endpoint.substring(HTTP_HEADER.length());
89 | }
90 | return endpoint;
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
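
A short usage sketch for the utility above: buildOSSCompleteUri expands the short oss://bucket/path form into the credential-embedded OSSURI described in the README. The sketch assumes it lives in the same com.aliyun.emr.example.hadoop package; the key and endpoint values are placeholders.

```java
package com.aliyun.emr.example.hadoop;

import org.apache.hadoop.conf.Configuration;

public class EMapReduceOSSUtilExample {
    public static void main(String[] args) {
        // Explicit-argument overload:
        // oss://bucket/a/b/c.txt -> oss://<akId>:<akSecret>@bucket.<endpoint>/a/b/c.txt
        String full = EMapReduceOSSUtil.buildOSSCompleteUri(
            "oss://bucket/a/b/c.txt",
            "<accessKeyId>", "<accessKeySecret>", "<endpoint>");
        System.out.println(full);

        // Configuration overload: the same values are read from
        // fs.oss.accessKeyId, fs.oss.accessKeySecret and fs.oss.endpoint.
        Configuration conf = new Configuration();
        conf.set("fs.oss.accessKeyId", "<accessKeyId>");
        conf.set("fs.oss.accessKeySecret", "<accessKeySecret>");
        conf.set("fs.oss.endpoint", "<endpoint>");
        System.out.println(EMapReduceOSSUtil.buildOSSCompleteUri("oss://bucket/a/b/c.txt", conf));
    }
}
```
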
/src/main/java/com/aliyun/emr/example/hadoop/WordCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.hadoop;
19 |
20 | import java.io.BufferedReader;
21 | import java.io.FileReader;
22 | import java.io.IOException;
23 | import java.net.URI;
24 | import java.util.ArrayList;
25 | import java.util.HashSet;
26 | import java.util.List;
27 | import java.util.Set;
28 | import java.util.StringTokenizer;
29 |
30 | import org.apache.hadoop.conf.Configuration;
31 | import org.apache.hadoop.fs.Path;
32 | import org.apache.hadoop.io.IntWritable;
33 | import org.apache.hadoop.io.Text;
34 | import org.apache.hadoop.mapreduce.Job;
35 | import org.apache.hadoop.mapreduce.Mapper;
36 | import org.apache.hadoop.mapreduce.Reducer;
37 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
38 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
39 | import org.apache.hadoop.mapreduce.Counter;
40 | import org.apache.hadoop.util.GenericOptionsParser;
41 | import org.apache.hadoop.util.StringUtils;
42 |
43 | public class WordCount {
44 |
45 | public static class TokenizerMapper
 46 |        extends Mapper<Object, Text, Text, IntWritable> {
47 |
48 | static enum CountersEnum { INPUT_WORDS }
49 |
50 | private final static IntWritable one = new IntWritable(1);
51 | private Text word = new Text();
52 |
53 | private boolean caseSensitive;
 54 |     private Set<String> patternsToSkip = new HashSet<String>();
55 |
56 | private Configuration conf;
57 | private BufferedReader fis;
58 |
59 | @Override
60 | public void setup(Context context) throws IOException,
61 | InterruptedException {
62 | conf = context.getConfiguration();
63 | caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
64 | if (conf.getBoolean("wordcount.skip.patterns", false)) {
65 | URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
66 | for (URI patternsURI : patternsURIs) {
67 | Path patternsPath = new Path(patternsURI.getPath());
68 | String patternsFileName = patternsPath.getName();
69 | parseSkipFile(patternsFileName);
70 | }
71 | }
72 | }
73 |
74 | private void parseSkipFile(String fileName) {
75 | try {
76 | fis = new BufferedReader(new FileReader(fileName));
77 | String pattern;
78 | while ((pattern = fis.readLine()) != null) {
79 | patternsToSkip.add(pattern);
80 | }
81 | } catch (IOException ioe) {
82 | System.err.println("Caught exception while parsing the cached file '"
83 | + StringUtils.stringifyException(ioe));
84 | }
85 | }
86 |
87 | @Override
88 | public void map(Object key, Text value, Context context
89 | ) throws IOException, InterruptedException {
90 | String line = (caseSensitive) ?
91 | value.toString() : value.toString().toLowerCase();
92 | for (String pattern : patternsToSkip) {
93 | line = line.replaceAll(pattern, "");
94 | }
95 | StringTokenizer itr = new StringTokenizer(line);
96 | while (itr.hasMoreTokens()) {
97 | word.set(itr.nextToken());
98 | context.write(word, one);
99 | Counter counter = context.getCounter(CountersEnum.class.getName(),
100 | CountersEnum.INPUT_WORDS.toString());
101 | counter.increment(1);
102 | }
103 | }
104 | }
105 |
106 | public static class IntSumReducer
107 |        extends Reducer<Text, IntWritable, Text, IntWritable> {
108 | private IntWritable result = new IntWritable();
109 |
110 |     public void reduce(Text key, Iterable<IntWritable> values,
111 | Context context
112 | ) throws IOException, InterruptedException {
113 | int sum = 0;
114 | for (IntWritable val : values) {
115 | sum += val.get();
116 | }
117 | result.set(sum);
118 | context.write(key, result);
119 | }
120 | }
121 |
122 | public static void main(String[] args) throws Exception {
123 | Configuration conf = new Configuration();
124 | GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
125 | String[] remainingArgs = optionParser.getRemainingArgs();
126 | if (!(remainingArgs.length == 2 || remainingArgs.length == 4)) {
127 |       System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
128 | System.exit(2);
129 | }
130 | Job job = Job.getInstance(conf, "word count");
131 | job.setJarByClass(WordCount.class);
132 | job.setMapperClass(TokenizerMapper.class);
133 | job.setCombinerClass(IntSumReducer.class);
134 | job.setReducerClass(IntSumReducer.class);
135 | job.setOutputKeyClass(Text.class);
136 | job.setOutputValueClass(IntWritable.class);
137 |
138 |     List<String> otherArgs = new ArrayList<String>();
139 | for (int i=0; i < remainingArgs.length; ++i) {
140 | if ("-skip".equals(remainingArgs[i])) {
141 | job.addCacheFile(new Path(EMapReduceOSSUtil.buildOSSCompleteUri(remainingArgs[++i], conf)).toUri());
142 | job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
143 | } else {
144 | otherArgs.add(remainingArgs[i]);
145 | }
146 | }
147 | FileInputFormat.addInputPath(job, new Path(EMapReduceOSSUtil.buildOSSCompleteUri(otherArgs.get(0), conf)));
148 | FileOutputFormat.setOutputPath(job, new Path(EMapReduceOSSUtil.buildOSSCompleteUri(otherArgs.get(1), conf)));
149 |
150 | System.exit(job.waitForCompletion(true) ? 0 : 1);
151 | }
152 | }
153 |
--------------------------------------------------------------------------------
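
Each line of data/patterns.txt is treated as a regular expression by TokenizerMapper.map() above and stripped from the input before tokenizing, which is why the punctuation entries are backslash-escaped. The standalone sketch below reproduces just that filtering step; the sample sentence is made up.

```java
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;

public class SkipPatternSketch {
    public static void main(String[] args) {
        // Same entries as data/patterns.txt: escaped punctuation plus the word "to".
        List<String> patternsToSkip = Arrays.asList("\\.", "\\,", "\\!", "to", "\\\"");
        String line = "Werther, to be sure, wrote to Lotte!";

        // Mirror of TokenizerMapper.map(): strip every pattern, then tokenize.
        for (String pattern : patternsToSkip) {
            line = line.replaceAll(pattern, "");
        }
        StringTokenizer itr = new StringTokenizer(line);
        while (itr.hasMoreTokens()) {
            System.out.println(itr.nextToken());
        }
        // Surviving tokens: Werther, be, sure, wrote, Lotte
    }
}
```
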
/src/main/java/com/aliyun/emr/example/spark/SparkMaxComputeJavaDemo.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark;
19 |
20 | import com.aliyun.odps.TableSchema;
21 | import com.aliyun.odps.data.Record;
22 | import org.apache.spark.SparkConf;
23 | import org.apache.spark.aliyun.odps.OdpsOps;
24 | import org.apache.spark.api.java.JavaRDD;
25 | import org.apache.spark.api.java.JavaSparkContext;
26 | import org.apache.spark.api.java.function.Function2;
27 |
28 | import java.util.ArrayList;
29 | import java.util.List;
30 |
31 | public class SparkMaxComputeJavaDemo {
32 |
33 | public static void main(String[] args) {
34 | String partition = null;
35 | String accessId = args[0];
36 | String accessKey = args[1];
37 |
38 | String odpsUrl = args[2];
39 |
40 | String tunnelUrl = args[3];
41 | String project = args[4];
42 | String table = args[5];
43 | if (args.length > 6) {
44 | partition = args[6];
45 | }
46 |
47 | SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
48 | JavaSparkContext jsc = new JavaSparkContext(sparkConf);
49 |
50 | OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
51 |
52 | System.out.println("Read odps table...");
 53 |         JavaRDD<List<Long>> readData = odpsOps.readTableWithJava(project, table, new RecordToLongs(), Integer.valueOf(partition));
54 |
55 | System.out.println("counts: ");
56 | System.out.println(readData.count());
57 | }
58 |
 59 |     static class RecordToLongs implements Function2<Record, TableSchema, List<Long>> {
60 | @Override
 61 |         public List<Long> call(Record record, TableSchema schema) throws Exception {
 62 |             List<Long> ret = new ArrayList<Long>();
63 | for (int i = 0; i < schema.getColumns().size(); i++) {
64 | ret.add(record.getBigint(i));
65 | }
66 | return ret;
67 | }
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
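
The RecordToLongs transform above assumes every column in the ODPS table is a BIGINT (record.getBigint). If the table mixes column types, a transform along the lines below, which is a sketch and not part of this repository, renders each column as a string instead; it plugs into odpsOps.readTableWithJava in exactly the same way.

```java
import com.aliyun.odps.TableSchema;
import com.aliyun.odps.data.Record;
import org.apache.spark.api.java.function.Function2;

import java.util.ArrayList;
import java.util.List;

// Hypothetical variant of RecordToLongs: render every column of an ODPS record as a string.
class RecordToStrings implements Function2<Record, TableSchema, List<String>> {
    @Override
    public List<String> call(Record record, TableSchema schema) throws Exception {
        List<String> ret = new ArrayList<String>();
        for (int i = 0; i < schema.getColumns().size(); i++) {
            // record.get(i) returns the raw column value as an Object; stringify it.
            ret.add(String.valueOf(record.get(i)));
        }
        return ret;
    }
}
```
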
/src/main/java/com/aliyun/emr/example/spark/SparkOssJavaDemo.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark;
19 |
20 | import org.apache.hadoop.io.LongWritable;
21 | import org.apache.hadoop.io.Text;
22 | import org.apache.hadoop.mapred.TextInputFormat;
23 | import org.apache.spark.SparkConf;
24 | import org.apache.spark.api.java.JavaPairRDD;
25 | import org.apache.spark.api.java.JavaSparkContext;
26 |
27 | public class SparkOssJavaDemo {
28 |
29 | public static void main(String[] args) {
30 |
31 | String accessId = args[0];
32 | String accessKey = args[1];
33 |
34 | String endpoint = args[2];
35 |
36 | String inputPath = args[3];
37 | String outputPath = args[4];
38 | int partition = Integer.valueOf(args[5]);
39 |
40 | SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 2-2: Spark Oss Demo (Java)").setMaster("local[4]");
41 | sparkConf.set("spark.hadoop.fs.oss.accessKeyId", accessId);
42 | sparkConf.set("spark.hadoop.fs.oss.accessKeySecret", accessKey);
43 | sparkConf.set("spark.hadoop.fs.oss.endpoint", endpoint);
44 | sparkConf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem");
45 | sparkConf.set("spark.hadoop.mapreduce.job.run-local", "true");
46 | JavaSparkContext jsc = new JavaSparkContext(sparkConf);
47 |
 48 |         JavaPairRDD<LongWritable, Text> data = jsc.hadoopFile(inputPath, TextInputFormat.class, LongWritable.class, Text.class, partition);
49 |
50 | System.out.println("Count (data): " + String.valueOf(data.count()));
51 |
52 | data.saveAsTextFile(outputPath);
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/SparkTableStoreJavaDemo.java:
--------------------------------------------------------------------------------
1 | package com.aliyun.emr.example.spark;
2 |
3 | import com.alicloud.openservices.tablestore.ecosystem.ComputeParameters;
4 | import com.alicloud.openservices.tablestore.ecosystem.Filter;
5 | import com.alicloud.openservices.tablestore.model.*;
6 | import com.aliyun.openservices.tablestore.hadoop.*;
7 | import org.apache.hadoop.conf.Configuration;
8 | import org.apache.spark.SparkConf;
9 | import org.apache.spark.api.java.JavaPairRDD;
10 | import org.apache.spark.api.java.JavaSparkContext;
11 |
12 | import java.util.ArrayList;
13 | import java.util.Formatter;
14 | import java.util.List;
15 |
16 | public class SparkTableStoreJavaDemo {
17 | private static RangeRowQueryCriteria fetchCriteria(String tableName, String columnName) {
18 | RangeRowQueryCriteria res = new RangeRowQueryCriteria(tableName);
19 | res.setMaxVersions(1);
 20 |         List<PrimaryKeyColumn> lower = new ArrayList<PrimaryKeyColumn>();
 21 |         List<PrimaryKeyColumn> upper = new ArrayList<PrimaryKeyColumn>();
22 | lower.add(new PrimaryKeyColumn(columnName, PrimaryKeyValue.INF_MIN));
23 | upper.add(new PrimaryKeyColumn(columnName, PrimaryKeyValue.INF_MAX));
24 | res.setInclusiveStartPrimaryKey(new PrimaryKey(lower));
25 | res.setExclusiveEndPrimaryKey(new PrimaryKey(upper));
26 | return res;
27 | }
28 |
29 | public static void main(String[] args) {
30 | String accessKeyId = args[0];
31 | String accessKeySecret = args[1];
32 | Filter filter = new Filter(Filter.CompareOperator.GREATER_THAN,"PK", ColumnValue.fromLong(-1000));
 33 |         List<String> list = new ArrayList<>();
34 | list.add("VALUE");
35 | TableStoreFilterWritable tableStoreFilterWritable = new TableStoreFilterWritable(filter, list);
36 |
37 | String endpoint = args[2];
38 | String instance = args[3];
39 | String tableName = args[4];
40 | String primaryKeyColumnName = args[5];
41 | ComputeParams computeParams = new ComputeParams(100, 1, ComputeParameters.ComputeMode.Auto.name());
42 | SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 5: Spark TableStore Demo (Java)");
43 | JavaSparkContext sc = null;
44 | try {
45 | sc = new JavaSparkContext(sparkConf);
46 | Configuration hadoopConf = new Configuration();
47 | hadoopConf.set("computeParams", computeParams.serialize());
48 | hadoopConf.set("tableName", tableName);
49 | hadoopConf.set("filters", tableStoreFilterWritable.serialize());
50 | TableStore.setCredential(
51 | hadoopConf,
52 | new Credential(accessKeyId, accessKeySecret, null));
53 | Endpoint ep = new Endpoint(endpoint, instance);
54 | TableStore.setEndpoint(hadoopConf, ep);
55 | com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.addCriteria(hadoopConf,
56 | fetchCriteria(tableName, primaryKeyColumnName));
 57 |             JavaPairRDD<PrimaryKeyWritable, RowWritable> rdd = sc.newAPIHadoopRDD(
58 | hadoopConf, com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.class,
59 | PrimaryKeyWritable.class, RowWritable.class);
60 | System.out.println(
61 | new Formatter().format("TOTAL: %d", rdd.count()).toString());
62 | rdd.take(10).forEach((primaryKeyWritableRowWritableTuple2) -> {
63 | System.out.println(String.format("Key: %s, VALUE: %s",
64 | primaryKeyWritableRowWritableTuple2._1.getPrimaryKey().toString(),
65 | primaryKeyWritableRowWritableTuple2._2.getRow().toString()));
66 | });
67 | } finally {
68 | if (sc != null) {
69 | sc.close();
70 | }
71 | }
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/sql/streaming/SparkSLSContinuousStructuredStreamingJavaDemo.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package com.aliyun.emr.example.spark.sql.streaming;
18 |
19 | import org.apache.spark.sql.Dataset;
20 | import org.apache.spark.sql.Encoders;
21 | import org.apache.spark.sql.SparkSession;
22 | import org.apache.spark.sql.streaming.StreamingQuery;
23 | import org.apache.spark.sql.streaming.Trigger;
24 |
25 | import java.util.UUID;
26 |
27 | public class SparkSLSContinuousStructuredStreamingJavaDemo {
28 |
29 | public static void main(String[] args) throws Exception {
30 | if (args.length < 7) {
31 | System.err.println("Usage: SparkSLSContinuousStructuredStreamingJavaDemo " +
32 | " " +
33 | " []");
34 | System.exit(1);
35 | }
36 |
37 | String logProject = args[0];
38 | String logStore = args[1];
39 | String accessKeyId = args[2];
40 | String accessKeySecret = args[3];
41 | String endpoint = args[4];
42 | String startingOffsets = args[5];
43 | String maxOffsetsPerTrigger = args[6];
44 | String checkpointLocation = "/tmp/temporary-" + UUID.randomUUID().toString();
45 | if (args.length > 7) {
46 | checkpointLocation = args[7];
47 | }
48 |
49 | SparkSession spark = SparkSession
50 | .builder()
51 | .master("local[5]")
52 | .appName("E-MapReduce Demo 6-6: Spark SLS Demo (Java)")
53 | .getOrCreate();
54 |
55 | spark.sparkContext().setLogLevel("WARN");
56 |
57 | Dataset<String> lines = spark.readStream()
58 | .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider")
59 | .option("sls.project", logProject)
60 | .option("sls.store", logStore)
61 | .option("access.key.id", accessKeyId)
62 | .option("access.key.secret", accessKeySecret)
63 | .option("endpoint", endpoint)
64 | .option("startingoffsets", startingOffsets)
65 | .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger)
66 | .load()
67 | .selectExpr("CAST(__value__ AS STRING)")
68 | .as(Encoders.STRING());
69 |
70 | // Start running the query that prints the running counts to the console
71 | StreamingQuery query = lines.writeStream()
72 | .outputMode("append")
73 | .format("console")
74 | .option("checkpointLocation", checkpointLocation)
75 | .trigger(Trigger.Continuous("5 second"))
76 | .start();
77 |
78 | query.awaitTermination();
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
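The query above uses Trigger.Continuous, Spark's experimental continuous processing mode: lower latency, at-least-once guarantees, and only simple map-like operations (such as the CAST projection here) are supported. Switching back to ordinary micro-batch execution only changes the trigger. A small self-contained sketch against Spark's built-in rate source, so it runs without the SLS connector (the source name and option are standard Spark; the rest is illustrative):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.Trigger;

public class ProcessingTimeTriggerSketch {
    public static void main(String[] args) throws Exception {
        SparkSession spark = SparkSession.builder()
            .master("local[2]")
            .appName("processing-time trigger sketch")
            .getOrCreate();

        // The rate source emits (timestamp, value) rows at a fixed rate; handy for trying out sinks and triggers.
        Dataset<Row> rate = spark.readStream()
            .format("rate")
            .option("rowsPerSecond", "10")
            .load();

        StreamingQuery query = rate.writeStream()
            .outputMode("append")
            .format("console")
            .trigger(Trigger.ProcessingTime("5 seconds"))  // micro-batch every 5 seconds instead of Trigger.Continuous
            .start();

        query.awaitTermination();
    }
}

--------------------------------------------------------------------------------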
/src/main/java/com/aliyun/emr/example/spark/sql/streaming/SparkSLSStructuredStreamingJavaDemo.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package com.aliyun.emr.example.spark.sql.streaming;
18 |
19 | import org.apache.spark.api.java.function.FlatMapFunction;
20 | import org.apache.spark.sql.Dataset;
21 | import org.apache.spark.sql.Encoders;
22 | import org.apache.spark.sql.Row;
23 | import org.apache.spark.sql.SparkSession;
24 | import org.apache.spark.sql.streaming.StreamingQuery;
25 |
26 | import java.util.Arrays;
27 | import java.util.UUID;
28 |
29 | public class SparkSLSStructuredStreamingJavaDemo {
30 |
31 | public static void main(String[] args) throws Exception {
32 | if (args.length < 7) {
33 | System.err.println("Usage: SparkSLSStructuredStreamingJavaDemo " +
34 | " " +
35 | " []");
36 | System.exit(1);
37 | }
38 |
39 | String logProject = args[0];
40 | String logStore = args[1];
41 | String accessKeyId = args[2];
42 | String accessKeySecret = args[3];
43 | String endpoint = args[4];
44 | String startingOffsets = args[5];
45 | String maxOffsetsPerTrigger = args[6];
46 | String checkpointLocation = "/tmp/temporary-" + UUID.randomUUID().toString();
47 | if (args.length > 7) {
48 | checkpointLocation = args[7];
49 | }
50 |
51 | SparkSession spark = SparkSession
52 | .builder()
53 | .master("local[5]")
54 | .appName("E-MapReduce Demo 6-4: Spark SLS Demo (Java)")
55 | .getOrCreate();
56 |
57 | spark.sparkContext().setLogLevel("WARN");
58 |
59 | Dataset<String> lines = spark.readStream()
60 | .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider")
61 | .option("sls.project", logProject)
62 | .option("sls.store", logStore)
63 | .option("access.key.id", accessKeyId)
64 | .option("access.key.secret", accessKeySecret)
65 | .option("endpoint", endpoint)
66 | .option("startingoffsets", startingOffsets)
67 | .option("zookeeper.connect.address", "localhost:2181")
68 | .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger)
69 | .load()
70 | .selectExpr("CAST(__value__ AS STRING)")
71 | .as(Encoders.STRING());
72 |
73 | // Generate running word count
74 | Dataset<Row> wordCounts = lines.flatMap(
75 | (FlatMapFunction<String, String>) x -> Arrays.asList(x.split(" ")).iterator(),
76 | Encoders.STRING()).groupBy("value").count();
77 |
78 | // Start running the query that prints the running counts to the console
79 | StreamingQuery query = wordCounts.writeStream()
80 | .outputMode("complete")
81 | .format("console")
82 | .option("checkpointLocation", checkpointLocation)
83 | .start();
84 |
85 | query.awaitTermination();
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
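The streaming query above keeps a running word count over everything read from SLS so far (hence outputMode "complete"). The aggregation itself is plain flatMap + groupBy("value") + count, which is easiest to inspect on a batch Dataset. A short sketch on in-memory data (the sample strings are illustrative):

import java.util.Arrays;

import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class BatchWordCountSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[2]")
            .appName("batch word count sketch")
            .getOrCreate();

        // Two sample "log lines" standing in for the CAST(__value__ AS STRING) column.
        Dataset<String> lines = spark.createDataset(
            Arrays.asList("hello sls", "hello spark"), Encoders.STRING());

        Dataset<Row> wordCounts = lines
            .flatMap((FlatMapFunction<String, String>) s -> Arrays.asList(s.split(" ")).iterator(), Encoders.STRING())
            .groupBy("value")
            .count();

        wordCounts.show();  // columns: value | count
        spark.stop();
    }
}

--------------------------------------------------------------------------------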
/src/main/java/com/aliyun/emr/example/spark/streaming/JavaLoghubWordCount.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming;
19 |
20 | import org.apache.spark.SparkConf;
21 | import org.apache.spark.api.java.function.FlatMapFunction;
22 | import org.apache.spark.api.java.function.Function;
23 | import org.apache.spark.api.java.function.Function2;
24 | import org.apache.spark.api.java.function.PairFunction;
25 | import org.apache.spark.storage.StorageLevel;
26 | import org.apache.spark.streaming.Duration;
27 | import org.apache.spark.streaming.aliyun.logservice.LoghubUtils;
28 | import org.apache.spark.streaming.api.java.JavaDStream;
29 | import org.apache.spark.streaming.api.java.JavaPairDStream;
30 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
31 | import scala.Tuple2;
32 |
33 | import java.util.Arrays;
34 | import java.util.Iterator;
35 | import java.util.regex.Pattern;
36 |
37 | public class JavaLoghubWordCount {
38 | private static final Pattern SPACE = Pattern.compile(" ");
39 |
40 | public static void main(String[] args) throws InterruptedException {
41 | if (args.length < 6) {
42 | System.err.println("Usage: bin/spark-submit --class JavaLoghubWordCount " +
43 | "examples-1.0-SNAPSHOT-shaded.jar " +
44 | " ");
45 | System.exit(1);
46 | }
47 |
48 | String loghubProject = args[0];
49 | String logStore = args[1];
50 | String loghubGroupName = args[2];
51 | String endpoint = args[3];
52 | String accessKeyId = args[4];
53 | String accessKeySecret = args[5];
54 |
55 | SparkConf conf = new SparkConf().setAppName("Loghub Sample");
56 | JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(2000));
57 | JavaDStream<byte[]> lines = LoghubUtils.createStream(
58 | jssc,
59 | loghubProject,
60 | logStore,
61 | loghubGroupName,
62 | endpoint,
63 | 1,
64 | accessKeyId,
65 | accessKeySecret,
66 | StorageLevel.MEMORY_AND_DISK());
67 |
68 | JavaDStream<String> words = lines.map(new Function<byte[], String>() {
69 | @Override
70 | public String call(byte[] v1) throws Exception {
71 | return new String(v1);
72 | }
73 | }).flatMap(new FlatMapFunction<String, String>() {
74 | @Override
75 | public Iterator<String> call(String s) {
76 | return Arrays.asList(SPACE.split(s)).iterator();
77 | }
78 | });
79 | JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
80 | new PairFunction<String, String, Integer>() {
81 | @Override
82 | public Tuple2<String, Integer> call(String s) {
83 | return new Tuple2<>(s, 1);
84 | }
85 | }).reduceByKey(new Function2<Integer, Integer, Integer>() {
86 | @Override
87 | public Integer call(Integer i1, Integer i2) {
88 | return i1 + i2;
89 | }
90 | });
91 |
92 | wordCounts.print();
93 | jssc.start();
94 | jssc.awaitTermination();
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
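JavaLoghubWordCount spells out the map/flatMap/mapToPair/reduceByKey chain with anonymous inner classes. On Java 8+ the same DStream pipeline reads more compactly with lambdas. The sketch below swaps the Loghub receiver for a queue-backed test stream so it runs without SLS credentials (the sample payloads are illustrative):

import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

public class LambdaWordCountSketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("lambda word count sketch").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(2000));

        // Stand-in for the Loghub stream: a queue-backed DStream of byte[] payloads.
        Queue<JavaRDD<byte[]>> queue = new LinkedList<>();
        queue.add(jssc.sparkContext().parallelize(
            Arrays.asList("hello loghub".getBytes(), "hello spark".getBytes())));
        JavaDStream<byte[]> lines = jssc.queueStream(queue);

        // Same pipeline as JavaLoghubWordCount, written with lambdas.
        JavaPairDStream<String, Integer> wordCounts = lines
            .map(bytes -> new String(bytes))
            .flatMap(line -> Arrays.asList(line.split(" ")).iterator())
            .mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey(Integer::sum);

        wordCounts.print();
        jssc.start();
        jssc.awaitTermination();
    }
}

--------------------------------------------------------------------------------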
/src/main/java/com/aliyun/emr/example/spark/streaming/SparkMNSJavaDemo.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming;
19 |
20 | import com.google.common.collect.Lists;
21 | import org.apache.spark.SparkConf;
22 | import org.apache.spark.api.java.function.FlatMapFunction;
23 | import org.apache.spark.api.java.function.Function;
24 | import org.apache.spark.api.java.function.Function2;
25 | import org.apache.spark.api.java.function.PairFunction;
26 | import org.apache.spark.storage.StorageLevel;
27 | import org.apache.spark.streaming.Duration;
28 | import org.apache.spark.streaming.aliyun.mns.MnsUtils;
29 | import org.apache.spark.streaming.api.java.*;
30 | import scala.Tuple2;
31 |
32 | import java.util.Iterator;
33 | import java.util.regex.Pattern;
34 |
35 | public class SparkMNSJavaDemo {
36 | private static final Pattern SPACE = Pattern.compile(" ");
37 |
38 | public static void main(String[] args) throws InterruptedException {
39 | if (args.length < 4) {
40 | System.err.println("Usage: bin/spark-submit --class SparkMNSJavaDemo examples-1.0-SNAPSHOT-shaded.jar " +
41 | " ");
42 | System.exit(1);
43 | }
44 |
45 | String queueName = args[0];
46 | String accessKeyId = args[1];
47 | String accessKeySecret = args[2];
48 | String endpoint = args[3];
49 |
50 | SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 8-2: Spark MNS Demo (Java)").setMaster("local[4]");
51 | sparkConf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem");
52 | sparkConf.set("spark.hadoop.mapreduce.job.run-local", "true");
53 | // Create the context with 2 seconds batch size
54 | JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));
55 |
56 | JavaReceiverInputDStream<byte[]> lines = MnsUtils.createPullingStreamAsBytes(jssc, queueName, accessKeyId,
57 | accessKeySecret, endpoint, StorageLevel.MEMORY_AND_DISK());
58 |
59 | JavaDStream<String> words = lines.map(new Function<byte[], String>() {
60 | @Override
61 | public String call(byte[] v1) throws Exception {
62 | return new String(v1);
63 | }
64 | }).flatMap(new FlatMapFunction<String, String>() {
65 | @Override
66 | public Iterator<String> call(String x) {
67 | return Lists.newArrayList(SPACE.split(x)).iterator();
68 | }
69 | });
70 | JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
71 | new PairFunction<String, String, Integer>() {
72 | @Override
73 | public Tuple2<String, Integer> call(String s) {
74 | return new Tuple2<>(s, 1);
75 | }
76 | }).reduceByKey(new Function2<Integer, Integer, Integer>() {
77 | @Override
78 | public Integer call(Integer i1, Integer i2) {
79 | return i1 + i2;
80 | }
81 | });
82 |
83 | wordCounts.print();
84 | jssc.start();
85 | jssc.awaitTermination();
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/streaming/SparkRocketMQJavaDemo.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming;
19 |
20 | import com.aliyun.openservices.ons.api.Message;
21 | import com.google.common.collect.Lists;
22 | import org.apache.spark.SparkConf;
23 | import org.apache.spark.api.java.function.FlatMapFunction;
24 | import org.apache.spark.api.java.function.Function;
25 | import org.apache.spark.api.java.function.Function2;
26 | import org.apache.spark.api.java.function.PairFunction;
27 | import org.apache.spark.storage.StorageLevel;
28 | import org.apache.spark.streaming.Duration;
29 | import org.apache.spark.streaming.aliyun.ons.OnsUtils;
30 | import org.apache.spark.streaming.api.java.JavaDStream;
31 | import org.apache.spark.streaming.api.java.JavaPairDStream;
32 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
33 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
34 | import scala.Tuple2;
35 |
36 | import java.util.Iterator;
37 | import java.util.regex.Pattern;
38 |
39 | public class SparkRocketMQJavaDemo {
40 | private static final Pattern SPACE = Pattern.compile(" ");
41 |
42 | public static void main(String[] args) throws InterruptedException {
43 | if (args.length < 5) {
44 | System.err.println("Usage: spark-submit --class com.aliyun.emr.example.spark.streaming.SparkRocketMQJavaDemo " +
45 | "examples-1.0-SNAPSHOT-shaded.jar " +
46 | " ");
47 | System.exit(1);
48 | }
49 |
50 | String accessKeyId = args[0];
51 | String accessKeySecret = args[1];
52 | String consumerId = args[2];
53 | String topic = args[3];
54 | String subExpression = args[4];
55 |
56 | SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 4-2: Spark RocketMQ Demo (Java)");
57 | // Create the context with 2 seconds batch size
58 | JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));
59 |
60 | JavaReceiverInputDStream<byte[]> lines = OnsUtils.createStream(jssc, consumerId, topic, subExpression,
61 | accessKeyId, accessKeySecret, StorageLevel.MEMORY_AND_DISK(), new Function<Message, byte[]>() {
62 | @Override
63 | public byte[] call(Message msg) throws Exception {
64 | return msg.getBody();
65 | }
66 | });
67 |
68 | JavaDStream<String> words = lines.map(new Function<byte[], String>() {
69 | @Override
70 | public String call(byte[] v1) throws Exception {
71 | return new String(v1);
72 | }
73 | }).flatMap(new FlatMapFunction<String, String>() {
74 | @Override
75 | public Iterator<String> call(String x) {
76 | return Lists.newArrayList(SPACE.split(x)).iterator();
77 | }
78 | });
79 | JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
80 | new PairFunction<String, String, Integer>() {
81 | @Override
82 | public Tuple2<String, Integer> call(String s) {
83 | return new Tuple2<>(s, 1);
84 | }
85 | }).reduceByKey(new Function2<Integer, Integer, Integer>() {
86 | @Override
87 | public Integer call(Integer i1, Integer i2) {
88 | return i1 + i2;
89 | }
90 | });
91 |
92 | wordCounts.print();
93 | jssc.start();
94 | jssc.awaitTermination();
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/StormKafkaSample.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.storm;
19 |
20 | import org.apache.storm.Config;
21 | import org.apache.storm.LocalCluster;
22 | import org.apache.storm.StormSubmitter;
23 | import org.apache.storm.generated.AlreadyAliveException;
24 | import org.apache.storm.generated.AuthorizationException;
25 | import org.apache.storm.generated.InvalidTopologyException;
26 | import org.apache.storm.hdfs.bolt.HdfsBolt;
27 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
28 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
29 | import org.apache.storm.hdfs.bolt.format.FileNameFormat;
30 | import org.apache.storm.hdfs.bolt.format.RecordFormat;
31 | import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy;
32 | import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy;
33 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
34 | import org.apache.storm.hdfs.bolt.sync.SyncPolicy;
35 | import org.apache.storm.kafka.KafkaSpout;
36 | import org.apache.storm.kafka.SpoutConfig;
37 | import org.apache.storm.kafka.StringScheme;
38 | import org.apache.storm.kafka.ZkHosts;
39 | import org.apache.storm.spout.SchemeAsMultiScheme;
40 | import org.apache.storm.topology.TopologyBuilder;
41 |
42 | import java.util.ArrayList;
43 | import java.util.List;
44 |
45 | public class StormKafkaSample {
46 | public static void main(String[] args) throws AuthorizationException {
47 | String topic = args[0] ;
48 | String zk = args[1];
49 | String hdfsUrl = args[2];
50 | ZkHosts zkHosts = new ZkHosts(zk + ":2181/kafka-1.0.0");
51 | SpoutConfig spoutConfig = new SpoutConfig(zkHosts, topic, "/kafka-1.0.0", "MyTrack") ;
52 | List<String> zkServers = new ArrayList<>();
53 | zkServers.add(zk);
54 | spoutConfig.zkServers = zkServers;
55 | spoutConfig.zkPort = 2181;
56 | spoutConfig.socketTimeoutMs = 60 * 1000 ;
57 | spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme()) ;
58 |
59 | // use "|" instead of "," for field delimiter
60 | RecordFormat format = new DelimitedRecordFormat()
61 | .withFieldDelimiter("|");
62 |
63 | // sync the filesystem after every 1k tuples
64 | SyncPolicy syncPolicy = new CountSyncPolicy(1000);
65 |
66 | // rotate files when they reach 5MB
67 | FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, FileSizeRotationPolicy.Units.MB);
68 |
69 | FileNameFormat fileNameFormat = new DefaultFileNameFormat().withPath("/foo/");
70 |
71 | HdfsBolt bolt = new HdfsBolt()
72 | .withFsUrl(hdfsUrl)
73 | .withFileNameFormat(fileNameFormat)
74 | .withRecordFormat(format)
75 | .withRotationPolicy(rotationPolicy)
76 | .withSyncPolicy(syncPolicy);
77 |
78 | TopologyBuilder builder = new TopologyBuilder() ;
79 | builder.setSpout("spout", new KafkaSpout(spoutConfig) ,2) ;
80 | builder.setBolt("bolt", bolt, 1).shuffleGrouping("spout") ;
81 |
82 | Config conf = new Config ();
83 | conf.setDebug(false) ;
84 |
85 | if (args.length > 3) {
86 | try {
87 | StormSubmitter.submitTopology(args[3], conf, builder.createTopology());
88 | } catch (AlreadyAliveException e) {
89 | e.printStackTrace();
90 | } catch (InvalidTopologyException e) {
91 | e.printStackTrace();
92 | }
93 | } else {
94 | LocalCluster localCluster = new LocalCluster();
95 | localCluster.submitTopology("mytopology", conf, builder.createTopology());
96 | }
97 |
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
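StormKafkaSample rotates its HDFS output files by size (5 MB) and syncs every 1000 tuples. If time-based rotation is preferred, storm-hdfs also ships a TimedRotationPolicy that drops into the same builder chain; a sketch, assuming the TimedRotationPolicy class from the storm-hdfs dependency used above (the HDFS URL is illustrative):

import org.apache.storm.hdfs.bolt.HdfsBolt;
import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
import org.apache.storm.hdfs.bolt.rotation.TimedRotationPolicy;
import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;

public class TimedRotationSketch {
    public static void main(String[] args) {
        // Rotate output files every 10 minutes instead of every 5 MB; keep syncing after every 1000 tuples.
        HdfsBolt bolt = new HdfsBolt()
            .withFsUrl("hdfs://emr-header-1:9000")
            .withFileNameFormat(new DefaultFileNameFormat().withPath("/foo/"))
            .withRecordFormat(new DelimitedRecordFormat().withFieldDelimiter("|"))
            .withSyncPolicy(new CountSyncPolicy(1000))
            .withRotationPolicy(new TimedRotationPolicy(10.0f, TimedRotationPolicy.TimeUnit.MINUTES));
        System.out.println("configured HdfsBolt: " + bolt);
    }
}

--------------------------------------------------------------------------------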
/src/main/java/com/aliyun/emr/example/storm/benchmark/AbstractTopology.java:
--------------------------------------------------------------------------------
1 | package com.aliyun.emr.example.storm.benchmark;
2 |
3 | import com.aliyun.emr.example.storm.benchmark.util.Helper;
4 | import org.apache.commons.lang.StringUtils;
5 | import org.apache.storm.Config;
6 | import org.apache.storm.LocalCluster;
7 | import org.apache.storm.StormSubmitter;
8 | import org.apache.storm.generated.StormTopology;
9 |
10 | import java.io.BufferedInputStream;
11 | import java.io.FileInputStream;
12 | import java.io.InputStream;
13 | import java.io.Serializable;
14 | import java.util.HashMap;
15 | import java.util.Map;
16 | import java.util.Properties;
17 |
18 | abstract public class AbstractTopology implements Serializable{
19 | protected Properties configure;
20 |
21 | public void init(String configFilepath) throws Exception {
22 | init(configFilepath, "");
23 | }
24 |
25 | public void init(String configFilepath, String properties) throws Exception {
26 | InputStream in = new BufferedInputStream(new FileInputStream(configFilepath));
27 | configure = new Properties();
28 | configure.load(in);
29 |
30 | if (! StringUtils.isBlank(properties)) {
31 | Map<String, String> customProperty = new HashMap<>();
32 | for (String item : properties.split(",")) {
33 | String[] kv = item.split("=");
34 | if (kv.length != 2) {
35 | System.out.println("invalid property[" + item + "], pattern should be k1=v2,k2=v2...");
36 | continue;
37 | }
38 | customProperty.put(kv[0], kv[1]);
39 | }
40 | configure.putAll(customProperty);
41 | }
42 |
43 | System.out.println("all configure: " + configure);
44 | }
45 |
46 | public void run(boolean cluster) throws Exception {
47 | String name = configure.getProperty("name");
48 | Config conf = new Config();
49 |
50 | if (!cluster) {
51 | new LocalCluster().submitTopology("local-" + name, conf, createTopology());
52 | return;
53 | }
54 |
55 | int slots = Integer.valueOf(configure.getProperty("worker.slot.number"));
56 | int clusterNodes = Integer.valueOf(configure.getProperty("cluster.worker.node.number"));
57 | int workerNumber = slots * clusterNodes;
58 | int clusterNodeMemoryMb = Integer.valueOf(configure.getProperty("cluster.memory.per.node.mb"));
59 | int workerMem = clusterNodeMemoryMb / slots;
60 | conf.setNumWorkers(workerNumber);
61 | if (!Boolean.valueOf(configure.getProperty("ack.open"))) {
62 | conf.setNumAckers(0);
63 | }
64 |
65 | conf.put("worker.heap.memory.mb", workerMem);
66 | conf.put("topology.backpressure.enable", Boolean.valueOf(configure.getProperty("backpressure.enable")));
67 | StormSubmitter.submitTopologyWithProgressBar(name, conf, createTopology());
68 | Helper.setupShutdownHook(name); // handle Ctrl-C
69 |
70 | System.out.println("**********metrics will begin in two minute, please start to send source data to warn up**********");
71 | for (int i = 0; i< 2; i++) {
72 | Thread.sleep(1000 * 60);
73 | System.out.println("...");
74 | }
75 | System.out.println("********** start metrics **********");
76 | Helper.collectMetrics(name, 60);
77 | }
78 |
79 | abstract StormTopology createTopology();
80 | }
81 |
--------------------------------------------------------------------------------
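For concreteness on the sizing math in run(): with illustrative values worker.slot.number=4, cluster.worker.node.number=3 and cluster.memory.per.node.mb=8192, the topology is submitted with 4 * 3 = 12 workers, each given 8192 / 4 = 2048 MB of heap via "worker.heap.memory.mb"; ackers are disabled whenever ack.open is false.

--------------------------------------------------------------------------------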
/src/main/java/com/aliyun/emr/example/storm/benchmark/BasicTopology.java:
--------------------------------------------------------------------------------
1 | package com.aliyun.emr.example.storm.benchmark;
2 |
3 | import com.google.common.collect.ImmutableMap;
4 | import kafka.api.OffsetRequest;
5 | import org.apache.storm.generated.StormTopology;
6 | import org.apache.storm.kafka.*;
7 | import org.apache.storm.kafka.bolt.KafkaBolt;
8 | import org.apache.storm.kafka.bolt.mapper.TupleToKafkaMapper;
9 | import org.apache.storm.kafka.bolt.selector.DefaultTopicSelector;
10 | import org.apache.storm.topology.*;
11 | import org.apache.storm.topology.base.BaseBasicBolt;
12 | import org.apache.storm.tuple.Fields;
13 | import org.apache.storm.tuple.Tuple;
14 | import org.apache.storm.tuple.Values;
15 |
16 | import java.util.Arrays;
17 | import java.util.Properties;
18 |
19 | public class BasicTopology extends AbstractTopology {
20 |
21 | @Override
22 | StormTopology createTopology() {
23 | TopologyBuilder builder = new TopologyBuilder();
24 | setSpout(builder);
25 | setBolt(builder);
26 | return builder.createTopology();
27 | }
28 |
29 | private void setSpout(TopologyBuilder builder) {
30 | String consumerGroup = configure.getProperty("consumer.group");
31 | SpoutConfig conf = new SpoutConfig(new ZkHosts(
32 | configure.getProperty("zookeeper.address") + ":2181" + configure.getProperty("zookeeper.root")),
33 | configure.getProperty("topic"), configure.getProperty("zookeeper.root"), consumerGroup);
34 | conf.zkPort = 2181;
35 | conf.zkServers= Arrays.asList(configure.getProperty("zookeeper.address"));
36 | conf.socketTimeoutMs = 60 * 1000;
37 | conf.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme());
38 | conf.startOffsetTime= OffsetRequest.LatestTime();
39 | conf.ignoreZkOffsets = true;
40 | KafkaSpout spout = new KafkaSpout(conf);
41 |
42 | int kafkaPartition = Integer.valueOf(configure.getProperty("partition.number"));
43 | builder.setSpout("spout", spout, kafkaPartition);
44 | }
45 |
46 | protected void setBolt(TopologyBuilder builder) {
47 | int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total"));
48 | int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number"));
49 |
50 |
51 | //inter bolt
52 | //builder.setBolt("inter-bolt", getInterBolt(), availableCores).localOrShuffleGrouping("spout");
53 |
54 | //kafka storm-bolt
55 | builder.setBolt("kafka-bolt", getKafkaBolt(), availableCores).localOrShuffleGrouping("spout");
56 | }
57 |
58 | private IBasicBolt getInterBolt() {
59 | return new BaseBasicBolt() {
60 | @Override
61 | public void execute(Tuple input, BasicOutputCollector collector) {
62 | collector.emit(new Values(input));
63 | }
64 |
65 | @Override
66 | public void declareOutputFields(OutputFieldsDeclarer declarer) {
67 | declarer.declare(new Fields("inter-bolt"));
68 | }
69 | };
70 | }
71 |
72 | private IRichBolt getKafkaBolt() {
73 | Properties properties = new Properties();
74 | properties.put("bootstrap.servers", configure.getProperty("result.broker.list"));
75 | properties.put("acks", "0");
76 | properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
77 | properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
78 | // consume too much memory
79 | //properties.put("batch.size", "10485760");
80 | //properties.put("max.request", "10485760");
81 | //properties.put("send.buffer.bytes", "1000000");
82 | KafkaBolt<String, String> bolt = new KafkaBolt<String, String>()
83 | .withProducerProperties(properties)
84 | .withTopicSelector(new DefaultTopicSelector(configure.getProperty("result.topic")))
85 | .withTupleToKafkaMapper(new TupleToKafkaMapper<String, String>() {
86 | @Override
87 | public String getKeyFromTuple(Tuple tuple) {
88 | return null;
89 | }
90 |
91 | @Override
92 | public String getMessageFromTuple(Tuple tuple) {
93 |
94 | ImmutableMap<String, String> kv = (ImmutableMap<String, String>) tuple.getValue(0);
95 | return kv.keySet().iterator().next() + "," + System.currentTimeMillis();
96 |
97 | }
98 | });
99 | bolt.setFireAndForget(true);
100 | bolt.setAsync(true);
101 | return bolt;
102 | }
103 |
104 | public static void main(String[] args) throws Exception {
105 | BasicTopology basicTopology = new BasicTopology();
106 | if (args.length > 1) {
107 | if (!"--property".equals(args[1])) {
108 | System.out.println("unknow option: " + args[1]);
109 | System.out.println("usage storm jar examples-1.1-shaded.jar com.aliyun.emr.example.storm.benchmark.BasicTopology benchmark.properties --property k1=v1,k2=v2");
110 | System.exit(1);
111 | }
112 | basicTopology.init(args[0], args[2]);
113 | } else {
114 | basicTopology.init(args[0]);
115 | }
116 |
117 | basicTopology.run(true);
118 | }
119 | }
120 |
--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/KafkaHdfs.java:
--------------------------------------------------------------------------------
1 | package com.aliyun.emr.example.storm.benchmark;
2 |
3 | import org.apache.storm.hdfs.bolt.HdfsBolt;
4 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
5 | import org.apache.storm.hdfs.bolt.format.RecordFormat;
6 | import org.apache.storm.hdfs.bolt.rotation.NoRotationPolicy;
7 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
8 | import org.apache.storm.topology.IRichBolt;
9 | import org.apache.storm.topology.TopologyBuilder;
10 | import org.apache.storm.tuple.Tuple;
11 |
12 | import java.util.Map;
13 |
14 | public class KafkaHdfs extends BasicTopology{
15 |
16 | protected void setBolt(TopologyBuilder builder) {
17 | int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total"));
18 | int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number"));
19 |
20 | builder.setBolt("hdfs-bolt", getHdfsBolt(), availableCores).localOrShuffleGrouping("spout");
21 | }
22 |
23 | private IRichBolt getHdfsBolt() {
24 |
25 | String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/";
26 | HdfsBolt bolt = new HdfsBolt()
27 | .withFsUrl(configure.getProperty("url"))
28 | .withFileNameFormat(new DefaultFileNameFormat().withPrefix(filenamePrefix))
29 | .withRecordFormat(new RecordFormat() {
30 | @Override
31 | public byte[] format(Tuple tuple) {
32 | String eventTime = ((Map<String, String>) tuple.getValue(0)).keySet().iterator().next();
33 | String output = eventTime + "," + System.currentTimeMillis() + System.lineSeparator();
34 | return output.getBytes();
35 | }
36 | })
37 | .withSyncPolicy(new CountSyncPolicy(1000))
38 | .withRotationPolicy(new NoRotationPolicy());
39 | return bolt;
40 | }
41 |
42 | public static void main(String[] args) throws Exception {
43 | KafkaHdfs topology = new KafkaHdfs();
44 | if (args.length > 1) {
45 | if (!"--property".equals(args[1])) {
46 | System.out.println("unknow option: " + args[1]);
47 | System.out.println("usage storm jar examples-1.1-shaded.jar com.aliyun.emr.example.storm.benchmark.KafkaHdfs benchmark.properties --property k1=v1,k2=v2");
48 | System.exit(1);
49 | }
50 | topology.init(args[0], args[2]);
51 | } else {
52 | topology.init(args[0]);
53 | }
54 |
55 | topology.run(true);
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/TridentWordCount.java:
--------------------------------------------------------------------------------
1 | package com.aliyun.emr.example.storm.benchmark;
2 |
3 | import kafka.api.OffsetRequest;
4 | import org.apache.storm.generated.StormTopology;
5 | import org.apache.storm.hdfs.trident.HdfsState;
6 | import org.apache.storm.hdfs.trident.HdfsStateFactory;
7 | import org.apache.storm.hdfs.trident.HdfsUpdater;
8 | import org.apache.storm.hdfs.trident.format.DefaultFileNameFormat;
9 | import org.apache.storm.hdfs.trident.format.DelimitedRecordFormat;
10 | import org.apache.storm.hdfs.trident.rotation.NoRotationPolicy;
11 | import org.apache.storm.kafka.KeyValueSchemeAsMultiScheme;
12 | import org.apache.storm.kafka.StringKeyValueScheme;
13 | import org.apache.storm.kafka.ZkHosts;
14 | import org.apache.storm.kafka.trident.TransactionalTridentKafkaSpout;
15 | import org.apache.storm.kafka.trident.TridentKafkaConfig;
16 | import org.apache.storm.trident.TridentTopology;
17 | import org.apache.storm.trident.operation.BaseFunction;
18 | import org.apache.storm.trident.operation.TridentCollector;
19 | import org.apache.storm.trident.state.StateFactory;
20 | import org.apache.storm.trident.tuple.TridentTuple;
21 | import org.apache.storm.tuple.Fields;
22 | import org.apache.storm.tuple.Values;
23 |
24 | import java.util.HashMap;
25 | import java.util.Map;
26 |
27 | public class TridentWordCount extends AbstractTopology {
28 |
29 | @Override
30 | StormTopology createTopology() {
31 | int partition = Integer.valueOf(configure.getProperty("partition.number"));
32 |
33 | TridentTopology topology = new TridentTopology();
34 | TransactionalTridentKafkaSpout spout = createSpout();
35 |
36 | topology.newStream("kafka-spout", spout).name("kafka").parallelismHint(partition)
37 | .each(spout.getOutputFields(), new WordCount(), new Fields("eventTime", "finishTime")).name("word-count")
38 | .partitionPersist(createHdfsState("eventTime", "finishTime"), new Fields("eventTime", "finishTime"), new HdfsUpdater(), new Fields("eventTime", "finishTime"));
39 | return topology.build();
40 | }
41 |
42 | private TransactionalTridentKafkaSpout createSpout() {
43 | String consumerGroup = configure.getProperty("consumer.group");
44 | ZkHosts zkHost = new ZkHosts(configure.getProperty("zookeeper.address") + ":2181" + configure.getProperty("zookeeper.root"));
45 | TridentKafkaConfig config = new TridentKafkaConfig(zkHost, configure.getProperty("topic"), consumerGroup);
46 | config.socketTimeoutMs = 60 * 1000;
47 | config.ignoreZkOffsets=true;
48 | config.startOffsetTime= OffsetRequest.LatestTime();
49 | config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme());
50 | config.startOffsetTime = OffsetRequest.LatestTime();
51 | return new TransactionalTridentKafkaSpout(config);
52 | }
53 |
54 | private StateFactory createHdfsState(String... fields) {
55 | String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/";
56 |
57 | HdfsState.Options options = new HdfsState.HdfsFileOptions()
58 | .withFsUrl(configure.getProperty("url"))
59 | .withFileNameFormat(new DefaultFileNameFormat().withPath(filenamePrefix))
60 | .withRecordFormat(new DelimitedRecordFormat().withFields(new Fields(fields)))
61 | .withRotationPolicy(new NoRotationPolicy());
62 | return new HdfsStateFactory().withOptions(options);
63 | }
64 |
65 | private class WordCount extends BaseFunction {
66 | private HashMap<String, Integer> count = new HashMap<>();
67 | @Override
68 | public void execute(TridentTuple tuple, TridentCollector collector) {
69 | // for test
70 | Map<String, String> kv = (Map<String, String>) tuple.get(0);
71 | for (Map.Entry<String, String> item : kv.entrySet()) {
72 | String eventTime = item.getKey();
73 | String words = item.getValue();
74 | for (String word: words.split("\\s+")) {
75 | Integer number = count.get(word);
76 | if (number == null) {
77 | number = 0;
78 | }
79 | number++;
80 | count.put(word, number);
81 |
82 | }
83 | collector.emit(new Values(eventTime, System.currentTimeMillis()));
84 | }
85 |
86 | }
87 | }
88 |
89 | public static void main(String[] args) throws Exception {
90 | TridentWordCount wordCount = new TridentWordCount();
91 | if (args.length > 1) {
92 | if (!"--property".equals(args[1])) {
93 | System.out.println("unknow option: " + args[1]);
94 | System.out.println("usage storm jar examples-1.1-shaded.jar com.aliyun.emr.example.storm.benchmark.TridentWordCount benchmark.properties --property k1=v1,k2=v2");
95 | System.exit(1);
96 | }
97 | wordCount.init(args[0], args[2]);
98 | } else {
99 | wordCount.init(args[0]);
100 | }
101 | wordCount.run(true);
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/WindowedWordCount.java:
--------------------------------------------------------------------------------
1 | package com.aliyun.emr.example.storm.benchmark;
2 |
3 | import org.apache.storm.hdfs.bolt.HdfsBolt;
4 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
5 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
6 | import org.apache.storm.hdfs.bolt.rotation.NoRotationPolicy;
7 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
8 | import org.apache.storm.task.OutputCollector;
9 | import org.apache.storm.task.TopologyContext;
10 | import org.apache.storm.topology.OutputFieldsDeclarer;
11 | import org.apache.storm.topology.TopologyBuilder;
12 | import org.apache.storm.topology.base.BaseWindowedBolt;
13 | import org.apache.storm.tuple.Fields;
14 | import org.apache.storm.tuple.Tuple;
15 | import org.apache.storm.tuple.Values;
16 | import org.apache.storm.windowing.TupleWindow;
17 | import org.apache.storm.topology.base.BaseWindowedBolt.Count;
18 |
19 | import java.util.HashMap;
20 | import java.util.Map;
21 |
22 | public class WindowedWordCount extends BasicTopology {
23 | @Override
24 | protected void setBolt(TopologyBuilder builder) {
25 | int windowLength = Integer.valueOf(configure.getProperty("window.length"));
26 | int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total"));
27 | int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number"));
28 | int parallelism = availableCores / 2;
29 |
30 | int slidingInterval = Integer.valueOf(configure.getProperty("slide.interval"));
31 |
32 | builder.setBolt("count", new SplitCount().withWindow(new Count(windowLength), new Count(slidingInterval)), parallelism).localOrShuffleGrouping("spout");
33 |
34 | String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/";
35 | HdfsBolt bolt = new HdfsBolt()
36 | .withFsUrl(configure.getProperty("url"))
37 | .withFileNameFormat(new DefaultFileNameFormat().withPrefix(filenamePrefix))
38 | .withRecordFormat(new DelimitedRecordFormat().withFieldDelimiter(","))
39 | .withSyncPolicy(new CountSyncPolicy(1000))
40 | .withRotationPolicy(new NoRotationPolicy());
41 | builder.setBolt("hdfs-bolt", bolt, parallelism).localOrShuffleGrouping("count");
42 | }
43 |
44 | private class SplitCount extends BaseWindowedBolt {
45 | private OutputCollector collector;
46 | private Map<String, Integer> counter = new HashMap<>();
47 |
48 | @Override
49 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
50 | super.prepare(stormConf, context, collector);
51 | this.collector = collector;
52 | }
53 |
54 | @Override
55 | public void execute(TupleWindow inputWindow) {
56 | for ( Tuple tuple : inputWindow.get()) {
57 | Map<String, String> value = (Map<String, String>) tuple.getValue(0);
58 | for (Map.Entry<String, String> item : value.entrySet()) {
59 | String eventTime = item.getKey();
60 | String words = item.getValue();
61 | for (String word: words.split("\\s+")) {
62 | Integer number = counter.get(word);
63 | if (number == null) {
64 | number = 0;
65 | }
66 | number++;
67 | counter.put(word, number);
68 | }
69 | collector.emit(new Values(eventTime, System.currentTimeMillis()));
70 | }
71 | }
72 |
73 | }
74 |
75 | @Override
76 | public void declareOutputFields(OutputFieldsDeclarer declarer) {
77 | declarer.declare(new Fields("eventTime", "finishTime"));
78 | }
79 | }
80 |
81 | public static void main(String[] args) throws Exception {
82 | WindowedWordCount wordCount = new WindowedWordCount();
83 | if (args.length > 1) {
84 | if (!"--property".equals(args[1])) {
85 | System.out.println("unknow option: " + args[1]);
86 | System.out.println("usage storm jar examples-1.1-shaded.jar com.aliyun.emr.example.storm.benchmark.WindowedWordCount benchmark.properties --property k1=v1,k2=v2");
87 | System.exit(1);
88 | }
89 | wordCount.init(args[0], args[2]);
90 | } else {
91 | wordCount.init(args[0]);
92 | }
93 | wordCount.run(true);
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/WordCount.java:
--------------------------------------------------------------------------------
1 | package com.aliyun.emr.example.storm.benchmark;
2 |
3 | import org.apache.storm.hdfs.bolt.HdfsBolt;
4 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
5 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
6 | import org.apache.storm.hdfs.bolt.rotation.NoRotationPolicy;
7 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
8 | import org.apache.storm.topology.BasicOutputCollector;
9 | import org.apache.storm.topology.OutputFieldsDeclarer;
10 | import org.apache.storm.topology.TopologyBuilder;
11 | import org.apache.storm.topology.base.BaseBasicBolt;
12 | import org.apache.storm.tuple.Fields;
13 | import org.apache.storm.tuple.Tuple;
14 | import org.apache.storm.tuple.Values;
15 |
16 | import java.util.HashMap;
17 | import java.util.Map;
18 |
19 | public class WordCount extends BasicTopology {
20 | @Override
21 | protected void setBolt(TopologyBuilder builder) {
22 | int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total"));
23 | int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number"));
24 |
25 | int hdfsParallelismFactor = Integer.parseInt(configure.getProperty("hdfs.parallelism.factor"));
26 | int hdfsParallelism = availableCores * hdfsParallelismFactor / (hdfsParallelismFactor + 1);
27 | builder.setBolt("split-count", new SplitCount(), availableCores - hdfsParallelism).localOrShuffleGrouping("spout");
28 |
29 | String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/";
30 | HdfsBolt bolt = new HdfsBolt()
31 | .withFsUrl(configure.getProperty("url"))
32 | .withFileNameFormat(new DefaultFileNameFormat().withPrefix(filenamePrefix))
33 | .withRecordFormat(new DelimitedRecordFormat().withFieldDelimiter(","))
34 | .withSyncPolicy(new CountSyncPolicy(1000))
35 | .withRotationPolicy(new NoRotationPolicy());
36 | builder.setBolt("hdfs-bolt", bolt, hdfsParallelism).localOrShuffleGrouping("split-count");
37 | }
38 |
39 |
40 | private class SplitCount extends BaseBasicBolt {
41 | private Map<String, Integer> counter = new HashMap<>();
42 |
43 | @Override
44 | public void execute(Tuple input, BasicOutputCollector collector) {
45 | Map<String, String> value = (Map<String, String>) input.getValue(0);
46 | for (Map.Entry<String, String> item : value.entrySet()) {
47 | String eventTime = item.getKey();
48 | String words = item.getValue();
49 |
50 | for (String word : words.split("\\s+")) {
51 | Integer number = counter.get(word);
52 | if (number == null) {
53 | number = 0;
54 | }
55 | number++;
56 | counter.put(word, number);
57 | }
58 | collector.emit(new Values(eventTime, System.currentTimeMillis()));
59 | }
60 |
61 | }
62 |
63 | @Override
64 | public void declareOutputFields(OutputFieldsDeclarer declarer) {
65 | declarer.declare(new Fields("eventTime", "finishTime"));
66 | }
67 | }
68 |
69 | public static void main(String[] args) throws Exception {
70 | WordCount wordCount = new WordCount();
71 | if (args.length > 1) {
72 | if (!"--property".equals(args[1])) {
73 | System.out.println("unknow option: " + args[1]);
74 | System.out.println("usage storm jar examples-1.1-shaded.jar com.aliyun.emr.example.storm.benchmark.WordCount benchmark.properties --property k1=v1,k2=v2");
75 | System.exit(1);
76 | }
77 | wordCount.init(args[0], args[2]);
78 | } else {
79 | wordCount.init(args[0]);
80 | }
81 | wordCount.run(true);
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/util/Helper.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License
17 | */
18 |
19 | package com.aliyun.emr.example.storm.benchmark.util;
20 |
21 | import org.apache.storm.Config;
22 | import org.apache.storm.LocalCluster;
23 | import org.apache.storm.StormSubmitter;
24 | import org.apache.storm.generated.KillOptions;
25 | import org.apache.storm.generated.Nimbus;
26 | import org.apache.storm.generated.StormTopology;
27 | import org.apache.storm.perf.utils.BasicMetricsCollector;
28 | import org.apache.storm.utils.NimbusClient;
29 | import org.apache.storm.utils.Utils;
30 |
31 | import java.util.Map;
32 |
33 | public class Helper {
34 |
35 | public static void kill(Nimbus.Client client, String topoName) throws Exception {
36 | KillOptions opts = new KillOptions();
37 | opts.set_wait_secs(0);
38 | client.killTopologyWithOpts(topoName, opts);
39 | }
40 |
41 | public static void killAndShutdownCluster(LocalCluster cluster, String topoName) throws Exception {
42 | KillOptions opts = new KillOptions();
43 | opts.set_wait_secs(0);
44 | cluster.killTopologyWithOpts(topoName, opts);
45 | cluster.shutdown();
46 | }
47 |
48 |
49 | public static LocalCluster runOnLocalCluster(String topoName, StormTopology topology) {
50 | LocalCluster cluster = new LocalCluster();
51 | cluster.submitTopology(topoName, new Config(), topology);
52 | return cluster;
53 | }
54 |
55 | public static int getInt(Map map, Object key, int def) {
56 | return Utils.getInt(Utils.get(map, key, def));
57 | }
58 |
59 | public static String getStr(Map map, Object key) {
60 | return (String) map.get(key);
61 | }
62 |
63 | public static void collectMetrics(String topologyName, Integer pollInterval) throws Exception {
64 | Map clusterConf = Utils.readStormConfig();
65 | Nimbus.Client client = NimbusClient.getConfiguredClient(clusterConf).getClient();
66 | BasicMetricsCollector metricsCollector = new BasicMetricsCollector(client, topologyName, clusterConf);
67 |
68 | try {
69 | while (true){
70 | metricsCollector.collect(client);
71 | Thread.sleep(pollInterval * 1000);
72 | }
73 | } finally {
74 | metricsCollector.close();
75 | kill(client, topologyName);
76 | }
77 |
78 | }
79 |
80 | public static void collectMetricsAndKill(String topologyName, Integer pollInterval, Integer duration) throws Exception {
81 | Map clusterConf = Utils.readStormConfig();
82 | Nimbus.Client client = NimbusClient.getConfiguredClient(clusterConf).getClient();
83 | BasicMetricsCollector metricsCollector = new BasicMetricsCollector(client, topologyName, clusterConf);
84 |
85 | int times = duration / pollInterval;
86 | metricsCollector.collect(client);
87 | for (int i = 0; i < times; i++) {
88 | Thread.sleep(pollInterval * 1000);
89 | metricsCollector.collect(client);
90 | }
91 | metricsCollector.close();
92 | kill(client, topologyName);
93 | }
94 |
95 | public static void collectLocalMetricsAndKill(LocalCluster localCluster, String topologyName, Integer pollInterval, Integer duration, Map clusterConf) throws Exception {
96 | BasicMetricsCollector metricsCollector = new BasicMetricsCollector(localCluster, topologyName, clusterConf);
97 |
98 | int times = duration / pollInterval;
99 | metricsCollector.collect(localCluster);
100 | for (int i = 0; i < times; i++) {
101 | Thread.sleep(pollInterval * 1000);
102 | metricsCollector.collect(localCluster);
103 | }
104 | metricsCollector.close();
105 | killAndShutdownCluster(localCluster, topologyName);
106 | }
107 |
108 | /** Kill topo and Shutdown local cluster on Ctrl-C */
109 | public static void setupShutdownHook(final LocalCluster cluster, final String topoName) {
110 | Runtime.getRuntime().addShutdownHook(new Thread() {
111 | public void run() {
112 | cluster.killTopology(topoName);
113 | System.out.println("Killed Topology");
114 | cluster.shutdown();
115 | }
116 | });
117 | }
118 |
119 | /** Kill topo on Ctrl-C */
120 | public static void setupShutdownHook(final String topoName) {
121 | Map clusterConf = Utils.readStormConfig();
122 | final Nimbus.Client client = NimbusClient.getConfiguredClient(clusterConf).getClient();
123 | Runtime.getRuntime().addShutdownHook(new Thread() {
124 | public void run() {
125 | try {
126 | Helper.kill(client, topoName);
127 | System.out.println("Killed Topology");
128 | } catch (Exception e) {
129 | e.printStackTrace();
130 | }
131 | }
132 | });
133 | }
134 |
135 | public static void runOnClusterAndPrintMetrics(Integer durationSec, String topoName, Map topoConf, StormTopology topology) throws Exception {
136 | // submit topology
137 | StormSubmitter.submitTopologyWithProgressBar(topoName, topoConf, topology);
138 | setupShutdownHook(topoName); // handle Ctrl-C
139 |
140 | // poll metrics every minute, then kill topology after specified duration
141 | Integer pollIntervalSec = 60;
142 | collectMetricsAndKill(topoName, pollIntervalSec, durationSec);
143 | }
144 | }
145 |
--------------------------------------------------------------------------------
/src/main/pig/sample.pig:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | -- Query Phrase Popularity (Hadoop cluster)
20 |
21 | -- This script processes a search query log file from the Excite search engine and finds search phrases that occur with particularly high frequency during certain times of the day.
22 |
23 |
24 | -- Register the tutorial JAR file so that the included UDFs can be called in the script.
25 | REGISTER $tutorial;
26 |
27 | -- Use the PigStorage function to load the excite log file into the 'raw' bag as an array of records.
28 | -- Input: (user,time,query)
29 | raw = LOAD '$input' USING PigStorage('\t') AS (user, time, query);
30 |
31 |
32 | -- Call the NonURLDetector UDF to remove records if the query field is empty or a URL.
33 | clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
34 |
35 | -- Call the ToLower UDF to change the query field to lowercase.
36 | clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
37 |
38 | -- Because the log file only contains queries for a single day, we are only interested in the hour.
39 | -- The excite query log timestamp format is YYMMDDHHMMSS.
40 | -- Call the ExtractHour UDF to extract the hour (HH) from the time field.
41 | houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
42 |
43 | -- Call the NGramGenerator UDF to compose the n-grams of the query.
44 | ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
45 |
46 | -- Use the DISTINCT command to get the unique n-grams for all records.
47 | ngramed2 = DISTINCT ngramed1;
48 |
49 | -- Use the GROUP command to group records by n-gram and hour.
50 | hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
51 |
52 | -- Use the COUNT function to get the count (occurrences) of each n-gram.
53 | hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
54 |
55 | -- Use the GROUP command to group records by n-gram only.
56 | -- Each group now corresponds to a distinct n-gram and has the count for each hour.
57 | uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
58 |
59 | -- For each group, identify the hour in which this n-gram is used with a particularly high frequency.
60 | -- Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram.
61 | uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1));
62 |
63 | -- Use the FOREACH-GENERATE command to assign names to the fields.
64 | uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean;
65 |
66 | -- Use the FILTER command to remove all records with a score less than or equal to 2.0.
67 | filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0;
68 |
69 | -- Use the ORDER command to sort the remaining records by hour and score.
70 | ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score;
71 |
72 | -- Use the PigStorage function to store the results.
73 | -- Output: (hour, n-gram, score, count, average_counts_among_all_hours)
74 | STORE ordered_uniq_frequency INTO '$results' USING PigStorage();
--------------------------------------------------------------------------------
/src/main/python/deeplearning/tf_fm_on_spark.py:
--------------------------------------------------------------------------------
1 | from pylearning.model.tensorflow_base import tensorflow_base
2 | from pyspark.sql import SparkSession
3 | from pyspark import SparkContext
4 | import os
5 | import random
6 | import numpy as np
7 |
8 | from pyspark.sql.functions import col
9 | import tensorflow as tf
10 |
11 | class tf_fm(tensorflow_base):
12 |
13 | @staticmethod
14 | def pre_train(env):
15 | spark_context = SparkContext.getOrCreate()
16 | spark = SparkSession(spark_context).builder.getOrCreate()
17 | rating_df = spark.read.format('csv').option('header', 'True').load('/moviedata/ratings.csv')
18 | movie_df = spark.read.format('csv').option('header', 'True').load('/moviedata/movies.csv')
19 |
20 | # process user first
21 | distinct_user_df = rating_df.select('userId').distinct()
22 | users_number = distinct_user_df.count()
23 | env.get("algo")["users_number"] = str(users_number)
24 |
25 | users_row = distinct_user_df.collect()
26 | users = []
27 | users_dict = []
28 | users_map = {}
29 | for user in users_row:
30 | users.append(user['userId'])
31 | sorted_users = sorted(users)
32 | for user in sorted_users:
33 | users_dict.append((user,len(users_dict)))
34 | users_map[user] = len(users_map)
35 |
36 | # It is used in later processing to get the sorted user ids.
37 | columns = ["userid","id"]
38 | users_sort_df = spark.createDataFrame(users_dict,columns)
39 | # users_sort_df.write.format("csv").save("/moviedata/sortedusers")
40 |
41 | # process genres
42 | genres_row = movie_df.select("genres").distinct().collect()
43 | genres_set = set()
44 | genres_map = {}
45 | for genres in genres_row:
46 | for one_genre in genres['genres'].split('|'):
47 | genres_set.add(one_genre)
48 | for genre in genres_set:
49 | genres_map[genre] = len(genres_map)
50 |
51 | # join two dataframe and process later, userid(bigint) genres(string, need split), rating(float)
52 | joined_df = rating_df.join(movie_df, rating_df.movieId == movie_df.movieId)
53 | joined_df = joined_df.select(col('userId'),col('genres'),col('rating').cast('float').alias('rating'))
54 |
55 | users_map_bc = spark_context.broadcast(users_map)
56 | genres_map_bc = spark_context.broadcast(genres_map)
57 | env.get("algo")["genres_number"] = str(len(genres_map))
58 |
59 | def process_row(row):
60 | userId = row.userId
61 | genres = row.genres
62 | users_map_rdd = users_map_bc.value
63 | genres_map_rdd = genres_map_bc.value
64 | genres_return_list = []
65 | for i in genres.split("|"):
66 | genres_return_list.append(str(genres_map_rdd[i]))
67 | return (users_map_rdd[userId], "|".join(genres_return_list), row.rating)
68 |
69 | return joined_df.rdd.map(process_row).toDF(['userId','genres','rating'])
70 |
71 | @staticmethod
72 | def train(dataframe, env):
73 | environ = os.environ
74 | ps_hosts = environ.get("ps_hosts").split(",")
75 | worker_hosts = environ.get("worker_hosts").split(",")
76 | job_name = environ.get("job_name")
77 | task_index = int(environ.get("task_index"))
78 |
79 | cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
80 | server = tf.train.Server(cluster,
81 | job_name= job_name,
82 | task_index=task_index)
83 |
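        # Parameter-server tasks only host the model variables (server.join() never
        # returns); the training graph below is built and run by the worker tasks.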
84 | if job_name == "ps":
85 | server.join()
86 | else :
87 |             # batch size is 2000; the parameter vector concatenates the user embedding with the one-hot genres
88 |             # embedding size is 128, one-hot size is 20 (we can obtain it from env)
89 | batch_size = 2000
90 |
91 | embedding_size = 128
92 | genres_size = int(env.get("algo")["genres_number"])
93 | users_size = int(env.get("algo")["users_number"])
94 | p_size = embedding_size + genres_size
95 | k = 10
96 | embeddings = tf.Variable(tf.random_uniform([users_size,embedding_size], -1.0, 1.0))
97 | USER = tf.placeholder('int64',shape=[batch_size,1])
98 | ITEM = tf.placeholder('float', shape=[batch_size, genres_size])
99 | embed = tf.nn.embedding_lookup(embeddings, USER)
100 | user_embed = tf.reshape(embed, shape=[batch_size, embedding_size])
101 | X = tf.concat([user_embed, ITEM], 1)
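            # X concatenates the dense user embedding with the multi-hot genre vector,
            # so each example carries p_size = embedding_size + genres_size features.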
102 | Y = tf.placeholder('float', shape=[batch_size,1])
103 |
104 | w0 = tf.Variable(tf.zeros([1]))
105 | W = tf.Variable(tf.zeros([p_size]))
106 |
107 | V = tf.Variable(tf.random_normal([k, p_size], stddev=0.01))
108 | y_hat = tf.Variable(tf.zeros([batch_size, 1]))
109 |
110 | linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, X), 1, keep_dims=True))
111 | interactions = (tf.multiply(0.5, tf.reduce_sum(
112 | tf.subtract(tf.pow(tf.matmul(X, tf.transpose(V)), 2),
113 | tf.matmul(tf.pow(X, 2), tf.transpose(tf.pow(V, 2)))), 1,
114 | keep_dims=True)))
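            # The interaction term relies on the standard FM identity
            #   sum_{i<j} <v_i, v_j> x_i x_j
            #     = 0.5 * sum_f ((sum_i v_{i,f} x_i)^2 - sum_i (v_{i,f} x_i)^2),
            # which the matmul/pow expressions above evaluate for each row of X.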
115 |
116 | y_hat = tf.add(linear_terms, interactions)
117 | lambda_w = tf.constant(0.001, name='lambda_w')
118 | lambda_v = tf.constant(0.001, name='lambda_v')
119 |
120 | l2_norm = (tf.reduce_sum(
121 | tf.add(
122 | tf.multiply(lambda_w, tf.pow(W, 2)),
123 | tf.multiply(lambda_v, tf.pow(V, 2)))))
124 |
125 | error = tf.reduce_mean(tf.square(tf.subtract(Y, y_hat)))
126 |
127 | loss = tf.add(error, l2_norm)
128 |
129 | N_EPOCHS = 100
130 | eta = tf.constant(0.1)
131 | global_step = tf.contrib.framework.get_or_create_global_step()
132 | optimizer = tf.train.AdagradOptimizer(eta).minimize(loss, global_step=global_step)
133 |
134 | init = tf.global_variables_initializer()
135 |
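            # Draw a random mini-batch: user ids index into the embedding table, genre ids
            # become a multi-hot vector, and the ratings are used as regression labels.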
136 | def get_train_data():
137 | users_sub, genres_sub, rating_sub = \
138 | zip(*random.sample(list(zip(dataframe.userId, dataframe.genres, dataframe.rating)), batch_size))
139 | batch_user = np.zeros(shape=(batch_size,1), dtype=np.int64)
140 | batch_genre = np.zeros(shape=(batch_size,genres_size), dtype=np.float32)
141 | label = np.ndarray(shape=(batch_size,1), dtype = np.float32)
142 | for i in range(batch_size):
143 | batch_user[i] = users_sub[i]
144 | for genre in genres_sub[i].split("|"):
145 | batch_genre[i][int(genre)] = 1
146 | label[i] = rating_sub[i]
147 | return batch_user, batch_genre, label
148 |
149 | checkpoint_dir = "hdfs://emr-header-1:9000/movie"
150 | saver = tf.train.Saver()
151 | epoch = 0
152 |
153 | with tf.train.MonitoredTrainingSession(master = server.target,
154 | is_chief = task_index == 0,
155 | checkpoint_dir= checkpoint_dir,
156 | save_checkpoint_secs=20) as sess:
157 | tf.reset_default_graph()
158 | sess.run(init)
159 | latest_path = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
160 |                 if latest_path is not None: saver.restore(sess, latest_path)
161 | while epoch < N_EPOCHS:
162 | (batch_user,batch_genre,label) = get_train_data()
163 | sess.run(optimizer, feed_dict={USER: batch_user, ITEM: batch_genre, Y:label})
164 | print(sess.run(error, feed_dict={USER: batch_user, ITEM: batch_genre, Y: label}))
165 | epoch = epoch + 1
166 |
--------------------------------------------------------------------------------
/src/main/python/deeplearning/train_boston.py:
--------------------------------------------------------------------------------
1 | from pylearning.model.tensorflow_base import tensorflow_base
2 | from pyspark.sql import SparkSession
3 | from pyspark import SparkContext
4 |
5 | import tensorflow as tf
6 | from pyspark.sql.functions import col
7 |
8 | class train_boston(tensorflow_base):
9 | @staticmethod
10 | def pre_train():
11 | spark_context = SparkContext.getOrCreate()
12 | spark = SparkSession(spark_context).builder.getOrCreate()
13 | df = spark.read.format('csv').option("header","True").load('/train.csv')
14 | cast_df = df.select(*(col(c).cast("double").alias(c) for c in df.columns))
15 | return cast_df
16 |
17 | @staticmethod
18 | def train(dataframe, env):
19 | crim = tf.feature_column.numeric_column('crim', dtype=tf.float64, shape=())
20 | zn = tf.feature_column.numeric_column('zn', dtype=tf.float64, shape=())
21 | indus = tf.feature_column.numeric_column('indus', dtype=tf.float64, shape=())
22 | chas = tf.feature_column.numeric_column('chas', dtype=tf.int64, shape=())
23 | nox = tf.feature_column.numeric_column('nox', dtype=tf.float64, shape=())
24 | rm = tf.feature_column.numeric_column('rm', dtype=tf.float64, shape=())
25 | age = tf.feature_column.numeric_column('age', dtype=tf.float64, shape=())
26 | dis = tf.feature_column.numeric_column('dis', dtype=tf.float64, shape=())
27 | rad = tf.feature_column.numeric_column('rad', dtype=tf.int64, shape=())
28 | tax = tf.feature_column.numeric_column('tax', dtype=tf.int64, shape=())
29 | ptratio = tf.feature_column.numeric_column('ptratio', dtype=tf.float64, shape=())
30 | black = tf.feature_column.numeric_column('black', dtype=tf.float64, shape=())
31 | lstat = tf.feature_column.numeric_column('lstat', dtype=tf.float64, shape=())
32 |
33 | feature_cols = [crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat]
34 | feature_names = ['ID','crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black',
35 | 'lstat']
36 | label_name = 'medv'
37 |
38 | dict = {}
39 |
40 | index = 0
41 | for i in feature_names:
42 | dict[i] = index
43 | index+=1
44 |
45 | def train_input():
46 | feature_dict = {}
47 | for i in feature_names[1:]:
48 | feature_dict[i] = dataframe.get(i)
49 |
50 | _dataset = tf.data.Dataset.from_tensor_slices((feature_dict, dataframe.get(label_name)))
51 | dataset = _dataset.batch(32)
52 | return dataset
53 |
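        # ParameterServerStrategy keeps the model variables on the ps tasks and runs the
        # per-batch computation on every worker; the same input_fn is reused for evaluation.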
54 | ps = tf.contrib.distribute.ParameterServerStrategy()
55 | config = tf.estimator.RunConfig(train_distribute=ps, eval_distribute=ps)
56 | estimator = tf.estimator.LinearRegressor(feature_columns=feature_cols, model_dir='hdfs://emr-header-1:9000/boston', config=config)
57 |
58 | train_spec = tf.estimator.TrainSpec(input_fn=train_input, max_steps=100)
59 | eval_spec = tf.estimator.EvalSpec(input_fn=train_input, start_delay_secs=0, throttle_secs=10,steps=10)
60 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
61 |
62 |
--------------------------------------------------------------------------------
/src/main/python/odps-sample.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | import sys
19 |
20 | from odps import OdpsOps
21 | from pyspark import SparkContext
22 |
23 | if __name__ == "__main__":
24 |
25 | if len(sys.argv) != 7:
26 | print >> sys.stderr, "Usage: spark-submit odps-sample.py accessKeyId accessKeySecret project table " \
27 | "partition numPartitions"
28 | exit(-1)
29 |
30 | accessKeyId = sys.argv[1]
31 | accessKeySecret = sys.argv[2]
32 | odpsUrl = "http://odps-ext.aliyun-inc.com/api"
33 | tunnelUrl = "http://dt-ext.odps.aliyun-inc.com"
34 | project = sys.argv[3]
35 | table = sys.argv[4]
36 | partition = sys.argv[5]
37 | numPartitions = sys.argv[6]
38 |
39 | sc = SparkContext(appName="PySpark Odps Sample")
40 |
41 | odpsOps = OdpsOps(sc, accessKeyId, accessKeySecret, odpsUrl, tunnelUrl)
42 |
43 |     print "pSchema"
44 | pSchema = odpsOps.getTableSchema(project, table, True)
45 | for col in pSchema:
46 | print col
47 |
48 |     print "schema"
49 | schema = odpsOps.getTableSchema(project, table, False)
50 | for col in schema:
51 | print col
52 |
53 | print "ColumnByIdx"
54 |     col1 = odpsOps.getColumnByIdx(project, table, 1)
55 | print col1
56 |
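    # Write two sample rows into the given partition; judging by the flags, the partition
    # is created if missing (isCreatePt=True) and existing data is kept (isOverWrite=False).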
57 | data = sc.parallelize([[1, 1.5, False, "2014-06-11", "row 1"],
58 | [2, 1.5, True, "2014-06-10", "row 2"]], 2)
59 | odpsOps.saveToPartitionTable(project, table, partition, data, isCreatePt=True, isOverWrite=False)
60 |
61 | nump = int(numPartitions)
62 | rdd = odpsOps.readPartitionTable(project, table, partition, nump, batchSize=1)
63 | rows = rdd.collect()
64 | for row in rows:
65 | print "row: ",
66 | for col in row:
67 | print col, type(col),
68 | print ""
69 |
70 | print "read specific columns"
71 | rdd2 = odpsOps.readPartitionTable(project, table, partition, nump, cols=[1, 2])
72 | rows2 = rdd2.collect()
73 | for row in rows2:
74 | print "row: ",
75 | for col in row:
76 | print col, type(col),
77 | print ""
78 |
--------------------------------------------------------------------------------
/src/main/python/streaming/loghub-wordcount.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | import sys
19 |
20 | from pyspark import SparkContext
21 | from pyspark.streaming import StreamingContext
22 | from loghub import LoghubUtils
23 |
24 | if __name__ == "__main__":
25 | if len(sys.argv) != 8:
26 | print >> sys.stderr, "Usage: spark-submit loghub-wordcount.py logServiceProject logsStoreName " \
27 | "logHubConsumerGroupName loghubEndpoint numReceiver accessKeyId accessKeySecret"
28 | exit(-1)
29 |
30 | sc = SparkContext(appName="PythonStreamingLoghubWordCount")
31 | ssc = StreamingContext(sc, 2)
32 |
33 | logServiceProject = sys.argv[1]
34 | logsStoreName = sys.argv[2]
35 | logHubConsumerGroupName = sys.argv[3]
36 | loghubEndpoint = sys.argv[4]
37 | numReceiver = int(sys.argv[5])
38 | accessKeyId = sys.argv[6]
39 | accessKeySecret = sys.argv[7]
40 |
41 | stream = LoghubUtils.createStreams(ssc, logServiceProject, logsStoreName, logHubConsumerGroupName, loghubEndpoint,
42 | numReceiver, accessKeyId, accessKeySecret)
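    # Each stream element is assumed to be a (key, log body) pair, so x[1] keeps just
    # the text that is split into words below.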
43 | lines = stream.map(lambda x: x[1])
44 | counts = lines.flatMap(lambda line: line.split(" ")) \
45 | .map(lambda word: (word, 1)) \
46 | .reduceByKey(lambda a, b: a+b)
47 | counts.pprint()
48 |
49 | ssc.start()
50 | ssc.awaitTermination()
51 |
--------------------------------------------------------------------------------
/src/main/python/streaming/wcmapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 |
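# Emit one "<word>\t1" pair per input token; Hadoop Streaming shuffles and sorts these
# pairs by key before they reach the reducer.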
5 | for line in sys.stdin:
6 | line = line.strip()
7 | words = line.split()
8 | for word in words:
9 | print '%s\t%s' % (word, 1)
10 |
11 |
--------------------------------------------------------------------------------
/src/main/python/streaming/wcreducer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from operator import itemgetter
4 | import sys
5 |
6 | current_word = None
7 | current_count = 0
8 | word = None
9 |
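# Hadoop Streaming sorts the mapper output by key, so all counts for a given word arrive
# on consecutive lines and can be summed with a single running counter.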
10 | for line in sys.stdin:
11 | line = line.strip()
12 |
13 | word, count = line.split('\t', 1)
14 |
15 | try:
16 | count = int(count)
17 | except ValueError:
18 | continue
19 |
20 | if current_word == word:
21 | current_count += count
22 | else:
23 | if current_word:
24 | print '%s\t%s' % (current_word, current_count)
25 | current_count = count
26 | current_word = word
27 |
28 | if current_word == word:
29 | print '%s\t%s' % (current_word, current_count)
30 |
--------------------------------------------------------------------------------
/src/main/python/wordcount.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | import sys
19 | from operator import add
20 | from pyspark import SparkContext
21 | from pyspark import SparkConf
22 |
23 | if __name__ == "__main__":
24 | conf = SparkConf()
25 | sc = SparkContext(appName="PythonWordCount", conf=conf)
26 | lines = sc.textFile(sys.argv[1], int(sys.argv[3]))
27 | counts = lines.flatMap(lambda x: x.split(' ')) \
28 | .map(lambda x: (str(x), 1)) \
29 | .reduceByKey(add)
30 | counts.saveAsTextFile(sys.argv[2])
31 | sc.stop()
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/flink/FlinkOSSSample.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.flink
19 |
20 | import org.apache.flink.api.java.ExecutionEnvironment
21 | import org.apache.flink.api.java.utils.ParameterTool
22 |
23 | import scala.collection.JavaConversions._
24 |
25 | object FlinkOSSSample {
26 | def main(args: Array[String]) {
27 |
28 | val params: ParameterTool = ParameterTool.fromArgs(args)
29 |
30 | // set up execution environment
31 | val env = ExecutionEnvironment.getExecutionEnvironment
32 |
33 | // make parameters available in the web interface
34 | env.getConfig.setGlobalJobParameters(params)
35 |
36 | if (!params.has("input")) {
37 |       println("No input file specified.")
38 |       println("Use --input to specify the file to read.")
39 | sys.exit(1)
40 | }
41 | val text = env.readTextFile(params.get("input"))
42 |
43 | val top10 = text.first(10)
44 |
45 | top10.collect().foreach(println)
46 |
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/AbstractParams.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark
19 |
20 | import scala.reflect.runtime.universe._
21 |
22 | /**
23 | * Abstract class for parameter case classes.
24 | * This overrides the [[toString]] method to print all case class fields by name and value.
25 | * @tparam T Concrete parameter class.
26 | */
27 | abstract class AbstractParams[T: TypeTag] {
28 |
29 | private def tag: TypeTag[T] = typeTag[T]
30 |
31 | /**
32 | * Finds all case class fields in concrete class instance, and outputs them in JSON-style format:
33 | * {
34 | * [field name]:\t[field value]\n
35 | * [field name]:\t[field value]\n
36 | * ...
37 | * }
38 | */
39 | override def toString: String = {
40 | val tpe = tag.tpe
41 | val allAccessors = tpe.declarations.collect {
42 | case m: MethodSymbol if m.isCaseAccessor => m
43 | }
44 | val mirror = runtimeMirror(getClass.getClassLoader)
45 | val instanceMirror = mirror.reflect(this)
46 | allAccessors.map { f =>
47 | val paramName = f.name.toString
48 | val fieldMirror = instanceMirror.reflectField(f)
49 | val paramValue = fieldMirror.get
50 | s" $paramName:\t$paramValue"
51 | }.mkString("{\n", ",\n", "\n}")
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/LinearRegression.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark
19 |
20 | import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater}
21 | import org.apache.spark.mllib.regression.LinearRegressionWithSGD
22 | import org.apache.spark.mllib.util.MLUtils
23 | import _root_.scopt.OptionParser
24 |
25 | object LinearRegression extends RunLocally{
26 | object RegType extends Enumeration {
27 | type RegType = Value
28 | val NONE, L1, L2 = Value
29 | }
30 |
31 | import RegType._
32 |
33 | case class Params(
34 | input: String = null,
35 | numPartitions: Int = 2,
36 | numIterations: Int = 100,
37 | stepSize: Double = 1.0,
38 | regType: RegType = L2,
39 | regParam: Double = 0.01,
40 | accessKeyId: String = null,
41 | accessKeySecret: String = null,
42 | endpoint: String = null) extends AbstractParams[Params]
43 |
44 | def main(args: Array[String]) {
45 | val defaultParams = Params()
46 |
47 | val parser = new OptionParser[Params]("LinearRegression") {
48 | head("LinearRegression: an example app for linear regression.")
49 | opt[Int]("numIterations")
50 | .text("number of iterations")
51 | .action((x, c) => c.copy(numIterations = x))
52 | opt[Double]("stepSize")
53 | .text(s"initial step size, default: ${defaultParams.stepSize}")
54 | .action((x, c) => c.copy(stepSize = x))
55 | opt[String]("regType")
56 | .text(s"regularization type (${RegType.values.mkString(",")}), " +
57 | s"default: ${defaultParams.regType}")
58 | .action((x, c) => c.copy(regType = RegType.withName(x)))
59 |       opt[Double]("regParam")
60 |         .text(s"regularization parameter, default: ${defaultParams.regParam}").action((x, c) => c.copy(regParam = x))
61 |       arg[String]("<input>")
62 | .required()
63 | .text("input paths to labeled examples in LIBSVM format")
64 | .action((x, c) => c.copy(input = x))
65 |       arg[Int]("<numPartitions>")
66 | .required()
67 | .text(s"number of partitions, default: ${defaultParams.numPartitions}")
68 | .action((x, c) => c.copy(numPartitions = x))
69 | note(
70 | """
71 | | For example, the following command runs this app on a synthetic dataset:
72 | |
73 | | bin/spark-submit --class LinearRegression examples-1.0-SNAPSHOT-shaded.jar oss://accessKeyId:accessKeySecret@bucket.endpoint/input.txt 2
74 | """.stripMargin)
75 | }
76 |
77 | parser.parse(args, defaultParams).map { params =>
78 | run(params)
79 | } getOrElse {
80 | sys.exit(1)
81 | }
82 | }
83 |
84 | def run(params: Params) {
85 | val examples = MLUtils.loadLibSVMFile(getSparkContext, params.input).cache()
86 | val splits = examples.randomSplit(Array(0.8, 0.2))
87 | val training = splits(0).cache()
88 | val test = splits(1).cache()
89 |
90 | val numTraining = training.count()
91 | val numTest = test.count()
92 | println(s"Training: $numTraining, test: $numTest.")
93 |
94 | examples.unpersist(blocking = false)
95 |
96 | val updater = params.regType match {
97 | case NONE => new SimpleUpdater()
98 | case L1 => new L1Updater()
99 | case L2 => new SquaredL2Updater()
100 | }
101 |
102 | val algorithm = new LinearRegressionWithSGD()
103 | algorithm.optimizer
104 | .setNumIterations(params.numIterations)
105 | .setStepSize(params.stepSize)
106 | .setUpdater(updater)
107 | .setRegParam(params.regParam)
108 |
109 | val model = algorithm.run(training)
110 |
111 | val prediction = model.predict(test.map(_.features))
112 | val predictionAndLabel = prediction.zip(test.map(_.label))
113 |
114 | val loss = predictionAndLabel.map { case (p, l) =>
115 | val err = p - l
116 | err * err
117 | }.reduce(_ + _)
118 | val rmse = math.sqrt(loss / numTest)
119 |
120 | println(s"Test RMSE = $rmse.")
121 |
122 | getSparkContext.stop()
123 | }
124 |
125 | override def getAppName: String = "LinearRegression"
126 | }
127 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/MongoDBWordCount.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark
19 |
20 | import com.stratio.datasource.mongodb._
21 | import com.stratio.datasource.mongodb.config._
22 | import com.stratio.datasource.mongodb.config.MongodbConfig._
23 |
24 | import org.apache.spark.sql._
25 | import org.apache.spark.sql.SQLContext
26 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
27 |
28 | object MongoDBWordCount extends RunLocally {
29 | def main(args: Array[String]): Unit = {
30 | if (args.length < 12) {
31 | System.err.println(
32 | """Usage: bin/spark-submit --class MongoDBWordCount examples-1.0-SNAPSHOT-shaded.jar
33 |           |<dbName> <dbUrl> <dbPort> <userName> <pwd> <collectionName>
34 |           |<sampleRatio> <writeConcern> <splitSize> <splitKey> <inputPath> <numPartitions>
35 | |
36 | |Arguments:
37 | |
38 | | dbName MongoDB database name.
39 | | dbUrl MongoDB database URL.
40 | | dbPort MongoDB database port.
41 | | userName MongoDB database user name.
42 | | pwd mongoDB database password.
43 | | collectionName MongoDB collection name.
44 | | sampleRatio MongoDB sample ratio.
45 | | writeConcern MongoDB write concern.
46 | | splitSize MongoDB split size.
47 | | splitKey MongoDB split key.
48 | | inputPath OSS input object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/a/b.txt
49 | | numPartitions RDD partition number.
50 | |
51 | """.stripMargin)
52 | System.exit(1)
53 | }
54 |
55 | val dbName = args(0)
56 | val dbUrl = args(1)
57 | val dbPort = args(2)
58 | val userName = args(3)
59 | val pwd = args(4)
60 | val collectionName = args(5)
61 | val sampleRatio = args(6).toFloat
62 | val writeConcern = args(7)
63 | val splitSize = args(8).toInt
64 | val splitKey = args(9)
65 | val inputPath = args(10)
66 | val numPartitions = args(11).toInt
67 |
68 | val sqlContext = new SQLContext(getSparkContext)
69 |
70 | val input = getSparkContext.textFile(inputPath, numPartitions)
71 | val counts = input.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _).map(e => Row.apply(e._1, e._2))
72 | lazy val schema = StructType(
73 | StructField("word", StringType) ::
74 | StructField("count", IntegerType) :: Nil)
75 |
76 | val hosts = dbUrl.split(",").map(e => s"$e:$dbPort").toList
77 | val df = sqlContext.createDataFrame(counts, schema)
78 | val saveConfig = MongodbConfigBuilder(Map(Host -> hosts, Database -> dbName,
79 | Collection -> collectionName, SamplingRatio -> sampleRatio, WriteConcern -> writeConcern,
80 | SplitSize -> splitSize, SplitKey -> splitKey,
81 | Credentials -> List(com.stratio.datasource.mongodb.config.MongodbCredentials(userName, dbName, pwd.toCharArray))))
82 | df.saveToMongodb(saveConfig.build())
83 | }
84 |
85 | override def getAppName: String = "MongoDBWordCount"
86 | }
87 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/RunLocally.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark
19 |
20 | import org.apache.spark.{SparkConf, SparkContext}
21 |
22 | trait RunLocally {
23 |
24 | def getAppName: String
25 |
26 | def getSparkConf: SparkConf = new SparkConf()
27 |
28 |   lazy val getSparkContext: SparkContext = { // built once so repeated calls reuse the same SparkContext
29 | val conf = getSparkConf.setAppName(getAppName).setMaster("local[4]")
30 | conf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem")
31 | conf.set("spark.hadoop.mapreduce.job.run-local", "true")
32 | new SparkContext(conf)
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/SparkMaxComputeDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark
19 |
20 | import com.aliyun.odps.TableSchema
21 | import com.aliyun.odps.data.Record
22 |
23 | import org.apache.spark.aliyun.odps.OdpsOps
24 | import org.apache.spark.{SparkConf, SparkContext}
25 |
26 | object SparkMaxComputeDemo {
27 | def main(args: Array[String]): Unit = {
28 | if (args.length < 6) {
29 | System.err.println(
30 | """Usage: SparkMaxComputeDemo
31 |           |<accessKeyId> <accessKeySecret> <envType> <project> <table> <numPartitions>
32 | |Arguments:
33 | |
34 | | accessKeyId Aliyun Access Key ID.
35 | | accessKeySecret Aliyun Key Secret.
36 | | envType 0 or 1
37 | | 0: Public environment.
38 | | 1: Aliyun internal environment, i.e. Aliyun ECS etc.
39 | | project Aliyun ODPS project
40 | | table Aliyun ODPS table
41 | | numPartitions the number of RDD partitions
42 | """.stripMargin)
43 | System.exit(1)
44 | }
45 |
46 | val accessKeyId = args(0)
47 | val accessKeySecret = args(1)
48 | val envType = args(2).toInt
49 | val project = args(3)
50 | val table = args(4)
51 | val numPartitions = args(5).toInt
52 |
53 | val urls = Seq(
54 | Seq("http://service.odps.aliyun.com/api", "http://dt.odps.aliyun.com"), // public environment
55 | Seq("http://odps-ext.aliyun-inc.com/api", "http://dt-ext.odps.aliyun-inc.com") // Aliyun internal environment
56 | )
57 |
58 | val conf = new SparkConf().setAppName("E-MapReduce Demo 3-1: Spark MaxCompute Demo (Scala)")
59 | val sc = new SparkContext(conf)
60 | val odpsOps = envType match {
61 | case 0 =>
62 | OdpsOps(sc, accessKeyId, accessKeySecret, urls(0)(0), urls(0)(1))
63 | case 1 =>
64 | OdpsOps(sc, accessKeyId, accessKeySecret, urls(1)(0), urls(1)(1))
65 | }
66 |
67 | val odpsData = odpsOps.readTable(project, table, read, numPartitions)
68 |
69 | println(s"Count (odpsData): ${odpsData.count()}")
70 | }
71 |
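  // Row-mapping callback passed to readTable above: every ODPS record is reduced to the
  // value of its first column, read as a BIGINT.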
72 | def read(record: Record, schema: TableSchema): Long = {
73 | record.getBigint(0)
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/SparkOssDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark
19 |
20 | import org.apache.hadoop.io.{LongWritable, Text}
21 | import org.apache.hadoop.mapred.TextInputFormat
22 | import org.apache.spark.SparkConf
23 |
24 | object SparkOssDemo extends RunLocally {
25 | var accessKeyId = ""
26 | var accessKeySecret = ""
27 | var endpoint = ""
28 |
29 | def main(args: Array[String]): Unit = {
30 |     if (args.length < 6) {
31 | System.err.println(
32 | """Usage: bin/spark-submit --class com.aliyun.emr.example.spark.SparkOssDemo examples-1.0-SNAPSHOT-shaded.jar
33 |           |<accessKeyId> <accessKeySecret> <endpoint> <inputPath> <outputPath> <numPartitions>
34 | |Arguments:
35 | |
36 | | accessKeyId OSS accessKeyId
37 | | accessKeySecret OSS accessKeySecret
38 | | endpoint OSS endpoint
39 | | inputPath Input OSS object path, like oss://bucket/input/a.txt
40 | | outputPath Output OSS object path, like oss://bucket/output/
41 | | numPartitions the number of RDD partitions.
42 | |
43 | """.stripMargin)
44 | System.exit(1)
45 | }
46 |
47 | accessKeyId = args(0)
48 | accessKeySecret = args(1)
49 | endpoint = args(2)
50 | val inputPath = args(3)
51 | val outputPath = args(4)
52 | val numPartitions = args(5).toInt
53 | val ossData = getSparkContext.hadoopFile(inputPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], numPartitions)
54 | ossData.foreach(line => println(s"print: ${line}"))
55 |
56 | ossData.saveAsTextFile(outputPath)
57 | }
58 |
59 | override def getAppName: String = "E-MapReduce Demo 2-1: Spark Oss Demo (Scala)"
60 |
61 | override def getSparkConf: SparkConf = {
62 | val conf = new SparkConf()
63 | conf.set("spark.hadoop.fs.oss.accessKeyId", accessKeyId)
64 | conf.set("spark.hadoop.fs.oss.accessKeySecret", accessKeySecret)
65 | conf.set("spark.hadoop.fs.oss.endpoint", endpoint)
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/SparkPi.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark
19 |
20 | import scala.math._
21 |
22 | /** Computes an approximation to pi */
23 | object SparkPi extends RunLocally{
24 | def main(args: Array[String]) {
25 | val slices = if (args.length > 0) args(0).toInt else 2
26 | val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow
27 | val count = getSparkContext.parallelize(1 until n, slices).map { i =>
28 | val x = random * 2 - 1
29 | val y = random * 2 - 1
30 | if (x*x + y*y < 1) 1 else 0
31 | }.reduce(_ + _)
32 | println("Pi is roughly " + 4.0 * count / n)
33 | getSparkContext.stop()
34 | }
35 |
36 | override def getAppName: String = "SparkPi"
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/SparkRdsDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark
19 |
20 | import java.sql.{Connection, DriverManager, PreparedStatement}
21 |
22 | object SparkRdsDemo extends RunLocally {
23 | def main(args: Array[String]): Unit = {
24 | if (args.length < 8) {
25 | System.err.println(
26 | """Usage: spark-submit --class SparkRdsDemo examples-1.0-SNAPSHOT-shaded.jar
27 |           |<dbName> <tbName> <dbUser> <dbPwd> <dbUrl> <dbPort> <inputPath> <numPartitions>
28 | |
29 | |Arguments:
30 | |
31 | | dbName RDS database name.
32 | | tbName RDS table name.
33 | | dbUser RDS database user name.
34 | | dbPwd RDS database password.
35 | | dbUrl RDS database URL.
36 | | dbPort RDS database port
37 | | inputPath OSS input object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/a/b.txt
38 | | numPartitions
39 | |
40 | """.stripMargin)
41 | System.exit(1)
42 | }
43 | val dbName = args(0)
44 | val tbName = args(1)
45 | val dbUser = args(2)
46 | val dbPwd = args(3)
47 | val dbUrl = args(4)
48 | val dbPort = args(5)
49 | val inputPath = args(6)
50 | val numPartitions = args(7).toInt
51 |
52 | val input = getSparkContext.textFile(inputPath, numPartitions)
53 | input.collect().foreach(println)
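    // Write each (word, count) pair to RDS inside mapPartitions so that a single JDBC
    // connection is opened per partition; the trailing count() only forces execution.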
54 | input.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
55 | .mapPartitions(e => {
56 | var conn: Connection = null
57 | var ps: PreparedStatement = null
58 | val sql = s"insert into $tbName(word, count) values (?, ?)"
59 | try {
60 | conn = DriverManager.getConnection(s"jdbc:mysql://$dbUrl:$dbPort/$dbName", dbUser, dbPwd)
61 | ps = conn.prepareStatement(sql)
62 | e.foreach(pair => {
63 | ps.setString(1, pair._1)
64 | ps.setLong(2, pair._2)
65 | ps.executeUpdate()
66 | })
67 |
68 | ps.close()
69 | conn.close()
70 | } catch {
71 | case e: Exception => e.printStackTrace()
72 | } finally {
73 | if (ps != null) {
74 | ps.close()
75 | }
76 | if (conn != null) {
77 | conn.close()
78 | }
79 | }
80 | Iterator.empty
81 | }).count()
82 | }
83 |
84 | override def getAppName: String = "E-MapReduce Demo 10: Spark Rds Demo (Scala)"
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/SparkWordCount.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark
19 |
20 | /** Counts words in new text files created in the given directory */
21 | object SparkWordCount extends RunLocally {
22 | def main(args: Array[String]): Unit = {
23 | if (args.length < 3) {
24 | System.err.println(
25 | """Usage: bin/spark-submit --class com.aliyun.emr.example.SparkWordCount examples-1.0-SNAPSHOT-shaded.jar
26 |           |<inputPath> <outputPath> <numPartitions>
27 | |Arguments:
28 | |
29 | | inputPath Input OSS object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/input/words.txt
30 | | outputPath Output OSS object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/output
31 | | numPartitions The number of RDD partitions.
32 | |
33 | """.stripMargin)
34 | System.exit(1)
35 | }
36 |
37 | val inputPath = args(0)
38 | val outputPath = args(1)
39 | val numPartitions = args(2).toInt
40 |
41 | val input = getSparkContext.textFile(inputPath, numPartitions)
42 | val output = input.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
43 |
44 | output.saveAsTextFile(outputPath)
45 | }
46 |
47 | override def getAppName: String = "E-MapReduce Demo 1: SparkWordCount"
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/sql/ODPSDataSourceSample.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.sql
19 |
20 | import org.apache.spark.sql.{SaveMode, SparkSession}
21 |
22 | object ODPSDataSourceSample {
23 | def main(args: Array[String]): Unit = {
24 | if (args.length < 6) {
25 | System.err.println(
26 | """Usage: ODPSDataSourceSample
27 |           |<accessKeyId> <accessKeySecret> <envType> <project> <table> <numPartitions>
28 | |Arguments:
29 | |
30 | | accessKeyId Aliyun Access Key ID.
31 | | accessKeySecret Aliyun Key Secret.
32 | | envType 0 or 1
33 | | 0: Public environment.
34 | | 1: Aliyun internal environment, i.e. Aliyun ECS etc.
35 | | project Aliyun ODPS project
36 | | table Aliyun ODPS table
37 | | numPartitions the number of RDD partitions
38 | """.stripMargin)
39 | System.exit(1)
40 | }
41 |
42 | val accessKeyId = args(0)
43 | val accessKeySecret = args(1)
44 | val envType = args(2).toInt
45 | val project = args(3)
46 | val table = args(4)
47 |
48 | val urls = Seq(
49 | Seq("http://service.odps.aliyun.com/api", "http://dt.odps.aliyun.com"), // public environment
50 | Seq("http://odps-ext.aliyun-inc.com/api", "http://dt-ext.odps.aliyun-inc.com") // Aliyun internal environment
51 | )
52 |
53 | val odpsUrl = urls(envType)(0)
54 | val tunnelUrl = urls(envType)(1)
55 |
56 | val ss = SparkSession.builder().appName("Test Odps Read").master("local[*]").getOrCreate()
57 |
58 | import ss.implicits._
59 |
60 | val dataSeq = (1 to 1000000).map {
61 | index => (index, (index-3).toString)
62 | }.toSeq
63 |
64 |
65 | val df = ss.sparkContext.makeRDD(dataSeq).toDF("a", "b")
66 |
67 | System.out.println("*****" + table + ",before overwrite table")
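    // SaveMode.Overwrite is meant to replace any existing contents of the ODPS table with
    // the generated (a, b) rows before the table is read back below.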
68 | df.write.format("org.apache.spark.aliyun.odps.datasource")
69 | .option("odpsUrl", odpsUrl)
70 | .option("tunnelUrl", tunnelUrl)
71 | .option("table", table)
72 | .option("project", project)
73 | .option("accessKeySecret", accessKeySecret)
74 | .option("accessKeyId", accessKeyId).mode(SaveMode.Overwrite).save()
75 |
76 | System.out.println("*****" + table + ",after overwrite table, before read table")
77 |
78 | val readDF = ss.read
79 | .format("org.apache.spark.aliyun.odps.datasource")
80 | .option("odpsUrl", odpsUrl)
81 | .option("tunnelUrl", tunnelUrl)
82 | .option("table", table)
83 | .option("project", project)
84 | .option("accessKeySecret", accessKeySecret)
85 | .option("accessKeyId", accessKeyId).load()
86 |
87 |
88 | val collectList = readDF.collect()
89 | System.out.println("*****" + table + ",after read table," + collectList.size)
90 | assert(collectList.length == 1000000)
91 | assert((1 to 1000000).par.exists(n => collectList.exists(_.getLong(0) == n)))
92 |
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/sql/streaming/SparkSLSContinuousStructuredStreamingDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package com.aliyun.emr.example.spark.sql.streaming
18 |
19 | import java.util.UUID
20 |
21 | import org.apache.spark.sql.SparkSession
22 | import org.apache.spark.sql.streaming.Trigger
23 |
24 | object SparkSLSContinuousStructuredStreamingDemo {
25 | def main(args: Array[String]) {
26 | if (args.length < 7) {
27 |       System.err.println("Usage: SparkSLSContinuousStructuredStreamingDemo <project> <logStore> " +
28 |         "<accessKeyId> <accessKeySecret> <endpoint> " +
29 |         "<startingOffsets> <maxOffsetsPerTrigger> [<checkpointLocation>]")
30 | System.exit(1)
31 | }
32 |
33 | val Array(project, logStore, accessKeyId, accessKeySecret, endpoint, startingOffsets, maxOffsetsPerTrigger, _*) = args
34 | val checkpointLocation =
35 | if (args.length > 7) args(7) else "/tmp/temporary-" + UUID.randomUUID.toString
36 |
37 | val spark = SparkSession
38 | .builder
39 | .appName("E-MapReduce Demo 6-5: Spark SLS Demo (Scala)")
40 | .master("local[5]")
41 | .getOrCreate()
42 |
43 | spark.sparkContext.setLogLevel("WARN")
44 |
45 | import spark.implicits._
46 |
47 | // Create DataSet representing the stream of input lines from loghub
48 | val lineLength = spark
49 | .readStream
50 | .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider")
51 | .option("sls.project", project)
52 | .option("sls.store", logStore)
53 | .option("access.key.id", accessKeyId)
54 | .option("access.key.secret", accessKeySecret)
55 | .option("endpoint", endpoint)
56 | .option("startingoffsets", startingOffsets)
57 | .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger)
58 | .load()
59 | .selectExpr("CAST(__value__ AS STRING)")
60 | .as[String].map(e => (e, e.length)).toDF("value", "length")
61 |
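    // Trigger.Continuous switches the query to continuous-processing mode; the
    // "5 second" value is the checkpoint interval, not a micro-batch interval.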
62 | val query = lineLength.writeStream
63 | .outputMode("append")
64 | .format("console")
65 | .option("checkpointLocation", checkpointLocation)
66 | .trigger(Trigger.Continuous("5 second"))
67 | .start()
68 |
69 | query.awaitTermination()
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/sql/streaming/SparkSLSStructuredStreamingDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package com.aliyun.emr.example.spark.sql.streaming
18 |
19 | import java.util.UUID
20 |
21 | import org.apache.spark.sql.SparkSession
22 |
23 | object SparkSLSStructuredStreamingDemo {
24 | def main(args: Array[String]) {
25 | if (args.length < 7) {
26 |       System.err.println("Usage: SparkSLSStructuredStreamingDemo <project> <logStore> " +
27 |         "<accessKeyId> <accessKeySecret> <endpoint> " +
28 |         "<startingOffsets> <maxOffsetsPerTrigger> [<checkpointLocation>]")
29 | System.exit(1)
30 | }
31 |
32 | val Array(project, logStore, accessKeyId, accessKeySecret, endpoint, startingOffsets, maxOffsetsPerTrigger, _*) = args
33 | val checkpointLocation =
34 | if (args.length > 7) args(7) else "/tmp/temporary-" + UUID.randomUUID.toString
35 |
36 | val spark = SparkSession
37 | .builder
38 | .appName("E-MapReduce Demo 6-3: Spark SLS Demo (Scala)")
39 | .master("local[5]")
40 | .getOrCreate()
41 |
42 | spark.sparkContext.setLogLevel("WARN")
43 |
44 | import spark.implicits._
45 |
46 | // Create DataSet representing the stream of input lines from loghub
47 | val lines = spark
48 | .readStream
49 | .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider")
50 | .option("sls.project", project)
51 | .option("sls.store", logStore)
52 | .option("access.key.id", accessKeyId)
53 | .option("access.key.secret", accessKeySecret)
54 | .option("endpoint", endpoint)
55 | .option("startingoffsets", startingOffsets)
56 | .option("zookeeper.connect.address", "localhost:2181")
57 | .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger)
58 | .load()
59 | .selectExpr("CAST(__value__ AS STRING)")
60 | .as[String]
61 |
62 |     val wordCounts = lines.flatMap(_.split(" ")).groupBy("value").count()
63 |
64 | val query = wordCounts.writeStream
65 | .outputMode("complete")
66 | .format("console")
67 | .option("checkpointLocation", checkpointLocation)
68 | .start()
69 |
70 | query.awaitTermination()
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/DirectSparkSLSDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming
19 |
20 | import com.aliyun.openservices.loghub.client.config.LogHubCursorPosition
21 | import org.apache.spark.SparkConf
22 | import org.apache.spark.streaming.aliyun.logservice.{DirectLoghubInputDStream, LoghubUtils}
23 | import org.apache.spark.streaming.{Milliseconds, StreamingContext}
24 |
25 | object DirectSparkSLSDemo {
26 | def main(args: Array[String]): Unit = {
27 | if (args.length < 7) {
28 | System.err.println(
29 | """Usage: DirectSparkSLSDemo
30 |           |<loghubProject> <logStore> <loghubGroupName> <endpoint> <accessKeyId> <accessKeySecret> <batchIntervalSeconds> [<zkAddress>]
31 | """.stripMargin)
32 | System.exit(1)
33 | }
34 |
35 | val loghubProject = args(0)
36 | val logStore = args(1)
37 | val loghubGroupName = args(2)
38 | val endpoint = args(3)
39 | val accessKeyId = args(4)
40 | val accessKeySecret = args(5)
41 | val batchInterval = Milliseconds(args(6).toInt * 1000)
42 | val zkAddress = if (args.length >= 8) args(7) else "localhost:2181"
43 |
44 | def functionToCreateContext(): StreamingContext = {
45 | val conf = new SparkConf().setAppName("E-MapReduce Demo 6-2: Spark SLS Demo (Scala) (Direct API)")
46 | val ssc = new StreamingContext(conf, batchInterval)
47 | val zkParas = Map("zookeeper.connect" -> zkAddress,
48 | "enable.auto.commit" -> "false")
49 | val loghubStream = LoghubUtils.createDirectStream(
50 | ssc,
51 | loghubProject,
52 | logStore,
53 | loghubGroupName,
54 | accessKeyId,
55 | accessKeySecret,
56 | endpoint,
57 | zkParas,
58 | LogHubCursorPosition.END_CURSOR)
59 |
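      // Offsets are committed back to the Loghub consumer group only after each batch has
      // been processed, so the direct stream provides at-least-once semantics.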
60 | loghubStream.checkpoint(batchInterval).foreachRDD(rdd => {
61 | println(s"count by key: ${rdd.map(s => {
62 | s.sorted
63 | (s.length, s)
64 | }).countByKey().size}")
65 | loghubStream.asInstanceOf[DirectLoghubInputDStream].commitAsync()
66 | })
67 | ssc.checkpoint("hdfs:///tmp/spark/streaming") // set checkpoint directory
68 | ssc
69 | }
70 |
71 | val ssc = StreamingContext.getOrCreate("hdfs:///tmp/spark/streaming", functionToCreateContext _)
72 |
73 | ssc.start()
74 | ssc.awaitTermination()
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/DtsSample.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming
19 |
20 | import com.aliyun.drc.clusterclient.message.ClusterMessage
21 |
22 | import org.apache.spark.SparkConf
23 | import org.apache.spark.storage.StorageLevel
24 | import org.apache.spark.streaming.aliyun.dts.DtsUtils
25 | import org.apache.spark.streaming.{Milliseconds, StreamingContext}
26 |
27 | object DtsSample {
28 | def main(args: Array[String]): Unit = {
29 | if (args.length < 4) {
30 | System.err.println(s"""
31 |         |Usage: DtsSample <accessKeyId> <accessKeySecret> <guid> <usePublicIp> <interval>
32 |         |  <accessKeyId>      Aliyun Access Key ID.
33 |         |  <accessKeySecret>  Aliyun Access Key Secret.
34 |         |  <guid>             Aliyun DTS guid name.
35 |         |  <usePublicIp>      Whether to use a public IP to access DTS.
36 |         |  <interval>         The time interval (in milliseconds) at which streaming data will be divided into batches.
37 | """.stripMargin)
38 | System.exit(1)
39 | }
40 |
41 | val Array(accessKeyId, accessKeySecret, guid, usePublicIp, interval) = args
42 | val sparkConf = new SparkConf().setAppName("DtsSample")
43 | val ssc: StreamingContext = new StreamingContext(sparkConf, Milliseconds(interval.toInt))
44 |
45 | def func: ClusterMessage => String = msg => msg.getRecord.toString
46 |
47 | val dtsStream = DtsUtils.createStream(
48 | ssc,
49 | accessKeyId,
50 | accessKeySecret,
51 | guid,
52 | func,
53 | StorageLevel.MEMORY_AND_DISK_2,
54 | usePublicIp.toBoolean)
55 |
56 | dtsStream.foreachRDD(rdd => {
57 | rdd.collect().foreach(println)
58 | })
59 |
60 | ssc.start()
61 | ssc.awaitTermination()
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/RedisWordCount.scala.1:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming
19 |
20 | import org.apache.spark.{SparkContext, SparkConf}
21 | import org.apache.spark.streaming.{Seconds, StreamingContext}
22 | import org.apache.spark.storage.StorageLevel
23 | import com.redislabs.provider.redis._
24 |
25 | object RedisWordCount {
26 | def main(args: Array[String]): Unit = {
27 | if (args.length < 4) {
28 | System.err.println(
29 | """Usage: bin/spark-submit --class RedisWordCount examples-1.0-SNAPSHOT-shaded.jar
30 |           |<redisHost> <redisPort> <redisAuth> <keyName>
31 | |
32 | |Arguments:
33 | |
34 | | redisHost Redis host.
35 | | redisPort Redis port.
36 | | redisAuth Redis auth.
37 | | keyName Redis key name.
38 | |
39 | """.stripMargin)
40 | System.exit(1)
41 | }
42 |
43 | val redisHost = args(0)
44 | val redisPort = args(1)
45 | val redisAuth = args(2)
46 | val keyName = args(3)
47 |
48 | val conf = new SparkConf().setAppName("Redis WordCount").setMaster("local[4]")
49 | conf.set("redis.host", redisHost)
50 | conf.set("redis.port", redisPort)
51 | conf.set("redis.auth", redisAuth)
52 | val sc = new SparkContext(conf)
53 | val ssc = new StreamingContext(sc, Seconds(1))
54 |
55 | val redisStream = ssc.createRedisStream(Array(keyName), storageLevel = StorageLevel.MEMORY_AND_DISK_2)
56 | redisStream.print()
57 |
58 | ssc.start()
59 | ssc.awaitTermination()
60 | }
61 | }
62 |
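63 | // Example invocation (illustrative, not part of the original file; values in angle brackets
64 | // are placeholders):
65 | //   spark-submit --class com.aliyun.emr.example.spark.streaming.RedisWordCount \
66 | //     examples-1.0-SNAPSHOT-shaded.jar <redisHost> <redisPort> <redisAuth> <keyName>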
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkDatahubDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming
19 |
20 | import com.aliyun.datahub.model.RecordEntry
21 |
22 | import org.apache.spark.SparkConf
23 | import org.apache.spark.storage.StorageLevel
24 | import org.apache.spark.streaming.{Milliseconds, StreamingContext}
25 | import org.apache.spark.streaming.aliyun.datahub.DatahubUtils
26 | import org.apache.spark.streaming.dstream.DStream
27 |
28 | object SparkDatahubDemo {
29 | def main(args: Array[String]): Unit = {
30 | if (args.length < 7) {
31 | // scalastyle:off
32 | System.err.println(
33 |         """Usage: SparkDatahubDemo <project> <topic> <subId> <accessKeyId> <accessKeySecret> <endpoint> <batchInterval>
34 |           |         [<shardId>]
35 | """.stripMargin)
36 | // scalastyle:on
37 | System.exit(1)
38 | }
39 |
40 | var isShardDefined = false
41 | if (args.length == 8) {
42 | isShardDefined = true
43 | }
44 |
45 | val project = args(0)
46 | val topic = args(1)
47 | val subId = args(2)
48 | val accessKeyId = args(3)
49 | val accessKeySecret = args(4)
50 | val endpoint = args(5)
51 | val batchInterval = Milliseconds(args(6).toInt * 1000)
52 |
53 | def functionToCreateContext(): StreamingContext = {
54 | val conf = new SparkConf().setMaster("local[4]").setAppName("E-MapReduce Demo 11: Spark DataHub Demo (Scala)")
55 | conf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem")
56 | conf.set("spark.hadoop.mapreduce.job.run-local", "true")
57 | val ssc = new StreamingContext(conf, batchInterval)
58 | var datahubStream: DStream[Array[Byte]] = null
59 | if (isShardDefined) {
60 | val shardId = args(7)
61 | datahubStream = DatahubUtils.createStream(
62 | ssc,
63 | project,
64 | topic,
65 | subId,
66 | accessKeyId,
67 | accessKeySecret,
68 | endpoint,
69 | shardId,
70 | read(_),
71 | StorageLevel.MEMORY_AND_DISK)
72 | } else {
73 | datahubStream = DatahubUtils.createStream(
74 | ssc,
75 | project,
76 | topic,
77 | subId,
78 | accessKeyId,
79 | accessKeySecret,
80 | endpoint,
81 | read(_),
82 | StorageLevel.MEMORY_AND_DISK)
83 | }
84 |
85 | // scalastyle:off
86 | datahubStream.foreachRDD(rdd => println(s"rdd.count(): ${rdd.count()}"))
87 | // scalastyle:on
88 | ssc.checkpoint("hdfs:///tmp/spark/streaming") // set checkpoint directory
89 | ssc
90 | }
91 |
92 | val ssc = StreamingContext.getOrCreate("hdfs:///tmp/spark/streaming", functionToCreateContext _)
93 |
94 | ssc.start()
95 | ssc.awaitTermination()
96 | }
97 |
98 | def read(record: RecordEntry): String = {
99 | record.getString(0)
100 | }
101 | }
102 |
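103 | // Illustrative submit command (added for clarity, not part of the original file; values in angle
104 | // brackets are placeholders, the trailing shard id is optional, and the batch interval argument
105 | // is in seconds, since it is multiplied by 1000 above):
106 | //   spark-submit --class com.aliyun.emr.example.spark.streaming.SparkDatahubDemo \
107 | //     examples-1.0-SNAPSHOT-shaded.jar <project> <topic> <subId> <accessKeyId> <accessKeySecret> \
108 | //     <endpoint> <batchInterval> [<shardId>]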
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkHBaseDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming
19 |
20 | import com.aliyun.openservices.ons.api.Message
21 | import org.apache.hadoop.conf.Configuration
22 | import org.apache.hadoop.hbase.{HConstants, HBaseConfiguration, TableName}
23 | import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put}
24 | import org.apache.hadoop.hbase.util.Bytes
25 | import org.apache.spark.SparkConf
26 | import org.apache.spark.storage.StorageLevel
27 | import org.apache.spark.streaming.aliyun.ons.OnsUtils
28 | import org.apache.spark.streaming.{StreamingContext, Seconds}
29 | import scala.collection.JavaConversions._
30 |
31 | object ConnectionUtil extends Serializable {
32 | private var conf: Configuration = null
33 |
34 | private var connection: Connection = null
35 |
36 | def getDefaultConn(quorum: String): Connection = {
37 | if (conf == null && connection == null) {
38 | conf = HBaseConfiguration.create()
39 | conf.set(HConstants.ZOOKEEPER_QUORUM, quorum)
40 | conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/hbase")
41 | connection = ConnectionFactory.createConnection(conf)
42 | }
43 | connection
44 | }
45 | }
46 |
47 | object SparkHBaseDemo {
48 | def main(args: Array[String]): Unit = {
49 | if (args.length < 7) {
50 | System.err.println(
51 | """Usage: spark-submit --class SparkHBaseDemo examples-1.0-SNAPSHOT-shaded.jar
52 |           |<accessKeyId> <accessKeySecret> <consumerId> <topic> <subExpression> <tableName> <quorum>
53 | |
54 | |Arguments:
55 | |
56 | | accessKeyId Aliyun Access Key ID.
57 |           |  accessKeySecret      Aliyun Access Key Secret.
58 | | consumerId ONS ConsumerID.
59 | | topic ONS topic.
60 | | subExpression * for all, or some specific tag.
61 | | tableName The name of HBase table.
62 | | quorum HBase quorum setting.
63 | |
64 | """.stripMargin)
65 | System.exit(1)
66 | }
67 |
68 | val Array(accessKeyId, accessKeySecret, consumerId, topic, subExpression, tname, quorum) = args
69 |
70 | val COLUMN_FAMILY_BYTES = Bytes.toBytes("count")
71 | val COLUMN_QUALIFIER_BYTES = Bytes.toBytes("count")
72 |
73 | val batchInterval = Seconds(2)
74 |
75 | val conf = new SparkConf().setAppName("E-MapReduce Demo 9: Spark HBase Demo (Scala)")
76 | val ssc = new StreamingContext(conf, batchInterval)
77 | def func: Message => Array[Byte] = msg => msg.getBody
78 | val onsStream = OnsUtils.createStream(
79 | ssc,
80 | consumerId,
81 | topic,
82 | subExpression,
83 | accessKeyId,
84 | accessKeySecret,
85 | StorageLevel.MEMORY_AND_DISK_2,
86 | func)
87 |
88 | onsStream.foreachRDD(rdd => {
89 | rdd.map(bytes => new String(bytes))
90 | .flatMap(line => line.split(" "))
91 | .map(word => (word, 1))
92 | .reduceByKey(_ + _)
93 | .mapPartitions {words => {
94 | val conn = ConnectionUtil.getDefaultConn(quorum)
95 | val tableName = TableName.valueOf(tname)
96 | val t = conn.getTable(tableName)
97 | try {
98 | words.sliding(100, 100).foreach(slice => {
99 | val puts = slice.map(word => {
100 | println(s"word: $word")
101 | val put = new Put(Bytes.toBytes(word._1 + System.currentTimeMillis()))
102 | put.addColumn(COLUMN_FAMILY_BYTES, COLUMN_QUALIFIER_BYTES,
103 | System.currentTimeMillis(), Bytes.toBytes(word._2))
104 | put
105 | }).toList
106 | t.put(puts)
107 | })
108 | } finally {
109 | t.close()
110 | }
111 |
112 | Iterator.empty
113 | }}.count()
114 | })
115 |
116 | ssc.start()
117 | ssc.awaitTermination()
118 | }
119 | }
120 |
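121 | // Example invocation (illustrative, not part of the original file; values in angle brackets
122 | // are placeholders):
123 | //   spark-submit --class com.aliyun.emr.example.spark.streaming.SparkHBaseDemo \
124 | //     examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> <consumerId> <topic> \
125 | //     <subExpression> <tableName> <quorum>
126 | // Each batch writes its word counts to the given HBase table in slices of 100 puts per call.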
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkKafkaDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming
19 |
20 | import org.apache.kafka.common.serialization.StringDeserializer
21 | import org.apache.spark.SparkConf
22 | import org.apache.spark.streaming._
23 | import org.apache.spark.streaming.kafka010._
24 |
25 | object SparkKafkaDemo {
26 | def main(args: Array[String]) {
27 |     if (args.length < 3) {
28 | System.err.println(s"""
29 |           |Usage: SparkKafkaDemo <brokers> <topics> <interval>
30 |           |  <brokers> is a list of one or more Kafka brokers
31 |           |  <topics> is a list of one or more kafka topics to consume from
32 |           |  <interval> is the batch interval in seconds
33 | """.stripMargin)
34 | System.exit(1)
35 | }
36 | val Array(brokers, topics, interval) = args
37 |
38 | val sparkConf = new SparkConf().setAppName("E-MapReduce Demo 9: Spark Kafka Demo (Scala)")
39 | val ssc = new StreamingContext(sparkConf, Seconds(interval.toInt))
40 |
41 | val kafkaParams = Map[String, Object](
42 | "bootstrap.servers" -> brokers,
43 | "key.deserializer" -> classOf[StringDeserializer],
44 | "value.deserializer" -> classOf[StringDeserializer],
45 | "group.id" -> "mugen1",
46 | "auto.offset.reset" -> "earliest",
47 | "enable.auto.commit" -> (false: java.lang.Boolean),
48 | "security.protocol" -> "SASL_PLAINTEXT",
49 | "sasl.mechanism" -> "GSSAPI",
50 | "sasl.kerberos.service.name" -> "kafka"
51 | )
52 |
53 | val messages = KafkaUtils.createDirectStream[String, String](
54 | ssc,
55 | LocationStrategies.PreferConsistent,
56 |       ConsumerStrategies.Subscribe[String, String](topics.split(","), kafkaParams)
57 | )
58 |
59 | // Get the lines, split them into words, count the words and print
60 | val lines = messages.map(_.value)
61 | val words = lines.flatMap(_.split(" "))
62 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
63 | wordCounts.print()
64 |
65 | // Start the computation
66 | ssc.start()
67 | ssc.awaitTermination()
68 | }
69 | }
70 |
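71 | // Illustrative submit command (added for clarity, not part of the original file; values in
72 | // angle brackets are placeholders):
73 | //   spark-submit --class com.aliyun.emr.example.spark.streaming.SparkKafkaDemo \
74 | //     examples-1.0-SNAPSHOT-shaded.jar <broker1:9092,broker2:9092> <topic1,topic2> <intervalSeconds>
75 | // The kafkaParams above assume a Kerberized cluster (SASL_PLAINTEXT with GSSAPI); remove the three
76 | // security-related entries when connecting to an unsecured Kafka broker.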
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkMNSDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming
19 |
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.storage.StorageLevel
22 | import org.apache.spark.streaming.aliyun.mns.MnsUtils
23 | import org.apache.spark.streaming.{Seconds, StreamingContext}
24 |
25 | object SparkMNSDemo {
26 | def main(args: Array[String]): Unit = {
27 | if (args.length < 4) {
28 | System.err.println(
29 |         """Usage: spark-submit --class SparkMNSDemo examples-1.0-SNAPSHOT-shaded.jar <queueName> <accessKeyId> <accessKeySecret> <endpoint>""".stripMargin)
30 | System.exit(1)
31 | }
32 | val queueName = args(0)
33 | val accessKeyId = args(1)
34 | val accessKeySecret = args(2)
35 | val endpoint = args(3)
36 |
37 | val conf = new SparkConf().setAppName("E-MapReduce Demo 8-1: Spark MNS Demo (Scala)").setMaster("local[4]")
38 | conf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem")
39 | conf.set("spark.hadoop.mapreduce.job.run-local", "true")
40 | val batchInterval = Seconds(10)
41 | val ssc = new StreamingContext(conf, batchInterval)
42 |
43 | val mnsStream = MnsUtils.createPullingStreamAsBytes(ssc, queueName, accessKeyId, accessKeySecret, endpoint,
44 | StorageLevel.MEMORY_ONLY)
45 | mnsStream.foreachRDD( rdd => {
46 | rdd.collect().foreach(e => println(new String(e)))
47 | })
48 |
49 | ssc.start()
50 | ssc.awaitTermination()
51 | }
52 | }
53 |
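54 | // Example invocation (illustrative, not part of the original file; values in angle brackets
55 | // are placeholders):
56 | //   spark-submit --class com.aliyun.emr.example.spark.streaming.SparkMNSDemo \
57 | //     examples-1.0-SNAPSHOT-shaded.jar <queueName> <accessKeyId> <accessKeySecret> <endpoint>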
--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkRocketMQDemo.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.aliyun.emr.example.spark.streaming
19 |
20 | import java.util.{Properties, UUID}
21 |
22 | import com.aliyun.openservices.ons.api.impl.ONSFactoryImpl
23 | import com.aliyun.openservices.ons.api.{Message, PropertyKeyConst}
24 | import org.apache.spark.storage.StorageLevel
25 | import org.apache.spark.streaming.aliyun.ons.OnsUtils
26 | import org.apache.spark.streaming.{Milliseconds, StreamingContext}
27 | import org.apache.spark.{SparkConf, SparkContext}
28 |
29 | object SparkRocketMQDemo {
30 | def main(args: Array[String]): Unit = {
31 | if (args.length < 6) {
32 | System.err.println(
33 | """Usage: bin/spark-submit --class com.aliyun.emr.example.spark.streaming.SparkRocketMQDemo examples-1.0-SNAPSHOT-shaded.jar
34 | |