├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── conf
    └── benchmark.properties
├── data
    ├── The_Sorrows_of_Young_Werther.txt
    ├── abalone
    ├── patterns.txt
    └── u.data
├── lib
    ├── kafka-tools-1.0.0.jar
    └── tutorial.jar
├── pic
    ├── 1.JPG
    ├── 10.JPG
    ├── 11.JPG
    ├── 2.JPG
    ├── 3.JPG
    ├── 4.JPG
    ├── 5.JPG
    ├── 6.JPG
    ├── 7.JPG
    ├── 8.JPG
    └── 9.JPG
├── pom.xml
├── resources
    ├── The_Sorrows_of_Young_Werther.txt
    ├── patterns.txt
    └── student_data.csv
└── src
    └── main
        ├── hive
            └── sample.hive
        ├── java
            └── com
            │   └── aliyun
            │       └── emr
            │           └── example
            │               ├── hadoop
            │                   ├── EMapReduceOSSUtil.java
            │                   └── WordCount.java
            │               ├── spark
            │                   ├── SparkMaxComputeJavaDemo.java
            │                   ├── SparkOssJavaDemo.java
            │                   ├── SparkTableStoreJavaDemo.java
            │                   ├── sql
            │                   │   └── streaming
            │                   │   │   ├── SparkSLSContinuousStructuredStreamingJavaDemo.java
            │                   │   │   └── SparkSLSStructuredStreamingJavaDemo.java
            │                   └── streaming
            │                   │   ├── JavaLoghubWordCount.java
            │                   │   ├── SparkMNSJavaDemo.java
            │                   │   └── SparkRocketMQJavaDemo.java
            │               └── storm
            │                   ├── StormKafkaSample.java
            │                   └── benchmark
            │                       ├── AbstractTopology.java
            │                       ├── BasicTopology.java
            │                       ├── KafkaHdfs.java
            │                       ├── TridentWordCount.java
            │                       ├── WindowedWordCount.java
            │                       ├── WordCount.java
            │                       └── util
            │                           └── Helper.java
        ├── pig
            └── sample.pig
        ├── python
            ├── deeplearning
            │   ├── data
            │   │   ├── boston
            │   │   │   └── train.csv
            │   │   └── moviedata
            │   │   │   ├── movies.csv
            │   │   │   └── ratings.csv
            │   ├── tf_fm_on_spark.py
            │   └── train_boston.py
            ├── odps-sample.py
            ├── streaming
            │   ├── loghub-wordcount.py
            │   ├── wcmapper.py
            │   └── wcreducer.py
            └── wordcount.py
        └── scala
            └── com
                └── aliyun
                    └── emr
                        └── example
                            ├── flink
                                └── FlinkOSSSample.scala
                            └── spark
                                ├── AbstractParams.scala
                                ├── LinearRegression.scala
                                ├── MongoDBWordCount.scala
                                ├── RunLocally.scala
                                ├── SparkMaxComputeDemo.scala
                                ├── SparkOssDemo.scala
                                ├── SparkPi.scala
                                ├── SparkRdsDemo.scala
                                ├── SparkWordCount.scala
                                ├── sql
                                    ├── ODPSDataSourceSample.scala
                                    └── streaming
                                    │   ├── SparkSLSContinuousStructuredStreamingDemo.scala
                                    │   └── SparkSLSStructuredStreamingDemo.scala
                                └── streaming
                                    ├── DirectSparkSLSDemo.scala
                                    ├── DtsSample.scala
                                    ├── RedisWordCount.scala.1
                                    ├── SparkDatahubDemo.scala
                                    ├── SparkHBaseDemo.scala
                                    ├── SparkKafkaDemo.scala
                                    ├── SparkMNSDemo.scala
                                    ├── SparkRocketMQDemo.scala
                                    ├── SparkSLSDemo.scala
                                    └── benchmark
                                        ├── AbstractStreaming.scala
                                        ├── KafkaHdfs.scala
                                        ├── WordCount.scala
                                        └── metrics
                                            ├── BasicMetrics.scala
                                            ├── HdfsMetrics.scala
                                            └── KafkaMetrics.scala


/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | .idea/
3 | *.iml
4 | *.DS_Store
5 | bin/*
6 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # 1. Choose OS (Ubuntu 14.04.3 LTS Server Edition 64bit, ~2 CORE, 7.5GB RAM)
 2 | sudo: required
 3 | 
 4 | # 2. Choose language and target JDKs for parallel builds.
 5 | language: java
 6 | jdk:
 7 |   - oraclejdk8
 8 | 
 9 | # 3. Setup cache directory for SBT and Maven.
10 | cache:
11 |   directories:
12 |   - $HOME/.m2
13 | 
14 | # 4. Run maven install before running lint-java.
15 | install:
16 |   -
17 | 
18 | script:
19 |   - echo -e '<?xml version="1.0" encoding="UTF-8"?>\n<settings xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.1.0 http://maven.apache.org/xsd/settings-1.1.0.xsd" xmlns="http://maven.apache.org/SETTINGS/1.1.0"\n    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n  <mirrors>\n    <mirror>\n      <id>mvnsearch-unavailable</id>\n      <name>mvnsearch-unavailable</name>\n      <mirrorOf>mvnsearch</mirrorOf>\n      <url>http://repo1.maven.org/maven2</url>\n    </mirror>\n  </mirrors>\n  <profiles>\n    <profile>\n      <id>no-mvnsearch</id>\n      <repositories>\n        <repository>\n          <id>mvnsearch</id>\n          <url>http://www.mvnsearch.org/maven2</url>\n          <releases>\n            <enabled>true</enabled>\n          </releases>\n          <snapshots>\n            <enabled>true</enabled>\n          </snapshots>\n        </repository>\n      </repositories>\n    </profile>\n  </profiles>\n  <activeProfiles>\n    <activeProfile>no-mvnsearch</activeProfile>\n  </activeProfiles>\n</settings>' > $HOME/.m2/settings.xml
20 |   - cat $HOME/.m2/settings.xml
21 |   - mvn clean package -DskipTests
22 | 
23 | # 5. Branches only
24 | branches:
25 |   only:
26 |   - master-2
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |  The Artistic License 2.0
  2 | 
  3 |            Copyright (c) 2015 aliyun
  4 | 
  5 |      Everyone is permitted to copy and distribute verbatim copies
  6 |       of this license document, but changing it is not allowed.
  7 | 
  8 | Preamble
  9 | 
 10 | This license establishes the terms under which a given free software
 11 | Package may be copied, modified, distributed, and/or redistributed.
 12 | The intent is that the Copyright Holder maintains some artistic
 13 | control over the development of that Package while still keeping the
 14 | Package available as open source and free software.
 15 | 
 16 | You are always permitted to make arrangements wholly outside of this
 17 | license directly with the Copyright Holder of a given Package.  If the
 18 | terms of this license do not permit the full use that you propose to
 19 | make of the Package, you should contact the Copyright Holder and seek
 20 | a different licensing arrangement.
 21 | 
 22 | Definitions
 23 | 
 24 |     "Copyright Holder" means the individual(s) or organization(s)
 25 |     named in the copyright notice for the entire Package.
 26 | 
 27 |     "Contributor" means any party that has contributed code or other
 28 |     material to the Package, in accordance with the Copyright Holder's
 29 |     procedures.
 30 | 
 31 |     "You" and "your" means any person who would like to copy,
 32 |     distribute, or modify the Package.
 33 | 
 34 |     "Package" means the collection of files distributed by the
 35 |     Copyright Holder, and derivatives of that collection and/or of
 36 |     those files. A given Package may consist of either the Standard
 37 |     Version, or a Modified Version.
 38 | 
 39 |     "Distribute" means providing a copy of the Package or making it
 40 |     accessible to anyone else, or in the case of a company or
 41 |     organization, to others outside of your company or organization.
 42 | 
 43 |     "Distributor Fee" means any fee that you charge for Distributing
 44 |     this Package or providing support for this Package to another
 45 |     party.  It does not mean licensing fees.
 46 | 
 47 |     "Standard Version" refers to the Package if it has not been
 48 |     modified, or has been modified only in ways explicitly requested
 49 |     by the Copyright Holder.
 50 | 
 51 |     "Modified Version" means the Package, if it has been changed, and
 52 |     such changes were not explicitly requested by the Copyright
 53 |     Holder.
 54 | 
 55 |     "Original License" means this Artistic License as Distributed with
 56 |     the Standard Version of the Package, in its current version or as
 57 |     it may be modified by The Perl Foundation in the future.
 58 | 
 59 |     "Source" form means the source code, documentation source, and
 60 |     configuration files for the Package.
 61 | 
 62 |     "Compiled" form means the compiled bytecode, object code, binary,
 63 |     or any other form resulting from mechanical transformation or
 64 |     translation of the Source form.
 65 | 
 66 | 
 67 | Permission for Use and Modification Without Distribution
 68 | 
 69 | (1)  You are permitted to use the Standard Version and create and use
 70 | Modified Versions for any purpose without restriction, provided that
 71 | you do not Distribute the Modified Version.
 72 | 
 73 | 
 74 | Permissions for Redistribution of the Standard Version
 75 | 
 76 | (2)  You may Distribute verbatim copies of the Source form of the
 77 | Standard Version of this Package in any medium without restriction,
 78 | either gratis or for a Distributor Fee, provided that you duplicate
 79 | all of the original copyright notices and associated disclaimers.  At
 80 | your discretion, such verbatim copies may or may not include a
 81 | Compiled form of the Package.
 82 | 
 83 | (3)  You may apply any bug fixes, portability changes, and other
 84 | modifications made available from the Copyright Holder.  The resulting
 85 | Package will still be considered the Standard Version, and as such
 86 | will be subject to the Original License.
 87 | 
 88 | 
 89 | Distribution of Modified Versions of the Package as Source
 90 | 
 91 | (4)  You may Distribute your Modified Version as Source (either gratis
 92 | or for a Distributor Fee, and with or without a Compiled form of the
 93 | Modified Version) provided that you clearly document how it differs
 94 | from the Standard Version, including, but not limited to, documenting
 95 | any non-standard features, executables, or modules, and provided that
 96 | you do at least ONE of the following:
 97 | 
 98 |     (a)  make the Modified Version available to the Copyright Holder
 99 |     of the Standard Version, under the Original License, so that the
100 |     Copyright Holder may include your modifications in the Standard
101 |     Version.
102 | 
103 |     (b)  ensure that installation of your Modified Version does not
104 |     prevent the user installing or running the Standard Version. In
105 |     addition, the Modified Version must bear a name that is different
106 |     from the name of the Standard Version.
107 | 
108 |     (c)  allow anyone who receives a copy of the Modified Version to
109 |     make the Source form of the Modified Version available to others
110 |     under
111 | 
112 |     (i)  the Original License or
113 | 
114 |     (ii)  a license that permits the licensee to freely copy,
115 |     modify and redistribute the Modified Version using the same
116 |     licensing terms that apply to the copy that the licensee
117 |     received, and requires that the Source form of the Modified
118 |     Version, and of any works derived from it, be made freely
119 |     available in that license fees are prohibited but Distributor
120 |     Fees are allowed.
121 | 
122 | 
123 | Distribution of Compiled Forms of the Standard Version
124 | or Modified Versions without the Source
125 | 
126 | (5)  You may Distribute Compiled forms of the Standard Version without
127 | the Source, provided that you include complete instructions on how to
128 | get the Source of the Standard Version.  Such instructions must be
129 | valid at the time of your distribution.  If these instructions, at any
130 | time while you are carrying out such distribution, become invalid, you
131 | must provide new instructions on demand or cease further distribution.
132 | If you provide valid instructions or cease distribution within thirty
133 | days after you become aware that the instructions are invalid, then
134 | you do not forfeit any of your rights under this license.
135 | 
136 | (6)  You may Distribute a Modified Version in Compiled form without
137 | the Source, provided that you comply with Section 4 with respect to
138 | the Source of the Modified Version.
139 | 
140 | 
141 | Aggregating or Linking the Package
142 | 
143 | (7)  You may aggregate the Package (either the Standard Version or
144 | Modified Version) with other packages and Distribute the resulting
145 | aggregation provided that you do not charge a licensing fee for the
146 | Package.  Distributor Fees are permitted, and licensing fees for other
147 | components in the aggregation are permitted. The terms of this license
148 | apply to the use and Distribution of the Standard or Modified Versions
149 | as included in the aggregation.
150 | 
151 | (8) You are permitted to link Modified and Standard Versions with
152 | other works, to embed the Package in a larger work of your own, or to
153 | build stand-alone binary or bytecode versions of applications that
154 | include the Package, and Distribute the result without restriction,
155 | provided the result does not expose a direct interface to the Package.
156 | 
157 | 
158 | Items That are Not Considered Part of a Modified Version
159 | 
160 | (9) Works (including, but not limited to, modules and scripts) that
161 | merely extend or make use of the Package, do not, by themselves, cause
162 | the Package to be a Modified Version.  In addition, such works are not
163 | considered parts of the Package itself, and are not subject to the
164 | terms of this license.
165 | 
166 | 
167 | General Provisions
168 | 
169 | (10)  Any use, modification, and distribution of the Standard or
170 | Modified Versions is governed by this Artistic License. By using,
171 | modifying or distributing the Package, you accept this license. Do not
172 | use, modify, or distribute the Package, if you do not accept this
173 | license.
174 | 
175 | (11)  If your Modified Version has been derived from a Modified
176 | Version made by someone other than you, you are nevertheless required
177 | to ensure that your Modified Version complies with the requirements of
178 | this license.
179 | 
180 | (12)  This license does not grant you the right to use any trademark,
181 | service mark, tradename, or logo of the Copyright Holder.
182 | 
183 | (13)  This license includes the non-exclusive, worldwide,
184 | free-of-charge patent license to make, have made, use, offer to sell,
185 | sell, import and otherwise transfer the Package with respect to any
186 | patent claims licensable by the Copyright Holder that are necessarily
187 | infringed by the Package. If you institute patent litigation
188 | (including a cross-claim or counterclaim) against any party alleging
189 | that the Package constitutes direct or contributory patent
190 | infringement, then this Artistic License to you shall terminate on the
191 | date that such litigation is filed.
192 | 
193 | (14)  Disclaimer of Warranty:
194 | THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS
195 | IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED
196 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
197 | NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL
198 | LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL
199 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
200 | DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF
201 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## 本项目包含以下示例：
  2 | 
  3 | #### MapReduce
  4 | 
  5 | - WordCount： 单词统计
  6 | 
  7 | #### Hive
  8 | 
  9 | - sample.hive：表的简单查询
 10 | 
 11 | #### Pig
 12 | 
 13 | - sample.pig：Pig处理OSS数据实例
 14 | 
 15 | #### Spark
 16 | 
 17 | - SparkPi: 计算Pi
 18 | - SparkWordCount： 单词统计
 19 | - LinearRegression： 线性回归
 20 | - OSSSample： OSS使用示例
 21 | - ONSSample： ONS使用示例
 22 | - ODPSSample： ODPS使用示例
 23 | - MNSSample：MNS使用示例
 24 | - LoghubSample：Loghub使用示例 
 25 | 
 26 | #### PySpark
 27 | 
 28 | - WordCount: 单词统计
 29 | 
 30 | ## 依赖资源
 31 | 
 32 | 测试数据（data目录下）：
 33 | 
 34 | - The_Sorrows_of_Young_Werther.txt：可作为WordCount（MapReduce/Spark）的输入数据
 35 | - patterns.txt：WordCount（MapReduce）作业的过滤字符
 36 | - u.data：sample.hive脚本的测试表数据
 37 | - abalone：线性回归算法测试数据
 38 | 
 39 | 依赖jar包（lib目录下）
 40 | 
 41 | - tutorial.jar：sample.pig作业需要的依赖jar包
 42 | 
 43 | ## 准备工作
 44 | 
 45 | 本项目提供了一些测试数据，您可以简单地将其上传到OSS中即可使用。其他示例，例如ODPS，MNS，ONS和Loghub等等，需要您自己准备数据如下：
 46 | 
 47 | - 【可选】 创建LogStore，参考[日志服务用户指南](https://help.aliyun.com/document_detail/sls/user-guide/overview.html?spm=5176.docsls/user-guide/consume-logs.3.2.VW5TNb)。
 48 | - 【可选】 创建ODPS项目和表，参考[ODPS快速开始](https://help.aliyun.com/document_detail/odps/quick_start/prerequisite.html?spm=5176.docodps/quick_start/prerequisite.3.2.OqBkc4)。
 49 | - 【可选】 创建ONS，参考[消息队列快速开始](https://help.aliyun.com/document_detail/ons/quick-start/apply.html?spm=5176.docons/quick-start/send.3.2.eZ8h7p)。
 50 | - 【可选】 创建MNS，参考[消息服务控制台使用帮助](https://help.aliyun.com/document_detail/mns/help_of_console/AccessMNSBySubUser.html?spm=5176.docmns/help_of_console/help_of_queue/CreateQueue.3.2.0Sj96I)。
 51 | 
 52 | ## 基本概念：
 53 | 
 54 | - OSSURI： **oss**://accessKeyId:accessKeySecret@bucket.endpoint/a/b/c.txt，用户在作业中指定输入输出数据源时使用，可以类比hdfs://。
 55 | - 阿里云AccessKeyId/AccessKeySecret是您访问阿里云API的密钥，你可以在[这里](https://ak-console.aliyun.com/#/accesskey)获取。
 56 | 
 57 | ## 集群运行
 58 | 
 59 | - Spark
 60 | 	- SparkWordCount： `spark-submit --class SparkWordCount examples-1.0-SNAPSHOT-shaded.jar <inputPath> <outputPath> <numPartition>`
 61 | 		- inputPath： 输入数据路径
 62 | 		- outputPath： 输出路径
 63 | 		- numPartition： 输入数据RDD分片数目
 64 | 	- SparkPi： `spark-submit --class SparkPi examples-1.0-SNAPSHOT-shaded.jar`
 65 | 	- SparkOssDemo：`spark-submit --class SparkOssDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> <endpoint> <inputPath> <numPartition>`
 66 | 	    - accessKeyId： 阿里云AccessKeyId
 67 |         - accessKeySecret：阿里云AccessKeySecret
 68 |         - endpoint: 阿里云OSS endpoint
 69 | 		- inputPath: 输入数据路径
 70 | 		- numPartition：输入数据RDD分片数目
 71 | 	- SparkRocketMQDemo： `spark-submit --class SparkRocketMQDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> <consumerId> <topic> <subExpression> <parallelism>`
 72 | 		- accessKeyId： 阿里云AccessKeyId
 73 | 		- accessKeySecret：阿里云AccessKeySecret
 74 | 		- consumerId: 参考[Consumer ID说明](https://help.aliyun.com/document_detail/ons/brief-manual/terminology.html?spm=5176.docons/brief-manual/overview.6.87.F8suBu)
 75 | 		- topic: 每个消息队列都有一个topic
 76 | 		- subExpression: 参考[消息过滤](https://help.aliyun.com/document_detail/ons/user-guide/tag-filter.html?spm=5176.docons/tcp/java-sdk/normal-consumer.6.97.PIqsEo)。
 77 | 		- parallelism：指定多少个接收器来消费队列消息。
 78 | 	- SparkMaxComputeDemo: `spark-submit --class SparkMaxComputeDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> <envType> <project> <table> <numPartitions>`
 79 | 		- accessKeyId： 阿里云AccessKeyId
 80 | 		- accessKeySecret：阿里云AccessKeySecret
 81 | 		- envType： 0表示公网环境，1表示内网环境。如果是本地调试选择0，如果是在E-MapReduce上执行请选择1。
 82 | 		- project：参考[ODPS-快速开始](https://help.aliyun.com/document_detail/odps/quick_start/prerequisite.html?spm=5176.docodps/summary/glossary.6.90.inv9Ph)。
 83 | 		- table：参考[ODPS术语介绍](https://help.aliyun.com/document_detail/odps/summary/glossary.html?spm=5176.docodps/quick_start/prerequisite.6.88.A5zVKu)。
 84 | 		- numPartition：输入数据RDD分片数目
 85 | 	- SparkMNSDemo: `spark-submit --class SparkMNSDemo examples-1.0-SNAPSHOT-shaded.jar <queueName> <accessKeyId> <accessKeySecret> <endpoint>`
 86 | 		- queueName：队列名，[参考MNS名词解释](https://help.aliyun.com/document_detail/mns/introduction/product-name-interpretation.html?spm=5176.docmns/help_of_console/help_of_queue/CreateQueue.6.87.lHtPvO)。
 87 | 		- accessKeyId： 阿里云AccessKeyId
 88 | 		- accessKeySecret：阿里云AccessKeySecret
 89 | 		- endpoint：队列数据访问地址 
 90 | 	- SparkSLSDemo: `spark-submit --class SparkSLSDemo examples-1.0-SNAPSHOT-shaded.jar <sls project> <sls logstore> <loghub group name> <sls endpoint> <access key id> <access key secret> <batch interval seconds>`
 91 | 		- sls project: LogService项目名
 92 | 		- sls logstore： 日志库名
 93 | 		- loghub group name：作业中消费日志数据的组名，可以任意取。sls project，sls store相同时，相同组名的作业会协同消费sls store中的数据；不同组名的作业会相互隔离地消费sls store中的数据。
 94 | 		- sls endpoint： 参考[日志服务入口](https://help.aliyun.com/document_detail/sls/api/endpoints.html?spm=5176.docsls/user-guide/concept.6.134.Gy05tN)。
 95 | 		- accessKeyId： 阿里云AccessKeyId
 96 | 		- accessKeySecret：阿里云AccessKeySecret
 97 | 		- batch interval seconds： Spark Streaming作业的批次间隔，单位为秒。
 98 | 	- LinearRegression: `spark-submit --class LinearRegression examples-1.0-SNAPSHOT-shaded.jar <inputPath> <numPartitions>`
 99 | 		- inputPath：输入数据
100 | 		- numPartition：输入数据RDD分片数目 
101 | 
102 | - PySpark
103 | 	- WordCount： `spark-submit wordcount.py <inputPath> <outputPath> <numPartition>`
104 | 		- inputPath： 输入数据路径
105 | 		- outputPath： 输出路径
106 | 		- numPartition： 输入数据RDD分片数目
107 | 
108 | - Mapreduce
109 | 	- WordCount： `hadoop jar examples-1.0-SNAPSHOT-shaded.jar WordCount -Dwordcount.case.sensitive=true <inputPath> <outputPath> -skip <patternPath>`
110 | 		- inputPathl：输入数据路径
111 | 		- outputPath：输出路径
112 | 		- patternPath：过滤字符文件，可以使用data/patterns.txt
113 | 
114 | - Hadoop Streaming
115 | 	- WordCount： `hadoop jar /usr/lib/hadoop-current/share/hadoop/tools/lib/hadoop-streaming-*.jar -file <mapperPyFile> -mapper mapper.py -file <reducerPyFile> -reducer reducer.py -input <inputPath> -output <outputPath>`
116 | 		- mapperPyFile mapper文件，[mapper样例](/src/main/python/streaming/wcmapper.py)
117 | 		- reducerPyFile reducer文件, [reducer样例](/src/main/python/streaming/wcreducer.py)
118 | 		- inputPath：输入数据路径
119 | 		- outputPath：输出路径
120 | 
121 | - Hive
122 | 	- `hive -f sample.hive -hiveconf inputPath=<inputPath>`
123 | 		- inputPath：输入数据路径
124 | 
125 | - Pig
126 | 	- `pig -x mapreduce -f sample.pig -param tutorial=<tutorialJarPath> -param input=<inputPath> -param result=<resultPath>`
127 | 		- tutorialJarPath：依赖Jar包，可使用lib/tutorial.jar
128 | 		- inputPath：输入数据路径
129 | 		- resultPath：输出路径
130 | 
131 | - 注意：
132 | 	- 如果在E-MapReduce上使用时，请将测试数据和依赖jar包上传到OSS中，路径规则遵循OSSURI定义，见上。
133 | 	- 如果集群中使用，可以放在机器本地。
134 | 
135 | ## 本地运行
136 | 
137 | 这里主要介绍如何在本地运行Spark程序访问阿里云数据源，例如OSS等。如果希望本地调试运行，最好借助一些开发工具，例如Intellij IDEA或者Eclipse。尤其是Windows环境，否则需要在Windows机器上配置Hadoop和Spark运行环境，很麻烦。
138 | 
139 | - Intellij IDEA
140 | 	- 前提：安装Intellij IDEA，Maven， Intellij IDEA Maven插件，Scala，Intellij IDEA Scala插件
141 | 	- 双击进入SparkWordCount.scala
142 | 	  ![idea5](pic/11.JPG)
143 | 	- 从下图箭头所指处进入作业配置界面
144 | 	 ![idea1](pic/7.JPG)
145 |     - 选择SparkWordCount，在作业参数框中按照所需传入作业参数
146 |       ![idea2](pic/8.JPG)
147 |     - 点击“OK”
148 |     - 点击运行按钮，执行作业
149 |       ![idea3](pic/9.JPG)
150 |     - 查看作业执行日志
151 |       ![idea4](pic/10.JPG)
152 | 
153 | - Scala IDE for Eclipse
154 | 	- 前提：安装Scala IDE for Eclipse，Maven，Eclipse Maven插件
155 | 	- 导入项目
156 | 	  ![eclipse2](pic/2.JPG)
157 |       ![eclipse3](pic/3.JPG)
158 |       ![eclipse4](pic/4.JPG)
159 |     - Run As Maven build，快捷键是“Alt + Shilft + X, M”；也可以在项目名上右键，“Run As”选择“Maven build”
160 |     - 等待编译完后，在需要运行的作业上右键，选择“Run Configuration”，进入配置页      
161 |     - 在配置页中，选择Scala Application，并配置作业的Main Class和参数等等。
162 |       ![eclipse5](pic/5.JPG)
163 |     - 点击“Run”
164 |     - 查看控制台输出日志
165 |       ![eclipse6](pic/6.JPG)
166 | 


--------------------------------------------------------------------------------
/conf/benchmark.properties:
--------------------------------------------------------------------------------
 1 | ##common
 2 | name=KafkaHdfs
 3 | 
 4 | ## cluster
 5 | cluster.cores.total=160
 6 | cluster.worker.node.number=5
 7 | cluster.memory.per.node.mb=90000
 8 | 
 9 | ##kafka producer
10 | partition.number=50
11 | topic=st-36
12 | consumer.group=streaming
13 | zookeeper.address=localhost
14 | zookeeper.root=/kafka-1.0.0
15 | broker.list=localhost:9092
16 | 
17 | ##kafka consumer
18 | result.topic=benchmark-result
19 | result.broker.list=localhost:9092
20 | 
21 | ##storm
22 | worker.slot.number=10
23 | # spout.parallelism equals kafka partition.number
24 | #spout.parallelism=25
25 | window.length=10
26 | slide.interval=10
27 | backpressure.enable=false
28 | hdfs.parallelism.factor=1
29 | # trident
30 | # 0 if disable ack
31 | ack.open=true
32 | 
33 | ##spark streaming
34 | #deploy.mode=yarn-client to make use of cluster header node
35 | duration.ms=1000
36 | spark.executor.instances=10
37 | # recerver number=kafka-partition/factor
38 | kafka.partition.receiver.factor=1
39 | spark.yarn.am.memory.mb=20000
40 | spark.yarn.am.cores=15
41 | #default 200ms, recommend >= 50ms
42 | spark.streaming.blockInterval=200ms
43 | # vcore = physical-core *factor
44 | cpu.core.factor=1.5
45 | 
46 | ##hdfs
47 | url=hdfs://emr-header-1:9000
48 | filename.prefix=/foo/
49 | sync.record.number=1000
50 | 
51 | ## metric
52 | benchmark.app.name=KafkaHdfs
53 | metric.numPartitions=100
54 | from.spark.streaming=true
55 | metric.duration.second=60
56 | metric.group.id=kafka-metrics
57 | 


--------------------------------------------------------------------------------
/data/The_Sorrows_of_Young_Werther.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/data/The_Sorrows_of_Young_Werther.txt


--------------------------------------------------------------------------------
/data/patterns.txt:
--------------------------------------------------------------------------------
1 | \.
2 | \,
3 | \!
4 | to
5 | \"


--------------------------------------------------------------------------------
/lib/kafka-tools-1.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/lib/kafka-tools-1.0.0.jar


--------------------------------------------------------------------------------
/lib/tutorial.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/lib/tutorial.jar


--------------------------------------------------------------------------------
/pic/1.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/1.JPG


--------------------------------------------------------------------------------
/pic/10.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/10.JPG


--------------------------------------------------------------------------------
/pic/11.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/11.JPG


--------------------------------------------------------------------------------
/pic/2.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/2.JPG


--------------------------------------------------------------------------------
/pic/3.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/3.JPG


--------------------------------------------------------------------------------
/pic/4.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/4.JPG


--------------------------------------------------------------------------------
/pic/5.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/5.JPG


--------------------------------------------------------------------------------
/pic/6.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/6.JPG


--------------------------------------------------------------------------------
/pic/7.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/7.JPG


--------------------------------------------------------------------------------
/pic/8.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/8.JPG


--------------------------------------------------------------------------------
/pic/9.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/9.JPG


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     <modelVersion>4.0.0</modelVersion>
  6 | 
  7 |     <groupId>com.aliyun.emr</groupId>
  8 |     <artifactId>examples</artifactId>
  9 |     <version>1.2.0</version>
 10 |     <packaging>jar</packaging>
 11 |     <name>Aliyun E-MapReduce Demo Project</name>
 12 | 
 13 |     <properties>
 14 |         <spark.version>2.3.1</spark.version>
 15 |         <flink.version>1.4.0</flink.version>
 16 |         <emr.version>2.0.0</emr.version>
 17 |         <oss.sdk.version>3.0.0</oss.sdk.version>
 18 |         <odps.version>0.28.4-public</odps.version>
 19 |         <loghubb.client.version>0.6.13</loghubb.client.version>
 20 |         <ons.version>1.7.1.Final</ons.version>
 21 |         <rocketmq.version>3.5.9</rocketmq.version>
 22 |         <hadoop.version>2.9.0</hadoop.version>
 23 |         <aspectjrt.version>1.7.3</aspectjrt.version>
 24 |         <scopt.version>3.2.0</scopt.version>
 25 |         <fastjson.version>1.2.45</fastjson.version>
 26 |         <guava.version>11.0.2</guava.version>
 27 |         <mysql.connector.version>5.1.34</mysql.connector.version>
 28 |         <mongodb.version>0.11.2</mongodb.version>
 29 |         <redis.clients.version>2.7.2</redis.clients.version>
 30 |         <commons.pool2.version>2.0</commons.pool2.version>
 31 |         <hbase.version>1.1.1</hbase.version>
 32 |         <dts.version>4.6.27.12.0</dts.version>
 33 |         <commons.lang3.version>3.3.2</commons.lang3.version>
 34 |     </properties>
 35 | 
 36 |     <dependencies>
 37 |         <dependency>
 38 |             <groupId>org.apache.spark</groupId>
 39 |             <artifactId>spark-core_2.11</artifactId>
 40 |             <version>${spark.version}</version>
 41 |         </dependency>
 42 | 
 43 |         <dependency>
 44 |             <groupId>org.apache.spark</groupId>
 45 |             <artifactId>spark-mllib_2.11</artifactId>
 46 |             <version>${spark.version}</version>
 47 |         </dependency>
 48 | 
 49 |         <dependency>
 50 |             <groupId>org.apache.spark</groupId>
 51 |             <artifactId>spark-sql_2.11</artifactId>
 52 |             <version>${spark.version}</version>
 53 |         </dependency>
 54 | 
 55 |         <dependency>
 56 |             <groupId>org.apache.spark</groupId>
 57 |             <artifactId>spark-streaming_2.11</artifactId>
 58 |             <version>${spark.version}</version>
 59 |         </dependency>
 60 | 
 61 |         <dependency>
 62 |             <groupId>org.apache.spark</groupId>
 63 |             <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
 64 |             <version>${spark.version}</version>
 65 |         </dependency>
 66 | 
 67 |         <dependency>
 68 |             <groupId>org.apache.spark</groupId>
 69 |             <artifactId>spark-hive_2.11</artifactId>
 70 |             <version>${spark.version}</version>
 71 |             <exclusions>
 72 |                 <exclusion>
 73 |                     <groupId>org.apache.httpcomponents</groupId>
 74 |                     <artifactId>httpclient</artifactId>
 75 |                 </exclusion>
 76 |                 <exclusion>
 77 |                     <groupId>org.apache.httpcomponents</groupId>
 78 |                     <artifactId>httpcore</artifactId>
 79 |                 </exclusion>
 80 |             </exclusions>
 81 |         </dependency>
 82 | 
 83 |         <dependency>
 84 |             <groupId>org.apache.flink</groupId>
 85 |             <artifactId>flink-core</artifactId>
 86 |             <version>${flink.version}</version>
 87 |         </dependency>
 88 | 
 89 |         <dependency>
 90 |             <groupId>org.apache.flink</groupId>
 91 |             <artifactId>flink-clients_2.11</artifactId>
 92 |             <version>${flink.version}</version>
 93 |         </dependency>
 94 | 
 95 |         <dependency>
 96 |             <groupId>org.apache.flink</groupId>
 97 |             <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
 98 |             <version>1.4.2</version>
 99 |         </dependency>
100 | 
101 |         <!--支持OTS数据源-->
102 |         <dependency>
103 |             <groupId>com.aliyun.emr</groupId>
104 |             <artifactId>emr-tablestore</artifactId>
105 |             <version>${emr.version}</version>
106 |         </dependency>
107 | 
108 |         <!-- 支持 MNS、ONS、LogService、MaxCompute数据源 (Spark 2.x环境)-->
109 |         <dependency>
110 |             <groupId>com.aliyun.emr</groupId>
111 |             <artifactId>emr-mns_2.11</artifactId>
112 |             <version>${emr.version}</version>
113 |             <exclusions>
114 |                 <exclusion>
115 |                     <groupId>com.aliyun.mns</groupId>
116 |                     <artifactId>aliyun-sdk-mns</artifactId>
117 |                 </exclusion>
118 |             </exclusions>
119 |         </dependency>
120 | 
121 |         <dependency>
122 |             <groupId>com.aliyun.emr</groupId>
123 |             <artifactId>emr-logservice_2.11</artifactId>
124 |             <version>${emr.version}</version>
125 |         </dependency>
126 | 
127 |         <dependency>
128 |             <groupId>com.aliyun.openservices</groupId>
129 |             <artifactId>aliyun-log</artifactId>
130 |             <version>0.6.60</version>
131 |         </dependency>
132 | 
133 | 
134 |         <dependency>
135 |             <groupId>com.aliyun.emr</groupId>
136 |             <artifactId>emr-maxcompute_2.11</artifactId>
137 |             <version>${emr.version}</version>
138 |         </dependency>
139 | 
140 |         <dependency>
141 |             <groupId>com.aliyun.emr</groupId>
142 |             <artifactId>emr-ons_2.11</artifactId>
143 |             <version>${emr.version}</version>
144 |         </dependency>
145 | 
146 |         <dependency>
147 |             <groupId>com.aliyun.emr</groupId>
148 |             <artifactId>emr-dts_2.11</artifactId>
149 |             <version>${emr.version}</version>
150 |         </dependency>
151 | 
152 |         <dependency>
153 |             <groupId>com.aliyun.oss</groupId>
154 |             <artifactId>aliyun-sdk-oss</artifactId>
155 |             <version>${oss.sdk.version}</version>
156 |         </dependency>
157 | 
158 |         <dependency>
159 |             <groupId>com.aliyun.odps</groupId>
160 |             <artifactId>odps-sdk-core</artifactId>
161 |             <version>${odps.version}</version>
162 |             <exclusions>
163 |                 <exclusion>
164 |                     <groupId>org.codehaus.jackson</groupId>
165 |                     <artifactId>jackson-mapper-asl</artifactId>
166 |                 </exclusion>
167 |                 <exclusion>
168 |                     <groupId>org.codehaus.jackson</groupId>
169 |                     <artifactId>jackson-core-asl</artifactId>
170 |                 </exclusion>
171 |             </exclusions>
172 |         </dependency>
173 | 
174 |         <dependency>
175 |             <groupId>com.aliyun.odps</groupId>
176 |             <artifactId>odps-sdk-commons</artifactId>
177 |             <version>${odps.version}</version>
178 |         </dependency>
179 | 
180 |         <dependency>
181 |             <groupId>com.aliyun.openservices</groupId>
182 |             <artifactId>loghub-client-lib</artifactId>
183 |             <version>${loghubb.client.version}</version>
184 |         </dependency>
185 | 
186 |         <dependency>
187 |             <groupId>com.aliyun.openservices</groupId>
188 |             <artifactId>ons-client</artifactId>
189 |             <version>${ons.version}</version>
190 |         </dependency>
191 | 
192 |         <dependency>
193 |             <groupId>com.aliyun.openservices</groupId>
194 |             <artifactId>ons-api</artifactId>
195 |             <version>${ons.version}</version>
196 |         </dependency>
197 | 
198 |         <dependency>
199 |             <groupId>com.alibaba.rocketmq</groupId>
200 |             <artifactId>rocketmq-client</artifactId>
201 |             <version>${rocketmq.version}</version>
202 |         </dependency>
203 | 
204 |         <dependency>
205 |             <groupId>com.alibaba.rocketmq</groupId>
206 |             <artifactId>rocketmq-common</artifactId>
207 |             <version>${rocketmq.version}</version>
208 |         </dependency>
209 | 
210 |         <dependency>
211 |             <groupId>com.alibaba.rocketmq</groupId>
212 |             <artifactId>rocketmq-remoting</artifactId>
213 |             <version>${rocketmq.version}</version>
214 |         </dependency>
215 | 
216 |         <dependency>
217 |             <groupId>org.apache.hadoop</groupId>
218 |             <artifactId>hadoop-mapreduce-client-core</artifactId>
219 |             <version>${hadoop.version}</version>
220 |             <exclusions>
221 |                 <exclusion>
222 |                     <groupId>jdk.tools</groupId>
223 |                     <artifactId>jdk.tools</artifactId>
224 |                 </exclusion>
225 |             </exclusions>     
226 |         </dependency>
227 | 
228 |         <dependency>
229 |             <groupId>org.aspectj</groupId>
230 |             <artifactId>aspectjrt</artifactId>
231 |             <version>${aspectjrt.version}</version>
232 |         </dependency>
233 | 
234 |         <dependency>
235 |             <groupId>com.github.scopt</groupId>
236 |             <artifactId>scopt_2.10</artifactId>
237 |             <version>${scopt.version}</version>
238 |         </dependency>
239 | 
240 |         <dependency>
241 |             <groupId>com.alibaba</groupId>
242 |             <artifactId>fastjson</artifactId>
243 |             <version>${fastjson.version}</version>
244 |         </dependency>
245 | 
246 |         <dependency>
247 |             <groupId>com.google.guava</groupId>
248 |             <artifactId>guava</artifactId>
249 |             <version>${guava.version}</version>
250 |         </dependency>
251 | 
252 |         <dependency>
253 |             <groupId>mysql</groupId>
254 |             <artifactId>mysql-connector-java</artifactId>
255 |             <version>${mysql.connector.version}</version>
256 |         </dependency>
257 | 
258 |         <dependency>
259 |             <groupId>com.stratio.datasource</groupId>
260 |             <artifactId>spark-mongodb_2.10</artifactId>
261 |             <version>${mongodb.version}</version>
262 |         </dependency>
263 | 
264 |         <dependency>
265 |             <groupId>redis.clients</groupId>
266 |             <artifactId>jedis</artifactId>
267 |             <version>${redis.clients.version}</version>
268 |         </dependency>
269 | 
270 |         <dependency>
271 |             <groupId>org.apache.commons</groupId>
272 |             <artifactId>commons-pool2</artifactId>
273 |             <version>${commons.pool2.version}</version>
274 |         </dependency>
275 | 
276 |         <dependency>
277 |             <groupId>org.apache.hbase</groupId>
278 |             <artifactId>hbase-client</artifactId>
279 |             <version>${hbase.version}</version>
280 |             <exclusions>
281 |                 <exclusion>
282 |                     <groupId>jdk.tools</groupId>
283 |                     <artifactId>jdk.tools</artifactId>
284 |                 </exclusion>
285 |                 <exclusion>
286 |                     <groupId>org.apache.hadoop</groupId>
287 |                     <artifactId>hadoop-mapreduce-client-core</artifactId>
288 |                 </exclusion>
289 |             </exclusions>
290 |         </dependency>
291 | 
292 |         <dependency>
293 |             <groupId>org.apache.hbase</groupId>
294 |             <artifactId>hbase-common</artifactId>
295 |             <version>${hbase.version}</version>
296 |         </dependency>
297 | 
298 |         <dependency>
299 |             <groupId>org.apache.hbase</groupId>
300 |             <artifactId>hbase-protocol</artifactId>
301 |             <version>${hbase.version}</version>
302 |         </dependency>
303 |              
304 |         <dependency>
305 |             <groupId>com.aliyun.mns</groupId>
306 |             <artifactId>aliyun-sdk-mns</artifactId>
307 |             <version>1.1.8.8</version>
308 |         </dependency>
309 | 
310 |         <dependency>
311 |             <groupId>org.apache.httpcomponents</groupId>
312 |             <artifactId>httpasyncclient</artifactId>
313 |             <version>4.1</version>
314 |         </dependency>
315 | 
316 |         <dependency>
317 |             <groupId>org.apache.httpcomponents</groupId>
318 |             <artifactId>httpcore-nio</artifactId>
319 |             <version>4.4.1</version>
320 |         </dependency>
321 | 
322 |         <dependency>
323 |             <groupId>org.apache.httpcomponents</groupId>
324 |             <artifactId>httpcore</artifactId>
325 |             <version>4.4.1</version>
326 |         </dependency>
327 | 
328 |         <dependency>
329 |             <groupId>org.apache.kafka</groupId>
330 |             <artifactId>kafka_2.11</artifactId>
331 |             <version>0.10.0.1</version>
332 |         </dependency>
333 | 
334 |         <dependency>
335 |             <groupId>org.apache.kafka</groupId>
336 |             <artifactId>kafka-clients</artifactId>
337 |             <version>0.10.0.1</version>
338 |         </dependency>
339 | 
340 |         <dependency>
341 |             <groupId>com.aliyun.dts</groupId>
342 |             <artifactId>dts-subscribe-sdk</artifactId>
343 |             <version>${dts.version}</version>
344 |         </dependency>
345 | 
346 |         <dependency>
347 |             <groupId>org.apache.commons</groupId>
348 |             <artifactId>commons-lang3</artifactId>
349 |             <version>${commons.lang3.version}</version>
350 |         </dependency>
351 | 
352 |         <dependency>
353 |             <groupId>org.apache.storm</groupId>
354 |             <artifactId>storm-core</artifactId>
355 |             <version>1.1.2</version>
356 |             <exclusions>
357 |                 <exclusion>
358 |                     <groupId>org.slf4j</groupId>
359 |                     <artifactId>log4j-over-slf4j</artifactId>
360 |                 </exclusion>
361 |             </exclusions>
362 |         </dependency>
363 | 
364 |         <dependency>
365 |             <groupId>org.apache.storm</groupId>
366 |             <artifactId>storm-kafka</artifactId>
367 |             <version>1.1.2</version>
368 |         </dependency>
369 | 
370 |         <dependency>
371 |             <groupId>org.apache.storm</groupId>
372 |             <artifactId>storm-hdfs</artifactId>
373 |             <version>1.1.2</version>
374 |         </dependency>
375 | 
376 |         <dependency>
377 |             <groupId>org.apache.storm</groupId>
378 |             <artifactId>storm-perf</artifactId>
379 |             <version>1.1.2</version>
380 |         </dependency>
381 | 
382 |         <dependency>
383 |             <groupId>org.apache.hadoop</groupId>
384 |             <artifactId>hadoop-hdfs</artifactId>
385 |             <version>2.6.1</version>
386 |         </dependency>
387 | 
388 |         <dependency>
389 |             <groupId>org.apache.hadoop</groupId>
390 |             <artifactId>hadoop-common</artifactId>
391 |             <version>2.6.1</version>
392 |         </dependency>
393 | 
394 |         <dependency>
395 |             <groupId>javax.mail</groupId>
396 |             <artifactId>mail</artifactId>
397 |             <version>1.4.7</version>
398 |         </dependency>
399 | 
400 |         <dependency>
401 |             <groupId>com.aliyun.emr</groupId>
402 |             <artifactId>emr-datahub_2.11</artifactId>
403 |             <version>2.2.0</version>
404 |         </dependency>
405 | 
406 |         <dependency>
407 |             <groupId>com.squareup.okhttp3</groupId>
408 |             <artifactId>okhttp</artifactId>
409 |             <version>3.12.0</version>
410 |         </dependency>
411 | 
412 |         <dependency>
413 |             <groupId>com.aliyun.datahub</groupId>
414 |             <artifactId>aliyun-sdk-datahub</artifactId>
415 |             <version>2.13.0-public</version>
416 |         </dependency>
417 | 
418 |         <dependency>
419 |             <groupId>org.apache.htrace</groupId>
420 |             <artifactId>htrace-core</artifactId>
421 |             <version>3.1.0-incubating</version>
422 |         </dependency>
423 | 
424 |     </dependencies>
425 | 
426 |     <build>
427 |         <outputDirectory>target/classes</outputDirectory>
428 |         <testOutputDirectory>target/test-classes</testOutputDirectory>
429 |         <plugins>
430 |             <plugin>
431 |                 <artifactId>maven-compiler-plugin</artifactId>
432 |                 <configuration>
433 |                     <source>1.8</source>
434 |                     <target>1.8</target>
435 |                     <encoding>UTF-8</encoding>
436 |                 </configuration>
437 |             </plugin>
438 |             <plugin>
439 |                 <groupId>net.alchim31.maven</groupId>
440 |                 <artifactId>scala-maven-plugin</artifactId>
441 |                 <version>4.0.1</version>
442 |                 <executions>
443 |                     <execution>
444 |                         <id>scala-compile-first</id>
445 |                         <phase>process-resources</phase>
446 |                         <goals>
447 |                             <goal>compile</goal>
448 |                         </goals>
449 |                     </execution>
450 |                     <execution>
451 |                         <id>scala-test-compile-first</id>
452 |                         <phase>process-test-resources</phase>
453 |                         <goals>
454 |                             <goal>testCompile</goal>
455 |                         </goals>
456 |                     </execution>
457 |                     <execution>
458 |                         <id>attach-scaladocs</id>
459 |                         <phase>verify</phase>
460 |                         <goals>
461 |                             <goal>doc-jar</goal>
462 |                         </goals>
463 |                     </execution>
464 |                 </executions>
465 |             </plugin>
466 |             <plugin>
467 |                 <groupId>org.apache.maven.plugins</groupId>
468 |                 <artifactId>maven-shade-plugin</artifactId>
469 |                 <version>2.4.2</version>
470 |                 <configuration>
471 |                     <shadedArtifactAttached>false</shadedArtifactAttached>
472 |                     <outputFile>${project.build.directory}/shaded/examples-${project.version}-shaded.jar</outputFile>
473 |                     <artifactSet>
474 |                         <includes>
475 |                             <include>javax.mail:mail</include>
476 |                             <include>org.apache.htrace:htrace-core</include>
477 |                             <include>com.squareup.okhttp3:okhttp</include>
478 |                             <include>com.squareup.okio:okio</include>
479 |                             <include>com.squareup.okhttp3:logging-interceptor</include>
480 |                             <include>com.squareup.retrofit2:converter-jackson</include>
481 |                             <include>com.squareup.retrofit2:retrofit</include>
482 |                             <include>com.aliyun.openservices:aliyun-sls-v0.6-inner</include>
483 |                             <include>com.aliyun.datahub:aliyun-sdk-datahub</include>
484 |                             <include>com.aliyun.emr:emr-datahub_2.11</include>
485 |                             <include>com.aliyun.emr:emr-tablestore</include>
486 |                             <include>com.aliyun.emr:emr-mns_2.11</include>
487 |                             <include>com.aliyun.emr:emr-logservice_2.11</include>
488 |                             <include>com.aliyun.emr:emr-maxcompute_2.11</include>
489 |                             <include>com.aliyun.emr:emr-ons_2.11</include>
490 |                             <include>com.aliyun.emr:emr-dts_2.11</include>
491 |                             <include>com.aliyun.odps:odps-sdk-core</include>
492 |                             <include>com.aliyun.odps:odps-sdk-commons</include>
493 |                             <include>com.aliyun.oss:aliyun-sdk-oss</include>
494 |                             <include>com.aliyun.openservices:aliyun-log</include>
495 |                             <include>com.aliyun.openservices:loghub-client-lib</include>
496 |                             <include>com.aliyun.openservices:ons-client</include>
497 |                             <include>com.aliyun.openservices:ons-api</include>
498 |                             <include>com.aliyun.mns:aliyun-sdk-mns</include>
499 |                             <include>com.aliyun.openservices:tablestore</include>
500 |                             <include>com.alibaba.rocketmq:rocketmq-client</include>
501 |                             <include>com.alibaba.rocketmq:rocketmq-common</include>
502 |                             <include>com.alibaba.rocketmq:rocketmq-remoting</include>
503 |                             <include>com.alibaba:fastjson</include>
504 |                             <include>com.google.guava:guava</include>
505 |                             <include>org.aspectj:aspectjrt</include>
506 |                             <include>com.github.scopt:scopt_2.10</include>
507 |                             <include>org.jdom:jdom</include>
508 |                             <include>net.sf.json-lib:json-lib</include>
509 |                             <include>net.sf.ezmorph:ezmorph</include>
510 |                             <include>commons-validator:commons-validator</include>
511 |                             <include>mysql:mysql-connector-java</include>
512 |                             <include>com.stratio.datasource:spark-mongodb_2.10</include>
513 |                             <include>redis.clients:jedis</include>
514 |                             <include>org.apache.commons:commons-pool2</include>
515 |                             <include>org.apache.hbase:hbase-common</include>
516 |                             <include>org.apache.hbase:hbase-client</include>
517 |                             <include>org.apache.hbase:hbase-protocol</include>
518 |                             <include>org.apache.httpcomponents:httpasyncclient</include>
519 |                             <include>org.apache.httpcomponents:httpcore-nio</include>
520 |                             <include>org.apache.httpcomponents:httpcore</include>
521 |                             <include>org.apache.spark:spark-streaming-kafka-0-10_2.11</include>
522 |                             <include>org.apache.kafka:kafka-clients</include>
523 |                             <include>org.apache.kafka:kafka_2.11</include>
524 |                             <include>org.apache.storm:storm-kafka</include>
525 |                             <include>org.apache.storm:storm-hdfs</include>
526 |                             <include>org.apache.storm:storm-perf</include>
527 |                             <include>commons-lang:commons-lang</include>
528 |                             <include>org.apache.hadoop:hadoop-hdfs</include>
529 |                             <include>org.apache.hadoop:hadoop-common</include>
530 |                             <include>com.101tec:zkclient</include>
531 |                             <include>com.aliyun.dts:dts-subscribe-sdk</include>
532 |                             <include>org.apache.commons:commons-lang3</include>
533 |                         </includes>
534 |                     </artifactSet>
535 |                 </configuration>
536 |                 <executions>
537 |                     <execution>
538 |                         <phase>package</phase>
539 |                         <goals>
540 |                             <goal>shade</goal>
541 |                         </goals>
542 |                     </execution>
543 |                 </executions>
544 |             </plugin>
545 |         </plugins>
546 |     </build>
547 | 
548 |     <repositories>
549 |          <repository>
550 |             <id>central</id>
551 |             <url>http://maven.aliyun.com/mvn/repository</url>
552 |             <releases>
553 |                 <enabled>true</enabled>
554 |             </releases>
555 |             <snapshots>
556 |                 <enabled>false</enabled>
557 |             </snapshots>
558 |             </repository>
559 |             <repository>
560 |                 <id>snapshots</id>
561 |                 <url>http://maven.aliyun.com/mvn/repository</url>
562 |             <releases>
563 |                 <enabled>false</enabled>
564 |             </releases>
565 |             <snapshots>
566 |                 <enabled>true</enabled>
567 |             </snapshots>
568 |         </repository>
569 |         <repository>
570 |             <id>oss</id>
571 |             <name>Maven SNAPSHOT Repository</name>
572 |             <url>https://oss.sonatype.org/content/repositories/snapshots/</url>
573 |             <releases>
574 |                 <enabled>false</enabled>
575 |             </releases>
576 |             <snapshots>
577 |                 <enabled>true</enabled>
578 |             </snapshots>
579 |         </repository>
580 |     </repositories>
581 | </project>
582 | 


--------------------------------------------------------------------------------
/resources/The_Sorrows_of_Young_Werther.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/resources/The_Sorrows_of_Young_Werther.txt


--------------------------------------------------------------------------------
/resources/patterns.txt:
--------------------------------------------------------------------------------
1 | \.
2 | \,
3 | \!
4 | to
5 | \"


--------------------------------------------------------------------------------
/resources/student_data.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/resources/student_data.csv


--------------------------------------------------------------------------------
/src/main/hive/sample.hive:
--------------------------------------------------------------------------------
 1 | USE DEFAULT;
 2 | set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
 3 | set mapreduce.job.maps=2;
 4 | set mapreduce.job.reduces=2;
 5 | set hive.stats.autogather=false;
 6 | 
 7 | DROP TABLE emrusers;
 8 | CREATE EXTERNAL TABLE emrusers (
 9 |   userid INT,
10 |   movieid INT,
11 |   rating INT,
12 |   unixtime STRING )
13 |  ROW FORMAT DELIMITED
14 |  FIELDS TERMINATED BY '\t'
15 |  LOCATION '${hiveconf:inputPath}';
16 | 
17 | SELECT COUNT(*) FROM emrusers;
18 | 
19 | SELECT * from emrusers limit 100;
20 | 
21 | SELECT movieid,count(userid) as usercount from emrusers group by movieid order by usercount desc limit 50;


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/hadoop/EMapReduceOSSUtil.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.hadoop;
19 | 
20 | import org.apache.hadoop.conf.Configuration;
21 | 
22 | public class EMapReduceOSSUtil {
23 | 
24 |     private static String SCHEMA = "oss://";
25 |     private static String AKSEP = ":";
26 |     private static String BKTSEP = "@";
27 |     private static String EPSEP = ".";
28 |     private static String HTTP_HEADER = "http://";
29 | 
30 |     /**
31 |      * complete OSS uri
32 |      * convert uri like: oss://bucket/path  to  oss://accessKeyId:accessKeySecret@bucket.endpoint/path
33 |      * ossref do not need this
34 |      *
35 |      * @param oriUri original OSS uri
36 |      */
37 |     public static String buildOSSCompleteUri(String oriUri, String akId, String akSecret, String endpoint) {
38 |         if (akId == null) {
39 |             System.err.println("miss accessKeyId");
40 |             return oriUri;
41 |         }
42 |         if (akSecret == null) {
43 |             System.err.println("miss accessKeySecret");
44 |             return oriUri;
45 |         }
46 |         if (endpoint == null) {
47 |             System.err.println("miss endpoint");
48 |             return oriUri;
49 |         }
50 | 
51 |         int index = oriUri.indexOf(SCHEMA);
52 |         if (index == -1 || index != 0) {
53 |             return oriUri;
54 |         }
55 | 
56 |         int bucketIndex = index + SCHEMA.length();
57 |         int pathIndex = oriUri.indexOf("/", bucketIndex);
58 |         String bucket = null;
59 |         if (pathIndex == -1) {
60 |             bucket = oriUri.substring(bucketIndex);
61 |         } else {
62 |             bucket = oriUri.substring(bucketIndex, pathIndex);
63 |         }
64 | 
65 |         StringBuilder retUri = new StringBuilder();
66 |         retUri.append(SCHEMA)
67 |                 .append(akId)
68 |                 .append(AKSEP)
69 |                 .append(akSecret)
70 |                 .append(BKTSEP)
71 |                 .append(bucket)
72 |                 .append(EPSEP)
73 |                 .append(stripHttp(endpoint));
74 | 
75 |         if (pathIndex > 0) {
76 |             retUri.append(oriUri.substring(pathIndex));
77 |         }
78 | 
79 |         return retUri.toString();
80 |     }
81 | 
82 |     public static String buildOSSCompleteUri(String oriUri, Configuration conf) {
83 |         return buildOSSCompleteUri(oriUri, conf.get("fs.oss.accessKeyId"), conf.get("fs.oss.accessKeySecret"), conf.get("fs.oss.endpoint"));
84 |     }
85 | 
86 |     private static String stripHttp(String endpoint) {
87 |         if (endpoint.startsWith(HTTP_HEADER)) {
88 |             return endpoint.substring(HTTP_HEADER.length());
89 |         }
90 |         return endpoint;
91 |     }
92 | }
93 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/hadoop/WordCount.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package com.aliyun.emr.example.hadoop;
 19 | 
 20 | import java.io.BufferedReader;
 21 | import java.io.FileReader;
 22 | import java.io.IOException;
 23 | import java.net.URI;
 24 | import java.util.ArrayList;
 25 | import java.util.HashSet;
 26 | import java.util.List;
 27 | import java.util.Set;
 28 | import java.util.StringTokenizer;
 29 | 
 30 | import org.apache.hadoop.conf.Configuration;
 31 | import org.apache.hadoop.fs.Path;
 32 | import org.apache.hadoop.io.IntWritable;
 33 | import org.apache.hadoop.io.Text;
 34 | import org.apache.hadoop.mapreduce.Job;
 35 | import org.apache.hadoop.mapreduce.Mapper;
 36 | import org.apache.hadoop.mapreduce.Reducer;
 37 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 38 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 39 | import org.apache.hadoop.mapreduce.Counter;
 40 | import org.apache.hadoop.util.GenericOptionsParser;
 41 | import org.apache.hadoop.util.StringUtils;
 42 | 
 43 | public class WordCount {
 44 | 
 45 |     public static class TokenizerMapper
 46 |             extends Mapper<Object, Text, Text, IntWritable>{
 47 | 
 48 |         static enum CountersEnum { INPUT_WORDS }
 49 | 
 50 |         private final static IntWritable one = new IntWritable(1);
 51 |         private Text word = new Text();
 52 | 
 53 |         private boolean caseSensitive;
 54 |         private Set<String> patternsToSkip = new HashSet<String>();
 55 | 
 56 |         private Configuration conf;
 57 |         private BufferedReader fis;
 58 | 
 59 |         @Override
 60 |         public void setup(Context context) throws IOException,
 61 |                 InterruptedException {
 62 |             conf = context.getConfiguration();
 63 |             caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
 64 |             if (conf.getBoolean("wordcount.skip.patterns", false)) {
 65 |                 URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
 66 |                 for (URI patternsURI : patternsURIs) {
 67 |                     Path patternsPath = new Path(patternsURI.getPath());
 68 |                     String patternsFileName = patternsPath.getName();
 69 |                     parseSkipFile(patternsFileName);
 70 |                 }
 71 |             }
 72 |         }
 73 | 
 74 |         private void parseSkipFile(String fileName) {
 75 |             try {
 76 |                 fis = new BufferedReader(new FileReader(fileName));
 77 |                 String pattern;
 78 |                 while ((pattern = fis.readLine()) != null) {
 79 |                     patternsToSkip.add(pattern);
 80 |                 }
 81 |             } catch (IOException ioe) {
 82 |                 System.err.println("Caught exception while parsing the cached file '"
 83 |                         + StringUtils.stringifyException(ioe));
 84 |             }
 85 |         }
 86 | 
 87 |         @Override
 88 |         public void map(Object key, Text value, Context context
 89 |         ) throws IOException, InterruptedException {
 90 |             String line = (caseSensitive) ?
 91 |                     value.toString() : value.toString().toLowerCase();
 92 |             for (String pattern : patternsToSkip) {
 93 |                 line = line.replaceAll(pattern, "");
 94 |             }
 95 |             StringTokenizer itr = new StringTokenizer(line);
 96 |             while (itr.hasMoreTokens()) {
 97 |                 word.set(itr.nextToken());
 98 |                 context.write(word, one);
 99 |                 Counter counter = context.getCounter(CountersEnum.class.getName(),
100 |                         CountersEnum.INPUT_WORDS.toString());
101 |                 counter.increment(1);
102 |             }
103 |         }
104 |     }
105 | 
106 |     public static class IntSumReducer
107 |             extends Reducer<Text,IntWritable,Text,IntWritable> {
108 |         private IntWritable result = new IntWritable();
109 | 
110 |         public void reduce(Text key, Iterable<IntWritable> values,
111 |                            Context context
112 |         ) throws IOException, InterruptedException {
113 |             int sum = 0;
114 |             for (IntWritable val : values) {
115 |                 sum += val.get();
116 |             }
117 |             result.set(sum);
118 |             context.write(key, result);
119 |         }
120 |     }
121 | 
122 |     public static void main(String[] args) throws Exception {
123 |         Configuration conf = new Configuration();
124 |         GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
125 |         String[] remainingArgs = optionParser.getRemainingArgs();
126 |         if (!(remainingArgs.length == 2 || remainingArgs.length == 4)) {
127 |             System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
128 |             System.exit(2);
129 |         }
130 |         Job job = Job.getInstance(conf, "word count");
131 |         job.setJarByClass(WordCount.class);
132 |         job.setMapperClass(TokenizerMapper.class);
133 |         job.setCombinerClass(IntSumReducer.class);
134 |         job.setReducerClass(IntSumReducer.class);
135 |         job.setOutputKeyClass(Text.class);
136 |         job.setOutputValueClass(IntWritable.class);
137 | 
138 |         List<String> otherArgs = new ArrayList<String>();
139 |         for (int i=0; i < remainingArgs.length; ++i) {
140 |             if ("-skip".equals(remainingArgs[i])) {
141 |                 job.addCacheFile(new Path(EMapReduceOSSUtil.buildOSSCompleteUri(remainingArgs[++i], conf)).toUri());
142 |                 job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
143 |             } else {
144 |                 otherArgs.add(remainingArgs[i]);
145 |             }
146 |         }
147 |         FileInputFormat.addInputPath(job, new Path(EMapReduceOSSUtil.buildOSSCompleteUri(otherArgs.get(0), conf)));
148 |         FileOutputFormat.setOutputPath(job, new Path(EMapReduceOSSUtil.buildOSSCompleteUri(otherArgs.get(1), conf)));
149 | 
150 |         System.exit(job.waitForCompletion(true) ? 0 : 1);
151 |     }
152 | }
153 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/SparkMaxComputeJavaDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark;
19 | 
20 | import com.aliyun.odps.TableSchema;
21 | import com.aliyun.odps.data.Record;
22 | import org.apache.spark.SparkConf;
23 | import org.apache.spark.aliyun.odps.OdpsOps;
24 | import org.apache.spark.api.java.JavaRDD;
25 | import org.apache.spark.api.java.JavaSparkContext;
26 | import org.apache.spark.api.java.function.Function2;
27 | 
28 | import java.util.ArrayList;
29 | import java.util.List;
30 | 
31 | public class SparkMaxComputeJavaDemo {
32 | 
33 |     public static void main(String[] args) {
34 |         String partition = null;
35 |         String accessId = args[0];
36 |         String accessKey = args[1];
37 | 
38 |         String odpsUrl = args[2];
39 | 
40 |         String tunnelUrl = args[3];
41 |         String project = args[4];
42 |         String table = args[5];
43 |         if (args.length > 6) {
44 |             partition = args[6];
45 |         }
46 | 
47 |         SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)");
48 |         JavaSparkContext jsc = new JavaSparkContext(sparkConf);
49 | 
50 |         OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl);
51 | 
52 |         System.out.println("Read odps table...");
53 |         JavaRDD<List<Long>> readData = odpsOps.readTableWithJava(project, table, new RecordToLongs(), Integer.valueOf(partition));
54 | 
55 |         System.out.println("counts: ");
56 |         System.out.println(readData.count());
57 |     }
58 | 
59 |     static class RecordToLongs implements Function2<Record, TableSchema, List<Long>> {
60 |         @Override
61 |         public List<Long> call(Record record, TableSchema schema) throws Exception {
62 |             List<Long> ret = new ArrayList<Long>();
63 |             for (int i = 0; i < schema.getColumns().size(); i++) {
64 |                 ret.add(record.getBigint(i));
65 |             }
66 |             return ret;
67 |         }
68 |     }
69 | 
70 | }
71 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/SparkOssJavaDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark;
19 | 
20 | import org.apache.hadoop.io.LongWritable;
21 | import org.apache.hadoop.io.Text;
22 | import org.apache.hadoop.mapred.TextInputFormat;
23 | import org.apache.spark.SparkConf;
24 | import org.apache.spark.api.java.JavaPairRDD;
25 | import org.apache.spark.api.java.JavaSparkContext;
26 | 
27 | public class SparkOssJavaDemo {
28 | 
29 |     public static void main(String[] args) {
30 | 
31 |         String accessId = args[0];
32 |         String accessKey = args[1];
33 | 
34 |         String endpoint = args[2];
35 | 
36 |         String inputPath = args[3];
37 |         String outputPath = args[4];
38 |         int partition = Integer.valueOf(args[5]);
39 | 
40 |         SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 2-2: Spark Oss Demo (Java)").setMaster("local[4]");
41 |         sparkConf.set("spark.hadoop.fs.oss.accessKeyId", accessId);
42 |         sparkConf.set("spark.hadoop.fs.oss.accessKeySecret", accessKey);
43 |         sparkConf.set("spark.hadoop.fs.oss.endpoint", endpoint);
44 |         sparkConf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem");
45 |         sparkConf.set("spark.hadoop.mapreduce.job.run-local", "true");
46 |         JavaSparkContext jsc = new JavaSparkContext(sparkConf);
47 | 
48 |         JavaPairRDD<LongWritable, Text> data = jsc.hadoopFile(inputPath, TextInputFormat.class, LongWritable.class, Text.class, partition);
49 | 
50 |         System.out.println("Count (data): " + String.valueOf(data.count()));
51 | 
52 |         data.saveAsTextFile(outputPath);
53 |     }
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/SparkTableStoreJavaDemo.java:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.spark;
 2 | 
 3 | import com.alicloud.openservices.tablestore.ecosystem.ComputeParameters;
 4 | import com.alicloud.openservices.tablestore.ecosystem.Filter;
 5 | import com.alicloud.openservices.tablestore.model.*;
 6 | import com.aliyun.openservices.tablestore.hadoop.*;
 7 | import org.apache.hadoop.conf.Configuration;
 8 | import org.apache.spark.SparkConf;
 9 | import org.apache.spark.api.java.JavaPairRDD;
10 | import org.apache.spark.api.java.JavaSparkContext;
11 | 
12 | import java.util.ArrayList;
13 | import java.util.Formatter;
14 | import java.util.List;
15 | 
16 | public class SparkTableStoreJavaDemo {
17 |     private static RangeRowQueryCriteria fetchCriteria(String tableName, String columnName) {
18 |         RangeRowQueryCriteria res = new RangeRowQueryCriteria(tableName);
19 |         res.setMaxVersions(1);
20 |         List<PrimaryKeyColumn> lower = new ArrayList<PrimaryKeyColumn>();
21 |         List<PrimaryKeyColumn> upper = new ArrayList<PrimaryKeyColumn>();
22 |         lower.add(new PrimaryKeyColumn(columnName, PrimaryKeyValue.INF_MIN));
23 |         upper.add(new PrimaryKeyColumn(columnName, PrimaryKeyValue.INF_MAX));
24 |         res.setInclusiveStartPrimaryKey(new PrimaryKey(lower));
25 |         res.setExclusiveEndPrimaryKey(new PrimaryKey(upper));
26 |         return res;
27 |     }
28 | 
29 |     public static void main(String[] args) {
30 |         String accessKeyId = args[0];
31 |         String accessKeySecret = args[1];
32 |         Filter filter = new Filter(Filter.CompareOperator.GREATER_THAN,"PK", ColumnValue.fromLong(-1000));
33 |         List<String> list = new ArrayList<>();
34 |         list.add("VALUE");
35 |         TableStoreFilterWritable tableStoreFilterWritable = new TableStoreFilterWritable(filter, list);
36 | 
37 |         String endpoint = args[2];
38 |         String instance = args[3];
39 |         String tableName = args[4];
40 |         String primaryKeyColumnName = args[5];
41 |         ComputeParams computeParams = new ComputeParams(100, 1, ComputeParameters.ComputeMode.Auto.name());
42 |         SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 5: Spark TableStore Demo (Java)");
43 |         JavaSparkContext sc = null;
44 |         try {
45 |             sc = new JavaSparkContext(sparkConf);
46 |             Configuration hadoopConf = new Configuration();
47 |             hadoopConf.set("computeParams", computeParams.serialize());
48 |             hadoopConf.set("tableName", tableName);
49 |             hadoopConf.set("filters", tableStoreFilterWritable.serialize());
50 |             TableStore.setCredential(
51 |                     hadoopConf,
52 |                     new Credential(accessKeyId, accessKeySecret, null));
53 |             Endpoint ep = new Endpoint(endpoint, instance);
54 |             TableStore.setEndpoint(hadoopConf, ep);
55 |             com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.addCriteria(hadoopConf,
56 |                     fetchCriteria(tableName, primaryKeyColumnName));
57 |             JavaPairRDD<PrimaryKeyWritable, RowWritable> rdd = sc.newAPIHadoopRDD(
58 |                     hadoopConf, com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.class,
59 |                     PrimaryKeyWritable.class, RowWritable.class);
60 |             System.out.println(
61 |                     new Formatter().format("TOTAL: %d", rdd.count()).toString());
62 |             rdd.take(10).forEach((primaryKeyWritableRowWritableTuple2) -> {
63 |                 System.out.println(String.format("Key: %s, VALUE: %s",
64 |                         primaryKeyWritableRowWritableTuple2._1.getPrimaryKey().toString(),
65 |                         primaryKeyWritableRowWritableTuple2._2.getRow().toString()));
66 |             });
67 |         } finally {
68 |             if (sc != null) {
69 |                 sc.close();
70 |             }
71 |         }
72 |     }
73 | }
74 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/sql/streaming/SparkSLSContinuousStructuredStreamingJavaDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package com.aliyun.emr.example.spark.sql.streaming;
18 | 
19 | import org.apache.spark.sql.Dataset;
20 | import org.apache.spark.sql.Encoders;
21 | import org.apache.spark.sql.SparkSession;
22 | import org.apache.spark.sql.streaming.StreamingQuery;
23 | import org.apache.spark.sql.streaming.Trigger;
24 | 
25 | import java.util.UUID;
26 | 
27 | public class SparkSLSContinuousStructuredStreamingJavaDemo {
28 | 
29 |   public static void main(String[] args) throws Exception {
30 |     if (args.length < 7) {
31 |       System.err.println("Usage: SparkSLSContinuousStructuredStreamingJavaDemo <logService-project> " +
32 |           "<logService-store> <access-key-id> <access-key-secret> " +
33 |           "<endpoint> <starting-offsets> <max-offsets-per-trigger> [<checkpoint-location>]");
34 |       System.exit(1);
35 |     }
36 | 
37 |     String logProject = args[0];
38 |     String logStore = args[1];
39 |     String accessKeyId = args[2];
40 |     String accessKeySecret = args[3];
41 |     String endpoint = args[4];
42 |     String startingOffsets = args[5];
43 |     String maxOffsetsPerTrigger = args[6];
44 |     String checkpointLocation = "/tmp/temporary-" + UUID.randomUUID().toString();
45 |     if (args.length > 7) {
46 |       checkpointLocation = args[7];
47 |     }
48 | 
49 |     SparkSession spark = SparkSession
50 |         .builder()
51 |         .master("local[5]")
52 |         .appName("E-MapReduce Demo 6-6: Spark SLS Demo (Java)")
53 |         .getOrCreate();
54 | 
55 |     spark.sparkContext().setLogLevel("WARN");
56 | 
57 |     Dataset<String> lines = spark.readStream()
58 |         .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider")
59 |         .option("sls.project", logProject)
60 |         .option("sls.store", logStore)
61 |         .option("access.key.id", accessKeyId)
62 |         .option("access.key.secret", accessKeySecret)
63 |         .option("endpoint", endpoint)
64 |         .option("startingoffsets", startingOffsets)
65 |         .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger)
66 |         .load()
67 |         .selectExpr("CAST(__value__ AS STRING)")
68 |         .as(Encoders.STRING());
69 | 
70 |     // Start running the query that prints the running counts to the console
71 |     StreamingQuery query = lines.writeStream()
72 |         .outputMode("append")
73 |         .format("console")
74 |         .option("checkpointLocation", checkpointLocation)
75 |         .trigger(Trigger.Continuous("5 second"))
76 |         .start();
77 | 
78 |     query.awaitTermination();
79 |   }
80 | }
81 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/sql/streaming/SparkSLSStructuredStreamingJavaDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package com.aliyun.emr.example.spark.sql.streaming;
18 | 
19 | import org.apache.spark.api.java.function.FlatMapFunction;
20 | import org.apache.spark.sql.Dataset;
21 | import org.apache.spark.sql.Encoders;
22 | import org.apache.spark.sql.Row;
23 | import org.apache.spark.sql.SparkSession;
24 | import org.apache.spark.sql.streaming.StreamingQuery;
25 | 
26 | import java.util.Arrays;
27 | import java.util.UUID;
28 | 
29 | public class SparkSLSStructuredStreamingJavaDemo {
30 | 
31 |   public static void main(String[] args) throws Exception {
32 |     if (args.length < 7) {
33 |       System.err.println("Usage: SparkSLSStructuredStreamingJavaDemo <logService-project> " +
34 |           "<logService-store> <access-key-id> <access-key-secret> " +
35 |           "<endpoint> <starting-offsets> <max-offsets-per-trigger> [<checkpoint-location>]");
36 |       System.exit(1);
37 |     }
38 | 
39 |     String logProject = args[0];
40 |     String logStore = args[1];
41 |     String accessKeyId = args[2];
42 |     String accessKeySecret = args[3];
43 |     String endpoint = args[4];
44 |     String startingOffsets = args[5];
45 |     String maxOffsetsPerTrigger = args[6];
46 |     String checkpointLocation = "/tmp/temporary-" + UUID.randomUUID().toString();
47 |     if (args.length > 7) {
48 |       checkpointLocation = args[7];
49 |     }
50 | 
51 |     SparkSession spark = SparkSession
52 |         .builder()
53 |         .master("local[5]")
54 |         .appName("E-MapReduce Demo 6-4: Spark SLS Demo (Java)")
55 |         .getOrCreate();
56 | 
57 |     spark.sparkContext().setLogLevel("WARN");
58 | 
59 |     Dataset<String> lines = spark.readStream()
60 |         .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider")
61 |         .option("sls.project", logProject)
62 |         .option("sls.store", logStore)
63 |         .option("access.key.id", accessKeyId)
64 |         .option("access.key.secret", accessKeySecret)
65 |         .option("endpoint", endpoint)
66 |         .option("startingoffsets", startingOffsets)
67 |         .option("zookeeper.connect.address", "localhost:2181")
68 |         .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger)
69 |         .load()
70 |         .selectExpr("CAST(__value__ AS STRING)")
71 |         .as(Encoders.STRING());
72 | 
73 |     // Generate running word count
74 |     Dataset<Row> wordCounts = lines.flatMap(
75 |         (FlatMapFunction<String, String>) x -> Arrays.asList(x.split(" ")).iterator(),
76 |         Encoders.STRING()).groupBy("value").count();
77 | 
78 |     // Start running the query that prints the running counts to the console
79 |     StreamingQuery query = wordCounts.writeStream()
80 |         .outputMode("complete")
81 |         .format("console")
82 |         .option("checkpointLocation", checkpointLocation)
83 |         .start();
84 | 
85 |     query.awaitTermination();
86 |   }
87 | }
88 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/streaming/JavaLoghubWordCount.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark.streaming;
19 | 
20 | import org.apache.spark.SparkConf;
21 | import org.apache.spark.api.java.function.FlatMapFunction;
22 | import org.apache.spark.api.java.function.Function;
23 | import org.apache.spark.api.java.function.Function2;
24 | import org.apache.spark.api.java.function.PairFunction;
25 | import org.apache.spark.storage.StorageLevel;
26 | import org.apache.spark.streaming.Duration;
27 | import org.apache.spark.streaming.aliyun.logservice.LoghubUtils;
28 | import org.apache.spark.streaming.api.java.JavaDStream;
29 | import org.apache.spark.streaming.api.java.JavaPairDStream;
30 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
31 | import scala.Tuple2;
32 | 
33 | import java.util.Arrays;
34 | import java.util.Iterator;
35 | import java.util.regex.Pattern;
36 | 
37 | public class JavaLoghubWordCount {
38 |   private static final Pattern SPACE = Pattern.compile(" ");
39 | 
40 |   public static void main(String[] args) throws InterruptedException {
41 |     if (args.length < 6) {
42 |       System.err.println("Usage: bin/spark-submit --class JavaLoghubWordCount " +
43 |           "examples-1.0-SNAPSHOT-shaded.jar <sls project> <sls logstore> <loghub group name> " +
44 |           "<sls endpoint> <access key id> <access key secret>");
45 |       System.exit(1);
46 |     }
47 | 
48 |     String loghubProject = args[0];
49 |     String logStore = args[1];
50 |     String loghubGroupName = args[2];
51 |     String endpoint = args[3];
52 |     String accessKeyId = args[4];
53 |     String accessKeySecret = args[5];
54 | 
55 |     SparkConf conf = new SparkConf().setAppName("Loghub Sample");
56 |     JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(2000));
57 |     JavaDStream<byte[]> lines = LoghubUtils.createStream(
58 |       jssc,
59 |       loghubProject,
60 |       logStore,
61 |       loghubGroupName,
62 |       endpoint,
63 |       1,
64 |       accessKeyId,
65 |       accessKeySecret,
66 |       StorageLevel.MEMORY_AND_DISK());
67 | 
68 |     JavaDStream<String> words = lines.map(new Function<byte[], String>() {
69 |       @Override
70 |       public String call(byte[] v1) throws Exception {
71 |         return new String(v1);
72 |       }
73 |     }).flatMap(new FlatMapFunction<String, String>() {
74 |       @Override
75 |       public Iterator<String> call(String s) {
76 |         return Arrays.asList(SPACE.split(s)).iterator();
77 |       }
78 |     });
79 |     JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
80 |       new PairFunction<String, String, Integer>() {
81 |         @Override
82 |         public Tuple2<String, Integer> call(String s) {
83 |           return new Tuple2<String, Integer>(s, 1);
84 |         }
85 |       }).reduceByKey(new Function2<Integer, Integer, Integer>() {
86 |       @Override
87 |       public Integer call(Integer i1, Integer i2) {
88 |         return i1 + i2;
89 |       }
90 |     });
91 | 
92 |     wordCounts.print();
93 |     jssc.start();
94 |     jssc.awaitTermination();
95 |   }
96 | }
97 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/streaming/SparkMNSJavaDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark.streaming;
19 | 
20 | import com.google.common.collect.Lists;
21 | import org.apache.spark.SparkConf;
22 | import org.apache.spark.api.java.function.FlatMapFunction;
23 | import org.apache.spark.api.java.function.Function;
24 | import org.apache.spark.api.java.function.Function2;
25 | import org.apache.spark.api.java.function.PairFunction;
26 | import org.apache.spark.storage.StorageLevel;
27 | import org.apache.spark.streaming.Duration;
28 | import org.apache.spark.streaming.aliyun.mns.MnsUtils;
29 | import org.apache.spark.streaming.api.java.*;
30 | import scala.Tuple2;
31 | 
32 | import java.util.Iterator;
33 | import java.util.regex.Pattern;
34 | 
35 | public class SparkMNSJavaDemo {
36 |     private static final Pattern SPACE = Pattern.compile(" ");
37 | 
38 |     public static void main(String[] args) throws InterruptedException {
39 |         if (args.length < 4) {
40 |             System.err.println("Usage: bin/spark-submit --class SparkMNSJavaDemo examples-1.0-SNAPSHOT-shaded.jar <queueName> " +
41 |                     "<accessKeyId> <accessKeySecret> <endpoint>");
42 |             System.exit(1);
43 |         }
44 | 
45 |         String queueName = args[0];
46 |         String accessKeyId = args[1];
47 |         String accessKeySecret = args[2];
48 |         String endpoint = args[3];
49 | 
50 |         SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 8-2: Spark MNS Demo (Java)").setMaster("local[4]");
51 |         sparkConf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem");
52 |         sparkConf.set("spark.hadoop.mapreduce.job.run-local", "true");
53 |         // Create the context with 2 seconds batch size
54 |         JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));
55 | 
56 |         JavaReceiverInputDStream<byte[]> lines = MnsUtils.createPullingStreamAsBytes(jssc, queueName, accessKeyId,
57 |                 accessKeySecret, endpoint, StorageLevel.MEMORY_AND_DISK());
58 | 
59 |         JavaDStream<String> words = lines.map(new Function<byte[], String>() {
60 |             @Override
61 |             public String call(byte[] v1) throws Exception {
62 |                 return new String(v1);
63 |             }
64 |         }).flatMap(new FlatMapFunction<String, String>() {
65 |             @Override
66 |             public Iterator<String> call(String x) {
67 |                 return Lists.newArrayList(SPACE.split(x)).iterator();
68 |             }
69 |         });
70 |         JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
71 |                 new PairFunction<String, String, Integer>() {
72 |                     @Override
73 |                     public Tuple2<String, Integer> call(String s) {
74 |                         return new Tuple2<String, Integer>(s, 1);
75 |                     }
76 |                 }).reduceByKey(new Function2<Integer, Integer, Integer>() {
77 |             @Override
78 |             public Integer call(Integer i1, Integer i2) {
79 |                 return i1 + i2;
80 |             }
81 |         });
82 | 
83 |         wordCounts.print();
84 |         jssc.start();
85 |         jssc.awaitTermination();
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/spark/streaming/SparkRocketMQJavaDemo.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark.streaming;
19 | 
20 | import com.aliyun.openservices.ons.api.Message;
21 | import com.google.common.collect.Lists;
22 | import org.apache.spark.SparkConf;
23 | import org.apache.spark.api.java.function.FlatMapFunction;
24 | import org.apache.spark.api.java.function.Function;
25 | import org.apache.spark.api.java.function.Function2;
26 | import org.apache.spark.api.java.function.PairFunction;
27 | import org.apache.spark.storage.StorageLevel;
28 | import org.apache.spark.streaming.Duration;
29 | import org.apache.spark.streaming.aliyun.ons.OnsUtils;
30 | import org.apache.spark.streaming.api.java.JavaDStream;
31 | import org.apache.spark.streaming.api.java.JavaPairDStream;
32 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
33 | import org.apache.spark.streaming.api.java.JavaStreamingContext;
34 | import scala.Tuple2;
35 | 
36 | import java.util.Iterator;
37 | import java.util.regex.Pattern;
38 | 
39 | public class SparkRocketMQJavaDemo {
40 |     private static final Pattern SPACE = Pattern.compile(" ");
41 | 
42 |     public static void main(String[] args) throws InterruptedException {
43 |         if (args.length < 5) {
44 |             System.err.println("Usage: spark-submit --class com.aliyun.emr.example.spark.streaming.SparkRocketMQJavaDemo " +
45 |                 "examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> " +
46 |                 "<consumerId> <topic> <subExpression>");
47 |             System.exit(1);
48 |         }
49 | 
50 |         String accessKeyId = args[0];
51 |         String accessKeySecret = args[1];
52 |         String consumerId = args[2];
53 |         String topic = args[3];
54 |         String subExpression = args[4];
55 | 
56 |         SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 4-2: Spark RocketMQ Demo (Java)");
57 |         // Create the context with 2 seconds batch size
58 |         JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));
59 | 
60 |         JavaReceiverInputDStream<byte[]> lines = OnsUtils.createStream(jssc, consumerId, topic, subExpression,
61 |                 accessKeyId, accessKeySecret, StorageLevel.MEMORY_AND_DISK(), new Function<Message, byte[]>() {
62 |                     @Override
63 |                     public byte[] call(Message msg) throws Exception {
64 |                         return msg.getBody();
65 |                     }
66 |                 });
67 | 
68 |         JavaDStream<String> words = lines.map(new Function<byte[], String>() {
69 |             @Override
70 |             public String call(byte[] v1) throws Exception {
71 |                 return new String(v1);
72 |             }
73 |         }).flatMap(new FlatMapFunction<String, String>() {
74 |             @Override
75 |             public Iterator<String> call(String x) {
76 |                 return Lists.newArrayList(SPACE.split(x)).iterator();
77 |             }
78 |         });
79 |         JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
80 |                 new PairFunction<String, String, Integer>() {
81 |                     @Override
82 |                     public Tuple2<String, Integer> call(String s) {
83 |                         return new Tuple2<String, Integer>(s, 1);
84 |                     }
85 |                 }).reduceByKey(new Function2<Integer, Integer, Integer>() {
86 |             @Override
87 |             public Integer call(Integer i1, Integer i2) {
88 |                 return i1 + i2;
89 |             }
90 |         });
91 | 
92 |         wordCounts.print();
93 |         jssc.start();
94 |         jssc.awaitTermination();
95 |     }
96 | }
97 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/StormKafkaSample.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package com.aliyun.emr.example.storm;
 19 | 
 20 | import org.apache.storm.Config;
 21 | import org.apache.storm.LocalCluster;
 22 | import org.apache.storm.StormSubmitter;
 23 | import org.apache.storm.generated.AlreadyAliveException;
 24 | import org.apache.storm.generated.AuthorizationException;
 25 | import org.apache.storm.generated.InvalidTopologyException;
 26 | import org.apache.storm.hdfs.bolt.HdfsBolt;
 27 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
 28 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
 29 | import org.apache.storm.hdfs.bolt.format.FileNameFormat;
 30 | import org.apache.storm.hdfs.bolt.format.RecordFormat;
 31 | import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy;
 32 | import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy;
 33 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
 34 | import org.apache.storm.hdfs.bolt.sync.SyncPolicy;
 35 | import org.apache.storm.kafka.KafkaSpout;
 36 | import org.apache.storm.kafka.SpoutConfig;
 37 | import org.apache.storm.kafka.StringScheme;
 38 | import org.apache.storm.kafka.ZkHosts;
 39 | import org.apache.storm.spout.SchemeAsMultiScheme;
 40 | import org.apache.storm.topology.TopologyBuilder;
 41 | 
 42 | import java.util.ArrayList;
 43 | import java.util.List;
 44 | 
 45 | public class StormKafkaSample {
 46 |     public static void main(String[] args) throws AuthorizationException {
 47 |         String topic = args[0] ;
 48 |         String zk = args[1];
 49 |         String hdfsUrl = args[2];
 50 |         ZkHosts zkHosts = new ZkHosts(zk + ":2181/kafka-1.0.0");
 51 |         SpoutConfig spoutConfig = new SpoutConfig(zkHosts, topic, "/kafka-1.0.0", "MyTrack") ;
 52 |         List<String> zkServers = new ArrayList<String>() ;
 53 |         zkServers.add(zk);
 54 |         spoutConfig.zkServers = zkServers;
 55 |         spoutConfig.zkPort = 2181;
 56 |         spoutConfig.socketTimeoutMs = 60 * 1000 ;
 57 |         spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme()) ;
 58 | 
 59 |         // use "|" instead of "," for field delimiter
 60 |         RecordFormat format = new DelimitedRecordFormat()
 61 |                 .withFieldDelimiter("|");
 62 | 
 63 |         // sync the filesystem after every 1k tuples
 64 |         SyncPolicy syncPolicy = new CountSyncPolicy(1000);
 65 | 
 66 |         // rotate files when they reach 5MB
 67 |         FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, FileSizeRotationPolicy.Units.MB);
 68 | 
 69 |         FileNameFormat fileNameFormat = new DefaultFileNameFormat().withPath("/foo/");
 70 | 
 71 |         HdfsBolt bolt = new HdfsBolt()
 72 |                 .withFsUrl(hdfsUrl)
 73 |                 .withFileNameFormat(fileNameFormat)
 74 |                 .withRecordFormat(format)
 75 |                 .withRotationPolicy(rotationPolicy)
 76 |                 .withSyncPolicy(syncPolicy);
 77 | 
 78 |         TopologyBuilder builder = new TopologyBuilder() ;
 79 |         builder.setSpout("spout", new KafkaSpout(spoutConfig) ,2) ;
 80 |         builder.setBolt("bolt", bolt, 1).shuffleGrouping("spout") ;
 81 | 
 82 |         Config conf = new Config ();
 83 |         conf.setDebug(false) ;
 84 | 
 85 |         if (args.length > 3) {
 86 |             try {
 87 |                 StormSubmitter.submitTopology(args[3], conf, builder.createTopology());
 88 |             } catch (AlreadyAliveException e) {
 89 |                 e.printStackTrace();
 90 |             } catch (InvalidTopologyException e) {
 91 |                 e.printStackTrace();
 92 |             }
 93 |         } else {
 94 |             LocalCluster localCluster = new LocalCluster();
 95 |             localCluster.submitTopology("mytopology", conf, builder.createTopology());
 96 |         }
 97 | 
 98 |     }
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/AbstractTopology.java:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.storm.benchmark;
 2 | 
 3 | import com.aliyun.emr.example.storm.benchmark.util.Helper;
 4 | import org.apache.commons.lang.StringUtils;
 5 | import org.apache.storm.Config;
 6 | import org.apache.storm.LocalCluster;
 7 | import org.apache.storm.StormSubmitter;
 8 | import org.apache.storm.generated.StormTopology;
 9 | 
10 | import java.io.BufferedInputStream;
11 | import java.io.FileInputStream;
12 | import java.io.InputStream;
13 | import java.io.Serializable;
14 | import java.util.HashMap;
15 | import java.util.Map;
16 | import java.util.Properties;
17 | 
18 | abstract public class AbstractTopology implements Serializable{
19 |     protected Properties configure;
20 | 
21 |     public void init(String configFilepath) throws Exception {
22 |         init(configFilepath, "");
23 |     }
24 | 
25 |     public void init(String configFilepath, String properties) throws Exception {
26 |         InputStream in = new BufferedInputStream(new FileInputStream(configFilepath));
27 |         configure = new Properties();
28 |         configure.load(in);
29 | 
30 |         if (! StringUtils.isBlank(properties)) {
31 |             Map<String, String> customProperty = new HashMap<>();
32 |             for (String item : properties.split(",")) {
33 |                 String[] kv = item.split("=");
34 |                 if (kv.length != 2) {
35 |                     System.out.println("invalid property[" + item + "], pattern should be k1=v2,k2=v2...");
36 |                     continue;
37 |                 }
38 |                 customProperty.put(kv[0], kv[1]);
39 |             }
40 |             configure.putAll(customProperty);
41 |         }
42 | 
43 |         System.out.println("all configure: " + configure);
44 |     }
45 | 
46 |     public void run(boolean cluster) throws Exception {
47 |         String name = configure.getProperty("name");
48 |         Config conf = new Config();
49 | 
50 |         if (!cluster) {
51 |             new LocalCluster().submitTopology("local-" + name, conf, createTopology());
52 |             return;
53 |         }
54 | 
55 |         int slots = Integer.valueOf(configure.getProperty("worker.slot.number"));
56 |         int clusterNodes = Integer.valueOf(configure.getProperty("cluster.worker.node.number"));
57 |         int workerNumber = slots * clusterNodes;
58 |         int clusterNodeMemoryMb = Integer.valueOf(configure.getProperty("cluster.memory.per.node.mb"));
59 |         int workerMem = clusterNodeMemoryMb / slots;
60 |         conf.setNumWorkers(workerNumber);
61 |         if (!Boolean.valueOf(configure.getProperty("ack.open"))) {
62 |             conf.setNumAckers(0);
63 |         }
64 | 
65 |         conf.put("worker.heap.memory.mb", workerMem);
66 |         conf.put("topology.backpressure.enable", Boolean.valueOf(configure.getProperty("backpressure.enable")));
67 |         StormSubmitter.submitTopologyWithProgressBar(name, conf, createTopology());
68 |         Helper.setupShutdownHook(name); // handle Ctrl-C
69 | 
70 |         System.out.println("**********metrics will begin in two minute, please start to send source data to warn up**********");
71 |         for (int i = 0; i< 2; i++) {
72 |             Thread.sleep(1000 * 60);
73 |             System.out.println("...");
74 |         }
75 |         System.out.println("********** start metrics **********");
76 |         Helper.collectMetrics(name, 60);
77 |     }
78 | 
79 |     abstract StormTopology createTopology();
80 | }
81 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/BasicTopology.java:
--------------------------------------------------------------------------------
  1 | package com.aliyun.emr.example.storm.benchmark;
  2 | 
  3 | import com.google.common.collect.ImmutableMap;
  4 | import kafka.api.OffsetRequest;
  5 | import org.apache.storm.generated.StormTopology;
  6 | import org.apache.storm.kafka.*;
  7 | import org.apache.storm.kafka.bolt.KafkaBolt;
  8 | import org.apache.storm.kafka.bolt.mapper.TupleToKafkaMapper;
  9 | import org.apache.storm.kafka.bolt.selector.DefaultTopicSelector;
 10 | import org.apache.storm.topology.*;
 11 | import org.apache.storm.topology.base.BaseBasicBolt;
 12 | import org.apache.storm.tuple.Fields;
 13 | import org.apache.storm.tuple.Tuple;
 14 | import org.apache.storm.tuple.Values;
 15 | 
 16 | import java.util.Arrays;
 17 | import java.util.Properties;
 18 | 
 19 | public class BasicTopology extends AbstractTopology {
 20 | 
 21 |     @Override
 22 |     StormTopology createTopology() {
 23 |         TopologyBuilder builder = new TopologyBuilder();
 24 |         setSpout(builder);
 25 |         setBolt(builder);
 26 |         return builder.createTopology();
 27 |     }
 28 | 
 29 |     private void setSpout(TopologyBuilder builder) {
 30 |         String consumerGroup = configure.getProperty("consumer.group");
 31 |         SpoutConfig conf = new SpoutConfig(new ZkHosts(
 32 |             configure.getProperty("zookeeper.address") + ":2181" + configure.getProperty("zookeeper.root")),
 33 |             configure.getProperty("topic"), configure.getProperty("zookeeper.root"), consumerGroup);
 34 |         conf.zkPort = 2181;
 35 |         conf.zkServers= Arrays.asList(configure.getProperty("zookeeper.address"));
 36 |         conf.socketTimeoutMs = 60 * 1000;
 37 |         conf.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme());
 38 |         conf.startOffsetTime= OffsetRequest.LatestTime();
 39 |         conf.ignoreZkOffsets = true;
 40 |         KafkaSpout spout = new KafkaSpout(conf);
 41 | 
 42 |         int kafkaPartition = Integer.valueOf(configure.getProperty("partition.number"));
 43 |         builder.setSpout("spout", spout, kafkaPartition);
 44 |     }
 45 | 
 46 |     protected void setBolt(TopologyBuilder builder) {
 47 |         int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total"));
 48 |         int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number"));
 49 | 
 50 | 
 51 |         //inter bolt
 52 |         //builder.setBolt("inter-bolt", getInterBolt(), availableCores).localOrShuffleGrouping("spout");
 53 | 
 54 |         //kafka storm-bolt
 55 |         builder.setBolt("kafka-bolt", getKafkaBolt(), availableCores).localOrShuffleGrouping("spout");
 56 |     }
 57 | 
 58 |     private IBasicBolt getInterBolt() {
 59 |         return  new BaseBasicBolt() {
 60 |             @Override
 61 |             public void execute(Tuple input, BasicOutputCollector collector) {
 62 |                 collector.emit(new Values(input));
 63 |             }
 64 | 
 65 |             @Override
 66 |             public void declareOutputFields(OutputFieldsDeclarer declarer) {
 67 |                 declarer.declare(new Fields("inter-bolt"));
 68 |             }
 69 |         };
 70 |     }
 71 | 
 72 |     private IRichBolt getKafkaBolt() {
 73 |         Properties properties = new Properties();
 74 |         properties.put("bootstrap.servers", configure.getProperty("result.broker.list"));
 75 |         properties.put("acks", "0");
 76 |         properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
 77 |         properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
 78 |         // consume too much memory
 79 |         //properties.put("batch.size", "10485760");
 80 |         //properties.put("max.request", "10485760");
 81 |         //properties.put("send.buffer.bytes", "1000000");
 82 |         KafkaBolt bolt = new KafkaBolt<String, String>()
 83 |             .withProducerProperties(properties)
 84 |             .withTopicSelector(new DefaultTopicSelector(configure.getProperty("result.topic")))
 85 |             .withTupleToKafkaMapper(new TupleToKafkaMapper<String, String>() {
 86 |                 @Override
 87 |                 public String getKeyFromTuple(Tuple tuple) {
 88 |                     return null;
 89 |                 }
 90 | 
 91 |                 @Override
 92 |                 public String getMessageFromTuple(Tuple tuple) {
 93 | 
 94 |                     ImmutableMap<String, String> kv = (ImmutableMap<String, String>)tuple.getValue(0);
 95 |                     return kv.keySet().iterator().next() + "," + System.currentTimeMillis();
 96 | 
 97 |                 }
 98 |             });
 99 |         bolt.setFireAndForget(true);
100 |         bolt.setAsync(true);
101 |         return bolt;
102 |     }
103 | 
104 |     public static void main(String[] args) throws Exception {
105 |         BasicTopology basicTopology = new BasicTopology();
106 |         if (args.length > 1) {
107 |             if (!"--property".equals(args[1])) {
108 |                 System.out.println("unknow option: " + args[1]);
109 |                 System.out.println("usage storm jar examples-1.1-shaded.jar  com.aliyun.emr.example.storm.benchmark.BasicTopology benchmark.properties --property k1=v1,k2=v2");
110 |                 System.exit(1);
111 |             }
112 |             basicTopology.init(args[0], args[2]);
113 |         } else {
114 |             basicTopology.init(args[0]);
115 |         }
116 | 
117 |         basicTopology.run(true);
118 |     }
119 | }
120 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/KafkaHdfs.java:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.storm.benchmark;
 2 | 
 3 | import org.apache.storm.hdfs.bolt.HdfsBolt;
 4 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
 5 | import org.apache.storm.hdfs.bolt.format.RecordFormat;
 6 | import org.apache.storm.hdfs.bolt.rotation.NoRotationPolicy;
 7 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
 8 | import org.apache.storm.topology.IRichBolt;
 9 | import org.apache.storm.topology.TopologyBuilder;
10 | import org.apache.storm.tuple.Tuple;
11 | 
12 | import java.util.Map;
13 | 
14 | public class KafkaHdfs extends BasicTopology{
15 | 
16 |     protected void setBolt(TopologyBuilder builder) {
17 |         int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total"));
18 |         int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number"));
19 | 
20 |         builder.setBolt("hdfs-bolt", getHdfsBolt(), availableCores).localOrShuffleGrouping("spout");
21 |     }
22 | 
23 |     private IRichBolt getHdfsBolt() {
24 | 
25 |         String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/";
26 |         HdfsBolt bolt = new HdfsBolt()
27 |             .withFsUrl(configure.getProperty("url"))
28 |             .withFileNameFormat(new DefaultFileNameFormat().withPrefix(filenamePrefix))
29 |             .withRecordFormat(new RecordFormat() {
30 |                 @Override
31 |                 public byte[] format(Tuple tuple) {
32 |                     String eventTime = ((Map<String, String>)tuple.getValue(0)).keySet().iterator().next();
33 |                     String output = eventTime + "," + System.currentTimeMillis() + System.lineSeparator();
34 |                     return output.getBytes();
35 |                 }
36 |             })
37 |             .withSyncPolicy(new CountSyncPolicy(1000))
38 |             .withRotationPolicy(new NoRotationPolicy());
39 |         return bolt;
40 |     }
41 | 
42 |     public static void main(String[] args) throws Exception {
43 |         KafkaHdfs topology = new KafkaHdfs();
44 |         if (args.length > 1) {
45 |             if (!"--property".equals(args[1])) {
46 |                 System.out.println("unknow option: " + args[1]);
47 |                 System.out.println("usage storm jar examples-1.1-shaded.jar  com.aliyun.emr.example.storm.benchmark.KafkaHdfs benchmark.properties --property k1=v1,k2=v2");
48 |                 System.exit(1);
49 |             }
50 |             topology.init(args[0], args[2]);
51 |         } else {
52 |             topology.init(args[0]);
53 |         }
54 | 
55 |         topology.run(true);
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/TridentWordCount.java:
--------------------------------------------------------------------------------
  1 | package com.aliyun.emr.example.storm.benchmark;
  2 | 
  3 | import kafka.api.OffsetRequest;
  4 | import org.apache.storm.generated.StormTopology;
  5 | import org.apache.storm.hdfs.trident.HdfsState;
  6 | import org.apache.storm.hdfs.trident.HdfsStateFactory;
  7 | import org.apache.storm.hdfs.trident.HdfsUpdater;
  8 | import org.apache.storm.hdfs.trident.format.DefaultFileNameFormat;
  9 | import org.apache.storm.hdfs.trident.format.DelimitedRecordFormat;
 10 | import org.apache.storm.hdfs.trident.rotation.NoRotationPolicy;
 11 | import org.apache.storm.kafka.KeyValueSchemeAsMultiScheme;
 12 | import org.apache.storm.kafka.StringKeyValueScheme;
 13 | import org.apache.storm.kafka.ZkHosts;
 14 | import org.apache.storm.kafka.trident.TransactionalTridentKafkaSpout;
 15 | import org.apache.storm.kafka.trident.TridentKafkaConfig;
 16 | import org.apache.storm.trident.TridentTopology;
 17 | import org.apache.storm.trident.operation.BaseFunction;
 18 | import org.apache.storm.trident.operation.TridentCollector;
 19 | import org.apache.storm.trident.state.StateFactory;
 20 | import org.apache.storm.trident.tuple.TridentTuple;
 21 | import org.apache.storm.tuple.Fields;
 22 | import org.apache.storm.tuple.Values;
 23 | 
 24 | import java.util.HashMap;
 25 | import java.util.Map;
 26 | 
 27 | public class TridentWordCount extends AbstractTopology {
 28 | 
 29 |     @Override
 30 |     StormTopology createTopology() {
 31 |         int partition = Integer.valueOf(configure.getProperty("partition.number"));
 32 | 
 33 |         TridentTopology topology = new TridentTopology();
 34 |         TransactionalTridentKafkaSpout spout = createSpout();
 35 | 
 36 |         topology.newStream("kafka-spout", spout).name("kafka").parallelismHint(partition)
 37 |             .each(spout.getOutputFields(), new WordCount(), new Fields("eventTime", "finishTime")).name("word-count")
 38 |             .partitionPersist(createHdfsState("eventTime", "finishTime"), new Fields("eventTime", "finishTime"), new HdfsUpdater(), new Fields("eventTime", "finishTime"));
 39 |         return topology.build();
 40 |     }
 41 | 
 42 |     private TransactionalTridentKafkaSpout createSpout() {
 43 |         String consumerGroup = configure.getProperty("consumer.group");
 44 |         ZkHosts zkHost = new ZkHosts(configure.getProperty("zookeeper.address") + ":2181" + configure.getProperty("zookeeper.root"));
 45 |         TridentKafkaConfig config = new TridentKafkaConfig(zkHost, configure.getProperty("topic"), consumerGroup);
 46 |         config.socketTimeoutMs = 60 * 1000;
 47 |         config.ignoreZkOffsets=true;
 48 |         config.startOffsetTime= OffsetRequest.LatestTime();
 49 |         config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme());
 50 |         config.startOffsetTime = OffsetRequest.LatestTime();
 51 |         return new TransactionalTridentKafkaSpout(config);
 52 |     }
 53 | 
 54 |     private StateFactory createHdfsState(String... fileds) {
 55 |         String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/";
 56 | 
 57 |         HdfsState.Options options =  new HdfsState.HdfsFileOptions()
 58 |             .withFsUrl(configure.getProperty("url"))
 59 |             .withFileNameFormat(new DefaultFileNameFormat().withPath(filenamePrefix))
 60 |             .withRecordFormat(new DelimitedRecordFormat().withFields(new Fields(fileds)))
 61 |             .withRotationPolicy(new NoRotationPolicy());
 62 |         return new HdfsStateFactory().withOptions(options);
 63 |     }
 64 | 
 65 |     private class WordCount extends BaseFunction {
 66 |         private HashMap<String, Integer> count = new HashMap<>();
 67 |         @Override
 68 |         public void execute(TridentTuple tuple, TridentCollector collector) {
 69 |             // for test
 70 |             Map<String, String> kv = (Map<String, String>)tuple.get(0);
 71 |             for (Map.Entry<String, String> item: kv.entrySet()) {
 72 |                 String eventTime = item.getKey();
 73 |                 String words = item.getValue();
 74 |                 for  (String word: words.split("\\s+")) {
 75 |                     Integer number = count.get(word);
 76 |                     if (number == null) {
 77 |                         number = 0;
 78 |                     }
 79 |                     number++;
 80 |                     count.put(word, number);
 81 | 
 82 |                 }
 83 |                 collector.emit(new Values(eventTime, System.currentTimeMillis()));
 84 |             }
 85 | 
 86 |         }
 87 |     }
 88 | 
 89 |     public static void main(String[] args) throws Exception {
 90 |         TridentWordCount wordCount = new TridentWordCount();
 91 |         if (args.length > 1) {
 92 |             if (!"--property".equals(args[1])) {
 93 |                 System.out.println("unknow option: " + args[1]);
 94 |                 System.out.println("usage storm jar examples-1.1-shaded.jar  com.aliyun.emr.example.storm.benchmark.TridentWordCount benchmark.properties --property k1=v1,k2=v2");
 95 |                 System.exit(1);
 96 |             }
 97 |             wordCount.init(args[0], args[2]);
 98 |         } else {
 99 |             wordCount.init(args[0]);
100 |         }
101 |         wordCount.run(true);
102 |     }
103 | }
104 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/WindowedWordCount.java:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.storm.benchmark;
 2 | 
 3 | import org.apache.storm.hdfs.bolt.HdfsBolt;
 4 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
 5 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
 6 | import org.apache.storm.hdfs.bolt.rotation.NoRotationPolicy;
 7 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
 8 | import org.apache.storm.task.OutputCollector;
 9 | import org.apache.storm.task.TopologyContext;
10 | import org.apache.storm.topology.OutputFieldsDeclarer;
11 | import org.apache.storm.topology.TopologyBuilder;
12 | import org.apache.storm.topology.base.BaseWindowedBolt;
13 | import org.apache.storm.tuple.Fields;
14 | import org.apache.storm.tuple.Tuple;
15 | import org.apache.storm.tuple.Values;
16 | import org.apache.storm.windowing.TupleWindow;
17 | import org.apache.storm.topology.base.BaseWindowedBolt.Count;
18 | 
19 | import java.util.HashMap;
20 | import java.util.Map;
21 | 
22 | public class WindowedWordCount extends BasicTopology {
23 |     @Override
24 |     protected void setBolt(TopologyBuilder builder) {
25 |         int windowLength = Integer.valueOf(configure.getProperty("window.length"));
26 |         int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total"));
27 |         int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number"));
28 |         int parallelism =  availableCores / 2;
29 | 
30 |         int slidingInterval = Integer.valueOf(configure.getProperty("slide.interval"));
31 | 
32 |         builder.setBolt("count", new SplitCount().withWindow(new Count(windowLength), new Count(slidingInterval)), parallelism).localOrShuffleGrouping("spout");
33 | 
34 |         String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/";
35 |         HdfsBolt bolt = new HdfsBolt()
36 |             .withFsUrl(configure.getProperty("url"))
37 |             .withFileNameFormat(new DefaultFileNameFormat().withPrefix(filenamePrefix))
38 |             .withRecordFormat(new DelimitedRecordFormat().withFieldDelimiter(","))
39 |             .withSyncPolicy(new CountSyncPolicy(1000))
40 |             .withRotationPolicy(new NoRotationPolicy());
41 |         builder.setBolt("hdfs-bolt", bolt, parallelism).localOrShuffleGrouping("count");
42 |     }
43 | 
44 |     private class SplitCount extends BaseWindowedBolt {
45 |         private OutputCollector collector;
46 |         private Map<String, Integer> counter = new HashMap<>();
47 | 
48 |         @Override
49 |         public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
50 |             super.prepare(stormConf, context, collector);
51 |             this.collector = collector;
52 |         }
53 | 
54 |         @Override
55 |         public void execute(TupleWindow inputWindow) {
56 |             for ( Tuple tuple : inputWindow.get()) {
57 |                 Map<String, String> value = (Map)tuple.getValue(0);
58 |                 for (Map.Entry<String, String> item : value.entrySet()) {
59 |                     String eventTime = item.getKey();
60 |                     String words = item.getValue();
61 |                     for (String word: words.split("\\s+")) {
62 |                         Integer number = counter.get(word);
63 |                         if (number == null) {
64 |                             number = 0;
65 |                         }
66 |                         number++;
67 |                         counter.put(word, number);
68 |                     }
69 |                     collector.emit(new Values(eventTime, System.currentTimeMillis()));
70 |                 }
71 |             }
72 | 
73 |         }
74 | 
75 |         @Override
76 |         public void declareOutputFields(OutputFieldsDeclarer declarer) {
77 |             declarer.declare(new Fields("eventTime", "finishTime"));
78 |         }
79 |     }
80 | 
81 |     public static void main(String[] args) throws Exception {
82 |         WindowedWordCount wordCount = new WindowedWordCount();
83 |         if (args.length > 1) {
84 |             if (!"--property".equals(args[1])) {
85 |                 System.out.println("unknow option: " + args[1]);
86 |                 System.out.println("usage storm jar examples-1.1-shaded.jar  com.aliyun.emr.example.storm.benchmark.WindowedWordCount benchmark.properties --property k1=v1,k2=v2");
87 |                 System.exit(1);
88 |             }
89 |             wordCount.init(args[0], args[2]);
90 |         } else {
91 |             wordCount.init(args[0]);
92 |         }
93 |         wordCount.run(true);
94 |     }
95 | }
96 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/WordCount.java:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.storm.benchmark;
 2 | 
 3 | import org.apache.storm.hdfs.bolt.HdfsBolt;
 4 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
 5 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
 6 | import org.apache.storm.hdfs.bolt.rotation.NoRotationPolicy;
 7 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
 8 | import org.apache.storm.topology.BasicOutputCollector;
 9 | import org.apache.storm.topology.OutputFieldsDeclarer;
10 | import org.apache.storm.topology.TopologyBuilder;
11 | import org.apache.storm.topology.base.BaseBasicBolt;
12 | import org.apache.storm.tuple.Fields;
13 | import org.apache.storm.tuple.Tuple;
14 | import org.apache.storm.tuple.Values;
15 | 
16 | import java.util.HashMap;
17 | import java.util.Map;
18 | 
19 | public class WordCount extends BasicTopology {
20 |     @Override
21 |     protected void setBolt(TopologyBuilder builder) {
22 |         int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total"));
23 |         int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number"));
24 | 
25 |         int hdfsParallelismFactor =  Integer.parseInt(configure.getProperty("hdfs.parallelism.factor"));
26 |         int hdfsParallelism = availableCores * hdfsParallelismFactor / (hdfsParallelismFactor + 1);
27 |         builder.setBolt("split-count", new SplitCount(), availableCores - hdfsParallelism).localOrShuffleGrouping("spout");
28 | 
29 |         String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/";
30 |         HdfsBolt bolt = new HdfsBolt()
31 |             .withFsUrl(configure.getProperty("url"))
32 |             .withFileNameFormat(new DefaultFileNameFormat().withPrefix(filenamePrefix))
33 |             .withRecordFormat(new DelimitedRecordFormat().withFieldDelimiter(","))
34 |             .withSyncPolicy(new CountSyncPolicy(1000))
35 |             .withRotationPolicy(new NoRotationPolicy());
36 |         builder.setBolt("hdfs-bolt", bolt, hdfsParallelism).localOrShuffleGrouping("split-count");
37 |     }
38 | 
39 | 
40 |     private class SplitCount extends BaseBasicBolt {
41 |         private Map<String, Integer> counter = new HashMap<>();
42 | 
43 |         @Override
44 |         public void execute(Tuple input, BasicOutputCollector collector) {
45 |             Map<String, String> value = (Map<String, String>)input.getValue(0);
46 |             for (Map.Entry<String, String>item : value.entrySet()) {
47 |                 String eventTime = item.getKey();
48 |                 String words = item.getValue();
49 | 
50 |                 for (String word : words.split("\\s+")) {
51 |                     Integer number = counter.get(word);
52 |                     if (number == null) {
53 |                         number = 0;
54 |                     }
55 |                     number++;
56 |                     counter.put(word, number);
57 |                 }
58 |                 collector.emit(new Values(eventTime, System.currentTimeMillis()));
59 |             }
60 | 
61 |         }
62 | 
63 |         @Override
64 |         public void declareOutputFields(OutputFieldsDeclarer declarer) {
65 |             declarer.declare(new Fields("eventTime", "finishTime"));
66 |         }
67 |     }
68 | 
69 |     public static void main(String[] args) throws Exception {
70 |         WordCount wordCount = new WordCount();
71 |         if (args.length > 1) {
72 |             if (!"--property".equals(args[1])) {
73 |                 System.out.println("unknow option: " + args[1]);
74 |                 System.out.println("usage storm jar examples-1.1-shaded.jar  com.aliyun.emr.example.storm.benchmark.WordCount benchmark.properties --property k1=v1,k2=v2");
75 |                 System.exit(1);
76 |             }
77 |             wordCount.init(args[0], args[2]);
78 |         } else {
79 |             wordCount.init(args[0]);
80 |         }
81 |         wordCount.run(true);
82 |     }
83 | }
84 | 


--------------------------------------------------------------------------------
/src/main/java/com/aliyun/emr/example/storm/benchmark/util/Helper.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  The ASF licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License
 17 |  */
 18 | 
 19 | package com.aliyun.emr.example.storm.benchmark.util;
 20 | 
 21 | import org.apache.storm.Config;
 22 | import org.apache.storm.LocalCluster;
 23 | import org.apache.storm.StormSubmitter;
 24 | import org.apache.storm.generated.KillOptions;
 25 | import org.apache.storm.generated.Nimbus;
 26 | import org.apache.storm.generated.StormTopology;
 27 | import org.apache.storm.perf.utils.BasicMetricsCollector;
 28 | import org.apache.storm.utils.NimbusClient;
 29 | import org.apache.storm.utils.Utils;
 30 | 
 31 | import java.util.Map;
 32 | 
 33 | public class Helper {
 34 | 
 35 |   public static void kill(Nimbus.Client client, String topoName) throws Exception {
 36 |     KillOptions opts = new KillOptions();
 37 |     opts.set_wait_secs(0);
 38 |     client.killTopologyWithOpts(topoName, opts);
 39 |   }
 40 | 
 41 |   public static void killAndShutdownCluster(LocalCluster cluster, String topoName) throws Exception {
 42 |     KillOptions opts = new KillOptions();
 43 |     opts.set_wait_secs(0);
 44 |     cluster.killTopologyWithOpts(topoName, opts);
 45 |     cluster.shutdown();
 46 |   }
 47 | 
 48 | 
 49 |     public static LocalCluster runOnLocalCluster(String topoName, StormTopology topology) {
 50 |         LocalCluster cluster = new LocalCluster();
 51 |         cluster.submitTopology(topoName, new Config(), topology);
 52 |         return cluster;
 53 |     }
 54 | 
 55 |     public static int getInt(Map map, Object key, int def) {
 56 |         return Utils.getInt(Utils.get(map, key, def));
 57 |     }
 58 | 
 59 |     public static String getStr(Map map, Object key) {
 60 |         return (String) map.get(key);
 61 |     }
 62 | 
 63 |     public static void collectMetrics(String topologyName, Integer pollInterval) throws Exception {
 64 |         Map clusterConf = Utils.readStormConfig();
 65 |         Nimbus.Client client = NimbusClient.getConfiguredClient(clusterConf).getClient();
 66 |         BasicMetricsCollector metricsCollector = new BasicMetricsCollector(client, topologyName, clusterConf);
 67 | 
 68 |         try {
 69 |             while (true){
 70 |                 metricsCollector.collect(client);
 71 |                 Thread.sleep(pollInterval * 1000);
 72 |             }
 73 |         } finally {
 74 |             metricsCollector.close();
 75 |             kill(client, topologyName);
 76 |         }
 77 | 
 78 |     }
 79 | 
 80 |     public static void collectMetricsAndKill(String topologyName, Integer pollInterval, Integer duration) throws Exception {
 81 |         Map clusterConf = Utils.readStormConfig();
 82 |         Nimbus.Client client = NimbusClient.getConfiguredClient(clusterConf).getClient();
 83 |         BasicMetricsCollector metricsCollector = new BasicMetricsCollector(client, topologyName, clusterConf);
 84 | 
 85 |         int times = duration / pollInterval;
 86 |         metricsCollector.collect(client);
 87 |         for (int i = 0; i < times; i++) {
 88 |             Thread.sleep(pollInterval * 1000);
 89 |             metricsCollector.collect(client);
 90 |         }
 91 |         metricsCollector.close();
 92 |         kill(client, topologyName);
 93 |     }
 94 | 
 95 |     public static void collectLocalMetricsAndKill(LocalCluster localCluster, String topologyName, Integer pollInterval, Integer duration, Map clusterConf) throws Exception {
 96 |         BasicMetricsCollector metricsCollector = new BasicMetricsCollector(localCluster, topologyName, clusterConf);
 97 | 
 98 |         int times = duration / pollInterval;
 99 |         metricsCollector.collect(localCluster);
100 |         for (int i = 0; i < times; i++) {
101 |             Thread.sleep(pollInterval * 1000);
102 |             metricsCollector.collect(localCluster);
103 |         }
104 |         metricsCollector.close();
105 |         killAndShutdownCluster(localCluster, topologyName);
106 |     }
107 | 
108 |     /** Kill topo and Shutdown local cluster on Ctrl-C */
109 |   public static void setupShutdownHook(final LocalCluster cluster, final String topoName) {
110 |     Runtime.getRuntime().addShutdownHook(new Thread() {
111 |       public void run() {
112 |         cluster.killTopology(topoName);
113 |         System.out.println("Killed Topology");
114 |         cluster.shutdown();
115 |       }
116 |     });
117 |   }
118 | 
119 |   /** Kill topo on Ctrl-C */
120 |   public static void setupShutdownHook(final String topoName) {
121 |     Map clusterConf = Utils.readStormConfig();
122 |     final Nimbus.Client client = NimbusClient.getConfiguredClient(clusterConf).getClient();
123 |     Runtime.getRuntime().addShutdownHook(new Thread() {
124 |       public void run() {
125 |         try {
126 |           Helper.kill(client, topoName);
127 |           System.out.println("Killed Topology");
128 |         } catch (Exception e) {
129 |           e.printStackTrace();
130 |         }
131 |       }
132 |     });
133 |   }
134 | 
135 |     public static void runOnClusterAndPrintMetrics(Integer durationSec, String topoName, Map topoConf, StormTopology topology) throws Exception {
136 |       // submit topology
137 |       StormSubmitter.submitTopologyWithProgressBar(topoName, topoConf, topology);
138 |       setupShutdownHook(topoName); // handle Ctrl-C
139 | 
140 |       // poll metrics every minute, then kill topology after specified duration
141 |       Integer pollIntervalSec = 60;
142 |       collectMetricsAndKill(topoName, pollIntervalSec, durationSec);
143 |     }
144 | }
145 | 


--------------------------------------------------------------------------------
/src/main/pig/sample.pig:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  The ASF licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | -- Query Phrase Popularity (Hadoop cluster)
20 | 
21 | -- This script processes a search query log file from the Excite search engine and finds search phrases that occur with particular high frequency during certain times of the day.
22 | 
23 | 
24 | -- Register the tutorial JAR file so that the included UDFs can be called in the script.
25 | REGISTER $tutorial;
26 | 
27 | -- Use the  PigStorage function to load the excite log file into the ?raw? bag as an array of records.
28 | -- Input: (user,time,query)
29 | raw = LOAD '$input' USING PigStorage('\t') AS (user, time, query);
30 | 
31 | 
32 | -- Call the NonURLDetector UDF to remove records if the query field is empty or a URL.
33 | clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
34 | 
35 | -- Call the ToLower UDF to change the query field to lowercase.
36 | clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
37 | 
38 | -- Because the log file only contains queries for a single day, we are only interested in the hour.
39 | -- The excite query log timestamp format is YYMMDDHHMMSS.
40 | -- Call the ExtractHour UDF to extract the hour (HH) from the time field.
41 | houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
42 | 
43 | -- Call the NGramGenerator UDF to compose the n-grams of the query.
44 | ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
45 | 
46 | -- Use the  DISTINCT command to get the unique n-grams for all records.
47 | ngramed2 = DISTINCT ngramed1;
48 | 
49 | -- Use the  GROUP command to group records by n-gram and hour.
50 | hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
51 | 
52 | -- Use the  COUNT function to get the count (occurrences) of each n-gram.
53 | hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
54 | 
55 | -- Use the  GROUP command to group records by n-gram only.
56 | -- Each group now corresponds to a distinct n-gram and has the count for each hour.
57 | uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
58 | 
59 | -- For each group, identify the hour in which this n-gram is used with a particularly high frequency.
60 | -- Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram.
61 | uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1));
62 | 
63 | -- Use the  FOREACH-GENERATE command to assign names to the fields.
64 | uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean;
65 | 
66 | -- Use the  FILTER command to move all records with a score less than or equal to 2.0.
67 | filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0;
68 | 
69 | -- Use the  ORDER command to sort the remaining records by hour and score.
70 | ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score;
71 | 
72 | -- Use the  PigStorage function to store the results.
73 | -- Output: (hour, n-gram, score, count, average_counts_among_all_hours)
74 | STORE ordered_uniq_frequency INTO '$results' USING PigStorage();


--------------------------------------------------------------------------------
/src/main/python/deeplearning/tf_fm_on_spark.py:
--------------------------------------------------------------------------------
  1 | from pylearning.model.tensorflow_base import tensorflow_base
  2 | from pyspark.sql import SparkSession
  3 | from pyspark import SparkContext
  4 | import os
  5 | import random
  6 | import numpy as np
  7 | 
  8 | from pyspark.sql.functions import col
  9 | import tensorflow as tf
 10 | 
 11 | class tf_fm(tensorflow_base):
 12 | 
 13 |     @staticmethod
 14 |     def pre_train(env):
 15 |         spark_context = SparkContext.getOrCreate()
 16 |         spark = SparkSession(spark_context).builder.getOrCreate()
 17 |         rating_df = spark.read.format('csv').option('header', 'True').load('/moviedata/ratings.csv')
 18 |         movie_df = spark.read.format('csv').option('header', 'True').load('/moviedata/movies.csv')
 19 | 
 20 |         # process user first
 21 |         distinct_user_df = rating_df.select('userId').distinct()
 22 |         users_number = distinct_user_df.count()
 23 |         env.get("algo")["users_number"] = str(users_number)
 24 | 
 25 |         users_row = distinct_user_df.collect()
 26 |         users = []
 27 |         users_dict = []
 28 |         users_map = {}
 29 |         for user in users_row:
 30 |             users.append(user['userId'])
 31 |         sorted_users = sorted(users)
 32 |         for user in sorted_users:
 33 |             users_dict.append((user,len(users_dict)))
 34 |             users_map[user] = len(users_map)
 35 | 
 36 |         # It is use for later process, to get the sorted user id.
 37 |         columns = ["userid","id"]
 38 |         users_sort_df = spark.createDataFrame(users_dict,columns)
 39 |         # users_sort_df.write.format("csv").save("/moviedata/sortedusers")
 40 | 
 41 |         # process genres
 42 |         geners_row = movie_df.select("genres").distinct().collect()
 43 |         genres_set = set()
 44 |         genres_map = {}
 45 |         for genres in geners_row:
 46 |             for one_genre in genres['genres'].split('|'):
 47 |                 genres_set.add(one_genre)
 48 |         for genre in genres_set:
 49 |             genres_map[genre] = len(genres_map)
 50 | 
 51 |         # join two dataframe and process later, userid(bigint) genres(string, need split), rating(float)
 52 |         joined_df = rating_df.join(movie_df, rating_df.movieId == movie_df.movieId)
 53 |         joined_df = joined_df.select(col('userId'),col('genres'),col('rating').cast('float').alias('rating'))
 54 | 
 55 |         users_map_bc = spark_context.broadcast(users_map)
 56 |         genres_map_bc = spark_context.broadcast(genres_map)
 57 |         env.get("algo")["genres_number"] = str(len(genres_map))
 58 | 
 59 |         def process_row(row):
 60 |             userId = row.userId
 61 |             genres = row.genres
 62 |             users_map_rdd = users_map_bc.value
 63 |             genres_map_rdd = genres_map_bc.value
 64 |             genres_return_list = []
 65 |             for i in genres.split("|"):
 66 |                 genres_return_list.append(str(genres_map_rdd[i]))
 67 |             return (users_map_rdd[userId], "|".join(genres_return_list), row.rating)
 68 | 
 69 |         return joined_df.rdd.map(process_row).toDF(['userId','genres','rating'])
 70 | 
 71 |     @staticmethod
 72 |     def train(dataframe, env):
 73 |         environ = os.environ
 74 |         ps_hosts = environ.get("ps_hosts").split(",")
 75 |         worker_hosts = environ.get("worker_hosts").split(",")
 76 |         job_name = environ.get("job_name")
 77 |         task_index = int(environ.get("task_index"))
 78 | 
 79 |         cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
 80 |         server = tf.train.Server(cluster,
 81 |                                  job_name= job_name,
 82 |                                  task_index=task_index)
 83 | 
 84 |         if job_name == "ps":
 85 |             server.join()
 86 |         else :
 87 |             # batch size is 2000, parameter size including embedding for user and one hot for genres
 88 |             # embedding size is 128, one hot size is 20(we can obtain it from env)
 89 |             batch_size = 2000
 90 | 
 91 |             embedding_size = 128
 92 |             genres_size = int(env.get("algo")["genres_number"])
 93 |             users_size = int(env.get("algo")["users_number"])
 94 |             p_size = embedding_size + genres_size
 95 |             k = 10
 96 |             embeddings = tf.Variable(tf.random_uniform([users_size,embedding_size], -1.0, 1.0))
 97 |             USER = tf.placeholder('int64',shape=[batch_size,1])
 98 |             ITEM = tf.placeholder('float', shape=[batch_size, genres_size])
 99 |             embed = tf.nn.embedding_lookup(embeddings, USER)
100 |             user_embed = tf.reshape(embed, shape=[batch_size, embedding_size])
101 |             X = tf.concat([user_embed, ITEM], 1)
102 |             Y = tf.placeholder('float', shape=[batch_size,1])
103 | 
104 |             w0 = tf.Variable(tf.zeros([1]))
105 |             W = tf.Variable(tf.zeros([p_size]))
106 | 
107 |             V = tf.Variable(tf.random_normal([k, p_size], stddev=0.01))
108 |             y_hat = tf.Variable(tf.zeros([batch_size, 1]))
109 | 
110 |             linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, X), 1, keep_dims=True))
111 |             interactions = (tf.multiply(0.5, tf.reduce_sum(
112 |                 tf.subtract(tf.pow(tf.matmul(X, tf.transpose(V)), 2),
113 |                             tf.matmul(tf.pow(X, 2), tf.transpose(tf.pow(V, 2)))), 1,
114 |                 keep_dims=True)))
115 | 
116 |             y_hat = tf.add(linear_terms, interactions)
117 |             lambda_w = tf.constant(0.001, name='lambda_w')
118 |             lambda_v = tf.constant(0.001, name='lambda_v')
119 | 
120 |             l2_norm = (tf.reduce_sum(
121 |                 tf.add(
122 |                     tf.multiply(lambda_w, tf.pow(W, 2)),
123 |                     tf.multiply(lambda_v, tf.pow(V, 2)))))
124 | 
125 |             error = tf.reduce_mean(tf.square(tf.subtract(Y, y_hat)))
126 | 
127 |             loss = tf.add(error, l2_norm)
128 | 
129 |             N_EPOCHS = 100
130 |             eta = tf.constant(0.1)
131 |             global_step = tf.contrib.framework.get_or_create_global_step()
132 |             optimizer = tf.train.AdagradOptimizer(eta).minimize(loss, global_step=global_step)
133 | 
134 |             init = tf.global_variables_initializer()
135 | 
136 |             def get_train_data():
137 |                 users_sub, genres_sub, rating_sub = \
138 |                     zip(*random.sample(list(zip(dataframe.userId, dataframe.genres, dataframe.rating)), batch_size))
139 |                 batch_user = np.zeros(shape=(batch_size,1), dtype=np.int64)
140 |                 batch_genre = np.zeros(shape=(batch_size,genres_size), dtype=np.float32)
141 |                 label = np.ndarray(shape=(batch_size,1), dtype = np.float32)
142 |                 for i in range(batch_size):
143 |                     batch_user[i] = users_sub[i]
144 |                     for genre in genres_sub[i].split("|"):
145 |                         batch_genre[i][int(genre)] = 1
146 |                     label[i] = rating_sub[i]
147 |                 return batch_user, batch_genre, label
148 | 
149 |             checkpoint_dir = "hdfs://emr-header-1:9000/movie"
150 |             saver = tf.train.Saver()
151 |             epoch = 0
152 | 
153 |             with tf.train.MonitoredTrainingSession(master = server.target,
154 |                                            is_chief = task_index == 0,
155 |                                            checkpoint_dir= checkpoint_dir,
156 |                                            save_checkpoint_secs=20) as sess:
157 |                 tf.reset_default_graph()
158 |                 sess.run(init)
159 |                 latest_path = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir)
160 |                 saver.restore(sess, latest_path)
161 |                 while epoch < N_EPOCHS:
162 |                     (batch_user,batch_genre,label) = get_train_data()
163 |                     sess.run(optimizer, feed_dict={USER: batch_user, ITEM: batch_genre, Y:label})
164 |                     print(sess.run(error, feed_dict={USER: batch_user, ITEM: batch_genre, Y: label}))
165 |                     epoch = epoch + 1
166 | 


--------------------------------------------------------------------------------
/src/main/python/deeplearning/train_boston.py:
--------------------------------------------------------------------------------
 1 | from pylearning.model.tensorflow_base import tensorflow_base
 2 | from pyspark.sql import SparkSession
 3 | from pyspark import SparkContext
 4 | 
 5 | import tensorflow as tf
 6 | from pyspark.sql.functions import col
 7 | 
 8 | class train_boston(tensorflow_base):
 9 |     @staticmethod
10 |     def pre_train():
11 |         spark_context = SparkContext.getOrCreate()
12 |         spark = SparkSession(spark_context).builder.getOrCreate()
13 |         df = spark.read.format('csv').option("header","True").load('/train.csv')
14 |         cast_df = df.select(*(col(c).cast("double").alias(c) for c in df.columns))
15 |         return cast_df
16 | 
17 |     @staticmethod
18 |     def train(dataframe, env):
19 |         crim = tf.feature_column.numeric_column('crim', dtype=tf.float64, shape=())
20 |         zn = tf.feature_column.numeric_column('zn', dtype=tf.float64, shape=())
21 |         indus = tf.feature_column.numeric_column('indus', dtype=tf.float64, shape=())
22 |         chas = tf.feature_column.numeric_column('chas', dtype=tf.int64, shape=())
23 |         nox = tf.feature_column.numeric_column('nox', dtype=tf.float64, shape=())
24 |         rm = tf.feature_column.numeric_column('rm', dtype=tf.float64, shape=())
25 |         age = tf.feature_column.numeric_column('age', dtype=tf.float64, shape=())
26 |         dis = tf.feature_column.numeric_column('dis', dtype=tf.float64, shape=())
27 |         rad = tf.feature_column.numeric_column('rad', dtype=tf.int64, shape=())
28 |         tax = tf.feature_column.numeric_column('tax', dtype=tf.int64, shape=())
29 |         ptratio = tf.feature_column.numeric_column('ptratio', dtype=tf.float64, shape=())
30 |         black = tf.feature_column.numeric_column('black', dtype=tf.float64, shape=())
31 |         lstat = tf.feature_column.numeric_column('lstat', dtype=tf.float64, shape=())
32 | 
33 |         feature_cols = [crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat]
34 |         feature_names = ['ID','crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black',
35 |                          'lstat']
36 |         label_name = 'medv'
37 | 
38 |         dict = {}
39 | 
40 |         index = 0
41 |         for i in feature_names:
42 |             dict[i] = index
43 |             index+=1
44 | 
45 |         def train_input():
46 |             feature_dict = {}
47 |             for i in feature_names[1:]:
48 |                 feature_dict[i] = dataframe.get(i)
49 | 
50 |             _dataset = tf.data.Dataset.from_tensor_slices((feature_dict, dataframe.get(label_name)))
51 |             dataset = _dataset.batch(32)
52 |             return dataset
53 | 
54 |         ps = tf.contrib.distribute.ParameterServerStrategy()
55 |         config = tf.estimator.RunConfig(train_distribute=ps, eval_distribute=ps)
56 |         estimator = tf.estimator.LinearRegressor(feature_columns=feature_cols, model_dir='hdfs://emr-header-1:9000/boston', config=config)
57 | 
58 |         train_spec = tf.estimator.TrainSpec(input_fn=train_input, max_steps=100)
59 |         eval_spec = tf.estimator.EvalSpec(input_fn=train_input, start_delay_secs=0, throttle_secs=10,steps=10)
60 |         tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
61 | 
62 | 


--------------------------------------------------------------------------------
/src/main/python/odps-sample.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to the Apache Software Foundation (ASF) under one or more
 3 | # contributor license agreements.  See the NOTICE file distributed with
 4 | # this work for additional information regarding copyright ownership.
 5 | # The ASF licenses this file to You under the Apache License, Version 2.0
 6 | # (the "License"); you may not use this file except in compliance with
 7 | # the License.  You may obtain a copy of the License at
 8 | #
 9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | import sys
19 | 
20 | from odps import OdpsOps
21 | from pyspark import SparkContext
22 | 
23 | if __name__ == "__main__":
24 | 
25 |     if len(sys.argv) != 7:
26 |         print >> sys.stderr, "Usage: spark-submit odps-sample.py accessKeyId accessKeySecret project table " \
27 |                              "partition numPartitions"
28 |         exit(-1)
29 | 
30 |     accessKeyId = sys.argv[1]
31 |     accessKeySecret = sys.argv[2]
32 |     odpsUrl = "http://odps-ext.aliyun-inc.com/api"
33 |     tunnelUrl = "http://dt-ext.odps.aliyun-inc.com"
34 |     project = sys.argv[3]
35 |     table = sys.argv[4]
36 |     partition = sys.argv[5]
37 |     numPartitions = sys.argv[6]
38 | 
39 |     sc = SparkContext(appName="PySpark Odps Sample")
40 | 
41 |     odpsOps = OdpsOps(sc, accessKeyId, accessKeySecret, odpsUrl, tunnelUrl)
42 | 
43 |     print "pScheme"
44 |     pSchema = odpsOps.getTableSchema(project, table, True)
45 |     for col in pSchema:
46 |         print col
47 | 
48 |     print "scheme"
49 |     schema = odpsOps.getTableSchema(project, table, False)
50 |     for col in schema:
51 |         print col
52 | 
53 |     print "ColumnByIdx"
54 |     col1 =odpsOps.getColumnByIdx(project, table, 1)
55 |     print col1
56 | 
57 |     data = sc.parallelize([[1, 1.5, False, "2014-06-11", "row 1"],
58 |                            [2, 1.5, True, "2014-06-10", "row 2"]], 2)
59 |     odpsOps.saveToPartitionTable(project, table, partition, data, isCreatePt=True, isOverWrite=False)
60 | 
61 |     nump = int(numPartitions)
62 |     rdd = odpsOps.readPartitionTable(project, table, partition, nump, batchSize=1)
63 |     rows = rdd.collect()
64 |     for row in rows:
65 |         print "row: ",
66 |         for col in row:
67 |             print col, type(col),
68 |         print ""
69 | 
70 |     print "read specific columns"
71 |     rdd2 = odpsOps.readPartitionTable(project, table, partition, nump, cols=[1, 2])
72 |     rows2 = rdd2.collect()
73 |     for row in rows2:
74 |         print "row: ",
75 |         for col in row:
76 |             print col, type(col),
77 |         print ""
78 | 


--------------------------------------------------------------------------------
/src/main/python/streaming/loghub-wordcount.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to the Apache Software Foundation (ASF) under one or more
 3 | # contributor license agreements.  See the NOTICE file distributed with
 4 | # this work for additional information regarding copyright ownership.
 5 | # The ASF licenses this file to You under the Apache License, Version 2.0
 6 | # (the "License"); you may not use this file except in compliance with
 7 | # the License.  You may obtain a copy of the License at
 8 | #
 9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | import sys
19 | 
20 | from pyspark import SparkContext
21 | from pyspark.streaming import StreamingContext
22 | from loghub import LoghubUtils
23 | 
24 | if __name__ == "__main__":
25 |     if len(sys.argv) != 8:
26 |         print >> sys.stderr, "Usage: spark-submit loghub-wordcount.py logServiceProject logsStoreName " \
27 |                              "logHubConsumerGroupName loghubEndpoint numReceiver accessKeyId accessKeySecret"
28 |         exit(-1)
29 | 
30 |     sc = SparkContext(appName="PythonStreamingLoghubWordCount")
31 |     ssc = StreamingContext(sc, 2)
32 | 
33 |     logServiceProject = sys.argv[1]
34 |     logsStoreName = sys.argv[2]
35 |     logHubConsumerGroupName = sys.argv[3]
36 |     loghubEndpoint = sys.argv[4]
37 |     numReceiver = int(sys.argv[5])
38 |     accessKeyId = sys.argv[6]
39 |     accessKeySecret = sys.argv[7]
40 | 
41 |     stream = LoghubUtils.createStreams(ssc, logServiceProject, logsStoreName, logHubConsumerGroupName, loghubEndpoint,
42 |                                        numReceiver, accessKeyId, accessKeySecret)
43 |     lines = stream.map(lambda x: x[1])
44 |     counts = lines.flatMap(lambda line: line.split(" ")) \
45 |         .map(lambda word: (word, 1)) \
46 |         .reduceByKey(lambda a, b: a+b)
47 |     counts.pprint()
48 | 
49 |     ssc.start()
50 |     ssc.awaitTermination()
51 | 


--------------------------------------------------------------------------------
/src/main/python/streaming/wcmapper.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | 
 5 | for line in sys.stdin:
 6 |     line = line.strip()
 7 |     words = line.split()
 8 |     for word in words:
 9 |         print '%s\t%s' % (word, 1)
10 | 
11 | 


--------------------------------------------------------------------------------
/src/main/python/streaming/wcreducer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from operator import itemgetter
 4 | import sys
 5 | 
 6 | current_word = None
 7 | current_count = 0
 8 | word = None
 9 | 
10 | for line in sys.stdin:
11 |     line = line.strip()
12 | 
13 |     word, count = line.split('\t', 1)
14 | 
15 |     try:
16 |         count = int(count)
17 |     except ValueError:
18 |         continue
19 | 
20 |     if current_word == word:
21 |         current_count += count
22 |     else:
23 |         if current_word:
24 |             print '%s\t%s' % (current_word, current_count)
25 |         current_count = count
26 |         current_word = word
27 | 
28 | if current_word == word:
29 |     print '%s\t%s' % (current_word, current_count)
30 | 


--------------------------------------------------------------------------------
/src/main/python/wordcount.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed to the Apache Software Foundation (ASF) under one or more
 3 | # contributor license agreements.  See the NOTICE file distributed with
 4 | # this work for additional information regarding copyright ownership.
 5 | # The ASF licenses this file to You under the Apache License, Version 2.0
 6 | # (the "License"); you may not use this file except in compliance with
 7 | # the License.  You may obtain a copy of the License at
 8 | #
 9 | #    http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | import sys
19 | from operator import add
20 | from pyspark import SparkContext
21 | from pyspark import SparkConf
22 | 
23 | if __name__ == "__main__":
24 |     conf = SparkConf()
25 |     sc = SparkContext(appName="PythonWordCount", conf=conf)
26 |     lines = sc.textFile(sys.argv[1], int(sys.argv[3]))
27 |     counts = lines.flatMap(lambda x: x.split(' ')) \
28 |                   .map(lambda x: (str(x), 1)) \
29 |                   .reduceByKey(add)
30 |     counts.saveAsTextFile(sys.argv[2])
31 |     sc.stop()


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/flink/FlinkOSSSample.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.flink
19 | 
20 | import org.apache.flink.api.java.ExecutionEnvironment
21 | import org.apache.flink.api.java.utils.ParameterTool
22 | 
23 | import scala.collection.JavaConversions._
24 | 
25 | object FlinkOSSSample {
26 |   def main(args: Array[String]) {
27 | 
28 |     val params: ParameterTool = ParameterTool.fromArgs(args)
29 | 
30 |     // set up execution environment
31 |     val env = ExecutionEnvironment.getExecutionEnvironment
32 | 
33 |     // make parameters available in the web interface
34 |     env.getConfig.setGlobalJobParameters(params)
35 | 
36 |     if (!params.has("input")) {
37 |       println("Executing WordCount example with default input data set.")
38 |       println("Use --input to specify file input.")
39 |       sys.exit(1)
40 |     }
41 |     val text = env.readTextFile(params.get("input"))
42 | 
43 |     val top10 = text.first(10)
44 | 
45 |     top10.collect().foreach(println)
46 | 
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/AbstractParams.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark
19 | 
20 | import scala.reflect.runtime.universe._
21 | 
22 | /**
23 |  * Abstract class for parameter case classes.
24 |  * This overrides the [[toString]] method to print all case class fields by name and value.
25 |  * @tparam T  Concrete parameter class.
26 |  */
27 | abstract class AbstractParams[T: TypeTag] {
28 | 
29 |   private def tag: TypeTag[T] = typeTag[T]
30 | 
31 |   /**
32 |    * Finds all case class fields in concrete class instance, and outputs them in JSON-style format:
33 |    * {
34 |    *   [field name]:\t[field value]\n
35 |    *   [field name]:\t[field value]\n
36 |    *   ...
37 |    * }
38 |    */
39 |   override def toString: String = {
40 |     val tpe = tag.tpe
41 |     val allAccessors = tpe.declarations.collect {
42 |       case m: MethodSymbol if m.isCaseAccessor => m
43 |     }
44 |     val mirror = runtimeMirror(getClass.getClassLoader)
45 |     val instanceMirror = mirror.reflect(this)
46 |     allAccessors.map { f =>
47 |       val paramName = f.name.toString
48 |       val fieldMirror = instanceMirror.reflectField(f)
49 |       val paramValue = fieldMirror.get
50 |       s"  $paramName:\t$paramValue"
51 |     }.mkString("{\n", ",\n", "\n}")
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/LinearRegression.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package com.aliyun.emr.example.spark
 19 | 
 20 | import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater}
 21 | import org.apache.spark.mllib.regression.LinearRegressionWithSGD
 22 | import org.apache.spark.mllib.util.MLUtils
 23 | import _root_.scopt.OptionParser
 24 | 
 25 | object LinearRegression extends RunLocally{
 26 |   object RegType extends Enumeration {
 27 |     type RegType = Value
 28 |     val NONE, L1, L2 = Value
 29 |   }
 30 | 
 31 |   import RegType._
 32 | 
 33 |   case class Params(
 34 |                      input: String = null,
 35 |                      numPartitions: Int = 2,
 36 |                      numIterations: Int = 100,
 37 |                      stepSize: Double = 1.0,
 38 |                      regType: RegType = L2,
 39 |                      regParam: Double = 0.01,
 40 |                      accessKeyId: String = null,
 41 |                      accessKeySecret: String = null,
 42 |                      endpoint: String = null) extends AbstractParams[Params]
 43 | 
 44 |   def main(args: Array[String]) {
 45 |     val defaultParams = Params()
 46 | 
 47 |     val parser = new OptionParser[Params]("LinearRegression") {
 48 |       head("LinearRegression: an example app for linear regression.")
 49 |       opt[Int]("numIterations")
 50 |         .text("number of iterations")
 51 |         .action((x, c) => c.copy(numIterations = x))
 52 |       opt[Double]("stepSize")
 53 |         .text(s"initial step size, default: ${defaultParams.stepSize}")
 54 |         .action((x, c) => c.copy(stepSize = x))
 55 |       opt[String]("regType")
 56 |         .text(s"regularization type (${RegType.values.mkString(",")}), " +
 57 |         s"default: ${defaultParams.regType}")
 58 |         .action((x, c) => c.copy(regType = RegType.withName(x)))
 59 |       opt[Double]("regParam")
 60 |         .text(s"regularization parameter, default: ${defaultParams.regParam}")
 61 |       arg[String]("<input>")
 62 |         .required()
 63 |         .text("input paths to labeled examples in LIBSVM format")
 64 |         .action((x, c) => c.copy(input = x))
 65 |       arg[Int]("<numPartitions>")
 66 |         .required()
 67 |         .text(s"number of partitions, default: ${defaultParams.numPartitions}")
 68 |         .action((x, c) => c.copy(numPartitions = x))
 69 |       note(
 70 |         """
 71 |           | For example, the following command runs this app on a synthetic dataset:
 72 |           |
 73 |           | bin/spark-submit --class LinearRegression examples-1.0-SNAPSHOT-shaded.jar oss://accessKeyId:accessKeySecret@bucket.endpoint/input.txt 2
 74 |         """.stripMargin)
 75 |     }
 76 | 
 77 |     parser.parse(args, defaultParams).map { params =>
 78 |       run(params)
 79 |     } getOrElse {
 80 |       sys.exit(1)
 81 |     }
 82 |   }
 83 | 
 84 |   def run(params: Params) {
 85 |     val examples = MLUtils.loadLibSVMFile(getSparkContext, params.input).cache()
 86 |     val splits = examples.randomSplit(Array(0.8, 0.2))
 87 |     val training = splits(0).cache()
 88 |     val test = splits(1).cache()
 89 | 
 90 |     val numTraining = training.count()
 91 |     val numTest = test.count()
 92 |     println(s"Training: $numTraining, test: $numTest.")
 93 | 
 94 |     examples.unpersist(blocking = false)
 95 | 
 96 |     val updater = params.regType match {
 97 |       case NONE => new SimpleUpdater()
 98 |       case L1 => new L1Updater()
 99 |       case L2 => new SquaredL2Updater()
100 |     }
101 | 
102 |     val algorithm = new LinearRegressionWithSGD()
103 |     algorithm.optimizer
104 |       .setNumIterations(params.numIterations)
105 |       .setStepSize(params.stepSize)
106 |       .setUpdater(updater)
107 |       .setRegParam(params.regParam)
108 | 
109 |     val model = algorithm.run(training)
110 | 
111 |     val prediction = model.predict(test.map(_.features))
112 |     val predictionAndLabel = prediction.zip(test.map(_.label))
113 | 
114 |     val loss = predictionAndLabel.map { case (p, l) =>
115 |       val err = p - l
116 |       err * err
117 |     }.reduce(_ + _)
118 |     val rmse = math.sqrt(loss / numTest)
119 | 
120 |     println(s"Test RMSE = $rmse.")
121 | 
122 |     getSparkContext.stop()
123 |   }
124 | 
125 |   override def getAppName: String = "LinearRegression"
126 | }
127 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/MongoDBWordCount.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark
19 | 
20 | import com.stratio.datasource.mongodb._
21 | import com.stratio.datasource.mongodb.config._
22 | import com.stratio.datasource.mongodb.config.MongodbConfig._
23 | 
24 | import org.apache.spark.sql._
25 | import org.apache.spark.sql.SQLContext
26 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
27 | 
28 | object MongoDBWordCount extends RunLocally {
29 |   def main(args: Array[String]): Unit = {
30 |     if (args.length < 12) {
31 |       System.err.println(
32 |         """Usage: bin/spark-submit --class MongoDBWordCount examples-1.0-SNAPSHOT-shaded.jar <dbName> <dbUrl> <dbPort>
33 |           |         <userName> <pwd> <collectionName> <sampleRatio> <writeConcern> <splitSize> <splitKey> <inputPath>
34 |           |         <numPartitions>
35 |           |
36 |           |Arguments:
37 |           |
38 |           |    dbName          MongoDB database name.
39 |           |    dbUrl           MongoDB database URL.
40 |           |    dbPort          MongoDB database port.
41 |           |    userName        MongoDB database user name.
42 |           |    pwd             mongoDB database password.
43 |           |    collectionName  MongoDB collection name.
44 |           |    sampleRatio     MongoDB sample ratio.
45 |           |    writeConcern    MongoDB write concern.
46 |           |    splitSize       MongoDB split size.
47 |           |    splitKey        MongoDB split key.
48 |           |    inputPath       OSS input object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/a/b.txt
49 |           |    numPartitions   RDD partition number.
50 |           |
51 |         """.stripMargin)
52 |       System.exit(1)
53 |     }
54 | 
55 |     val dbName = args(0)
56 |     val dbUrl = args(1)
57 |     val dbPort = args(2)
58 |     val userName = args(3)
59 |     val pwd = args(4)
60 |     val collectionName = args(5)
61 |     val sampleRatio = args(6).toFloat
62 |     val writeConcern = args(7)
63 |     val splitSize = args(8).toInt
64 |     val splitKey = args(9)
65 |     val inputPath = args(10)
66 |     val numPartitions = args(11).toInt
67 | 
68 |     val sqlContext = new SQLContext(getSparkContext)
69 | 
70 |     val input = getSparkContext.textFile(inputPath, numPartitions)
71 |     val counts = input.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _).map(e => Row.apply(e._1, e._2))
72 |     lazy val schema = StructType(
73 |         StructField("word", StringType) ::
74 |         StructField("count", IntegerType) :: Nil)
75 | 
76 |     val hosts = dbUrl.split(",").map(e => s"$e:$dbPort").toList
77 |     val df = sqlContext.createDataFrame(counts, schema)
78 |     val saveConfig = MongodbConfigBuilder(Map(Host -> hosts, Database -> dbName,
79 |         Collection -> collectionName, SamplingRatio -> sampleRatio, WriteConcern -> writeConcern,
80 |         SplitSize -> splitSize, SplitKey -> splitKey,
81 |         Credentials -> List(com.stratio.datasource.mongodb.config.MongodbCredentials(userName, dbName, pwd.toCharArray))))
82 |     df.saveToMongodb(saveConfig.build())
83 |   }
84 | 
85 |   override def getAppName: String = "MongoDBWordCount"
86 | }
87 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/RunLocally.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark
19 | 
20 | import org.apache.spark.{SparkConf, SparkContext}
21 | 
22 | trait RunLocally {
23 | 
24 |   def getAppName: String
25 | 
26 |   def getSparkConf: SparkConf = new SparkConf()
27 | 
28 |   def getSparkContext: SparkContext = {
29 |     val conf = getSparkConf.setAppName(getAppName).setMaster("local[4]")
30 |     conf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem")
31 |     conf.set("spark.hadoop.mapreduce.job.run-local", "true")
32 |     new SparkContext(conf)
33 |   }
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/SparkMaxComputeDemo.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark
19 | 
20 | import com.aliyun.odps.TableSchema
21 | import com.aliyun.odps.data.Record
22 | 
23 | import org.apache.spark.aliyun.odps.OdpsOps
24 | import org.apache.spark.{SparkConf, SparkContext}
25 | 
26 | object SparkMaxComputeDemo {
27 |   def main(args: Array[String]): Unit = {
28 |     if (args.length < 6) {
29 |       System.err.println(
30 |         """Usage: SparkMaxComputeDemo <accessKeyId> <accessKeySecret> <envType> <project> <table> <numPartitions>
31 |           |
32 |           |Arguments:
33 |           |
34 |           |    accessKeyId      Aliyun Access Key ID.
35 |           |    accessKeySecret  Aliyun Key Secret.
36 |           |    envType          0 or 1
37 |           |                     0: Public environment.
38 |           |                     1: Aliyun internal environment, i.e. Aliyun ECS etc.
39 |           |    project          Aliyun ODPS project
40 |           |    table            Aliyun ODPS table
41 |           |    numPartitions    the number of RDD partitions
42 |         """.stripMargin)
43 |       System.exit(1)
44 |     }
45 | 
46 |     val accessKeyId = args(0)
47 |     val accessKeySecret = args(1)
48 |     val envType = args(2).toInt
49 |     val project = args(3)
50 |     val table = args(4)
51 |     val numPartitions = args(5).toInt
52 | 
53 |     val urls = Seq(
54 |       Seq("http://service.odps.aliyun.com/api", "http://dt.odps.aliyun.com"), // public environment
55 |       Seq("http://odps-ext.aliyun-inc.com/api", "http://dt-ext.odps.aliyun-inc.com") // Aliyun internal environment
56 |     )
57 | 
58 |     val conf = new SparkConf().setAppName("E-MapReduce Demo 3-1: Spark MaxCompute Demo (Scala)")
59 |     val sc = new SparkContext(conf)
60 |     val odpsOps = envType match {
61 |       case 0 =>
62 |         OdpsOps(sc, accessKeyId, accessKeySecret, urls(0)(0), urls(0)(1))
63 |       case 1 =>
64 |         OdpsOps(sc, accessKeyId, accessKeySecret, urls(1)(0), urls(1)(1))
65 |     }
66 | 
67 |     val odpsData = odpsOps.readTable(project, table, read, numPartitions)
68 | 
69 |     println(s"Count (odpsData): ${odpsData.count()}")
70 |   }
71 | 
72 |   def read(record: Record, schema: TableSchema): Long = {
73 |     record.getBigint(0)
74 |   }
75 | }
76 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/SparkOssDemo.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark
19 | 
20 | import org.apache.hadoop.io.{LongWritable, Text}
21 | import org.apache.hadoop.mapred.TextInputFormat
22 | import org.apache.spark.SparkConf
23 | 
24 | object SparkOssDemo extends RunLocally {
25 |   var accessKeyId = ""
26 |   var accessKeySecret = ""
27 |   var endpoint = ""
28 | 
29 |   def main(args: Array[String]): Unit = {
30 |     if (args.length < 2) {
31 |       System.err.println(
32 |         """Usage: bin/spark-submit --class com.aliyun.emr.example.spark.SparkOssDemo examples-1.0-SNAPSHOT-shaded.jar
33 |           |
34 |           |Arguments:
35 |           |
36 |           |    accessKeyId      OSS accessKeyId
37 |           |    accessKeySecret  OSS accessKeySecret
38 |           |    endpoint         OSS endpoint
39 |           |    inputPath        Input OSS object path, like oss://bucket/input/a.txt
40 |           |    outputPath       Output OSS object path, like oss://bucket/output/
41 |           |    numPartitions    the number of RDD partitions.
42 |           |
43 |         """.stripMargin)
44 |       System.exit(1)
45 |     }
46 | 
47 |     accessKeyId = args(0)
48 |     accessKeySecret = args(1)
49 |     endpoint = args(2)
50 |     val inputPath = args(3)
51 |     val outputPath = args(4)
52 |     val numPartitions = args(5).toInt
53 |     val ossData = getSparkContext.hadoopFile(inputPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], numPartitions)
54 |     ossData.foreach(line => println(s"print: ${line}"))
55 | 
56 |     ossData.saveAsTextFile(outputPath)
57 |   }
58 | 
59 |   override def getAppName: String = "E-MapReduce Demo 2-1: Spark Oss Demo (Scala)"
60 | 
61 |   override def getSparkConf: SparkConf = {
62 |     val conf = new SparkConf()
63 |     conf.set("spark.hadoop.fs.oss.accessKeyId", accessKeyId)
64 |     conf.set("spark.hadoop.fs.oss.accessKeySecret", accessKeySecret)
65 |     conf.set("spark.hadoop.fs.oss.endpoint", endpoint)
66 |   }
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/SparkPi.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark
19 | 
20 | import scala.math._
21 | 
22 | /** Computes an approximation to pi */
23 | object SparkPi extends RunLocally{
24 |   def main(args: Array[String]) {
25 |     val slices = if (args.length > 0) args(0).toInt else 2
26 |     val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow
27 |     val count = getSparkContext.parallelize(1 until n, slices).map { i =>
28 |         val x = random * 2 - 1
29 |         val y = random * 2 - 1
30 |         if (x*x + y*y < 1) 1 else 0
31 |       }.reduce(_ + _)
32 |     println("Pi is roughly " + 4.0 * count / n)
33 |     getSparkContext.stop()
34 |   }
35 | 
36 |   override def getAppName: String = "SparkPi"
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/SparkRdsDemo.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark
19 | 
20 | import java.sql.{Connection, DriverManager, PreparedStatement}
21 | 
22 | object SparkRdsDemo extends RunLocally {
23 |   def main(args: Array[String]): Unit = {
24 |     if (args.length < 8) {
25 |       System.err.println(
26 |         """Usage: spark-submit --class SparkRdsDemo examples-1.0-SNAPSHOT-shaded.jar <dbName> <tbName> <dbUser>
27 |           |       <dbPwd> <dbUrl> <dbPort> <inputPath> <numPartitions>
28 |           |
29 |           |Arguments:
30 |           |
31 |           |    dbName        RDS database name.
32 |           |    tbName        RDS table name.
33 |           |    dbUser        RDS database user name.
34 |           |    dbPwd         RDS database password.
35 |           |    dbUrl         RDS database URL.
36 |           |    dbPort        RDS database port
37 |           |    inputPath     OSS input object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/a/b.txt
38 |           |    numPartitions
39 |           |
40 |         """.stripMargin)
41 |       System.exit(1)
42 |     }
43 |     val dbName = args(0)
44 |     val tbName = args(1)
45 |     val dbUser = args(2)
46 |     val dbPwd = args(3)
47 |     val dbUrl = args(4)
48 |     val dbPort = args(5)
49 |     val inputPath = args(6)
50 |     val numPartitions = args(7).toInt
51 | 
52 |     val input = getSparkContext.textFile(inputPath, numPartitions)
53 |     input.collect().foreach(println)
54 |     input.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
55 |       .mapPartitions(e => {
56 |         var conn: Connection = null
57 |         var ps: PreparedStatement = null
58 |         val sql = s"insert into $tbName(word, count) values (?, ?)"
59 |         try {
60 |           conn = DriverManager.getConnection(s"jdbc:mysql://$dbUrl:$dbPort/$dbName", dbUser, dbPwd)
61 |           ps = conn.prepareStatement(sql)
62 |           e.foreach(pair => {
63 |             ps.setString(1, pair._1)
64 |             ps.setLong(2, pair._2)
65 |             ps.executeUpdate()
66 |           })
67 | 
68 |           ps.close()
69 |           conn.close()
70 |         } catch {
71 |           case e: Exception => e.printStackTrace()
72 |         } finally {
73 |           if (ps != null) {
74 |             ps.close()
75 |           }
76 |           if (conn != null) {
77 |             conn.close()
78 |           }
79 |         }
80 |       Iterator.empty
81 |     }).count()
82 |   }
83 | 
84 |   override def getAppName: String = "E-MapReduce Demo 10: Spark Rds Demo (Scala)"
85 | }
86 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/SparkWordCount.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark
19 | 
20 | /** Counts words in new text files created in the given directory */
21 | object SparkWordCount extends RunLocally {
22 |   def main(args: Array[String]): Unit = {
23 |     if (args.length < 3) {
24 |       System.err.println(
25 |         """Usage: bin/spark-submit --class com.aliyun.emr.example.SparkWordCount examples-1.0-SNAPSHOT-shaded.jar <inputPath> <outputPath> <numPartition>
26 |           |
27 |           |Arguments:
28 |           |
29 |           |    inputPath        Input OSS object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/input/words.txt
30 |           |    outputPath       Output OSS object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/output
31 |           |    numPartitions    The number of RDD partitions.
32 |           |
33 |         """.stripMargin)
34 |       System.exit(1)
35 |     }
36 | 
37 |     val inputPath = args(0)
38 |     val outputPath = args(1)
39 |     val numPartitions = args(2).toInt
40 | 
41 |     val input = getSparkContext.textFile(inputPath, numPartitions)
42 |     val output = input.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
43 | 
44 |     output.saveAsTextFile(outputPath)
45 |   }
46 | 
47 |   override def getAppName: String = "E-MapReduce Demo 1: SparkWordCount"
48 | }
49 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/sql/ODPSDataSourceSample.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark.sql
19 | 
20 | import org.apache.spark.sql.{SaveMode, SparkSession}
21 | 
22 | object ODPSDataSourceSample {
23 |   def main(args: Array[String]): Unit = {
24 |     if (args.length < 6) {
25 |       System.err.println(
26 |         """Usage: ODPSDataSourceSample <accessKeyId> <accessKeySecret> <envType> <project> <table> <numPartitions>
27 |           |
28 |           |Arguments:
29 |           |
30 |           |    accessKeyId      Aliyun Access Key ID.
31 |           |    accessKeySecret  Aliyun Key Secret.
32 |           |    envType          0 or 1
33 |           |                     0: Public environment.
34 |           |                     1: Aliyun internal environment, i.e. Aliyun ECS etc.
35 |           |    project          Aliyun ODPS project
36 |           |    table            Aliyun ODPS table
37 |           |    numPartitions    the number of RDD partitions
38 |         """.stripMargin)
39 |       System.exit(1)
40 |     }
41 | 
42 |     val accessKeyId = args(0)
43 |     val accessKeySecret = args(1)
44 |     val envType = args(2).toInt
45 |     val project = args(3)
46 |     val table = args(4)
47 | 
48 |     val urls = Seq(
49 |       Seq("http://service.odps.aliyun.com/api", "http://dt.odps.aliyun.com"), // public environment
50 |       Seq("http://odps-ext.aliyun-inc.com/api", "http://dt-ext.odps.aliyun-inc.com") // Aliyun internal environment
51 |     )
52 | 
53 |     val odpsUrl = urls(envType)(0)
54 |     val tunnelUrl = urls(envType)(1)
55 | 
56 |     val ss = SparkSession.builder().appName("Test Odps Read").master("local[*]").getOrCreate()
57 | 
58 |     import ss.implicits._
59 | 
60 |     val dataSeq = (1 to 1000000).map {
61 |       index => (index, (index-3).toString)
62 |     }.toSeq
63 | 
64 | 
65 |     val df = ss.sparkContext.makeRDD(dataSeq).toDF("a", "b")
66 | 
67 |     System.out.println("*****" + table + ",before overwrite table")
68 |     df.write.format("org.apache.spark.aliyun.odps.datasource")
69 |       .option("odpsUrl", odpsUrl)
70 |       .option("tunnelUrl", tunnelUrl)
71 |       .option("table", table)
72 |       .option("project", project)
73 |       .option("accessKeySecret", accessKeySecret)
74 |       .option("accessKeyId", accessKeyId).mode(SaveMode.Overwrite).save()
75 | 
76 |     System.out.println("*****" + table + ",after overwrite table, before read table")
77 | 
78 |     val readDF = ss.read
79 |       .format("org.apache.spark.aliyun.odps.datasource")
80 |       .option("odpsUrl", odpsUrl)
81 |       .option("tunnelUrl", tunnelUrl)
82 |       .option("table", table)
83 |       .option("project", project)
84 |       .option("accessKeySecret", accessKeySecret)
85 |       .option("accessKeyId", accessKeyId).load()
86 | 
87 | 
88 |     val collectList = readDF.collect()
89 |     System.out.println("*****" + table + ",after read table," + collectList.size)
90 |     assert(collectList.length == 1000000)
91 |     assert((1 to 1000000).par.exists(n => collectList.exists(_.getLong(0) == n)))
92 | 
93 |   }
94 | }
95 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/sql/streaming/SparkSLSContinuousStructuredStreamingDemo.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package com.aliyun.emr.example.spark.sql.streaming
18 | 
19 | import java.util.UUID
20 | 
21 | import org.apache.spark.sql.SparkSession
22 | import org.apache.spark.sql.streaming.Trigger
23 | 
24 | object SparkSLSContinuousStructuredStreamingDemo {
25 |   def main(args: Array[String]) {
26 |     if (args.length < 7) {
27 |       System.err.println("Usage: SparkSLSContinuousStructuredStreamingDemo <logService-project> " +
28 |         "<logService-store> <access-key-id> <access-key-secret> <endpoint> " +
29 |         "<starting-offsets> <max-offsets-per-trigger> [<checkpoint-location>]")
30 |       System.exit(1)
31 |     }
32 | 
33 |     val Array(project, logStore, accessKeyId, accessKeySecret, endpoint, startingOffsets, maxOffsetsPerTrigger, _*) = args
34 |     val checkpointLocation =
35 |       if (args.length > 7) args(7) else "/tmp/temporary-" + UUID.randomUUID.toString
36 | 
37 |     val spark = SparkSession
38 |       .builder
39 |       .appName("E-MapReduce Demo 6-5: Spark SLS Demo (Scala)")
40 |       .master("local[5]")
41 |       .getOrCreate()
42 | 
43 |     spark.sparkContext.setLogLevel("WARN")
44 | 
45 |     import spark.implicits._
46 | 
47 |     // Create DataSet representing the stream of input lines from loghub
48 |     val lineLength = spark
49 |       .readStream
50 |       .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider")
51 |       .option("sls.project", project)
52 |       .option("sls.store", logStore)
53 |       .option("access.key.id", accessKeyId)
54 |       .option("access.key.secret", accessKeySecret)
55 |       .option("endpoint", endpoint)
56 |       .option("startingoffsets", startingOffsets)
57 |       .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger)
58 |       .load()
59 |       .selectExpr("CAST(__value__ AS STRING)")
60 |       .as[String].map(e => (e, e.length)).toDF("value", "length")
61 | 
62 |     val query = lineLength.writeStream
63 |       .outputMode("append")
64 |       .format("console")
65 |       .option("checkpointLocation", checkpointLocation)
66 |       .trigger(Trigger.Continuous("5 second"))
67 |       .start()
68 | 
69 |     query.awaitTermination()
70 |   }
71 | }
72 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/sql/streaming/SparkSLSStructuredStreamingDemo.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | package com.aliyun.emr.example.spark.sql.streaming
18 | 
19 | import java.util.UUID
20 | 
21 | import org.apache.spark.sql.SparkSession
22 | 
23 | object SparkSLSStructuredStreamingDemo {
24 |   def main(args: Array[String]) {
25 |     if (args.length < 7) {
26 |       System.err.println("Usage: SparkSLSStructuredStreamingDemo <logService-project> " +
27 |         "<logService-store> <access-key-id> <access-key-secret> <endpoint> " +
28 |         "<starting-offsets> <max-offsets-per-trigger> [<checkpoint-location>]")
29 |       System.exit(1)
30 |     }
31 | 
32 |     val Array(project, logStore, accessKeyId, accessKeySecret, endpoint, startingOffsets, maxOffsetsPerTrigger, _*) = args
33 |     val checkpointLocation =
34 |       if (args.length > 7) args(7) else "/tmp/temporary-" + UUID.randomUUID.toString
35 | 
36 |     val spark = SparkSession
37 |       .builder
38 |       .appName("E-MapReduce Demo 6-3: Spark SLS Demo (Scala)")
39 |       .master("local[5]")
40 |       .getOrCreate()
41 | 
42 |     spark.sparkContext.setLogLevel("WARN")
43 | 
44 |     import spark.implicits._
45 | 
46 |     // Create DataSet representing the stream of input lines from loghub
47 |     val lines = spark
48 |       .readStream
49 |       .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider")
50 |       .option("sls.project", project)
51 |       .option("sls.store", logStore)
52 |       .option("access.key.id", accessKeyId)
53 |       .option("access.key.secret", accessKeySecret)
54 |       .option("endpoint", endpoint)
55 |       .option("startingoffsets", startingOffsets)
56 |       .option("zookeeper.connect.address", "localhost:2181")
57 |       .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger)
58 |       .load()
59 |       .selectExpr("CAST(__value__ AS STRING)")
60 |       .as[String]
61 | 
62 |     val wordCounts = lines.flatMap(_.split(" ")).groupBy("__value__").count()
63 | 
64 |     val query = wordCounts.writeStream
65 |       .outputMode("complete")
66 |       .format("console")
67 |       .option("checkpointLocation", checkpointLocation)
68 |       .start()
69 | 
70 |     query.awaitTermination()
71 |   }
72 | }
73 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/DirectSparkSLSDemo.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark.streaming
19 | 
20 | import com.aliyun.openservices.loghub.client.config.LogHubCursorPosition
21 | import org.apache.spark.SparkConf
22 | import org.apache.spark.streaming.aliyun.logservice.{DirectLoghubInputDStream, LoghubUtils}
23 | import org.apache.spark.streaming.{Milliseconds, StreamingContext}
24 | 
25 | object DirectSparkSLSDemo {
26 |   def main(args: Array[String]): Unit = {
27 |     if (args.length < 7) {
28 |       System.err.println(
29 |         """Usage: DirectSparkSLSDemo <sls project> <sls logstore> <loghub group name> <sls endpoint>
30 |           |         <access key id> <access key secret> <batch interval seconds> <zookeeper host:port=localhost:2181>
31 |         """.stripMargin)
32 |       System.exit(1)
33 |     }
34 | 
35 |     val loghubProject = args(0)
36 |     val logStore = args(1)
37 |     val loghubGroupName = args(2)
38 |     val endpoint = args(3)
39 |     val accessKeyId = args(4)
40 |     val accessKeySecret = args(5)
41 |     val batchInterval = Milliseconds(args(6).toInt * 1000)
42 |     val zkAddress = if (args.length >= 8) args(7) else "localhost:2181"
43 | 
44 |     def functionToCreateContext(): StreamingContext = {
45 |       val conf = new SparkConf().setAppName("E-MapReduce Demo 6-2: Spark SLS Demo (Scala) (Direct API)")
46 |       val ssc = new StreamingContext(conf, batchInterval)
47 |       val zkParas = Map("zookeeper.connect" -> zkAddress,
48 |         "enable.auto.commit" -> "false")
49 |       val loghubStream = LoghubUtils.createDirectStream(
50 |         ssc,
51 |         loghubProject,
52 |         logStore,
53 |         loghubGroupName,
54 |         accessKeyId,
55 |         accessKeySecret,
56 |         endpoint,
57 |         zkParas,
58 |         LogHubCursorPosition.END_CURSOR)
59 | 
60 |       loghubStream.checkpoint(batchInterval).foreachRDD(rdd => {
61 |         println(s"count by key: ${rdd.map(s => {
62 |           s.sorted
63 |           (s.length, s)
64 |         }).countByKey().size}")
65 |         loghubStream.asInstanceOf[DirectLoghubInputDStream].commitAsync()
66 |       })
67 |       ssc.checkpoint("hdfs:///tmp/spark/streaming") // set checkpoint directory
68 |       ssc
69 |     }
70 | 
71 |     val ssc = StreamingContext.getOrCreate("hdfs:///tmp/spark/streaming", functionToCreateContext _)
72 | 
73 |     ssc.start()
74 |     ssc.awaitTermination()
75 |   }
76 | }
77 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/DtsSample.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark.streaming
19 | 
20 | import com.aliyun.drc.clusterclient.message.ClusterMessage
21 | 
22 | import org.apache.spark.SparkConf
23 | import org.apache.spark.storage.StorageLevel
24 | import org.apache.spark.streaming.aliyun.dts.DtsUtils
25 | import org.apache.spark.streaming.{Milliseconds, StreamingContext}
26 | 
27 | object DtsSample {
28 |   def main(args: Array[String]): Unit = {
29 |     if (args.length < 4) {
30 |       System.err.println(s"""
31 |                             |Usage: DtsSample <accessKeyId> <accessKeySecret> <guid> <usePublicIp> <interval-mills>
32 |                             |  <accessKeyId>      Aliyun Access Key ID.
33 |                             |  <accessKeySecret>  Aliyun Access Key Secret.
34 |                             |  <guid>             Aliyun DTS guid name.
35 |                             |  <usePublicIp>      Use public Ip to access DTS or not.
36 |                             |  <interval-mills>   The time interval at which streaming data will be divided into batches.
37 |         """.stripMargin)
38 |       System.exit(1)
39 |     }
40 | 
41 |     val Array(accessKeyId, accessKeySecret, guid, usePublicIp, interval) = args
42 |     val sparkConf = new SparkConf().setAppName("DtsSample")
43 |     val ssc: StreamingContext = new StreamingContext(sparkConf, Milliseconds(interval.toInt))
44 | 
45 |     def func: ClusterMessage => String = msg => msg.getRecord.toString
46 | 
47 |     val dtsStream = DtsUtils.createStream(
48 |       ssc,
49 |       accessKeyId,
50 |       accessKeySecret,
51 |       guid,
52 |       func,
53 |       StorageLevel.MEMORY_AND_DISK_2,
54 |       usePublicIp.toBoolean)
55 | 
56 |     dtsStream.foreachRDD(rdd => {
57 |       rdd.collect().foreach(println)
58 |     })
59 | 
60 |     ssc.start()
61 |     ssc.awaitTermination()
62 |   }
63 | }
64 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/RedisWordCount.scala.1:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark.streaming
19 | 
20 | import org.apache.spark.{SparkContext, SparkConf}
21 | import org.apache.spark.streaming.{Seconds, StreamingContext}
22 | import org.apache.spark.storage.StorageLevel
23 | import com.redislabs.provider.redis._
24 | 
25 | object RedisWordCount {
26 |   def main(args: Array[String]): Unit = {
27 |     if (args.length < 4) {
28 |       System.err.println(
29 |         """Usage: bin/spark-submit --class RedisWordCount examples-1.0-SNAPSHOT-shaded.jar <redisHost> <redisPort>
30 |           |           <redisAuth> <keyName>
31 |           |
32 |           |Arguments:
33 |           |
34 |           |    redisHost       Redis host.
35 |           |    redisPort       Redis port.
36 |           |    redisAuth       Redis auth.
37 |           |    keyName         Redis key name.
38 |           |
39 |         """.stripMargin)
40 |       System.exit(1)
41 |     }
42 | 
43 |     val redisHost = args(0)
44 |     val redisPort = args(1)
45 |     val redisAuth = args(2)
46 |     val keyName = args(3)
47 | 
48 |     val conf = new SparkConf().setAppName("Redis WordCount").setMaster("local[4]")
49 |     conf.set("redis.host", redisHost)
50 |     conf.set("redis.port", redisPort)
51 |     conf.set("redis.auth", redisAuth)
52 |     val sc = new SparkContext(conf)
53 |     val ssc = new StreamingContext(sc, Seconds(1))
54 | 
55 |     val redisStream = ssc.createRedisStream(Array(keyName), storageLevel = StorageLevel.MEMORY_AND_DISK_2)
56 |     redisStream.print()
57 | 
58 |     ssc.start()
59 |     ssc.awaitTermination()
60 |   }
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkDatahubDemo.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package com.aliyun.emr.example.spark.streaming
 19 | 
 20 | import com.aliyun.datahub.model.RecordEntry
 21 | 
 22 | import org.apache.spark.SparkConf
 23 | import org.apache.spark.storage.StorageLevel
 24 | import org.apache.spark.streaming.{Milliseconds, StreamingContext}
 25 | import org.apache.spark.streaming.aliyun.datahub.DatahubUtils
 26 | import org.apache.spark.streaming.dstream.DStream
 27 | 
 28 | object SparkDatahubDemo {
 29 |   def main(args: Array[String]): Unit = {
 30 |     if (args.length < 7) {
 31 |       // scalastyle:off
 32 |       System.err.println(
 33 |         """Usage: SparkDatahubDemo <project> <topic> <subscribe Id> <access key id>
 34 |           |         <access key secret> <endpoint> <batch interval seconds> [<shard Id>]
 35 |         """.stripMargin)
 36 |       // scalastyle:on
 37 |       System.exit(1)
 38 |     }
 39 | 
 40 |     var isShardDefined = false
 41 |     if (args.length == 8) {
 42 |       isShardDefined = true
 43 |     }
 44 | 
 45 |     val project = args(0)
 46 |     val topic = args(1)
 47 |     val subId = args(2)
 48 |     val accessKeyId = args(3)
 49 |     val accessKeySecret = args(4)
 50 |     val endpoint = args(5)
 51 |     val batchInterval = Milliseconds(args(6).toInt * 1000)
 52 | 
 53 |     def functionToCreateContext(): StreamingContext = {
 54 |       val conf = new SparkConf().setMaster("local[4]").setAppName("E-MapReduce Demo 11: Spark DataHub Demo (Scala)")
 55 |       conf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem")
 56 |       conf.set("spark.hadoop.mapreduce.job.run-local", "true")
 57 |       val ssc = new StreamingContext(conf, batchInterval)
 58 |       var datahubStream: DStream[Array[Byte]] = null
 59 |       if (isShardDefined) {
 60 |         val shardId = args(7)
 61 |         datahubStream = DatahubUtils.createStream(
 62 |           ssc,
 63 |           project,
 64 |           topic,
 65 |           subId,
 66 |           accessKeyId,
 67 |           accessKeySecret,
 68 |           endpoint,
 69 |           shardId,
 70 |           read(_),
 71 |           StorageLevel.MEMORY_AND_DISK)
 72 |       } else {
 73 |         datahubStream = DatahubUtils.createStream(
 74 |           ssc,
 75 |           project,
 76 |           topic,
 77 |           subId,
 78 |           accessKeyId,
 79 |           accessKeySecret,
 80 |           endpoint,
 81 |           read(_),
 82 |           StorageLevel.MEMORY_AND_DISK)
 83 |       }
 84 | 
 85 |       // scalastyle:off
 86 |       datahubStream.foreachRDD(rdd => println(s"rdd.count(): ${rdd.count()}"))
 87 |       // scalastyle:on
 88 |       ssc.checkpoint("hdfs:///tmp/spark/streaming") // set checkpoint directory
 89 |       ssc
 90 |     }
 91 | 
 92 |     val ssc = StreamingContext.getOrCreate("hdfs:///tmp/spark/streaming", functionToCreateContext _)
 93 | 
 94 |     ssc.start()
 95 |     ssc.awaitTermination()
 96 |   }
 97 | 
 98 |   def read(record: RecordEntry): String = {
 99 |     record.getString(0)
100 |   }
101 | }
102 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkHBaseDemo.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package com.aliyun.emr.example.spark.streaming
 19 | 
 20 | import com.aliyun.openservices.ons.api.Message
 21 | import org.apache.hadoop.conf.Configuration
 22 | import org.apache.hadoop.hbase.{HConstants, HBaseConfiguration, TableName}
 23 | import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put}
 24 | import org.apache.hadoop.hbase.util.Bytes
 25 | import org.apache.spark.SparkConf
 26 | import org.apache.spark.storage.StorageLevel
 27 | import org.apache.spark.streaming.aliyun.ons.OnsUtils
 28 | import org.apache.spark.streaming.{StreamingContext, Seconds}
 29 | import scala.collection.JavaConversions._
 30 | 
 31 | object ConnectionUtil extends Serializable {
 32 |   private var conf: Configuration = null
 33 | 
 34 |   private var connection: Connection = null
 35 | 
 36 |   def getDefaultConn(quorum: String): Connection = {
 37 |     if (conf == null && connection == null) {
 38 |       conf = HBaseConfiguration.create()
 39 |       conf.set(HConstants.ZOOKEEPER_QUORUM, quorum)
 40 |       conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/hbase")
 41 |       connection = ConnectionFactory.createConnection(conf)
 42 |     }
 43 |     connection
 44 |   }
 45 | }
 46 | 
 47 | object SparkHBaseDemo {
 48 |   def main(args: Array[String]): Unit = {
 49 |     if (args.length < 7) {
 50 |       System.err.println(
 51 |         """Usage: spark-submit --class SparkHBaseDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret>
 52 |           |         <consumerId> <topic> <subExpression> <parallelism> <tableName> <quorum>
 53 |           |
 54 |           |Arguments:
 55 |           |
 56 |           |    accessKeyId      Aliyun Access Key ID.
 57 |           |    accessKeySecret  Aliyun Key Secret.
 58 |           |    consumerId       ONS ConsumerID.
 59 |           |    topic            ONS topic.
 60 |           |    subExpression    * for all, or some specific tag.
 61 |           |    tableName        The name of HBase table.
 62 |           |    quorum           HBase quorum setting.
 63 |           |
 64 |         """.stripMargin)
 65 |       System.exit(1)
 66 |     }
 67 | 
 68 |     val Array(accessKeyId, accessKeySecret, consumerId, topic, subExpression, tname, quorum) = args
 69 | 
 70 |     val COLUMN_FAMILY_BYTES = Bytes.toBytes("count")
 71 |     val COLUMN_QUALIFIER_BYTES = Bytes.toBytes("count")
 72 | 
 73 |     val batchInterval = Seconds(2)
 74 | 
 75 |     val conf = new SparkConf().setAppName("E-MapReduce Demo 9: Spark HBase Demo (Scala)")
 76 |     val ssc = new StreamingContext(conf, batchInterval)
 77 |     def func: Message => Array[Byte] = msg => msg.getBody
 78 |     val onsStream = OnsUtils.createStream(
 79 |         ssc,
 80 |         consumerId,
 81 |         topic,
 82 |         subExpression,
 83 |         accessKeyId,
 84 |         accessKeySecret,
 85 |         StorageLevel.MEMORY_AND_DISK_2,
 86 |         func)
 87 | 
 88 |     onsStream.foreachRDD(rdd => {
 89 |       rdd.map(bytes => new String(bytes))
 90 |         .flatMap(line => line.split(" "))
 91 |         .map(word => (word, 1))
 92 |         .reduceByKey(_ + _)
 93 |         .mapPartitions {words => {
 94 |         val conn = ConnectionUtil.getDefaultConn(quorum)
 95 |         val tableName = TableName.valueOf(tname)
 96 |         val t = conn.getTable(tableName)
 97 |         try {
 98 |           words.sliding(100, 100).foreach(slice => {
 99 |             val puts = slice.map(word => {
100 |               println(s"word: $word")
101 |               val put = new Put(Bytes.toBytes(word._1 + System.currentTimeMillis()))
102 |               put.addColumn(COLUMN_FAMILY_BYTES, COLUMN_QUALIFIER_BYTES,
103 |                 System.currentTimeMillis(), Bytes.toBytes(word._2))
104 |               put
105 |             }).toList
106 |             t.put(puts)
107 |           })
108 |         } finally {
109 |           t.close()
110 |         }
111 | 
112 |         Iterator.empty
113 |       }}.count()
114 |     })
115 | 
116 |     ssc.start()
117 |     ssc.awaitTermination()
118 |   }
119 | }
120 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkKafkaDemo.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark.streaming
19 | 
20 | import org.apache.kafka.common.serialization.StringDeserializer
21 | import org.apache.spark.SparkConf
22 | import org.apache.spark.streaming._
23 | import org.apache.spark.streaming.kafka010._
24 | 
25 | object SparkKafkaDemo {
26 |   def main(args: Array[String]) {
27 |     if (args.length < 2) {
28 |       System.err.println(s"""
29 |                             |Usage: SparkKafkaDemo <brokers> <topics>
30 |                             |  <brokers> is a list of one or more Kafka brokers
31 |                             |  <topics> is a list of one or more kafka topics to consume from
32 |                             |  <interval>
33 |         """.stripMargin)
34 |       System.exit(1)
35 |     }
36 |     val Array(brokers, topics, interval) = args
37 | 
38 |     val sparkConf = new SparkConf().setAppName("E-MapReduce Demo 9: Spark Kafka Demo (Scala)")
39 |     val ssc = new StreamingContext(sparkConf, Seconds(interval.toInt))
40 | 
41 |     val kafkaParams = Map[String, Object](
42 |       "bootstrap.servers" -> brokers,
43 |       "key.deserializer" -> classOf[StringDeserializer],
44 |       "value.deserializer" -> classOf[StringDeserializer],
45 |       "group.id" -> "mugen1",
46 |       "auto.offset.reset" -> "earliest",
47 |       "enable.auto.commit" -> (false: java.lang.Boolean),
48 |       "security.protocol" -> "SASL_PLAINTEXT",
49 |       "sasl.mechanism" -> "GSSAPI",
50 |       "sasl.kerberos.service.name" -> "kafka"
51 |     )
52 | 
53 |     val messages = KafkaUtils.createDirectStream[String, String](
54 |       ssc,
55 |       LocationStrategies.PreferConsistent,
56 |       ConsumerStrategies.Subscribe[String, String](Array(topics), kafkaParams)
57 |     )
58 | 
59 |     // Get the lines, split them into words, count the words and print
60 |     val lines = messages.map(_.value)
61 |     val words = lines.flatMap(_.split(" "))
62 |     val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
63 |     wordCounts.print()
64 | 
65 |     // Start the computation
66 |     ssc.start()
67 |     ssc.awaitTermination()
68 |   }
69 | }
70 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkMNSDemo.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark.streaming
19 | 
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.storage.StorageLevel
22 | import org.apache.spark.streaming.aliyun.mns.MnsUtils
23 | import org.apache.spark.streaming.{Seconds, StreamingContext}
24 | 
25 | object SparkMNSDemo {
26 |   def main(args: Array[String]): Unit = {
27 |     if (args.length < 4) {
28 |       System.err.println(
29 |         """Usage: spark-submit --class SparkMNSDemo examples-1.0-SNAPSHOT-shaded.jar <queueName> <accessKeyId> <accessKeySecret> <endpoint>""".stripMargin)
30 |       System.exit(1)
31 |     }
32 |     val queueName = args(0)
33 |     val accessKeyId = args(1)
34 |     val accessKeySecret = args(2)
35 |     val endpoint = args(3)
36 | 
37 |     val conf = new SparkConf().setAppName("E-MapReduce Demo 8-1: Spark MNS Demo (Scala)").setMaster("local[4]")
38 |     conf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem")
39 |     conf.set("spark.hadoop.mapreduce.job.run-local", "true")
40 |     val batchInterval = Seconds(10)
41 |     val ssc = new StreamingContext(conf, batchInterval)
42 | 
43 |     val mnsStream = MnsUtils.createPullingStreamAsBytes(ssc, queueName, accessKeyId, accessKeySecret, endpoint,
44 |       StorageLevel.MEMORY_ONLY)
45 |     mnsStream.foreachRDD( rdd => {
46 |       rdd.collect().foreach(e => println(new String(e)))
47 |     })
48 | 
49 |     ssc.start()
50 |     ssc.awaitTermination()
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkRocketMQDemo.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package com.aliyun.emr.example.spark.streaming
 19 | 
 20 | import java.util.{Properties, UUID}
 21 | 
 22 | import com.aliyun.openservices.ons.api.impl.ONSFactoryImpl
 23 | import com.aliyun.openservices.ons.api.{Message, PropertyKeyConst}
 24 | import org.apache.spark.storage.StorageLevel
 25 | import org.apache.spark.streaming.aliyun.ons.OnsUtils
 26 | import org.apache.spark.streaming.{Milliseconds, StreamingContext}
 27 | import org.apache.spark.{SparkConf, SparkContext}
 28 | 
 29 | object SparkRocketMQDemo {
 30 |   def main(args: Array[String]): Unit = {
 31 |     if (args.length < 6) {
 32 |       System.err.println(
 33 |         """Usage: bin/spark-submit --class com.aliyun.emr.example.spark.streaming.SparkRocketMQDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret>
 34 |           |         <consumerId> <topic> <subExpression> <parallelism>
 35 |           |
 36 |           |Arguments:
 37 |           |
 38 |           |    accessKeyId      Aliyun Access Key ID.
 39 |           |    accessKeySecret  Aliyun Key Secret.
 40 |           |    consumerId       ONS ConsumerID.
 41 |           |    topic            ONS topic.
 42 |           |    subExpression    * for all, or some specific tag.
 43 |           |    parallelism      The number of receivers.
 44 |           |
 45 |         """.stripMargin)
 46 |       System.exit(1)
 47 |     }
 48 | 
 49 |     val accessKeyId = args(0)
 50 |     val accessKeySecret = args(1)
 51 |     val cId = args(2)
 52 |     val topic = args(3)
 53 |     val subExpression = args(4)
 54 |     val parallelism = args(5)
 55 | 
 56 |     val numStreams = parallelism.toInt
 57 |     val batchInterval = Milliseconds(2000)
 58 | 
 59 |     val conf = new SparkConf().setAppName("E-MapReduce Demo 4-1: Spark RocketMQ Demo (Scala)")
 60 |     val ssc = new StreamingContext(conf, batchInterval)
 61 |     def func: Message => Array[Byte] = msg => msg.getBody
 62 |     val onsStreams = (0 until numStreams).map { i =>
 63 |       println(s"starting stream $i")
 64 |       OnsUtils.createStream(
 65 |         ssc,
 66 |         cId,
 67 |         topic,
 68 |         subExpression,
 69 |         accessKeyId,
 70 |         accessKeySecret,
 71 |         StorageLevel.MEMORY_AND_DISK_2,
 72 |         func)
 73 |     }
 74 | 
 75 |     val unionStreams = ssc.union(onsStreams)
 76 |     unionStreams.foreachRDD(rdd => println(s"count: ${rdd.count()}"))
 77 | 
 78 |     ssc.start()
 79 |     ssc.awaitTermination()
 80 |   }
 81 | }
 82 | 
 83 | object OnsRecordProducer {
 84 |   def main(args: Array[String]): Unit = {
 85 |     val Array(accessKeyId, accessKeySecret, pId, topic, tag, parallelism) = args
 86 | 
 87 |     val numPartition = parallelism.toInt
 88 |     val conf = new SparkConf().setAppName("E-MapReduce Demo 4-1: Spark RocketMQ Demo (Scala)")
 89 |     val sc = new SparkContext(conf)
 90 | 
 91 |     sc.parallelize(0 until numPartition, numPartition).mapPartitionsWithIndex {
 92 |       (index, itr) => {
 93 |         generate(index, accessKeyId, accessKeySecret, pId, topic, tag)
 94 |         Iterator.empty
 95 |       }
 96 |     }.count()
 97 |   }
 98 | 
 99 |   def generate(
100 |                 partitionId: Int,
101 |                 accessKeyId: String,
102 |                 accessKeySecret: String,
103 |                 pId: String,
104 |                 topic: String,
105 |                 tag: String): Unit = {
106 |     val properties = new Properties()
107 |     properties.put(PropertyKeyConst.ProducerId, pId)
108 |     properties.put(PropertyKeyConst.AccessKey, accessKeyId)
109 |     properties.put(PropertyKeyConst.SecretKey, accessKeySecret)
110 |     val onsFactoryImpl = new ONSFactoryImpl
111 |     val producer = onsFactoryImpl.createProducer(properties)
112 |     producer.shutdown()
113 |     producer.start()
114 | 
115 |     var count = 0
116 |     while(true){
117 |       val uuid = UUID.randomUUID()
118 |       val msg = new Message(topic, tag, uuid.toString.getBytes)
119 |       msg.setKey(s"ORDERID_${partitionId}_$count")
120 |       producer.send(msg)
121 |       count += 1
122 |       Thread.sleep(100L)
123 |     }
124 |   }
125 | }
126 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/SparkSLSDemo.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.aliyun.emr.example.spark.streaming
19 | 
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.storage.StorageLevel
22 | import org.apache.spark.streaming.{Milliseconds, StreamingContext}
23 | import org.apache.spark.streaming.aliyun.logservice.LoghubUtils
24 | 
25 | object SparkSLSDemo {
26 | 
27 |   def main(args: Array[String]): Unit = {
28 |     if (args.length < 7) {
29 |       System.err.println(
30 |         """Usage: SparkSLSDemo <sls project> <sls logstore> <loghub group name> <sls endpoint>
31 |           |         <access key id> <access key secret> <batch interval seconds>
32 |         """.stripMargin)
33 |       System.exit(1)
34 |     }
35 | 
36 |     val loghubProject = args(0)
37 |     val logStore = args(1)
38 |     val loghubGroupName = args(2)
39 |     val endpoint = args(3)
40 |     val accessKeyId = args(4)
41 |     val accessKeySecret = args(5)
42 |     val batchInterval = Milliseconds(args(6).toInt * 1000)
43 | 
44 |     def functionToCreateContext(): StreamingContext = {
45 |       val conf = new SparkConf().setAppName("E-MapReduce Demo 6-1: Spark SLS Demo (Scala)")
46 |       val ssc = new StreamingContext(conf, batchInterval)
47 |       val loghubStream = LoghubUtils.createStream(
48 |         ssc,
49 |         loghubProject,
50 |         logStore,
51 |         loghubGroupName,
52 |         endpoint,
53 |         accessKeyId,
54 |         accessKeySecret,
55 |         StorageLevel.MEMORY_AND_DISK)
56 | 
57 |       loghubStream.foreachRDD(rdd => println(s"rdd.count(): ${rdd.count()}"))
58 |       ssc.checkpoint("hdfs:///tmp/spark/streaming") // set checkpoint directory
59 |       ssc
60 |     }
61 | 
62 |     val ssc = StreamingContext.getOrCreate("hdfs:///tmp/spark/streaming", functionToCreateContext _)
63 | 
64 |     ssc.start()
65 |     ssc.awaitTermination()
66 |   }
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/AbstractStreaming.scala:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.spark.streaming.benchmark
 2 | 
 3 | import java.io.{BufferedInputStream, FileInputStream}
 4 | import java.util.Properties
 5 | 
 6 | import org.apache.kafka.clients.consumer.ConsumerRecord
 7 | import org.apache.kafka.common.serialization.StringDeserializer
 8 | import org.apache.spark.streaming.dstream.InputDStream
 9 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
10 | import org.apache.spark.streaming.{Duration, StreamingContext}
11 | import org.apache.spark.{SparkConf, SparkContext}
12 | 
13 | 
14 | abstract class AbstractStreaming {
15 |   var config: Properties= _
16 | 
17 |   def runJob(args: Array[String]): Unit = {
18 |     config = loadConfig(args(0))
19 |     val receiverCores = config.getProperty("partition.number").toInt / config.getProperty("kafka.partition.receiver.factor").toInt
20 |     val executorCore = (config.getProperty("cluster.cores.total").toInt * config.getProperty("cpu.core.factor").toFloat - receiverCores).toInt/config.getProperty("spark.executor.instances").toInt
21 |     val executorMem = config.getProperty("cluster.memory.per.node.mb").toInt * config.getProperty("cluster.worker.node.number").toInt / config.getProperty("spark.executor.instances").toInt
22 |     val sparkConf = new SparkConf()
23 |       .setAppName(config.getProperty("name"))
24 |       .set("spark.yarn.am.memory.mb", config.getProperty("spark.yarn.am.memory.mb") + "m")
25 |       .set("spark.yarn.am.cores", config.getProperty("spark.yarn.am.cores"))
26 |       .set("spark.executor.instances", config.getProperty("spark.executor.instances"))
27 |       .set("spark.executor.cores", executorCore.toString)
28 |       .set("spark.executor.memory", executorMem + "m")
29 |       .set("spark.streaming.blockInterval", config.getProperty("spark.streaming.blockInterval"))
30 |     val ssc = new StreamingContext(new SparkContext(sparkConf), Duration(config.getProperty("duration.ms").toLong))
31 | 
32 |     val kafkaParam = Map[String, Object](
33 |       "bootstrap.servers" -> config.getProperty("broker.list"),
34 |       "key.deserializer" -> classOf[StringDeserializer],
35 |       "value.deserializer" -> classOf[StringDeserializer],
36 |       "group.id" -> config.getProperty("consumer.group"),
37 |       "auto.offset.reset" -> "latest",
38 |       "enable.auto.commit" -> (true: java.lang.Boolean)
39 |     )
40 |     val stream = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](Array(config.getProperty("topic")), kafkaParam))
41 | 
42 |     execute(stream)
43 | 
44 |     ssc.start()
45 |     ssc.awaitTermination()
46 |   }
47 |   def execute(stream: InputDStream[ConsumerRecord[String, String]])
48 | 
49 |   def loadConfig(configFile: String): Properties = {
50 |     val properties = new Properties()
51 |     properties.load(new BufferedInputStream(new FileInputStream(configFile)))
52 |     properties
53 |   }
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/KafkaHdfs.scala:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.spark.streaming.benchmark
 2 | 
 3 | import org.apache.kafka.clients.consumer.ConsumerRecord
 4 | import org.apache.spark.streaming.dstream.InputDStream
 5 | 
 6 | object KafkaHdfs extends AbstractStreaming {
 7 |   override def execute(stream: InputDStream[ConsumerRecord[String, String]]): Unit = {
 8 |     stream.map(kv => kv.key() + "," + System.currentTimeMillis())
 9 |       .saveAsTextFiles(config.getProperty("filename.prefix") + config.getProperty("name") + "/result")
10 |   }
11 | 
12 |   def main(args: Array[String]): Unit = {
13 |     runJob(args)
14 |   }
15 | }
16 | 
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/WordCount.scala:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.spark.streaming.benchmark
 2 | 
 3 | import org.apache.kafka.clients.consumer.ConsumerRecord
 4 | import org.apache.spark.streaming.dstream.InputDStream
 5 | object WordCount extends AbstractStreaming {
 6 |   override def execute(stream: InputDStream[ConsumerRecord[String, String]]): Unit = {
 7 |     stream.flatMap(kv => {
 8 |       val value:List[(String, (Integer, Long))] = List()
 9 |       val eventTime = kv.key()
10 |       for (v <- kv.value().split(" ")) {
11 |         (v, (1, eventTime.toLong)) +: value
12 |       }
13 |       value
14 |     }).reduceByKey((x,y) =>{
15 |       val count = x._1 + y._1
16 |       var eventTime = x._2
17 |       if (x._2 < y._2) {
18 |         eventTime = y._2
19 |       }
20 |       (count, eventTime)
21 |     }).map(x => x._2._2).saveAsTextFiles(config.getProperty("filename.prefix") + config.getProperty("name") + "/result")
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/metrics/BasicMetrics.scala:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.spark.streaming.benchmark.metrics
 2 | 
 3 | import java.io.{BufferedInputStream, FileInputStream}
 4 | import java.util.Properties
 5 | 
 6 | class BasicMetrics extends Serializable {
 7 |   def getDuration(value: String, separator: String = ",") : Option[Long] = {
 8 |     val values = value.split(separator)
 9 |     if (values.length != 2) {
10 |       println("invalid result when parse start-time and finish time, invalid pattern should be start-time,end-time. content:" + value)
11 |       return None
12 |     }
13 |     val duration = values(1).toLong - values(0).toLong
14 |     Some(duration)
15 |   }
16 | 
17 |   def loadConfig(configFile: String): Properties = {
18 |     val properties = new Properties()
19 |     properties.load(new BufferedInputStream(new FileInputStream(configFile)))
20 |     properties
21 |   }
22 | }
23 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/metrics/HdfsMetrics.scala:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.spark.streaming.benchmark.metrics
 2 | 
 3 | import org.apache.spark.{SparkConf, SparkContext}
 4 | 
 5 | object HdfsMetrics extends BasicMetrics {
 6 |   private final val AppName = "Metrics"
 7 |   def main(args: Array[String]): Unit = {
 8 |     if (args.length < 1) {
 9 |       System.err.println(
10 |         """Usage: bin/spark-submit --class com.aliyun.emr.example.spark.streaming.benchmark.HdfsMetrics examples-1.1-shaded.jar <configFilePath>
11 |           |
12 |           |Arguments:
13 |           |
14 |           |    configFilePath   config file path, like benchmark.properties
15 |           |
16 |         """.stripMargin)
17 |       System.exit(1)
18 |     }
19 | 
20 |     val config = loadConfig(args(0))
21 | 
22 |     val conf = new SparkConf()
23 |     conf.setAppName(AppName)
24 | 
25 |     var inputPath : String = null
26 |     if (!config.getProperty("from.spark.streaming").toBoolean) {
27 |       inputPath = config.getProperty("filename.prefix") + config.getProperty("benchmark.app.name") + "/*.txt"
28 |     } else {
29 |       inputPath = config.getProperty("filename.prefix") + config.getProperty("benchmark.app.name") + "/*/part-*"
30 | 
31 |     }
32 |     val input = new SparkContext(conf).textFile(inputPath, config.getProperty("metric.numPartitions").toInt)
33 |     val output = input.map(x => getDuration(x))
34 |       .filter(x => x.isDefined)
35 |       .map(x => x.get)
36 | 
37 |     val count = output.count()
38 |     println("total:%d".format(count))
39 |     output.histogram(Array(Double.MinValue, 0.0, 300.0, 500.0, 800.0, 900.0, 1000.0, 2000.0, 3000.0, Double.MaxValue)).foreach(x=> println(x.toDouble / count))
40 |   }
41 | }
42 | 
43 | 


--------------------------------------------------------------------------------
/src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/metrics/KafkaMetrics.scala:
--------------------------------------------------------------------------------
 1 | package com.aliyun.emr.example.spark.streaming.benchmark.metrics
 2 | 
 3 | import org.apache.kafka.common.serialization.StringDeserializer
 4 | import org.apache.spark.SparkConf
 5 | import org.apache.spark.streaming._
 6 | import org.apache.spark.streaming.kafka010._
 7 | 
 8 | object KafkaMetrics extends BasicMetrics {
 9 |   def main(args: Array[String]): Unit = {
10 | 
11 |     val config = loadConfig(args(0))
12 | 
13 |     val ssc = new StreamingContext(new SparkConf().setAppName("KafkaMetrics"), Seconds(config.getProperty("metric.duration.second").toLong))
14 |     val kafkaParam = Map[String, Object] (
15 |       "bootstrap.servers" -> config.getProperty("result.broker.list"),
16 |       "key.deserializer" -> classOf[StringDeserializer],
17 |       "value.deserializer" -> classOf[StringDeserializer],
18 |       "group.id" -> config.getProperty("metric.group.id"),
19 |       "auto.offset.reset" -> "earliest",
20 |       "enable.auto.commit" -> (false: java.lang.Boolean)
21 | 
22 |     )
23 |     val messages  = KafkaUtils.createDirectStream[String, String](ssc,
24 |       LocationStrategies.PreferConsistent,
25 |       ConsumerStrategies.Subscribe[String, String](Array(config.getProperty("result.topic")), kafkaParam))
26 | 
27 |     val outputPath = config.getProperty("filename.prefix") + config.getProperty("benchmark.app.name") + "/kafka-"
28 |     messages.map(_.value()).saveAsTextFiles(outputPath)
29 | 
30 |     ssc.start()
31 |     ssc.awaitTermination()
32 |   }
33 | }
34 | 


--------------------------------------------------------------------------------