├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── conf └── benchmark.properties ├── data ├── The_Sorrows_of_Young_Werther.txt ├── abalone ├── patterns.txt └── u.data ├── lib ├── kafka-tools-1.0.0.jar └── tutorial.jar ├── pic ├── 1.JPG ├── 10.JPG ├── 11.JPG ├── 2.JPG ├── 3.JPG ├── 4.JPG ├── 5.JPG ├── 6.JPG ├── 7.JPG ├── 8.JPG └── 9.JPG ├── pom.xml ├── resources ├── The_Sorrows_of_Young_Werther.txt ├── patterns.txt └── student_data.csv └── src └── main ├── hive └── sample.hive ├── java └── com │ └── aliyun │ └── emr │ └── example │ ├── hadoop │ ├── EMapReduceOSSUtil.java │ └── WordCount.java │ ├── spark │ ├── SparkMaxComputeJavaDemo.java │ ├── SparkOssJavaDemo.java │ ├── SparkTableStoreJavaDemo.java │ ├── sql │ │ └── streaming │ │ │ ├── SparkSLSContinuousStructuredStreamingJavaDemo.java │ │ │ └── SparkSLSStructuredStreamingJavaDemo.java │ └── streaming │ │ ├── JavaLoghubWordCount.java │ │ ├── SparkMNSJavaDemo.java │ │ └── SparkRocketMQJavaDemo.java │ └── storm │ ├── StormKafkaSample.java │ └── benchmark │ ├── AbstractTopology.java │ ├── BasicTopology.java │ ├── KafkaHdfs.java │ ├── TridentWordCount.java │ ├── WindowedWordCount.java │ ├── WordCount.java │ └── util │ └── Helper.java ├── pig └── sample.pig ├── python ├── deeplearning │ ├── data │ │ ├── boston │ │ │ └── train.csv │ │ └── moviedata │ │ │ ├── movies.csv │ │ │ └── ratings.csv │ ├── tf_fm_on_spark.py │ └── train_boston.py ├── odps-sample.py ├── streaming │ ├── loghub-wordcount.py │ ├── wcmapper.py │ └── wcreducer.py └── wordcount.py └── scala └── com └── aliyun └── emr └── example ├── flink └── FlinkOSSSample.scala └── spark ├── AbstractParams.scala ├── LinearRegression.scala ├── MongoDBWordCount.scala ├── RunLocally.scala ├── SparkMaxComputeDemo.scala ├── SparkOssDemo.scala ├── SparkPi.scala ├── SparkRdsDemo.scala ├── SparkWordCount.scala ├── sql ├── ODPSDataSourceSample.scala └── streaming │ ├── SparkSLSContinuousStructuredStreamingDemo.scala │ └── SparkSLSStructuredStreamingDemo.scala └── streaming ├── DirectSparkSLSDemo.scala ├── DtsSample.scala ├── RedisWordCount.scala.1 ├── SparkDatahubDemo.scala ├── SparkHBaseDemo.scala ├── SparkKafkaDemo.scala ├── SparkMNSDemo.scala ├── SparkRocketMQDemo.scala ├── SparkSLSDemo.scala └── benchmark ├── AbstractStreaming.scala ├── KafkaHdfs.scala ├── WordCount.scala └── metrics ├── BasicMetrics.scala ├── HdfsMetrics.scala └── KafkaMetrics.scala /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .idea/ 3 | *.iml 4 | *.DS_Store 5 | bin/* 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # 1. Choose OS (Ubuntu 14.04.3 LTS Server Edition 64bit, ~2 CORE, 7.5GB RAM) 2 | sudo: required 3 | 4 | # 2. Choose language and target JDKs for parallel builds. 5 | language: java 6 | jdk: 7 | - oraclejdk8 8 | 9 | # 3. Setup cache directory for SBT and Maven. 10 | cache: 11 | directories: 12 | - $HOME/.m2 13 | 14 | # 4. Run maven install before running lint-java. 15 | install: 16 | - 17 | 18 | script: 19 | - echo -e '\n\n \n \n mvnsearch-unavailable\n mvnsearch-unavailable\n mvnsearch\n http://repo1.maven.org/maven2\n \n \n \n \n no-mvnsearch\n \n \n mvnsearch\n http://www.mvnsearch.org/maven2\n \n true\n \n \n true\n \n \n \n \n \n \n no-mvnsearch\n \n' > $HOME/.m2/settings.xml 20 | - cat $HOME/.m2/settings.xml 21 | - mvn clean package -DskipTests 22 | 23 | # 5. 
Branches only 24 | branches: 25 | only: 26 | - master-2 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Artistic License 2.0 2 | 3 | Copyright (c) 2015 aliyun 4 | 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | This license establishes the terms under which a given free software 11 | Package may be copied, modified, distributed, and/or redistributed. 12 | The intent is that the Copyright Holder maintains some artistic 13 | control over the development of that Package while still keeping the 14 | Package available as open source and free software. 15 | 16 | You are always permitted to make arrangements wholly outside of this 17 | license directly with the Copyright Holder of a given Package. If the 18 | terms of this license do not permit the full use that you propose to 19 | make of the Package, you should contact the Copyright Holder and seek 20 | a different licensing arrangement. 21 | 22 | Definitions 23 | 24 | "Copyright Holder" means the individual(s) or organization(s) 25 | named in the copyright notice for the entire Package. 26 | 27 | "Contributor" means any party that has contributed code or other 28 | material to the Package, in accordance with the Copyright Holder's 29 | procedures. 30 | 31 | "You" and "your" means any person who would like to copy, 32 | distribute, or modify the Package. 33 | 34 | "Package" means the collection of files distributed by the 35 | Copyright Holder, and derivatives of that collection and/or of 36 | those files. A given Package may consist of either the Standard 37 | Version, or a Modified Version. 38 | 39 | "Distribute" means providing a copy of the Package or making it 40 | accessible to anyone else, or in the case of a company or 41 | organization, to others outside of your company or organization. 42 | 43 | "Distributor Fee" means any fee that you charge for Distributing 44 | this Package or providing support for this Package to another 45 | party. It does not mean licensing fees. 46 | 47 | "Standard Version" refers to the Package if it has not been 48 | modified, or has been modified only in ways explicitly requested 49 | by the Copyright Holder. 50 | 51 | "Modified Version" means the Package, if it has been changed, and 52 | such changes were not explicitly requested by the Copyright 53 | Holder. 54 | 55 | "Original License" means this Artistic License as Distributed with 56 | the Standard Version of the Package, in its current version or as 57 | it may be modified by The Perl Foundation in the future. 58 | 59 | "Source" form means the source code, documentation source, and 60 | configuration files for the Package. 61 | 62 | "Compiled" form means the compiled bytecode, object code, binary, 63 | or any other form resulting from mechanical transformation or 64 | translation of the Source form. 65 | 66 | 67 | Permission for Use and Modification Without Distribution 68 | 69 | (1) You are permitted to use the Standard Version and create and use 70 | Modified Versions for any purpose without restriction, provided that 71 | you do not Distribute the Modified Version. 
72 | 73 | 74 | Permissions for Redistribution of the Standard Version 75 | 76 | (2) You may Distribute verbatim copies of the Source form of the 77 | Standard Version of this Package in any medium without restriction, 78 | either gratis or for a Distributor Fee, provided that you duplicate 79 | all of the original copyright notices and associated disclaimers. At 80 | your discretion, such verbatim copies may or may not include a 81 | Compiled form of the Package. 82 | 83 | (3) You may apply any bug fixes, portability changes, and other 84 | modifications made available from the Copyright Holder. The resulting 85 | Package will still be considered the Standard Version, and as such 86 | will be subject to the Original License. 87 | 88 | 89 | Distribution of Modified Versions of the Package as Source 90 | 91 | (4) You may Distribute your Modified Version as Source (either gratis 92 | or for a Distributor Fee, and with or without a Compiled form of the 93 | Modified Version) provided that you clearly document how it differs 94 | from the Standard Version, including, but not limited to, documenting 95 | any non-standard features, executables, or modules, and provided that 96 | you do at least ONE of the following: 97 | 98 | (a) make the Modified Version available to the Copyright Holder 99 | of the Standard Version, under the Original License, so that the 100 | Copyright Holder may include your modifications in the Standard 101 | Version. 102 | 103 | (b) ensure that installation of your Modified Version does not 104 | prevent the user installing or running the Standard Version. In 105 | addition, the Modified Version must bear a name that is different 106 | from the name of the Standard Version. 107 | 108 | (c) allow anyone who receives a copy of the Modified Version to 109 | make the Source form of the Modified Version available to others 110 | under 111 | 112 | (i) the Original License or 113 | 114 | (ii) a license that permits the licensee to freely copy, 115 | modify and redistribute the Modified Version using the same 116 | licensing terms that apply to the copy that the licensee 117 | received, and requires that the Source form of the Modified 118 | Version, and of any works derived from it, be made freely 119 | available in that license fees are prohibited but Distributor 120 | Fees are allowed. 121 | 122 | 123 | Distribution of Compiled Forms of the Standard Version 124 | or Modified Versions without the Source 125 | 126 | (5) You may Distribute Compiled forms of the Standard Version without 127 | the Source, provided that you include complete instructions on how to 128 | get the Source of the Standard Version. Such instructions must be 129 | valid at the time of your distribution. If these instructions, at any 130 | time while you are carrying out such distribution, become invalid, you 131 | must provide new instructions on demand or cease further distribution. 132 | If you provide valid instructions or cease distribution within thirty 133 | days after you become aware that the instructions are invalid, then 134 | you do not forfeit any of your rights under this license. 135 | 136 | (6) You may Distribute a Modified Version in Compiled form without 137 | the Source, provided that you comply with Section 4 with respect to 138 | the Source of the Modified Version. 
139 | 140 | 141 | Aggregating or Linking the Package 142 | 143 | (7) You may aggregate the Package (either the Standard Version or 144 | Modified Version) with other packages and Distribute the resulting 145 | aggregation provided that you do not charge a licensing fee for the 146 | Package. Distributor Fees are permitted, and licensing fees for other 147 | components in the aggregation are permitted. The terms of this license 148 | apply to the use and Distribution of the Standard or Modified Versions 149 | as included in the aggregation. 150 | 151 | (8) You are permitted to link Modified and Standard Versions with 152 | other works, to embed the Package in a larger work of your own, or to 153 | build stand-alone binary or bytecode versions of applications that 154 | include the Package, and Distribute the result without restriction, 155 | provided the result does not expose a direct interface to the Package. 156 | 157 | 158 | Items That are Not Considered Part of a Modified Version 159 | 160 | (9) Works (including, but not limited to, modules and scripts) that 161 | merely extend or make use of the Package, do not, by themselves, cause 162 | the Package to be a Modified Version. In addition, such works are not 163 | considered parts of the Package itself, and are not subject to the 164 | terms of this license. 165 | 166 | 167 | General Provisions 168 | 169 | (10) Any use, modification, and distribution of the Standard or 170 | Modified Versions is governed by this Artistic License. By using, 171 | modifying or distributing the Package, you accept this license. Do not 172 | use, modify, or distribute the Package, if you do not accept this 173 | license. 174 | 175 | (11) If your Modified Version has been derived from a Modified 176 | Version made by someone other than you, you are nevertheless required 177 | to ensure that your Modified Version complies with the requirements of 178 | this license. 179 | 180 | (12) This license does not grant you the right to use any trademark, 181 | service mark, tradename, or logo of the Copyright Holder. 182 | 183 | (13) This license includes the non-exclusive, worldwide, 184 | free-of-charge patent license to make, have made, use, offer to sell, 185 | sell, import and otherwise transfer the Package with respect to any 186 | patent claims licensable by the Copyright Holder that are necessarily 187 | infringed by the Package. If you institute patent litigation 188 | (including a cross-claim or counterclaim) against any party alleging 189 | that the Package constitutes direct or contributory patent 190 | infringement, then this Artistic License to you shall terminate on the 191 | date that such litigation is filed. 192 | 193 | (14) Disclaimer of Warranty: 194 | THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS 195 | IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. THE IMPLIED 196 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR 197 | NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY YOUR LOCAL 198 | LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR CONTRIBUTOR WILL 199 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL 200 | DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE, EVEN IF 201 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

## This project contains the following examples:

#### MapReduce

- WordCount: word count

#### Hive

- sample.hive: a simple table query

#### Pig

- sample.pig: an example of processing OSS data with Pig

#### Spark

- SparkPi: compute Pi
- SparkWordCount: word count
- LinearRegression: linear regression
- OSSSample: OSS usage example
- ONSSample: ONS usage example
- ODPSSample: ODPS usage example
- MNSSample: MNS usage example
- LoghubSample: Loghub usage example

#### PySpark

- WordCount: word count

## Required resources

Test data (under the `data` directory):

- The_Sorrows_of_Young_Werther.txt: can be used as input for WordCount (MapReduce/Spark)
- patterns.txt: filter patterns for the WordCount (MapReduce) job
- u.data: test table data for the sample.hive script
- abalone: test data for the linear regression algorithm

Dependency jars (under the `lib` directory):

- tutorial.jar: jar required by the sample.pig job

## Preparation

This project ships some test data; simply upload it to OSS and it is ready to use. For the other examples, such as ODPS, MNS, ONS, and Loghub, prepare the data yourself as follows:

- [Optional] Create a LogStore, see the [Log Service User Guide](https://help.aliyun.com/document_detail/sls/user-guide/overview.html?spm=5176.docsls/user-guide/consume-logs.3.2.VW5TNb).
- [Optional] Create an ODPS project and table, see [ODPS Quick Start](https://help.aliyun.com/document_detail/odps/quick_start/prerequisite.html?spm=5176.docodps/quick_start/prerequisite.3.2.OqBkc4).
- [Optional] Create ONS, see [Message Queue Quick Start](https://help.aliyun.com/document_detail/ons/quick-start/apply.html?spm=5176.docons/quick-start/send.3.2.eZ8h7p).
- [Optional] Create MNS, see the [Message Service Console Help](https://help.aliyun.com/document_detail/mns/help_of_console/AccessMNSBySubUser.html?spm=5176.docmns/help_of_console/help_of_queue/CreateQueue.3.2.0Sj96I).

## Basic concepts:

- OSSURI: **oss**://accessKeyId:accessKeySecret@bucket.endpoint/a/b/c.txt, used when specifying input and output data sources in a job; it can be thought of as the OSS counterpart of hdfs:// (a worked example is given at the end of this README).
- The Alibaba Cloud AccessKeyId/AccessKeySecret is the key pair for accessing Alibaba Cloud APIs; you can obtain it [here](https://ak-console.aliyun.com/#/accesskey).

## Running on a cluster

- Spark
    - SparkWordCount: `spark-submit --class SparkWordCount examples-1.0-SNAPSHOT-shaded.jar <inputPath> <outputPath> <numPartition>`
        - inputPath: input data path
        - outputPath: output path
        - numPartition: number of RDD partitions for the input data
    - SparkPi: `spark-submit --class SparkPi examples-1.0-SNAPSHOT-shaded.jar`
    - SparkOssDemo: `spark-submit --class SparkOssDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> <endpoint> <inputPath> <numPartition>`
        - accessKeyId: Alibaba Cloud AccessKeyId
        - accessKeySecret: Alibaba Cloud AccessKeySecret
        - endpoint: Alibaba Cloud OSS endpoint
        - inputPath: input data path
        - numPartition: number of RDD partitions for the input data
    - SparkRocketMQDemo: `spark-submit --class SparkRocketMQDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> <consumerId> <topic> <subExpression> <parallelism>`
        - accessKeyId: Alibaba Cloud AccessKeyId
        - accessKeySecret: Alibaba Cloud AccessKeySecret
        - consumerId: see the [Consumer ID description](https://help.aliyun.com/document_detail/ons/brief-manual/terminology.html?spm=5176.docons/brief-manual/overview.6.87.F8suBu)
        - topic: each message queue has a topic
        - subExpression: see [message filtering](https://help.aliyun.com/document_detail/ons/user-guide/tag-filter.html?spm=5176.docons/tcp/java-sdk/normal-consumer.6.97.PIqsEo).
        - parallelism: number of receivers used to consume the queue.
    - SparkMaxComputeDemo: `spark-submit --class SparkMaxComputeDemo examples-1.0-SNAPSHOT-shaded.jar <accessKeyId> <accessKeySecret> <envType> <project> <table> <numPartition>`
        - accessKeyId: Alibaba Cloud AccessKeyId
        - accessKeySecret: Alibaba Cloud AccessKeySecret
        - envType: 0 for the public-network environment, 1 for the internal-network environment. Use 0 for local debugging and 1 when running on E-MapReduce.
        - project: see [ODPS Quick Start](https://help.aliyun.com/document_detail/odps/quick_start/prerequisite.html?spm=5176.docodps/summary/glossary.6.90.inv9Ph).
        - table: see the [ODPS glossary](https://help.aliyun.com/document_detail/odps/summary/glossary.html?spm=5176.docodps/quick_start/prerequisite.6.88.A5zVKu).
        - numPartition: number of RDD partitions for the input data
    - SparkMNSDemo: `spark-submit --class SparkMNSDemo examples-1.0-SNAPSHOT-shaded.jar <queueName> <accessKeyId> <accessKeySecret> <endpoint>`
        - queueName: queue name, see the [MNS glossary](https://help.aliyun.com/document_detail/mns/introduction/product-name-interpretation.html?spm=5176.docmns/help_of_console/help_of_queue/CreateQueue.6.87.lHtPvO).
        - accessKeyId: Alibaba Cloud AccessKeyId
        - accessKeySecret: Alibaba Cloud AccessKeySecret
        - endpoint: queue data access endpoint
    - SparkSLSDemo: `spark-submit --class SparkSLSDemo examples-1.0-SNAPSHOT-shaded.jar <sls project> <sls logstore> <loghub group name> <sls endpoint> <accessKeyId> <accessKeySecret> <batch interval seconds>`
        - sls project: LogService project name
        - sls logstore: log store name
        - loghub group name: name of the consumer group used by the job; it can be any string. For the same sls project and sls store, jobs with the same group name consume the data in the store cooperatively, while jobs with different group names consume it independently of each other.
        - sls endpoint: see [Log Service endpoints](https://help.aliyun.com/document_detail/sls/api/endpoints.html?spm=5176.docsls/user-guide/concept.6.134.Gy05tN).
        - accessKeyId: Alibaba Cloud AccessKeyId
        - accessKeySecret: Alibaba Cloud AccessKeySecret
        - batch interval seconds: batch interval of the Spark Streaming job, in seconds.
    - LinearRegression: `spark-submit --class LinearRegression examples-1.0-SNAPSHOT-shaded.jar <inputPath> <numPartition>`
        - inputPath: input data
        - numPartition: number of RDD partitions for the input data

- PySpark
    - WordCount: `spark-submit wordcount.py <inputPath> <outputPath> <numPartition>`
        - inputPath: input data path
        - outputPath: output path
        - numPartition: number of RDD partitions for the input data

- MapReduce
    - WordCount: `hadoop jar examples-1.0-SNAPSHOT-shaded.jar WordCount -Dwordcount.case.sensitive=true <inputPath> <outputPath> -skip <patternPath>`
        - inputPath: input data path
        - outputPath: output path
        - patternPath: filter pattern file; data/patterns.txt can be used

- Hadoop Streaming
    - WordCount: `hadoop jar /usr/lib/hadoop-current/share/hadoop/tools/lib/hadoop-streaming-*.jar -file <mapperPyFile> -mapper mapper.py -file <reducerPyFile> -reducer reducer.py -input <inputPath> -output <outputPath>`
        - mapperPyFile: mapper file, [mapper sample](/src/main/python/streaming/wcmapper.py)
        - reducerPyFile: reducer file, [reducer sample](/src/main/python/streaming/wcreducer.py)
        - inputPath: input data path
        - outputPath: output path

- Hive
    - `hive -f sample.hive -hiveconf inputPath=<inputPath>`
        - inputPath: input data path

- Pig
    - `pig -x mapreduce -f sample.pig -param tutorial=<tutorialJarPath> -param input=<inputPath> -param result=<resultPath>`
        - tutorialJarPath: dependency jar; lib/tutorial.jar can be used
        - inputPath: input data path
        - resultPath: output path

- Note:
    - When running on E-MapReduce, upload the test data and dependency jars to OSS first; paths follow the OSSURI definition above (see the worked submission example at the end of this README).
    - When running inside the cluster, they can also be kept on the local machines.

## Running locally

This section describes how to run Spark programs locally against Alibaba Cloud data sources such as OSS. For local debugging it is best to use a development tool such as IntelliJ IDEA or Eclipse, especially on Windows; otherwise you have to configure a Hadoop and Spark runtime environment on the Windows machine, which is cumbersome. (A command-line alternative is sketched at the end of this README.)

- IntelliJ IDEA
    - Prerequisites: install IntelliJ IDEA, Maven, the IntelliJ IDEA Maven plugin, Scala, and the IntelliJ IDEA Scala plugin
    - Double-click SparkWordCount.scala to open it
    ![idea5](pic/11.JPG)
    - Open the run-configuration dialog from the place marked by the arrow in the figure below
    ![idea1](pic/7.JPG)
    - Select SparkWordCount and enter the required job arguments in the arguments box
    ![idea2](pic/8.JPG)
    - Click "OK"
    - Click the run button to execute the job
    ![idea3](pic/9.JPG)
    - Inspect the job execution logs
    ![idea4](pic/10.JPG)

- Scala IDE for Eclipse
    - Prerequisites: install Scala IDE for Eclipse, Maven, and the Eclipse Maven plugin
    - Import the project
    ![eclipse2](pic/2.JPG)
    ![eclipse3](pic/3.JPG)
    ![eclipse4](pic/4.JPG)
    - Run As Maven build; the shortcut is "Alt + Shift + X, M". You can also right-click the project name, choose "Run As", and select "Maven build".
    - After the build finishes, right-click the job you want to run, choose "Run Configuration", and open the configuration page
    - On the configuration page, select Scala Application and configure the job's Main Class, arguments, and so on.
    ![eclipse5](pic/5.JPG)
    - Click "Run"
    - Inspect the console output logs
    ![eclipse6](pic/6.JPG)
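To make the cluster instructions above concrete, here is a hypothetical end-to-end run of SparkWordCount against OSS. Everything not defined by this project is a placeholder: the bucket name `my-bucket`, the `oss-cn-hangzhou.aliyuncs.com` endpoint, the AccessKey values, and the output path are illustrative only; the shaded jar path follows the `target/shaded/examples-<version>-shaded.jar` pattern configured in `pom.xml`.

```bash
# Build the shaded jar (written to target/shaded/ as configured in pom.xml).
mvn clean package -DskipTests

# Expand the OSSURI by hand:
#   oss://<AccessKeyId>:<AccessKeySecret>@<bucket>.<endpoint>/<path>
# All values below are placeholders for illustration.
INPUT="oss://MY_AK_ID:MY_AK_SECRET@my-bucket.oss-cn-hangzhou.aliyuncs.com/data/The_Sorrows_of_Young_Werther.txt"
OUTPUT="oss://MY_AK_ID:MY_AK_SECRET@my-bucket.oss-cn-hangzhou.aliyuncs.com/output/wordcount"

# Submit the word-count job with 16 input partitions.
spark-submit --class SparkWordCount target/shaded/examples-*-shaded.jar "$INPUT" "$OUTPUT" 16
```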
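For local debugging without an IDE (see "Running locally" above), the same shaded jar can also be submitted in Spark local mode. This is only a sketch and assumes a local Spark installation; the arguments are exactly the ones listed for SparkOssDemo in the cluster section, and every value shown is a placeholder.

```bash
# Illustration only: run the OSS demo locally with 4 worker threads.
spark-submit --master "local[4]" --class SparkOssDemo \
  target/shaded/examples-*-shaded.jar \
  MY_AK_ID MY_AK_SECRET \
  oss-cn-hangzhou.aliyuncs.com \
  oss://my-bucket/data/The_Sorrows_of_Young_Werther.txt \
  16
```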
--------------------------------------------------------------------------------
/conf/benchmark.properties:
--------------------------------------------------------------------------------
## common
name=KafkaHdfs

## cluster
cluster.cores.total=160
cluster.worker.node.number=5
cluster.memory.per.node.mb=90000

## kafka producer
partition.number=50
topic=st-36
consumer.group=streaming
zookeeper.address=localhost
zookeeper.root=/kafka-1.0.0
broker.list=localhost:9092

## kafka consumer
result.topic=benchmark-result
result.broker.list=localhost:9092

## storm
worker.slot.number=10
# spout.parallelism should equal the Kafka partition.number
#spout.parallelism=25
window.length=10
slide.interval=10
backpressure.enable=false
hdfs.parallelism.factor=1
# trident
# set to 0 to disable acking
ack.open=true

## spark streaming
# deploy.mode=yarn-client to make use of the cluster header node
duration.ms=1000
spark.executor.instances=10
# receiver number = kafka partition.number / kafka.partition.receiver.factor
kafka.partition.receiver.factor=1
spark.yarn.am.memory.mb=20000
spark.yarn.am.cores=15
# default 200ms, recommended >= 50ms
spark.streaming.blockInterval=200ms
# vcore = physical core * cpu.core.factor
cpu.core.factor=1.5

## hdfs
url=hdfs://emr-header-1:9000
filename.prefix=/foo/
sync.record.number=1000

## metric
benchmark.app.name=KafkaHdfs
metric.numPartitions=100
from.spark.streaming=true
metric.duration.second=60
metric.group.id=kafka-metrics
--------------------------------------------------------------------------------
/data/The_Sorrows_of_Young_Werther.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/data/The_Sorrows_of_Young_Werther.txt
--------------------------------------------------------------------------------
/data/patterns.txt:
--------------------------------------------------------------------------------
1 | \. 2 | \, 3 | \!
4 | to 5 | \" -------------------------------------------------------------------------------- /lib/kafka-tools-1.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/lib/kafka-tools-1.0.0.jar -------------------------------------------------------------------------------- /lib/tutorial.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/lib/tutorial.jar -------------------------------------------------------------------------------- /pic/1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/1.JPG -------------------------------------------------------------------------------- /pic/10.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/10.JPG -------------------------------------------------------------------------------- /pic/11.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/11.JPG -------------------------------------------------------------------------------- /pic/2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/2.JPG -------------------------------------------------------------------------------- /pic/3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/3.JPG -------------------------------------------------------------------------------- /pic/4.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/4.JPG -------------------------------------------------------------------------------- /pic/5.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/5.JPG -------------------------------------------------------------------------------- /pic/6.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/6.JPG -------------------------------------------------------------------------------- /pic/7.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/7.JPG -------------------------------------------------------------------------------- /pic/8.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/8.JPG 
-------------------------------------------------------------------------------- /pic/9.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/pic/9.JPG -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.aliyun.emr 8 | examples 9 | 1.2.0 10 | jar 11 | Aliyun E-MapReduce Demo Project 12 | 13 | 14 | 2.3.1 15 | 1.4.0 16 | 2.0.0 17 | 3.0.0 18 | 0.28.4-public 19 | 0.6.13 20 | 1.7.1.Final 21 | 3.5.9 22 | 2.9.0 23 | 1.7.3 24 | 3.2.0 25 | 1.2.45 26 | 11.0.2 27 | 5.1.34 28 | 0.11.2 29 | 2.7.2 30 | 2.0 31 | 1.1.1 32 | 4.6.27.12.0 33 | 3.3.2 34 | 35 | 36 | 37 | 38 | org.apache.spark 39 | spark-core_2.11 40 | ${spark.version} 41 | 42 | 43 | 44 | org.apache.spark 45 | spark-mllib_2.11 46 | ${spark.version} 47 | 48 | 49 | 50 | org.apache.spark 51 | spark-sql_2.11 52 | ${spark.version} 53 | 54 | 55 | 56 | org.apache.spark 57 | spark-streaming_2.11 58 | ${spark.version} 59 | 60 | 61 | 62 | org.apache.spark 63 | spark-streaming-kafka-0-10_2.11 64 | ${spark.version} 65 | 66 | 67 | 68 | org.apache.spark 69 | spark-hive_2.11 70 | ${spark.version} 71 | 72 | 73 | org.apache.httpcomponents 74 | httpclient 75 | 76 | 77 | org.apache.httpcomponents 78 | httpcore 79 | 80 | 81 | 82 | 83 | 84 | org.apache.flink 85 | flink-core 86 | ${flink.version} 87 | 88 | 89 | 90 | org.apache.flink 91 | flink-clients_2.11 92 | ${flink.version} 93 | 94 | 95 | 96 | org.apache.flink 97 | flink-connector-kafka-0.11_2.11 98 | 1.4.2 99 | 100 | 101 | 102 | 103 | com.aliyun.emr 104 | emr-tablestore 105 | ${emr.version} 106 | 107 | 108 | 109 | 110 | com.aliyun.emr 111 | emr-mns_2.11 112 | ${emr.version} 113 | 114 | 115 | com.aliyun.mns 116 | aliyun-sdk-mns 117 | 118 | 119 | 120 | 121 | 122 | com.aliyun.emr 123 | emr-logservice_2.11 124 | ${emr.version} 125 | 126 | 127 | 128 | com.aliyun.openservices 129 | aliyun-log 130 | 0.6.60 131 | 132 | 133 | 134 | 135 | com.aliyun.emr 136 | emr-maxcompute_2.11 137 | ${emr.version} 138 | 139 | 140 | 141 | com.aliyun.emr 142 | emr-ons_2.11 143 | ${emr.version} 144 | 145 | 146 | 147 | com.aliyun.emr 148 | emr-dts_2.11 149 | ${emr.version} 150 | 151 | 152 | 153 | com.aliyun.oss 154 | aliyun-sdk-oss 155 | ${oss.sdk.version} 156 | 157 | 158 | 159 | com.aliyun.odps 160 | odps-sdk-core 161 | ${odps.version} 162 | 163 | 164 | org.codehaus.jackson 165 | jackson-mapper-asl 166 | 167 | 168 | org.codehaus.jackson 169 | jackson-core-asl 170 | 171 | 172 | 173 | 174 | 175 | com.aliyun.odps 176 | odps-sdk-commons 177 | ${odps.version} 178 | 179 | 180 | 181 | com.aliyun.openservices 182 | loghub-client-lib 183 | ${loghubb.client.version} 184 | 185 | 186 | 187 | com.aliyun.openservices 188 | ons-client 189 | ${ons.version} 190 | 191 | 192 | 193 | com.aliyun.openservices 194 | ons-api 195 | ${ons.version} 196 | 197 | 198 | 199 | com.alibaba.rocketmq 200 | rocketmq-client 201 | ${rocketmq.version} 202 | 203 | 204 | 205 | com.alibaba.rocketmq 206 | rocketmq-common 207 | ${rocketmq.version} 208 | 209 | 210 | 211 | com.alibaba.rocketmq 212 | rocketmq-remoting 213 | ${rocketmq.version} 214 | 215 | 216 | 217 | org.apache.hadoop 218 | hadoop-mapreduce-client-core 219 | ${hadoop.version} 220 | 221 | 222 | jdk.tools 223 | jdk.tools 224 | 225 | 226 | 227 | 228 | 229 | org.aspectj 230 | aspectjrt 231 | ${aspectjrt.version} 232 | 233 | 234 | 235 | 
com.github.scopt 236 | scopt_2.10 237 | ${scopt.version} 238 | 239 | 240 | 241 | com.alibaba 242 | fastjson 243 | ${fastjson.version} 244 | 245 | 246 | 247 | com.google.guava 248 | guava 249 | ${guava.version} 250 | 251 | 252 | 253 | mysql 254 | mysql-connector-java 255 | ${mysql.connector.version} 256 | 257 | 258 | 259 | com.stratio.datasource 260 | spark-mongodb_2.10 261 | ${mongodb.version} 262 | 263 | 264 | 265 | redis.clients 266 | jedis 267 | ${redis.clients.version} 268 | 269 | 270 | 271 | org.apache.commons 272 | commons-pool2 273 | ${commons.pool2.version} 274 | 275 | 276 | 277 | org.apache.hbase 278 | hbase-client 279 | ${hbase.version} 280 | 281 | 282 | jdk.tools 283 | jdk.tools 284 | 285 | 286 | org.apache.hadoop 287 | hadoop-mapreduce-client-core 288 | 289 | 290 | 291 | 292 | 293 | org.apache.hbase 294 | hbase-common 295 | ${hbase.version} 296 | 297 | 298 | 299 | org.apache.hbase 300 | hbase-protocol 301 | ${hbase.version} 302 | 303 | 304 | 305 | com.aliyun.mns 306 | aliyun-sdk-mns 307 | 1.1.8.8 308 | 309 | 310 | 311 | org.apache.httpcomponents 312 | httpasyncclient 313 | 4.1 314 | 315 | 316 | 317 | org.apache.httpcomponents 318 | httpcore-nio 319 | 4.4.1 320 | 321 | 322 | 323 | org.apache.httpcomponents 324 | httpcore 325 | 4.4.1 326 | 327 | 328 | 329 | org.apache.kafka 330 | kafka_2.11 331 | 0.10.0.1 332 | 333 | 334 | 335 | org.apache.kafka 336 | kafka-clients 337 | 0.10.0.1 338 | 339 | 340 | 341 | com.aliyun.dts 342 | dts-subscribe-sdk 343 | ${dts.version} 344 | 345 | 346 | 347 | org.apache.commons 348 | commons-lang3 349 | ${commons.lang3.version} 350 | 351 | 352 | 353 | org.apache.storm 354 | storm-core 355 | 1.1.2 356 | 357 | 358 | org.slf4j 359 | log4j-over-slf4j 360 | 361 | 362 | 363 | 364 | 365 | org.apache.storm 366 | storm-kafka 367 | 1.1.2 368 | 369 | 370 | 371 | org.apache.storm 372 | storm-hdfs 373 | 1.1.2 374 | 375 | 376 | 377 | org.apache.storm 378 | storm-perf 379 | 1.1.2 380 | 381 | 382 | 383 | org.apache.hadoop 384 | hadoop-hdfs 385 | 2.6.1 386 | 387 | 388 | 389 | org.apache.hadoop 390 | hadoop-common 391 | 2.6.1 392 | 393 | 394 | 395 | javax.mail 396 | mail 397 | 1.4.7 398 | 399 | 400 | 401 | com.aliyun.emr 402 | emr-datahub_2.11 403 | 2.2.0 404 | 405 | 406 | 407 | com.squareup.okhttp3 408 | okhttp 409 | 3.12.0 410 | 411 | 412 | 413 | com.aliyun.datahub 414 | aliyun-sdk-datahub 415 | 2.13.0-public 416 | 417 | 418 | 419 | org.apache.htrace 420 | htrace-core 421 | 3.1.0-incubating 422 | 423 | 424 | 425 | 426 | 427 | target/classes 428 | target/test-classes 429 | 430 | 431 | maven-compiler-plugin 432 | 433 | 1.8 434 | 1.8 435 | UTF-8 436 | 437 | 438 | 439 | net.alchim31.maven 440 | scala-maven-plugin 441 | 4.0.1 442 | 443 | 444 | scala-compile-first 445 | process-resources 446 | 447 | compile 448 | 449 | 450 | 451 | scala-test-compile-first 452 | process-test-resources 453 | 454 | testCompile 455 | 456 | 457 | 458 | attach-scaladocs 459 | verify 460 | 461 | doc-jar 462 | 463 | 464 | 465 | 466 | 467 | org.apache.maven.plugins 468 | maven-shade-plugin 469 | 2.4.2 470 | 471 | false 472 | ${project.build.directory}/shaded/examples-${project.version}-shaded.jar 473 | 474 | 475 | javax.mail:mail 476 | org.apache.htrace:htrace-core 477 | com.squareup.okhttp3:okhttp 478 | com.squareup.okio:okio 479 | com.squareup.okhttp3:logging-interceptor 480 | com.squareup.retrofit2:converter-jackson 481 | com.squareup.retrofit2:retrofit 482 | com.aliyun.openservices:aliyun-sls-v0.6-inner 483 | com.aliyun.datahub:aliyun-sdk-datahub 484 | com.aliyun.emr:emr-datahub_2.11 485 | 
com.aliyun.emr:emr-tablestore 486 | com.aliyun.emr:emr-mns_2.11 487 | com.aliyun.emr:emr-logservice_2.11 488 | com.aliyun.emr:emr-maxcompute_2.11 489 | com.aliyun.emr:emr-ons_2.11 490 | com.aliyun.emr:emr-dts_2.11 491 | com.aliyun.odps:odps-sdk-core 492 | com.aliyun.odps:odps-sdk-commons 493 | com.aliyun.oss:aliyun-sdk-oss 494 | com.aliyun.openservices:aliyun-log 495 | com.aliyun.openservices:loghub-client-lib 496 | com.aliyun.openservices:ons-client 497 | com.aliyun.openservices:ons-api 498 | com.aliyun.mns:aliyun-sdk-mns 499 | com.aliyun.openservices:tablestore 500 | com.alibaba.rocketmq:rocketmq-client 501 | com.alibaba.rocketmq:rocketmq-common 502 | com.alibaba.rocketmq:rocketmq-remoting 503 | com.alibaba:fastjson 504 | com.google.guava:guava 505 | org.aspectj:aspectjrt 506 | com.github.scopt:scopt_2.10 507 | org.jdom:jdom 508 | net.sf.json-lib:json-lib 509 | net.sf.ezmorph:ezmorph 510 | commons-validator:commons-validator 511 | mysql:mysql-connector-java 512 | com.stratio.datasource:spark-mongodb_2.10 513 | redis.clients:jedis 514 | org.apache.commons:commons-pool2 515 | org.apache.hbase:hbase-common 516 | org.apache.hbase:hbase-client 517 | org.apache.hbase:hbase-protocol 518 | org.apache.httpcomponents:httpasyncclient 519 | org.apache.httpcomponents:httpcore-nio 520 | org.apache.httpcomponents:httpcore 521 | org.apache.spark:spark-streaming-kafka-0-10_2.11 522 | org.apache.kafka:kafka-clients 523 | org.apache.kafka:kafka_2.11 524 | org.apache.storm:storm-kafka 525 | org.apache.storm:storm-hdfs 526 | org.apache.storm:storm-perf 527 | commons-lang:commons-lang 528 | org.apache.hadoop:hadoop-hdfs 529 | org.apache.hadoop:hadoop-common 530 | com.101tec:zkclient 531 | com.aliyun.dts:dts-subscribe-sdk 532 | org.apache.commons:commons-lang3 533 | 534 | 535 | 536 | 537 | 538 | package 539 | 540 | shade 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | central 551 | http://maven.aliyun.com/mvn/repository 552 | 553 | true 554 | 555 | 556 | false 557 | 558 | 559 | 560 | snapshots 561 | http://maven.aliyun.com/mvn/repository 562 | 563 | false 564 | 565 | 566 | true 567 | 568 | 569 | 570 | oss 571 | Maven SNAPSHOT Repository 572 | https://oss.sonatype.org/content/repositories/snapshots/ 573 | 574 | false 575 | 576 | 577 | true 578 | 579 | 580 | 581 | 582 | -------------------------------------------------------------------------------- /resources/The_Sorrows_of_Young_Werther.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/resources/The_Sorrows_of_Young_Werther.txt -------------------------------------------------------------------------------- /resources/patterns.txt: -------------------------------------------------------------------------------- 1 | \. 2 | \, 3 | \! 
4 | to 5 | \" -------------------------------------------------------------------------------- /resources/student_data.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-emapreduce-demo/e31d55f524df065e94712998d1c6362de038bee2/resources/student_data.csv -------------------------------------------------------------------------------- /src/main/hive/sample.hive: -------------------------------------------------------------------------------- 1 | USE DEFAULT; 2 | set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; 3 | set mapreduce.job.maps=2; 4 | set mapreduce.job.reduces=2; 5 | set hive.stats.autogather=false; 6 | 7 | DROP TABLE emrusers; 8 | CREATE EXTERNAL TABLE emrusers ( 9 | userid INT, 10 | movieid INT, 11 | rating INT, 12 | unixtime STRING ) 13 | ROW FORMAT DELIMITED 14 | FIELDS TERMINATED BY '\t' 15 | LOCATION '${hiveconf:inputPath}'; 16 | 17 | SELECT COUNT(*) FROM emrusers; 18 | 19 | SELECT * from emrusers limit 100; 20 | 21 | SELECT movieid,count(userid) as usercount from emrusers group by movieid order by usercount desc limit 50; -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/hadoop/EMapReduceOSSUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.aliyun.emr.example.hadoop; 19 | 20 | import org.apache.hadoop.conf.Configuration; 21 | 22 | public class EMapReduceOSSUtil { 23 | 24 | private static String SCHEMA = "oss://"; 25 | private static String AKSEP = ":"; 26 | private static String BKTSEP = "@"; 27 | private static String EPSEP = "."; 28 | private static String HTTP_HEADER = "http://"; 29 | 30 | /** 31 | * complete OSS uri 32 | * convert uri like: oss://bucket/path to oss://accessKeyId:accessKeySecret@bucket.endpoint/path 33 | * ossref do not need this 34 | * 35 | * @param oriUri original OSS uri 36 | */ 37 | public static String buildOSSCompleteUri(String oriUri, String akId, String akSecret, String endpoint) { 38 | if (akId == null) { 39 | System.err.println("miss accessKeyId"); 40 | return oriUri; 41 | } 42 | if (akSecret == null) { 43 | System.err.println("miss accessKeySecret"); 44 | return oriUri; 45 | } 46 | if (endpoint == null) { 47 | System.err.println("miss endpoint"); 48 | return oriUri; 49 | } 50 | 51 | int index = oriUri.indexOf(SCHEMA); 52 | if (index == -1 || index != 0) { 53 | return oriUri; 54 | } 55 | 56 | int bucketIndex = index + SCHEMA.length(); 57 | int pathIndex = oriUri.indexOf("/", bucketIndex); 58 | String bucket = null; 59 | if (pathIndex == -1) { 60 | bucket = oriUri.substring(bucketIndex); 61 | } else { 62 | bucket = oriUri.substring(bucketIndex, pathIndex); 63 | } 64 | 65 | StringBuilder retUri = new StringBuilder(); 66 | retUri.append(SCHEMA) 67 | .append(akId) 68 | .append(AKSEP) 69 | .append(akSecret) 70 | .append(BKTSEP) 71 | .append(bucket) 72 | .append(EPSEP) 73 | .append(stripHttp(endpoint)); 74 | 75 | if (pathIndex > 0) { 76 | retUri.append(oriUri.substring(pathIndex)); 77 | } 78 | 79 | return retUri.toString(); 80 | } 81 | 82 | public static String buildOSSCompleteUri(String oriUri, Configuration conf) { 83 | return buildOSSCompleteUri(oriUri, conf.get("fs.oss.accessKeyId"), conf.get("fs.oss.accessKeySecret"), conf.get("fs.oss.endpoint")); 84 | } 85 | 86 | private static String stripHttp(String endpoint) { 87 | if (endpoint.startsWith(HTTP_HEADER)) { 88 | return endpoint.substring(HTTP_HEADER.length()); 89 | } 90 | return endpoint; 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/hadoop/WordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.aliyun.emr.example.hadoop; 19 | 20 | import java.io.BufferedReader; 21 | import java.io.FileReader; 22 | import java.io.IOException; 23 | import java.net.URI; 24 | import java.util.ArrayList; 25 | import java.util.HashSet; 26 | import java.util.List; 27 | import java.util.Set; 28 | import java.util.StringTokenizer; 29 | 30 | import org.apache.hadoop.conf.Configuration; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.hadoop.io.IntWritable; 33 | import org.apache.hadoop.io.Text; 34 | import org.apache.hadoop.mapreduce.Job; 35 | import org.apache.hadoop.mapreduce.Mapper; 36 | import org.apache.hadoop.mapreduce.Reducer; 37 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 38 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 39 | import org.apache.hadoop.mapreduce.Counter; 40 | import org.apache.hadoop.util.GenericOptionsParser; 41 | import org.apache.hadoop.util.StringUtils; 42 | 43 | public class WordCount { 44 | 45 | public static class TokenizerMapper 46 | extends Mapper{ 47 | 48 | static enum CountersEnum { INPUT_WORDS } 49 | 50 | private final static IntWritable one = new IntWritable(1); 51 | private Text word = new Text(); 52 | 53 | private boolean caseSensitive; 54 | private Set patternsToSkip = new HashSet(); 55 | 56 | private Configuration conf; 57 | private BufferedReader fis; 58 | 59 | @Override 60 | public void setup(Context context) throws IOException, 61 | InterruptedException { 62 | conf = context.getConfiguration(); 63 | caseSensitive = conf.getBoolean("wordcount.case.sensitive", true); 64 | if (conf.getBoolean("wordcount.skip.patterns", false)) { 65 | URI[] patternsURIs = Job.getInstance(conf).getCacheFiles(); 66 | for (URI patternsURI : patternsURIs) { 67 | Path patternsPath = new Path(patternsURI.getPath()); 68 | String patternsFileName = patternsPath.getName(); 69 | parseSkipFile(patternsFileName); 70 | } 71 | } 72 | } 73 | 74 | private void parseSkipFile(String fileName) { 75 | try { 76 | fis = new BufferedReader(new FileReader(fileName)); 77 | String pattern; 78 | while ((pattern = fis.readLine()) != null) { 79 | patternsToSkip.add(pattern); 80 | } 81 | } catch (IOException ioe) { 82 | System.err.println("Caught exception while parsing the cached file '" 83 | + StringUtils.stringifyException(ioe)); 84 | } 85 | } 86 | 87 | @Override 88 | public void map(Object key, Text value, Context context 89 | ) throws IOException, InterruptedException { 90 | String line = (caseSensitive) ? 
91 | value.toString() : value.toString().toLowerCase(); 92 | for (String pattern : patternsToSkip) { 93 | line = line.replaceAll(pattern, ""); 94 | } 95 | StringTokenizer itr = new StringTokenizer(line); 96 | while (itr.hasMoreTokens()) { 97 | word.set(itr.nextToken()); 98 | context.write(word, one); 99 | Counter counter = context.getCounter(CountersEnum.class.getName(), 100 | CountersEnum.INPUT_WORDS.toString()); 101 | counter.increment(1); 102 | } 103 | } 104 | } 105 | 106 | public static class IntSumReducer 107 | extends Reducer { 108 | private IntWritable result = new IntWritable(); 109 | 110 | public void reduce(Text key, Iterable values, 111 | Context context 112 | ) throws IOException, InterruptedException { 113 | int sum = 0; 114 | for (IntWritable val : values) { 115 | sum += val.get(); 116 | } 117 | result.set(sum); 118 | context.write(key, result); 119 | } 120 | } 121 | 122 | public static void main(String[] args) throws Exception { 123 | Configuration conf = new Configuration(); 124 | GenericOptionsParser optionParser = new GenericOptionsParser(conf, args); 125 | String[] remainingArgs = optionParser.getRemainingArgs(); 126 | if (!(remainingArgs.length == 2 || remainingArgs.length == 4)) { 127 | System.err.println("Usage: wordcount [-skip skipPatternFile]"); 128 | System.exit(2); 129 | } 130 | Job job = Job.getInstance(conf, "word count"); 131 | job.setJarByClass(WordCount.class); 132 | job.setMapperClass(TokenizerMapper.class); 133 | job.setCombinerClass(IntSumReducer.class); 134 | job.setReducerClass(IntSumReducer.class); 135 | job.setOutputKeyClass(Text.class); 136 | job.setOutputValueClass(IntWritable.class); 137 | 138 | List otherArgs = new ArrayList(); 139 | for (int i=0; i < remainingArgs.length; ++i) { 140 | if ("-skip".equals(remainingArgs[i])) { 141 | job.addCacheFile(new Path(EMapReduceOSSUtil.buildOSSCompleteUri(remainingArgs[++i], conf)).toUri()); 142 | job.getConfiguration().setBoolean("wordcount.skip.patterns", true); 143 | } else { 144 | otherArgs.add(remainingArgs[i]); 145 | } 146 | } 147 | FileInputFormat.addInputPath(job, new Path(EMapReduceOSSUtil.buildOSSCompleteUri(otherArgs.get(0), conf))); 148 | FileOutputFormat.setOutputPath(job, new Path(EMapReduceOSSUtil.buildOSSCompleteUri(otherArgs.get(1), conf))); 149 | 150 | System.exit(job.waitForCompletion(true) ? 0 : 1); 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/spark/SparkMaxComputeJavaDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.aliyun.emr.example.spark; 19 | 20 | import com.aliyun.odps.TableSchema; 21 | import com.aliyun.odps.data.Record; 22 | import org.apache.spark.SparkConf; 23 | import org.apache.spark.aliyun.odps.OdpsOps; 24 | import org.apache.spark.api.java.JavaRDD; 25 | import org.apache.spark.api.java.JavaSparkContext; 26 | import org.apache.spark.api.java.function.Function2; 27 | 28 | import java.util.ArrayList; 29 | import java.util.List; 30 | 31 | public class SparkMaxComputeJavaDemo { 32 | 33 | public static void main(String[] args) { 34 | String partition = null; 35 | String accessId = args[0]; 36 | String accessKey = args[1]; 37 | 38 | String odpsUrl = args[2]; 39 | 40 | String tunnelUrl = args[3]; 41 | String project = args[4]; 42 | String table = args[5]; 43 | if (args.length > 6) { 44 | partition = args[6]; 45 | } 46 | 47 | SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 3-2: Spark MaxCompute Demo (Java)"); 48 | JavaSparkContext jsc = new JavaSparkContext(sparkConf); 49 | 50 | OdpsOps odpsOps = new OdpsOps(jsc.sc(), accessId, accessKey, odpsUrl, tunnelUrl); 51 | 52 | System.out.println("Read odps table..."); 53 | JavaRDD> readData = odpsOps.readTableWithJava(project, table, new RecordToLongs(), Integer.valueOf(partition)); 54 | 55 | System.out.println("counts: "); 56 | System.out.println(readData.count()); 57 | } 58 | 59 | static class RecordToLongs implements Function2> { 60 | @Override 61 | public List call(Record record, TableSchema schema) throws Exception { 62 | List ret = new ArrayList(); 63 | for (int i = 0; i < schema.getColumns().size(); i++) { 64 | ret.add(record.getBigint(i)); 65 | } 66 | return ret; 67 | } 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/spark/SparkOssJavaDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.aliyun.emr.example.spark; 19 | 20 | import org.apache.hadoop.io.LongWritable; 21 | import org.apache.hadoop.io.Text; 22 | import org.apache.hadoop.mapred.TextInputFormat; 23 | import org.apache.spark.SparkConf; 24 | import org.apache.spark.api.java.JavaPairRDD; 25 | import org.apache.spark.api.java.JavaSparkContext; 26 | 27 | public class SparkOssJavaDemo { 28 | 29 | public static void main(String[] args) { 30 | 31 | String accessId = args[0]; 32 | String accessKey = args[1]; 33 | 34 | String endpoint = args[2]; 35 | 36 | String inputPath = args[3]; 37 | String outputPath = args[4]; 38 | int partition = Integer.valueOf(args[5]); 39 | 40 | SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 2-2: Spark Oss Demo (Java)").setMaster("local[4]"); 41 | sparkConf.set("spark.hadoop.fs.oss.accessKeyId", accessId); 42 | sparkConf.set("spark.hadoop.fs.oss.accessKeySecret", accessKey); 43 | sparkConf.set("spark.hadoop.fs.oss.endpoint", endpoint); 44 | sparkConf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem"); 45 | sparkConf.set("spark.hadoop.mapreduce.job.run-local", "true"); 46 | JavaSparkContext jsc = new JavaSparkContext(sparkConf); 47 | 48 | JavaPairRDD data = jsc.hadoopFile(inputPath, TextInputFormat.class, LongWritable.class, Text.class, partition); 49 | 50 | System.out.println("Count (data): " + String.valueOf(data.count())); 51 | 52 | data.saveAsTextFile(outputPath); 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/spark/SparkTableStoreJavaDemo.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.spark; 2 | 3 | import com.alicloud.openservices.tablestore.ecosystem.ComputeParameters; 4 | import com.alicloud.openservices.tablestore.ecosystem.Filter; 5 | import com.alicloud.openservices.tablestore.model.*; 6 | import com.aliyun.openservices.tablestore.hadoop.*; 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.spark.SparkConf; 9 | import org.apache.spark.api.java.JavaPairRDD; 10 | import org.apache.spark.api.java.JavaSparkContext; 11 | 12 | import java.util.ArrayList; 13 | import java.util.Formatter; 14 | import java.util.List; 15 | 16 | public class SparkTableStoreJavaDemo { 17 | private static RangeRowQueryCriteria fetchCriteria(String tableName, String columnName) { 18 | RangeRowQueryCriteria res = new RangeRowQueryCriteria(tableName); 19 | res.setMaxVersions(1); 20 | List lower = new ArrayList(); 21 | List upper = new ArrayList(); 22 | lower.add(new PrimaryKeyColumn(columnName, PrimaryKeyValue.INF_MIN)); 23 | upper.add(new PrimaryKeyColumn(columnName, PrimaryKeyValue.INF_MAX)); 24 | res.setInclusiveStartPrimaryKey(new PrimaryKey(lower)); 25 | res.setExclusiveEndPrimaryKey(new PrimaryKey(upper)); 26 | return res; 27 | } 28 | 29 | public static void main(String[] args) { 30 | String accessKeyId = args[0]; 31 | String accessKeySecret = args[1]; 32 | Filter filter = new Filter(Filter.CompareOperator.GREATER_THAN,"PK", ColumnValue.fromLong(-1000)); 33 | List list = new ArrayList<>(); 34 | list.add("VALUE"); 35 | TableStoreFilterWritable tableStoreFilterWritable = new TableStoreFilterWritable(filter, list); 36 | 37 | String endpoint = args[2]; 38 | String instance = args[3]; 39 | String tableName = args[4]; 40 | String primaryKeyColumnName = args[5]; 41 | ComputeParams computeParams = new ComputeParams(100, 1, 
ComputeParameters.ComputeMode.Auto.name()); 42 | SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 5: Spark TableStore Demo (Java)"); 43 | JavaSparkContext sc = null; 44 | try { 45 | sc = new JavaSparkContext(sparkConf); 46 | Configuration hadoopConf = new Configuration(); 47 | hadoopConf.set("computeParams", computeParams.serialize()); 48 | hadoopConf.set("tableName", tableName); 49 | hadoopConf.set("filters", tableStoreFilterWritable.serialize()); 50 | TableStore.setCredential( 51 | hadoopConf, 52 | new Credential(accessKeyId, accessKeySecret, null)); 53 | Endpoint ep = new Endpoint(endpoint, instance); 54 | TableStore.setEndpoint(hadoopConf, ep); 55 | com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.addCriteria(hadoopConf, 56 | fetchCriteria(tableName, primaryKeyColumnName)); 57 | JavaPairRDD rdd = sc.newAPIHadoopRDD( 58 | hadoopConf, com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.class, 59 | PrimaryKeyWritable.class, RowWritable.class); 60 | System.out.println( 61 | new Formatter().format("TOTAL: %d", rdd.count()).toString()); 62 | rdd.take(10).forEach((primaryKeyWritableRowWritableTuple2) -> { 63 | System.out.println(String.format("Key: %s, VALUE: %s", 64 | primaryKeyWritableRowWritableTuple2._1.getPrimaryKey().toString(), 65 | primaryKeyWritableRowWritableTuple2._2.getRow().toString())); 66 | }); 67 | } finally { 68 | if (sc != null) { 69 | sc.close(); 70 | } 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/spark/sql/streaming/SparkSLSContinuousStructuredStreamingJavaDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package com.aliyun.emr.example.spark.sql.streaming; 18 | 19 | import org.apache.spark.sql.Dataset; 20 | import org.apache.spark.sql.Encoders; 21 | import org.apache.spark.sql.SparkSession; 22 | import org.apache.spark.sql.streaming.StreamingQuery; 23 | import org.apache.spark.sql.streaming.Trigger; 24 | 25 | import java.util.UUID; 26 | 27 | public class SparkSLSContinuousStructuredStreamingJavaDemo { 28 | 29 | public static void main(String[] args) throws Exception { 30 | if (args.length < 7) { 31 | System.err.println("Usage: SparkSLSContinuousStructuredStreamingJavaDemo " + 32 | " " + 33 | " []"); 34 | System.exit(1); 35 | } 36 | 37 | String logProject = args[0]; 38 | String logStore = args[1]; 39 | String accessKeyId = args[2]; 40 | String accessKeySecret = args[3]; 41 | String endpoint = args[4]; 42 | String startingOffsets = args[5]; 43 | String maxOffsetsPerTrigger = args[6]; 44 | String checkpointLocation = "/tmp/temporary-" + UUID.randomUUID().toString(); 45 | if (args.length > 7) { 46 | checkpointLocation = args[7]; 47 | } 48 | 49 | SparkSession spark = SparkSession 50 | .builder() 51 | .master("local[5]") 52 | .appName("E-MapReduce Demo 6-6: Spark SLS Demo (Java)") 53 | .getOrCreate(); 54 | 55 | spark.sparkContext().setLogLevel("WARN"); 56 | 57 | Dataset lines = spark.readStream() 58 | .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider") 59 | .option("sls.project", logProject) 60 | .option("sls.store", logStore) 61 | .option("access.key.id", accessKeyId) 62 | .option("access.key.secret", accessKeySecret) 63 | .option("endpoint", endpoint) 64 | .option("startingoffsets", startingOffsets) 65 | .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger) 66 | .load() 67 | .selectExpr("CAST(__value__ AS STRING)") 68 | .as(Encoders.STRING()); 69 | 70 | // Start running the query that prints the running counts to the console 71 | StreamingQuery query = lines.writeStream() 72 | .outputMode("append") 73 | .format("console") 74 | .option("checkpointLocation", checkpointLocation) 75 | .trigger(Trigger.Continuous("5 second")) 76 | .start(); 77 | 78 | query.awaitTermination(); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/spark/sql/streaming/SparkSLSStructuredStreamingJavaDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package com.aliyun.emr.example.spark.sql.streaming; 18 | 19 | import org.apache.spark.api.java.function.FlatMapFunction; 20 | import org.apache.spark.sql.Dataset; 21 | import org.apache.spark.sql.Encoders; 22 | import org.apache.spark.sql.Row; 23 | import org.apache.spark.sql.SparkSession; 24 | import org.apache.spark.sql.streaming.StreamingQuery; 25 | 26 | import java.util.Arrays; 27 | import java.util.UUID; 28 | 29 | public class SparkSLSStructuredStreamingJavaDemo { 30 | 31 | public static void main(String[] args) throws Exception { 32 | if (args.length < 7) { 33 | System.err.println("Usage: SparkSLSStructuredStreamingJavaDemo " + 34 | " " + 35 | " []"); 36 | System.exit(1); 37 | } 38 | 39 | String logProject = args[0]; 40 | String logStore = args[1]; 41 | String accessKeyId = args[2]; 42 | String accessKeySecret = args[3]; 43 | String endpoint = args[4]; 44 | String startingOffsets = args[5]; 45 | String maxOffsetsPerTrigger = args[6]; 46 | String checkpointLocation = "/tmp/temporary-" + UUID.randomUUID().toString(); 47 | if (args.length > 7) { 48 | checkpointLocation = args[7]; 49 | } 50 | 51 | SparkSession spark = SparkSession 52 | .builder() 53 | .master("local[5]") 54 | .appName("E-MapReduce Demo 6-4: Spark SLS Demo (Java)") 55 | .getOrCreate(); 56 | 57 | spark.sparkContext().setLogLevel("WARN"); 58 | 59 | Dataset lines = spark.readStream() 60 | .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider") 61 | .option("sls.project", logProject) 62 | .option("sls.store", logStore) 63 | .option("access.key.id", accessKeyId) 64 | .option("access.key.secret", accessKeySecret) 65 | .option("endpoint", endpoint) 66 | .option("startingoffsets", startingOffsets) 67 | .option("zookeeper.connect.address", "localhost:2181") 68 | .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger) 69 | .load() 70 | .selectExpr("CAST(__value__ AS STRING)") 71 | .as(Encoders.STRING()); 72 | 73 | // Generate running word count 74 | Dataset wordCounts = lines.flatMap( 75 | (FlatMapFunction) x -> Arrays.asList(x.split(" ")).iterator(), 76 | Encoders.STRING()).groupBy("value").count(); 77 | 78 | // Start running the query that prints the running counts to the console 79 | StreamingQuery query = wordCounts.writeStream() 80 | .outputMode("complete") 81 | .format("console") 82 | .option("checkpointLocation", checkpointLocation) 83 | .start(); 84 | 85 | query.awaitTermination(); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/spark/streaming/JavaLoghubWordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming; 19 | 20 | import org.apache.spark.SparkConf; 21 | import org.apache.spark.api.java.function.FlatMapFunction; 22 | import org.apache.spark.api.java.function.Function; 23 | import org.apache.spark.api.java.function.Function2; 24 | import org.apache.spark.api.java.function.PairFunction; 25 | import org.apache.spark.storage.StorageLevel; 26 | import org.apache.spark.streaming.Duration; 27 | import org.apache.spark.streaming.aliyun.logservice.LoghubUtils; 28 | import org.apache.spark.streaming.api.java.JavaDStream; 29 | import org.apache.spark.streaming.api.java.JavaPairDStream; 30 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 31 | import scala.Tuple2; 32 | 33 | import java.util.Arrays; 34 | import java.util.Iterator; 35 | import java.util.regex.Pattern; 36 | 37 | public class JavaLoghubWordCount { 38 | private static final Pattern SPACE = Pattern.compile(" "); 39 | 40 | public static void main(String[] args) throws InterruptedException { 41 | if (args.length < 6) { 42 | System.err.println("Usage: bin/spark-submit --class JavaLoghubWordCount " + 43 | "examples-1.0-SNAPSHOT-shaded.jar " + 44 | " "); 45 | System.exit(1); 46 | } 47 | 48 | String loghubProject = args[0]; 49 | String logStore = args[1]; 50 | String loghubGroupName = args[2]; 51 | String endpoint = args[3]; 52 | String accessKeyId = args[4]; 53 | String accessKeySecret = args[5]; 54 | 55 | SparkConf conf = new SparkConf().setAppName("Loghub Sample"); 56 | JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(2000)); 57 | JavaDStream lines = LoghubUtils.createStream( 58 | jssc, 59 | loghubProject, 60 | logStore, 61 | loghubGroupName, 62 | endpoint, 63 | 1, 64 | accessKeyId, 65 | accessKeySecret, 66 | StorageLevel.MEMORY_AND_DISK()); 67 | 68 | JavaDStream words = lines.map(new Function() { 69 | @Override 70 | public String call(byte[] v1) throws Exception { 71 | return new String(v1); 72 | } 73 | }).flatMap(new FlatMapFunction() { 74 | @Override 75 | public Iterator call(String s) { 76 | return Arrays.asList(SPACE.split(s)).iterator(); 77 | } 78 | }); 79 | JavaPairDStream wordCounts = words.mapToPair( 80 | new PairFunction() { 81 | @Override 82 | public Tuple2 call(String s) { 83 | return new Tuple2(s, 1); 84 | } 85 | }).reduceByKey(new Function2() { 86 | @Override 87 | public Integer call(Integer i1, Integer i2) { 88 | return i1 + i2; 89 | } 90 | }); 91 | 92 | wordCounts.print(); 93 | jssc.start(); 94 | jssc.awaitTermination(); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/spark/streaming/SparkMNSJavaDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming; 19 | 20 | import com.google.common.collect.Lists; 21 | import org.apache.spark.SparkConf; 22 | import org.apache.spark.api.java.function.FlatMapFunction; 23 | import org.apache.spark.api.java.function.Function; 24 | import org.apache.spark.api.java.function.Function2; 25 | import org.apache.spark.api.java.function.PairFunction; 26 | import org.apache.spark.storage.StorageLevel; 27 | import org.apache.spark.streaming.Duration; 28 | import org.apache.spark.streaming.aliyun.mns.MnsUtils; 29 | import org.apache.spark.streaming.api.java.*; 30 | import scala.Tuple2; 31 | 32 | import java.util.Iterator; 33 | import java.util.regex.Pattern; 34 | 35 | public class SparkMNSJavaDemo { 36 | private static final Pattern SPACE = Pattern.compile(" "); 37 | 38 | public static void main(String[] args) throws InterruptedException { 39 | if (args.length < 4) { 40 | System.err.println("Usage: bin/spark-submit --class SparkMNSJavaDemo examples-1.0-SNAPSHOT-shaded.jar " + 41 | " "); 42 | System.exit(1); 43 | } 44 | 45 | String queueName = args[0]; 46 | String accessKeyId = args[1]; 47 | String accessKeySecret = args[2]; 48 | String endpoint = args[3]; 49 | 50 | SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 8-2: Spark MNS Demo (Java)").setMaster("local[4]"); 51 | sparkConf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem"); 52 | sparkConf.set("spark.hadoop.mapreduce.job.run-local", "true"); 53 | // Create the context with 2 seconds batch size 54 | JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000)); 55 | 56 | JavaReceiverInputDStream lines = MnsUtils.createPullingStreamAsBytes(jssc, queueName, accessKeyId, 57 | accessKeySecret, endpoint, StorageLevel.MEMORY_AND_DISK()); 58 | 59 | JavaDStream words = lines.map(new Function() { 60 | @Override 61 | public String call(byte[] v1) throws Exception { 62 | return new String(v1); 63 | } 64 | }).flatMap(new FlatMapFunction() { 65 | @Override 66 | public Iterator call(String x) { 67 | return Lists.newArrayList(SPACE.split(x)).iterator(); 68 | } 69 | }); 70 | JavaPairDStream wordCounts = words.mapToPair( 71 | new PairFunction() { 72 | @Override 73 | public Tuple2 call(String s) { 74 | return new Tuple2(s, 1); 75 | } 76 | }).reduceByKey(new Function2() { 77 | @Override 78 | public Integer call(Integer i1, Integer i2) { 79 | return i1 + i2; 80 | } 81 | }); 82 | 83 | wordCounts.print(); 84 | jssc.start(); 85 | jssc.awaitTermination(); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/spark/streaming/SparkRocketMQJavaDemo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming; 19 | 20 | import com.aliyun.openservices.ons.api.Message; 21 | import com.google.common.collect.Lists; 22 | import org.apache.spark.SparkConf; 23 | import org.apache.spark.api.java.function.FlatMapFunction; 24 | import org.apache.spark.api.java.function.Function; 25 | import org.apache.spark.api.java.function.Function2; 26 | import org.apache.spark.api.java.function.PairFunction; 27 | import org.apache.spark.storage.StorageLevel; 28 | import org.apache.spark.streaming.Duration; 29 | import org.apache.spark.streaming.aliyun.ons.OnsUtils; 30 | import org.apache.spark.streaming.api.java.JavaDStream; 31 | import org.apache.spark.streaming.api.java.JavaPairDStream; 32 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 33 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 34 | import scala.Tuple2; 35 | 36 | import java.util.Iterator; 37 | import java.util.regex.Pattern; 38 | 39 | public class SparkRocketMQJavaDemo { 40 | private static final Pattern SPACE = Pattern.compile(" "); 41 | 42 | public static void main(String[] args) throws InterruptedException { 43 | if (args.length < 5) { 44 | System.err.println("Usage: spark-submit --class com.aliyun.emr.example.spark.streaming.SparkRocketMQJavaDemo " + 45 | "examples-1.0-SNAPSHOT-shaded.jar " + 46 | " "); 47 | System.exit(1); 48 | } 49 | 50 | String accessKeyId = args[0]; 51 | String accessKeySecret = args[1]; 52 | String consumerId = args[2]; 53 | String topic = args[3]; 54 | String subExpression = args[4]; 55 | 56 | SparkConf sparkConf = new SparkConf().setAppName("E-MapReduce Demo 4-2: Spark RocketMQ Demo (Java)"); 57 | // Create the context with 2 seconds batch size 58 | JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000)); 59 | 60 | JavaReceiverInputDStream lines = OnsUtils.createStream(jssc, consumerId, topic, subExpression, 61 | accessKeyId, accessKeySecret, StorageLevel.MEMORY_AND_DISK(), new Function() { 62 | @Override 63 | public byte[] call(Message msg) throws Exception { 64 | return msg.getBody(); 65 | } 66 | }); 67 | 68 | JavaDStream words = lines.map(new Function() { 69 | @Override 70 | public String call(byte[] v1) throws Exception { 71 | return new String(v1); 72 | } 73 | }).flatMap(new FlatMapFunction() { 74 | @Override 75 | public Iterator call(String x) { 76 | return Lists.newArrayList(SPACE.split(x)).iterator(); 77 | } 78 | }); 79 | JavaPairDStream wordCounts = words.mapToPair( 80 | new PairFunction() { 81 | @Override 82 | public Tuple2 call(String s) { 83 | return new Tuple2(s, 1); 84 | } 85 | }).reduceByKey(new Function2() { 86 | @Override 87 | public Integer call(Integer i1, Integer i2) { 88 | return i1 + i2; 89 | } 90 | }); 91 | 92 | wordCounts.print(); 93 | jssc.start(); 94 | jssc.awaitTermination(); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/storm/StormKafkaSample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 
Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.storm; 19 | 20 | import org.apache.storm.Config; 21 | import org.apache.storm.LocalCluster; 22 | import org.apache.storm.StormSubmitter; 23 | import org.apache.storm.generated.AlreadyAliveException; 24 | import org.apache.storm.generated.AuthorizationException; 25 | import org.apache.storm.generated.InvalidTopologyException; 26 | import org.apache.storm.hdfs.bolt.HdfsBolt; 27 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat; 28 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat; 29 | import org.apache.storm.hdfs.bolt.format.FileNameFormat; 30 | import org.apache.storm.hdfs.bolt.format.RecordFormat; 31 | import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy; 32 | import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy; 33 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy; 34 | import org.apache.storm.hdfs.bolt.sync.SyncPolicy; 35 | import org.apache.storm.kafka.KafkaSpout; 36 | import org.apache.storm.kafka.SpoutConfig; 37 | import org.apache.storm.kafka.StringScheme; 38 | import org.apache.storm.kafka.ZkHosts; 39 | import org.apache.storm.spout.SchemeAsMultiScheme; 40 | import org.apache.storm.topology.TopologyBuilder; 41 | 42 | import java.util.ArrayList; 43 | import java.util.List; 44 | 45 | public class StormKafkaSample { 46 | public static void main(String[] args) throws AuthorizationException { 47 | String topic = args[0] ; 48 | String zk = args[1]; 49 | String hdfsUrl = args[2]; 50 | ZkHosts zkHosts = new ZkHosts(zk + ":2181/kafka-1.0.0"); 51 | SpoutConfig spoutConfig = new SpoutConfig(zkHosts, topic, "/kafka-1.0.0", "MyTrack") ; 52 | List zkServers = new ArrayList() ; 53 | zkServers.add(zk); 54 | spoutConfig.zkServers = zkServers; 55 | spoutConfig.zkPort = 2181; 56 | spoutConfig.socketTimeoutMs = 60 * 1000 ; 57 | spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme()) ; 58 | 59 | // use "|" instead of "," for field delimiter 60 | RecordFormat format = new DelimitedRecordFormat() 61 | .withFieldDelimiter("|"); 62 | 63 | // sync the filesystem after every 1k tuples 64 | SyncPolicy syncPolicy = new CountSyncPolicy(1000); 65 | 66 | // rotate files when they reach 5MB 67 | FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, FileSizeRotationPolicy.Units.MB); 68 | 69 | FileNameFormat fileNameFormat = new DefaultFileNameFormat().withPath("/foo/"); 70 | 71 | HdfsBolt bolt = new HdfsBolt() 72 | .withFsUrl(hdfsUrl) 73 | .withFileNameFormat(fileNameFormat) 74 | .withRecordFormat(format) 75 | .withRotationPolicy(rotationPolicy) 76 | .withSyncPolicy(syncPolicy); 77 | 78 | TopologyBuilder builder = new TopologyBuilder() ; 79 | builder.setSpout("spout", 
new KafkaSpout(spoutConfig) ,2) ; 80 | builder.setBolt("bolt", bolt, 1).shuffleGrouping("spout") ; 81 | 82 | Config conf = new Config (); 83 | conf.setDebug(false) ; 84 | 85 | if (args.length > 3) { 86 | try { 87 | StormSubmitter.submitTopology(args[3], conf, builder.createTopology()); 88 | } catch (AlreadyAliveException e) { 89 | e.printStackTrace(); 90 | } catch (InvalidTopologyException e) { 91 | e.printStackTrace(); 92 | } 93 | } else { 94 | LocalCluster localCluster = new LocalCluster(); 95 | localCluster.submitTopology("mytopology", conf, builder.createTopology()); 96 | } 97 | 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/storm/benchmark/AbstractTopology.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.storm.benchmark; 2 | 3 | import com.aliyun.emr.example.storm.benchmark.util.Helper; 4 | import org.apache.commons.lang.StringUtils; 5 | import org.apache.storm.Config; 6 | import org.apache.storm.LocalCluster; 7 | import org.apache.storm.StormSubmitter; 8 | import org.apache.storm.generated.StormTopology; 9 | 10 | import java.io.BufferedInputStream; 11 | import java.io.FileInputStream; 12 | import java.io.InputStream; 13 | import java.io.Serializable; 14 | import java.util.HashMap; 15 | import java.util.Map; 16 | import java.util.Properties; 17 | 18 | abstract public class AbstractTopology implements Serializable{ 19 | protected Properties configure; 20 | 21 | public void init(String configFilepath) throws Exception { 22 | init(configFilepath, ""); 23 | } 24 | 25 | public void init(String configFilepath, String properties) throws Exception { 26 | InputStream in = new BufferedInputStream(new FileInputStream(configFilepath)); 27 | configure = new Properties(); 28 | configure.load(in); 29 | 30 | if (! 
StringUtils.isBlank(properties)) { 31 | Map customProperty = new HashMap<>(); 32 | for (String item : properties.split(",")) { 33 | String[] kv = item.split("="); 34 | if (kv.length != 2) { 35 | System.out.println("invalid property[" + item + "], pattern should be k1=v2,k2=v2..."); 36 | continue; 37 | } 38 | customProperty.put(kv[0], kv[1]); 39 | } 40 | configure.putAll(customProperty); 41 | } 42 | 43 | System.out.println("all configure: " + configure); 44 | } 45 | 46 | public void run(boolean cluster) throws Exception { 47 | String name = configure.getProperty("name"); 48 | Config conf = new Config(); 49 | 50 | if (!cluster) { 51 | new LocalCluster().submitTopology("local-" + name, conf, createTopology()); 52 | return; 53 | } 54 | 55 | int slots = Integer.valueOf(configure.getProperty("worker.slot.number")); 56 | int clusterNodes = Integer.valueOf(configure.getProperty("cluster.worker.node.number")); 57 | int workerNumber = slots * clusterNodes; 58 | int clusterNodeMemoryMb = Integer.valueOf(configure.getProperty("cluster.memory.per.node.mb")); 59 | int workerMem = clusterNodeMemoryMb / slots; 60 | conf.setNumWorkers(workerNumber); 61 | if (!Boolean.valueOf(configure.getProperty("ack.open"))) { 62 | conf.setNumAckers(0); 63 | } 64 | 65 | conf.put("worker.heap.memory.mb", workerMem); 66 | conf.put("topology.backpressure.enable", Boolean.valueOf(configure.getProperty("backpressure.enable"))); 67 | StormSubmitter.submitTopologyWithProgressBar(name, conf, createTopology()); 68 | Helper.setupShutdownHook(name); // handle Ctrl-C 69 | 70 | System.out.println("**********metrics will begin in two minute, please start to send source data to warn up**********"); 71 | for (int i = 0; i< 2; i++) { 72 | Thread.sleep(1000 * 60); 73 | System.out.println("..."); 74 | } 75 | System.out.println("********** start metrics **********"); 76 | Helper.collectMetrics(name, 60); 77 | } 78 | 79 | abstract StormTopology createTopology(); 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/storm/benchmark/BasicTopology.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.storm.benchmark; 2 | 3 | import com.google.common.collect.ImmutableMap; 4 | import kafka.api.OffsetRequest; 5 | import org.apache.storm.generated.StormTopology; 6 | import org.apache.storm.kafka.*; 7 | import org.apache.storm.kafka.bolt.KafkaBolt; 8 | import org.apache.storm.kafka.bolt.mapper.TupleToKafkaMapper; 9 | import org.apache.storm.kafka.bolt.selector.DefaultTopicSelector; 10 | import org.apache.storm.topology.*; 11 | import org.apache.storm.topology.base.BaseBasicBolt; 12 | import org.apache.storm.tuple.Fields; 13 | import org.apache.storm.tuple.Tuple; 14 | import org.apache.storm.tuple.Values; 15 | 16 | import java.util.Arrays; 17 | import java.util.Properties; 18 | 19 | public class BasicTopology extends AbstractTopology { 20 | 21 | @Override 22 | StormTopology createTopology() { 23 | TopologyBuilder builder = new TopologyBuilder(); 24 | setSpout(builder); 25 | setBolt(builder); 26 | return builder.createTopology(); 27 | } 28 | 29 | private void setSpout(TopologyBuilder builder) { 30 | String consumerGroup = configure.getProperty("consumer.group"); 31 | SpoutConfig conf = new SpoutConfig(new ZkHosts( 32 | configure.getProperty("zookeeper.address") + ":2181" + configure.getProperty("zookeeper.root")), 33 | configure.getProperty("topic"), configure.getProperty("zookeeper.root"), consumerGroup); 34 
| conf.zkPort = 2181; 35 | conf.zkServers= Arrays.asList(configure.getProperty("zookeeper.address")); 36 | conf.socketTimeoutMs = 60 * 1000; 37 | conf.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme()); 38 | conf.startOffsetTime= OffsetRequest.LatestTime(); 39 | conf.ignoreZkOffsets = true; 40 | KafkaSpout spout = new KafkaSpout(conf); 41 | 42 | int kafkaPartition = Integer.valueOf(configure.getProperty("partition.number")); 43 | builder.setSpout("spout", spout, kafkaPartition); 44 | } 45 | 46 | protected void setBolt(TopologyBuilder builder) { 47 | int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total")); 48 | int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number")); 49 | 50 | 51 | //inter bolt 52 | //builder.setBolt("inter-bolt", getInterBolt(), availableCores).localOrShuffleGrouping("spout"); 53 | 54 | //kafka storm-bolt 55 | builder.setBolt("kafka-bolt", getKafkaBolt(), availableCores).localOrShuffleGrouping("spout"); 56 | } 57 | 58 | private IBasicBolt getInterBolt() { 59 | return new BaseBasicBolt() { 60 | @Override 61 | public void execute(Tuple input, BasicOutputCollector collector) { 62 | collector.emit(new Values(input)); 63 | } 64 | 65 | @Override 66 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 67 | declarer.declare(new Fields("inter-bolt")); 68 | } 69 | }; 70 | } 71 | 72 | private IRichBolt getKafkaBolt() { 73 | Properties properties = new Properties(); 74 | properties.put("bootstrap.servers", configure.getProperty("result.broker.list")); 75 | properties.put("acks", "0"); 76 | properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 77 | properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 78 | // consume too much memory 79 | //properties.put("batch.size", "10485760"); 80 | //properties.put("max.request", "10485760"); 81 | //properties.put("send.buffer.bytes", "1000000"); 82 | KafkaBolt bolt = new KafkaBolt() 83 | .withProducerProperties(properties) 84 | .withTopicSelector(new DefaultTopicSelector(configure.getProperty("result.topic"))) 85 | .withTupleToKafkaMapper(new TupleToKafkaMapper() { 86 | @Override 87 | public String getKeyFromTuple(Tuple tuple) { 88 | return null; 89 | } 90 | 91 | @Override 92 | public String getMessageFromTuple(Tuple tuple) { 93 | 94 | ImmutableMap kv = (ImmutableMap)tuple.getValue(0); 95 | return kv.keySet().iterator().next() + "," + System.currentTimeMillis(); 96 | 97 | } 98 | }); 99 | bolt.setFireAndForget(true); 100 | bolt.setAsync(true); 101 | return bolt; 102 | } 103 | 104 | public static void main(String[] args) throws Exception { 105 | BasicTopology basicTopology = new BasicTopology(); 106 | if (args.length > 1) { 107 | if (!"--property".equals(args[1])) { 108 | System.out.println("unknow option: " + args[1]); 109 | System.out.println("usage storm jar examples-1.1-shaded.jar com.aliyun.emr.example.storm.benchmark.BasicTopology benchmark.properties --property k1=v1,k2=v2"); 110 | System.exit(1); 111 | } 112 | basicTopology.init(args[0], args[2]); 113 | } else { 114 | basicTopology.init(args[0]); 115 | } 116 | 117 | basicTopology.run(true); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/storm/benchmark/KafkaHdfs.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.storm.benchmark; 2 | 3 | import 
org.apache.storm.hdfs.bolt.HdfsBolt; 4 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat; 5 | import org.apache.storm.hdfs.bolt.format.RecordFormat; 6 | import org.apache.storm.hdfs.bolt.rotation.NoRotationPolicy; 7 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy; 8 | import org.apache.storm.topology.IRichBolt; 9 | import org.apache.storm.topology.TopologyBuilder; 10 | import org.apache.storm.tuple.Tuple; 11 | 12 | import java.util.Map; 13 | 14 | public class KafkaHdfs extends BasicTopology{ 15 | 16 | protected void setBolt(TopologyBuilder builder) { 17 | int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total")); 18 | int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number")); 19 | 20 | builder.setBolt("hdfs-bolt", getHdfsBolt(), availableCores).localOrShuffleGrouping("spout"); 21 | } 22 | 23 | private IRichBolt getHdfsBolt() { 24 | 25 | String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/"; 26 | HdfsBolt bolt = new HdfsBolt() 27 | .withFsUrl(configure.getProperty("url")) 28 | .withFileNameFormat(new DefaultFileNameFormat().withPrefix(filenamePrefix)) 29 | .withRecordFormat(new RecordFormat() { 30 | @Override 31 | public byte[] format(Tuple tuple) { 32 | String eventTime = ((Map)tuple.getValue(0)).keySet().iterator().next(); 33 | String output = eventTime + "," + System.currentTimeMillis() + System.lineSeparator(); 34 | return output.getBytes(); 35 | } 36 | }) 37 | .withSyncPolicy(new CountSyncPolicy(1000)) 38 | .withRotationPolicy(new NoRotationPolicy()); 39 | return bolt; 40 | } 41 | 42 | public static void main(String[] args) throws Exception { 43 | KafkaHdfs topology = new KafkaHdfs(); 44 | if (args.length > 1) { 45 | if (!"--property".equals(args[1])) { 46 | System.out.println("unknow option: " + args[1]); 47 | System.out.println("usage storm jar examples-1.1-shaded.jar com.aliyun.emr.example.storm.benchmark.KafkaHdfs benchmark.properties --property k1=v1,k2=v2"); 48 | System.exit(1); 49 | } 50 | topology.init(args[0], args[2]); 51 | } else { 52 | topology.init(args[0]); 53 | } 54 | 55 | topology.run(true); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/storm/benchmark/TridentWordCount.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.storm.benchmark; 2 | 3 | import kafka.api.OffsetRequest; 4 | import org.apache.storm.generated.StormTopology; 5 | import org.apache.storm.hdfs.trident.HdfsState; 6 | import org.apache.storm.hdfs.trident.HdfsStateFactory; 7 | import org.apache.storm.hdfs.trident.HdfsUpdater; 8 | import org.apache.storm.hdfs.trident.format.DefaultFileNameFormat; 9 | import org.apache.storm.hdfs.trident.format.DelimitedRecordFormat; 10 | import org.apache.storm.hdfs.trident.rotation.NoRotationPolicy; 11 | import org.apache.storm.kafka.KeyValueSchemeAsMultiScheme; 12 | import org.apache.storm.kafka.StringKeyValueScheme; 13 | import org.apache.storm.kafka.ZkHosts; 14 | import org.apache.storm.kafka.trident.TransactionalTridentKafkaSpout; 15 | import org.apache.storm.kafka.trident.TridentKafkaConfig; 16 | import org.apache.storm.trident.TridentTopology; 17 | import org.apache.storm.trident.operation.BaseFunction; 18 | import org.apache.storm.trident.operation.TridentCollector; 19 | import org.apache.storm.trident.state.StateFactory; 20 | import 
org.apache.storm.trident.tuple.TridentTuple; 21 | import org.apache.storm.tuple.Fields; 22 | import org.apache.storm.tuple.Values; 23 | 24 | import java.util.HashMap; 25 | import java.util.Map; 26 | 27 | public class TridentWordCount extends AbstractTopology { 28 | 29 | @Override 30 | StormTopology createTopology() { 31 | int partition = Integer.valueOf(configure.getProperty("partition.number")); 32 | 33 | TridentTopology topology = new TridentTopology(); 34 | TransactionalTridentKafkaSpout spout = createSpout(); 35 | 36 | topology.newStream("kafka-spout", spout).name("kafka").parallelismHint(partition) 37 | .each(spout.getOutputFields(), new WordCount(), new Fields("eventTime", "finishTime")).name("word-count") 38 | .partitionPersist(createHdfsState("eventTime", "finishTime"), new Fields("eventTime", "finishTime"), new HdfsUpdater(), new Fields("eventTime", "finishTime")); 39 | return topology.build(); 40 | } 41 | 42 | private TransactionalTridentKafkaSpout createSpout() { 43 | String consumerGroup = configure.getProperty("consumer.group"); 44 | ZkHosts zkHost = new ZkHosts(configure.getProperty("zookeeper.address") + ":2181" + configure.getProperty("zookeeper.root")); 45 | TridentKafkaConfig config = new TridentKafkaConfig(zkHost, configure.getProperty("topic"), consumerGroup); 46 | config.socketTimeoutMs = 60 * 1000; 47 | config.ignoreZkOffsets=true; 48 | config.startOffsetTime= OffsetRequest.LatestTime(); 49 | config.scheme = new KeyValueSchemeAsMultiScheme(new StringKeyValueScheme()); 50 | config.startOffsetTime = OffsetRequest.LatestTime(); 51 | return new TransactionalTridentKafkaSpout(config); 52 | } 53 | 54 | private StateFactory createHdfsState(String... fileds) { 55 | String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/"; 56 | 57 | HdfsState.Options options = new HdfsState.HdfsFileOptions() 58 | .withFsUrl(configure.getProperty("url")) 59 | .withFileNameFormat(new DefaultFileNameFormat().withPath(filenamePrefix)) 60 | .withRecordFormat(new DelimitedRecordFormat().withFields(new Fields(fileds))) 61 | .withRotationPolicy(new NoRotationPolicy()); 62 | return new HdfsStateFactory().withOptions(options); 63 | } 64 | 65 | private class WordCount extends BaseFunction { 66 | private HashMap count = new HashMap<>(); 67 | @Override 68 | public void execute(TridentTuple tuple, TridentCollector collector) { 69 | // for test 70 | Map kv = (Map)tuple.get(0); 71 | for (Map.Entry item: kv.entrySet()) { 72 | String eventTime = item.getKey(); 73 | String words = item.getValue(); 74 | for (String word: words.split("\\s+")) { 75 | Integer number = count.get(word); 76 | if (number == null) { 77 | number = 0; 78 | } 79 | number++; 80 | count.put(word, number); 81 | 82 | } 83 | collector.emit(new Values(eventTime, System.currentTimeMillis())); 84 | } 85 | 86 | } 87 | } 88 | 89 | public static void main(String[] args) throws Exception { 90 | TridentWordCount wordCount = new TridentWordCount(); 91 | if (args.length > 1) { 92 | if (!"--property".equals(args[1])) { 93 | System.out.println("unknow option: " + args[1]); 94 | System.out.println("usage storm jar examples-1.1-shaded.jar com.aliyun.emr.example.storm.benchmark.TridentWordCount benchmark.properties --property k1=v1,k2=v2"); 95 | System.exit(1); 96 | } 97 | wordCount.init(args[0], args[2]); 98 | } else { 99 | wordCount.init(args[0]); 100 | } 101 | wordCount.run(true); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- 
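Note: the benchmark topologies in this package (BasicTopology, KafkaHdfs, TridentWordCount, and the word-count variants that follow) all write records of the form "eventTime,finishTime" to Kafka or HDFS, where eventTime is taken from the Kafka message key and finishTime is System.currentTimeMillis() at the bolt. As a minimal sketch, and assuming both fields are epoch-millisecond timestamps, a hypothetical helper (not part of this repository) that turns such records into an average end-to-end latency could look like this:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

// Hypothetical post-processing helper, not part of this repository.
// Reads "eventTime,finishTime" CSV lines from standard input and prints
// the number of records and their average end-to-end latency.
public class LatencyReport {
    public static void main(String[] args) throws Exception {
        long totalLatencyMs = 0L;
        long records = 0L;
        try (BufferedReader reader =
                 new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.trim().split(",");
                if (fields.length != 2) {
                    continue; // skip malformed or partially written lines
                }
                try {
                    // finishTime - eventTime, both assumed to be epoch milliseconds
                    totalLatencyMs += Long.parseLong(fields[1].trim()) - Long.parseLong(fields[0].trim());
                    records++;
                } catch (NumberFormatException e) {
                    // skip lines whose fields are not numeric timestamps
                }
            }
        }
        if (records > 0) {
            System.out.println("records=" + records
                + ", avg end-to-end latency ms=" + (totalLatencyMs / records));
        } else {
            System.out.println("no records read");
        }
    }
}

The HDFS output of the bolts could be piped into it with something like "hdfs dfs -cat /your/output/dir/* | java LatencyReport", where the actual directory is built from the filename.prefix and name properties in benchmark.properties.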
/src/main/java/com/aliyun/emr/example/storm/benchmark/WindowedWordCount.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.storm.benchmark; 2 | 3 | import org.apache.storm.hdfs.bolt.HdfsBolt; 4 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat; 5 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat; 6 | import org.apache.storm.hdfs.bolt.rotation.NoRotationPolicy; 7 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy; 8 | import org.apache.storm.task.OutputCollector; 9 | import org.apache.storm.task.TopologyContext; 10 | import org.apache.storm.topology.OutputFieldsDeclarer; 11 | import org.apache.storm.topology.TopologyBuilder; 12 | import org.apache.storm.topology.base.BaseWindowedBolt; 13 | import org.apache.storm.tuple.Fields; 14 | import org.apache.storm.tuple.Tuple; 15 | import org.apache.storm.tuple.Values; 16 | import org.apache.storm.windowing.TupleWindow; 17 | import org.apache.storm.topology.base.BaseWindowedBolt.Count; 18 | 19 | import java.util.HashMap; 20 | import java.util.Map; 21 | 22 | public class WindowedWordCount extends BasicTopology { 23 | @Override 24 | protected void setBolt(TopologyBuilder builder) { 25 | int windowLength = Integer.valueOf(configure.getProperty("window.length")); 26 | int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total")); 27 | int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number")); 28 | int parallelism = availableCores / 2; 29 | 30 | int slidingInterval = Integer.valueOf(configure.getProperty("slide.interval")); 31 | 32 | builder.setBolt("count", new SplitCount().withWindow(new Count(windowLength), new Count(slidingInterval)), parallelism).localOrShuffleGrouping("spout"); 33 | 34 | String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/"; 35 | HdfsBolt bolt = new HdfsBolt() 36 | .withFsUrl(configure.getProperty("url")) 37 | .withFileNameFormat(new DefaultFileNameFormat().withPrefix(filenamePrefix)) 38 | .withRecordFormat(new DelimitedRecordFormat().withFieldDelimiter(",")) 39 | .withSyncPolicy(new CountSyncPolicy(1000)) 40 | .withRotationPolicy(new NoRotationPolicy()); 41 | builder.setBolt("hdfs-bolt", bolt, parallelism).localOrShuffleGrouping("count"); 42 | } 43 | 44 | private class SplitCount extends BaseWindowedBolt { 45 | private OutputCollector collector; 46 | private Map counter = new HashMap<>(); 47 | 48 | @Override 49 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { 50 | super.prepare(stormConf, context, collector); 51 | this.collector = collector; 52 | } 53 | 54 | @Override 55 | public void execute(TupleWindow inputWindow) { 56 | for ( Tuple tuple : inputWindow.get()) { 57 | Map value = (Map)tuple.getValue(0); 58 | for (Map.Entry item : value.entrySet()) { 59 | String eventTime = item.getKey(); 60 | String words = item.getValue(); 61 | for (String word: words.split("\\s+")) { 62 | Integer number = counter.get(word); 63 | if (number == null) { 64 | number = 0; 65 | } 66 | number++; 67 | counter.put(word, number); 68 | } 69 | collector.emit(new Values(eventTime, System.currentTimeMillis())); 70 | } 71 | } 72 | 73 | } 74 | 75 | @Override 76 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 77 | declarer.declare(new Fields("eventTime", "finishTime")); 78 | } 79 | } 80 | 81 | public static void main(String[] args) throws Exception { 82 | WindowedWordCount wordCount = new 
WindowedWordCount(); 83 | if (args.length > 1) { 84 | if (!"--property".equals(args[1])) { 85 | System.out.println("unknow option: " + args[1]); 86 | System.out.println("usage storm jar examples-1.1-shaded.jar com.aliyun.emr.example.storm.benchmark.WindowedWordCount benchmark.properties --property k1=v1,k2=v2"); 87 | System.exit(1); 88 | } 89 | wordCount.init(args[0], args[2]); 90 | } else { 91 | wordCount.init(args[0]); 92 | } 93 | wordCount.run(true); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/storm/benchmark/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.storm.benchmark; 2 | 3 | import org.apache.storm.hdfs.bolt.HdfsBolt; 4 | import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat; 5 | import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat; 6 | import org.apache.storm.hdfs.bolt.rotation.NoRotationPolicy; 7 | import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy; 8 | import org.apache.storm.topology.BasicOutputCollector; 9 | import org.apache.storm.topology.OutputFieldsDeclarer; 10 | import org.apache.storm.topology.TopologyBuilder; 11 | import org.apache.storm.topology.base.BaseBasicBolt; 12 | import org.apache.storm.tuple.Fields; 13 | import org.apache.storm.tuple.Tuple; 14 | import org.apache.storm.tuple.Values; 15 | 16 | import java.util.HashMap; 17 | import java.util.Map; 18 | 19 | public class WordCount extends BasicTopology { 20 | @Override 21 | protected void setBolt(TopologyBuilder builder) { 22 | int clusterCores = Integer.valueOf(configure.getProperty("cluster.cores.total")); 23 | int availableCores = clusterCores - Integer.valueOf(configure.getProperty("partition.number")); 24 | 25 | int hdfsParallelismFactor = Integer.parseInt(configure.getProperty("hdfs.parallelism.factor")); 26 | int hdfsParallelism = availableCores * hdfsParallelismFactor / (hdfsParallelismFactor + 1); 27 | builder.setBolt("split-count", new SplitCount(), availableCores - hdfsParallelism).localOrShuffleGrouping("spout"); 28 | 29 | String filenamePrefix = configure.getProperty("filename.prefix") + configure.getProperty("name") + "/"; 30 | HdfsBolt bolt = new HdfsBolt() 31 | .withFsUrl(configure.getProperty("url")) 32 | .withFileNameFormat(new DefaultFileNameFormat().withPrefix(filenamePrefix)) 33 | .withRecordFormat(new DelimitedRecordFormat().withFieldDelimiter(",")) 34 | .withSyncPolicy(new CountSyncPolicy(1000)) 35 | .withRotationPolicy(new NoRotationPolicy()); 36 | builder.setBolt("hdfs-bolt", bolt, hdfsParallelism).localOrShuffleGrouping("split-count"); 37 | } 38 | 39 | 40 | private class SplitCount extends BaseBasicBolt { 41 | private Map counter = new HashMap<>(); 42 | 43 | @Override 44 | public void execute(Tuple input, BasicOutputCollector collector) { 45 | Map value = (Map)input.getValue(0); 46 | for (Map.Entryitem : value.entrySet()) { 47 | String eventTime = item.getKey(); 48 | String words = item.getValue(); 49 | 50 | for (String word : words.split("\\s+")) { 51 | Integer number = counter.get(word); 52 | if (number == null) { 53 | number = 0; 54 | } 55 | number++; 56 | counter.put(word, number); 57 | } 58 | collector.emit(new Values(eventTime, System.currentTimeMillis())); 59 | } 60 | 61 | } 62 | 63 | @Override 64 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 65 | declarer.declare(new Fields("eventTime", "finishTime")); 66 | } 67 | } 68 | 69 | public static void main(String[] 
args) throws Exception { 70 | WordCount wordCount = new WordCount(); 71 | if (args.length > 1) { 72 | if (!"--property".equals(args[1])) { 73 | System.out.println("unknow option: " + args[1]); 74 | System.out.println("usage storm jar examples-1.1-shaded.jar com.aliyun.emr.example.storm.benchmark.WordCount benchmark.properties --property k1=v1,k2=v2"); 75 | System.exit(1); 76 | } 77 | wordCount.init(args[0], args[2]); 78 | } else { 79 | wordCount.init(args[0]); 80 | } 81 | wordCount.run(true); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/aliyun/emr/example/storm/benchmark/util/Helper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License 17 | */ 18 | 19 | package com.aliyun.emr.example.storm.benchmark.util; 20 | 21 | import org.apache.storm.Config; 22 | import org.apache.storm.LocalCluster; 23 | import org.apache.storm.StormSubmitter; 24 | import org.apache.storm.generated.KillOptions; 25 | import org.apache.storm.generated.Nimbus; 26 | import org.apache.storm.generated.StormTopology; 27 | import org.apache.storm.perf.utils.BasicMetricsCollector; 28 | import org.apache.storm.utils.NimbusClient; 29 | import org.apache.storm.utils.Utils; 30 | 31 | import java.util.Map; 32 | 33 | public class Helper { 34 | 35 | public static void kill(Nimbus.Client client, String topoName) throws Exception { 36 | KillOptions opts = new KillOptions(); 37 | opts.set_wait_secs(0); 38 | client.killTopologyWithOpts(topoName, opts); 39 | } 40 | 41 | public static void killAndShutdownCluster(LocalCluster cluster, String topoName) throws Exception { 42 | KillOptions opts = new KillOptions(); 43 | opts.set_wait_secs(0); 44 | cluster.killTopologyWithOpts(topoName, opts); 45 | cluster.shutdown(); 46 | } 47 | 48 | 49 | public static LocalCluster runOnLocalCluster(String topoName, StormTopology topology) { 50 | LocalCluster cluster = new LocalCluster(); 51 | cluster.submitTopology(topoName, new Config(), topology); 52 | return cluster; 53 | } 54 | 55 | public static int getInt(Map map, Object key, int def) { 56 | return Utils.getInt(Utils.get(map, key, def)); 57 | } 58 | 59 | public static String getStr(Map map, Object key) { 60 | return (String) map.get(key); 61 | } 62 | 63 | public static void collectMetrics(String topologyName, Integer pollInterval) throws Exception { 64 | Map clusterConf = Utils.readStormConfig(); 65 | Nimbus.Client client = NimbusClient.getConfiguredClient(clusterConf).getClient(); 66 | BasicMetricsCollector metricsCollector = new BasicMetricsCollector(client, topologyName, clusterConf); 67 | 68 | try { 69 | while (true){ 70 | metricsCollector.collect(client); 71 
| Thread.sleep(pollInterval * 1000); 72 | } 73 | } finally { 74 | metricsCollector.close(); 75 | kill(client, topologyName); 76 | } 77 | 78 | } 79 | 80 | public static void collectMetricsAndKill(String topologyName, Integer pollInterval, Integer duration) throws Exception { 81 | Map clusterConf = Utils.readStormConfig(); 82 | Nimbus.Client client = NimbusClient.getConfiguredClient(clusterConf).getClient(); 83 | BasicMetricsCollector metricsCollector = new BasicMetricsCollector(client, topologyName, clusterConf); 84 | 85 | int times = duration / pollInterval; 86 | metricsCollector.collect(client); 87 | for (int i = 0; i < times; i++) { 88 | Thread.sleep(pollInterval * 1000); 89 | metricsCollector.collect(client); 90 | } 91 | metricsCollector.close(); 92 | kill(client, topologyName); 93 | } 94 | 95 | public static void collectLocalMetricsAndKill(LocalCluster localCluster, String topologyName, Integer pollInterval, Integer duration, Map clusterConf) throws Exception { 96 | BasicMetricsCollector metricsCollector = new BasicMetricsCollector(localCluster, topologyName, clusterConf); 97 | 98 | int times = duration / pollInterval; 99 | metricsCollector.collect(localCluster); 100 | for (int i = 0; i < times; i++) { 101 | Thread.sleep(pollInterval * 1000); 102 | metricsCollector.collect(localCluster); 103 | } 104 | metricsCollector.close(); 105 | killAndShutdownCluster(localCluster, topologyName); 106 | } 107 | 108 | /** Kill topo and Shutdown local cluster on Ctrl-C */ 109 | public static void setupShutdownHook(final LocalCluster cluster, final String topoName) { 110 | Runtime.getRuntime().addShutdownHook(new Thread() { 111 | public void run() { 112 | cluster.killTopology(topoName); 113 | System.out.println("Killed Topology"); 114 | cluster.shutdown(); 115 | } 116 | }); 117 | } 118 | 119 | /** Kill topo on Ctrl-C */ 120 | public static void setupShutdownHook(final String topoName) { 121 | Map clusterConf = Utils.readStormConfig(); 122 | final Nimbus.Client client = NimbusClient.getConfiguredClient(clusterConf).getClient(); 123 | Runtime.getRuntime().addShutdownHook(new Thread() { 124 | public void run() { 125 | try { 126 | Helper.kill(client, topoName); 127 | System.out.println("Killed Topology"); 128 | } catch (Exception e) { 129 | e.printStackTrace(); 130 | } 131 | } 132 | }); 133 | } 134 | 135 | public static void runOnClusterAndPrintMetrics(Integer durationSec, String topoName, Map topoConf, StormTopology topology) throws Exception { 136 | // submit topology 137 | StormSubmitter.submitTopologyWithProgressBar(topoName, topoConf, topology); 138 | setupShutdownHook(topoName); // handle Ctrl-C 139 | 140 | // poll metrics every minute, then kill topology after specified duration 141 | Integer pollIntervalSec = 60; 142 | collectMetricsAndKill(topoName, pollIntervalSec, durationSec); 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/main/pig/sample.pig: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | -- Query Phrase Popularity (Hadoop cluster)
20 | 
21 | -- This script processes a search query log file from the Excite search engine and finds search phrases that occur with particularly high frequency during certain times of the day.
22 | 
23 | 
24 | -- Register the tutorial JAR file so that the included UDFs can be called in the script.
25 | REGISTER $tutorial;
26 | 
27 | -- Use the PigStorage function to load the excite log file into the 'raw' bag as an array of records.
28 | -- Input: (user,time,query)
29 | raw = LOAD '$input' USING PigStorage('\t') AS (user, time, query);
30 | 
31 | 
32 | -- Call the NonURLDetector UDF to remove records if the query field is empty or a URL.
33 | clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
34 | 
35 | -- Call the ToLower UDF to change the query field to lowercase.
36 | clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
37 | 
38 | -- Because the log file only contains queries for a single day, we are only interested in the hour.
39 | -- The excite query log timestamp format is YYMMDDHHMMSS.
40 | -- Call the ExtractHour UDF to extract the hour (HH) from the time field.
41 | houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
42 | 
43 | -- Call the NGramGenerator UDF to compose the n-grams of the query.
44 | ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
45 | 
46 | -- Use the DISTINCT command to get the unique n-grams for all records.
47 | ngramed2 = DISTINCT ngramed1;
48 | 
49 | -- Use the GROUP command to group records by n-gram and hour.
50 | hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
51 | 
52 | -- Use the COUNT function to get the count (occurrences) of each n-gram.
53 | hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
54 | 
55 | -- Use the GROUP command to group records by n-gram only.
56 | -- Each group now corresponds to a distinct n-gram and has the count for each hour.
57 | uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
58 | 
59 | -- For each group, identify the hour in which this n-gram is used with a particularly high frequency.
60 | -- Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram.
61 | uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1));
62 | 
63 | -- Use the FOREACH-GENERATE command to assign names to the fields.
64 | uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean;
65 | 
66 | -- Use the FILTER command to remove all records with a score less than or equal to 2.0.
67 | filtered_uniq_frequency = FILTER uniq_frequency3 BY score > 2.0;
68 | 
69 | -- Use the ORDER command to sort the remaining records by hour and score.
70 | ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score;
71 | 
72 | -- Use the PigStorage function to store the results.
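-- For reference: the script is driven by three parameters, $tutorial (the UDF jar registered above),
-- $input (the Excite query log), and $results (the output location). A typical invocation, with
-- illustrative placeholder paths rather than paths documented in this repository, would be:
--
--   pig -param tutorial=lib/tutorial.jar -param input=/data/excite-small.log -param results=/output/query_phrase_popularity sample.pig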
73 | -- Output: (hour, n-gram, score, count, average_counts_among_all_hours) 74 | STORE ordered_uniq_frequency INTO '$results' USING PigStorage(); -------------------------------------------------------------------------------- /src/main/python/deeplearning/tf_fm_on_spark.py: -------------------------------------------------------------------------------- 1 | from pylearning.model.tensorflow_base import tensorflow_base 2 | from pyspark.sql import SparkSession 3 | from pyspark import SparkContext 4 | import os 5 | import random 6 | import numpy as np 7 | 8 | from pyspark.sql.functions import col 9 | import tensorflow as tf 10 | 11 | class tf_fm(tensorflow_base): 12 | 13 | @staticmethod 14 | def pre_train(env): 15 | spark_context = SparkContext.getOrCreate() 16 | spark = SparkSession(spark_context).builder.getOrCreate() 17 | rating_df = spark.read.format('csv').option('header', 'True').load('/moviedata/ratings.csv') 18 | movie_df = spark.read.format('csv').option('header', 'True').load('/moviedata/movies.csv') 19 | 20 | # process user first 21 | distinct_user_df = rating_df.select('userId').distinct() 22 | users_number = distinct_user_df.count() 23 | env.get("algo")["users_number"] = str(users_number) 24 | 25 | users_row = distinct_user_df.collect() 26 | users = [] 27 | users_dict = [] 28 | users_map = {} 29 | for user in users_row: 30 | users.append(user['userId']) 31 | sorted_users = sorted(users) 32 | for user in sorted_users: 33 | users_dict.append((user,len(users_dict))) 34 | users_map[user] = len(users_map) 35 | 36 | # It is use for later process, to get the sorted user id. 37 | columns = ["userid","id"] 38 | users_sort_df = spark.createDataFrame(users_dict,columns) 39 | # users_sort_df.write.format("csv").save("/moviedata/sortedusers") 40 | 41 | # process genres 42 | geners_row = movie_df.select("genres").distinct().collect() 43 | genres_set = set() 44 | genres_map = {} 45 | for genres in geners_row: 46 | for one_genre in genres['genres'].split('|'): 47 | genres_set.add(one_genre) 48 | for genre in genres_set: 49 | genres_map[genre] = len(genres_map) 50 | 51 | # join two dataframe and process later, userid(bigint) genres(string, need split), rating(float) 52 | joined_df = rating_df.join(movie_df, rating_df.movieId == movie_df.movieId) 53 | joined_df = joined_df.select(col('userId'),col('genres'),col('rating').cast('float').alias('rating')) 54 | 55 | users_map_bc = spark_context.broadcast(users_map) 56 | genres_map_bc = spark_context.broadcast(genres_map) 57 | env.get("algo")["genres_number"] = str(len(genres_map)) 58 | 59 | def process_row(row): 60 | userId = row.userId 61 | genres = row.genres 62 | users_map_rdd = users_map_bc.value 63 | genres_map_rdd = genres_map_bc.value 64 | genres_return_list = [] 65 | for i in genres.split("|"): 66 | genres_return_list.append(str(genres_map_rdd[i])) 67 | return (users_map_rdd[userId], "|".join(genres_return_list), row.rating) 68 | 69 | return joined_df.rdd.map(process_row).toDF(['userId','genres','rating']) 70 | 71 | @staticmethod 72 | def train(dataframe, env): 73 | environ = os.environ 74 | ps_hosts = environ.get("ps_hosts").split(",") 75 | worker_hosts = environ.get("worker_hosts").split(",") 76 | job_name = environ.get("job_name") 77 | task_index = int(environ.get("task_index")) 78 | 79 | cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) 80 | server = tf.train.Server(cluster, 81 | job_name= job_name, 82 | task_index=task_index) 83 | 84 | if job_name == "ps": 85 | server.join() 86 | else : 87 | # batch size is 2000, 
parameter size including embedding for user and one hot for genres 88 | # embedding size is 128, one hot size is 20(we can obtain it from env) 89 | batch_size = 2000 90 | 91 | embedding_size = 128 92 | genres_size = int(env.get("algo")["genres_number"]) 93 | users_size = int(env.get("algo")["users_number"]) 94 | p_size = embedding_size + genres_size 95 | k = 10 96 | embeddings = tf.Variable(tf.random_uniform([users_size,embedding_size], -1.0, 1.0)) 97 | USER = tf.placeholder('int64',shape=[batch_size,1]) 98 | ITEM = tf.placeholder('float', shape=[batch_size, genres_size]) 99 | embed = tf.nn.embedding_lookup(embeddings, USER) 100 | user_embed = tf.reshape(embed, shape=[batch_size, embedding_size]) 101 | X = tf.concat([user_embed, ITEM], 1) 102 | Y = tf.placeholder('float', shape=[batch_size,1]) 103 | 104 | w0 = tf.Variable(tf.zeros([1])) 105 | W = tf.Variable(tf.zeros([p_size])) 106 | 107 | V = tf.Variable(tf.random_normal([k, p_size], stddev=0.01)) 108 | y_hat = tf.Variable(tf.zeros([batch_size, 1])) 109 | 110 | linear_terms = tf.add(w0, tf.reduce_sum(tf.multiply(W, X), 1, keep_dims=True)) 111 | interactions = (tf.multiply(0.5, tf.reduce_sum( 112 | tf.subtract(tf.pow(tf.matmul(X, tf.transpose(V)), 2), 113 | tf.matmul(tf.pow(X, 2), tf.transpose(tf.pow(V, 2)))), 1, 114 | keep_dims=True))) 115 | 116 | y_hat = tf.add(linear_terms, interactions) 117 | lambda_w = tf.constant(0.001, name='lambda_w') 118 | lambda_v = tf.constant(0.001, name='lambda_v') 119 | 120 | l2_norm = (tf.reduce_sum( 121 | tf.add( 122 | tf.multiply(lambda_w, tf.pow(W, 2)), 123 | tf.multiply(lambda_v, tf.pow(V, 2))))) 124 | 125 | error = tf.reduce_mean(tf.square(tf.subtract(Y, y_hat))) 126 | 127 | loss = tf.add(error, l2_norm) 128 | 129 | N_EPOCHS = 100 130 | eta = tf.constant(0.1) 131 | global_step = tf.contrib.framework.get_or_create_global_step() 132 | optimizer = tf.train.AdagradOptimizer(eta).minimize(loss, global_step=global_step) 133 | 134 | init = tf.global_variables_initializer() 135 | 136 | def get_train_data(): 137 | users_sub, genres_sub, rating_sub = \ 138 | zip(*random.sample(list(zip(dataframe.userId, dataframe.genres, dataframe.rating)), batch_size)) 139 | batch_user = np.zeros(shape=(batch_size,1), dtype=np.int64) 140 | batch_genre = np.zeros(shape=(batch_size,genres_size), dtype=np.float32) 141 | label = np.ndarray(shape=(batch_size,1), dtype = np.float32) 142 | for i in range(batch_size): 143 | batch_user[i] = users_sub[i] 144 | for genre in genres_sub[i].split("|"): 145 | batch_genre[i][int(genre)] = 1 146 | label[i] = rating_sub[i] 147 | return batch_user, batch_genre, label 148 | 149 | checkpoint_dir = "hdfs://emr-header-1:9000/movie" 150 | saver = tf.train.Saver() 151 | epoch = 0 152 | 153 | with tf.train.MonitoredTrainingSession(master = server.target, 154 | is_chief = task_index == 0, 155 | checkpoint_dir= checkpoint_dir, 156 | save_checkpoint_secs=20) as sess: 157 | tf.reset_default_graph() 158 | sess.run(init) 159 | latest_path = tf.train.latest_checkpoint(checkpoint_dir=checkpoint_dir) 160 | saver.restore(sess, latest_path) 161 | while epoch < N_EPOCHS: 162 | (batch_user,batch_genre,label) = get_train_data() 163 | sess.run(optimizer, feed_dict={USER: batch_user, ITEM: batch_genre, Y:label}) 164 | print(sess.run(error, feed_dict={USER: batch_user, ITEM: batch_genre, Y: label})) 165 | epoch = epoch + 1 166 | -------------------------------------------------------------------------------- /src/main/python/deeplearning/train_boston.py: 
-------------------------------------------------------------------------------- 1 | from pylearning.model.tensorflow_base import tensorflow_base 2 | from pyspark.sql import SparkSession 3 | from pyspark import SparkContext 4 | 5 | import tensorflow as tf 6 | from pyspark.sql.functions import col 7 | 8 | class train_boston(tensorflow_base): 9 | @staticmethod 10 | def pre_train(): 11 | spark_context = SparkContext.getOrCreate() 12 | spark = SparkSession(spark_context).builder.getOrCreate() 13 | df = spark.read.format('csv').option("header","True").load('/train.csv') 14 | cast_df = df.select(*(col(c).cast("double").alias(c) for c in df.columns)) 15 | return cast_df 16 | 17 | @staticmethod 18 | def train(dataframe, env): 19 | crim = tf.feature_column.numeric_column('crim', dtype=tf.float64, shape=()) 20 | zn = tf.feature_column.numeric_column('zn', dtype=tf.float64, shape=()) 21 | indus = tf.feature_column.numeric_column('indus', dtype=tf.float64, shape=()) 22 | chas = tf.feature_column.numeric_column('chas', dtype=tf.int64, shape=()) 23 | nox = tf.feature_column.numeric_column('nox', dtype=tf.float64, shape=()) 24 | rm = tf.feature_column.numeric_column('rm', dtype=tf.float64, shape=()) 25 | age = tf.feature_column.numeric_column('age', dtype=tf.float64, shape=()) 26 | dis = tf.feature_column.numeric_column('dis', dtype=tf.float64, shape=()) 27 | rad = tf.feature_column.numeric_column('rad', dtype=tf.int64, shape=()) 28 | tax = tf.feature_column.numeric_column('tax', dtype=tf.int64, shape=()) 29 | ptratio = tf.feature_column.numeric_column('ptratio', dtype=tf.float64, shape=()) 30 | black = tf.feature_column.numeric_column('black', dtype=tf.float64, shape=()) 31 | lstat = tf.feature_column.numeric_column('lstat', dtype=tf.float64, shape=()) 32 | 33 | feature_cols = [crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat] 34 | feature_names = ['ID','crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 35 | 'lstat'] 36 | label_name = 'medv' 37 | 38 | dict = {} 39 | 40 | index = 0 41 | for i in feature_names: 42 | dict[i] = index 43 | index+=1 44 | 45 | def train_input(): 46 | feature_dict = {} 47 | for i in feature_names[1:]: 48 | feature_dict[i] = dataframe.get(i) 49 | 50 | _dataset = tf.data.Dataset.from_tensor_slices((feature_dict, dataframe.get(label_name))) 51 | dataset = _dataset.batch(32) 52 | return dataset 53 | 54 | ps = tf.contrib.distribute.ParameterServerStrategy() 55 | config = tf.estimator.RunConfig(train_distribute=ps, eval_distribute=ps) 56 | estimator = tf.estimator.LinearRegressor(feature_columns=feature_cols, model_dir='hdfs://emr-header-1:9000/boston', config=config) 57 | 58 | train_spec = tf.estimator.TrainSpec(input_fn=train_input, max_steps=100) 59 | eval_spec = tf.estimator.EvalSpec(input_fn=train_input, start_delay_secs=0, throttle_secs=10,steps=10) 60 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) 61 | 62 | -------------------------------------------------------------------------------- /src/main/python/odps-sample.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import sys 19 | 20 | from odps import OdpsOps 21 | from pyspark import SparkContext 22 | 23 | if __name__ == "__main__": 24 | 25 | if len(sys.argv) != 7: 26 | print >> sys.stderr, "Usage: spark-submit odps-sample.py accessKeyId accessKeySecret project table " \ 27 | "partition numPartitions" 28 | exit(-1) 29 | 30 | accessKeyId = sys.argv[1] 31 | accessKeySecret = sys.argv[2] 32 | odpsUrl = "http://odps-ext.aliyun-inc.com/api" 33 | tunnelUrl = "http://dt-ext.odps.aliyun-inc.com" 34 | project = sys.argv[3] 35 | table = sys.argv[4] 36 | partition = sys.argv[5] 37 | numPartitions = sys.argv[6] 38 | 39 | sc = SparkContext(appName="PySpark Odps Sample") 40 | 41 | odpsOps = OdpsOps(sc, accessKeyId, accessKeySecret, odpsUrl, tunnelUrl) 42 | 43 | print "pScheme" 44 | pSchema = odpsOps.getTableSchema(project, table, True) 45 | for col in pSchema: 46 | print col 47 | 48 | print "scheme" 49 | schema = odpsOps.getTableSchema(project, table, False) 50 | for col in schema: 51 | print col 52 | 53 | print "ColumnByIdx" 54 | col1 =odpsOps.getColumnByIdx(project, table, 1) 55 | print col1 56 | 57 | data = sc.parallelize([[1, 1.5, False, "2014-06-11", "row 1"], 58 | [2, 1.5, True, "2014-06-10", "row 2"]], 2) 59 | odpsOps.saveToPartitionTable(project, table, partition, data, isCreatePt=True, isOverWrite=False) 60 | 61 | nump = int(numPartitions) 62 | rdd = odpsOps.readPartitionTable(project, table, partition, nump, batchSize=1) 63 | rows = rdd.collect() 64 | for row in rows: 65 | print "row: ", 66 | for col in row: 67 | print col, type(col), 68 | print "" 69 | 70 | print "read specific columns" 71 | rdd2 = odpsOps.readPartitionTable(project, table, partition, nump, cols=[1, 2]) 72 | rows2 = rdd2.collect() 73 | for row in rows2: 74 | print "row: ", 75 | for col in row: 76 | print col, type(col), 77 | print "" 78 | -------------------------------------------------------------------------------- /src/main/python/streaming/loghub-wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | import sys 19 | 20 | from pyspark import SparkContext 21 | from pyspark.streaming import StreamingContext 22 | from loghub import LoghubUtils 23 | 24 | if __name__ == "__main__": 25 | if len(sys.argv) != 8: 26 | print >> sys.stderr, "Usage: spark-submit loghub-wordcount.py logServiceProject logsStoreName " \ 27 | "logHubConsumerGroupName loghubEndpoint numReceiver accessKeyId accessKeySecret" 28 | exit(-1) 29 | 30 | sc = SparkContext(appName="PythonStreamingLoghubWordCount") 31 | ssc = StreamingContext(sc, 2) 32 | 33 | logServiceProject = sys.argv[1] 34 | logsStoreName = sys.argv[2] 35 | logHubConsumerGroupName = sys.argv[3] 36 | loghubEndpoint = sys.argv[4] 37 | numReceiver = int(sys.argv[5]) 38 | accessKeyId = sys.argv[6] 39 | accessKeySecret = sys.argv[7] 40 | 41 | stream = LoghubUtils.createStreams(ssc, logServiceProject, logsStoreName, logHubConsumerGroupName, loghubEndpoint, 42 | numReceiver, accessKeyId, accessKeySecret) 43 | lines = stream.map(lambda x: x[1]) 44 | counts = lines.flatMap(lambda line: line.split(" ")) \ 45 | .map(lambda word: (word, 1)) \ 46 | .reduceByKey(lambda a, b: a+b) 47 | counts.pprint() 48 | 49 | ssc.start() 50 | ssc.awaitTermination() 51 | -------------------------------------------------------------------------------- /src/main/python/streaming/wcmapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | for line in sys.stdin: 6 | line = line.strip() 7 | words = line.split() 8 | for word in words: 9 | print '%s\t%s' % (word, 1) 10 | 11 | -------------------------------------------------------------------------------- /src/main/python/streaming/wcreducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from operator import itemgetter 4 | import sys 5 | 6 | current_word = None 7 | current_count = 0 8 | word = None 9 | 10 | for line in sys.stdin: 11 | line = line.strip() 12 | 13 | word, count = line.split('\t', 1) 14 | 15 | try: 16 | count = int(count) 17 | except ValueError: 18 | continue 19 | 20 | if current_word == word: 21 | current_count += count 22 | else: 23 | if current_word: 24 | print '%s\t%s' % (current_word, current_count) 25 | current_count = count 26 | current_word = word 27 | 28 | if current_word == word: 29 | print '%s\t%s' % (current_word, current_count) 30 | -------------------------------------------------------------------------------- /src/main/python/wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
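A quick note on wcmapper.py and wcreducer.py above: the mapper emits one tab-separated "word 1" line per token, and the reducer only works because Hadoop Streaming sorts the mapper output by key before feeding it in, so equal words arrive on adjacent lines. The same contract can be reproduced locally with a plain pipe (both scripts use Python 2 print statements, so run them with a Python 2 interpreter); the input sentence below is made up purely for illustration:

$ echo "to be or not to be" | python wcmapper.py | sort | python wcreducer.py
be      2
not     1
or      1
to      2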
16 | # 17 | 18 | import sys 19 | from operator import add 20 | from pyspark import SparkContext 21 | from pyspark import SparkConf 22 | 23 | if __name__ == "__main__": 24 | conf = SparkConf() 25 | sc = SparkContext(appName="PythonWordCount", conf=conf) 26 | lines = sc.textFile(sys.argv[1], int(sys.argv[3])) 27 | counts = lines.flatMap(lambda x: x.split(' ')) \ 28 | .map(lambda x: (str(x), 1)) \ 29 | .reduceByKey(add) 30 | counts.saveAsTextFile(sys.argv[2]) 31 | sc.stop() -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/flink/FlinkOSSSample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.flink 19 | 20 | import org.apache.flink.api.java.ExecutionEnvironment 21 | import org.apache.flink.api.java.utils.ParameterTool 22 | 23 | import scala.collection.JavaConversions._ 24 | 25 | object FlinkOSSSample { 26 | def main(args: Array[String]) { 27 | 28 | val params: ParameterTool = ParameterTool.fromArgs(args) 29 | 30 | // set up execution environment 31 | val env = ExecutionEnvironment.getExecutionEnvironment 32 | 33 | // make parameters available in the web interface 34 | env.getConfig.setGlobalJobParameters(params) 35 | 36 | if (!params.has("input")) { 37 | println("Executing WordCount example with default input data set.") 38 | println("Use --input to specify file input.") 39 | sys.exit(1) 40 | } 41 | val text = env.readTextFile(params.get("input")) 42 | 43 | val top10 = text.first(10) 44 | 45 | top10.collect().foreach(println) 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/AbstractParams.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
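One caveat on FlinkOSSSample.scala above: despite the message about running "with default input data set", the program simply exits when --input is missing, so an OSS (or other filesystem) path must always be supplied, and the job itself only prints the first ten lines rather than counting words. If an actual word count is wanted, the DataSet it reads can be aggregated with the Flink Scala API; the sketch below is illustrative only and assumes the flink-scala dependency is on the classpath (the sample itself uses the Java ExecutionEnvironment):

import org.apache.flink.api.scala._

object FlinkOSSWordCountSketch {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    // read the same OSS or local path that FlinkOSSSample reads via --input
    val text = env.readTextFile(args(0))
    val counts = text
      .flatMap(_.toLowerCase.split("\\W+"))
      .filter(_.nonEmpty)
      .map((_, 1))
      .groupBy(0) // group on the word field of the (word, 1) tuple
      .sum(1)     // sum the ones per word
    counts.first(10).print()
  }
}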
16 | */ 17 | 18 | package com.aliyun.emr.example.spark 19 | 20 | import scala.reflect.runtime.universe._ 21 | 22 | /** 23 | * Abstract class for parameter case classes. 24 | * This overrides the [[toString]] method to print all case class fields by name and value. 25 | * @tparam T Concrete parameter class. 26 | */ 27 | abstract class AbstractParams[T: TypeTag] { 28 | 29 | private def tag: TypeTag[T] = typeTag[T] 30 | 31 | /** 32 | * Finds all case class fields in concrete class instance, and outputs them in JSON-style format: 33 | * { 34 | * [field name]:\t[field value]\n 35 | * [field name]:\t[field value]\n 36 | * ... 37 | * } 38 | */ 39 | override def toString: String = { 40 | val tpe = tag.tpe 41 | val allAccessors = tpe.declarations.collect { 42 | case m: MethodSymbol if m.isCaseAccessor => m 43 | } 44 | val mirror = runtimeMirror(getClass.getClassLoader) 45 | val instanceMirror = mirror.reflect(this) 46 | allAccessors.map { f => 47 | val paramName = f.name.toString 48 | val fieldMirror = instanceMirror.reflectField(f) 49 | val paramValue = fieldMirror.get 50 | s" $paramName:\t$paramValue" 51 | }.mkString("{\n", ",\n", "\n}") 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/LinearRegression.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
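To make the reflection-based toString in AbstractParams.scala above concrete: any parameter case class that extends it gets a readable field dump for free, which is exactly how LinearRegression.Params below uses it. A minimal hypothetical example (names invented for illustration):

case class DemoParams(input: String = "oss://bucket/data.txt", numIterations: Int = 100)
  extends AbstractParams[DemoParams]

// println(DemoParams()) prints something like:
// {
//   input:           oss://bucket/data.txt,
//   numIterations:   100
// }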
16 | */ 17 | 18 | package com.aliyun.emr.example.spark 19 | 20 | import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater} 21 | import org.apache.spark.mllib.regression.LinearRegressionWithSGD 22 | import org.apache.spark.mllib.util.MLUtils 23 | import _root_.scopt.OptionParser 24 | 25 | object LinearRegression extends RunLocally{ 26 | object RegType extends Enumeration { 27 | type RegType = Value 28 | val NONE, L1, L2 = Value 29 | } 30 | 31 | import RegType._ 32 | 33 | case class Params( 34 | input: String = null, 35 | numPartitions: Int = 2, 36 | numIterations: Int = 100, 37 | stepSize: Double = 1.0, 38 | regType: RegType = L2, 39 | regParam: Double = 0.01, 40 | accessKeyId: String = null, 41 | accessKeySecret: String = null, 42 | endpoint: String = null) extends AbstractParams[Params] 43 | 44 | def main(args: Array[String]) { 45 | val defaultParams = Params() 46 | 47 | val parser = new OptionParser[Params]("LinearRegression") { 48 | head("LinearRegression: an example app for linear regression.") 49 | opt[Int]("numIterations") 50 | .text("number of iterations") 51 | .action((x, c) => c.copy(numIterations = x)) 52 | opt[Double]("stepSize") 53 | .text(s"initial step size, default: ${defaultParams.stepSize}") 54 | .action((x, c) => c.copy(stepSize = x)) 55 | opt[String]("regType") 56 | .text(s"regularization type (${RegType.values.mkString(",")}), " + 57 | s"default: ${defaultParams.regType}") 58 | .action((x, c) => c.copy(regType = RegType.withName(x))) 59 | opt[Double]("regParam") 60 | .text(s"regularization parameter, default: ${defaultParams.regParam}") 61 | arg[String]("") 62 | .required() 63 | .text("input paths to labeled examples in LIBSVM format") 64 | .action((x, c) => c.copy(input = x)) 65 | arg[Int]("") 66 | .required() 67 | .text(s"number of partitions, default: ${defaultParams.numPartitions}") 68 | .action((x, c) => c.copy(numPartitions = x)) 69 | note( 70 | """ 71 | | For example, the following command runs this app on a synthetic dataset: 72 | | 73 | | bin/spark-submit --class LinearRegression examples-1.0-SNAPSHOT-shaded.jar oss://accessKeyId:accessKeySecret@bucket.endpoint/input.txt 2 74 | """.stripMargin) 75 | } 76 | 77 | parser.parse(args, defaultParams).map { params => 78 | run(params) 79 | } getOrElse { 80 | sys.exit(1) 81 | } 82 | } 83 | 84 | def run(params: Params) { 85 | val examples = MLUtils.loadLibSVMFile(getSparkContext, params.input).cache() 86 | val splits = examples.randomSplit(Array(0.8, 0.2)) 87 | val training = splits(0).cache() 88 | val test = splits(1).cache() 89 | 90 | val numTraining = training.count() 91 | val numTest = test.count() 92 | println(s"Training: $numTraining, test: $numTest.") 93 | 94 | examples.unpersist(blocking = false) 95 | 96 | val updater = params.regType match { 97 | case NONE => new SimpleUpdater() 98 | case L1 => new L1Updater() 99 | case L2 => new SquaredL2Updater() 100 | } 101 | 102 | val algorithm = new LinearRegressionWithSGD() 103 | algorithm.optimizer 104 | .setNumIterations(params.numIterations) 105 | .setStepSize(params.stepSize) 106 | .setUpdater(updater) 107 | .setRegParam(params.regParam) 108 | 109 | val model = algorithm.run(training) 110 | 111 | val prediction = model.predict(test.map(_.features)) 112 | val predictionAndLabel = prediction.zip(test.map(_.label)) 113 | 114 | val loss = predictionAndLabel.map { case (p, l) => 115 | val err = p - l 116 | err * err 117 | }.reduce(_ + _) 118 | val rmse = math.sqrt(loss / numTest) 119 | 120 | println(s"Test RMSE = $rmse.") 121 | 122 | 
getSparkContext.stop() 123 | } 124 | 125 | override def getAppName: String = "LinearRegression" 126 | } 127 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/MongoDBWordCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark 19 | 20 | import com.stratio.datasource.mongodb._ 21 | import com.stratio.datasource.mongodb.config._ 22 | import com.stratio.datasource.mongodb.config.MongodbConfig._ 23 | 24 | import org.apache.spark.sql._ 25 | import org.apache.spark.sql.SQLContext 26 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 27 | 28 | object MongoDBWordCount extends RunLocally { 29 | def main(args: Array[String]): Unit = { 30 | if (args.length < 12) { 31 | System.err.println( 32 | """Usage: bin/spark-submit --class MongoDBWordCount examples-1.0-SNAPSHOT-shaded.jar 33 | | 34 | | 35 | | 36 | |Arguments: 37 | | 38 | | dbName MongoDB database name. 39 | | dbUrl MongoDB database URL. 40 | | dbPort MongoDB database port. 41 | | userName MongoDB database user name. 42 | | pwd mongoDB database password. 43 | | collectionName MongoDB collection name. 44 | | sampleRatio MongoDB sample ratio. 45 | | writeConcern MongoDB write concern. 46 | | splitSize MongoDB split size. 47 | | splitKey MongoDB split key. 48 | | inputPath OSS input object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/a/b.txt 49 | | numPartitions RDD partition number. 
50 | | 51 | """.stripMargin) 52 | System.exit(1) 53 | } 54 | 55 | val dbName = args(0) 56 | val dbUrl = args(1) 57 | val dbPort = args(2) 58 | val userName = args(3) 59 | val pwd = args(4) 60 | val collectionName = args(5) 61 | val sampleRatio = args(6).toFloat 62 | val writeConcern = args(7) 63 | val splitSize = args(8).toInt 64 | val splitKey = args(9) 65 | val inputPath = args(10) 66 | val numPartitions = args(11).toInt 67 | 68 | val sqlContext = new SQLContext(getSparkContext) 69 | 70 | val input = getSparkContext.textFile(inputPath, numPartitions) 71 | val counts = input.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _).map(e => Row.apply(e._1, e._2)) 72 | lazy val schema = StructType( 73 | StructField("word", StringType) :: 74 | StructField("count", IntegerType) :: Nil) 75 | 76 | val hosts = dbUrl.split(",").map(e => s"$e:$dbPort").toList 77 | val df = sqlContext.createDataFrame(counts, schema) 78 | val saveConfig = MongodbConfigBuilder(Map(Host -> hosts, Database -> dbName, 79 | Collection -> collectionName, SamplingRatio -> sampleRatio, WriteConcern -> writeConcern, 80 | SplitSize -> splitSize, SplitKey -> splitKey, 81 | Credentials -> List(com.stratio.datasource.mongodb.config.MongodbCredentials(userName, dbName, pwd.toCharArray)))) 82 | df.saveToMongodb(saveConfig.build()) 83 | } 84 | 85 | override def getAppName: String = "MongoDBWordCount" 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/RunLocally.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | 22 | trait RunLocally { 23 | 24 | def getAppName: String 25 | 26 | def getSparkConf: SparkConf = new SparkConf() 27 | 28 | def getSparkContext: SparkContext = { 29 | val conf = getSparkConf.setAppName(getAppName).setMaster("local[4]") 30 | conf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem") 31 | conf.set("spark.hadoop.mapreduce.job.run-local", "true") 32 | new SparkContext(conf) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/SparkMaxComputeDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
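A note on the RunLocally trait above: getSparkContext constructs a brand-new local SparkContext on every call, so demos that call it more than once (MongoDBWordCount, for instance, calls it once for the SQLContext and again for textFile) can end up asking Spark for a second context. When writing a new demo against this trait, caching the context once is a safe habit; the object below is only a sketch, not part of the repository:

object MyOssDemoSketch extends RunLocally {
  // cache the context so repeated use does not construct a second SparkContext
  private lazy val sc = getSparkContext

  def main(args: Array[String]): Unit = {
    val lines = sc.textFile(args(0), 2)
    println(s"line count: ${lines.count()}")
    sc.stop()
  }

  override def getAppName: String = "MyOssDemoSketch"
}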
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark 19 | 20 | import com.aliyun.odps.TableSchema 21 | import com.aliyun.odps.data.Record 22 | 23 | import org.apache.spark.aliyun.odps.OdpsOps 24 | import org.apache.spark.{SparkConf, SparkContext} 25 | 26 | object SparkMaxComputeDemo { 27 | def main(args: Array[String]): Unit = { 28 | if (args.length < 6) { 29 | System.err.println( 30 | """Usage: SparkMaxComputeDemo
31 | 32 | |Arguments: 33 | | 34 | | accessKeyId Aliyun Access Key ID. 35 | | accessKeySecret Aliyun Access Key Secret. 36 | | envType 0 or 1 37 | | 0: Public environment. 38 | | 1: Aliyun internal environment, i.e. Aliyun ECS etc. 39 | | project Aliyun ODPS project 40 | | table Aliyun ODPS table 41 | | numPartitions the number of RDD partitions 42 | """.stripMargin) 43 | System.exit(1) 44 | } 45 | 46 | val accessKeyId = args(0) 47 | val accessKeySecret = args(1) 48 | val envType = args(2).toInt 49 | val project = args(3) 50 | val table = args(4) 51 | val numPartitions = args(5).toInt 52 | 53 | val urls = Seq( 54 | Seq("http://service.odps.aliyun.com/api", "http://dt.odps.aliyun.com"), // public environment 55 | Seq("http://odps-ext.aliyun-inc.com/api", "http://dt-ext.odps.aliyun-inc.com") // Aliyun internal environment 56 | ) 57 | 58 | val conf = new SparkConf().setAppName("E-MapReduce Demo 3-1: Spark MaxCompute Demo (Scala)") 59 | val sc = new SparkContext(conf) 60 | val odpsOps = envType match { 61 | case 0 => 62 | OdpsOps(sc, accessKeyId, accessKeySecret, urls(0)(0), urls(0)(1)) 63 | case 1 => 64 | OdpsOps(sc, accessKeyId, accessKeySecret, urls(1)(0), urls(1)(1)) 65 | } 66 | 67 | val odpsData = odpsOps.readTable(project, table, read, numPartitions) 68 | 69 | println(s"Count (odpsData): ${odpsData.count()}") 70 | } 71 | 72 | def read(record: Record, schema: TableSchema): Long = { 73 | record.getBigint(0) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/SparkOssDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark 19 | 20 | import org.apache.hadoop.io.{LongWritable, Text} 21 | import org.apache.hadoop.mapred.TextInputFormat 22 | import org.apache.spark.SparkConf 23 | 24 | object SparkOssDemo extends RunLocally { 25 | var accessKeyId = "" 26 | var accessKeySecret = "" 27 | var endpoint = "" 28 | 29 | def main(args: Array[String]): Unit = { 30 | if (args.length < 6) { 31 | System.err.println( 32 | """Usage: bin/spark-submit --class com.aliyun.emr.example.spark.SparkOssDemo examples-1.0-SNAPSHOT-shaded.jar 33 | | 34 | |Arguments: 35 | | 36 | | accessKeyId OSS accessKeyId 37 | | accessKeySecret OSS accessKeySecret 38 | | endpoint OSS endpoint 39 | | inputPath Input OSS object path, like oss://bucket/input/a.txt 40 | | outputPath Output OSS object path, like oss://bucket/output/ 41 | | numPartitions the number of RDD partitions.
42 | | 43 | """.stripMargin) 44 | System.exit(1) 45 | } 46 | 47 | accessKeyId = args(0) 48 | accessKeySecret = args(1) 49 | endpoint = args(2) 50 | val inputPath = args(3) 51 | val outputPath = args(4) 52 | val numPartitions = args(5).toInt 53 | val ossData = getSparkContext.hadoopFile(inputPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], numPartitions) 54 | ossData.foreach(line => println(s"print: ${line}")) 55 | 56 | ossData.saveAsTextFile(outputPath) 57 | } 58 | 59 | override def getAppName: String = "E-MapReduce Demo 2-1: Spark Oss Demo (Scala)" 60 | 61 | override def getSparkConf: SparkConf = { 62 | val conf = new SparkConf() 63 | conf.set("spark.hadoop.fs.oss.accessKeyId", accessKeyId) 64 | conf.set("spark.hadoop.fs.oss.accessKeySecret", accessKeySecret) 65 | conf.set("spark.hadoop.fs.oss.endpoint", endpoint) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark 19 | 20 | import scala.math._ 21 | 22 | /** Computes an approximation to pi */ 23 | object SparkPi extends RunLocally{ 24 | def main(args: Array[String]) { 25 | val slices = if (args.length > 0) args(0).toInt else 2 26 | val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow 27 | val count = getSparkContext.parallelize(1 until n, slices).map { i => 28 | val x = random * 2 - 1 29 | val y = random * 2 - 1 30 | if (x*x + y*y < 1) 1 else 0 31 | }.reduce(_ + _) 32 | println("Pi is roughly " + 4.0 * count / n) 33 | getSparkContext.stop() 34 | } 35 | 36 | override def getAppName: String = "SparkPi" 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/SparkRdsDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark 19 | 20 | import java.sql.{Connection, DriverManager, PreparedStatement} 21 | 22 | object SparkRdsDemo extends RunLocally { 23 | def main(args: Array[String]): Unit = { 24 | if (args.length < 8) { 25 | System.err.println( 26 | """Usage: spark-submit --class SparkRdsDemo examples-1.0-SNAPSHOT-shaded.jar 27 | | 28 | | 29 | |Arguments: 30 | | 31 | | dbName RDS database name. 32 | | tbName RDS table name. 33 | | dbUser RDS database user name. 34 | | dbPwd RDS database password. 35 | | dbUrl RDS database URL. 36 | | dbPort RDS database port 37 | | inputPath OSS input object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/a/b.txt 38 | | numPartitions 39 | | 40 | """.stripMargin) 41 | System.exit(1) 42 | } 43 | val dbName = args(0) 44 | val tbName = args(1) 45 | val dbUser = args(2) 46 | val dbPwd = args(3) 47 | val dbUrl = args(4) 48 | val dbPort = args(5) 49 | val inputPath = args(6) 50 | val numPartitions = args(7).toInt 51 | 52 | val input = getSparkContext.textFile(inputPath, numPartitions) 53 | input.collect().foreach(println) 54 | input.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _) 55 | .mapPartitions(e => { 56 | var conn: Connection = null 57 | var ps: PreparedStatement = null 58 | val sql = s"insert into $tbName(word, count) values (?, ?)" 59 | try { 60 | conn = DriverManager.getConnection(s"jdbc:mysql://$dbUrl:$dbPort/$dbName", dbUser, dbPwd) 61 | ps = conn.prepareStatement(sql) 62 | e.foreach(pair => { 63 | ps.setString(1, pair._1) 64 | ps.setLong(2, pair._2) 65 | ps.executeUpdate() 66 | }) 67 | 68 | ps.close() 69 | conn.close() 70 | } catch { 71 | case e: Exception => e.printStackTrace() 72 | } finally { 73 | if (ps != null) { 74 | ps.close() 75 | } 76 | if (conn != null) { 77 | conn.close() 78 | } 79 | } 80 | Iterator.empty 81 | }).count() 82 | } 83 | 84 | override def getAppName: String = "E-MapReduce Demo 10: Spark Rds Demo (Scala)" 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/SparkWordCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
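On SparkRdsDemo above: the mapPartitions block opens one JDBC connection per partition and then issues one executeUpdate per word, which works but costs a network round trip per row. Standard JDBC batching is a drop-in variation; the fragment below reuses the demo's own names (tbName, dbUrl, e, and so on) and sketches only the inner loop, so it is a suggestion rather than a tested replacement:

val conn = DriverManager.getConnection(s"jdbc:mysql://$dbUrl:$dbPort/$dbName", dbUser, dbPwd)
val ps = conn.prepareStatement(s"insert into $tbName(word, count) values (?, ?)")
try {
  e.foreach { case (word, cnt) =>
    ps.setString(1, word)
    ps.setLong(2, cnt)
    ps.addBatch()   // queue the row locally instead of a round trip per word
  }
  ps.executeBatch() // flush all queued inserts in one statement execution
} finally {
  ps.close()
  conn.close()
}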
16 | */ 17 | 18 | package com.aliyun.emr.example.spark 19 | 20 | /** Counts words in new text files created in the given directory */ 21 | object SparkWordCount extends RunLocally { 22 | def main(args: Array[String]): Unit = { 23 | if (args.length < 3) { 24 | System.err.println( 25 | """Usage: bin/spark-submit --class com.aliyun.emr.example.SparkWordCount examples-1.0-SNAPSHOT-shaded.jar 26 | | 27 | |Arguments: 28 | | 29 | | inputPath Input OSS object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/input/words.txt 30 | | outputPath Output OSS object path, like oss://accessKeyId:accessKeySecret@bucket.endpoint/output 31 | | numPartitions The number of RDD partitions. 32 | | 33 | """.stripMargin) 34 | System.exit(1) 35 | } 36 | 37 | val inputPath = args(0) 38 | val outputPath = args(1) 39 | val numPartitions = args(2).toInt 40 | 41 | val input = getSparkContext.textFile(inputPath, numPartitions) 42 | val output = input.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _) 43 | 44 | output.saveAsTextFile(outputPath) 45 | } 46 | 47 | override def getAppName: String = "E-MapReduce Demo 1: SparkWordCount" 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/sql/ODPSDataSourceSample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark.sql 19 | 20 | import org.apache.spark.sql.{SaveMode, SparkSession} 21 | 22 | object ODPSDataSourceSample { 23 | def main(args: Array[String]): Unit = { 24 | if (args.length < 6) { 25 | System.err.println( 26 | """Usage: ODPSDataSourceSample
27 | | 28 | |Arguments: 29 | | 30 | | accessKeyId Aliyun Access Key ID. 31 | | accessKeySecret Aliyun Key Secret. 32 | | envType 0 or 1 33 | | 0: Public environment. 34 | | 1: Aliyun internal environment, i.e. Aliyun ECS etc. 35 | | project Aliyun ODPS project 36 | | table Aliyun ODPS table 37 | | numPartitions the number of RDD partitions 38 | """.stripMargin) 39 | System.exit(1) 40 | } 41 | 42 | val accessKeyId = args(0) 43 | val accessKeySecret = args(1) 44 | val envType = args(2).toInt 45 | val project = args(3) 46 | val table = args(4) 47 | 48 | val urls = Seq( 49 | Seq("http://service.odps.aliyun.com/api", "http://dt.odps.aliyun.com"), // public environment 50 | Seq("http://odps-ext.aliyun-inc.com/api", "http://dt-ext.odps.aliyun-inc.com") // Aliyun internal environment 51 | ) 52 | 53 | val odpsUrl = urls(envType)(0) 54 | val tunnelUrl = urls(envType)(1) 55 | 56 | val ss = SparkSession.builder().appName("Test Odps Read").master("local[*]").getOrCreate() 57 | 58 | import ss.implicits._ 59 | 60 | val dataSeq = (1 to 1000000).map { 61 | index => (index, (index-3).toString) 62 | }.toSeq 63 | 64 | 65 | val df = ss.sparkContext.makeRDD(dataSeq).toDF("a", "b") 66 | 67 | System.out.println("*****" + table + ",before overwrite table") 68 | df.write.format("org.apache.spark.aliyun.odps.datasource") 69 | .option("odpsUrl", odpsUrl) 70 | .option("tunnelUrl", tunnelUrl) 71 | .option("table", table) 72 | .option("project", project) 73 | .option("accessKeySecret", accessKeySecret) 74 | .option("accessKeyId", accessKeyId).mode(SaveMode.Overwrite).save() 75 | 76 | System.out.println("*****" + table + ",after overwrite table, before read table") 77 | 78 | val readDF = ss.read 79 | .format("org.apache.spark.aliyun.odps.datasource") 80 | .option("odpsUrl", odpsUrl) 81 | .option("tunnelUrl", tunnelUrl) 82 | .option("table", table) 83 | .option("project", project) 84 | .option("accessKeySecret", accessKeySecret) 85 | .option("accessKeyId", accessKeyId).load() 86 | 87 | 88 | val collectList = readDF.collect() 89 | System.out.println("*****" + table + ",after read table," + collectList.size) 90 | assert(collectList.length == 1000000) 91 | assert((1 to 1000000).par.exists(n => collectList.exists(_.getLong(0) == n))) 92 | 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/sql/streaming/SparkSLSContinuousStructuredStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
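A small remark on the checks at the end of ODPSDataSourceSample above: the second assertion passes as soon as any one of the 1,000,000 generated keys shows up in the result, so it is a much weaker guarantee than the count assertion before it, and the nested collectList.exists scan makes it quadratic on top. If the intent is to verify that every key survived the write/read round trip through the ODPS table, a tighter version would presumably look like the sketch below (a suggestion, not code from the repository):

val returnedKeys: Set[Long] = collectList.map(_.getLong(0)).toSet
assert(returnedKeys.size == 1000000)
assert((1L to 1000000L).forall(returnedKeys.contains))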
16 | */ 17 | package com.aliyun.emr.example.spark.sql.streaming 18 | 19 | import java.util.UUID 20 | 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.sql.streaming.Trigger 23 | 24 | object SparkSLSContinuousStructuredStreamingDemo { 25 | def main(args: Array[String]) { 26 | if (args.length < 7) { 27 | System.err.println("Usage: SparkSLSContinuousStructuredStreamingDemo " + 28 | " " + 29 | " []") 30 | System.exit(1) 31 | } 32 | 33 | val Array(project, logStore, accessKeyId, accessKeySecret, endpoint, startingOffsets, maxOffsetsPerTrigger, _*) = args 34 | val checkpointLocation = 35 | if (args.length > 7) args(7) else "/tmp/temporary-" + UUID.randomUUID.toString 36 | 37 | val spark = SparkSession 38 | .builder 39 | .appName("E-MapReduce Demo 6-5: Spark SLS Demo (Scala)") 40 | .master("local[5]") 41 | .getOrCreate() 42 | 43 | spark.sparkContext.setLogLevel("WARN") 44 | 45 | import spark.implicits._ 46 | 47 | // Create DataSet representing the stream of input lines from loghub 48 | val lineLength = spark 49 | .readStream 50 | .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider") 51 | .option("sls.project", project) 52 | .option("sls.store", logStore) 53 | .option("access.key.id", accessKeyId) 54 | .option("access.key.secret", accessKeySecret) 55 | .option("endpoint", endpoint) 56 | .option("startingoffsets", startingOffsets) 57 | .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger) 58 | .load() 59 | .selectExpr("CAST(__value__ AS STRING)") 60 | .as[String].map(e => (e, e.length)).toDF("value", "length") 61 | 62 | val query = lineLength.writeStream 63 | .outputMode("append") 64 | .format("console") 65 | .option("checkpointLocation", checkpointLocation) 66 | .trigger(Trigger.Continuous("5 second")) 67 | .start() 68 | 69 | query.awaitTermination() 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/sql/streaming/SparkSLSStructuredStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
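On the continuous demo above: Trigger.Continuous("5 second") puts the query into Spark's continuous processing mode, where the interval is the checkpoint interval rather than a batch size, and only map-like operations (such as the select and map used here) are supported; aggregations still require micro-batching. Switching the same sink definition back to classic micro-batches is a one-line change, sketched here for comparison:

val query = lineLength.writeStream
  .outputMode("append")
  .format("console")
  .option("checkpointLocation", checkpointLocation)
  .trigger(Trigger.ProcessingTime("5 seconds")) // micro-batch roughly every 5s instead of continuous mode
  .start()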
16 | */ 17 | package com.aliyun.emr.example.spark.sql.streaming 18 | 19 | import java.util.UUID 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | object SparkSLSStructuredStreamingDemo { 24 | def main(args: Array[String]) { 25 | if (args.length < 7) { 26 | System.err.println("Usage: SparkSLSStructuredStreamingDemo " + 27 | " " + 28 | " []") 29 | System.exit(1) 30 | } 31 | 32 | val Array(project, logStore, accessKeyId, accessKeySecret, endpoint, startingOffsets, maxOffsetsPerTrigger, _*) = args 33 | val checkpointLocation = 34 | if (args.length > 7) args(7) else "/tmp/temporary-" + UUID.randomUUID.toString 35 | 36 | val spark = SparkSession 37 | .builder 38 | .appName("E-MapReduce Demo 6-3: Spark SLS Demo (Scala)") 39 | .master("local[5]") 40 | .getOrCreate() 41 | 42 | spark.sparkContext.setLogLevel("WARN") 43 | 44 | import spark.implicits._ 45 | 46 | // Create DataSet representing the stream of input lines from loghub 47 | val lines = spark 48 | .readStream 49 | .format("org.apache.spark.sql.aliyun.logservice.LoghubSourceProvider") 50 | .option("sls.project", project) 51 | .option("sls.store", logStore) 52 | .option("access.key.id", accessKeyId) 53 | .option("access.key.secret", accessKeySecret) 54 | .option("endpoint", endpoint) 55 | .option("startingoffsets", startingOffsets) 56 | .option("zookeeper.connect.address", "localhost:2181") 57 | .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger) 58 | .load() 59 | .selectExpr("CAST(__value__ AS STRING)") 60 | .as[String] 61 | 62 | val wordCounts = lines.flatMap(_.split(" ")).groupBy("__value__").count() 63 | 64 | val query = wordCounts.writeStream 65 | .outputMode("complete") 66 | .format("console") 67 | .option("checkpointLocation", checkpointLocation) 68 | .start() 69 | 70 | query.awaitTermination() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/DirectSparkSLSDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
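One thing to double-check in SparkSLSStructuredStreamingDemo above: after .as[String] and the typed flatMap, the resulting Dataset's single column carries the encoder's default name value rather than the original __value__, so groupBy("__value__") is likely to fail with an unresolved-column error at analysis time. The conventional structured-streaming word count groups on value; the adjusted line would be:

val wordCounts = lines.flatMap(_.split(" ")).groupBy("value").count()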
16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming 19 | 20 | import com.aliyun.openservices.loghub.client.config.LogHubCursorPosition 21 | import org.apache.spark.SparkConf 22 | import org.apache.spark.streaming.aliyun.logservice.{DirectLoghubInputDStream, LoghubUtils} 23 | import org.apache.spark.streaming.{Milliseconds, StreamingContext} 24 | 25 | object DirectSparkSLSDemo { 26 | def main(args: Array[String]): Unit = { 27 | if (args.length < 7) { 28 | System.err.println( 29 | """Usage: DirectSparkSLSDemo 30 | | 31 | """.stripMargin) 32 | System.exit(1) 33 | } 34 | 35 | val loghubProject = args(0) 36 | val logStore = args(1) 37 | val loghubGroupName = args(2) 38 | val endpoint = args(3) 39 | val accessKeyId = args(4) 40 | val accessKeySecret = args(5) 41 | val batchInterval = Milliseconds(args(6).toInt * 1000) 42 | val zkAddress = if (args.length >= 8) args(7) else "localhost:2181" 43 | 44 | def functionToCreateContext(): StreamingContext = { 45 | val conf = new SparkConf().setAppName("E-MapReduce Demo 6-2: Spark SLS Demo (Scala) (Direct API)") 46 | val ssc = new StreamingContext(conf, batchInterval) 47 | val zkParas = Map("zookeeper.connect" -> zkAddress, 48 | "enable.auto.commit" -> "false") 49 | val loghubStream = LoghubUtils.createDirectStream( 50 | ssc, 51 | loghubProject, 52 | logStore, 53 | loghubGroupName, 54 | accessKeyId, 55 | accessKeySecret, 56 | endpoint, 57 | zkParas, 58 | LogHubCursorPosition.END_CURSOR) 59 | 60 | loghubStream.checkpoint(batchInterval).foreachRDD(rdd => { 61 | println(s"count by key: ${rdd.map(s => { 62 | s.sorted 63 | (s.length, s) 64 | }).countByKey().size}") 65 | loghubStream.asInstanceOf[DirectLoghubInputDStream].commitAsync() 66 | }) 67 | ssc.checkpoint("hdfs:///tmp/spark/streaming") // set checkpoint directory 68 | ssc 69 | } 70 | 71 | val ssc = StreamingContext.getOrCreate("hdfs:///tmp/spark/streaming", functionToCreateContext _) 72 | 73 | ssc.start() 74 | ssc.awaitTermination() 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/DtsSample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
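Two points worth spelling out for DirectSparkSLSDemo above: commitAsync() is only called after the batch's work inside foreachRDD has finished, which is what gives at-least-once semantics on restart, and the path passed to StreamingContext.getOrCreate must match the one set via ssc.checkpoint so that a restart can rebuild the same context. If an actual per-word count is wanted instead of the demo's (length, line) countByKey, the foreachRDD body could look like the sketch below, assuming each record arrives as the raw log line string:

loghubStream.checkpoint(batchInterval).foreachRDD { rdd =>
  rdd.flatMap(_.split(" "))
    .map((_, 1))
    .reduceByKey(_ + _)
    .take(10) // just a peek for the demo
    .foreach { case (word, count) => println(s"$word -> $count") }
  loghubStream.asInstanceOf[DirectLoghubInputDStream].commitAsync()
}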
16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming 19 | 20 | import com.aliyun.drc.clusterclient.message.ClusterMessage 21 | 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.storage.StorageLevel 24 | import org.apache.spark.streaming.aliyun.dts.DtsUtils 25 | import org.apache.spark.streaming.{Milliseconds, StreamingContext} 26 | 27 | object DtsSample { 28 | def main(args: Array[String]): Unit = { 29 | if (args.length < 4) { 30 | System.err.println(s""" 31 | |Usage: DtsSample 32 | | Aliyun Access Key ID. 33 | | Aliyun Access Key Secret. 34 | | Aliyun DTS guid name. 35 | | Use public Ip to access DTS or not. 36 | | The time interval at which streaming data will be divided into batches. 37 | """.stripMargin) 38 | System.exit(1) 39 | } 40 | 41 | val Array(accessKeyId, accessKeySecret, guid, usePublicIp, interval) = args 42 | val sparkConf = new SparkConf().setAppName("DtsSample") 43 | val ssc: StreamingContext = new StreamingContext(sparkConf, Milliseconds(interval.toInt)) 44 | 45 | def func: ClusterMessage => String = msg => msg.getRecord.toString 46 | 47 | val dtsStream = DtsUtils.createStream( 48 | ssc, 49 | accessKeyId, 50 | accessKeySecret, 51 | guid, 52 | func, 53 | StorageLevel.MEMORY_AND_DISK_2, 54 | usePublicIp.toBoolean) 55 | 56 | dtsStream.foreachRDD(rdd => { 57 | rdd.collect().foreach(println) 58 | }) 59 | 60 | ssc.start() 61 | ssc.awaitTermination() 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/RedisWordCount.scala.1: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming 19 | 20 | import org.apache.spark.{SparkContext, SparkConf} 21 | import org.apache.spark.streaming.{Seconds, StreamingContext} 22 | import org.apache.spark.storage.StorageLevel 23 | import com.redislabs.provider.redis._ 24 | 25 | object RedisWordCount { 26 | def main(args: Array[String]): Unit = { 27 | if (args.length < 4) { 28 | System.err.println( 29 | """Usage: bin/spark-submit --class RedisWordCount examples-1.0-SNAPSHOT-shaded.jar 30 | | 31 | | 32 | |Arguments: 33 | | 34 | | redisHost Redis host. 35 | | redisPort Redis port. 36 | | redisAuth Redis auth. 37 | | keyName Redis key name. 
38 | | 39 | """.stripMargin) 40 | System.exit(1) 41 | } 42 | 43 | val redisHost = args(0) 44 | val redisPort = args(1) 45 | val redisAuth = args(2) 46 | val keyName = args(3) 47 | 48 | val conf = new SparkConf().setAppName("Redis WordCount").setMaster("local[4]") 49 | conf.set("redis.host", redisHost) 50 | conf.set("redis.port", redisPort) 51 | conf.set("redis.auth", redisAuth) 52 | val sc = new SparkContext(conf) 53 | val ssc = new StreamingContext(sc, Seconds(1)) 54 | 55 | val redisStream = ssc.createRedisStream(Array(keyName), storageLevel = StorageLevel.MEMORY_AND_DISK_2) 56 | redisStream.print() 57 | 58 | ssc.start() 59 | ssc.awaitTermination() 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/SparkDatahubDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming 19 | 20 | import com.aliyun.datahub.model.RecordEntry 21 | 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.storage.StorageLevel 24 | import org.apache.spark.streaming.{Milliseconds, StreamingContext} 25 | import org.apache.spark.streaming.aliyun.datahub.DatahubUtils 26 | import org.apache.spark.streaming.dstream.DStream 27 | 28 | object SparkDatahubDemo { 29 | def main(args: Array[String]): Unit = { 30 | if (args.length < 7) { 31 | // scalastyle:off 32 | System.err.println( 33 | """Usage: SparkDatahubDemo 34 | | [] 35 | """.stripMargin) 36 | // scalastyle:on 37 | System.exit(1) 38 | } 39 | 40 | var isShardDefined = false 41 | if (args.length == 8) { 42 | isShardDefined = true 43 | } 44 | 45 | val project = args(0) 46 | val topic = args(1) 47 | val subId = args(2) 48 | val accessKeyId = args(3) 49 | val accessKeySecret = args(4) 50 | val endpoint = args(5) 51 | val batchInterval = Milliseconds(args(6).toInt * 1000) 52 | 53 | def functionToCreateContext(): StreamingContext = { 54 | val conf = new SparkConf().setMaster("local[4]").setAppName("E-MapReduce Demo 11: Spark DataHub Demo (Scala)") 55 | conf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem") 56 | conf.set("spark.hadoop.mapreduce.job.run-local", "true") 57 | val ssc = new StreamingContext(conf, batchInterval) 58 | var datahubStream: DStream[Array[Byte]] = null 59 | if (isShardDefined) { 60 | val shardId = args(7) 61 | datahubStream = DatahubUtils.createStream( 62 | ssc, 63 | project, 64 | topic, 65 | subId, 66 | accessKeyId, 67 | accessKeySecret, 68 | endpoint, 69 | shardId, 70 | read(_), 71 | StorageLevel.MEMORY_AND_DISK) 72 | } else { 73 | datahubStream = DatahubUtils.createStream( 74 | ssc, 75 
| project, 76 | topic, 77 | subId, 78 | accessKeyId, 79 | accessKeySecret, 80 | endpoint, 81 | read(_), 82 | StorageLevel.MEMORY_AND_DISK) 83 | } 84 | 85 | // scalastyle:off 86 | datahubStream.foreachRDD(rdd => println(s"rdd.count(): ${rdd.count()}")) 87 | // scalastyle:on 88 | ssc.checkpoint("hdfs:///tmp/spark/streaming") // set checkpoint directory 89 | ssc 90 | } 91 | 92 | val ssc = StreamingContext.getOrCreate("hdfs:///tmp/spark/streaming", functionToCreateContext _) 93 | 94 | ssc.start() 95 | ssc.awaitTermination() 96 | } 97 | 98 | def read(record: RecordEntry): String = { 99 | record.getString(0) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/SparkHBaseDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming 19 | 20 | import com.aliyun.openservices.ons.api.Message 21 | import org.apache.hadoop.conf.Configuration 22 | import org.apache.hadoop.hbase.{HConstants, HBaseConfiguration, TableName} 23 | import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put} 24 | import org.apache.hadoop.hbase.util.Bytes 25 | import org.apache.spark.SparkConf 26 | import org.apache.spark.storage.StorageLevel 27 | import org.apache.spark.streaming.aliyun.ons.OnsUtils 28 | import org.apache.spark.streaming.{StreamingContext, Seconds} 29 | import scala.collection.JavaConversions._ 30 | 31 | object ConnectionUtil extends Serializable { 32 | private var conf: Configuration = null 33 | 34 | private var connection: Connection = null 35 | 36 | def getDefaultConn(quorum: String): Connection = { 37 | if (conf == null && connection == null) { 38 | conf = HBaseConfiguration.create() 39 | conf.set(HConstants.ZOOKEEPER_QUORUM, quorum) 40 | conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/hbase") 41 | connection = ConnectionFactory.createConnection(conf) 42 | } 43 | connection 44 | } 45 | } 46 | 47 | object SparkHBaseDemo { 48 | def main(args: Array[String]): Unit = { 49 | if (args.length < 7) { 50 | System.err.println( 51 | """Usage: spark-submit --class SparkHBaseDemo examples-1.0-SNAPSHOT-shaded.jar 52 | | 53 | | 54 | |Arguments: 55 | | 56 | | accessKeyId Aliyun Access Key ID. 57 | | accessKeySecret Aliyun Key Secret. 58 | | consumerId ONS ConsumerID. 59 | | topic ONS topic. 60 | | subExpression * for all, or some specific tag. 61 | | tableName The name of HBase table. 62 | | quorum HBase quorum setting. 
63 | | 64 | """.stripMargin) 65 | System.exit(1) 66 | } 67 | 68 | val Array(accessKeyId, accessKeySecret, consumerId, topic, subExpression, tname, quorum) = args 69 | 70 | val COLUMN_FAMILY_BYTES = Bytes.toBytes("count") 71 | val COLUMN_QUALIFIER_BYTES = Bytes.toBytes("count") 72 | 73 | val batchInterval = Seconds(2) 74 | 75 | val conf = new SparkConf().setAppName("E-MapReduce Demo 9: Spark HBase Demo (Scala)") 76 | val ssc = new StreamingContext(conf, batchInterval) 77 | def func: Message => Array[Byte] = msg => msg.getBody 78 | val onsStream = OnsUtils.createStream( 79 | ssc, 80 | consumerId, 81 | topic, 82 | subExpression, 83 | accessKeyId, 84 | accessKeySecret, 85 | StorageLevel.MEMORY_AND_DISK_2, 86 | func) 87 | 88 | onsStream.foreachRDD(rdd => { 89 | rdd.map(bytes => new String(bytes)) 90 | .flatMap(line => line.split(" ")) 91 | .map(word => (word, 1)) 92 | .reduceByKey(_ + _) 93 | .mapPartitions {words => { 94 | val conn = ConnectionUtil.getDefaultConn(quorum) 95 | val tableName = TableName.valueOf(tname) 96 | val t = conn.getTable(tableName) 97 | try { 98 | words.sliding(100, 100).foreach(slice => { 99 | val puts = slice.map(word => { 100 | println(s"word: $word") 101 | val put = new Put(Bytes.toBytes(word._1 + System.currentTimeMillis())) 102 | put.addColumn(COLUMN_FAMILY_BYTES, COLUMN_QUALIFIER_BYTES, 103 | System.currentTimeMillis(), Bytes.toBytes(word._2)) 104 | put 105 | }).toList 106 | t.put(puts) 107 | }) 108 | } finally { 109 | t.close() 110 | } 111 | 112 | Iterator.empty 113 | }}.count() 114 | }) 115 | 116 | ssc.start() 117 | ssc.awaitTermination() 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/SparkKafkaDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
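The HBase demo above only writes counters; nothing in the repository reads them back. A minimal read-back sketch (not part of the original sources) that reuses the demo's ConnectionUtil helper and the same package; the quorum and table name are placeholders for whatever was passed to SparkHBaseDemo, and the "count"/"count" family/qualifier pair matches what the demo writes:

import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConversions._

object ScanWordCounts {
  def main(args: Array[String]): Unit = {
    val Array(quorum, tname) = args                    // placeholders, e.g. a ZooKeeper quorum and the table name used by the demo
    val conn = ConnectionUtil.getDefaultConn(quorum)   // same lazy singleton defined in SparkHBaseDemo.scala
    val table = conn.getTable(TableName.valueOf(tname))
    val scanner = table.getScanner(new Scan())
    try {
      // row key = word + insertion timestamp, cell value = the Int count written by the demo
      for (result <- scanner) {
        val word = Bytes.toString(result.getRow)
        val count = Bytes.toInt(result.getValue(Bytes.toBytes("count"), Bytes.toBytes("count")))
        println(s"$word -> $count")
      }
    } finally {
      scanner.close()
      table.close()
    }
  }
}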
16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming 19 | 20 | import org.apache.kafka.common.serialization.StringDeserializer 21 | import org.apache.spark.SparkConf 22 | import org.apache.spark.streaming._ 23 | import org.apache.spark.streaming.kafka010._ 24 | 25 | object SparkKafkaDemo { 26 | def main(args: Array[String]) { 27 | if (args.length < 2) { 28 | System.err.println(s""" 29 | |Usage: SparkKafkaDemo 30 | | is a list of one or more Kafka brokers 31 | | is a list of one or more kafka topics to consume from 32 | | 33 | """.stripMargin) 34 | System.exit(1) 35 | } 36 | val Array(brokers, topics, interval) = args 37 | 38 | val sparkConf = new SparkConf().setAppName("E-MapReduce Demo 9: Spark Kafka Demo (Scala)") 39 | val ssc = new StreamingContext(sparkConf, Seconds(interval.toInt)) 40 | 41 | val kafkaParams = Map[String, Object]( 42 | "bootstrap.servers" -> brokers, 43 | "key.deserializer" -> classOf[StringDeserializer], 44 | "value.deserializer" -> classOf[StringDeserializer], 45 | "group.id" -> "mugen1", 46 | "auto.offset.reset" -> "earliest", 47 | "enable.auto.commit" -> (false: java.lang.Boolean), 48 | "security.protocol" -> "SASL_PLAINTEXT", 49 | "sasl.mechanism" -> "GSSAPI", 50 | "sasl.kerberos.service.name" -> "kafka" 51 | ) 52 | 53 | val messages = KafkaUtils.createDirectStream[String, String]( 54 | ssc, 55 | LocationStrategies.PreferConsistent, 56 | ConsumerStrategies.Subscribe[String, String](Array(topics), kafkaParams) 57 | ) 58 | 59 | // Get the lines, split them into words, count the words and print 60 | val lines = messages.map(_.value) 61 | val words = lines.flatMap(_.split(" ")) 62 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) 63 | wordCounts.print() 64 | 65 | // Start the computation 66 | ssc.start() 67 | ssc.awaitTermination() 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/SparkMNSDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
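SparkKafkaDemo above sets enable.auto.commit to false but never commits offsets, and it subscribes to Array(topics) as one literal string even though the usage text describes a list. A minimal sketch, assuming the same ssc, kafkaParams and comma-separated topics argument as in the demo, that splits the topic list and commits offsets once each batch has been processed; the processing body is just the demo's word count:

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}

// assumes the `ssc`, `kafkaParams` and comma-separated `topics` values from the demo above
val messages = KafkaUtils.createDirectStream[String, String](
  ssc,
  LocationStrategies.PreferConsistent,
  ConsumerStrategies.Subscribe[String, String](topics.split(",").map(_.trim), kafkaParams))

messages.foreachRDD { rdd =>
  // capture the offset ranges before any shuffle changes the partitioning
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd.map(_.value()).flatMap(_.split(" ")).map(word => (word, 1L)).reduceByKey(_ + _)
    .collect().foreach { case (word, n) => println(s"$word: $n") }
  // commit only after the batch has been processed
  messages.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}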
16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.storage.StorageLevel 22 | import org.apache.spark.streaming.aliyun.mns.MnsUtils 23 | import org.apache.spark.streaming.{Seconds, StreamingContext} 24 | 25 | object SparkMNSDemo { 26 | def main(args: Array[String]): Unit = { 27 | if (args.length < 4) { 28 | System.err.println( 29 | """Usage: spark-submit --class SparkMNSDemo examples-1.0-SNAPSHOT-shaded.jar """.stripMargin) 30 | System.exit(1) 31 | } 32 | val queueName = args(0) 33 | val accessKeyId = args(1) 34 | val accessKeySecret = args(2) 35 | val endpoint = args(3) 36 | 37 | val conf = new SparkConf().setAppName("E-MapReduce Demo 8-1: Spark MNS Demo (Scala)").setMaster("local[4]") 38 | conf.set("spark.hadoop.fs.oss.impl", "com.aliyun.fs.oss.nat.NativeOssFileSystem") 39 | conf.set("spark.hadoop.mapreduce.job.run-local", "true") 40 | val batchInterval = Seconds(10) 41 | val ssc = new StreamingContext(conf, batchInterval) 42 | 43 | val mnsStream = MnsUtils.createPullingStreamAsBytes(ssc, queueName, accessKeyId, accessKeySecret, endpoint, 44 | StorageLevel.MEMORY_ONLY) 45 | mnsStream.foreachRDD( rdd => { 46 | rdd.collect().foreach(e => println(new String(e))) 47 | }) 48 | 49 | ssc.start() 50 | ssc.awaitTermination() 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/SparkRocketMQDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming 19 | 20 | import java.util.{Properties, UUID} 21 | 22 | import com.aliyun.openservices.ons.api.impl.ONSFactoryImpl 23 | import com.aliyun.openservices.ons.api.{Message, PropertyKeyConst} 24 | import org.apache.spark.storage.StorageLevel 25 | import org.apache.spark.streaming.aliyun.ons.OnsUtils 26 | import org.apache.spark.streaming.{Milliseconds, StreamingContext} 27 | import org.apache.spark.{SparkConf, SparkContext} 28 | 29 | object SparkRocketMQDemo { 30 | def main(args: Array[String]): Unit = { 31 | if (args.length < 6) { 32 | System.err.println( 33 | """Usage: bin/spark-submit --class com.aliyun.emr.example.spark.streaming.SparkRocketMQDemo examples-1.0-SNAPSHOT-shaded.jar 34 | | 35 | | 36 | |Arguments: 37 | | 38 | | accessKeyId Aliyun Access Key ID. 39 | | accessKeySecret Aliyun Key Secret. 40 | | consumerId ONS ConsumerID. 41 | | topic ONS topic. 42 | | subExpression * for all, or some specific tag. 43 | | parallelism The number of receivers. 
44 | | 45 | """.stripMargin) 46 | System.exit(1) 47 | } 48 | 49 | val accessKeyId = args(0) 50 | val accessKeySecret = args(1) 51 | val cId = args(2) 52 | val topic = args(3) 53 | val subExpression = args(4) 54 | val parallelism = args(5) 55 | 56 | val numStreams = parallelism.toInt 57 | val batchInterval = Milliseconds(2000) 58 | 59 | val conf = new SparkConf().setAppName("E-MapReduce Demo 4-1: Spark RocketMQ Demo (Scala)") 60 | val ssc = new StreamingContext(conf, batchInterval) 61 | def func: Message => Array[Byte] = msg => msg.getBody 62 | val onsStreams = (0 until numStreams).map { i => 63 | println(s"starting stream $i") 64 | OnsUtils.createStream( 65 | ssc, 66 | cId, 67 | topic, 68 | subExpression, 69 | accessKeyId, 70 | accessKeySecret, 71 | StorageLevel.MEMORY_AND_DISK_2, 72 | func) 73 | } 74 | 75 | val unionStreams = ssc.union(onsStreams) 76 | unionStreams.foreachRDD(rdd => println(s"count: ${rdd.count()}")) 77 | 78 | ssc.start() 79 | ssc.awaitTermination() 80 | } 81 | } 82 | 83 | object OnsRecordProducer { 84 | def main(args: Array[String]): Unit = { 85 | val Array(accessKeyId, accessKeySecret, pId, topic, tag, parallelism) = args 86 | 87 | val numPartition = parallelism.toInt 88 | val conf = new SparkConf().setAppName("E-MapReduce Demo 4-1: Spark RocketMQ Demo (Scala)") 89 | val sc = new SparkContext(conf) 90 | 91 | sc.parallelize(0 until numPartition, numPartition).mapPartitionsWithIndex { 92 | (index, itr) => { 93 | generate(index, accessKeyId, accessKeySecret, pId, topic, tag) 94 | Iterator.empty 95 | } 96 | }.count() 97 | } 98 | 99 | def generate( 100 | partitionId: Int, 101 | accessKeyId: String, 102 | accessKeySecret: String, 103 | pId: String, 104 | topic: String, 105 | tag: String): Unit = { 106 | val properties = new Properties() 107 | properties.put(PropertyKeyConst.ProducerId, pId) 108 | properties.put(PropertyKeyConst.AccessKey, accessKeyId) 109 | properties.put(PropertyKeyConst.SecretKey, accessKeySecret) 110 | val onsFactoryImpl = new ONSFactoryImpl 111 | val producer = onsFactoryImpl.createProducer(properties) 112 | producer.start() 113 | // the producer must be started before the first send; the loop below keeps it busy until the job is killed 114 | 115 | var count = 0 116 | while(true){ 117 | val uuid = UUID.randomUUID() 118 | val msg = new Message(topic, tag, uuid.toString.getBytes) 119 | msg.setKey(s"ORDERID_${partitionId}_$count") 120 | producer.send(msg) 121 | count += 1 122 | Thread.sleep(100L) 123 | } 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/SparkSLSDemo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
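OnsRecordProducer above sends messages in an endless loop, so its producer stays up for the lifetime of the job. A bounded variant, shown only as a sketch using the same ONS client calls (createProducer, start, send, shutdown); the object name and argument handling are illustrative and not part of the repository:

import java.util.{Properties, UUID}

import com.aliyun.openservices.ons.api.impl.ONSFactoryImpl
import com.aliyun.openservices.ons.api.{Message, PropertyKeyConst}

object BoundedOnsProducer {
  def main(args: Array[String]): Unit = {
    val Array(accessKeyId, accessKeySecret, pId, topic, tag, total) = args
    val properties = new Properties()
    properties.put(PropertyKeyConst.ProducerId, pId)
    properties.put(PropertyKeyConst.AccessKey, accessKeyId)
    properties.put(PropertyKeyConst.SecretKey, accessKeySecret)
    val producer = new ONSFactoryImpl().createProducer(properties)
    producer.start()                 // the client must be started before the first send
    try {
      (0 until total.toInt).foreach { i =>
        val msg = new Message(topic, tag, UUID.randomUUID().toString.getBytes)
        msg.setKey(s"ORDERID_demo_$i")
        producer.send(msg)
      }
    } finally {
      producer.shutdown()            // release connections once all messages have been sent
    }
  }
}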
16 | */ 17 | 18 | package com.aliyun.emr.example.spark.streaming 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.storage.StorageLevel 22 | import org.apache.spark.streaming.{Milliseconds, StreamingContext} 23 | import org.apache.spark.streaming.aliyun.logservice.LoghubUtils 24 | 25 | object SparkSLSDemo { 26 | 27 | def main(args: Array[String]): Unit = { 28 | if (args.length < 7) { 29 | System.err.println( 30 | """Usage: SparkSLSDemo 31 | | 32 | """.stripMargin) 33 | System.exit(1) 34 | } 35 | 36 | val loghubProject = args(0) 37 | val logStore = args(1) 38 | val loghubGroupName = args(2) 39 | val endpoint = args(3) 40 | val accessKeyId = args(4) 41 | val accessKeySecret = args(5) 42 | val batchInterval = Milliseconds(args(6).toInt * 1000) 43 | 44 | def functionToCreateContext(): StreamingContext = { 45 | val conf = new SparkConf().setAppName("E-MapReduce Demo 6-1: Spark SLS Demo (Scala)") 46 | val ssc = new StreamingContext(conf, batchInterval) 47 | val loghubStream = LoghubUtils.createStream( 48 | ssc, 49 | loghubProject, 50 | logStore, 51 | loghubGroupName, 52 | endpoint, 53 | accessKeyId, 54 | accessKeySecret, 55 | StorageLevel.MEMORY_AND_DISK) 56 | 57 | loghubStream.foreachRDD(rdd => println(s"rdd.count(): ${rdd.count()}")) 58 | ssc.checkpoint("hdfs:///tmp/spark/streaming") // set checkpoint directory 59 | ssc 60 | } 61 | 62 | val ssc = StreamingContext.getOrCreate("hdfs:///tmp/spark/streaming", functionToCreateContext _) 63 | 64 | ssc.start() 65 | ssc.awaitTermination() 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/AbstractStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.spark.streaming.benchmark 2 | 3 | import java.io.{BufferedInputStream, FileInputStream} 4 | import java.util.Properties 5 | 6 | import org.apache.kafka.clients.consumer.ConsumerRecord 7 | import org.apache.kafka.common.serialization.StringDeserializer 8 | import org.apache.spark.streaming.dstream.InputDStream 9 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 10 | import org.apache.spark.streaming.{Duration, StreamingContext} 11 | import org.apache.spark.{SparkConf, SparkContext} 12 | 13 | 14 | abstract class AbstractStreaming { 15 | var config: Properties= _ 16 | 17 | def runJob(args: Array[String]): Unit = { 18 | config = loadConfig(args(0)) 19 | val receiverCores = config.getProperty("partition.number").toInt / config.getProperty("kafka.partition.receiver.factor").toInt 20 | val executorCore = (config.getProperty("cluster.cores.total").toInt * config.getProperty("cpu.core.factor").toFloat - receiverCores).toInt/config.getProperty("spark.executor.instances").toInt 21 | val executorMem = config.getProperty("cluster.memory.per.node.mb").toInt * config.getProperty("cluster.worker.node.number").toInt / config.getProperty("spark.executor.instances").toInt 22 | val sparkConf = new SparkConf() 23 | .setAppName(config.getProperty("name")) 24 | .set("spark.yarn.am.memory.mb", config.getProperty("spark.yarn.am.memory.mb") + "m") 25 | .set("spark.yarn.am.cores", config.getProperty("spark.yarn.am.cores")) 26 | .set("spark.executor.instances", config.getProperty("spark.executor.instances")) 27 | .set("spark.executor.cores", executorCore.toString) 28 | .set("spark.executor.memory", executorMem + "m") 29 | .set("spark.streaming.blockInterval", 
config.getProperty("spark.streaming.blockInterval")) 30 | val ssc = new StreamingContext(new SparkContext(sparkConf), Duration(config.getProperty("duration.ms").toLong)) 31 | 32 | val kafkaParam = Map[String, Object]( 33 | "bootstrap.servers" -> config.getProperty("broker.list"), 34 | "key.deserializer" -> classOf[StringDeserializer], 35 | "value.deserializer" -> classOf[StringDeserializer], 36 | "group.id" -> config.getProperty("consumer.group"), 37 | "auto.offset.reset" -> "latest", 38 | "enable.auto.commit" -> (true: java.lang.Boolean) 39 | ) 40 | val stream = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](Array(config.getProperty("topic")), kafkaParam)) 41 | 42 | execute(stream) 43 | 44 | ssc.start() 45 | ssc.awaitTermination() 46 | } 47 | def execute(stream: InputDStream[ConsumerRecord[String, String]]) 48 | 49 | def loadConfig(configFile: String): Properties = { 50 | val properties = new Properties() 51 | properties.load(new BufferedInputStream(new FileInputStream(configFile))) 52 | properties 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/KafkaHdfs.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.spark.streaming.benchmark 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerRecord 4 | import org.apache.spark.streaming.dstream.InputDStream 5 | 6 | object KafkaHdfs extends AbstractStreaming { 7 | override def execute(stream: InputDStream[ConsumerRecord[String, String]]): Unit = { 8 | stream.map(kv => kv.key() + "," + System.currentTimeMillis()) 9 | .saveAsTextFiles(config.getProperty("filename.prefix") + config.getProperty("name") + "/result") 10 | } 11 | 12 | def main(args: Array[String]): Unit = { 13 | runJob(args) 14 | } 15 | } 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.spark.streaming.benchmark 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerRecord 4 | import org.apache.spark.streaming.dstream.InputDStream 5 | object WordCount extends AbstractStreaming { 6 | override def execute(stream: InputDStream[ConsumerRecord[String, String]]): Unit = { 7 | stream.flatMap(kv => { 8 | // the Kafka record key carries the event time; emit one 9 | // (word, (count, eventTime)) pair for every word in the record value 10 | val eventTime = kv.key().toLong 11 | val value = kv.value().split(" ").toList 12 | .map(v => (v, (1, eventTime))) 13 | value 14 | }).reduceByKey((x,y) =>{ 15 | val count = x._1 + y._1 16 | var eventTime = x._2 17 | if (x._2 < y._2) { 18 | eventTime = y._2 19 | } 20 | (count, eventTime) 21 | }).map(x => x._2._2 + "," + System.currentTimeMillis()).saveAsTextFiles(config.getProperty("filename.prefix") + config.getProperty("name") + "/result") 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/metrics/BasicMetrics.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.spark.streaming.benchmark.metrics 2 | 3 | import java.io.{BufferedInputStream, FileInputStream} 4 | import java.util.Properties 5 | 6 | class BasicMetrics extends Serializable { 7 | def getDuration(value: String, separator: String
= ",") : Option[Long] = { 8 | val values = value.split(separator) 9 | if (values.length != 2) { 10 | println("invalid result when parse start-time and finish time, invalid pattern should be start-time,end-time. content:" + value) 11 | return None 12 | } 13 | val duration = values(1).toLong - values(0).toLong 14 | Some(duration) 15 | } 16 | 17 | def loadConfig(configFile: String): Properties = { 18 | val properties = new Properties() 19 | properties.load(new BufferedInputStream(new FileInputStream(configFile))) 20 | properties 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/metrics/HdfsMetrics.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.spark.streaming.benchmark.metrics 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | object HdfsMetrics extends BasicMetrics { 6 | private final val AppName = "Metrics" 7 | def main(args: Array[String]): Unit = { 8 | if (args.length < 1) { 9 | System.err.println( 10 | """Usage: bin/spark-submit --class com.aliyun.emr.example.spark.streaming.benchmark.HdfsMetrics examples-1.1-shaded.jar 11 | | 12 | |Arguments: 13 | | 14 | | configFilePath config file path, like benchmark.properties 15 | | 16 | """.stripMargin) 17 | System.exit(1) 18 | } 19 | 20 | val config = loadConfig(args(0)) 21 | 22 | val conf = new SparkConf() 23 | conf.setAppName(AppName) 24 | 25 | var inputPath : String = null 26 | if (!config.getProperty("from.spark.streaming").toBoolean) { 27 | inputPath = config.getProperty("filename.prefix") + config.getProperty("benchmark.app.name") + "/*.txt" 28 | } else { 29 | inputPath = config.getProperty("filename.prefix") + config.getProperty("benchmark.app.name") + "/*/part-*" 30 | 31 | } 32 | val input = new SparkContext(conf).textFile(inputPath, config.getProperty("metric.numPartitions").toInt) 33 | val output = input.map(x => getDuration(x)) 34 | .filter(x => x.isDefined) 35 | .map(x => x.get) 36 | 37 | val count = output.count() 38 | println("total:%d".format(count)) 39 | output.histogram(Array(Double.MinValue, 0.0, 300.0, 500.0, 800.0, 900.0, 1000.0, 2000.0, 3000.0, Double.MaxValue)).foreach(x=> println(x.toDouble / count)) 40 | } 41 | } 42 | 43 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/emr/example/spark/streaming/benchmark/metrics/KafkaMetrics.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.emr.example.spark.streaming.benchmark.metrics 2 | 3 | import org.apache.kafka.common.serialization.StringDeserializer 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming._ 6 | import org.apache.spark.streaming.kafka010._ 7 | 8 | object KafkaMetrics extends BasicMetrics { 9 | def main(args: Array[String]): Unit = { 10 | 11 | val config = loadConfig(args(0)) 12 | 13 | val ssc = new StreamingContext(new SparkConf().setAppName("KafkaMetrics"), Seconds(config.getProperty("metric.duration.second").toLong)) 14 | val kafkaParam = Map[String, Object] ( 15 | "bootstrap.servers" -> config.getProperty("result.broker.list"), 16 | "key.deserializer" -> classOf[StringDeserializer], 17 | "value.deserializer" -> classOf[StringDeserializer], 18 | "group.id" -> config.getProperty("metric.group.id"), 19 | "auto.offset.reset" -> "earliest", 20 | "enable.auto.commit" -> (false: java.lang.Boolean) 21 | 22 | ) 23 | val messages = 
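// The direct stream created below re-reads the benchmark's result topic from the earliest
// offset and dumps the raw record values to text files under
// filename.prefix + benchmark.app.name on HDFS, so that HdfsMetrics (with
// from.spark.streaming=true) can later pick them up and compute the latency histogram.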
KafkaUtils.createDirectStream[String, String](ssc, 24 | LocationStrategies.PreferConsistent, 25 | ConsumerStrategies.Subscribe[String, String](Array(config.getProperty("result.topic")), kafkaParam)) 26 | 27 | val outputPath = config.getProperty("filename.prefix") + config.getProperty("benchmark.app.name") + "/kafka-" 28 | messages.map(_.value()).saveAsTextFiles(outputPath) 29 | 30 | ssc.start() 31 | ssc.awaitTermination() 32 | } 33 | } 34 | --------------------------------------------------------------------------------
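For reference, the benchmark and metrics jobs above are configured entirely through the properties file passed as their first argument (conf/benchmark.properties in this repository). The sketch below lists only the keys that AbstractStreaming, KafkaHdfs/WordCount, HdfsMetrics and KafkaMetrics actually read; every value is a placeholder to be adapted to the target cluster, not a recommended setting.

# sketch of the keys read by the benchmark code above; all values are placeholders
name=benchmark-wordcount
benchmark.app.name=benchmark-wordcount
filename.prefix=hdfs:///tmp/benchmark/
from.spark.streaming=true

# Kafka input
broker.list=emr-worker-1:9092
topic=benchmark-input
consumer.group=benchmark-consumer
partition.number=32
kafka.partition.receiver.factor=4

# resource sizing used to derive executor cores and memory
cluster.cores.total=64
cpu.core.factor=0.8
cluster.memory.per.node.mb=24576
cluster.worker.node.number=4
spark.executor.instances=8
spark.yarn.am.memory.mb=1024
spark.yarn.am.cores=1
spark.streaming.blockInterval=200
duration.ms=1000

# metrics jobs
metric.numPartitions=16
metric.duration.second=10
metric.group.id=benchmark-metrics
result.broker.list=emr-worker-1:9092
result.topic=benchmark-result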