├── .github └── workflows │ └── deploy-pages.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── .eslintrc.js ├── .gitignore ├── README.md ├── babel.config.js ├── docs │ ├── advanced │ │ ├── _category_.json │ │ ├── eni-migration.md │ │ ├── jindo-sdk.md │ │ ├── oss-access.md │ │ ├── vpc-access.md │ │ └── zeppelin-integration.md │ ├── config │ │ ├── _category_.json │ │ ├── configuration.md │ │ └── resource-autoscaling.md │ ├── development │ │ ├── _category_.json │ │ ├── ops │ │ │ ├── _category_.json │ │ │ ├── job-diagnosis.md │ │ │ ├── logging-guide.md │ │ │ └── streaming-monitoring.md │ │ ├── pyspark │ │ │ ├── _category_.json │ │ │ ├── pyspark-dependencies.md │ │ │ ├── pyspark-oss.md │ │ │ └── pyspark-thirdparty.md │ │ ├── sample-project.md │ │ └── spark-streaming │ │ │ ├── _category_.json │ │ │ ├── streaming-datahub.md │ │ │ ├── streaming-kafka.md │ │ │ └── streaming-loghub.md │ ├── faq │ │ ├── _category_.json │ │ ├── allocate-resource.md │ │ ├── class-conflict.md │ │ ├── github-images.md │ │ ├── network-access.md │ │ ├── oom-troubleshooting.md │ │ ├── pyspark-faq.md │ │ ├── read-transactional-table.md │ │ ├── ref-external-file.md │ │ ├── spark-24-notes.md │ │ ├── spark-31-notes.md │ │ └── spark-general.md │ ├── overview.md │ ├── quickstart │ │ ├── _category_.json │ │ ├── dataworks-integration.md │ │ └── runtime-mode │ │ │ ├── client-mode.md │ │ │ ├── index.md │ │ │ ├── local-mode.md │ │ │ └── yarn-cluster.md │ └── resources │ │ ├── ENI-1.png │ │ ├── ENI-2.png │ │ ├── ENI-3.png │ │ ├── ENI-4.png │ │ ├── ENI-5.png │ │ ├── OOM1.png │ │ ├── cloudmonitor-1.png │ │ ├── cloudmonitor-2.png │ │ ├── cloudmonitor-3.png │ │ ├── cupid_arch.png │ │ ├── datahub-1.jpg │ │ ├── datahub-2.jpg │ │ ├── datahub-3.jpg │ │ ├── dataworks-1.jpg │ │ ├── dataworks-2.jpg │ │ ├── dataworks-3.jpg │ │ ├── dataworks-4.jpg │ │ ├── dataworks-5.jpg │ │ ├── dataworks-6.jpg │ │ ├── dataworks-7.jpg │ │ ├── dingtalk-share.jpg │ │ ├── fuxisensor.png │ │ ├── fuxisensor2.png │ │ ├── idea-local-1.jpg │ │ ├── idea-local-2.jpg │ │ ├── idea-local-3.jpg │ │ ├── idea-local-4.jpg │ │ ├── idea-local-5.jpg │ │ ├── jobview-1.jpg │ │ ├── jobview-2.jpg │ │ ├── jobview-3.jpg │ │ ├── jobview-4.jpg │ │ ├── jobview-5.jpg │ │ ├── log4j2-stderr.jpg │ │ ├── log4j2-stdout.jpg │ │ ├── logview-1.jpg │ │ ├── logview-2.jpg │ │ ├── logview-3.jpg │ │ ├── logview-4.jpg │ │ ├── logview-5.jpg │ │ ├── oss-1.jpg │ │ ├── oss-2.jpg │ │ ├── oss-3.jpg │ │ ├── sparkui.png │ │ ├── vpc-access-1.jpg │ │ ├── vpc-access-2.jpg │ │ ├── vpc-access-3.jpg │ │ ├── 资源申请1.png │ │ └── 资源申请2.png ├── docusaurus.config.js ├── package.json ├── sidebars.js ├── src │ ├── components │ │ └── HomepageFeatures │ │ │ ├── index.js │ │ │ └── styles.module.css │ ├── css │ │ └── custom.css │ ├── locales.json │ └── pages │ │ ├── index.js │ │ └── index.module.css ├── static │ ├── .nojekyll │ └── img │ │ ├── docusaurus-social-card.jpg │ │ ├── docusaurus.png │ │ ├── favicon.ico │ │ ├── logo.svg │ │ ├── undraw_docusaurus_mountain.svg │ │ ├── undraw_docusaurus_react.svg │ │ └── undraw_docusaurus_tree.svg └── yarn.lock ├── hook └── pre-commit ├── spark-1.x ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── aliyun │ │ └── odps │ │ └── spark │ │ └── examples │ │ └── sparksql │ │ └── JavaSparkSQL.java │ ├── python │ └── spark_sql.py │ └── scala │ └── com │ └── aliyun │ └── odps │ └── spark │ └── examples │ ├── SparkPi.scala │ ├── WordCount.scala │ ├── graphx │ └── PageRank.scala │ ├── mllib │ └── KmeansModelSaveToOss.scala │ ├── oss │ └── SparkUnstructuredDataCompute.scala │ ├── sparksql │ 
└── SparkSQL.scala │ └── udf │ └── SparkUDF.scala ├── spark-2.x ├── libs │ └── jindofs-sdk-3.7.2.jar ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── aliyun │ │ └── odps │ │ └── spark │ │ └── examples │ │ ├── sparksql │ │ ├── DataConverters.java │ │ ├── JavaSparkSQL.java │ │ └── JavaSparkSQLTransform.java │ │ └── utils │ │ └── ConfigLog4j2.java │ ├── python │ ├── spark_oss.py │ └── spark_sql.py │ └── scala │ └── com │ └── aliyun │ └── odps │ └── spark │ └── examples │ ├── SparkPi.scala │ ├── WordCount.scala │ ├── graphx │ └── PageRank.scala │ ├── log4j2 │ ├── Logger.scala │ └── SimpleWordCount.scala │ ├── mllib │ └── KmeansModelSaveToOss.scala │ ├── oss │ ├── JindoFsDemo.scala │ └── SparkUnstructuredDataCompute.scala │ ├── sparksql │ └── SparkSQL.scala │ ├── streaming │ ├── common │ │ └── SparkSessionSingleton.scala │ ├── datahub │ │ ├── DataHub2OdpsDemo.scala │ │ └── DataHubStreamingDemo.scala │ ├── kafka │ │ ├── Kafka2OdpsDemo.scala │ │ └── KafkaStreamingDemo.scala │ └── loghub │ │ ├── LogHub2OdpsDemo.scala │ │ └── LogHubStreamingDemo.scala │ ├── structuredStreaming │ ├── datahub │ │ └── DatahubStructuredStreamingDemo.scala │ ├── kafka │ │ └── KafkaStructuredStreamingDemo.scala │ └── loghub │ │ └── LoghubStructuredStreamingDemo.scala │ └── zeppelin │ ├── ZeppelinServer.scala │ └── ZeppelinServerPublic.scala ├── spark-3.x ├── libs │ └── jindofs-sdk-3.7.2.jar ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── aliyun │ │ └── odps │ │ └── spark │ │ └── examples │ │ └── sparksql │ │ └── JavaSparkSQL.java │ ├── python │ ├── spark_oss.py │ └── spark_sql.py │ └── scala │ └── com │ └── aliyun │ └── odps │ └── spark │ └── examples │ ├── SparkPi.scala │ ├── WordCount.scala │ ├── graphx │ └── PageRank.scala │ ├── mllib │ └── KmeansModelSaveToOss.scala │ ├── oss │ ├── JindoFsDemo.scala │ └── SparkUnstructuredDataCompute.scala │ └── sparksql │ └── SparkSQL.scala └── spark-utils ├── libs ├── cupid-sdk-3.3.14.jar └── hadoop-yarn-client-3.3.12.jar ├── pom.xml └── src └── main └── java └── com └── aliyun └── odps └── spark ├── CupidApplicationMetaExample.java └── SparkLauncherTest.java /.github/workflows/deploy-pages.yml: -------------------------------------------------------------------------------- 1 | # Simple workflow for deploying static content to GitHub Pages 2 | name: Deploy static content to Pages 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: ["master"] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: write 17 | 18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
20 | concurrency: 21 | group: "pages" 22 | cancel-in-progress: false 23 | 24 | jobs: 25 | # Single deploy job since we're just deploying 26 | deploy: 27 | environment: 28 | name: github-pages 29 | url: ${{ steps.deployment.outputs.page_url }} 30 | runs-on: ubuntu-latest 31 | steps: 32 | - name: Checkout 33 | uses: actions/checkout@v4 34 | - name: Setup Pages 35 | uses: actions/configure-pages@v5 36 | - id: build-website # 构建website 37 | run: | 38 | cd docs 39 | npm install 40 | npm run build 41 | - name: Upload artifact 42 | uses: actions/upload-pages-artifact@v3 43 | with: 44 | path: 'docs/build' 45 | - name: Deploy to GitHub Pages 46 | id: deployment 47 | uses: actions/deploy-pages@v4 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.#* 3 | *#*# 4 | *.swp 5 | *.ipr 6 | *.iml 7 | *.iws 8 | *.pyc 9 | *.pyo 10 | .idea/ 11 | .idea_modules/ 12 | .settings 13 | .cache 14 | target/ 15 | .project 16 | .classpath 17 | .DS_Store 18 | metastore_db/ 19 | derby.log 20 | log4j.properties 21 | dependency-reduced-pom.xml 22 | 23 | # Dependencies 24 | /docs/node_modules 25 | 26 | # Production 27 | /docs/build 28 | 29 | # Generated files 30 | .docusaurus 31 | .cache-loader -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MaxCompute Spark 2 | 3 | 本文档帮助快速构建可以运行在MaxCompute Spark上的应用,并提供相关API的使用Demo. 4 | 参考[文档](https://github.com/aliyun/MaxCompute-Spark/wiki) 5 | -------------------------------------------------------------------------------- /docs/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "env": { 3 | "browser": true, 4 | "es2021": true 5 | }, 6 | "extends": "plugin:react/recommended", 7 | "overrides": [ 8 | { 9 | "env": { 10 | "node": true 11 | }, 12 | "files": [ 13 | ".eslintrc.{js,cjs}" 14 | ], 15 | "parserOptions": { 16 | "sourceType": "script" 17 | } 18 | } 19 | ], 20 | "parserOptions": { 21 | "ecmaVersion": "latest", 22 | "sourceType": "module" 23 | }, 24 | "plugins": [ 25 | "react" 26 | ], 27 | "rules": { 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | /docusaurus/node_modules 3 | 4 | # Production 5 | /build 6 | 7 | # Generated files 8 | .docusaurus 9 | .cache-loader 10 | 11 | # Misc 12 | .DS_Store 13 | .env.local 14 | .env.development.local 15 | .env.test.local 16 | .env.production.local 17 | 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | 22 | # IDE 23 | .idea 24 | .idea/* -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # ODPS SDK 文档目录 README 2 | 3 | ## 简介 4 | 5 | 此目录用于维护和构建 ODPS SDK 的官方文档,采用 Docusaurus 进行文档的编写与管理,并利用 GitHub Pages 功能将其托管于 [https://aliyun.github.io/aliyun-odps-java-sdk/](https://aliyun.github.io/aliyun-odps-java-sdk/) 。当前文档正处于积极开发阶段,并且仅提供中文版本。 6 | 7 | ## 文档构建工具 - Docusaurus 8 | 9 | Docusaurus 是一个强大的静态站点生成器,特别适合构建和维护开源项目的文档网站。它的中文文档可以在 [Docusaurus 中文官网](https://docusaurus.io/zh-CN/docs) 找到,这里详细介绍了如何开始、配置以及进阶使用 Docusaurus。 10 | 11 | ## 文档目录结构 12 | 13 | 文档源文件位于 `docs/docs` 目录下。请在此目录中添加、修改或删除文档内容。 14 | 15 | ## 
开发环境搭建与本地调试 16 | 17 | ### 初始化项目 18 | 19 | 在 docs 目录下,请确保运行以下命令以安装所有依赖: 20 | 21 | ```bash 22 | yarn install 23 | ``` 24 | 25 | ### 本地运行与预览 26 | 27 | 安装完依赖后,你可以通过以下命令启动本地开发服务器,进行实时预览和调试: 28 | 29 | ```bash 30 | yarn start 31 | ``` 32 | 33 | 这将自动打开浏览器并显示文档的本地预览版。 34 | 35 | ## 部署文档 36 | 37 | ### 当前部署流程 38 | 39 | 目前文档部署为手动过程,但考虑未来可能采用 GitHub Actions 自动化部署。 40 | 41 | #### 手动部署步骤 42 | 43 | 1. 确保你的文档是最新的,并且你已经测试过。 44 | 2. 在项目根目录下的 `docs` 目录中执行以下命令: 45 | 46 | ```bash 47 | USE_SSH=true yarn deploy 48 | ``` 49 | 50 | 该命令会使用 SSH 方式(如果配置了)将编译好的网站发布到 `gh-pages` 分支。此过程包括创建一个临时目录,复制编译后的文件至该目录,然后推送至 GitHub。 51 | 52 | #### 注意事项 53 | 54 | - 如果因 Git Hooks 或其他原因导致自动推送失败,你可以手动进入该临时目录,并执行 `git push` 来完成部署。 55 | - 确保你有正确的权限推送至 `gh-pages` 分支。 56 | 57 | --- 58 | 59 | 文档持续更新中,对于任何问题、建议或想要贡献的意愿,请随时开启 Issue 或发起 Pull Request。让我们共同完善 ODPS SDK 的文档资源! -------------------------------------------------------------------------------- /docs/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [require.resolve('@docusaurus/core/lib/babel/preset')], 3 | }; 4 | -------------------------------------------------------------------------------- /docs/docs/advanced/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "增强功能", 3 | "position": 5, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/advanced/eni-migration.md: -------------------------------------------------------------------------------- 1 | # ENI专线访问VPC迁移指南 2 | ## 迁移意义 3 | 相比原先访问Aliyun VPC内的用户实例的方式([文档](https://github.com/aliyun/MaxCompute-Spark/wiki/09.-VPC-Access文档说明)) , ENI专线方式更加稳定,性能更好,且具备公网访问能力。 4 | 5 | ## 注意事项 6 | (1)ENI专线可以打通一个VPC,如果用户作业需要同时访问多个VPC,则可以将 **已经通过ENI专线打通的Vpc** 与 **其他Vpc** 之间再做打通即可,详情咨询**阿里云专有网络相关的技术支持**。 7 | 8 | (2)Spark作业运行所在 MaxCompute Project 的主账号 Uid 与目标 vpc 所属的主账号Uid必须一致,否则作业运行时会提示以下报错信息:"You are not allowed to use this vpc - vpc owner and project owner must be the same person"。 9 | 10 | ## 迁移步骤 11 | ### 1.提供VPC相关的信息 12 | 用户需要提供: 13 | - Vpc所在region: 例如:上海,杭州,北京等 14 | - UserId: MaxCompute Project 所属的用户主账号 Id(同时也是目标Vpc所属的主账号Uid) 15 | - VpcId: 即用户需要通过ENI专线打通的目标Vpc Id 16 | - VSwitchId: 即目标Vpc中的一个交换机Id,可以在 Vpc 管理控制台界面创建/查看,若有多个从中选取一个即可 17 | - SecurityGroupId:即目标Vpc中的一个安全组Id。**用户需要在目标Vpc下新建一个安全组**,用于对 MaxCompute Spark 访问 VPC 服务时进行访问控制 18 | 19 | 其中,新建安全组的流程如下所示: 20 | 21 | a). 在目标 VPC 中,创建安全组: 22 | ![image1](../resources/ENI-1.png) 23 | 24 | b). 在目标 VPC 配置页面中,创建安全组(在页面下方): 25 | ![image1](../resources/ENI-2.png) 26 | 27 | c). 在安全组列表中,进入“创建安全组” 28 | ![image1](../resources/ENI-3.png) 29 | 30 | d). 输入1.“安全组名称” 2.“网络” (请选择 MaxCompute 连接的目标 VPC) 3.“安全组类型” 选择普通安全组 31 | ![image1](../resources/ENI-4.png) 32 | 33 | e). 
提供这个新建安全组的 id即可: 34 | ![image1](../resources/ENI-5.png) 35 | 36 | 37 | 38 | ### 2.ENI授权 39 | 对ENI进行授权,该步授权的目的在于允许 MaxCompute 在用户 VPC 内创建 ENI 网卡,以实现 MaxCompute 到用户 VPC 的连通。 40 | 用户只要使用主账号在**登录态**下点击以下链接进行授权即可: 41 | ``` 42 | https://ram.console.aliyun.com/#/role/authorize?request=%7B%22Requests%22%3A%7B%22request1%22%3A%7B%22RoleName%22%3A%22AliyunODPSRoleForENI%22%2C%22TemplateId%22%3A%22AliyunODPSRoleForENI%22%7D%7D%2C%22ReturnUrl%22%3A%22https%3A%2F%2Fram.console.aliyun.com%2Froles%22%2C%22Service%22%3A%22ODPS%22%7D 43 | ``` 44 | 45 | ### 3.等待MaxCompute平台官方人员为您完成专线开通 46 | 47 | ### 4.安全组规则配置 48 | 在ENI专线开通完成后,用户还需要在要访问的服务中增加相关安全规则,授权代表MaxCompute的那个安全组(即上述第1步中提供的安全组)能访问哪些服务的具体端口(比如9200, 31000等)。 49 | 50 | 例如:用户需要访问 阿里云 RDS,则需要在 RDS 中增加规则,允许第1步中创建的安全组访问。**如果用户需要访问的服务无法添加安全组,只能添加Ip**,那么需要将第一步中所使用的VSwitch网段都添加进来。 51 | 52 | ### 5.用户作业配置 53 | 运行spark作业,需要增加下面两个配置,就可以使用ENI专线连通目标VPC内的服务: 54 | ``` 55 | spark.hadoop.odps.cupid.eni.enable = true 56 | spark.hadoop.odps.cupid.eni.info = cn-beijing:vpc-********** 57 | 这个配置格式是region:vpcid,其中vpcid就是前面打通ENI专线的那个vpcid 58 | ``` 59 | 原先Spark作业中的VPC相关操作和配置**不再需要**: 60 | ``` 61 | spark.hadoop.odps.cupid.vpc.domain.list 62 | spark.hadoop.odps.cupid.smartnat.enable 63 | spark.hadoop.odps.cupid.pvtz.rolearn(访问自定义域名) 64 | spark.hadoop.odps.cupid.vpc.usepvtz(访问自定义域名) 65 | ``` -------------------------------------------------------------------------------- /docs/docs/advanced/jindo-sdk.md: -------------------------------------------------------------------------------- 1 | # Jindo sdk接入说明 2 | 参考[jindo-sdk的说明](https://github.com/aliyun/alibabacloud-jindodata/blob/master/docs/spark/jindosdk_on_spark.md),jindo-sdk接入有如下几个步骤。 3 | 4 | - spark默认使用hadoop-oss,增加特殊配置项才可以改为使用jindo-sdk。 5 | - 设置访问OSS需要的配置 6 | - 部署spark应用。 7 | 8 | > jindo-sdk 相比于hadoop-oss 使用更多的本地磁盘空间,如果出现*No space left on device*,可以调整`spark.hadoop.odps.cupid.disk.driver.device_size`增大本地磁盘空间。 9 | 10 | ## 引用jindo-sdk 11 | 12 | 修改spark-defaults.conf增加配置项,增加spark.hadoop.odps.cupid.resources配置。使用外部文件的方法参考[引用外部文件](https://github.com/aliyun/MaxCompute-Spark/wiki/06.-%E5%BC%95%E7%94%A8%E5%A4%96%E9%83%A8%E6%96%87%E4%BB%B6%E9%97%AE%E9%A2%98),样例配置如下: 13 | 14 | ```text 15 | spark.hadoop.odps.cupid.resources = public.jindofs-sdk-3.7.2.jar 16 | ``` 17 | 18 | ## 使用jindo-sdk 19 | 20 | 在`SparkConf`中设置`spark.hadoop.fs.AbstractFileSystem.oss.impl`及`spark.hadoop.fs.oss.impl`, 样例代码如下: 21 | 22 | ```scala 23 | val conf = new SparkConf() 24 | .setAppName("jindo-sdk-demo") 25 | .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 26 | .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 27 | ``` 28 | 29 | ## 配置OSS 30 | 31 | 涉及到的配置项有Oss Endpoint和Oss鉴权参数,参考[访问OSS](https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E)获得合法的Endpoint值和OSS鉴权参数值。OSS鉴权有两种方式AccessKey鉴权及云服务角色扮演,不同鉴权方式需要使用不同的鉴权参数。 32 | 33 | 34 | ## 使用AccessKey鉴权 35 | 36 | spark-defaults.conf无需变更, `SparkConf`中设置`spark.hadoop.fs.oss.endpoint`、`spark.hadoop.fs.oss.accessKeyId`、`spark.hadoop.fs.oss.accessKeySecret`。 37 | 38 | ```scala 39 | val conf = new SparkConf() 40 | .setAppName("jindo-sdk-demo") 41 | .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 42 | .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 43 | 44 | # 配置endpoint 45 | .set("spark.hadoop.fs.oss.endpoint", "endpoint-value") 46 | 47 | # 配置access-key鉴权参数 48 | .set("spark.hadoop.fs.oss.accessKeyId", "xxx") 49 | .set("spark.hadoop.fs.oss.accessKeySecret", "xxx") 50 | ``` 
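完成上述配置后,即可像读写普通文件系统一样读写 `oss://` 路径。下面给出一段示意代码(其中 bucket 名称与输出路径均为占位符,并非真实存在,请替换为实际值),演示基于上述 `conf` 创建 SparkSession 并对 OSS 做一次写入和读取验证:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object JindoOssReadWriteDemo {
  def main(args: Array[String]): Unit = {
    // 沿用上文:jindo-sdk 实现类 + AccessKey 鉴权配置
    val conf = new SparkConf()
      .setAppName("jindo-sdk-demo")
      .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS")
      .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem")
      .set("spark.hadoop.fs.oss.endpoint", "endpoint-value")
      .set("spark.hadoop.fs.oss.accessKeyId", "xxx")
      .set("spark.hadoop.fs.oss.accessKeySecret", "xxx")

    val spark = SparkSession.builder().config(conf).getOrCreate()

    // 写入 OSS,路径仅为示意,请替换为实际 bucket
    val outputPath = "oss://your-bucket/jindo-sdk-demo/output"
    spark.sparkContext.parallelize(1 to 100, 2).saveAsTextFile(outputPath)

    // 读回并做简单校验
    val count = spark.sparkContext.textFile(outputPath).count()
    println(s"read back $count lines from $outputPath")

    spark.stop()
  }
}
```

注意:`spark.hadoop.odps.cupid.resources = public.jindofs-sdk-3.7.2.jar` 这类资源引用仍需按前文说明写在 spark-defaults.conf 或 DataWorks 配置项中,不能放在代码里。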
51 | 52 | ## 使用云服务角色鉴权 53 | 云服务角色描述字符串格式为`acs:ram::12345678:role/${role-name}`,其中纯数字部分'12345678'是aliyun-uid,斜线后面的字符串是角色名称。这两个值需要配置在spark应用里。 54 | 55 | spark-defaults.conf需要添加`spark.hadoop.odps.cupid.http.server.enable`, 如下: 56 | ```text 57 | spark.hadoop.odps.cupid.http.server.enable = true 58 | ``` 59 | 60 | `SparkConf`中设置`spark.hadoop.odps.cupid.http.server.enable`、`spark.hadoop.fs.jfs.cache.oss.credentials.provider`、`spark.hadoop.aliyun.oss.provider.url`, 样例代码如下: 61 | 62 | ```scala 63 | val conf = new SparkConf() 64 | .setAppName("jindo-sdk-demo") 65 | .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 66 | .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 67 | 68 | # 配置endpoint 69 | .set("spark.hadoop.fs.oss.endpoint", "endpoint-value") 70 | 71 | # 配置云服务角色鉴权 72 | # ${aliyun-uid}是阿里云用户UID 73 | # ${role-name}是角色名称 74 | .set("spark.hadoop.fs.jfs.cache.oss.credentials.provider", "com.aliyun.emr.fs.auth.CustomCredentialsProvider") 75 | .set("spark.hadoop.aliyun.oss.provider.url", "http://localhost:10011/sts-token-info?user_id=${aliyun-uid}&role=${role-name}") 76 | ``` 77 | 78 | ## 打包上传 79 | 80 | ```shell 81 | ./bin/spark-submit --class xxx spark-app.jar 82 | ``` 83 | 84 | 85 | -------------------------------------------------------------------------------- /docs/docs/advanced/oss-access.md: -------------------------------------------------------------------------------- 1 | # Oss Access文档说明 2 | ## Oss Endpoint 配置 3 | 4 | 本机调试时使用对应Region的外网Endpoint,提交集群需替换为VPC内网Endpoint 5 | 6 | [Region和Endpoint对照表](https://help.aliyun.com/document_detail/31837.html?spm=a2c4g.11174283.6.585.5f2d7da2svYAQx#title-qvx-r3a-xr4) 7 | 8 | ## 网络白名单配置 9 | 1. 默认情况下无需设置可以直接访问; 10 | 2. 如发现无法访问,设置 spark.hadoop.odps.cupid.trusted.services.access.list=[yourbucketname].oss-xxxxxx-internal.aliyuncs.com(yarn-cluster模式使用,该配置项必须放在配置文件或是命令行提交参数里) 11 | 12 | ## Oss Id/Key访问方式 13 | 14 | ``` 15 | spark.hadoop.fs.oss.accessKeyId = xxxxxx 16 | spark.hadoop.fs.oss.accessKeySecret = xxxxxx 17 | spark.hadoop.fs.oss.endpoint = oss-xxxxxx-internal.aliyuncs.com 18 | ``` 19 | 20 | 21 | ## OssStsToken 授权访问方式 22 | > 一般来说,Spark提供直接通过OSS AccessId以及AccessKey的方式直接访问OSS资源,但是此方式需要明文将AccessId以及AccessKey写在用户代码或者用户配置中,不是一种安全的访问方式,本文档提供一种更加安全的方式访问OSS资源 23 | 24 | ## 授权MaxCompute以StsToken的方式访问OSS 25 | 26 | 点击下方的一键授权链接,可以把当前云账号的OSS资源通过StsToken的方式授权给MaxCompute的Project直接访问,前提是,该MaxCompute的ProjectOwner也是此云账号 27 | 28 | [一键授权](https://ram.console.aliyun.com/?spm=a2c4g.11186623.2.9.3bf06a064lrBYN#/role/authorize?request=%7B%22Requests%22:%20%7B%22request1%22:%20%7B%22RoleName%22:%20%22AliyunODPSDefaultRole%22,%20%22TemplateId%22:%20%22DefaultRole%22%7D%7D,%20%22ReturnUrl%22:%20%22https:%2F%2Fram.console.aliyun.com%2F%22,%20%22Service%22:%20%22ODPS%22%7D) 29 | 30 | ## 获取roleArn 31 | 32 | 通过上述的授权后,只需要在Spark配置里加上下面的配置就可以访问OSS资源 33 | 34 | ``` 35 | # 此配置表明Spark是通过StsToken去访问OSS资源 36 | spark.hadoop.fs.oss.credentials.provider=org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider 37 | 38 | # 此配置是上述一键授权后产生的一个roleArn_ID,授权后可以去访问控制->角色管理获取AliyunODPSDefaultRole Arn信息 39 | spark.hadoop.fs.oss.ststoken.roleArn=acs:ram::xxxxxxxxxxxxxxx:role/aliyunodpsdefaultrole 40 | 41 | # 此配置是OSS资源对应的VPC访问endpoint 不同的region可能会不同 42 | # 请访问 https://oss.console.aliyun.com/index 确认对应的 endpoint 43 | spark.hadoop.fs.oss.endpoint=oss-cn-hangzhou-internal.aliyuncs.com 44 | ``` 45 | 46 | 下面讲一下如何获取roleArn 47 | 48 | * 登录 https://ram.console.aliyun.com/ 49 | * 点击角色管理 50 | * 如果已经执行过一键授权,则会有一个**AliyunODPSDefaultRole**的记录存在 51 | * 
点击管理,页面会跳转到详情页,可以看到一个这样格式的ID `acs:ram::xxxxxxxxxxxxxxx:role/aliyunodpsdefaultrole` 52 | 53 | ![](../resources/oss-1.jpg) 54 | 55 | ![](../resources/oss-2.jpg) 56 | 57 | ![](../resources/oss-3.jpg) -------------------------------------------------------------------------------- /docs/docs/advanced/zeppelin-integration.md: -------------------------------------------------------------------------------- 1 | # MaxCompute Spark支持交互式Zeppelin 2 | 由于安全原因,用户无法触达生产集群的网络,所以MaxCompute Spark一直没有放开 `yarn-client`的支持,也就是`Spark-Shell`,`Spark-SQL`以及`PYSPARK`等交互式功能一直无法支持。Zeppelin on MaxCompute Spark可以在一定程度上支持用户交互式需求。这个模式相对于local模式更有力的地方是,这个模式其实是真实用了yarn-cluster模式运行着的,local模式仅仅能验证语法是否正确,而zeppelin模式能以分布式的方式提供交互式查询,这个对于那种需要关注性能结果的debugging是有帮助的。 3 | 4 | ## 步骤说明 5 | 6 | * 一键启动脚本: 7 | * spark 2.3 见 [spark-zeppelin-public.sh](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark-zeppelin-public/spark-zeppelin-public.sh) 8 | * spark 2.4 见 [spark-zeppelin-public-2.4.sh](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark-zeppelin-public/spark-zeppelin-public-2.4.sh) 9 | * 下载脚本到本地后,运行 `sh spark-zeppelin-public.sh` 后,会自动下载相关组件如下 10 | * spark-zeppelin-public.conf 11 | * spark-zeppelin-public.jar 12 | * spark-2.3.0-odps0.32.1.tar.gz 13 | * 第一次运行脚本会出现以下错误,这是因为默认的`spark-zeppelin-public.conf`并没有配置accessId,accessKey,projectName 14 | 15 | ``` 16 | linxuewei:spark-zeppelin-public linxuewei$ sh spark-zeppelin-public.sh 17 | working dir: /Users/linxuewei/Desktop/spark-zeppelin-public 18 | download spark-zeppelin-public.conf 19 | download spark-zeppelin-public.jar 20 | download spark-2.3.0-odps0.32.1.tar.gz 21 | extract spark-2.3.0-odps0.32.1.tar.gz 22 | export SPARK_HOME 23 | spark-zeppelin-public.conf checking 24 | TBD count is 3, plz check config make sure id key project is written! 25 | config check failed, plz set id key project in spark-zeppelin-public.conf 26 | ``` 27 | 28 | * 注意Spark 2.4.5需要添加 `spark.sql.catalogImplementation = hive 和 spark.sql.sources.default = hive` 之后再运行 `sh spark-zeppelin-public.sh` 29 | * Spark 2.4.5添加 `spark.hadoop.odps.spark.libs.public.enable=true`和`spark.hadoop.odps.spark.version=spark-2.4.5-odps0.33.1` 这两个参数可以加速包上传速度 30 | 31 | * 正常配置 `spark-zeppelin-public.conf` 之后再运行 `sh spark-zeppelin-public.sh` 32 | 33 | ``` 34 | linxuewei:spark-zeppelin-public linxuewei$ sh spark-zeppelin-public.sh 35 | working dir: /Users/linxuewei/Desktop/spark-zeppelin-public 36 | export SPARK_HOME 37 | spark-zeppelin-public.conf checking 38 | config check passed, start spark-submit 39 | 40 | 就会启动一个MaxCompute Spark作业,等待作业执行结束之后,可以回溯日志,找到logview 41 | 42 | http://logview.odps.aliyun.com/logview/?h=http://service.cn.maxcompute.aliyun.com/api&p=zky_test&i=20190710044052214gy6kc292&token=eXN6eFlsNmQzOFV4dUIzVEVndm9KQUtVSlVNPSxPRFBTX09CTzpwNF8yNDcwNjM5MjQ1NDg0NDc5NzksMTU2Mjk5Mjg1Mix7IlN0YXRlbWVudCI6W3siQWN0aW9uIjpbIm9kcHM6UmVhZCJdLCJFZmZlY3QiOiJBbGxvdyIsIlJlc291cmNlIjpbImFjczpvZHBzOio6cHJvamVjdHMvemt5X3Rlc3QvaW5zdGFuY2VzLzIwMTkwNzEwMDQ0MDUyMjE0Z3k2a2MyOTIiXX1dLCJWZXJzaW9uIjoiMSJ9 43 | ``` 44 | 45 | * 打开 `logview` 点击 `master-0` 点击 `StdOut` 46 | 47 | ![image.png](https://ata2-img.cn-hangzhou.oss-pub.aliyun-inc.com/425b961b2b3074622b41068e9a78409f.png) 48 | 49 | ``` 50 | # 日志中的这个url,就是zeppelin server的地址了 51 | # 直接复制粘贴到浏览器上即可访问,弹出的url会需要云账号的登录 52 | Please visit the following url for zeppelin interaction. 
53 | http://20190710044052214gy6kc292-zeppelin.open.maxcompute.aliyun.com 54 | Log dir doesn't exist, create /worker/zeppelin_logs/ 55 | Pid dir doesn't exist, create /worker/zeppelin_pids/ 56 | Zeppelin start [ OK ] 57 | ``` 58 | 59 | * 打开 `zeppelin url` 打开 `Examples` Notebook,有时候页面会显示endpoint not exist的日志,这是因为zeppelin还没有启动完毕的情况,稍等片刻就可以 60 | 61 | ![image.png](https://ata2-img.cn-hangzhou.oss-pub.aliyun-inc.com/9f3c2496ba6f0d1cb827e5a6b81ee44a.png) 62 | 63 | * 如果页面弹出一个 `interpreter binding`的页面,直接点击Save即可,然后再点击ToolBar上的运行所有按钮即可执行Notebook上的代码的执行 64 | 65 | ![image.png](https://ata2-img.cn-hangzhou.oss-pub.aliyun-inc.com/b9b8404f9bcd49e9464074e9860c2272.png) 66 | 67 | * 从 examples 样例中我们可以看到,NoteBook支持三种语法 68 | * 以 `%spark` 开头表示 scala 执行器 如果不写就默认是这个模式 69 | * 以 `%sql` 开头表示 spark-sql 执行器,默认用ODPS External Catalog 70 | * 以 `pyspark` 开头表示 pyspark 执行器,默认用我们打包好的 python2.7 71 | 72 | ## 资源释放 73 | 74 | 本质上Zeppelin Server on MaxCompute Spark还是一个Spark作业,默认这个作业会存活三天,如果你想手动关闭这个作业的话,就请用odpscmd,用`kill ;` 命令来停止作业释放资源吧。 -------------------------------------------------------------------------------- /docs/docs/config/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "配置参考", 3 | "position": 4, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/config/configuration.md: -------------------------------------------------------------------------------- 1 | # Spark配置详解 2 | ## MaxCompute账号相关配置 3 | 4 | * `spark.hadoop.odps.project.name` 5 | + **默认值** `无` 6 | + **配置说明** `MaxCompute项目名称` 7 | * `spark.hadoop.odps.access.id` 8 | + **默认值** `无` 9 | + **配置说明** `MaxCompute项目accessId` 10 | * `spark.hadoop.odps.access.key` 11 | + **默认值** `无` 12 | + **配置说明** `MaxCompute项目accessKey` 13 | * `spark.hadoop.odps.access.security.token` 14 | + **默认值** `无` 15 | + **配置说明** `MaxCompute项目STS Token` 16 | * `spark.hadoop.odps.end.point` 17 | + **建议值** 可以采用中国公共云通用外网endpoint:`http://service.cn.maxcompute.aliyun.com/api`,也可以采用各自region独享的endpoint,参考文档[外网Endpoint](https://help.aliyun.com/document_detail/34951.html?spm=5176.11065259.1996646101.searchclickresult.58c77a0dlXCR54) 18 | + **配置说明** `MaxCompute项目endPoint` 19 | 20 | ## Spark版本配置 21 | * `spark.hadoop.odps.spark.version` 22 | + **默认值** `spark-2.3.0-odps0.33.0,如果使用spark-2.4.5, 请将该参数设置为spark-2.4.5-odps0.34.0,如果使用spark-3.1.1, 请将该参数设置为spark-3.1.1-odps0.34.1` 23 | + **配置说明** `该值指定了提交spark任务所用的spark版本` 24 | + **注意** `可以通过该配置切换到spark-2.4.5/spark-3.1.1` 25 | 26 | * `spark.hadoop.odps.spark.libs.public.enable` 27 | + **默认值** `false` 28 | + **配置说明** `设置为true之后,可以免上传jars,直接从服务端拉取,加速上传` 29 | + **注意** `需要同时配置spark.hadoop.odps.spark.version指定版本后才能生效` 30 | 31 | ## 资源申请相关配置 32 | 33 | * `spark.executor.instances` 34 | + **默认值** `1` 35 | + **配置说明** `executor worker个数` 36 | * `spark.executor.cores` 37 | + **默认值** `1` 38 | + **配置说明** `executor worker核数` 39 | * `spark.executor.memory` 40 | + **默认值** `2g` 41 | + **配置说明** `executor worker内存` 42 | * `spark.driver.cores` 43 | + **默认值** `1` 44 | + **配置说明** `driver核数` 45 | * `spark.driver.memory` 46 | + **默认值** `2g` 47 | + **配置说明** `driver内存` 48 | * `spark.master` 49 | + **默认值** `yarn-cluster` 50 | + **配置说明** `作业提交运行方式,目前支持yarn-cluster以及local[N]` 51 | * `spark.yarn.executor.memoryOverhead` 52 | + **默认值** `参考社区配置` 53 | + **配置说明** `当堆外内存使用比较多时建议提高此值避免整体内存超出被Kill` 54 | + **注意** `单个executor的内存总量是spark.executor.memory+spark.yarn.executor.memoryOverhead` 55 | * `spark.yarn.driver.memoryOverhead` 56 | + **默认值** 
`参考社区配置` 57 | + **配置说明** `当堆外内存使用比较多时建议提高此值避免整体内存超出被Kill` 58 | + **注意** `driver的内存总量是spark.driver.memory+spark.yarn.driver.memoryOverhead` 59 | * `spark.hadoop.odps.cupid.disk.driver.device_size` 60 | + **默认值** `20g` 61 | + **配置说明** `本地网盘大小,当出现No space left on device时可适当调大该值,最大支持100g` 62 | + **注意** `注意:必须配置在spark-conf文件或者dataworks的配置项中,不能配置在代码中` 63 | 64 | ## MaxCompute数据互通配置 65 | 66 | * `spark.sql.catalogImplementation` 67 | + **配置说明** `spark 2.3.0 需要设置为odps,spark 2.4.5及以上的版本需要设置hive` 68 | * `spark.hadoop.odps.cupid.resources` 69 | + **配置说明** `该配置项指定了任务运行所需要的`[Maxcompute资源](https://help.aliyun.com/document_detail/27831.html?spm=5176.11065259.1996646101.searchclickresult.d55650ea0QU1qd&aly_as=45TiiTdO2),`格式为.,可指定多个,逗号分隔` 70 | + **配置示例** spark.hadoop.odps.cupid.resources=public.python-python-2.7-ucs4.zip,public.myjar.jar 71 | + **使用说明** `指定的资源将被下载到driver和executor的当前工作目录,资源下载到工作目录后默认的名字是.` 72 | + **文件重命名** `在配置时通过.:进行重命名` 73 | + **重命名示例** spark.hadoop.odps.cupid.resources=public.myjar.jar:myjar.jar 74 | + **注意** `该配置项必须要配置在spark-default.conf中或dataworks的配置项中才能生效,而不能写在代码中` 75 | * `spark.hadoop.odps.cupid.vectorization.enable` 76 | + **建议值** `true` 77 | + **配置说明** `当设置为true时,会应用批读写优化,读写数据性能显著提升。 78 | * `spark.hadoop.odps.input.split.size` 79 | + **默认值** `256` 80 | + **配置说明** `该配置可以用来调节读Maxcompute表的并发度,默认每个分区为256MB 81 | 82 | 83 | ## OSS相关配置 84 | 85 | * `spark.hadoop.fs.oss.endpoint` 86 | + **建议值** `无` 87 | + **配置说明** `阿里云OSS控制台上可查看Bucket对应的endpoint` 88 | * `spark.hadoop.fs.oss.ststoken.roleArn` 89 | + **建议值** `无` 90 | + **配置说明** `StsToken授权方式` 91 | * `spark.hadoop.fs.oss.credentials.provider` 92 | + **建议值** `org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider` 93 | + **配置说明** `StsToken授权方式` 94 | 95 | [OSS StsToken授权步骤](https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E) 96 | 97 | ## VPC服务访问相关配置 98 | 99 | * `spark.hadoop.odps.cupid.vpc.domain.list` 100 | + **建议值** `无` 101 | + **配置说明** `参见以下JSON格式 配置值为压缩去除空格后的字符串` 102 | + **压缩为字符串网址** http://www.bejson.com/ 103 | 104 | ``` 105 | See. 
http://www.bejson.com/ 106 | 粘贴VPC Domain List内容并选择压缩得到压缩于一行的字符串作为spark.hadoop.odps.cupid.vpc.domain.list的配置值 107 | { 108 | "regionId": "cn-beijing", 109 | "vpcs": [ 110 | { 111 | "vpcId": "vpc-2zeaeq21mb1dmkqh0exox", 112 | "zones": [ 113 | { 114 | "urls": [ 115 | { 116 | "domain": "zky-test", 117 | "port": 9092 118 | } 119 | ], 120 | "zoneId": "9b7ce89c6a6090e114e0f7c415ed9fef" 121 | } 122 | ] 123 | } 124 | ] 125 | } 126 | ``` 127 | * `spark.hadoop.odps.cupid.pvtz.rolearn` 128 | + **建议值** `acs:ram::********:role/aliyunodpsdefaultrole` 129 | + **配置说明** `当spark作业需要访问云上其他VPC域内服务,比如redis、mysql、kafka等等需要配置该参数` 130 | * `spark.hadoop.odps.cupid.smartnat.enable` 131 | + **配置说明** `北京和上海region需要配置该参数为true` 132 | 133 | [VPC访问文档说明](https://github.com/aliyun/MaxCompute-Spark/wiki/09.-VPC-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E) 134 | 135 | ## 流式作业相关配置 136 | 137 | * `spark.hadoop.odps.cupid.engine.running.type` 138 | + **建议值** `longtime` 139 | + **配置说明** `普通作业3天没跑完就会被强制回收,流式作业需要设置此值` 140 | * `spark.hadoop.odps.cupid.job.capability.duration.hours` 141 | + **建议值** `8640` 142 | + **配置说明** `流式作业权限文件expired时间,单位小时` 143 | * `spark.hadoop.odps.moye.trackurl.dutation` 144 | + **建议值** `8640` 145 | + **配置说明** `流式作业jobview expired时间,单位小时` 146 | * `spark.yarn.maxAppAttempts` 147 | + **建议值** `5` 148 | + **配置说明** `流式作业failover次数限制` 149 | * `spark.yarn.am.maxAttemptValidityInterval` 150 | + **建议值** `1h` 151 | + **配置说明** `流式作业failover次数限制窗口验证` 152 | 153 | ## 灰度相关配置 154 | 155 | * `spark.hadoop.odps.task.major.version` 156 | + **建议值** `default` 157 | 158 | ## 隔离相关配置 159 | 160 | * `spark.hadoop.odps.cupid.container.image.enable` 161 | + **建议值** `true` 162 | + **配置说明** `安全隔离相关配置请保持默认值,专有云需要去掉该配置` 163 | * `spark.hadoop.odps.cupid.container.vm.engine.type` 164 | + **建议值** `hyper` 165 | + **配置说明** `安全隔离相关配置请保持默认值,专有云需要去掉该配置` 166 | -------------------------------------------------------------------------------- /docs/docs/config/resource-autoscaling.md: -------------------------------------------------------------------------------- 1 | # 动态资源伸缩问题 2 | ## Spark 2.4.5/3.1.1 支持动态资源伸缩 3 | * 首先需要切换到spark-2.4.5-odps0.34.0版本 4 | 5 | ``` 6 | * 从Dataworks提交任务,需要添加配置:spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0,从而切换到新的spark版本 7 | 8 | * 从本地提交任务,需要添加以下两个配置: 9 | spark.hadoop.odps.spark.libs.public.enable=true 10 | spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0 11 | 12 | * spark-3.1.1采用客户端提交可以直接使用动态资源伸缩功能 13 | ``` 14 | 15 | * 此外需要添加以下spark参数: 16 | ``` 17 | spark.dynamicAllocation.shuffleTracking.enabled = true (默认 false) 18 | spark.dynamicAllocation.shuffleTracking.timeout = XXXs (默认 Long.MaxValue MILLISECONDS) 19 | spark.dynamicAllocation.enabled = true 20 | 21 | 参考文档:https://spark.apache.org/docs/3.0.0/configuration.html#dynamic-allocation 22 | ``` -------------------------------------------------------------------------------- /docs/docs/development/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "开发指南", 3 | "position": 3, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/development/ops/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "运维监控", 3 | "position": 2, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/development/ops/job-diagnosis.md: 
-------------------------------------------------------------------------------- 1 | # 作业诊断 2 | 快速导航 3 | + [使用Logview工具诊断作业](#1) 4 | + [使用Spark-Web-UI诊断作业](#2) 5 | + [寻求开发介入帮助](#3) 6 | ------------------ 7 | 8 | 提交作业后,一般来说客户端会输出logview和jobview这两个Url可以帮助作业诊断,这两个Url无论是问题自查、结果查看还是寻求开发人员帮助都是十分重要的手段。此外,用户还可以通过[Cupid Console](https://developer.aliyun.com/article/745038?spm=a2c6h.12873581.0.0.ebaa95b08Drzws&groupCode=maxcompute)来获取当前project正在运行的logview和jobview信息。 9 | 下面给出一个Demo日志作为演示。 10 | 11 | ``` 12 | 19/06/11 11:56:41 INFO YarnClientImplUtil: logview url: http://logview.odps.aliyun.com/logview/?h=http://service.cn.maxcompute.aliyun.com/api&p=lightning&i=2019061103564120glypjv21&token=dUxYakFMRUFrc25oNjg5TDk1azhRZTlYVldvPSxPRFBTX09CTzpwNF8yNDcwNjM5MjQ1NDg0NDc5NzksMTU2MDQ4NDYwMSx7IlN0YXRlbWVudCI6W3siQWN0aW9uIjpbIm9kcHM6UmVhZCJdLCJFZmZlY3QiOiJBbGxvdyIsIlJlc291cmNlIjpbImFjczpvZHBzOio6cHJvamVjdHMvbGlnaHRuaW5nL2luc3RhbmNlcy8yMDE5MDYxMTAzNTY0MTIwZ2x5cGp2MjEiXX1dLCJWZXJzaW9uIjoiMSJ9 13 | 19/06/11 11:56:41 INFO CupidUtil: ready!!! 14 | 19/06/11 11:57:08 INFO YarnClientImpl: Submitted applicationType SPARK application application_1560225394361_1217133882 to ResourceManager at instanceId 2019061103564120glypjv21 15 | 19/06/11 11:57:09 INFO SubmitJobUtil: submitting CupidTask with ALIYUN type, operator: GetApplicationMeta 16 | 19/06/11 11:57:09 INFO CupidUtil: getApplicationMeta 17 | 19/06/11 11:57:11 INFO Client: Application report for application_1560225394361_1217133882 (state: RUNNING) 18 | 19/06/11 11:57:11 INFO Client: 19 | client token: N/A 20 | diagnostics: diagnostics 21 | ApplicationMaster host: 11.222.166.90 22 | ApplicationMaster RPC port: 38965 23 | queue: queue 24 | start time: 1560225401092 25 | final status: UNDEFINED 26 | tracking URL: http://jobview.odps.aliyun.com/proxyview/jobview/?h=http://service.cn.maxcompute.aliyun-inc.com/api&p=lightning&i=2019061103564120glypjv21&t=spark&id=application_1560225394361_1217133882&metaname=2019061103564120glypjv21&token=MHJISzg3OVlKZWJTZ3VCSllzUEMzVnF5KzNJPSxPRFBTX09CTzpwNF8yNDcwNjM5MjQ1NDg0NDc5NzksMTU2MDQ4NDYzMSx7IlN0YXRlbWVudCI6W3siQWN0aW9uIjpbIm9kcHM6UmVhZCJdLCJFZmZlY3QiOiJBbGxvdyIsIlJlc291cmNlIjpbImFjczpvZHBzOio6cHJvamVjdHMvbGlnaHRuaW5nL2luc3RhbmNlcy8yMDE5MDYxMTAzNTY0MTIwZ2x5cGp2MjEiXX1dLCJWZXJzaW9uIjoiMSJ9 27 | user: user 28 | ``` 29 | ## JobView 30 | 以`jobview.odps.aliyun.com`开头的Url,我们统称为Jobview,Jobview是上一代Spark UI和HistoryServer,使用时会在一些稳定性问题,因此**不再推荐**用户使用,可以直接在logview中找到Spark UI和History Server的链接来排查问题,见下文。 31 | 32 |

使用Logview工具诊断作业

33 | 34 | 以`logview.odps.aliyun.com`开头的Url,我们统称为Logview,这个MaxCompute自研的分布式作业Tracing工具,通过这个工具我们可以: 35 | 36 | * 获取该作业状态 37 | * 获取该作业各节点起停调度信息 38 | * 获取该作业各节点的标准输入输出日志 (一般Spark结果输出建议打印到StdOut,Spark的log4j日志则默认输出到StdErr) 39 | + 通过log插件打印的日志会显示在StdErr中 40 | + 通过System.out.println()输出到控制台的日志会显示在StdOut中 41 | * Logview的时效性一般是3~5天,甚至其中是StdOut、StdErr很可能会因为本地磁盘满了被清理掉 42 | 43 | * **Logview 2.0包含Sensor功能,可以查看master以及每个worker在运行时的内存和cpu使用情况** 44 | 45 | * 打开Logview,可以在浏览器看到以下页面,有作业完成状态以及时间等信息 46 | ![image1](../../resources/logview-1.jpg) 47 | 48 | * 点击Detail按钮可以查看作业进一步详情,master-0代表Spark Driver所在节点 49 | ![image2](../../resources/logview-2.jpg) 50 | 51 | * 点击master-0按钮并选择All Tab可以看到Spark Driver节点的具体信息 52 | ![image3](../../resources/logview-3.jpg) 53 | 54 | * 点击StdOut按钮可以看到节点结果输出 55 | ![image4](../../resources/logview-4.jpg) 56 | 57 | * 点击StdErr按钮可以看到节点log4j日志 58 | ![image5](../../resources/logview-5.jpg) 59 | 60 |
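结合上面对 StdOut/StdErr 的说明,下面给出一段示意代码(仅用于说明输出去向,类名与输出内容均为虚构):`println` 的结果会出现在 Logview 对应节点的 StdOut 中,而通过日志接口打印的内容会出现在 StdErr 中。

```scala
import org.slf4j.LoggerFactory

object LogviewOutputDemo {
  private val log = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    // 输出到标准输出,最终可在 Logview 对应节点的 StdOut 中看到
    println("result rows: 42")

    // 通过日志接口打印,最终可在 Logview 对应节点的 StdErr 中看到
    log.info("this message goes to StdErr")
  }
}
```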

使用Spark-Web-UI诊断作业

61 | 62 | ## Spark UI和History Server 63 | 用户可以在logivew的summary模块下找到Spark UI链接和History Server链接: 64 | ![image6](../../resources/sparkui.png) 65 | 66 | * 注意 67 | ``` 68 | 1. Spark UI仅在作业运行时才能打开 69 | 70 | 2. History Server需要等待Driver把Event传递到HistoryServer进行渲染,会有一定延迟 71 | ``` 72 | * 打开该链接,可以在浏览器看到 Spark-Web-UI 73 | ![image7](../../resources/jobview-1.jpg) 74 | 75 | * 点击environment tab确认设置的spark参数是否全部正确 76 | ![image8](../../resources/jobview-2.jpg) 77 | 78 | * 点击executors tab重点关注是否有`Dead节点`,以及Driver的StdOut和StdErr 79 | ![image9](../../resources/jobview-3.jpg) 80 | 81 | * 点击StdOut按钮可以看到节点结果输出 82 | ![image10](../../resources/jobview-4.jpg) 83 | 84 | * 点击StdErr按钮可以看到节点log4j日志 85 | ![image11](../../resources/jobview-5.jpg) 86 | 87 |

寻求开发介入帮助

88 | 89 | * 先根据[常见问题](https://github.com/aliyun/MaxCompute-Spark/wiki/06.-Spark%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98)文档做一下初步排查 90 | * 提供logview和jobview,一般logview是一定有的,jobview如果提交的时候马上报错并不会产生 91 | * 钉钉扫码加入MaxCompute Spark开发群支持 -------------------------------------------------------------------------------- /docs/docs/development/ops/logging-guide.md: -------------------------------------------------------------------------------- 1 | # MaxCompute Spark日志使用指南 2 | ## 背景 3 | Spark集群运行环境中日志使用Log4j2接口,因此原先使用了Log4j1相关接口的代码需要一定的修改。 4 | 5 | ## 适用情况 6 | - 有需要在代码中打入自己的日志,同时与Spark产生的系统日志进行区分的。 7 | - 原代码中涉及显式调用Log4j1接口进行日志配置的 8 | 9 | ## 升级步骤 10 | 参考:[log4j2 example](https://github.com/aliyun/MaxCompute-Spark/tree/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/log4j2) 11 | 具体来说: 12 | 1. 引入log4j2的包 13 | ```xml 14 | 15 | org.apache.logging.log4j 16 | log4j-core 17 | 2.12.1 18 | provided 19 | 20 | ``` 21 | 保持scope为provided和version为2.12.1,因为集群环境中有此依赖,避免出现意外的类/方法冲突问题。 22 | 23 | 2. 参考[示例](https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/java/com/aliyun/odps/spark/examples/utils/ConfigLog4j2.java)中使用log4j2的接口进行日志配置,即向集群运行环境中的log4j2配置中加入自定义的Appender和LoggerConfig 24 | 25 | 3. 使用之前调用配置方法,如下 26 | ```java 27 | ConfigLog4j2.initPackageLogger("your_package_name") 28 | ``` 29 | 即可在需要的地方使用,如下 30 | ```scala 31 | val log: log4j.Logger = LogManager.getLogger(your_class) 32 | ``` 33 | 34 | ## 效果展示 35 | 运行示例中的[SimpleWordCount](https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/log4j2/SimpleWordCount.scala),自己在代码中打的日志收在logview->detail->master->stdout中,如下 36 | ![stdout](../../resources/log4j2-stdout.jpg) 37 | 38 | logview->detail->master->stderr中仍是spark的系统日志,如下 39 | ![stderr](../../resources/log4j2-stderr.jpg) 40 | 41 | ## FAQ 42 | 1. 此示例需要开发者自行集成到代码中,涉及到的日志pattern等可自行修改。 43 | 2. 
initPackageLogger方法中传入的包名,也就是用户代码所处的包名,需要避开以下: 44 | > * com.aliyun.odps 45 | > * com.aliyun.odps.fs 46 | > * org.apache.hadoop 47 | > * org.apache.kafka 48 | > * org.apache.zookeeper 49 | > * org.apache.spark 50 | > * org.apache.flink 51 | > * akka 52 | > * com.aliyun.odps.subprocess -------------------------------------------------------------------------------- /docs/docs/development/ops/streaming-monitoring.md: -------------------------------------------------------------------------------- 1 | # Streaming作业监控报警 2 | Spark Streaming作业的特点是长时间运行,对于数据处理速度,调度延迟等有着较高的要求。因而作业在生产环境运行时,需要关注一些作业的性能、延迟等相关的指标。 3 | 4 | 目前,MaxCompute的Spark Streaming作业的监控报警提供了一个对接云监控平台的插件,可以将作业关键的指标信息推送至[云监控平台](https://www.aliyun.com/product/jiankong),进而可以进行指标查看以及配置监控报警信息,现在支持以下5种类型的监控报警。 5 | 6 | * processingDelay :处理延迟 7 | * schedulingDelay :调度延迟 8 | * totalDelay:总延迟 9 | * totalProcessedRecords:总共处理的记录条数 10 | * waitingBatches:等待执行的Batch数 11 | > 下面将对如何使用Spark Streaming的云监控插件做一个介绍。 12 | 13 | (1)spark-cloudmonitor-sink这个插件是基于[Spark Metrics System](https://spark.apache.org/docs/latest/monitoring.html)的接口开发的一个外置插件。该插件的引入不影响Spark Streaming作业的开发,这里假定我们已经开发调试好了一个Spark Streaming作业,提交方式如下: 14 | ``` 15 | bin/spark-submit --class com.aliyun.odps.spark.examples.streaming.LogHubStreamingDemo --master yarn-cluster --num-executors 1 --driver-memory 4g --executor-memory 4g --executor-cores 1 spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 16 | ``` 17 | 18 | (2) 使用spark-cloudmonitor-sink云监控插件,首先需要下载插件的jar包。 19 | ``` 20 | wget http://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark%2Fspark-cloudmonitor-sink-1.0-SNAPSHOT-shaded.jar -O spark-cloudmonitor-sink-1.0-SNAPSHOT-shaded.jar 21 | ``` 22 | 23 | (3)配置云监控相关的账号信息,需要确保云监控平台已经创建好了一个应用分组。配置文件放置在conf/metrics.properties,配置信息如下 24 | ``` 25 | *.sink.cloudmonitor.period=5 26 | 27 | *.sink.cloudmonitor.class=org.apache.spark.metrics.sink.CloudMonitorSink 28 | *.sink.cloudmonitor.endpoint=http://xxxxxx 29 | *.sink.cloudmonitor.accessid=xxxxxx 30 | *.sink.cloudmonitor.accesskey=xxxxxx 31 | *.sink.cloudmonitor.groupid=xxxxxx 32 | ``` 33 | 其中sink.cloudmonitor.endpoint,sink.cloudmonitor.accessid,sink.cloudmonitor.accesskey,sink.cloudmonitor.groupid分别是云监控的endpoint,accessid,accesskey以及metrics需要推送到的应用分组id。 34 | 另外建议在conf/spark-defaults.conf里面增加spark.metrics.namespace: xxxxx 指定为一个有意义的名字标识,否则默认是ApplicationID, 每次作业提交都会不一样。 35 | 36 | (4)带上云监控插件提交Spark Streaming作业,提交作业的命令如下: 37 | ``` 38 | bin/spark-submit --class com.aliyun.odps.spark.examples.streaming.LogHubStreamingDemo --master yarn-cluster --num-executors 1 --driver-memory 4g --executor-memory 4g --executor-cores 1 --jars spark-cloudmonitor-sink-0-SNAPSHOT-shaded.jar --files conf/metrics.properties spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 39 | ``` 40 | 和原来不带云监控插件的提交命令对比,可以看到只要新增插件的jar包和conf/metrics.properties配置即可。 41 | 42 | (5)在云监控平台查看指标数据,以及配置报警。 43 | 当作业正常跑起来后,作业的指标数据会不断推送到云监控平台,在云监控的控制台,在自定义监控下面可以看到相应的Spark Streaming作业的监控数据,并且可以针对其中的指标添加相关的报警规则。 44 | ![image1](../../resources/cloudmonitor-1.png) 45 | 46 | ![image2](../../resources/cloudmonitor-2.png) 47 | 48 | ![image3](../../resources/cloudmonitor-3.png) -------------------------------------------------------------------------------- /docs/docs/development/pyspark/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "pyspark", 3 | "position": 3, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/development/pyspark/pyspark-oss.md: 
-------------------------------------------------------------------------------- 1 | # PySpark 访问 Oss 2 | ## 参数配置 3 | - 首先需要参考[文档](https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E)配置ossid和key: 4 | ``` 5 | spark.hadoop.fs.oss.accessKeyId = xxxxxx 6 | spark.hadoop.fs.oss.accessKeySecret = xxxxxx 7 | spark.hadoop.fs.oss.endpoint = oss-xxxxxx-internal.aliyuncs.com 8 | ``` 9 | 10 | - 配置Hadoop实现类(二选一即可) 11 | ``` 12 | ### 使用jindo sdk(推荐方式,性能更优) 13 | spark.hadoop.fs.AbstractFileSystem.oss.impl=com.aliyun.emr.fs.oss.OSS 14 | spark.hadoop.fs.oss.impl=com.aliyun.emr.fs.oss.JindoOssFileSystem 15 | 16 | ### 使用hadoop-fs-oss 17 | spark.hadoop.fs.oss.impl=org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem 18 | ``` 19 | 20 | - 【公共云】需要引用hadoop oss依赖,添加以下配置(二选一即可): 21 | ``` 22 | ### 使用jindo sdk(推荐方式,性能更优) 23 | spark.hadoop.odps.cupid.resources=public.jindofs-sdk-3.7.2.jar 24 | 25 | ### 使用hadoop-fs-oss 26 | spark.hadoop.odps.cupid.resources=public.hadoop-fs-oss-shaded.jar 27 | ``` 28 | 29 | - 【专有云】需要引用hadoop-fs-oss.jar包,需要按照以下步骤上传资源并添加配置: 30 | ``` 31 | (1)下载hadoop-fs-oss.jar包,下载地址(https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/hadoop-fs-oss-shaded.jar) 32 | (2)将jar包上传为MaxCompute资源,参考文档(https://help.aliyun.com/document_detail/27831.html?spm=a2c4g.27797.0.i1#section-533-s8q-d9w) 33 | (3)添加参数:spark.hadoop.odps.cupid.resources=.hadoop-fs-oss-shaded.jar 34 | ``` 35 | 36 | - 需要注意:如果已经配置过spark.hadoop.odps.cupid.resources这个参数,则引用多个资源需要用逗号隔开,参考[文档](https://github.com/aliyun/MaxCompute-Spark/wiki/03.-Spark%E9%85%8D%E7%BD%AE%E8%AF%A6%E8%A7%A3#maxcompute%E6%95%B0%E6%8D%AE%E4%BA%92%E9%80%9A%E9%85%8D%E7%BD%AE) 37 | 38 | ## 例子1:判断oss文件是否存在 39 | ``` 40 | from pyspark.sql import SparkSession 41 | 42 | spark = SparkSession.builder.appName('testoss').getOrCreate() 43 | sc = spark.sparkContext 44 | conf = sc._jsc.hadoopConfiguration() 45 | conf.set("fs.oss.accessKeyId", "xxxx") 46 | conf.set("fs.oss.accessKeySecret", "xxx") 47 | conf.set("fs.oss.endpoint", "xxxx") 48 | conf.set("fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem") 49 | 50 | path = sc._jvm.org.apache.hadoop.fs.Path("oss://xxxxx") 51 | fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(path.toUri(), conf) 52 | exist = fs.exists(path) 53 | ``` 54 | 55 | 56 | ## 例子2:写oss 57 | ``` 58 | spark = SparkSession.builder.appName('testoss').getOrCreate() 59 | data = [i for i in range(0, 100)] 60 | df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int") 61 | df.show(n=10) 62 | ## write to oss 63 | pathout = 'oss://[替换为实际Bucket]/test.csv' 64 | df.write.csv(pathout) 65 | ``` 66 | -------------------------------------------------------------------------------- /docs/docs/development/pyspark/pyspark-thirdparty.md: -------------------------------------------------------------------------------- 1 | # PySpark 使用mmlspark和analytics zoo 2 | ## 使用mmlspark 3 | #### 背景 4 | - mmlspark开源库地址:https://github.com/microsoft/SynapseML 5 | - 由于MaxCompute Spark访问外部网络有限制,因此提供以下方案在MaxCompute Spark中使用mmlspark 6 | 7 | #### 使用方式 8 | - 第一步:下载Jar包:首先需要在本地客户端下载mmlspark的所有jar包 9 | ``` 10 | 1. 在本地下载一个spark客户端 11 | 12 | 2. 配置spark-defaults.conf,添加以下参数 13 | spark.jars.repositories=https://mmlspark.azureedge.net/maven 14 | 15 | 3. 使用local模式在本地执行以下命令: 16 | $SPARK_HOME/bin/pyspark --packages com.microsoft.ml.spark:mmlspark_2.11:1.0.0-rc1 17 | 18 | 4. jar包通常会下载到以下目录: 19 | $HOME/.ivy2/jars 20 | 21 | 5. 将所有的jar包压缩为一个zip包: 22 | cd $HOME/.ivy2/jars 23 | zip -r mml_spark.zip . 
24 | ``` 25 | 26 | - 第二步:修改spark-defaults.conf 27 | ``` 28 | spark.executor.extraClassPath=./mml_spark.zip/* 29 | spark.driver.extraClassPath=./mml_spark.zip/* 30 | ``` 31 | 32 | - 第三步:使用Yarn-cluster模式提交任务到集群中,注意需要包含 --py-files 33 | ``` 34 | ./bin/spark-submit --archives mml_spark.zip --py-files mml_spark/com.microsoft.ml.spark_mmlspark_2.11-1.0.0-rc1.jar,mml_spark/com.microsoft.ml.lightgbm_lightgbmlib-2.3.100.jar spark_mml.py 35 | ``` 36 | 37 | ## 使用analytics-zoo 38 | #### 相关开源库 39 | - https://github.com/intel-analytics/analytics-zoo 40 | - https://github.com/intel-analytics/BigDL 41 | - https://analytics-zoo.github.io/master/#release-download/ 42 | 43 | #### 参考使用方式 44 | - 注意:下文使用analytics-zoo 0.11.0版本 45 | 46 | - 第一步:Python打包 47 | ``` 48 | 1. ./bin/pip3 install analytics-zoo -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com 49 | 50 | 2. 注意:Python包比较大,可以卸载Pyspark(Maxcompute Spark中已包含PySpark) 51 | ./bin/pip3 uninstall pyspark 52 | 53 | 3. 打包为压缩包,下文使用该名称:python-3.6.14-big-dl.tar.gz 54 | 55 | ``` 56 | 57 | - 第二步:将Python包上传到Maxcompute resource中 58 | 59 | - 第三步:将需要的三个Jar包拷贝出来,路径在 60 | ``` 61 | $python_home/lib/python3.6/site-packages/zoo/share/lib 62 | ○ analytics-zoo-bigdl_0.13.0-spark_2.4.6-0.11.1-jar-with-dependencies.jar 63 | 64 | $python_home/lib/python3.6/site-packages/bigdl/share/lib 65 | ○ bigdl-0.13.0-jar-with-dependencies.jar 66 | ○ bigdl-0.13.0-python-api.zip 67 | ``` 68 | 69 | - 第四步:BigDL重新打包(解决log4j类冲突) 70 | 71 | ``` 72 | 1. 首先找到对应BigDL的版本:如0.11.0对应BigDL的版本是0.13.0 73 | 74 | 2. 下载BigDL源码 75 | 76 | git clone https://github.com/intel-analytics/BigDL.git 77 | 78 | 3. 切换到0.13分支 79 | 80 | git checkout branch-0.13 81 | 82 | 4. 编译打包 83 | cd BigDL/spark/dl/ 84 | mvn clean package -DskipTests; 85 | 86 | 5. 替换Jar包 87 | 用target目录下生成的bigdl-0.13.1-SNAPSHOT-jar-with-dependencies.jar文件来替换第三步中的bigdl-0.13.0-jar-with-dependencies.jar 88 | 89 | ``` 90 | 91 | - 第五步:在spark-defaults.conf中配置Python包 92 | ``` 93 | spark.hadoop.odps.cupid.resources = [projectname].python-3.6.14-big-dl.tar.gz 94 | spark.pyspark.python = ./[projectname].python-3.6.14-big-dl.tar.gz/python-3.6.14-big-dl/bin/python3 95 | ``` 96 | 97 | - 第六步:提交任务,需要携带第三步和第四步中生成的jar包: 98 | ``` 99 | ./bin/spark-submit --jars analytics-zoo-bigdl_0.13.0-spark_2.4.6-0.11.1-jar-with-dependencies.jar,bigdl-0.13.1-SNAPSHOT-jar-with-dependencies.jar,bigdl-0.13.0-python-api.zip spark_test.py 100 | ``` 101 | 102 | -------------------------------------------------------------------------------- /docs/docs/development/spark-streaming/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Spark Streaming", 3 | "position": 4, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/development/spark-streaming/streaming-datahub.md: -------------------------------------------------------------------------------- 1 | # Streaming读写DataHub 2 | MaxCompute支持Spark Streaming(DStream)和Spark Structured Streaming,本文介绍Streaming作业流式接收DataHub数据源的示例。 3 | 4 | ## DataHub 数据源 5 | 6 | 首先, 得在阿里云DataHub拥有数据源,[DataHub控制台传送门](https://datahub.console.aliyun.com/datahub) 7 | 8 | * 获取projectName 9 | ![image1](../../resources/datahub-1.jpg) 10 | 11 | * 获取topic 12 | ![image2](../../resources/datahub-2.jpg) 13 | 14 | * 获取subId 15 | 16 | 注意,每一个Streaming程序只能对应一个subId,如果有多个程序要读同一个topic,那么需要多个订阅 17 | 18 | ![image3](../../resources/datahub-3.jpg) 19 | 20 | * 获取endPoint 21 | 22 | 
每个region的endPoint都是不一样的,参考[如何配置EndPoint](https://help.aliyun.com/document_detail/47442.html?spm=5176.11065259.1996646101.searchclickresult.4a6e46e8r26UYT) 23 | 24 | ## Spark Streaming(DStream) 25 | 26 | ``` 27 | 28 | 29 | com.aliyun.emr 30 | emr-datahub_${scala.binary.version} 31 | 1.6.0 32 | 33 | 34 | 35 | com.aliyun.datahub 36 | aliyun-sdk-datahub 37 | 2.9.4-public 38 | 39 | 40 | ``` 41 | 42 | [Streaming Access DataHub样例代码](https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/datahub/DataHubStreamingDemo.scala) 43 | 44 | ## 配置详解 45 | 46 | ``` 47 | val dataStream = DatahubUtils.createStream( 48 | ssc, 49 | "projectName", 50 | "topic", 51 | "subId", 52 | "accessId", // 云账号accessId 53 | "accessKey", // 云账号accessKey 54 | "endPoint", 55 | transferFunc(_), // 见Demo注释 56 | StorageLevel.MEMORY_AND_DISK 57 | ) 58 | ``` 59 | 60 | ## DataHub回流到MaxCompute 61 | 利用DStream+Dataframe可以把DataHub数据回流到MaxCompute 62 | 63 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/datahub/DataHub2OdpsDemo.scala 64 | 65 | 66 | ## Spark Structured Streaming 67 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/datahub/DatahubStructuredStreamingDemo.scala 68 | 69 | source的示例如下(请参考代码): 70 | ``` 71 | val stream = spark.readStream 72 | .format("datahub") 73 | .option("datahub.endpoint", "http://....") 74 | .option("datahub.project", "project") 75 | .option("datahub.topic", "topic1") 76 | .option("datahub.accessId", "accessId") 77 | .option("datahub.accessKey", "accessKey") 78 | .option("datahub.startingoffsets", "latest") 79 | .option("datahub.maxoffsetsperTrigger", 20000) // optional 80 | .load() 81 | ``` 82 | 83 | sink的示例如下: 84 | ``` 85 | val query = spark.writeStream 86 | .format("datahub") 87 | .option("datahub.endpoint", "http://....") 88 | .option("datahub.project", "project") 89 | .option("datahub.topic", "topic1") 90 | .option("datahub.accessId", "accessId") 91 | .option("datahub.accessKey", "accessKey") 92 | .load() 93 | ``` 94 | 95 | 其中datahub.endpoint请使用**经典网络ECS Endpoint**,各region对应的endpoint参考[此文](https://help.aliyun.com/document_detail/47442.html#h2-datahub-1)。此外,需要将endpoint配置在VPC访问配置中,参考[VPC访问](https://github.com/aliyun/MaxCompute-Spark/wiki/09.-VPC-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E)。示例如下: 96 | ``` 97 | { 98 | "regionId":"cn-beijing", 99 | "vpcs":[ 100 | { 101 | "zones":[ 102 | { 103 | "urls":[ 104 | { 105 | "domain":"dh-cn-beijing.aliyun-inc.com", 106 | "port":80 107 | } 108 | ] 109 | } 110 | ] 111 | } 112 | ] 113 | } 114 | ``` 115 | 116 | **注意:** 目前所给的这个Demo,没有启用checkpoint,checkpoint需要使用oss作为checkpoint的存储,另外Spark Streaming作业处于试用阶段,**作业最长运行时间不能超过3天,如果需要投入长时间正式运行使用,请联系我们开通相关权限。** 117 | -------------------------------------------------------------------------------- /docs/docs/development/spark-streaming/streaming-kafka.md: -------------------------------------------------------------------------------- 1 | # Streaming读写kafka 2 | MaxCompute支持Spark Streaming和Spark Structured Streaming,本文介绍Streaming作业流式读写kafka的示例。 3 | 4 | ## Spark Streaming(DStream) 5 | 该示例是基于一个Kafka的Receiver,适用于DStream的接口。 6 | 7 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/kafka/KafkaStreamingDemo.scala 8 | 9 | ## Kafka回流到MaxCompute 10 | 通过DStreaming+Dataframe把Kafka数据导入MaxCompute 11 | 12 | > 
详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/kafka/Kafka2OdpsDemo.scala 13 | 14 | ## Spark Structured Streaming 15 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/kafka/KafkaStructuredStreamingDemo.scala 16 | 17 | source的示例如下(请参考代码): 18 | ``` 19 | val df = spark 20 | .readStream 21 | .format("kafka") 22 | .option("kafka.bootstrap.servers", "192.168.72.224:9202,192.168.72.225:9202,192.168.72.226:9202") 23 | .option("subscribe", "zkytest") 24 | .load() 25 | ``` 26 | 27 | 由于kafka在VPC内,需要将endpoint配置在VPC访问配置中,参考[VPC访问](https://github.com/aliyun/MaxCompute-Spark/wiki/09.-VPC-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E)。示例如下: 28 | ``` 29 | { 30 | "regionId":"cn-beijing", 31 | "vpcs":[ 32 | { 33 | "vpcId":"vpc-2zeaeq21mb1dmkqh0exox" 34 | "zones":[ 35 | { 36 | "urls":[ 37 | { 38 | "domain":"192.168.72.224", 39 | "port":9202 40 | }, 41 | { 42 | "domain":"192.168.72.225", 43 | "port":9202 44 | }, 45 | { 46 | "domain":"192.168.72.226", 47 | "port":9202 48 | } 49 | ] 50 | } 51 | ] 52 | } 53 | ] 54 | } 55 | ``` 56 | 57 | **注意:** 目前所给的这个Demo,没有启用checkpoint,checkpoint需要使用oss作为checkpoint的存储,另外Spark Streaming作业处于试用阶段,**作业最长运行时间不能超过3天,如果需要投入长时间正式运行使用,请联系我们开通相关权限。** 58 | -------------------------------------------------------------------------------- /docs/docs/development/spark-streaming/streaming-loghub.md: -------------------------------------------------------------------------------- 1 | # Streaming读写LogHub 2 | MaxCompute支持Spark Streaming和Spark Structured Streaming,本文介绍Streaming作业流式接收LogHub(日志服务的一个组件,日志服务详见[官方文档](https://www.aliyun.com/product/sls))的示例。 3 | 4 | ## Spark Streaming(DStream) 5 | 该示例是基于一个LogHub的Receiver(类似基于Spark之上接收Kafka流的Receiver),适用于DStream的接口。 6 | 7 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/loghub/LogHubStreamingDemo.scala 8 | 9 | 运行这个Demo,需要在Spark的配置中给出LogHub的如下几个配置: 10 | 11 | ``` 12 | spark.logservice.accessKeyId : loghub的accessId 13 | spark.logservice.accessKeySecret : loghub的accessKey 14 | spark.logservice.endpoint : loghub的endpoint,需要根据project所在的region进行选择 15 | spark.logservice.project : 需要读取的loghub的project名字 16 | spark.logservice.logstore : 需要读取的logstore的名字 17 | ``` 18 | 另外StreamingParam#setCursor(LogHubCursorPosition.END_CURSOR) 和 StreamingParam#setGroup("test") 这俩配置的含义可以参考[LogHub官方文档的介绍](https://help.aliyun.com/document_detail/28998.html?spm=a2c4g.11186623.6.877.2ea24bbcd6eDg5)。 19 | 20 | ## LogHub回流到MaxCompute 21 | 利用DStream+Dataframe可以把LogHub数据回流到MaxCompute。 22 | 23 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/loghub/LogHub2OdpsDemo.scala 24 | 25 | ## Spark Structured Streaming 26 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/loghub/LoghubStructuredStreamingDemo.scala 27 | 28 | source的示例如下(请参考代码): 29 | ``` 30 | val stream = spark.readStream 31 | .format("loghub") 32 | .option("loghub.endpoint", "http://....") 33 | .option("loghub.project", "project") 34 | .option("loghub.logstores", "store1,store2") 35 | .option("loghub.accessId", "accessId") 36 | .option("loghub.accessKey", "accessKey") 37 | .option("loghub.startingoffsets", "latest") 38 | .load() 39 | ``` 40 | 41 | sink的示例如下: 42 | ``` 43 | val query = spark.writeStream 44 | 
.format("loghub") 45 | .option("loghub.endpoint", "http://....") 46 | .option("loghub.project", "project") 47 | .option("loghub.logstores", "store1,store2") 48 | .option("loghub.accessId", "accessId") 49 | .option("loghub.accessKey", "accessKey") 50 | .start() 51 | ``` 52 | 53 | 其中loghub.endpoint请使用**经典/VPC网络服务入口**,各region对应的endpoint参考[此文](https://help.aliyun.com/document_detail/29008.html#h2-url-2)。此外,需要将endpoint配置在VPC访问配置中,参考[VPC访问](https://github.com/aliyun/MaxCompute-Spark/wiki/09.-VPC-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E)。示例如下: 54 | ``` 55 | { 56 | "regionId":"cn-beijing", 57 | "vpcs":[ 58 | { 59 | "zones":[ 60 | { 61 | "urls":[ 62 | { 63 | "domain":"cn-beijing-intranet.log.aliyuncs.com", 64 | "port":80 65 | } 66 | ] 67 | } 68 | ] 69 | } 70 | ] 71 | } 72 | ``` 73 | 74 | **注意:** 目前所给的这个Demo,没有启用checkpoint,checkpoint需要使用oss作为checkpoint的存储,另外Spark Streaming作业处于试用阶段,**作业最长运行时间不能超过3天,如果需要投入长时间正式运行使用,请联系我们开通相关权限。** 75 | -------------------------------------------------------------------------------- /docs/docs/faq/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "常见问题", 3 | "position": 6, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/faq/allocate-resource.md: -------------------------------------------------------------------------------- 1 | # 资源申请问题 2 |

# 资源申请

3 | 通常用户在提交Spark作业时需要关注以下几种资源: 4 | 5 | * **Executor 数量** 6 | * **Executor 内存** 7 | * **Executor core** 8 | * **Driver 内存** 9 | * **Driver core** 10 | * **本地网盘** 11 | 12 | ## Executor 相关参数 13 | * spark.executor.instances 14 | + 总共申请的executor数目,普通任务十几个或者几十个足够了,若是处理大量数据时可以申请多一些,100—2000+ 15 | * spark.executor.cores 16 | + 每个executor的核数,即每个executor中的可同时运行的task数目 17 | + Spark任务的最大并行度是executor数目*executor core数 18 | * spark.executor.memory 19 | + 代表申请executor的堆内内存,也就是启动jvm进程时设置的Xmx参数 20 | * spark.executor.memoryOverhead 21 | + 申请executor的堆外内存,默认单位是MB,主要用于JVM自身,字符串, NIO Buffer等开销 22 | + 默认为executor Memory*0.1,最小384M 23 | + 如果遇到Cannot allocate memory,通常是堆外内存不足,可以考虑适当增大spark.executor.memoryOverhead 24 | + 注意:单个Executor的内存总量是spark.executor.memory+spark.executor.memoryOverhead 25 | 26 | ## Driver 相关参数 27 | * spark.driver.cores 28 | * spark.driver.memory 29 | * spark.yarn.driver.memoryOverhead 30 | * spark.driver.maxResultSize 31 | + 默认1g,控制worker送回driver的数据大小,一旦超出该限制,driver会终止执行 32 | ## 本地网盘参数 33 | * spark.hadoop.odps.cupid.disk.driver.device_size 34 | + 代表本地网盘大小,默认值为20g 35 | + Spark使用网盘作为本地存储,Driver和每个Executor都有一个,Shuffle数据以及BlockManager溢出的数据均存储在网盘上 36 | + 当出现**No space left on device**时可适当调大该值,最大支持100g。如果调整到100g仍然会出现此错误,需要分析具体原因,可能是:1. 数据倾斜,在shuffle或者cache过程中数据集中分布在某些block;2. 可以缩小单个executor的并发(spark.executor.cores) 3. 增加executor的数量(spark.executor.instances) 37 | + **注意** `必须配置在spark-conf文件或者dataworks的配置项中,不能配置在代码中` 38 | 39 |
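下面给出一份示意性的资源配置片段(假设写在spark-defaults.conf或DataWorks配置项中;各参数取值仅为示例,并非推荐值,请结合上文各参数说明和作业实际数据量调整):
```
## 以下取值仅为示例,请按需调整
spark.executor.instances = 20
spark.executor.cores = 2
spark.executor.memory = 8g
## 堆外内存,默认单位为MB
spark.executor.memoryOverhead = 1024
spark.driver.cores = 2
spark.driver.memory = 8g
## 本地网盘大小,只能配置在spark-conf文件或DataWorks配置项中
spark.hadoop.odps.cupid.disk.driver.device_size = 50g
```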

# 如何合理设置资源参数

40 | 41 | * 建议按照内存/CPU 1:4来申请资源,即1 core对应4GB内存,建议单个worker core数不要超过8 42 | 43 | * 用户可以通过查看logview中Master或Worker的Sensor来获取运行中的内存和CPU使用情况 44 | ![sensor](../resources/fuxisensor.png) 45 | 46 | * 通常需要关注mem_rss,代表了executor或driver在实际使用时的内存变化曲线,用户可以根据该值变化来判断是否需要增加/减少内存 47 | ![sensor](../resources/fuxisensor2.png) 48 | 49 | 50 |

# 资源等待

51 | 52 | ## 注意事项 53 | * 用户在集群模式下必须配置spark.master=yarn-cluster才会正确的申请资源(注意local模式调试完之后要将代码中的spark.master=local配置去掉) 54 | 55 | ## 如何等待资源申请到之后提交Job 56 | * 申请资源是一个持续不断的过程,因此可能会出现拿到的资源没有达到用户请求的数量,而spark是不会等到所有的Executor都获取到之后再开始执行任务,可以通过以下参数来控制Spark提交任务的时机 57 | + spark.scheduler.maxRegisteredResourcesWaitingTime:在执行前最大等待申请资源的时间,默认30s。 58 | + spark.scheduler.minRegisteredResourcesRatio:实际注册的资源数占预期需要的资源数的比例,默认0.8 59 | 60 | ## 申请不到资源的可能原因: 61 | * 如果是预付费用户,一般是用户申请的资源超出了购买的资源数量,可以登陆管家进行确认 62 | * 如果是后付费用户,需要抢占资源 63 | 64 | 65 | ## 申请不到资源解决方案 66 | * 调整任务资源:调整用户申请的Executor总数或者单个Executor的资源数量(一般是内存) 67 | * 合理安排任务执行时间 68 | 69 | ## 没有申请不到资源的几种现象 70 | 71 | * 在driver端一般会打以下日志: 72 | WARN YarnClusterScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources 73 | ![](../resources/资源申请1.png) 74 | 75 | * 在Logview中只能看到driver,而worker数量为0 76 | * 在Spark ui中只能看到driver,而worker数量为0 77 | ![](../resources/资源申请2.png) -------------------------------------------------------------------------------- /docs/docs/faq/class-conflict.md: -------------------------------------------------------------------------------- 1 | # Java Scala类冲突问题 2 | 3 | ## 类冲突问题概述 4 | * 这类报错一般会抛出异常java.lang.NoClassDefFoundError或者方法找不到等问题,需要检查pom并将冲突的依赖排除掉 5 | * 原因在于:用户jar包中很可能打了一些依赖进去,这些依赖的jar包与spark jars目录下的jar包由于版本不一致,jvm在加载类的时候优先加载了用户的jar包 6 | 7 | ## 需要注意的问题 8 | 9 | ### 依赖为provided和compile的区别 10 | 11 | * provided:代码依赖该jar包,但只在编译的时候需要用,而运行时不需要,运行时会去集群中去寻找的相应的jar包,很多时候把jar包的类型设置为provided类型,就是因为这些jar包已经在集群中提供了(主要是spark客户端的jars目录,该目录中包含的jar包通常应该设置为provided),如果不设置为provided,某些时候可能可以正常运行,某些时候就会发生类冲突,类/方法找不到等各种问题。 12 | * compile:代码依赖该jar包,在编译、运行时候都需要,也就是说集群中不存在这些jar包,需要用户打到自己的jar包中。这种类型的jar包一般是一些三方库,且与spark运行无关,与用户代码逻辑有关。 13 | 14 | ### 主jar包必须是一个fat jar 15 | * 必须要把compile类型的依赖都打到用户jar包中,这样在代码运行时才能加载到这些依赖类 16 | 17 | 18 | ## POM自检 19 | 20 | ### 需要设置为provided的jar包 21 | * groupId为org.apache.spark的jar包 22 | + **说明** 这类jar包主要是社区版spark的jar包,已经在spark客户端的jars目录下提供,不需要打进用户的jar包,会在spark客户端提交任务时自动上传到MaxCompute集群中 23 | 24 | * cupid-sdk 25 | + **说明** 该jar包在任务提交时自动上传到MaxCompute集群中 26 | 27 | * odps-sdk 28 | + **说明** 该jar包在任务提交时自动上传到MaxCompute集群中 29 | 30 | * hadoop-yarn-client 31 | + **说明** 该jar包用于任务上传 32 | + **注意** 该jar包可能会被间接依赖,因此最好在打包之前检查并将该依赖排除 33 | 34 | ### 不能设置为provided的jar包 35 | * oss相关的jar包 36 | + **举例** hadoop-fs-oss 37 | + **说明** 该jar包属于第三方jar包,如果需要访问oss,需要打到用户jar包中 38 | 39 | * 流式相关的jar包 40 | + **举例** streaming-lib 41 | + **说明** 该jar包提供了一些spark streaming的接口来访问datahub和loghub,如果用户需要使用,则需要打到用户jar包中 42 | 43 | * 用户访问其他服务用到的jar包 44 | + **举例** 访问mysql等其他第三方服务需要用到的jar包 45 | 46 | 47 | -------------------------------------------------------------------------------- /docs/docs/faq/github-images.md: -------------------------------------------------------------------------------- 1 | # Github图片无法访问的问题 2 | 如果查看文档时发现图片无法显示,那么需要到 https://www.ipaddress.com/ 查看raw.githubusercontent.com的ipv4地址,然后使用管理员权限修改hosts文件,添加: 3 | 4 | [**ip地址**] raw.githubusercontent.com -------------------------------------------------------------------------------- /docs/docs/faq/network-access.md: -------------------------------------------------------------------------------- 1 | # 访问VPC和OSS的问题 2 |

# 访问OSS常见问题

3 | 4 | ## 线上作业time out 5 | + 用户在local模式通常需要采用公网的oss域名,提交到线上(yarn-cluster模式)运行时需要改成internal(内网)域名 6 | 7 | ## 依赖的问题 8 | + 示例中的hadoop-fs-oss必须以compile的方式打入用户主jar包 9 | + 如果用户使用PySpark,一定要把包含hadoop-fs-oss的fat jar包也上传到集群,否则会出现找不到OSS相关类的问题 10 | 11 |
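为便于对照,下面给出一个示意性的endpoint配置片段(以hadoop-fs-oss常用的fs.oss.endpoint配置项为例,region仅为假设,具体配置项名称与取值请以OSS访问相关文档为准):
```
## local模式本地调试:可使用公网endpoint(以下region仅为假设)
spark.hadoop.fs.oss.endpoint = oss-cn-hangzhou.aliyuncs.com
## 线上(yarn-cluster模式)运行:应改用internal内网endpoint
spark.hadoop.fs.oss.endpoint = oss-cn-hangzhou-internal.aliyuncs.com
```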

# 访问VPC常见问题

12 | 13 | ## 线上作业time out 14 | + vpc.domain.list 需要压缩成一行:建议通过 [网站](http://www.bejson.com/) 进行压缩,不要有空格 15 | + 如果不是使用ENI专线,则需在要访问的服务中添加ip白名单,允许100.104.0.0/16网段的访问 16 | + 如果是使用ENI专线,需要在要访问的服务中添加安全组白名单(开通ENI专线时使用的安全组) 17 | + smartnat只有北京和上海region可用,且必须设置为true 18 | + 用户要保证所有可能访问到的IP都已经加到vpc.domain.list,例如如果用户要访问位于hdfs,hbase这种多个节点的服务,一定要把所有的节点都添加进来,不然肯定会遇到time out的情况 19 | 20 | ## 访问公网 21 | 目前MaxCompute Spark运行在网络隔离环境中,如果需要访问公网,只能通过以下两种方式: 22 | 23 | * 中国公共云: 24 | + 提工单设置 project 级别白名单,如把 google.com:443 加到odps.security.outbound.internetlist 里面 25 | + 在Spark作业中配置公网访问白名单:spark.hadoop.odps.cupid.internet.access.list=google.com:443和spark.hadoop.odps.cupid.smartnat.enable=true 26 | 27 | * 开通专线 28 | + 提工单开通专线,配置专线参数spark.hadoop.odps.cupid.eni.info和spark.hadoop.odps.cupid.eni.enable=true 29 | + 在Spark作业中配置公网访问白名单:spark.hadoop.odps.cupid.internet.access.list=google.com:443 -------------------------------------------------------------------------------- /docs/docs/faq/oom-troubleshooting.md: -------------------------------------------------------------------------------- 1 | # 运行时OOM问题 2 |

# OOM的一些情况

3 | 4 | ## 如何查看worker以及master的内存使用情况 5 | Logview 2.0包含Sensor功能,可以查看master以及每个worker在运行时的内存和cpu使用情况 6 | 7 | 8 | ## Cannot allocate memory 9 | 10 | 1. 在某些Executor中出现Cannot allocate memory,一般是堆外内存不足,此时可以调整spark.yarn.executor.memoryOverhead参数 11 | 2. 在Driver中出现Cannot allocate memory,可以调整spark.yarn.driver.memoryOverhead参数 12 | ![image1](../resources/OOM1.png) 13 | 14 | ## java.lang.OutOfMemoryError: Java heap space 15 | 如果在Executor中出现该错误,通常是堆内内存不足,此时可以适当增大内存,或减少Executor core 16 | 17 | ## No route to host: workerd********* / Could not find CoarseGrainedScheduler 18 | 出现这类错误极有可能是某些Executor出现OOM 19 | 20 | 21 |
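以Cannot allocate memory为例,一种常见的处理方式是适当调大堆外内存,示意如下(取值仅为示例,需结合Sensor观察到的实际内存使用情况调整):
```
## Executor出现Cannot allocate memory时(示例值,单位MB)
spark.yarn.executor.memoryOverhead = 2048
## Driver出现Cannot allocate memory时(示例值,单位MB)
spark.yarn.driver.memoryOverhead = 2048
```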

# OOM解决方案

22 | 23 | 1. 限制executor 并行度,将cores 调小:多个同时运行的 Task 会共享一个Executor 的内存,使得单个 Task 可使用的内存减少,调小并行度能缓解内存压力 24 | 2. 增加单个Executor内存 25 | 3. 增加分区数量,减少每个executor负载 26 | 4. 考虑数据倾斜问题,因为数据倾斜导致某个 task 内存不足,其它 task 内存足够 -------------------------------------------------------------------------------- /docs/docs/faq/pyspark-faq.md: -------------------------------------------------------------------------------- 1 | # PySpark 常见问题 2 | ## Local 模式(Spark 2.4.5) 3 | 4 | - 新建一个odps.conf文件,包含以下odps参数: 5 | ``` 6 | odps.project.name=*** 7 | odps.access.id=*** 8 | odps.access.key=*** 9 | odps.end.point=*** 10 | ``` 11 | 12 | - 在PyCharm中添加以下环境变量: 13 | ``` 14 | SPARK_HOME=/path/to/spark_home 15 | PYTHONPATH=/path/to/spark_home/python 16 | ODPS_CONF_FILE=/path/to/odps.conf 17 | ``` 18 | 19 | - 在代码中添加以下配置: 20 | ``` 21 | spark = SparkSession.builder\ 22 | .appName("spark sql")\ 23 | .config("spark.eventLog.enabled", False)\ 24 | .getOrCreate() 25 | ``` 26 | 27 | - 直接运行pyspark作业即可 28 | 29 | ## Cluster 模式(Spark 2.4.5) 30 | 31 | #### 作业执行抛出异常:***.so: cannot open shared object file: No such file or directory 32 | 33 | 上述抛出的异常,提示用户作业在执行加载时缺少对应的依赖,具体解决步骤如下: 34 | ##### MaxCompute Spark客户端 35 | * 公网下载对应的依赖文件 36 | * 提交作业时通过参数 **--files /path/to/[lib名]** 将对应的依赖文件加载至driver与executor的工作目录内 37 | 38 | ##### Dataworks Spark节点 39 | * 公网下载对应的依赖文件 40 | * 通过DataWorks,添加对应的依赖资源,即,创建MaxCompute资源 41 | * 作业提交新增补充参数,spark.hadoop.odps.cupid.resources = public.python-2.7.13-ucs4.tar.gz,[project名].[resource名].so:[resource名].so, 42 | 43 | ###### 注意事项 44 | ``` 45 | 由于上传的依赖资源是以project名称为前缀,所以需要对上传的resource名称进行重命名为需要的依赖,即,去掉project名称的前缀,这样才可以正确加载依赖 46 | ``` 47 | -------------------------------------------------------------------------------- /docs/docs/faq/read-transactional-table.md: -------------------------------------------------------------------------------- 1 | # 读取ACID表问题 2 | 新版本Spark支持读取ACID表,需要添加以下参数切换到新版本 3 | 4 | ## Spark 2.3.0 5 |
 6 | spark.hadoop.odps.task.major.version = default
 7 | spark.hadoop.odps.cupid.resources = public.__spark_libs__2.3.0-odps0.34.0.zip
 8 | spark.driver.extraClassPath = ./public.__spark_libs__2.3.0-odps0.34.0.zip/* 
 9 | spark.executor.extraClassPath = ./public.__spark_libs__2.3.0-odps0.34.0.zip/*
10 | 
11 | 12 | ## Spark 2.4.5 13 |
14 | spark.hadoop.odps.task.major.version = default
15 | spark.hadoop.odps.cupid.resources = public.__spark_libs__2.4.5-odps0.34.0.zip
16 | spark.driver.extraClassPath = ./public.__spark_libs__2.4.5-odps0.34.0.zip/* 
17 | spark.executor.extraClassPath = ./public.__spark_libs__2.4.5-odps0.34.0.zip/*
18 | 
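如果不想修改spark-defaults.conf,也可以在提交作业时通过--conf传入上述参数,示意如下(以Spark 2.4.5为例,其中主类名与jar包路径均为占位符):
```
cd $SPARK_HOME
bin/spark-submit --master yarn-cluster \
  --conf spark.hadoop.odps.task.major.version=default \
  --conf spark.hadoop.odps.cupid.resources=public.__spark_libs__2.4.5-odps0.34.0.zip \
  --conf "spark.driver.extraClassPath=./public.__spark_libs__2.4.5-odps0.34.0.zip/*" \
  --conf "spark.executor.extraClassPath=./public.__spark_libs__2.4.5-odps0.34.0.zip/*" \
  --class com.example.YourMainClass \
  /path/to/your-application-shaded.jar
```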
-------------------------------------------------------------------------------- /docs/docs/faq/ref-external-file.md: -------------------------------------------------------------------------------- 1 | # 引用外部文件问题 2 | 需要引用到外部文件的场景 3 | + 用户作业需要读取一些配置文件 4 | + 用户作业需要额外的jar包/Python库 5 | 6 |

# 如何上传文件

7 | 8 | 上传文件有两种方式 9 | * 通过Spark参数上传文件 10 | * 通过MaxCompute Resource上传文件 11 | 12 | ## Spark参数 13 | MaxCompute Spark支持Spark社区版原生的--jars,--py-files等参数,可以在作业提交时通过这些参数将文件上传,这些文件在任务运行时会被上传到用户的工作目录下。 14 | 15 | 在不同的运行模式下上传文件: 16 | * 通过Spark客户端:直接使用spark-submit命令行参数 17 |
18 | **注意事项**
19 | * --jars选项,会将配置的jar包上传至Driver和Executor的当前工作目录,多个文件逗号分隔,这些jar包都会加入Driver和Executor的Classpath,Spark作业中直接"./your_jar_name"即可引用,与社区版Spark行为相同。
20 | * --files, --py-files选项,会将配置的 普通文件/python文件 上传至Driver和Executor的当前工作目录,多个文件逗号分隔,Spark作业中直接"./your_file_name"即可引用,与社区版Spark行为相同。
21 | * --archives选项,与社区版Spark行为略有不同,多个逗号分隔,配置方式为xxx#yyy,会将配置的归档文件(例如.zip)解压到Driver和Executor的当前工作目录的子目录中。举例:当配置为xx.zip#yy时,应以"./yy/xx/"引用到归档文件中的内容;当仅配置xx.zip时,应以"./xx.zip/xx/"引用到内容。若一定需要将归档内容直接解压到当前目录,即直接引用"./xxx/",请使用下面提到的spark.hadoop.odps.cupid.resources配置。
22 | 
23 | 24 | * 通过DataWorks添加任务需要的资源,参见[文档](https://github.com/aliyun/MaxCompute-Spark/wiki/02.-Spark-on-Dataworks) 25 | 26 | ## MaxCompute Resource 27 | MaxCompute Spark提供spark.hadoop.odps.cupid.resources参数,可以直接引用MaxCompute中的资源,这些资源在任务运行时会被上传到用户的工作目录下。 28 | 29 | 使用方式 30 | ``` 31 | 1. 通过MaxCompute客户端将文件上传(单个文件最大支持500MB) 32 | 2. 在Spark作业配置中添加spark.hadoop.odps.cupid.resources参数 33 | 格式为.,如果需要引用多个文件,需要用逗号隔开 34 | ``` 35 | ### spark.hadoop.odps.cupid.resources参数介绍 36 | + **配置说明** `该配置项指定了任务运行所需要的`[Maxcompute资源](https://help.aliyun.com/document_detail/27831.html?spm=5176.11065259.1996646101.searchclickresult.d55650ea0QU1qd&aly_as=45TiiTdO2) 37 | + **配置示例** spark.hadoop.odps.cupid.resources=public.python-python-2.7-ucs4.zip,public.myjar.jar 38 | + **使用说明** `指定的资源将被下载到driver和executor的当前工作目录,资源下载到工作目录后默认的名字是.` 39 | + **文件重命名** `在配置时通过.:进行重命名` 40 | + **重命名示例** spark.hadoop.odps.cupid.resources=public.myjar.jar:myjar.jar 41 | + **注意** `该配置项必须要配置在spark-default.conf中或dataworks的配置项中才能生效,而不能写在代码中` 42 | 43 | 44 |
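两种上传方式的用法示意如下(其中文件名、project名、主类名均为占位符,--archives的引用规则见上文注意事项):
```
## 方式一:通过spark-submit参数上传(示例)
cd $SPARK_HOME
bin/spark-submit --master yarn-cluster \
  --files /path/to/your.conf \
  --archives /path/to/xx.zip#yy \
  --class com.example.YourMainClass \
  /path/to/your-application-shaded.jar

## 方式二:通过MaxCompute Resource引用并重命名(需写在spark-defaults.conf或DataWorks配置项中,示例)
spark.hadoop.odps.cupid.resources = your_project.your.conf:your.conf
```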

# 如何在代码中引用文件

45 | 通过上述两种方式可以将文件上传到任务的当前工作目录,文件读取示例: 46 | 47 | ``` 48 | val targetFile = "文件名" 49 | val file = Source.fromFile(targetFile) 50 | for (line <- file.getLines) 51 | println(line) 52 | file.close 53 | ``` -------------------------------------------------------------------------------- /docs/docs/faq/spark-24-notes.md: -------------------------------------------------------------------------------- 1 | # Spark 2.4.5 使用注意事项 2 | ## 如何使用Spark 2.4.5提交作业 3 | * 直接使用Yarn-cluster模式在本地提交任务, 添加 spark.hadoop.odps.spark.libs.public.enable=true和spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0 这两个参数可以加速包上传速度 4 | 5 | * 或在Dataworks中配置参数 spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0,注意,若Dataworks独享资源组尚未升级到Spark 2.4.5,用户可以采用公共资源组进行调度,或联系Dataworks平台官方人员进行升级 6 | 7 | ## Spark 2.4.5 使用变化 8 | * 如果使用Yarn-cluster模式在本地提交任务,需要新增环境变量 export HADOOP_CONF_DIR=$SPARK_HOME/conf 9 | 10 | * 如果使用local模式进行调试,需要在$SPARK_HOME/conf目录下新建odps.conf文件,并添加以下配置: 11 | ``` 12 | odps.project.name = 13 | odps.access.id = 14 | odps.access.key = 15 | odps.end.point = 16 | ``` 17 | 18 | ## Spark 2.4.5 参数配置变化 19 | 20 | * `spark.sql.catalogImplementation` 21 | + **配置值** `hive` 22 | * `spark.sql.sources.default` 23 | + **配置值** `hive` 24 | * `spark.sql.odps.columnarReaderBatchSize` 25 | + **默认值** `4096` 26 | + **配置说明** `向量化读每个batch包含的行数` 27 | * `spark.sql.odps.enableVectorizedReader` 28 | + **默认值** `true` 29 | + **配置说明** `开启向量化读` 30 | * `spark.sql.odps.enableVectorizedWriter` 31 | + **默认值** `true` 32 | + **配置说明** `开启向量化写` 33 | * `spark.sql.odps.split.size` 34 | + **默认值** `256m` 35 | + **配置说明** `该配置可以用来调节读Maxcompute表的并发度,默认每个分区为256MB` 36 | * `spark.hadoop.odps.cupid.vnet.capacity` 37 | + **默认值** `802` 38 | + **配置说明** `该配置用于设置最大的instance数量,建议配置值为spark.executor.instances + 2,否则可能会遇到create virtual net failed错误。该参数需要设置到spark-defaults.conf或Dataworks配置项中` 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /docs/docs/faq/spark-31-notes.md: -------------------------------------------------------------------------------- 1 | # Spark 3.1.1 使用注意事项 2 | ## 如何使用Spark 3.1.1提交作业 3 | * 直接使用Yarn-cluster模式在本地提交任务 4 | 5 | * 通过DataWorks平台选择Spark 3.x选项。若提交任务报错,则需要提单升级独享资源组版本。 6 | 7 | ## Spark 3.1.1 使用变化 8 | * 如果使用Yarn-cluster模式从本地提交任务,需要新增环境变量 export HADOOP_CONF_DIR=$SPARK_HOME/conf 9 | 10 | * 如果使用Yarn-cluster模式提交Pyspark作业,需要添加以下参数使用Python3 11 | ``` 12 | spark.hadoop.odps.cupid.resources = public.python-3.7.9-ucs4.tar.gz 13 | spark.pyspark.python = ./public.python-3.7.9-ucs4.tar.gz/python-3.7.9-ucs4/bin/python3 14 | ``` 15 | 16 | * 如果使用local模式进行调试,需要在类路径下新建odps.conf文件,并添加以下配置: 17 | ``` 18 | odps.project.name = 19 | odps.access.id = 20 | odps.access.key = 21 | odps.end.point = 22 | ``` 23 | 24 | * 如果使用local模式进行调试,需要添加spark.hadoop.fs.defaultFS = file:/// 25 | ``` 26 | val spark = SparkSession 27 | .builder() 28 | .config("spark.hadoop.fs.defaultFS", "file:///") 29 | .enableHiveSupport() 30 | .getOrCreate() 31 | ``` 32 | 33 | ## Spark 3.1.1 参数配置 34 | 35 | * `spark.sql.defaultCatalog` 36 | + **配置值** `odps` 37 | * `spark.sql.catalog.odps` 38 | + **配置值** `org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog` 39 | * `spark.sql.sources.partitionOverwriteMode` 40 | + **配置值** `dynamic` 41 | * `spark.sql.extensions` 42 | + **配置值** `org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions` 43 | * `spark.sql.catalog.odps.enableVectorizedReader` 44 | + **默认值** `true` 45 | + **配置说明** `开启向量化读` 46 | * `spark.sql.catalog.odps.enableVectorizedWriter` 47 | + **默认值** `true` 48 | + **配置说明** 
`开启向量化写` 49 | * `spark.sql.catalog.odps.splitSizeInMB` 50 | + **默认值** `256` 51 | + **配置说明** `该配置可以用来调节读Maxcompute表的并发度,默认每个分区为256MB` -------------------------------------------------------------------------------- /docs/docs/overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | --- 4 | 5 | # MaxCompute Spark概述 6 | MaxCompute Spark是MaxCompute提供的兼容开源的Spark计算服务。它在统一的计算资源和数据集权限体系之上,提供Spark计算框架,支持用户以熟悉的开发使用方式提交运行Spark作业,以满足更丰富的数据处理分析场景。 7 | 8 | ## 关键特性 9 | 10 | * 支持原生多版本Spark作业 11 | > 社区原生Spark运行在MaxCompute里,完全兼容Spark的API,支持多个Spark版本同时运行。MaxCompute Spark提供原生的Spark WebUI供用户查看。 12 | 13 | * 统一的计算资源 14 | > MaxCompute Spark像MaxCompute SQL/MR等任务类型一样,运行在MaxCompute项目开通的统一计算资源中。 15 | 16 | * 统一的数据和权限管理 17 | > 完全遵循MaxCompute项目的权限体系,在访问用户权限范围内安全地查询数据。 18 | 19 | * 与开源系统相同的使用体验 20 | > MaxCompute Spark与社区开源Spark保持相同的体验(例如开源应用的UI界面、在线交互等),完全符合Spark用户使用习惯。开源应用的调试过程中需要使用开源UI,MaxCompute Spark提供原生的开源实时UI和查询历史日志的功能。其中,对于部分开源应用还支持交互式体验,在后台引擎运行后即可进行实时交互。 21 | 22 | ## 系统结构 23 | 24 | MaxCompute Spark是阿里云通过Spark on MaxCompute的解决方案,让原生Spark能够运行在MaxCompute当中。 25 | 26 | ![cupid架构图](resources/cupid_arch.png) 27 | 28 | 左侧是原生Spark的架构图,右边Spark on MaxCompute运行在阿里云自研的Cupid的平台之上,该平台可以原生支持开源社区Yarn所支持的计算框架,如Spark等。 29 | 30 | ## 约束与限制 31 | 32 | 目前MaxCompute Spark支持以下适用场景: 33 | 34 | * 离线计算场景:GraphX、Mllib、RDD、Spark-SQL、PySpark等 35 | * 读写MaxCompute Table 36 | * 引用MaxCompute中的文件资源 37 | * 读写VPC环境下的服务,如RDS、Redis、HBase、ECS上部署的服务等 38 | * 读写OSS非结构化存储 39 | 40 | 暂不支持以下场景: 41 | 42 | * 不支持交互式类需求Spark-Shell、Spark-SQL-Shell、PySpark-Shell等 43 | * 不支持访问Maxcompute外部表,函数和UDF 44 | * 只支持Local模式和Yarn-cluster模式运行 -------------------------------------------------------------------------------- /docs/docs/quickstart/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "快速开始", 3 | "position": 2, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/quickstart/dataworks-integration.md: -------------------------------------------------------------------------------- 1 | # Spark on Dataworks 2 |

# Dataworks节点使用流程

3 | 4 | * 创建资源 5 | 6 | ![image1](../resources/dataworks-1.jpg) 7 | 8 | * 上传模板项目工程编译出来的jar包 9 | 10 | ![image2](../resources/dataworks-2.jpg) 11 | 12 | * 上传之后一定要commit,也就是红色方框的按钮 13 | 14 | ![image3](../resources/dataworks-3.jpg) 15 | 16 | * 创建ODPS Spark节点 17 | 18 | ![image4](../resources/dataworks-4.jpg) 19 | 20 | * 选择刚才上传的资源并且按照spark-defaults.conf里面的配置填页面上的配置项,并提交 21 | 22 | ![image5](../resources/dataworks-5.jpg) 23 | 24 | * 点击红色方框冒烟测试,冒烟测试按钮旁边的按钮可以查看运行日志 25 | 26 | ![image6](../resources/dataworks-6.jpg) 27 | 28 | * 运行日志 29 | 30 | ![image7](../resources/dataworks-7.jpg) 31 | 32 | 33 |

# Dataworks Spark节点配置

34 | 35 | ## ODPS SPARK节点介绍 36 | 37 | 本质上ODPS SPARK节点的配置对应于spark-submit命令的参数和选项。具体来说 38 | 39 | | 节点 | spark-submit | 40 | | --- | --- | 41 | | 主java/python资源 | app jar or python file | 42 | | 配置项 | --conf PROP=VALUE | 43 | | main class | --class CLASS_NAME | 44 | | 参数 | [app arguments] | 45 | | 选择jar资源 | --jars JARS | 46 | | 选择python资源 | --py-files PY_FILES | 47 | | 选择file资源 | --files FILES | 48 | | 选择archives资源 | --archives ARCHIVES | 49 | 50 | ## 配置项 51 | 52 | 配置项对应于spark-submit命令的--conf选项,其中: 53 | 54 | * accessid,accesskey,projectname,endpoint无需配置,默认是生产账号(有特殊原因可显式配置,将覆盖默认值) 55 | 56 | * 除此之外,需要将spark-default.conf中的配置逐条加到dataworks的配置项中 57 | 58 | ## 如何传参数(如bizdate) 59 | 60 | * 同SQL节点,首先在调度->参数中添加参数 61 | 62 | ![image8](http://ata2-img.cn-hangzhou.img-pub.aliyun-inc.com/e9bcf652514ef95f463039c224d22771.png#alt=image.png)
63 | 64 | * 然后在Spark节点“参数”栏引用该参数,该参数会传给用户主类,用户在代码中解析该参数即可 65 | 66 | Java/Scala: 67 | 68 | ![image9](http://ata2-img.cn-hangzhou.img-pub.aliyun-inc.com/68c282f810d83d3efb9cf2ce2654ad10.png#alt=image.png)
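以Scala为例,一个解析该参数的最小示意如下(假设“参数”栏只配置了${bizdate}一个参数,它会作为第一个命令行参数传入;对象名与变量名仅为示例):
```
object SparkJobWithBizdate {
  def main(args: Array[String]): Unit = {
    // 假设DataWorks“参数”栏配置为 ${bizdate},则args(0)即为调度传入的业务日期
    val bizdate = if (args.nonEmpty) args(0) else ""
    println(s"bizdate = $bizdate")
    // 后续可将bizdate用于SQL拼接、分区过滤等业务逻辑
  }
}
```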
69 | 70 | Python: 71 | ![image10](http://ata2-img.cn-hangzhou.img-pub.aliyun-inc.com/42b4f596eac16f2eb55186f98d02352c.png#alt=image.png) 72 | 73 | 74 | ## 资源上传 75 | 76 | * 用户需要在DataWorks中添加任务需要的资源,这些资源在任务运行时会被上传到用户的工作目录下,资源可能包括: 77 | ``` 78 | 1. jar资源/python资源:对应于spark-submit命令的--jars,--py-files, 79 | 2. file资源,对应于spark-submit命令的--files 80 | 3. archive资源:对应于spark-submit命令的--archives,archives会默认被解压,解压后的文件名等同于资源名去掉后缀。例如上传的资源名是mnist.zip,则解压名为mnist 81 | ``` 82 | 83 | * DataWorks中上传资源限制最大为50MB,如果需要使用更大的资源,用户需要将该资源通过[MaxCompute客户端](https://help.aliyun.com/document_detail/27971.html?spm=a2c4g.11174283.6.990.3158590eUSc7JU)上传为MaxCompute资源,然后将该资源添加到数据开发中,详见[文档](https://help.aliyun.com/document_detail/137575.html?spm=a2c4g.11186623.6.813.665b1861iN9oa8) 84 | 85 | -------------------------------------------------------------------------------- /docs/docs/quickstart/runtime-mode/client-mode.md: -------------------------------------------------------------------------------- 1 | # Client 模式 2 | 为了让Spark Session作为业务框架的后端数据处理服务,MaxCompute Spark 团队开发了“Client”模式来覆盖业务框架向同一个Spark Session 动态提交多个作业、实时获取作业状态的场景。 3 | 4 | 5 | ## Client模式开发初衷 6 | 社区spark生产主要使用"yarn-cluster"、"yarn-client"两种模式。“yarn-cluster”模式将spark作业提交到集群运行,运行完毕客户端打印状态日志;这种模式无法向一个Spark 7 | Session动态多次提交作业,且客户端无法获取每个job的状态及结果。“yarn-client”模式,主要解决spark交互式场景问题,需要在客户端机器启动Driver,无法将Spark 8 | Session作为一个服务。因此我们基于Spark On MaxCompute开发了"Client"模式来解决上面的问题,该模式具有以下特点: 9 | 10 | - 客户端轻量级,不用再启动spark的Driver; 11 | - 客户端有一套API向MaxCompute集群的同一个Spark Session动态提交作业并监控状态; 12 | - 客户端可以通过监控作业状态及结果构建作业之间的依赖关系; 13 | - 用户可以动态编译应用程序jar通过客户端提交到原有的Spark Session运行; 14 | - 客户端可以集成在业务的WebServer中,且可进行水平扩展; 15 | 16 | 17 | ## Client模式简介 18 | client模式是为了解决交互式/在线任务需求。由于cluster模式必须把Driver放置在MaxCompute集群里面,如果我们有在线查询或者交互式的需求,由于网络隔离的原因,无法直接访问到Driver。 19 | 20 | 自研client模式同yarn-cluster模式一样也是把作业提交到MaxCompute集群,跟cluster模式最大的区别是client模式是由用户client端驱动,而cluster模式是由提交到计算集群的应用程序驱动。client模式把spark引擎作为一个在线服务来用,用户可以把client嵌入到在线业务系统进行实时分析。 21 | 22 | client模式提供如下接口,允许多个Spark Job串行/并行执行,并提供多个spark作业共享的Context,允许多个spark作业共享数据。 23 | ``` 24 | /** 25 | * Add the local jar file ,which contains user SparkJobs 26 | * @param localFilePath the local jar file path 27 | * @return return the jarName ,the startjob() will use 28 | */ 29 | def addJar(localFilePath: String): String 30 | 31 | /** 32 | * After add the jar,can start the sparkjob in the jar 33 | * @param className the class name in the jar 34 | * @param jarName jar name return from the addJar() 35 | * @param conf the conf when sparkjob run need 36 | * @return the jobId, getJobStatus/killJob will use 37 | */ 38 | def startJob(className: String, jarName: String, conf: Map[String,String]): String 39 | 40 | /** 41 | * get the jobstatus after the job start 42 | * @param jobId jobId return from the startJob() 43 | * @return the job status ,eg: JobStart,JobSuccess,JobFailed,JobKilled 44 | */ 45 | def getJobStatus(jobId: String): Any 46 | 47 | /** 48 | * stop the remote driver,then can not submit sparkjob 49 | */ 50 | def stopRemoteDriver() 51 | 52 | /** 53 | * kill the sparkjob running 54 | * @param jobId the jobid will kill 55 | */ 56 | def killJob(jobId: String) 57 | ``` 58 | 59 | 60 | 61 | ## Client模式作业提交方式 62 | Client模式与传统 spark-submit 命令行提交方式的最大不同在于再依赖Spark客户端。这带来了两大优势: 63 | 64 | 1. 由于摆脱了Spark客户端的依赖,用户不再需要下载配置Spark环境,大大增加了Client模式的易用性,同时降低了用户的学习成本 65 | 66 | 2. 
由于不再需要上传Spark libraries,启动Client时不再需要上传200M左右的spark libs,既节省了时间又节省了网络开销,真正做到了让用户随时随地都可以提交Spark作业 67 | 68 | 69 | Client模式提供了非常直观的提交参数接口,将在下文详细介绍。 70 | 71 | 72 | ## [](#lg4goi)提交参数接口 73 | ```java 74 | public class SubmitParam { 75 | 76 | // Primary resource 77 | private String file; 78 | 79 | // This field is for Livy, don't have to care if you're using new Client Mode 80 | private String proxyUser; 81 | 82 | // --classname, your driver's classname 83 | private String className; 84 | 85 | // --args, arguments for your spark application 86 | private List args; 87 | 88 | // --jars, extra jars to distribute to driver & executors 89 | private List jars; 90 | 91 | // --py-files, extra python files to distribute to driver & executors 92 | private List pyFiles; 93 | 94 | // --files, extra files to distribute to driver & executors 95 | private List files; 96 | 97 | // --archives, extra archives to distribute to driver & executors 98 | private List archives; 99 | 100 | // --driver-memory 101 | private String driverMemory; 102 | 103 | // --driver-cores 104 | private String driverCores; 105 | 106 | // --executor-memory 107 | private String executorMemory; 108 | 109 | // --executor-cores 110 | private String executorCores; 111 | 112 | // --num-executors 113 | private String numExecutors; 114 | 115 | // --queue, you can ignore it 116 | private String queue; 117 | 118 | // --name, name of the spark application 119 | private String name; 120 | 121 | // --conf, other spark configurations 122 | private Map conf; 123 | ... 124 | } 125 | ``` 126 | * [使用示例](https://github.com/aliyun/MaxCompute-Spark/blob/clientmode-snapshot/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/clientmode/ClientModeDemo.scala) 127 | 128 | * 由于提交时不再依赖Spark客户端,因此提交参数的接口有一定变化。现在参数统一通过SubmitParam这个接口传递,有两种传参方式: 129 | 130 | 1. 在代码中传递参数: 131 | 132 | ``` 133 | SubmitParam param = new SubmitParam(); 134 | param.setFile("/path/to/primary/resource"); 135 | param.setClassName("classname"); 136 | ``` 137 | 138 | 2. 使用配置文件: 139 | 140 | ``` 141 | SubmitParam param = new SubmitParam(); 142 | param.loadConfFromFile("/path/to/submitparam.conf"); 143 | ``` 144 | 3. 
Demo 145 | checkout到clientmode-snapshot分支 146 | ``` 147 | git checkout clientmode-snapshot 148 | ``` 149 | 编译 150 | ``` 151 | cd spark-2.x 152 | mvn clean package 153 | ``` 154 | 提交执行 155 | ``` 156 | java -cp ./odps-spark-client_2.11-0.0.1-DEV-SNAPSHOT-jar-with-dependencies.jar:./target/spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar com.aliyun.odps.spark.examples.clientmode.ClientModeDemo 157 | ``` -------------------------------------------------------------------------------- /docs/docs/quickstart/runtime-mode/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | slug: /mode 3 | sidebar_position: 1 4 | --- 5 | 6 | # 运行模式 7 | 目前MaxCompute Spark支持以下几种运行方式:client 模式,local模式,cluster模式,以及支持在DataWorks中执行。 8 | 9 | ## Local模式 10 | local模式可用于小批量数据以及计算本地验证,local模式验证通过后再提交到yarn-cluster模式 11 | 12 | **说明** 13 | 具体使用可参考[Local模式](./local-mode.md) 14 | 15 | ``` 16 | ## /path/to/MaxCompute-Spark 请指向正确的编译出来后的application jar包 17 | cd $SPARK_HOME 18 | bin/spark-submit --master local[4] --class com.aliyun.odps.spark.examples.SparkPi \ 19 | /path/to/MaxCompute-Spark/spark-2.x/target/spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 20 | ``` 21 | 22 | ## Cluster模式 23 | **说明** 24 | 具体使用可参考[Yarn Cluster模式](./yarn-cluster.md) 25 | 26 | ``` 27 | ## /path/to/MaxCompute-Spark 请指向正确的编译出来后的application jar包 28 | cd $SPARK_HOME 29 | bin/spark-submit --master yarn-cluster --class com.aliyun.odps.spark.examples.SparkPi \ 30 | /path/to/MaxCompute-Spark/spark-2.x/target/spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 31 | ``` 32 | 33 | ## 在 DataWorks 上执行 34 | Spark作业可以在DataWorks中进行调度,本质上也是采用了Yarn Cluster模式进行任务提交 35 | 36 | **说明** 37 | 具体使用可参考[Spark on Dataworks](../dataworks-integration.md) 38 | -------------------------------------------------------------------------------- /docs/docs/quickstart/runtime-mode/local-mode.md: -------------------------------------------------------------------------------- 1 | # Local 模式 2 |

# Local模式介绍

3 | 4 | * MaxCompute Spark支持用户以原生的Spark Local模式进行任务调试 5 | 6 | * 与Yarn Cluster模式类似,用户首先需要做以下准备工作 7 | ``` 8 | 1. 准备MaxCompute项目以及对应的accessId,accessKey 9 | 2. 下载MaxCompute Spark客户端 10 | 3. 环境变量准备 11 | 4. spark-defaults.conf配置 12 | 5. 下载工程模版并编译 13 | ``` 14 | 15 | * 进行任务提交 16 | ``` 17 | # Java/Scala 18 | cd $SPARK_HOME 19 | ./bin/spark-submit --master local[4] --class com.aliyun.odps.spark.examples.SparkPi \ 20 | /path/to/odps-spark-examples/spark-examples/target/spark-examples-2.0.0-SNAPSHOT-shaded.jar 21 | 22 | # PySpark 23 | cd $SPARK_HOME 24 | ./bin/spark-submit --master local[4] \ 25 | /path/to/odps-spark-examples/spark-examples/src/main/python/odps_table_rw.py 26 | ``` 27 |

# Local模式注意事项

28 | 29 | ``` 30 | 1. Local模式读写Maxcompute表慢,原因是local模式通过Tunnel来读写,读写速度相比于yarn-cluster模式要慢 31 | 32 | 2. Local模式在本地执行,有的用户会遇到local模式下可以访问通VPC,但在yarn-cluster模式下不行。 33 | 原因是local模式处于用户本机环境,网络没有隔离;而yarn-cluster模式处于Maxcompute的网络隔离环境中, 34 | 必须要配置vpc访问的相关参数才行。 35 | 36 | 3. Local模式下访问vpc的endpoint通常是外网endpoint,而yarn-cluster模式下访问vpc的endpoint通常是vpc网络endpoint 37 | 38 | 4. IDEA Local模式下需要将相关配置写在代码中,而在Yarn-Cluster模式运行时一定要将这些配置从代码中去掉 39 | ``` 40 | 41 |

# IDEA Local模式执行

42 | 43 | * Spark可以支持用户在IDEA里支持以Local[N]的模式直接运行代码,而不需要通过命令行提交,用户需要注意以下两点: 44 | ``` 45 | 1. IDEA运行Local模式是不能直接引用spark-defaults.conf里的配置,需要手动在代码里指定相关配置 46 | 47 | 2. 一定要注意需要在IDEA里手动添加MaxCompute Spark客户端的相关依赖(jars目录),否则会出现以下报错: 48 | the value of spark.sql.catalogimplementation should be one of hive in-memory but was odps 49 | ``` 50 | 51 | # 1. 在代码需要手动设置spark config 52 | 53 | ``` 54 | # spark 2.3版本 55 | val spark = SparkSession 56 | .builder() 57 | .appName("SparkPi") 58 | .config("spark.master", "local[4]") // 需设置spark.master为local[N]才能直接运行,N为并发数 59 | .config("spark.hadoop.odps.project.name", "****") 60 | .config("spark.hadoop.odps.access.id", "****") 61 | .config("spark.hadoop.odps.access.key", "****") 62 | .config("spark.sql.catalogImplementation", "odps") 63 | .config("spark.hadoop.odps.end.point", "http://service.cn.maxcompute.aliyun.com/api") 64 | .getOrCreate() 65 | 66 | # 注意,如果使用spark 2.4.5及以上的版本,需要在代码中配置spark.sql.catalogImplementation=hive,不再需要在代码中配置spark.hadoop.odps.project.name,spark.hadoop.odps.access.id,spark.hadoop.odps.access.key,spark.hadoop.odps.end.point这几个参数 67 | 只要在代码的resources目录下(类加载器能加载的目录)创建一个名为odps.conf的文件,然后添加以下配置,注意在集群模式中需要将该文件删除: 68 | 69 | odps.project.name=*** 70 | odps.access.id=*** 71 | odps.access.key=*** 72 | odps.end.point=*** 73 | 74 | 75 | 76 | 77 | 78 | ``` 79 | 80 | # 2. 在IDEA里手动添加MaxCompute Spark客户端的相关依赖(下图无法显示请参考[文档](https://github.com/aliyun/MaxCompute-Spark/wiki/12.-Github%E5%9B%BE%E7%89%87%E6%97%A0%E6%B3%95%E8%AE%BF%E9%97%AE%E7%9A%84%E9%97%AE%E9%A2%98)) 81 | 82 | ![image1](../../resources/idea-local-1.jpg) 83 | 84 | ![image2](../../resources/idea-local-2.jpg) 85 | 86 | ![image3](../../resources/idea-local-3.jpg) 87 | 88 | ![image4](../../resources/idea-local-4.jpg) 89 | 90 | ![image5](../../resources/idea-local-5.jpg) -------------------------------------------------------------------------------- /docs/docs/quickstart/runtime-mode/yarn-cluster.md: -------------------------------------------------------------------------------- 1 | # Yarn Cluster 模式 2 | 快速导航 3 | + [下载MaxCompute Spark客户端](#1) 4 | + [设置环境变量](#2) 5 | + [配置spark-defaults.conf](#3) 6 | + [准备项目工程](#4) 7 | + [SparkPi 冒烟测试](#5) 8 | ----------------- 9 | 10 |

# 下载MaxCompute Spark客户端

11 | 12 | MaxCompute Spark发布包集成了MaxCompute认证功能。作为客户端工具,它用于通过spark-submit方式提交作业到MaxCompute项目中运行。 13 | 14 | 目前Spark版本支持如下,请优先使用Spark 2以上的版本! 15 | * [spark-1.6.3](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/1.6.3-public/spark-1.6.3-public.tar.gz) 16 | 17 | 专有云: 18 | * [spark-2.3.0](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/2.3.0-odps0.33.0/spark-2.3.0-odps0.33.0.tar.gz) 19 | * [spark-2.4.5](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/2.4.5-odps0.33.4/spark-2.4.5-odps0.33.4.tar.gz) 20 | * [spark-3.1.1](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/3.1.1-odps0.33.0/spark-3.1.1-odps0.33.0.tar.gz) 21 | 22 | 公共云: 23 | * [spark-2.3.0](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/2.3.0-odps0.34.0/spark-2.3.0-odps0.34.0.tar.gz) 24 | * [spark-2.4.5](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/2.4.5-odps0.34.0/spark-2.4.5-odps0.34.0.tar.gz) 25 | * [spark-3.1.1](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/3.1.1-odps0.34.1/spark-3.1.1-odps0.34.1.tar.gz) 26 | 27 |

# 设置环境变量

28 | 29 | * JAVA_HOME设置 30 | 31 | ``` 32 | ## 推荐使用JDK 1.8 33 | export JAVA_HOME=/path/to/jdk 34 | export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar 35 | export PATH=$JAVA_HOME/bin:$PATH 36 | ``` 37 | 38 | * SPARK_HOME设置 39 | 40 | ``` 41 | ## 下载上文提到的MaxCompute Spark客户端并解压到本地任意路径 42 | ## 请不要直接设置SPARK_HOME等于以下路径下述路径仅做展示用途 43 | ## 请指向正确的路径 44 | export SPARK_HOME=/path/to/spark_extracted_package 45 | export PATH=$SPARK_HOME/bin:$PATH 46 | ``` 47 | 48 | * PySpark的用户请安装Python2.7版本,并设置PATH 49 | 50 | ``` 51 | export PATH=/path/to/python/bin/:$PATH 52 | ``` 53 | 54 | * HADOOP_CONF_DIR设置:注意Spark 2.4.5和Spark 3必须要设置该参数 55 | 56 | ``` 57 | export HADOOP_CONF_DIR=$SPARK_HOME/conf 58 | ``` 59 | 60 |

# 配置spark-defaults.conf

61 | 62 | + 第一次下载MaxCompute Spark客户端后,需要配置spark-defaults.conf 63 | + 在 $SPARK_HOME/conf/ 下面有一个文件名称为 spark-defaults.conf.template。请将其重命名为 spark-defaults.conf 后再进行相关配置(很多人会忽略这一步,导致配置无法生效) 64 | 65 | ``` 66 | ## spark-defaults.conf 67 | ## 一般来说默认的template只需要再填上MaxCompute相关的账号信息就可以使用Spark 68 | spark.hadoop.odps.project.name = 69 | spark.hadoop.odps.access.id = 70 | spark.hadoop.odps.access.key = 71 | 72 | ## 其他的配置直接采用以下参数即可 73 | spark.hadoop.odps.end.point = http://service.cn.maxcompute.aliyun.com/api 74 | spark.hadoop.odps.runtime.end.point = http://service.cn.maxcompute.aliyun-inc.com/api 75 | 76 | ##########-------注意catalog设置-------########## 77 | ### spark 2.3.0请将该参数设置为odps 78 | spark.sql.catalogImplementation=odps 79 | 80 | ### spark 2.4.5请将该参数设置为hive 81 | spark.sql.catalogImplementation=hive 82 | 83 | ### spark 3.1.1参数变化 84 | spark.sql.defaultCatalog=odps 85 | spark.sql.catalog.odps=org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog 86 | spark.sql.sources.partitionOverwriteMode=dynamic 87 | spark.sql.extensions=org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions 88 | 89 | ``` 90 | 91 | 如果有一些特殊的场景还有功能,还可以开启另外的一些配置,见[Spark配置详解](https://github.com/aliyun/MaxCompute-Spark/wiki/07.-Spark%E9%85%8D%E7%BD%AE%E8%AF%A6%E8%A7%A3) 92 | 93 | Spark 2.4.5的参数变化,详见[Spark 2.4.5使用注意事项](https://github.com/aliyun/MaxCompute-Spark/wiki/06.-Spark-2.4.5-%E4%BD%BF%E7%94%A8%E6%B3%A8%E6%84%8F%E4%BA%8B%E9%A1%B9) 94 | 95 | Spark 3.1.1的参数变化,详见[Spark 3.1.1使用注意事项](https://github.com/aliyun/MaxCompute-Spark/wiki/06.-Spark-3.1.1-%E4%BD%BF%E7%94%A8%E6%B3%A8%E6%84%8F%E4%BA%8B%E9%A1%B9) 96 | 97 |

# 准备项目工程

98 | 99 | + MaxCompute Spark提供了项目工程模版,建议开发者下载模版复制后直接在模版里开发 100 | 101 | + 可以看到模版工程里的关于spark的依赖的scope都是provided的,这个请务必不要更改,否则提交的作业无法正常运行 102 | 103 | spark-1.x 模板及编译 104 | 105 | ``` 106 | git clone https://github.com/aliyun/MaxCompute-Spark.git 107 | cd spark-1.x 108 | mvn clean package 109 | ``` 110 | 111 | spark-2.x 模板及编译 112 | 113 | ``` 114 | git clone https://github.com/aliyun/MaxCompute-Spark.git 115 | cd spark-2.x 116 | mvn clean package 117 | ``` 118 | 119 | spark-3.x 模板及编译 120 | 121 | ``` 122 | git clone https://github.com/aliyun/MaxCompute-Spark.git 123 | cd spark-3.x 124 | mvn clean package 125 | ``` 126 | 127 |

# SparkPi 冒烟测试

128 | 129 | 在完成了以上的工作后,可以来进行冒烟测试,验证MaxCompute Spark是否E2E走通,需要以下前提: 130 | 131 | * 准备MaxCompute项目以及对应的accessId,accessKey 132 | * 下载MaxCompute Spark客户端 133 | * 环境变量准备 134 | * spark-defaults.conf配置 135 | * 下载工程模版并编译 136 | 137 | 以 spark-2.x 为例,我们可以提交一个SparkPi来验证功能是否正常,提交命令如下: 138 | 139 | ``` 140 | ## /path/to/MaxCompute-Spark 请指向正确的编译出来后的application jar包 141 | 142 | ## bash环境 143 | cd $SPARK_HOME 144 | bin/spark-submit --master yarn-cluster --class com.aliyun.odps.spark.examples.SparkPi \ 145 | /path/to/MaxCompute-Spark/spark-2.x/target/spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 146 | 147 | ## 在windows环境提交 148 | cd $SPARK_HOME/bin 149 | spark-submit.cmd --master yarn-cluster --class com.aliyun.odps.spark.examples.SparkPi 150 | \path\to\MaxCompute-Spark\spark-2.x\target\spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 151 | 152 | ## 当看到以下日志则表明冒烟作业成功 153 | 19/06/11 11:57:30 INFO Client: 154 | client token: N/A 155 | diagnostics: N/A 156 | ApplicationMaster host: 11.222.166.90 157 | ApplicationMaster RPC port: 38965 158 | queue: queue 159 | start time: 1560225401092 160 | final status: SUCCEEDED 161 | ``` -------------------------------------------------------------------------------- /docs/docs/resources/ENI-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/ENI-1.png -------------------------------------------------------------------------------- /docs/docs/resources/ENI-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/ENI-2.png -------------------------------------------------------------------------------- /docs/docs/resources/ENI-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/ENI-3.png -------------------------------------------------------------------------------- /docs/docs/resources/ENI-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/ENI-4.png -------------------------------------------------------------------------------- /docs/docs/resources/ENI-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/ENI-5.png -------------------------------------------------------------------------------- /docs/docs/resources/OOM1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/OOM1.png -------------------------------------------------------------------------------- /docs/docs/resources/cloudmonitor-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/cloudmonitor-1.png -------------------------------------------------------------------------------- /docs/docs/resources/cloudmonitor-2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/cloudmonitor-2.png -------------------------------------------------------------------------------- /docs/docs/resources/cloudmonitor-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/cloudmonitor-3.png -------------------------------------------------------------------------------- /docs/docs/resources/cupid_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/cupid_arch.png -------------------------------------------------------------------------------- /docs/docs/resources/datahub-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/datahub-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/datahub-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/datahub-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/datahub-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/datahub-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-4.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-5.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-5.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-6.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-7.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dingtalk-share.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dingtalk-share.jpg -------------------------------------------------------------------------------- /docs/docs/resources/fuxisensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/fuxisensor.png -------------------------------------------------------------------------------- /docs/docs/resources/fuxisensor2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/fuxisensor2.png -------------------------------------------------------------------------------- /docs/docs/resources/idea-local-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/idea-local-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/idea-local-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/idea-local-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/idea-local-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/idea-local-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/idea-local-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/idea-local-4.jpg -------------------------------------------------------------------------------- /docs/docs/resources/idea-local-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/idea-local-5.jpg 
-------------------------------------------------------------------------------- /docs/docs/resources/jobview-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/jobview-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/jobview-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/jobview-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/jobview-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/jobview-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/jobview-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/jobview-4.jpg -------------------------------------------------------------------------------- /docs/docs/resources/jobview-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/jobview-5.jpg -------------------------------------------------------------------------------- /docs/docs/resources/log4j2-stderr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/log4j2-stderr.jpg -------------------------------------------------------------------------------- /docs/docs/resources/log4j2-stdout.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/log4j2-stdout.jpg -------------------------------------------------------------------------------- /docs/docs/resources/logview-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/logview-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/logview-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/logview-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/logview-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/logview-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/logview-4.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/logview-4.jpg -------------------------------------------------------------------------------- /docs/docs/resources/logview-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/logview-5.jpg -------------------------------------------------------------------------------- /docs/docs/resources/oss-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/oss-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/oss-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/oss-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/oss-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/oss-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/sparkui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/sparkui.png -------------------------------------------------------------------------------- /docs/docs/resources/vpc-access-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/vpc-access-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/vpc-access-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/vpc-access-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/vpc-access-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/vpc-access-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/资源申请1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/资源申请1.png -------------------------------------------------------------------------------- /docs/docs/resources/资源申请2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/资源申请2.png -------------------------------------------------------------------------------- /docs/docusaurus.config.js: 
-------------------------------------------------------------------------------- 1 | // @ts-check 2 | // `@type` JSDoc annotations allow editor autocompletion and type checking 3 | // (when paired with `@ts-check`). 4 | // There are various equivalent ways to declare your Docusaurus config. 5 | // See: https://docusaurus.io/docs/api/docusaurus-config 6 | 7 | import {themes as prismThemes} from 'prism-react-renderer'; 8 | 9 | /** @type {import('@docusaurus/types').Config} */ 10 | const config = { 11 | title: 'MaxCompute Spark', 12 | // Set the // pathname under which your site is served 13 | // For GitHub pages deployment, it is often '//' 14 | url: 'https://aliyun.github.io', 15 | baseUrl: '/MaxCompute-Spark/', 16 | 17 | // GitHub pages deployment config. 18 | // If you aren't using GitHub pages, you don't need these. 19 | organizationName: 'aliyun', // Usually your GitHub org/user name. 20 | projectName: 'MaxCompute-Spark', // Usually your repo name. 21 | trailingSlash: 'true', 22 | 23 | onBrokenAnchors: 'ignore', 24 | onBrokenLinks: 'ignore', 25 | onBrokenMarkdownLinks: 'ignore', 26 | 27 | markdown: { 28 | mermaid: true, 29 | }, 30 | themes: ['@docusaurus/theme-mermaid'], 31 | 32 | // Even if you don't use internationalization, you can use this field to set 33 | // useful metadata like html lang. For example, if your site is Chinese, you 34 | // may want to replace "en" with "zh-Hans". 35 | i18n: { 36 | defaultLocale: 'zh-Hans', 37 | locales: ['zh-Hans'], 38 | }, 39 | 40 | presets: [ 41 | [ 42 | '@docusaurus/preset-classic', 43 | { 44 | docs: { 45 | routeBasePath: '/', // Serve the docs at the site's root 46 | sidebarPath: './sidebars.js', 47 | }, 48 | blog: false, 49 | theme: { 50 | customCss: './src/css/custom.css', 51 | }, 52 | }, 53 | ], 54 | ], 55 | 56 | themeConfig: 57 | /** @type {import('@docusaurus/preset-classic').ThemeConfig} */ 58 | ({ 59 | docs: { 60 | sidebar: { 61 | hideable: true, 62 | autoCollapseCategories: true, 63 | }, 64 | }, 65 | image: 'img/logo.svg', 66 | navbar: { 67 | title: 'MaxCompute Spark', 68 | logo: { 69 | alt: 'MaxCompute Logo', 70 | src: 'img/logo.svg', 71 | }, 72 | items: [ 73 | { 74 | type: 'docSidebar', 75 | sidebarId: 'docs', 76 | position: 'left', 77 | label: '文档', 78 | }, 79 | // { 80 | // href: 'https://github.com/aliyun/aliyun-odps-jdbc', 81 | // position: 'right', 82 | // label: '使用 JDBC 链接 MaxCompute', 83 | // }, 84 | // { 85 | // type: 'docsVersionDropdown', 86 | // sidebarId: 'version', 87 | // position: 'left', 88 | // dropdownActiveClassDisabled: true, 89 | // }, 90 | // { 91 | // href: 'https://github.com/aliyun/aliyun-odps-java-sdk', 92 | // label: 'GitHub', 93 | // position: 'right', 94 | // }, 95 | ], 96 | }, 97 | prism: { 98 | theme: prismThemes.github, 99 | darkTheme: prismThemes.dracula, 100 | additionalLanguages: ['java'], 101 | }, 102 | }), 103 | }; 104 | 105 | export default config; 106 | 107 | -------------------------------------------------------------------------------- /docs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "odps-sdk-doc", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | "docusaurus": "docusaurus", 7 | "start": "docusaurus start", 8 | "build": "docusaurus build", 9 | "swizzle": "docusaurus swizzle", 10 | "deploy": "docusaurus deploy", 11 | "clear": "docusaurus clear", 12 | "serve": "docusaurus serve", 13 | "write-translations": "docusaurus write-translations", 14 | "write-heading-ids": "docusaurus write-heading-ids" 15 | }, 16 | 
"dependencies": { 17 | "@docusaurus/core": "3.5.2", 18 | "@docusaurus/preset-classic": "3.5.2", 19 | "@docusaurus/theme-mermaid": "3.5.2", 20 | "@mdx-js/react": "^3.0.0", 21 | "clsx": "^2.0.0", 22 | "prism-react-renderer": "^2.3.0", 23 | "react": "^18.0.0", 24 | "react-dom": "^18.0.0" 25 | }, 26 | "devDependencies": { 27 | "@docusaurus/module-type-aliases": "3.0.1", 28 | "@docusaurus/types": "3.0.1", 29 | "eslint": "^8.56.0", 30 | "eslint-plugin-react": "^7.33.2" 31 | }, 32 | "browserslist": { 33 | "production": [ 34 | ">0.5%", 35 | "not dead", 36 | "not op_mini all" 37 | ], 38 | "development": [ 39 | "last 3 chrome version", 40 | "last 3 firefox version", 41 | "last 5 safari version" 42 | ] 43 | }, 44 | "engines": { 45 | "node": ">=18.0" 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /docs/sidebars.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Creating a sidebar enables you to: 3 | - create an ordered group of docs 4 | - render a sidebar for each doc of that group 5 | - provide next/previous navigation 6 | 7 | The sidebars can be generated from the filesystem, or explicitly defined here. 8 | 9 | Create as many sidebars as you want. 10 | */ 11 | 12 | // @ts-check 13 | 14 | /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */ 15 | const sidebars = { 16 | // By default, Docusaurus generates a sidebar from the docs folder structure 17 | docs: [{type: 'autogenerated', dirName: '.'}], 18 | 19 | // But you can create a sidebar manually 20 | /* 21 | tutorialSidebar: [ 22 | 'intro', 23 | 'hello', 24 | { 25 | type: 'category', 26 | label: 'Tutorial', 27 | items: ['tutorial-basics/create-a-document'], 28 | }, 29 | ], 30 | */ 31 | }; 32 | 33 | export default sidebars; 34 | -------------------------------------------------------------------------------- /docs/src/components/HomepageFeatures/index.js: -------------------------------------------------------------------------------- 1 | import clsx from 'clsx'; 2 | import Heading from '@theme/Heading'; 3 | import styles from './styles.module.css'; 4 | 5 | const FeatureList = [ 6 | { 7 | title: 'Easy to Use', 8 | Svg: require('@site/static/img/undraw_docusaurus_mountain.svg').default, 9 | description: ( 10 | <> 11 | Docusaurus was designed from the ground up to be easily installed and 12 | used to get your website up and running quickly. 13 | 14 | ), 15 | }, 16 | { 17 | title: 'Focus on What Matters', 18 | Svg: require('@site/static/img/undraw_docusaurus_tree.svg').default, 19 | description: ( 20 | <> 21 | Docusaurus lets you focus on your docs, and we'll do the chores. Go 22 | ahead and move your docs into the docs directory. 23 | 24 | ), 25 | }, 26 | { 27 | title: 'Powered by React', 28 | Svg: require('@site/static/img/undraw_docusaurus_react.svg').default, 29 | description: ( 30 | <> 31 | Extend or customize your website layout by reusing React. Docusaurus can 32 | be extended while reusing the same header and footer. 33 | 34 | ), 35 | }, 36 | ]; 37 | 38 | function Feature({Svg, title, description}) { 39 | return ( 40 |
<div className={clsx('col col--4')}> 41 | <div className="text--center"> 42 | <Svg className={styles.featureSvg} role="img" /> 43 | </div> 44 | <div className="text--center padding-horiz--md"> 45 | <Heading as="h3">{title}</Heading> 46 | <p>{description}</p> 47 | </div> 48 | </div>
49 | ); 50 | } 51 | 52 | export default function HomepageFeatures() { 53 | return ( 54 |
<section className={styles.features}> 55 | <div className="container"> 56 | <div className="row"> 57 | {FeatureList.map((props, idx) => ( 58 | <Feature key={idx} {...props} /> 59 | ))} 60 | </div> 61 | </div> 62 | </section>
63 | ); 64 | } 65 | -------------------------------------------------------------------------------- /docs/src/components/HomepageFeatures/styles.module.css: -------------------------------------------------------------------------------- 1 | .features { 2 | display: flex; 3 | align-items: center; 4 | padding: 2rem 0; 5 | width: 100%; 6 | } 7 | 8 | .featureSvg { 9 | height: 200px; 10 | width: 200px; 11 | } 12 | -------------------------------------------------------------------------------- /docs/src/css/custom.css: -------------------------------------------------------------------------------- 1 | /** 2 | * Any CSS included here will be global. The classic template 3 | * bundles Infima by default. Infima is a CSS framework designed to 4 | * work well for content-centric websites. 5 | */ 6 | 7 | /* You can override the default Infima variables here. */ 8 | :root { 9 | --ifm-color-primary: #2e8555; 10 | --ifm-color-primary-dark: #29784c; 11 | --ifm-color-primary-darker: #277148; 12 | --ifm-color-primary-darkest: #205d3b; 13 | --ifm-color-primary-light: #33925d; 14 | --ifm-color-primary-lighter: #359962; 15 | --ifm-color-primary-lightest: #3cad6e; 16 | --ifm-code-font-size: 95%; 17 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); 18 | } 19 | 20 | /* For readability concerns, you should choose a lighter palette in dark mode. */ 21 | [data-theme='dark'] { 22 | --ifm-color-primary: #25c2a0; 23 | --ifm-color-primary-dark: #21af90; 24 | --ifm-color-primary-darker: #1fa588; 25 | --ifm-color-primary-darkest: #1a8870; 26 | --ifm-color-primary-light: #29d5b0; 27 | --ifm-color-primary-lighter: #32d8b4; 28 | --ifm-color-primary-lightest: #4fddbf; 29 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3); 30 | } 31 | -------------------------------------------------------------------------------- /docs/src/locales.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "zh-cn", 4 | "sourceDir": "docs/zh" 5 | }, 6 | { 7 | "id": "en-us", 8 | "sourceDir": "docs/en" 9 | } 10 | ] -------------------------------------------------------------------------------- /docs/src/pages/index.js: -------------------------------------------------------------------------------- 1 | import clsx from 'clsx'; 2 | import Link from '@docusaurus/Link'; 3 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; 4 | import Layout from '@theme/Layout'; 5 | import Heading from '@theme/Heading'; 6 | import styles from './index.module.css'; 7 | 8 | 9 | /** 10 | * FIXME: 理论上有办法让用户直接进入到文档界面,而不是进入一个只有“进入文档”入口的标题界面。 11 | */ 12 | function HomepageHeader() { 13 | const {siteConfig} = useDocusaurusContext(); 14 | return ( 15 |
16 |
17 | 18 | MaxCompute Spark 使用文档 19 | 20 |
21 | 24 | 进入文档 📚 25 | 26 |
27 |
28 |
29 | ); 30 | } 31 | 32 | export default function Home() { 33 | const {siteConfig} = useDocusaurusContext(); 34 | return ( 35 | 38 | 39 | 40 | ); 41 | } 42 | -------------------------------------------------------------------------------- /docs/src/pages/index.module.css: -------------------------------------------------------------------------------- 1 | /** 2 | * CSS files with the .module.css suffix will be treated as CSS modules 3 | * and scoped locally. 4 | */ 5 | 6 | .heroBanner { 7 | padding: 4rem 0; 8 | text-align: center; 9 | position: relative; 10 | overflow: hidden; 11 | } 12 | 13 | @media screen and (max-width: 996px) { 14 | .heroBanner { 15 | padding: 2rem; 16 | } 17 | } 18 | 19 | .buttons { 20 | display: flex; 21 | align-items: center; 22 | justify-content: center; 23 | } 24 | -------------------------------------------------------------------------------- /docs/static/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/static/.nojekyll -------------------------------------------------------------------------------- /docs/static/img/docusaurus-social-card.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/static/img/docusaurus-social-card.jpg -------------------------------------------------------------------------------- /docs/static/img/docusaurus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/static/img/docusaurus.png -------------------------------------------------------------------------------- /docs/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/static/img/favicon.ico -------------------------------------------------------------------------------- /docs/static/img/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hook/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | STAGE_FILES=$(git diff --cached --name-only --diff-filter=ACM) 4 | 5 | echo 'check sensitive information ...' 6 | FAIL=0 7 | for FILE in $STAGE_FILES 8 | do 9 | grep --color -Hni -E "(ssh-rsa|authorized_keys|id_dsa|ssh-keygen)" $FILE && FAIL=1 10 | grep --color -Hni -E "(private key|secret|signature|accessid|access_id|access_key|accesskey|access_|password)(.*?)(\=|\:)(\s*)(\'|\")[^\$^%][^)]+(\'|\")[^)]*$" $FILE && FAIL=1 11 | done 12 | 13 | if [ ${FAIL} == 0 ]; then 14 | echo 'check sensitive information ... passed' 15 | exit 0 16 | else 17 | echo 'check sensitive information ... failed' 18 | exit 1 19 | fi 20 | -------------------------------------------------------------------------------- /spark-1.x/src/main/java/com/aliyun/odps/spark/examples/sparksql/JavaSparkSQL.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.sparksql; 20 | 21 | import com.aliyun.odps.Odps; 22 | import com.aliyun.odps.cupid.CupidSession; 23 | import org.apache.spark.SparkConf; 24 | import org.apache.spark.api.java.JavaSparkContext; 25 | import org.apache.spark.api.java.JavaRDD; 26 | import org.apache.spark.api.java.function.Function; 27 | import org.apache.spark.sql.odps.OdpsContext; 28 | import org.apache.spark.sql.DataFrame; 29 | import org.apache.spark.sql.Row; 30 | import org.apache.spark.sql.RowFactory; 31 | 32 | import org.apache.spark.sql.types.*; 33 | 34 | import java.util.ArrayList; 35 | import java.util.List; 36 | 37 | import org.apache.spark.sql.types.StructField; 38 | 39 | public class JavaSparkSQL { 40 | 41 | public static void main(String[] args) { 42 | SparkConf conf = new SparkConf() 43 | .set("spark.hadoop.odps.exec.dynamic.partition.mode", "nonstrict") 44 | .setAppName("sparkSQL"); 45 | JavaSparkContext sc = new JavaSparkContext(conf); 46 | OdpsContext odpsContext = new OdpsContext(sc); 47 | 48 | String project = sc.getConf().get("odps.project.name"); 49 | String tableName = "mc_test_table"; 50 | String tableNameCopy = "mc_test_table_copy"; 51 | String ptTableName = "mc_test_pt_table"; 52 | 53 | 54 | odpsContext.sql("DROP TABLE IF EXISTS " + tableName); 55 | odpsContext.sql("DROP TABLE IF EXISTS " + tableNameCopy); 56 | odpsContext.sql("DROP TABLE IF EXISTS " + ptTableName); 57 | 58 | 59 | odpsContext.sql("CREATE TABLE " + tableName + " (name STRING, num BIGINT)"); 60 | odpsContext.sql("CREATE TABLE " + ptTableName+ " (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)"); 61 | 62 | odpsContext.sql("DESCRIBE " + tableName); 63 | odpsContext.sql("DESCRIBE " + ptTableName); 64 | 65 | List data = new ArrayList(); 66 | for (int i = 0; i < 100 ; i++) { 67 | data.add(i); 68 | } 69 | 70 | JavaRDD dfRDD = sc.parallelize(data, 2).map(new Function() { 71 | public Row call(Integer i) { 72 | return RowFactory.create( 73 | "name-" + i.toString(), 74 | Long.valueOf(i)); 75 | } 76 | }); 77 | 78 | JavaRDD ptDfRDD = sc.parallelize(data, 2).map(new Function() { 79 | public Row call(Integer i) { 80 | return RowFactory.create( 81 | "name-" + i.toString(), 82 | Long.valueOf(i), 83 | "2018", 84 | "0601"); 85 | } 86 | }); 87 | 88 | List structFilelds = new ArrayList(); 89 | structFilelds.add(DataTypes.createStructField("name", DataTypes.StringType, true)); 90 | structFilelds.add(DataTypes.createStructField("num", DataTypes.LongType, true)); 91 | DataFrame df = odpsContext.createDataFrame(dfRDD, DataTypes.createStructType(structFilelds)); 92 | 93 | structFilelds.add(DataTypes.createStructField("pt1", DataTypes.StringType, true)); 94 | structFilelds.add(DataTypes.createStructField("pt2", DataTypes.StringType, true)); 95 | DataFrame ptDf = odpsContext.createDataFrame(ptDfRDD, DataTypes.createStructType(structFilelds)); 96 | 97 | // 写 普通表 98 | df.write().insertInto(tableName); // insertInto语义 99 | df.write().mode("overwrite").insertInto(tableName);// insertOverwrite语义 100 | 101 | // 读 普通表 102 | DataFrame rdf =odpsContext.sql("select name, num from "+ tableName); 103 | System.out.println("rdf count: "+ rdf.count()); 104 | 
rdf.printSchema(); 105 | 106 | //create table as select 107 | odpsContext.sql("CREATE TABLE " + tableNameCopy +" AS SELECT name, num FROM " + tableName); 108 | odpsContext.sql("SELECT * FROM " + tableNameCopy).show(); 109 | 110 | // 写 分区表 111 | // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 112 | df.registerTempTable(ptTableName +"_tmp_view"); 113 | odpsContext.sql("insert into table " + ptTableName + " partition (pt1='2018', pt2='0601') select * from " + ptTableName + "_tmp_view"); 114 | odpsContext.sql("insert overwrite table " + ptTableName+ " partition (pt1='2018', pt2='0601') select * from " + ptTableName+ "_tmp_view"); 115 | 116 | ptDf.write().partitionBy("pt1", "pt2").insertInto(ptTableName);// 动态分区 insertInto语义 117 | ptDf.write().partitionBy("pt1", "pt2").mode("overwrite").insertInto(ptTableName); // 动态分区 insertOverwrite语义 118 | 119 | // 读 分区表 120 | DataFrame rptdf = odpsContext.sql("select name, num, pt1, pt2 from " + ptTableName + " where pt1 = '2018' and pt2 = '0601'"); 121 | System.out.println("rptdf count: "+ rptdf.count()); 122 | rptdf.printSchema(); 123 | 124 | 125 | Odps odps = CupidSession.get().odps(); 126 | System.out.println(odps.tables().get(ptTableName).getPartitions().size()); 127 | System.out.println(odps.tables().get(ptTableName).getPartitions().get(0).getPartitionSpec()); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /spark-1.x/src/main/python/spark_sql.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext, SparkConf 2 | from pyspark.sql import OdpsContext 3 | 4 | if __name__ == '__main__': 5 | conf = SparkConf().setAppName("odps_pyspark") 6 | sc = SparkContext(conf=conf) 7 | sql_context = OdpsContext(sc) 8 | sql_context.sql("DROP TABLE IF EXISTS spark_sql_test_table") 9 | sql_context.sql("CREATE TABLE spark_sql_test_table(name STRING, num BIGINT)") 10 | sql_context.sql("INSERT INTO TABLE spark_sql_test_table SELECT 'abc', 100000") 11 | sql_context.sql("SELECT * FROM spark_sql_test_table").show() 12 | sql_context.sql("SELECT COUNT(*) FROM spark_sql_test_table").show() -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import scala.math.random 22 | 23 | import org.apache.spark._ 24 | 25 | object SparkPi { 26 | def main(args: Array[String]) { 27 | val conf = new SparkConf().setAppName("Spark Pi") 28 | val sc = new SparkContext(conf) 29 | try { 30 | val slices = if (args.length > 0) args(0).toInt else 2 31 | val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow 32 | val count = sc.parallelize(1 until n, slices).map { i => 33 | val x = random * 2 - 1 34 | val y = random * 2 - 1 35 | if (x * x + y * y < 1) 1 else 0 36 | }.reduce(_ + _) 37 | println("Pi is roughly " + 4.0 * count / n) 38 | } finally { 39 | sc.stop() 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/WordCount.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.SparkConf 23 | 24 | object WordCount { 25 | def main(args: Array[String]) { 26 | val conf = new SparkConf().setAppName("WordCount") 27 | val sc = new SparkContext(conf) 28 | try { 29 | sc.parallelize(1 to 100, 10).map(word => (word, 1)).reduceByKey(_ + _, 10).take(100).foreach(println) 30 | } finally { 31 | sc.stop() 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/graphx/PageRank.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.graphx 20 | 21 | import org.apache.spark.graphx.{Edge, Graph, VertexId} 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.{SparkConf, SparkContext} 24 | 25 | object PageRank { 26 | def main(args: Array[String]): Unit = { 27 | val conf = new SparkConf().setAppName("PageRank") 28 | val sc = new SparkContext(conf) 29 | 30 | // build vertices 31 | val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( 32 | "1,BarackObama,Barack Obama", 33 | "2,ladygaga,Goddess of Love", 34 | "3,jeresig,John Resig", 35 | "4,justinbieber,Justin Bieber", 36 | "6,matei_zaharia,Matei Zaharia", 37 | "7,odersky,Martin Odersky", 38 | "8,anonsys" 39 | ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) 40 | 41 | // build edges 42 | val followers: RDD[Edge[Double]] = sc.parallelize(Array( 43 | Edge(2L, 1L, 1.0), 44 | Edge(4L, 1L, 1.0), 45 | Edge(1L, 2L, 1.0), 46 | Edge(6L, 3L, 1.0), 47 | Edge(7L, 3L, 1.0), 48 | Edge(7L, 6L, 1.0), 49 | Edge(6L, 7L, 1.0), 50 | Edge(3L, 7L, 1.0) 51 | )) 52 | 53 | // build graph 54 | val followerGraph: Graph[Array[String], Double] = Graph(users, followers) 55 | 56 | // restrict the graph to users with usernames and names 57 | val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) 58 | 59 | // compute PageRank 60 | val pageRankGraph = subgraph.pageRank(0.001) 61 | 62 | // get attributes of the top pagerank users 63 | val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { 64 | case (uid, attrList, Some(pr)) => (pr, attrList.toList) 65 | case (uid, attrList, None) => (0.0, attrList.toList) 66 | } 67 | 68 | println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/mllib/KmeansModelSaveToOss.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.mllib 20 | 21 | import org.apache.spark.mllib.clustering.KMeans._ 22 | import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} 23 | import org.apache.spark.mllib.linalg.Vectors 24 | import org.apache.spark.{SparkConf, SparkContext} 25 | 26 | object KmeansModelSaveToOss { 27 | val modelOssDir = "oss://bucket/kmeans-model" 28 | 29 | def main(args: Array[String]) { 30 | 31 | //1. train and save the model 32 | val conf = new SparkConf().setAppName("KmeansModelSaveToOss") 33 | conf.set("spark.hadoop.fs.oss.credentials.provider", "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider") 34 | conf.set("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole") 35 | conf.set("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com") 36 | 37 | val sc = new SparkContext(conf) 38 | val points = Seq( 39 | Vectors.dense(0.0, 0.0), 40 | Vectors.dense(0.0, 0.1), 41 | Vectors.dense(0.1, 0.0), 42 | Vectors.dense(9.0, 0.0), 43 | Vectors.dense(9.0, 0.2), 44 | Vectors.dense(9.2, 0.0) 45 | ) 46 | val rdd = sc.parallelize(points, 3) 47 | val initMode = K_MEANS_PARALLEL 48 | val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode) 49 | val predictResult1 = rdd.map(feature => "cluster id: " + model.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 50 | println("modelOssDir=" + modelOssDir) 51 | model.save(sc, modelOssDir) 52 | 53 | //2. predict from the oss model 54 | val modelLoadOss = KMeansModel.load(sc, modelOssDir) 55 | val predictResult2 = rdd.map(feature => "cluster id: " + modelLoadOss.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 56 | assert(predictResult1.size == predictResult2.size) 57 | predictResult2.foreach(result2 => assert(predictResult1.contains(result2))) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/oss/SparkUnstructuredDataCompute.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.oss 20 | 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | object SparkUnstructuredDataCompute { 24 | def main(args: Array[String]) { 25 | val conf = new SparkConf().setAppName("SparkUnstructuredDataCompute") 26 | conf.set("spark.hadoop.fs.oss.credentials.provider", "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider") 27 | conf.set("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole") 28 | conf.set("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com") 29 | val sc = new SparkContext(conf) 30 | try { 31 | val pathIn = "oss://bucket/inputdata/" 32 | val inputData = sc.textFile(pathIn, 5) 33 | val cnt = inputData.count 34 | println(s"count: $cnt") 35 | } finally { 36 | sc.stop() 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/sparksql/SparkSQL.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.sparksql 20 | 21 | import org.apache.spark.sql.odps.OdpsContext 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | 24 | object SparkSQL { 25 | def main(args: Array[String]) { 26 | val conf = new SparkConf().setAppName("sparkSQL") 27 | val sc = new SparkContext(conf) 28 | val sqlContext = new OdpsContext(sc) 29 | import sqlContext._ 30 | 31 | val project = sc.getConf.get("odps.project.name") 32 | import sqlContext.implicits._ 33 | val tableName = "mc_test_table" 34 | val ptTableName = "mc_test_pt_table" 35 | // Drop Create 36 | sql(s"DROP TABLE IF EXISTS ${tableName}") 37 | sql(s"DROP TABLE IF EXISTS ${ptTableName}") 38 | 39 | sql(s"CREATE TABLE ${tableName} (name STRING, num BIGINT)") 40 | sql(s"CREATE TABLE ${ptTableName} (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)") 41 | 42 | val df = sc.parallelize(0 to 99, 2).map(f => { 43 | (s"name-$f", f) 44 | }).toDF("name", "num") 45 | 46 | val ptDf = sc.parallelize(0 to 99, 2).map(f => { 47 | (s"name-$f", f, "2018", "0601") 48 | }).toDF("name", "num", "pt1", "pt2") 49 | 50 | // 写 普通表 51 | df.write.insertInto(tableName) // insertInto语义 52 | df.write.mode("overwrite").insertInto(tableName) // insertOverwrite语义 53 | 54 | // 写 分区表 55 | // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 56 | df.registerTempTable(s"${ptTableName}_tmp_view") 57 | sql(s"insert into table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 58 | sql(s"insert overwrite table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 59 | 60 | ptDf.write.partitionBy("pt1", "pt2").insertInto(ptTableName) // 动态分区 insertInto语义 61 | ptDf.write.partitionBy("pt1", "pt2").mode("overwrite").insertInto(ptTableName) // 动态分区 insertOverwrite语义 62 | 63 | // 读 普通表 64 | val rdf = sql(s"select name, num from $tableName") 65 | println(s"rdf count, ${rdf.count()}") 66 | rdf.printSchema() 67 | 68 | // 读 分区表 69 | val rptdf = sql(s"select name, num, pt1, pt2 from $ptTableName where pt1 = '2018' and pt2 = '0601'") 70 | println(s"rptdf count, ${rptdf.count()}") 71 | rptdf.printSchema() 72 | } 73 | } -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/udf/SparkUDF.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.udf 2 | 3 | import org.apache.spark.sql.odps.OdpsContext 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | object SparkUDF { 7 | def main(args: Array[String]) { 8 | val conf = new SparkConf().setAppName("sparkUDF") 9 | val sc = new SparkContext(conf) 10 | 11 | val sqlContext = new OdpsContext(sc) 12 | import sqlContext._ 13 | 14 | sql("DROP TABLE IF EXISTS spark_sql_test_partition_table") 15 | sql("CREATE TABLE spark_sql_test_partition_table(name STRING, num BIGINT) PARTITIONED BY (p1 STRING, p2 STRING)") 16 | 17 | sql("INSERT INTO TABLE spark_sql_test_partition_table PARTITION (p1='2020',p2='hangzhou') SELECT 'hz', 400") 18 | sql("INSERT INTO TABLE spark_sql_test_partition_table PARTITION (p1='2020',p2='shanghai') SELECT 'sh', 
500") 19 | sql("INSERT INTO TABLE spark_sql_test_partition_table PARTITION (p1='2020',p2='hangzhou') SELECT 'hz', 600") 20 | 21 | try { 22 | udf.register("myUpper", (input: String) => input.toUpperCase) 23 | val funcs = sql("SHOW FUNCTIONS myupper").collect() 24 | funcs foreach println 25 | assert(funcs.length == 1) 26 | val data = sql("SELECT myupper(name) FROM spark_sql_test_partition_table WHERE name = 'hz'").collect() 27 | assert(data(0).get(0) == "HZ") 28 | println("======= test register udf success ======") 29 | } catch { 30 | case e: Throwable => 31 | e.printStackTrace(System.out) 32 | throw e 33 | } 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /spark-2.x/libs/jindofs-sdk-3.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/spark-2.x/libs/jindofs-sdk-3.7.2.jar -------------------------------------------------------------------------------- /spark-2.x/src/main/java/com/aliyun/odps/spark/examples/utils/ConfigLog4j2.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.utils; 2 | 3 | import org.apache.logging.log4j.Level; 4 | import org.apache.logging.log4j.LogManager; 5 | import org.apache.logging.log4j.core.Appender; 6 | import org.apache.logging.log4j.core.LoggerContext; 7 | import org.apache.logging.log4j.core.appender.ConsoleAppender; 8 | import org.apache.logging.log4j.core.config.AppenderRef; 9 | import org.apache.logging.log4j.core.config.Configuration; 10 | import org.apache.logging.log4j.core.config.LoggerConfig; 11 | import org.apache.logging.log4j.core.layout.PatternLayout; 12 | 13 | public class ConfigLog4j2 { 14 | 15 | private static final LoggerContext CONTEXT; 16 | public static final String DEFAULT_APPENDER = "MY_STDOUT"; 17 | public static final String 18 | DEFAULT_PATTERN = 19 | "%d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %-5level %logger - %msg %ex %n"; 20 | 21 | static { 22 | CONTEXT = (LoggerContext) LogManager.getContext(false); 23 | } 24 | 25 | /** 26 | * @Description: add specific logger for specific package 27 | * @Param: packageName, such as com.xx.yy 28 | * @return: void 29 | * @Author: lcj265802@alibaba-inc.com 30 | * @Date: 2020/12/29 31 | */ 32 | public static void initPackageLogger(String packageName) { 33 | LoggerContext loggerContext = CONTEXT; 34 | Configuration config = loggerContext.getConfiguration(); 35 | 36 | ConsoleAppender.Builder builder = ConsoleAppender.newBuilder(); 37 | builder.setName(DEFAULT_APPENDER); 38 | builder.setLayout(PatternLayout.newBuilder().withPattern(DEFAULT_PATTERN).build()); 39 | Appender stdoutAppender = builder.setTarget(ConsoleAppender.Target.SYSTEM_OUT).build(); 40 | stdoutAppender.start(); 41 | 42 | config.addAppender(stdoutAppender); 43 | 44 | AppenderRef ref = AppenderRef.createAppenderRef(DEFAULT_APPENDER, null, null); 45 | AppenderRef[] refs = new AppenderRef[]{ref}; 46 | 47 | LoggerConfig 48 | loggerConfig = 49 | LoggerConfig.createLogger(false, Level.INFO, packageName, 50 | "true", refs, null, config, null); 51 | loggerConfig.addAppender(stdoutAppender, null, null); 52 | config.addLogger(packageName, loggerConfig); 53 | 54 | loggerContext.updateLoggers(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark-2.x/src/main/python/spark_oss.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | try: 6 | # for python 2 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | except: 10 | # python 3 not needed 11 | pass 12 | 13 | if __name__ == '__main__': 14 | spark = SparkSession.builder\ 15 | .config("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS")\ 16 | .config("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem")\ 17 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com")\ 18 | .config("spark.hadoop.fs.oss.accessKeyId", "xxx")\ 19 | .config("spark.hadoop.fs.oss.accessKeySecret", "xxx")\ 20 | .appName("spark write df to oss")\ 21 | 22 | .getOrCreate() 23 | 24 | data = [i for i in range(0, 100)] 25 | 26 | df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int") 27 | 28 | df.show(n=10) 29 | 30 | # write to oss 31 | pathout = 'oss://[bucket]/test.csv' 32 | df.write.csv(pathout) 33 | -------------------------------------------------------------------------------- /spark-2.x/src/main/python/spark_sql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | try: 6 | # for python 2 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | except: 10 | # python 3 not needed 11 | pass 12 | 13 | if __name__ == '__main__': 14 | spark = SparkSession.builder\ 15 | .appName("spark sql")\ 16 | .config("spark.sql.broadcastTimeout", 20 * 60)\ 17 | .config("spark.sql.crossJoin.enabled", True)\ 18 | .config("odps.exec.dynamic.partition.mode", "nonstrict")\ 19 | .getOrCreate() 20 | 21 | tableName = "mc_test_table" 22 | ptTableName = "mc_test_pt_table" 23 | data = [i for i in range(0, 100)] 24 | 25 | # Drop Create 26 | spark.sql("DROP TABLE IF EXISTS %s" % tableName) 27 | spark.sql("DROP TABLE IF EXISTS %s" % ptTableName) 28 | 29 | spark.sql("CREATE TABLE %s (name STRING, num BIGINT)" % tableName) 30 | spark.sql("CREATE TABLE %s (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)" % ptTableName) 31 | 32 | df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int") 33 | pt_df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s, "2018", "0601")).toDF("name: string, num: int, pt1: string, pt2: string") 34 | 35 | # 写 普通表 36 | df.write.insertInto(tableName) # insertInto语义 37 | df.write.insertInto(tableName, True) # insertOverwrite语义 38 | 39 | # 写 分区表 40 | # DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 41 | df.createOrReplaceTempView("%s_tmp_view" % ptTableName) 42 | spark.sql("insert into table %s partition (pt1='2018', pt2='0601') select * from %s_tmp_view" % (ptTableName, ptTableName)) 43 | spark.sql("insert overwrite table %s partition (pt1='2018', pt2='0601') select * from %s_tmp_view" % (ptTableName, ptTableName)) 44 | 45 | pt_df.write.insertInto(ptTableName) # 动态分区 insertInto语义 46 | pt_df.write.insertInto(ptTableName, True) # 动态分区 insertOverwrite语义 47 | 48 | # 读 普通表 49 | rdf = spark.sql("select name, num from %s" % tableName) 50 | print("rdf count, %s\n" % rdf.count()) 51 | rdf.printSchema() 52 | 53 | # 读 分区表 54 | rptdf = spark.sql("select name, num, pt1, pt2 from %s where pt1 = '2018' and pt2 = '0601'" % ptTableName) 55 | print("rptdf count, %s" % (rptdf.count())) 56 | rptdf.printSchema() 57 | 58 | 59 | 
-------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | import scala.math.random 24 | 25 | object SparkPi { 26 | def main(args: Array[String]) { 27 | val spark = SparkSession 28 | .builder() 29 | .appName("SparkPi") 30 | .getOrCreate() 31 | val sc = spark.sparkContext 32 | 33 | try { 34 | val slices = if (args.length > 0) args(0).toInt else 2 35 | val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow 36 | val count = sc.parallelize(1 until n, slices).map { i => 37 | val x = random * 2 - 1 38 | val y = random * 2 - 1 39 | if (x * x + y * y < 1) 1 else 0 40 | }.reduce(_ + _) 41 | println("Pi is roughly " + 4.0 * count / n) 42 | } finally { 43 | sc.stop() 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/WordCount.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | object WordCount { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .appName("WordCount") 28 | .getOrCreate() 29 | val sc = spark.sparkContext 30 | 31 | try { 32 | sc.parallelize(1 to 100, 10).map(word => (word, 1)).reduceByKey(_ + _, 10).take(100).foreach(println) 33 | } finally { 34 | sc.stop() 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/graphx/PageRank.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.graphx 20 | 21 | import org.apache.spark.graphx.{Edge, Graph, VertexId} 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | object PageRank { 26 | def main(args: Array[String]): Unit = { 27 | val spark = SparkSession 28 | .builder() 29 | .appName("PageRank") 30 | .getOrCreate() 31 | val sc = spark.sparkContext 32 | 33 | // build vertices 34 | val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( 35 | "1,BarackObama,Barack Obama", 36 | "2,ladygaga,Goddess of Love", 37 | "3,jeresig,John Resig", 38 | "4,justinbieber,Justin Bieber", 39 | "6,matei_zaharia,Matei Zaharia", 40 | "7,odersky,Martin Odersky", 41 | "8,anonsys" 42 | ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) 43 | 44 | // build edges 45 | val followers: RDD[Edge[Double]] = sc.parallelize(Array( 46 | Edge(2L, 1L, 1.0), 47 | Edge(4L, 1L, 1.0), 48 | Edge(1L, 2L, 1.0), 49 | Edge(6L, 3L, 1.0), 50 | Edge(7L, 3L, 1.0), 51 | Edge(7L, 6L, 1.0), 52 | Edge(6L, 7L, 1.0), 53 | Edge(3L, 7L, 1.0) 54 | )) 55 | 56 | // build graph 57 | val followerGraph: Graph[Array[String], Double] = Graph(users, followers) 58 | 59 | // restrict the graph to users with usernames and names 60 | val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) 61 | 62 | // compute PageRank 63 | val pageRankGraph = subgraph.pageRank(0.001) 64 | 65 | // get attributes of the top pagerank users 66 | val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { 67 | case (uid, attrList, Some(pr)) => (pr, attrList.toList) 68 | case (uid, attrList, None) => (0.0, attrList.toList) 69 | } 70 | 71 | println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/log4j2/Logger.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.log4j2 2 | 3 | import org.apache.logging.log4j 4 | import org.apache.logging.log4j.LogManager 5 | 6 | trait Logger { 7 | val log: log4j.Logger = LogManager.getLogger(this.getClass) 8 | log 9 | } 10 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/log4j2/SimpleWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.log4j2 2 | 3 | import com.aliyun.odps.spark.examples.utils.ConfigLog4j2 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object SimpleWordCount extends Logger { 8 | def main(args: Array[String]): Unit = { 9 | 10 | ConfigLog4j2.initPackageLogger("com.aliyun.odps.spark.examples.log4j2") 11 | val spark: SparkSession = SparkSession 12 | .builder() 13 | .appName("WordCount") 14 | .getOrCreate() 15 | 16 | log.info("My Test!") 17 | val wordList = List("Hello", "World", "Hello") 18 | val rdd: RDD[String] = spark.sparkContext.parallelize(Seq(wordList: _*)).cache() 19 | val resultRDD: 
RDD[(String, Int)] = rdd.map(w => (w, 1)).reduceByKey(_ + _) 20 | resultRDD.collect().foreach(v => { 21 | log.info(s"${v._1} has num ${v._2}") 22 | }) 23 | 24 | spark.stop() 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/mllib/KmeansModelSaveToOss.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.mllib 20 | 21 | import org.apache.spark.mllib.clustering.KMeans._ 22 | import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} 23 | import org.apache.spark.mllib.linalg.Vectors 24 | import org.apache.spark.sql.SparkSession 25 | 26 | object KmeansModelSaveToOss { 27 | val modelOssDir = "oss://[bucket]/kmeans-model" 28 | 29 | def main(args: Array[String]) { 30 | 31 | //1. train and save the model 32 | val spark = SparkSession 33 | .builder() 34 | .config("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 35 | .config("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 36 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 37 | .config("spark.hadoop.fs.oss.accessKeyId", "xxx") 38 | .config("spark.hadoop.fs.oss.accessKeySecret", "xxx") 39 | .appName("KmeansModelSaveToOss") 40 | .getOrCreate() 41 | 42 | val sc = spark.sparkContext 43 | val points = Seq( 44 | Vectors.dense(0.0, 0.0), 45 | Vectors.dense(0.0, 0.1), 46 | Vectors.dense(0.1, 0.0), 47 | Vectors.dense(9.0, 0.0), 48 | Vectors.dense(9.0, 0.2), 49 | Vectors.dense(9.2, 0.0) 50 | ) 51 | val rdd = sc.parallelize(points, 3) 52 | val initMode = K_MEANS_PARALLEL 53 | val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode) 54 | val predictResult1 = rdd.map(feature => "cluster id: " + model.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 55 | println("modelOssDir=" + modelOssDir) 56 | model.save(sc, modelOssDir) 57 | 58 | //2. predict from the oss model 59 | val modelLoadOss = KMeansModel.load(sc, modelOssDir) 60 | val predictResult2 = rdd.map(feature => "cluster id: " + modelLoadOss.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 61 | assert(predictResult1.size == predictResult2.size) 62 | predictResult2.foreach(result2 => assert(predictResult1.contains(result2))) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/oss/JindoFsDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.oss 20 | 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | object JindoFsDemo { 24 | def main(args: Array[String]): Unit = { 25 | val bucket : String = args(0) 26 | val ossPath : String = args(1) 27 | 28 | //using access-key-id/access-key-secret 29 | val conf = new SparkConf() 30 | .setAppName("jindo-fs-demo") 31 | .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 32 | .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 33 | .set("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 34 | .set("spark.hadoop.fs.oss.accessKeyId", "xxx") 35 | .set("spark.hadoop.fs.oss.accessKeySecret", "xxx") 36 | 37 | val sc = new SparkContext(conf) 38 | 39 | try { 40 | read_oss_dir(sc, "demo", s"oss://${bucket}/${ossPath}") 41 | } finally { 42 | sc.stop() 43 | } 44 | } 45 | 46 | /** 47 | * compute cost time using jindo sdk 48 | */ 49 | def read_oss_dir(sc: SparkContext, job_des:String, ossPath: String): Unit = { 50 | val startTime: Long = System.currentTimeMillis() 51 | val inputData = sc.textFile(ossPath, 20) 52 | val cnt = inputData.count 53 | val endTime:Long = System.currentTimeMillis() 54 | val cost:Long = endTime - startTime 55 | println(s"job:$job_des, count:$cnt, consume:$cost") 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/oss/SparkUnstructuredDataCompute.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.oss 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | object SparkUnstructuredDataCompute { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .config("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 28 | .config("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 29 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 30 | .config("spark.hadoop.fs.oss.accessKeyId", "xxx") 31 | .config("spark.hadoop.fs.oss.accessKeySecret", "xxx") 32 | .appName("SparkUnstructuredDataCompute") 33 | .getOrCreate() 34 | 35 | val sc = spark.sparkContext 36 | try { 37 | val pathIn = "oss://bucket/inputdata/" 38 | val inputData = sc.textFile(pathIn, 5) 39 | val cnt = inputData.count 40 | println(s"count: $cnt") 41 | } finally { 42 | sc.stop() 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/sparksql/SparkSQL.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.sparksql 20 | 21 | import org.apache.spark.sql.{SaveMode, SparkSession} 22 | 23 | object SparkSQL { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .appName("SparkSQL-on-MaxCompute") 28 | .config("spark.sql.broadcastTimeout", 20 * 60) 29 | .config("spark.sql.crossJoin.enabled", true) 30 | .config("odps.exec.dynamic.partition.mode", "nonstrict") 31 | .getOrCreate() 32 | 33 | // val project = spark.conf.get("odps.project.name") 34 | 35 | import spark._ 36 | import sqlContext.implicits._ 37 | val tableName = "mc_test_table" 38 | val ptTableName = "mc_test_pt_table" 39 | // Drop Create 40 | sql(s"DROP TABLE IF EXISTS ${tableName}") 41 | sql(s"DROP TABLE IF EXISTS ${ptTableName}") 42 | 43 | sql(s"CREATE TABLE ${tableName} (name STRING, num BIGINT)") 44 | sql(s"CREATE TABLE ${ptTableName} (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)") 45 | 46 | val df = spark.sparkContext.parallelize(0 to 99, 2).map(f => { 47 | (s"name-$f", f) 48 | }).toDF("name", "num") 49 | 50 | val ptDf = spark.sparkContext.parallelize(0 to 99, 2).map(f => { 51 | (s"name-$f", f, "2018", "0601") 52 | }).toDF("name", "num", "pt1", "pt2") 53 | 54 | // 写 普通表 55 | df.write.insertInto(tableName) // insertInto语义 56 | df.write.mode("overwrite").insertInto(tableName) // insertOverwrite语义 57 | 58 | // 写 分区表 59 | // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 60 | df.createOrReplaceTempView(s"${ptTableName}_tmp_view") 61 | sql(s"insert into table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 62 | sql(s"insert overwrite table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 63 | 64 | ptDf.write.insertInto(ptTableName) // 动态分区 insertInto语义 65 | ptDf.write.mode("overwrite").insertInto(ptTableName) // 动态分区 insertOverwrite语义 66 | 67 | // 读 普通表 68 | val rdf = sql(s"select name, num from $tableName") 69 | println(s"rdf count, ${rdf.count()}") 70 | rdf.printSchema() 71 | 72 | // 读 分区表 73 | val rptdf = sql(s"select name, num, pt1, pt2 from $ptTableName where pt1 = '2018' and pt2 = '0601'") 74 | println(s"rptdf count, ${rptdf.count()}") 75 | rptdf.printSchema() 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/common/SparkSessionSingleton.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.streaming.common 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object SparkSessionSingleton { 7 | @transient private var instance: SparkSession = _ 8 | 9 | def getInstance(sparkConf: SparkConf): SparkSession = { 10 | if (instance == null) { 11 | instance = SparkSession 12 | .builder 13 | .config(sparkConf) 14 | .getOrCreate() 15 | } 16 | instance 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/datahub/DataHub2OdpsDemo.scala: 
-------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.streaming.datahub 2 | 3 | import com.aliyun.datahub.model.RecordEntry 4 | import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton 5 | 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.storage.StorageLevel 8 | import org.apache.spark.streaming.{Seconds, StreamingContext} 9 | import org.apache.spark.streaming.aliyun.datahub.DatahubUtils 10 | 11 | object DataHub2OdpsDemo { 12 | 13 | def transferFunc(record: RecordEntry): String = { 14 | // 这个转化函数目前只支持把DataHub Record转成String 15 | // 如果是需要多个字段的话, 那么需要处理一下拼接的逻辑 16 | record.getString(1) 17 | } 18 | 19 | def main(args: Array[String]): Unit = { 20 | val spark = SparkSession 21 | .builder() 22 | .appName("DataHubStreamingDemo") 23 | .config("spark.hadoop.fs.oss.credentials.provider", "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider") 24 | .config("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole") 25 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com") 26 | .getOrCreate() 27 | 28 | // 设置Batch间隔时间 29 | val ssc = new StreamingContext(spark.sparkContext, Seconds(10)) 30 | 31 | // checkpoint dir to oss 32 | ssc.checkpoint("oss://bucket/inputdata/") 33 | 34 | val dataStream = DatahubUtils.createStream( 35 | ssc, 36 | "projectName", 37 | "topic", 38 | "subId", 39 | "accessId", 40 | "accessKey", 41 | "endPoint", 42 | transferFunc(_), 43 | StorageLevel.MEMORY_AND_DISK 44 | ) 45 | 46 | dataStream.map(x => new String(x)).foreachRDD(rdd => { 47 | val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf) 48 | import spark.implicits._ 49 | 50 | rdd.toDF("id").write.mode("append").saveAsTable("test_table") 51 | }) 52 | 53 | ssc.start() 54 | ssc.awaitTermination() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/datahub/DataHubStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.streaming.datahub 2 | 3 | import com.aliyun.datahub.model.RecordEntry 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.storage.StorageLevel 6 | import org.apache.spark.streaming.aliyun.datahub.DatahubUtils 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | 9 | object DataHubStreamingDemo { 10 | 11 | def transferFunc(record: RecordEntry): String = { 12 | // 这个转化函数目前只支持把DataHub Record转成String 13 | // 如果是需要多个字段的话, 那么需要处理一下拼接的逻辑 14 | record.getString(1) 15 | } 16 | 17 | def main(args: Array[String]): Unit = { 18 | val spark = SparkSession 19 | .builder() 20 | .appName("DataHubStreamingDemo") 21 | .config("spark.hadoop.fs.oss.credentials.provider", "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider") 22 | .config("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole") 23 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com") 24 | .getOrCreate() 25 | 26 | // 设置Batch间隔时间 27 | val ssc = new StreamingContext(spark.sparkContext, Seconds(5)) 28 | 29 | // checkpoint dir to oss 30 | ssc.checkpoint("oss://bucket/inputdata/") 31 | 32 | val dataStream = DatahubUtils.createStream( 33 | ssc, 34 | "projectName", 35 | "topic", 36 | "subId", 37 | "accessId", 38 | "accessKey", 39 | "endPoint", 40 | transferFunc(_), 41 | 
StorageLevel.MEMORY_AND_DISK 42 | ) 43 | 44 | dataStream.count().print() 45 | 46 | ssc.start() 47 | ssc.awaitTermination() 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/kafka/Kafka2OdpsDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.streaming.kafka 20 | 21 | import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton 22 | import org.apache.kafka.clients.consumer.ConsumerRecord 23 | import org.apache.kafka.common.serialization.StringDeserializer 24 | 25 | import org.apache.spark.SparkConf 26 | import org.apache.spark.streaming.{Seconds, StreamingContext} 27 | import org.apache.spark.streaming.dstream.{DStream, InputDStream} 28 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 29 | 30 | object Kafka2OdpsDemo { 31 | def main(args: Array[String]): Unit = { 32 | val sparkConf = new SparkConf().setAppName("test") 33 | val ssc = new StreamingContext(sparkConf, Seconds(10)) 34 | 35 | // 请使用OSS作为Checkpoint存储,修改为有效OSS路径。OSS访问文档请参考 https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E 36 | ssc.checkpoint("oss://bucket/checkpointdir") 37 | 38 | // kafka配置参数 39 | val kafkaParams = Map[String, Object]( 40 | "bootstrap.servers" -> "localhost:9092", 41 | "key.deserializer" -> classOf[StringDeserializer], 42 | "value.deserializer" -> classOf[StringDeserializer], 43 | "group.id" -> "testGroupId", 44 | "auto.offset.reset" -> "latest", 45 | "enable.auto.commit" -> (false: java.lang.Boolean) 46 | ) 47 | 48 | // 创建kafka dstream 49 | val topics = Set("test") 50 | val recordDstream: InputDStream[ConsumerRecord[String, String]] = 51 | KafkaUtils.createDirectStream[String, String]( 52 | ssc, 53 | LocationStrategies.PreferConsistent, 54 | ConsumerStrategies.Subscribe[String, String](topics, kafkaParams) 55 | ) 56 | val dstream = recordDstream.map(f => (f.key(), f.value())) 57 | // 解析kafka数据并写入odps 58 | val data: DStream[String] = dstream.map(_._2) 59 | val wordsDStream: DStream[String] = data.flatMap(_.split(" ")) 60 | wordsDStream.foreachRDD(rdd => { 61 | val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf) 62 | import spark.implicits._ 63 | 64 | rdd.toDF("id").write.mode("append").saveAsTable("test_table") 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | } 70 | } 71 | 72 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/kafka/KafkaStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.streaming.kafka 20 | 21 | import org.apache.kafka.clients.consumer.ConsumerRecord 22 | import org.apache.kafka.common.serialization.StringDeserializer 23 | import org.apache.spark.sql.SparkSession 24 | import org.apache.spark.streaming.dstream.{DStream, InputDStream} 25 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 26 | import org.apache.spark.streaming.{Seconds, StreamingContext} 27 | 28 | object KafkaStreamingDemo { 29 | def main(args: Array[String]): Unit = { 30 | val spark = SparkSession 31 | .builder() 32 | .appName("KafkaStreamingDemo") 33 | .getOrCreate() 34 | 35 | val ssc = new StreamingContext(spark.sparkContext, Seconds(5)) 36 | 37 | // 请使用OSS作为Checkpoint存储 38 | ssc.checkpoint("oss://bucket/checkpointDir/") 39 | 40 | // kafka配置参数 41 | val kafkaParams = Map[String, Object]( 42 | "bootstrap.servers" -> "192.168.1.1:9200,192.168.1.2:9200,192.168.1.3:9200", 43 | "key.deserializer" -> classOf[StringDeserializer], 44 | "value.deserializer" -> classOf[StringDeserializer], 45 | "group.id" -> "testGroupId", 46 | "auto.offset.reset" -> "latest", 47 | "enable.auto.commit" -> (false: java.lang.Boolean) 48 | ) 49 | 50 | val topics = Set("event_topic") 51 | val recordDstream: InputDStream[ConsumerRecord[String, String]] = 52 | KafkaUtils.createDirectStream[String, String]( 53 | ssc, 54 | LocationStrategies.PreferConsistent, 55 | ConsumerStrategies.Subscribe[String, String](topics, kafkaParams) 56 | ) 57 | 58 | 59 | val dstream = recordDstream.map(f => (f.key(), f.value())) 60 | val data: DStream[String] = dstream.map(_._2) 61 | val wordsDStream: DStream[String] = data.flatMap(_.split(" ")) 62 | val wordAndOneDstream: DStream[(String, Int)] = wordsDStream.map((_, 1)) 63 | val result: DStream[(String, Int)] = wordAndOneDstream.reduceByKey(_ + _) 64 | result.print() 65 | 66 | ssc.start() 67 | ssc.awaitTermination() 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/loghub/LogHub2OdpsDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.streaming.loghub 20 | 21 | import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton 22 | import com.aliyun.openservices.loghub.client.config.LogHubCursorPosition 23 | 24 | import org.apache.spark.{SparkConf, SparkContext} 25 | import org.apache.spark.storage.StorageLevel 26 | import org.apache.spark.streaming.{Durations, StreamingContext} 27 | import org.apache.spark.streaming.loghub.{LoghubUtils, StreamingParam} 28 | 29 | object LogHub2OdpsDemo { 30 | 31 | def buildParam(conf: SparkConf): StreamingParam = { 32 | val sp = new StreamingParam() 33 | sp.setId(conf.get("spark.logservice.accessKeyId")) 34 | sp.setSecret(conf.get("spark.logservice.accessKeySecret")) 35 | sp.setEndpoint(conf.get("spark.logservice.endpoint")) 36 | sp.setProject(conf.get("spark.logservice.project")) 37 | sp.setLogstore(conf.get("spark.logservice.logstore")) 38 | sp.setCursor(LogHubCursorPosition.END_CURSOR) 39 | sp.setGroup("test") 40 | sp.setLevel(StorageLevel.MEMORY_AND_DISK) 41 | 42 | sp 43 | } 44 | 45 | def main(args: Array[String]) { 46 | val conf = new SparkConf(true).setAppName("LogHubStreamingDemo") 47 | val sc = new SparkContext(conf) 48 | 49 | val ssc = new StreamingContext(sc, Durations.seconds(5)) 50 | 51 | val lines = LoghubUtils.createStream(ssc, buildParam(conf), 1).map(line => { 52 | val str = new String(line) 53 | str 54 | }) 55 | 56 | val words = lines.flatMap(_.split(" ")) 57 | words.foreachRDD(rdd => { 58 | val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf) 59 | import spark.implicits._ 60 | 61 | rdd.toDF("id").write.mode("append").saveAsTable("test_table") 62 | }) 63 | 64 | ssc.start() // Start the computation 65 | ssc.awaitTermination() // Wait for the computation to terminate 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/loghub/LogHubStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.streaming.loghub 20 | 21 | import com.aliyun.openservices.loghub.client.config.LogHubCursorPosition 22 | import org.apache.spark.storage.StorageLevel 23 | import org.apache.spark.streaming.loghub.{LoghubUtils, StreamingParam} 24 | import org.apache.spark.streaming.{Durations, StreamingContext} 25 | import org.apache.spark.{SparkConf, SparkContext} 26 | 27 | object LogHubStreamingDemo { 28 | 29 | def buildParam(conf: SparkConf): StreamingParam = { 30 | val sp = new StreamingParam() 31 | sp.setId(conf.get("spark.logservice.accessKeyId")) 32 | sp.setSecret(conf.get("spark.logservice.accessKeySecret")) 33 | sp.setEndpoint(conf.get("spark.logservice.endpoint")) 34 | sp.setProject(conf.get("spark.logservice.project")) 35 | sp.setLogstore(conf.get("spark.logservice.logstore")) 36 | sp.setCursor(LogHubCursorPosition.END_CURSOR) 37 | sp.setGroup("test") 38 | sp.setLevel(StorageLevel.MEMORY_AND_DISK) 39 | 40 | sp 41 | } 42 | 43 | def main(args: Array[String]) { 44 | val conf = new SparkConf(true).setAppName("LogHubStreamingDemo") 45 | val sc = new SparkContext(conf) 46 | 47 | val ssc = new StreamingContext(sc, Durations.seconds(5)) 48 | 49 | val lines = LoghubUtils.createStream(ssc, buildParam(conf), 1).map(line => { 50 | val str = new String(line) 51 | str 52 | }) 53 | 54 | val words = lines.flatMap(_.split(" ")) 55 | val pairs = words.map(word => (word, 1)) 56 | val wordCounts = pairs.reduceByKey(_ + _) 57 | 58 | // Print the first ten elements of each RDD generated in this DStream to the console 59 | wordCounts.print() 60 | 61 | ssc.start() // Start the computation 62 | ssc.awaitTermination() // Wait for the computation to terminate 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/datahub/DatahubStructuredStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.structuredStreaming.datahub 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object DatahubStructuredStreamingDemo { 6 | def main(args: Array[String]): Unit = { 7 | val spark = SparkSession 8 | .builder() 9 | .appName("DatahubStructuredStreamingDemo") 10 | .getOrCreate() 11 | 12 | import spark.implicits._ 13 | 14 | val df = spark 15 | .readStream 16 | .format("datahub") 17 | .option("datahub.endpoint", "http://dh-cn-beijing.aliyun-inc.com") 18 | .option("datahub.project", "zkytest") 19 | .option("datahub.topic", "zkytest") 20 | .option("datahub.AccessId", "******") 21 | .option("datahub.AccessKey", "******") 22 | .option("StartingOffsets", "earliest") 23 | .load() 24 | 25 | /** * 26 | * WordCount Demo 27 | */ 28 | // 请使用OSS作为Checkpoint存储 29 | val checkpointLocation = "oss://bucket/checkpoint/" 30 | val lines = df.select($"id").as[String] 31 | val wordCounts = lines.flatMap(_.split(" ")).toDF("word").groupBy("word").count() 32 | 33 | val query = wordCounts.writeStream 34 | .outputMode("complete") 35 | .format("console") 36 | .option("checkpointLocation", checkpointLocation) 37 | .start() 38 | 39 | query.awaitTermination() 40 | } 
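  // A sketch of an alternative sink (assumes Spark 2.4+): foreachBatch could land each micro-batch
  // of the aggregation in a MaxCompute table instead of printing to the console. The target table
  // name "datahub_wordcount" is hypothetical.
  //
  // val tableQuery = wordCounts.writeStream
  //   .outputMode("complete")
  //   .option("checkpointLocation", checkpointLocation)
  //   .foreachBatch { (batchDF: org.apache.spark.sql.DataFrame, batchId: Long) =>
  //     batchDF.write.mode("overwrite").saveAsTable("datahub_wordcount")
  //   }
  //   .start()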
41 | } 42 | 43 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/kafka/KafkaStructuredStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.structuredStreaming.kafka 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions.window 7 | 8 | object KafkaStructuredStreamingDemo{ 9 | def main(args: Array[String]): Unit = { 10 | val spark = SparkSession 11 | .builder() 12 | .appName("KafkaStreamingDemo") 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | 17 | val df = spark 18 | .readStream 19 | .format("kafka") 20 | .option("kafka.bootstrap.servers", "localhost:9092") 21 | .option("subscribe", "topic") 22 | .load() 23 | 24 | /** * 25 | * WordCount Demo 26 | */ 27 | // 请使用OSS作为Checkpoint存储 28 | val checkpointLocation = "oss://bucket/checkpoint/" 29 | val lines = df.selectExpr("cast(value as string)").as[String] 30 | val wordCounts = lines.flatMap(_.split(" ")).toDF("word").groupBy("word").count() 31 | 32 | val query = wordCounts.writeStream 33 | .outputMode("complete") 34 | .format("console") 35 | .option("checkpointLocation", checkpointLocation) 36 | .option("path", "query1") 37 | .start() 38 | 39 | query.awaitTermination() 40 | 41 | /** * 42 | * Windowed WordCount Demo 43 | */ 44 | val wordsWithTimestamp = df.selectExpr("cast(value as string)").as[String] 45 | .flatMap(x => { 46 | val Array(ts, data) = x.split(",") 47 | data.split(" ").map((new Timestamp(ts.toLong), _)) 48 | }).as[(Timestamp, String)].toDF("timestamp", "word") 49 | 50 | // 请使用OSS作为Checkpoint存储 51 | val checkpointLocation2 = "oss://bucket/checkpoint2/" 52 | val windowedCounts = wordsWithTimestamp 53 | .groupBy( 54 | window($"timestamp", "10 seconds", "5 seconds"), 55 | $"word" 56 | ).count() 57 | 58 | val query2 = windowedCounts.writeStream 59 | .outputMode("complete") 60 | .format("console") 61 | .option("checkpointLocation", checkpointLocation2) 62 | .start() 63 | 64 | query2.awaitTermination() 65 | 66 | /** * 67 | * Windowed WordCount with Watermark Demo 68 | */ 69 | // 请使用OSS作为Checkpoint存储 70 | val checkpointLocation3 = "oss://bucket/checkpoint3/" 71 | 72 | val windowedCountsWithWatermark = wordsWithTimestamp 73 | .withWatermark("timestamp", "5 seconds") 74 | .groupBy( 75 | window($"timestamp", "6 seconds", "3 seconds"), 76 | $"word" 77 | ).count() 78 | 79 | val query3 = windowedCountsWithWatermark.writeStream 80 | .outputMode("append") 81 | .format("console") 82 | .option("checkpointLocation", checkpointLocation3) 83 | .start() 84 | 85 | query3.awaitTermination() 86 | } 87 | } 88 | 89 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/loghub/LoghubStructuredStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.structuredStreaming.loghub 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object LoghubStructuredStreamingDemo { 6 | def main(args: Array[String]): Unit = { 7 | val spark = SparkSession 8 | .builder() 9 | .appName("LoghubStructuredStreamingDemo") 10 | .getOrCreate() 11 | 12 | import spark.implicits._ 13 | 14 | val df = spark 15 | .readStream 16 | .format("loghub") 17 | .option("Loghub.Endpoint", "cn-beijing-intranet.log.aliyuncs.com") 18 | 
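      // The Loghub endpoint above and the project/logstore/AccessKey options below are the author's
      // example values; replace them with your own Log Service configuration before running.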
.option("Loghub.Project", "zkytest") 19 | .option("Loghub.AccessId", "******") 20 | .option("Loghub.AccessKey", "******") 21 | .option("Loghub.Logstores", "zkytest") 22 | .option("StartingOffsets", "latest") 23 | .load() 24 | 25 | /** * 26 | * WordCount Demo 27 | */ 28 | // 请使用OSS作为Checkpoint存储 29 | val checkpointLocation = "oss://bucket/checkpoint" 30 | val lines = df.select($"contents").as[String] 31 | val wordCounts = lines.flatMap(_.split(" ")).toDF("word").groupBy("word").count() 32 | 33 | val query = wordCounts.writeStream 34 | .outputMode("complete") 35 | .format("console") 36 | .option("checkpointLocation", checkpointLocation) 37 | .start() 38 | 39 | query.awaitTermination() 40 | } 41 | } 42 | 43 | -------------------------------------------------------------------------------- /spark-3.x/libs/jindofs-sdk-3.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/spark-3.x/libs/jindofs-sdk-3.7.2.jar -------------------------------------------------------------------------------- /spark-3.x/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 18 | 4.0.0 19 | 20 | 21 | 3.1.1 22 | 3.3.8-public 23 | 2.12.10 24 | 2.12 25 | 26 | 27 | com.aliyun.odps 28 | spark-examples_${scala.binary.version} 29 | 1.0.0-SNAPSHOT 30 | jar 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-core_${scala.binary.version} 36 | ${spark.version} 37 | provided 38 | 39 | 40 | org.apache.spark 41 | spark-sql_${scala.binary.version} 42 | ${spark.version} 43 | provided 44 | 45 | 46 | org.apache.spark 47 | spark-mllib_${scala.binary.version} 48 | ${spark.version} 49 | provided 50 | 51 | 52 | org.apache.spark 53 | spark-streaming_${scala.binary.version} 54 | ${spark.version} 55 | provided 56 | 57 | 58 | 59 | com.aliyun.odps 60 | cupid-sdk 61 | ${cupid.sdk.version} 62 | provided 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | org.apache.maven.plugins 71 | maven-shade-plugin 72 | 2.4.3 73 | 74 | 75 | package 76 | 77 | shade 78 | 79 | 80 | false 81 | true 82 | 83 | 84 | 86 | *:* 87 | 88 | 89 | 90 | 91 | *:* 92 | 93 | META-INF/*.SF 94 | META-INF/*.DSA 95 | META-INF/*.RSA 96 | **/log4j.properties 97 | 98 | 99 | 100 | 101 | 103 | reference.conf 104 | 105 | 107 | 108 | META-INF/services/org.apache.spark.sql.sources.DataSourceRegister 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | net.alchim31.maven 118 | scala-maven-plugin 119 | 3.3.2 120 | 121 | 122 | scala-compile-first 123 | process-resources 124 | 125 | compile 126 | 127 | 128 | 129 | scala-test-compile-first 130 | process-test-resources 131 | 132 | testCompile 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /spark-3.x/src/main/java/com/aliyun/odps/spark/examples/sparksql/JavaSparkSQL.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.sparksql; 20 | 21 | import com.aliyun.odps.Odps; 22 | import com.aliyun.odps.cupid.CupidSession; 23 | import org.apache.spark.sql.SparkSession; 24 | import org.apache.spark.api.java.JavaRDD; 25 | import org.apache.spark.api.java.JavaSparkContext; 26 | import org.apache.spark.api.java.function.Function; 27 | import org.apache.spark.sql.Row; 28 | import org.apache.spark.sql.Dataset; 29 | import org.apache.spark.sql.RowFactory; 30 | import org.apache.spark.sql.types.*; 31 | 32 | import java.util.ArrayList; 33 | import java.util.List; 34 | 35 | import org.apache.spark.sql.types.StructField; 36 | 37 | public class JavaSparkSQL { 38 | 39 | public static void main(String[] args) throws Exception { 40 | SparkSession spark = SparkSession 41 | .builder() 42 | .appName("SparkSQL-on-MaxCompute") 43 | .config("spark.sql.defaultCatalog","odps") 44 | .config("spark.sql.catalog.odps", "org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog") 45 | .config("spark.sql.sources.partitionOverwriteMode", "dynamic") 46 | .config("spark.sql.extensions", "org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions") 47 | .config("spark.sql.catalogImplementation","hive") 48 | .getOrCreate(); 49 | JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext()); 50 | 51 | 52 | String tableName = "mc_test_table"; 53 | String tableNameCopy = "mc_test_table_copy"; 54 | String ptTableName = "mc_test_pt_table"; 55 | 56 | 57 | spark.sql("DROP TABLE IF EXISTS " + tableName); 58 | spark.sql("DROP TABLE IF EXISTS " + tableNameCopy); 59 | spark.sql("DROP TABLE IF EXISTS " + ptTableName); 60 | 61 | spark.sql("CREATE TABLE " + tableName + " (name STRING, num BIGINT)"); 62 | spark.sql("CREATE TABLE " + ptTableName + " (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)"); 63 | 64 | List data = new ArrayList(); 65 | for (int i = 0; i < 100; i++) { 66 | data.add(i); 67 | } 68 | 69 | JavaRDD dfRDD = sparkContext.parallelize(data, 2).map(new Function() { 70 | public Row call(Integer i) { 71 | return RowFactory.create( 72 | "name-" + i.toString(), 73 | Long.valueOf(i)); 74 | } 75 | }); 76 | 77 | JavaRDD ptDfRDD = sparkContext.parallelize(data, 2).map(new Function() { 78 | public Row call(Integer i) { 79 | return RowFactory.create( 80 | "name-" + i.toString(), 81 | Long.valueOf(i), 82 | "2018", 83 | "0601"); 84 | } 85 | }); 86 | 87 | List structFilelds = new ArrayList(); 88 | structFilelds.add(DataTypes.createStructField("name", DataTypes.StringType, true)); 89 | structFilelds.add(DataTypes.createStructField("num", DataTypes.LongType, true)); 90 | Dataset df = spark.createDataFrame(dfRDD, DataTypes.createStructType(structFilelds)); 91 | 92 | structFilelds.add(DataTypes.createStructField("pt1", DataTypes.StringType, true)); 93 | structFilelds.add(DataTypes.createStructField("pt2", DataTypes.StringType, true)); 94 | Dataset ptDf = spark.createDataFrame(ptDfRDD, DataTypes.createStructType(structFilelds)); 95 | 96 | // 写 普通表 97 | df.write().insertInto(tableName); // insertInto语义 98 | df.writeTo(tableName).overwritePartitions(); // insertOverwrite use datasourcev2 99 | 100 | // 读 普通表 101 | Dataset 
rdf = spark.sql("select name, num from " + tableName); 102 | System.out.println("rdf count: " + rdf.count()); 103 | rdf.printSchema(); 104 | 105 | //create table as select 106 | spark.sql("CREATE TABLE " + tableNameCopy + " AS SELECT name, num FROM " + tableName); 107 | spark.sql("SELECT * FROM " + tableNameCopy).show(); 108 | 109 | // 写 分区表 110 | // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 111 | df.registerTempTable(ptTableName + "_tmp_view"); 112 | spark.sql("insert into table " + ptTableName + " partition (pt1='2018', pt2='0601') select * from " + ptTableName + "_tmp_view"); 113 | spark.sql("insert overwrite table " + ptTableName + " partition (pt1='2018', pt2='0601') select * from " + ptTableName + "_tmp_view"); 114 | 115 | ptDf.write().insertInto(ptTableName);// 动态分区 insertInto语义 116 | ptDf.write().mode("overwrite").insertInto(ptTableName); // 动态分区 insertOverwrite语义 117 | 118 | // 读 分区表 119 | Dataset rptdf = spark.sql("select name, num, pt1, pt2 from " + ptTableName + " where pt1 = '2018' and pt2 = '0601'"); 120 | System.out.println("rptdf count: " + rptdf.count()); 121 | rptdf.printSchema(); 122 | 123 | // example for use odps 124 | Odps odps = CupidSession.get().odps(); 125 | System.out.println(odps.tables().get(ptTableName).getPartitions().size()); 126 | System.out.println(odps.tables().get(ptTableName).getPartitions().get(0).getPartitionSpec()); 127 | } 128 | } -------------------------------------------------------------------------------- /spark-3.x/src/main/python/spark_oss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | try: 6 | # for python 2 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | except: 10 | # python 3 not needed 11 | pass 12 | 13 | if __name__ == '__main__': 14 | spark = SparkSession.builder\ 15 | .appName("spark write df to oss")\ 16 | .getOrCreate() 17 | 18 | data = [i for i in range(0, 100)] 19 | 20 | df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int") 21 | 22 | df.show(n=10) 23 | 24 | # write to oss 25 | pathout = 'oss://yeshan01/test.csv' 26 | df.write.csv(pathout) 27 | -------------------------------------------------------------------------------- /spark-3.x/src/main/python/spark_sql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | try: 6 | # for python 2 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | except: 10 | # python 3 not needed 11 | pass 12 | 13 | if __name__ == '__main__': 14 | spark = SparkSession.builder\ 15 | .appName("spark sql")\ 16 | .config("spark.sql.broadcastTimeout", 20 * 60)\ 17 | .config("spark.sql.crossJoin.enabled", True)\ 18 | .getOrCreate() 19 | 20 | tableName = "mc_test_table" 21 | ptTableName = "mc_test_pt_table" 22 | data = [i for i in range(0, 100)] 23 | 24 | # Drop Create 25 | spark.sql("DROP TABLE IF EXISTS %s" % tableName) 26 | spark.sql("DROP TABLE IF EXISTS %s" % ptTableName) 27 | 28 | spark.sql("CREATE TABLE %s (name STRING, num BIGINT)" % tableName) 29 | spark.sql("CREATE TABLE %s (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)" % ptTableName) 30 | 31 | df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int") 32 | pt_df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s, "2018", "0601")).toDF("name: 
string, num: int, pt1: string, pt2: string") 33 | 34 | # 写 普通表 35 | df.write.insertInto(tableName) # insertInto语义 36 | df.writeTo(tableName).overwritePartitions() # insertOverwrite use datasourcev2 37 | 38 | # 写 分区表 39 | # DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 40 | df.createOrReplaceTempView("%s_tmp_view" % ptTableName) 41 | spark.sql("insert into table %s partition (pt1='2018', pt2='0601') select * from %s_tmp_view" % (ptTableName, ptTableName)) 42 | spark.sql("insert overwrite table %s partition (pt1='2018', pt2='0601') select * from %s_tmp_view" % (ptTableName, ptTableName)) 43 | 44 | pt_df.write.insertInto(ptTableName) # 动态分区 insertInto语义 45 | pt_df.write.insertInto(ptTableName, True) # 动态分区 insertOverwrite语义 46 | 47 | # 读 普通表 48 | rdf = spark.sql("select name, num from %s" % tableName) 49 | print("rdf count, %s\n" % rdf.count()) 50 | rdf.printSchema() 51 | 52 | # 读 分区表 53 | rptdf = spark.sql("select name, num, pt1, pt2 from %s where pt1 = '2018' and pt2 = '0601'" % ptTableName) 54 | print("rptdf count, %s" % (rptdf.count())) 55 | rptdf.printSchema() -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | import scala.math.random 24 | 25 | object SparkPi { 26 | def main(args: Array[String]) { 27 | val spark = SparkSession 28 | .builder() 29 | .appName("SparkPi") 30 | .getOrCreate() 31 | val sc = spark.sparkContext 32 | 33 | try { 34 | val slices = if (args.length > 0) args(0).toInt else 2 35 | val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow 36 | val count = sc.parallelize(1 until n, slices).map { i => 37 | val x = random * 2 - 1 38 | val y = random * 2 - 1 39 | if (x * x + y * y < 1) 1 else 0 40 | }.reduce(_ + _) 41 | println("Pi is roughly " + 4.0 * count / n) 42 | } finally { 43 | sc.stop() 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/WordCount.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | object WordCount { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .appName("WordCount") 28 | .getOrCreate() 29 | val sc = spark.sparkContext 30 | 31 | try { 32 | sc.parallelize(1 to 100, 10).map(word => (word, 1)).reduceByKey(_ + _, 10).take(100).foreach(println) 33 | } finally { 34 | sc.stop() 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/graphx/PageRank.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.graphx 20 | 21 | import org.apache.spark.graphx.{Edge, Graph, VertexId} 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | object PageRank { 26 | def main(args: Array[String]): Unit = { 27 | val spark = SparkSession 28 | .builder() 29 | .appName("PageRank") 30 | .getOrCreate() 31 | val sc = spark.sparkContext 32 | 33 | // build vertices 34 | val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( 35 | "1,BarackObama,Barack Obama", 36 | "2,ladygaga,Goddess of Love", 37 | "3,jeresig,John Resig", 38 | "4,justinbieber,Justin Bieber", 39 | "6,matei_zaharia,Matei Zaharia", 40 | "7,odersky,Martin Odersky", 41 | "8,anonsys" 42 | ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) 43 | 44 | // build edges 45 | val followers: RDD[Edge[Double]] = sc.parallelize(Array( 46 | Edge(2L, 1L, 1.0), 47 | Edge(4L, 1L, 1.0), 48 | Edge(1L, 2L, 1.0), 49 | Edge(6L, 3L, 1.0), 50 | Edge(7L, 3L, 1.0), 51 | Edge(7L, 6L, 1.0), 52 | Edge(6L, 7L, 1.0), 53 | Edge(3L, 7L, 1.0) 54 | )) 55 | 56 | // build graph 57 | val followerGraph: Graph[Array[String], Double] = Graph(users, followers) 58 | 59 | // restrict the graph to users with usernames and names 60 | val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) 61 | 62 | // compute PageRank 63 | val pageRankGraph = subgraph.pageRank(0.001) 64 | 65 | // get attributes of the top pagerank users 66 | val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { 67 | case (uid, attrList, Some(pr)) => (pr, attrList.toList) 68 | case (uid, attrList, None) => (0.0, attrList.toList) 69 | } 70 | 71 | println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/mllib/KmeansModelSaveToOss.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.mllib 20 | 21 | import org.apache.spark.mllib.clustering.KMeans._ 22 | import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} 23 | import org.apache.spark.mllib.linalg.Vectors 24 | import org.apache.spark.sql.SparkSession 25 | 26 | object KmeansModelSaveToOss { 27 | val modelOssDir = "oss://[bucket]/kmeans-model" 28 | 29 | def main(args: Array[String]) { 30 | 31 | //1. train and save the model 32 | val spark = SparkSession 33 | .builder() 34 | .config("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 35 | .config("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 36 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 37 | .config("spark.hadoop.fs.oss.accessKeyId", "xxx") 38 | .config("spark.hadoop.fs.oss.accessKeySecret", "xxx") 39 | .appName("KmeansModelSaveToOss") 40 | .getOrCreate() 41 | 42 | val sc = spark.sparkContext 43 | val points = Seq( 44 | Vectors.dense(0.0, 0.0), 45 | Vectors.dense(0.0, 0.1), 46 | Vectors.dense(0.1, 0.0), 47 | Vectors.dense(9.0, 0.0), 48 | Vectors.dense(9.0, 0.2), 49 | Vectors.dense(9.2, 0.0) 50 | ) 51 | val rdd = sc.parallelize(points, 3) 52 | val initMode = K_MEANS_PARALLEL 53 | val model = KMeans.train(rdd, k = 2, maxIterations = 2, initMode) 54 | val predictResult1 = rdd.map(feature => "cluster id: " + model.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 55 | println("modelOssDir=" + modelOssDir) 56 | model.save(sc, modelOssDir) 57 | 58 | //2. predict from the oss model 59 | val modelLoadOss = KMeansModel.load(sc, modelOssDir) 60 | val predictResult2 = rdd.map(feature => "cluster id: " + modelLoadOss.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 61 | assert(predictResult1.size == predictResult2.size) 62 | predictResult2.foreach(result2 => assert(predictResult1.contains(result2))) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/oss/JindoFsDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.oss 20 | 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | object JindoFsDemo { 24 | def main(args: Array[String]): Unit = { 25 | val bucket : String = args(0) 26 | val ossPath : String = args(1) 27 | 28 | //using access-key-id/access-key-secret 29 | val conf = new SparkConf() 30 | .setAppName("jindo-fs-demo") 31 | .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 32 | .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 33 | .set("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 34 | .set("spark.hadoop.fs.oss.accessKeyId", "xxx") 35 | .set("spark.hadoop.fs.oss.accessKeySecret", "xxx") 36 | 37 | val sc = new SparkContext(conf) 38 | 39 | try { 40 | read_oss_dir(sc, "demo", s"oss://${bucket}/${ossPath}") 41 | } finally { 42 | sc.stop() 43 | } 44 | } 45 | 46 | /** 47 | * compute cost time using jindo sdk 48 | */ 49 | def read_oss_dir(sc: SparkContext, job_des:String, ossPath: String): Unit = { 50 | val startTime: Long = System.currentTimeMillis() 51 | val inputData = sc.textFile(ossPath, 20) 52 | val cnt = inputData.count 53 | val endTime:Long = System.currentTimeMillis() 54 | val cost:Long = endTime - startTime 55 | println(s"job:$job_des, count:$cnt, consume:$cost") 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/oss/SparkUnstructuredDataCompute.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.oss 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | object SparkUnstructuredDataCompute { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .config("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 28 | .config("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 29 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 30 | .config("spark.hadoop.fs.oss.accessKeyId", "xxx") 31 | .config("spark.hadoop.fs.oss.accessKeySecret", "xxx") 32 | .appName("SparkUnstructuredDataCompute") 33 | .getOrCreate() 34 | 35 | val sc = spark.sparkContext 36 | try { 37 | val pathIn = "oss://bucket/inputdata/" 38 | val inputData = sc.textFile(pathIn, 5) 39 | val cnt = inputData.count 40 | println(s"count: $cnt") 41 | } finally { 42 | sc.stop() 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/sparksql/SparkSQL.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.sparksql 20 | 21 | import org.apache.spark.sql.{SaveMode, SparkSession} 22 | 23 | object SparkSQL { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .appName("SparkSQL-on-MaxCompute") 28 | .config("spark.sql.broadcastTimeout", 20 * 60) 29 | .config("spark.sql.crossJoin.enabled", true) 30 | .config("spark.sql.defaultCatalog","odps") 31 | .config("spark.sql.catalog.odps", "org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog") 32 | .config("spark.sql.sources.partitionOverwriteMode", "dynamic") 33 | .config("spark.sql.extensions", "org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions") 34 | .config("spark.sql.catalogImplementation","hive") 35 | .getOrCreate() 36 | 37 | import spark._ 38 | import sqlContext.implicits._ 39 | val tableName = "mc_test_table" 40 | val ptTableName = "mc_test_pt_table" 41 | // Drop Create 42 | sql(s"DROP TABLE IF EXISTS ${tableName}") 43 | sql(s"DROP TABLE IF EXISTS ${ptTableName}") 44 | 45 | sql(s"CREATE TABLE ${tableName} (name STRING, num BIGINT)") 46 | sql(s"CREATE TABLE ${ptTableName} (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)") 47 | 48 | val df = spark.sparkContext.parallelize(0 to 99, 2).map(f => { 49 | (s"name-$f", f) 50 | }).toDF("name", "num") 51 | 52 | val ptDf = spark.sparkContext.parallelize(0 to 99, 2).map(f => { 53 | (s"name-$f", f, "2018", "0601") 54 | }).toDF("name", "num", "pt1", "pt2") 55 | 56 | // 写 普通表 57 | df.write.insertInto(tableName) // insertInto语义 58 | df.writeTo(tableName).overwritePartitions() // insertOverwrite use datasourceV2 59 | 60 | // 写 分区表 61 | // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 62 | df.createOrReplaceTempView(s"${ptTableName}_tmp_view") 63 | sql(s"insert into table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 64 | sql(s"insert overwrite table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 65 | 66 | ptDf.write.insertInto(ptTableName) // 动态分区 insertInto语义 67 | ptDf.write.mode("overwrite").insertInto(ptTableName) // 动态分区 insertOverwrite语义 68 | 69 | // 读 普通表 70 | val rdf = sql(s"select name, num from $tableName") 71 | println(s"rdf show, ${rdf.count()}") 72 | rdf.show() 73 | rdf.printSchema() 74 | 75 | // 读 分区表 76 | val rptdf = sql(s"select name, num, pt1, pt2 from $ptTableName where pt1 = '2018' and pt2 = '0601'") 77 | println(s"rptdf show, ${rptdf.count()}") 78 | rptdf.show() 79 | rptdf.printSchema() 80 | } 81 | } 82 | 83 | -------------------------------------------------------------------------------- /spark-utils/libs/cupid-sdk-3.3.14.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/spark-utils/libs/cupid-sdk-3.3.14.jar -------------------------------------------------------------------------------- /spark-utils/libs/hadoop-yarn-client-3.3.12.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/spark-utils/libs/hadoop-yarn-client-3.3.12.jar -------------------------------------------------------------------------------- /spark-utils/src/main/java/com/aliyun/odps/spark/CupidApplicationMetaExample.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark; 2 | import apsara.odps.cupid.protocol.CupidTaskParamProtos; 3 | import com.aliyun.odps.cupid.CupidConf; 4 | import com.aliyun.odps.cupid.CupidSession; 5 | import com.aliyun.odps.cupid.requestcupid.ApplicationMetaUtil; 6 | import com.aliyun.odps.cupid.requestcupid.CupidProxyTokenUtil; 7 | 8 | import java.util.List; 9 | import java.util.stream.Collectors; 10 | 11 | public class CupidApplicationMetaExample { 12 | 13 | // cd target 14 | // java -cp ../libs/cupid-sdk-3.3.14.jar:spark-utils-1.0.0-shaded.jar com.aliyun.odps.spark.CupidApplicationMetaExample 15 | public static void main(String[] args) throws Exception { 16 | CupidConf conf = new CupidConf(); 17 | conf.set("odps.access.id", ""); 18 | conf.set("odps.access.key", ""); 19 | conf.set("odps.project.name", ""); 20 | conf.set("odps.end.point", ""); 21 | CupidSession session = new CupidSession(conf); 22 | 23 | /* 24 | * list application metas 25 | * yarnApplicationStates: https://hadoop.apache.org/docs/r2.7.3/api/org/apache/hadoop/yarn/api/records/YarnApplicationState.html 26 | * 注意:list开销较大,调用频率不建议太高 27 | */ 28 | CupidTaskParamProtos.ApplicationMetaList applicationMetaList = ApplicationMetaUtil.listApplicationMeta( 29 | "SPARK", 30 | "5", 31 | session); 32 | List applicationMetas = applicationMetaList.getApplicationMetaListList() 33 | .stream() 34 | .collect(Collectors.toList()); 35 | if (applicationMetas.size() > 0) { 36 | applicationMetas.forEach(System.out::println); 37 | } 38 | 39 | /* 40 | * get application meta by instanceid 41 | */ 42 | String instanceId = "20211214074136554gqpk7659"; 43 | CupidTaskParamProtos.ApplicationMeta applicationMeta= ApplicationMetaUtil.getCupidInstanceMeta(instanceId, session); 44 | System.out.println(applicationMeta.toString()); 45 | } 46 | } 47 | 48 | 49 | -------------------------------------------------------------------------------- /spark-utils/src/main/java/com/aliyun/odps/spark/SparkLauncherTest.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.concurrent.CountDownLatch; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.yarn.api.records.ApplicationId; 9 | import org.apache.hadoop.yarn.api.records.ApplicationReport; 10 | import org.apache.hadoop.yarn.api.records.YarnApplicationState; 11 | import org.apache.hadoop.yarn.client.api.YarnClient; 12 | import org.apache.hadoop.yarn.util.ConverterUtils; 13 | import org.apache.spark.launcher.SparkLauncher; 14 | import org.apache.spark.launcher.SparkAppHandle; 15 | 16 | public class SparkLauncherTest { 17 | 18 | private static String accessId = ""; 19 | private static String accessKey = ""; 20 | private static String projectName = ""; 21 | private static String endPoint = ""; 22 | 23 | // cd target 24 | // java -cp ../libs/*:spark-utils-1.0.0-shaded.jar com.aliyun.odps.spark.SparkLauncherTest 25 | public static void main(String[] args) throws Exception { 26 | Map env = new HashMap<>(); 27 | // relace here 28 | env.put("SPARK_HOME", 
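        // The SPARK_HOME value on the next line is the author's local example; point it at your own
        // MaxCompute Spark client directory.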
"/Users/wusj/software/spark/spark-2.3.0-odps0.33.0"); 29 | 30 | CountDownLatch countDownLatch = new CountDownLatch(1); 31 | SparkLauncher launcher = new SparkLauncher(env); 32 | launcher.setConf(SparkLauncher.DRIVER_EXTRA_CLASSPATH, System.getProperty("java.class.path")) 33 | .setConf("spark.hadoop.odps.access.id", accessId) 34 | .setConf("spark.hadoop.odps.access.key", accessKey) 35 | .setConf("spark.hadoop.odps.project.name", projectName) 36 | .setConf("spark.hadoop.odps.end.point", endPoint) 37 | .setMainClass("JavaSparkPi") 38 | // relace here 39 | .setAppResource("/Users/wusj/code/spark/test.jar") 40 | .setMaster("yarn") 41 | .setDeployMode("cluster") 42 | .startApplication(new SparkAppHandle.Listener(){ 43 | @Override 44 | public void stateChanged(SparkAppHandle handle){ 45 | System.out.println("State changed to:" + handle.getState().toString()); 46 | if (handle.getState().equals(SparkAppHandle.State.RUNNING)) { 47 | // Test kill application 48 | killApplication(handle.getAppId()); 49 | } 50 | if (handle.getState().isFinal()) { 51 | countDownLatch.countDown(); 52 | } 53 | } 54 | @Override 55 | public void infoChanged(SparkAppHandle handle) { 56 | } 57 | }); 58 | countDownLatch.await(); 59 | } 60 | 61 | public static void killApplication(String applicationId) { 62 | YarnClient client = YarnClient.createYarnClient(); 63 | Configuration conf = new Configuration(); 64 | conf.set("odps.access.id", accessId); 65 | conf.set("odps.access.key", accessKey); 66 | conf.set("odps.project.name", projectName); 67 | conf.set("odps.end.point", endPoint); 68 | client.init(conf); 69 | client.start(); 70 | 71 | ApplicationId appId = ConverterUtils.toApplicationId(applicationId); 72 | try { 73 | ApplicationReport appReport = client.getApplicationReport(appId); 74 | if (appReport.getYarnApplicationState() == YarnApplicationState.FINISHED 75 | || appReport.getYarnApplicationState() == YarnApplicationState.KILLED 76 | || appReport.getYarnApplicationState() == YarnApplicationState.FAILED) { 77 | System.out.println("Application " + applicationId + " has already finished "); 78 | } else { 79 | System.out.println("Killing application " + applicationId); 80 | client.killApplication(appId); 81 | } 82 | } catch (Exception e) { 83 | System.out.println("Kill application with id '" + applicationId + "' failed: " + e.getMessage()); 84 | } 85 | } 86 | } 87 | 88 | 89 | --------------------------------------------------------------------------------