├── .github └── workflows │ └── deploy-pages.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── .eslintrc.js ├── .gitignore ├── README.md ├── babel.config.js ├── docs │ ├── advanced │ │ ├── _category_.json │ │ ├── eni-migration.md │ │ ├── jindo-sdk.md │ │ ├── oss-access.md │ │ ├── vpc-access.md │ │ └── zeppelin-integration.md │ ├── config │ │ ├── _category_.json │ │ ├── configuration.md │ │ └── resource-autoscaling.md │ ├── development │ │ ├── _category_.json │ │ ├── ops │ │ │ ├── _category_.json │ │ │ ├── job-diagnosis.md │ │ │ ├── logging-guide.md │ │ │ └── streaming-monitoring.md │ │ ├── pyspark │ │ │ ├── _category_.json │ │ │ ├── pyspark-dependencies.md │ │ │ ├── pyspark-oss.md │ │ │ └── pyspark-thirdparty.md │ │ ├── sample-project.md │ │ └── spark-streaming │ │ │ ├── _category_.json │ │ │ ├── streaming-datahub.md │ │ │ ├── streaming-kafka.md │ │ │ └── streaming-loghub.md │ ├── faq │ │ ├── _category_.json │ │ ├── allocate-resource.md │ │ ├── class-conflict.md │ │ ├── github-images.md │ │ ├── network-access.md │ │ ├── oom-troubleshooting.md │ │ ├── pyspark-faq.md │ │ ├── read-transactional-table.md │ │ ├── ref-external-file.md │ │ ├── spark-24-notes.md │ │ ├── spark-31-notes.md │ │ └── spark-general.md │ ├── overview.md │ ├── quickstart │ │ ├── _category_.json │ │ ├── dataworks-integration.md │ │ └── runtime-mode │ │ │ ├── client-mode.md │ │ │ ├── index.md │ │ │ ├── local-mode.md │ │ │ └── yarn-cluster.md │ └── resources │ │ ├── ENI-1.png │ │ ├── ENI-2.png │ │ ├── ENI-3.png │ │ ├── ENI-4.png │ │ ├── ENI-5.png │ │ ├── OOM1.png │ │ ├── cloudmonitor-1.png │ │ ├── cloudmonitor-2.png │ │ ├── cloudmonitor-3.png │ │ ├── cupid_arch.png │ │ ├── datahub-1.jpg │ │ ├── datahub-2.jpg │ │ ├── datahub-3.jpg │ │ ├── dataworks-1.jpg │ │ ├── dataworks-2.jpg │ │ ├── dataworks-3.jpg │ │ ├── dataworks-4.jpg │ │ ├── dataworks-5.jpg │ │ ├── dataworks-6.jpg │ │ ├── dataworks-7.jpg │ │ ├── dingtalk-share.jpg │ │ ├── fuxisensor.png │ │ ├── fuxisensor2.png │ │ ├── idea-local-1.jpg │ │ ├── idea-local-2.jpg │ │ ├── idea-local-3.jpg │ │ ├── idea-local-4.jpg │ │ ├── idea-local-5.jpg │ │ ├── jobview-1.jpg │ │ ├── jobview-2.jpg │ │ ├── jobview-3.jpg │ │ ├── jobview-4.jpg │ │ ├── jobview-5.jpg │ │ ├── log4j2-stderr.jpg │ │ ├── log4j2-stdout.jpg │ │ ├── logview-1.jpg │ │ ├── logview-2.jpg │ │ ├── logview-3.jpg │ │ ├── logview-4.jpg │ │ ├── logview-5.jpg │ │ ├── oss-1.jpg │ │ ├── oss-2.jpg │ │ ├── oss-3.jpg │ │ ├── sparkui.png │ │ ├── vpc-access-1.jpg │ │ ├── vpc-access-2.jpg │ │ ├── vpc-access-3.jpg │ │ ├── 资源申请1.png │ │ └── 资源申请2.png ├── docusaurus.config.js ├── package.json ├── sidebars.js ├── src │ ├── components │ │ └── HomepageFeatures │ │ │ ├── index.js │ │ │ └── styles.module.css │ ├── css │ │ └── custom.css │ ├── locales.json │ └── pages │ │ ├── index.js │ │ └── index.module.css ├── static │ ├── .nojekyll │ └── img │ │ ├── docusaurus-social-card.jpg │ │ ├── docusaurus.png │ │ ├── favicon.ico │ │ ├── logo.svg │ │ ├── undraw_docusaurus_mountain.svg │ │ ├── undraw_docusaurus_react.svg │ │ └── undraw_docusaurus_tree.svg └── yarn.lock ├── hook └── pre-commit ├── spark-1.x ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── aliyun │ │ └── odps │ │ └── spark │ │ └── examples │ │ └── sparksql │ │ └── JavaSparkSQL.java │ ├── python │ └── spark_sql.py │ └── scala │ └── com │ └── aliyun │ └── odps │ └── spark │ └── examples │ ├── SparkPi.scala │ ├── WordCount.scala │ ├── graphx │ └── PageRank.scala │ ├── mllib │ └── KmeansModelSaveToOss.scala │ ├── oss │ └── SparkUnstructuredDataCompute.scala │ ├── sparksql │ 
└── SparkSQL.scala │ └── udf │ └── SparkUDF.scala ├── spark-2.x ├── libs │ └── jindofs-sdk-3.7.2.jar ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── aliyun │ │ └── odps │ │ └── spark │ │ └── examples │ │ ├── sparksql │ │ ├── DataConverters.java │ │ ├── JavaSparkSQL.java │ │ └── JavaSparkSQLTransform.java │ │ └── utils │ │ └── ConfigLog4j2.java │ ├── python │ ├── spark_oss.py │ └── spark_sql.py │ └── scala │ └── com │ └── aliyun │ └── odps │ └── spark │ └── examples │ ├── SparkPi.scala │ ├── WordCount.scala │ ├── graphx │ └── PageRank.scala │ ├── log4j2 │ ├── Logger.scala │ └── SimpleWordCount.scala │ ├── mllib │ └── KmeansModelSaveToOss.scala │ ├── oss │ ├── JindoFsDemo.scala │ └── SparkUnstructuredDataCompute.scala │ ├── sparksql │ └── SparkSQL.scala │ ├── streaming │ ├── common │ │ └── SparkSessionSingleton.scala │ ├── datahub │ │ ├── DataHub2OdpsDemo.scala │ │ └── DataHubStreamingDemo.scala │ ├── kafka │ │ ├── Kafka2OdpsDemo.scala │ │ └── KafkaStreamingDemo.scala │ └── loghub │ │ ├── LogHub2OdpsDemo.scala │ │ └── LogHubStreamingDemo.scala │ ├── structuredStreaming │ ├── datahub │ │ └── DatahubStructuredStreamingDemo.scala │ ├── kafka │ │ └── KafkaStructuredStreamingDemo.scala │ └── loghub │ │ └── LoghubStructuredStreamingDemo.scala │ └── zeppelin │ ├── ZeppelinServer.scala │ └── ZeppelinServerPublic.scala ├── spark-3.x ├── libs │ └── jindofs-sdk-3.7.2.jar ├── pom.xml └── src │ └── main │ ├── java │ └── com │ │ └── aliyun │ │ └── odps │ │ └── spark │ │ └── examples │ │ └── sparksql │ │ └── JavaSparkSQL.java │ ├── python │ ├── spark_oss.py │ └── spark_sql.py │ └── scala │ └── com │ └── aliyun │ └── odps │ └── spark │ └── examples │ ├── SparkPi.scala │ ├── WordCount.scala │ ├── graphx │ └── PageRank.scala │ ├── mllib │ └── KmeansModelSaveToOss.scala │ ├── oss │ ├── JindoFsDemo.scala │ └── SparkUnstructuredDataCompute.scala │ └── sparksql │ └── SparkSQL.scala └── spark-utils ├── libs ├── cupid-sdk-3.3.14.jar └── hadoop-yarn-client-3.3.12.jar ├── pom.xml └── src └── main └── java └── com └── aliyun └── odps └── spark ├── CupidApplicationMetaExample.java └── SparkLauncherTest.java /.github/workflows/deploy-pages.yml: -------------------------------------------------------------------------------- 1 | # Simple workflow for deploying static content to GitHub Pages 2 | name: Deploy static content to Pages 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: ["master"] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: write 17 | 18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
20 | concurrency: 21 | group: "pages" 22 | cancel-in-progress: false 23 | 24 | jobs: 25 | # Single deploy job since we're just deploying 26 | deploy: 27 | environment: 28 | name: github-pages 29 | url: ${{ steps.deployment.outputs.page_url }} 30 | runs-on: ubuntu-latest 31 | steps: 32 | - name: Checkout 33 | uses: actions/checkout@v4 34 | - name: Setup Pages 35 | uses: actions/configure-pages@v5 36 | - id: build-website # 构建website 37 | run: | 38 | cd docs 39 | npm install 40 | npm run build 41 | - name: Upload artifact 42 | uses: actions/upload-pages-artifact@v3 43 | with: 44 | path: 'docs/build' 45 | - name: Deploy to GitHub Pages 46 | id: deployment 47 | uses: actions/deploy-pages@v4 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.#* 3 | *#*# 4 | *.swp 5 | *.ipr 6 | *.iml 7 | *.iws 8 | *.pyc 9 | *.pyo 10 | .idea/ 11 | .idea_modules/ 12 | .settings 13 | .cache 14 | target/ 15 | .project 16 | .classpath 17 | .DS_Store 18 | metastore_db/ 19 | derby.log 20 | log4j.properties 21 | dependency-reduced-pom.xml 22 | 23 | # Dependencies 24 | /docs/node_modules 25 | 26 | # Production 27 | /docs/build 28 | 29 | # Generated files 30 | .docusaurus 31 | .cache-loader -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MaxCompute Spark 2 | 3 | 本文档帮助快速构建可以运行在MaxCompute Spark上的应用,并提供相关API的使用Demo. 4 | 参考[文档](https://github.com/aliyun/MaxCompute-Spark/wiki) 5 | -------------------------------------------------------------------------------- /docs/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "env": { 3 | "browser": true, 4 | "es2021": true 5 | }, 6 | "extends": "plugin:react/recommended", 7 | "overrides": [ 8 | { 9 | "env": { 10 | "node": true 11 | }, 12 | "files": [ 13 | ".eslintrc.{js,cjs}" 14 | ], 15 | "parserOptions": { 16 | "sourceType": "script" 17 | } 18 | } 19 | ], 20 | "parserOptions": { 21 | "ecmaVersion": "latest", 22 | "sourceType": "module" 23 | }, 24 | "plugins": [ 25 | "react" 26 | ], 27 | "rules": { 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | /docusaurus/node_modules 3 | 4 | # Production 5 | /build 6 | 7 | # Generated files 8 | .docusaurus 9 | .cache-loader 10 | 11 | # Misc 12 | .DS_Store 13 | .env.local 14 | .env.development.local 15 | .env.test.local 16 | .env.production.local 17 | 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | 22 | # IDE 23 | .idea 24 | .idea/* -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # ODPS SDK 文档目录 README 2 | 3 | ## 简介 4 | 5 | 此目录用于维护和构建 ODPS SDK 的官方文档,采用 Docusaurus 进行文档的编写与管理,并利用 GitHub Pages 功能将其托管于 [https://aliyun.github.io/aliyun-odps-java-sdk/](https://aliyun.github.io/aliyun-odps-java-sdk/) 。当前文档正处于积极开发阶段,并且仅提供中文版本。 6 | 7 | ## 文档构建工具 - Docusaurus 8 | 9 | Docusaurus 是一个强大的静态站点生成器,特别适合构建和维护开源项目的文档网站。它的中文文档可以在 [Docusaurus 中文官网](https://docusaurus.io/zh-CN/docs) 找到,这里详细介绍了如何开始、配置以及进阶使用 Docusaurus。 10 | 11 | ## 文档目录结构 12 | 13 | 文档源文件位于 `docs/docs` 目录下。请在此目录中添加、修改或删除文档内容。 14 | 15 | ## 
开发环境搭建与本地调试 16 | 17 | ### 初始化项目 18 | 19 | 在 docs 目录下,请确保运行以下命令以安装所有依赖: 20 | 21 | ```bash 22 | yarn install 23 | ``` 24 | 25 | ### 本地运行与预览 26 | 27 | 安装完依赖后,你可以通过以下命令启动本地开发服务器,进行实时预览和调试: 28 | 29 | ```bash 30 | yarn start 31 | ``` 32 | 33 | 这将自动打开浏览器并显示文档的本地预览版。 34 | 35 | ## 部署文档 36 | 37 | ### 当前部署流程 38 | 39 | 目前文档部署为手动过程,但考虑未来可能采用 GitHub Actions 自动化部署。 40 | 41 | #### 手动部署步骤 42 | 43 | 1. 确保你的文档是最新的,并且你已经测试过。 44 | 2. 在项目根目录下的 `docs` 目录中执行以下命令: 45 | 46 | ```bash 47 | USE_SSH=true yarn deploy 48 | ``` 49 | 50 | 该命令会使用 SSH 方式(如果配置了)将编译好的网站发布到 `gh-pages` 分支。此过程包括创建一个临时目录,复制编译后的文件至该目录,然后推送至 GitHub。 51 | 52 | #### 注意事项 53 | 54 | - 如果因 Git Hooks 或其他原因导致自动推送失败,你可以手动进入该临时目录,并执行 `git push` 来完成部署。 55 | - 确保你有正确的权限推送至 `gh-pages` 分支。 56 | 57 | --- 58 | 59 | 文档持续更新中,对于任何问题、建议或想要贡献的意愿,请随时开启 Issue 或发起 Pull Request。让我们共同完善 ODPS SDK 的文档资源! -------------------------------------------------------------------------------- /docs/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [require.resolve('@docusaurus/core/lib/babel/preset')], 3 | }; 4 | -------------------------------------------------------------------------------- /docs/docs/advanced/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "增强功能", 3 | "position": 5, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/advanced/eni-migration.md: -------------------------------------------------------------------------------- 1 | # ENI专线访问VPC迁移指南 2 | ## 迁移意义 3 | 相比原先访问Aliyun VPC内的用户实例的方式([文档](https://github.com/aliyun/MaxCompute-Spark/wiki/09.-VPC-Access文档说明)) , ENI专线方式更加稳定,性能更好,且具备公网访问能力。 4 | 5 | ## 注意事项 6 | (1)ENI专线可以打通一个VPC,如果用户作业需要同时访问多个VPC,则可以将 **已经通过ENI专线打通的Vpc** 与 **其他Vpc** 之间再做打通即可,详情咨询**阿里云专有网络相关的技术支持**。 7 | 8 | (2)Spark作业运行所在 MaxCompute Project 的主账号 Uid 与目标 vpc 所属的主账号Uid必须一致,否则作业运行时会提示以下报错信息:"You are not allowed to use this vpc - vpc owner and project owner must be the same person"。 9 | 10 | ## 迁移步骤 11 | ### 1.提供VPC相关的信息 12 | 用户需要提供: 13 | - Vpc所在region: 例如:上海,杭州,北京等 14 | - UserId: MaxCompute Project 所属的用户主账号 Id(同时也是目标Vpc所属的主账号Uid) 15 | - VpcId: 即用户需要通过ENI专线打通的目标Vpc Id 16 | - VSwitchId: 即目标Vpc中的一个交换机Id,可以在 Vpc 管理控制台界面创建/查看,若有多个从中选取一个即可 17 | - SecurityGroupId:即目标Vpc中的一个安全组Id。**用户需要在目标Vpc下新建一个安全组**,用于对 MaxCompute Spark 访问 VPC 服务时进行访问控制 18 | 19 | 其中,新建安全组的流程如下所示: 20 | 21 | a). 在目标 VPC 中,创建安全组: 22 | ![image1](../resources/ENI-1.png) 23 | 24 | b). 在目标 VPC 配置页面中,创建安全组(在页面下方): 25 | ![image1](../resources/ENI-2.png) 26 | 27 | c). 在安全组列表中,进入“创建安全组” 28 | ![image1](../resources/ENI-3.png) 29 | 30 | d). 输入1.“安全组名称” 2.“网络” (请选择 MaxCompute 连接的目标 VPC) 3.“安全组类型” 选择普通安全组 31 | ![image1](../resources/ENI-4.png) 32 | 33 | e). 
提供这个新建安全组的 id即可: 34 | ![image1](../resources/ENI-5.png) 35 | 36 | 37 | 38 | ### 2.ENI授权 39 | 对ENI进行授权,该步授权的目的在于允许 MaxCompute 在用户 VPC 内创建 ENI 网卡,以实现 MaxCompute 到用户 VPC 的连通。 40 | 用户只要使用主账号在**登录态**下点击以下链接进行授权即可: 41 | ``` 42 | https://ram.console.aliyun.com/#/role/authorize?request=%7B%22Requests%22%3A%7B%22request1%22%3A%7B%22RoleName%22%3A%22AliyunODPSRoleForENI%22%2C%22TemplateId%22%3A%22AliyunODPSRoleForENI%22%7D%7D%2C%22ReturnUrl%22%3A%22https%3A%2F%2Fram.console.aliyun.com%2Froles%22%2C%22Service%22%3A%22ODPS%22%7D 43 | ``` 44 | 45 | ### 3.等待MaxCompute平台官方人员为您完成专线开通 46 | 47 | ### 4.安全组规则配置 48 | 在ENI专线开通完成后,用户还需要在要访问的服务中增加相关安全规则,授权代表MaxCompute的那个安全组(即上述第1步中提供的安全组)能访问哪些服务的具体端口(比如9200, 31000等)。 49 | 50 | 例如:用户需要访问 阿里云 RDS,则需要在 RDS 中增加规则,允许第1步中创建的安全组访问。**如果用户需要访问的服务无法添加安全组,只能添加Ip**,那么需要将第一步中所使用的VSwitch网段都添加进来。 51 | 52 | ### 5.用户作业配置 53 | 运行spark作业,需要增加下面两个配置,就可以使用ENI专线连通目标VPC内的服务: 54 | ``` 55 | spark.hadoop.odps.cupid.eni.enable = true 56 | spark.hadoop.odps.cupid.eni.info = cn-beijing:vpc-********** 57 | 这个配置格式是region:vpcid,其中vpcid就是前面打通ENI专线的那个vpcid 58 | ``` 59 | 原先Spark作业中的VPC相关操作和配置**不再需要**: 60 | ``` 61 | spark.hadoop.odps.cupid.vpc.domain.list 62 | spark.hadoop.odps.cupid.smartnat.enable 63 | spark.hadoop.odps.cupid.pvtz.rolearn(访问自定义域名) 64 | spark.hadoop.odps.cupid.vpc.usepvtz(访问自定义域名) 65 | ``` -------------------------------------------------------------------------------- /docs/docs/advanced/jindo-sdk.md: -------------------------------------------------------------------------------- 1 | # Jindo sdk接入说明 2 | 参考[jindo-sdk的说明](https://github.com/aliyun/alibabacloud-jindodata/blob/master/docs/spark/jindosdk_on_spark.md),jindo-sdk接入有如下几个步骤。 3 | 4 | - spark默认使用hadoop-oss,增加特殊配置项才可以改为使用jindo-sdk。 5 | - 设置访问OSS需要的配置 6 | - 部署spark应用。 7 | 8 | > jindo-sdk 相比于hadoop-oss 使用更多的本地磁盘空间,如果出现*No space left on device*,可以调整`spark.hadoop.odps.cupid.disk.driver.device_size`增大本地磁盘空间。 9 | 10 | ## 引用jindo-sdk 11 | 12 | 修改spark-defaults.conf增加配置项,增加spark.hadoop.odps.cupid.resources配置。使用外部文件的方法参考[引用外部文件](https://github.com/aliyun/MaxCompute-Spark/wiki/06.-%E5%BC%95%E7%94%A8%E5%A4%96%E9%83%A8%E6%96%87%E4%BB%B6%E9%97%AE%E9%A2%98),样例配置如下: 13 | 14 | ```text 15 | spark.hadoop.odps.cupid.resources = public.jindofs-sdk-3.7.2.jar 16 | ``` 17 | 18 | ## 使用jindo-sdk 19 | 20 | 在`SparkConf`中设置`spark.hadoop.fs.AbstractFileSystem.oss.impl`及`spark.hadoop.fs.oss.impl`, 样例代码如下: 21 | 22 | ```scala 23 | val conf = new SparkConf() 24 | .setAppName("jindo-sdk-demo") 25 | .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 26 | .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 27 | ``` 28 | 29 | ## 配置OSS 30 | 31 | 涉及到的配置项有Oss Endpoint和Oss鉴权参数,参考[访问OSS](https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E)获得合法的Endpoint值和OSS鉴权参数值。OSS鉴权有两种方式AccessKey鉴权及云服务角色扮演,不同鉴权方式需要使用不同的鉴权参数。 32 | 33 | 34 | ## 使用AccessKey鉴权 35 | 36 | spark-defaults.conf无需变更, `SparkConf`中设置`spark.hadoop.fs.oss.endpoint`、`spark.hadoop.fs.oss.accessKeyId`、`spark.hadoop.fs.oss.accessKeySecret`。 37 | 38 | ```scala 39 | val conf = new SparkConf() 40 | .setAppName("jindo-sdk-demo") 41 | .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 42 | .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 43 | 44 | # 配置endpoint 45 | .set("spark.hadoop.fs.oss.endpoint", "endpoint-value") 46 | 47 | # 配置access-key鉴权参数 48 | .set("spark.hadoop.fs.oss.accessKeyId", "xxx") 49 | .set("spark.hadoop.fs.oss.accessKeySecret", "xxx") 50 | ``` 
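完成上述配置后,即可像读写普通文件系统一样读写 `oss://` 路径。下面给出一段示意代码(其中 bucket 名称与输出路径均为占位符,并非真实存在,请替换为实际值),演示基于上述 `conf` 创建 SparkSession 并对 OSS 做一次写入和读取验证:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object JindoOssReadWriteDemo {
  def main(args: Array[String]): Unit = {
    // 沿用上文:jindo-sdk 实现类 + AccessKey 鉴权配置
    val conf = new SparkConf()
      .setAppName("jindo-sdk-demo")
      .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS")
      .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem")
      .set("spark.hadoop.fs.oss.endpoint", "endpoint-value")
      .set("spark.hadoop.fs.oss.accessKeyId", "xxx")
      .set("spark.hadoop.fs.oss.accessKeySecret", "xxx")

    val spark = SparkSession.builder().config(conf).getOrCreate()

    // 写入 OSS,路径仅为示意,请替换为实际 bucket
    val outputPath = "oss://your-bucket/jindo-sdk-demo/output"
    spark.sparkContext.parallelize(1 to 100, 2).saveAsTextFile(outputPath)

    // 读回并做简单校验
    val count = spark.sparkContext.textFile(outputPath).count()
    println(s"read back $count lines from $outputPath")

    spark.stop()
  }
}
```

注意:`spark.hadoop.odps.cupid.resources = public.jindofs-sdk-3.7.2.jar` 这类资源引用仍需按前文说明写在 spark-defaults.conf 或 DataWorks 配置项中,不能放在代码里。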
51 | 52 | ## 使用云服务角色鉴权 53 | 云服务角色描述字符串格式为`acs:ram::12345678:role/${role-name}`,其中纯数字部分'12345678'是aliyun-uid,斜线后面的字符串是角色名称。这两个值需要配置在spark应用里。 54 | 55 | spark-defaults.conf需要添加`spark.hadoop.odps.cupid.http.server.enable`, 如下: 56 | ```text 57 | spark.hadoop.odps.cupid.http.server.enable = true 58 | ``` 59 | 60 | `SparkConf`中设置`spark.hadoop.odps.cupid.http.server.enable`、`spark.hadoop.fs.jfs.cache.oss.credentials.provider`、`spark.hadoop.aliyun.oss.provider.url`, 样例代码如下: 61 | 62 | ```scala 63 | val conf = new SparkConf() 64 | .setAppName("jindo-sdk-demo") 65 | .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 66 | .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 67 | 68 | # 配置endpoint 69 | .set("spark.hadoop.fs.oss.endpoint", "endpoint-value") 70 | 71 | # 配置云服务角色鉴权 72 | # ${aliyun-uid}是阿里云用户UID 73 | # ${role-name}是角色名称 74 | .set("spark.hadoop.fs.jfs.cache.oss.credentials.provider", "com.aliyun.emr.fs.auth.CustomCredentialsProvider") 75 | .set("spark.hadoop.aliyun.oss.provider.url", "http://localhost:10011/sts-token-info?user_id=${aliyun-uid}&role=${role-name}") 76 | ``` 77 | 78 | ## 打包上传 79 | 80 | ```shell 81 | ./bin/spark-submit --class xxx spark-app.jar 82 | ``` 83 | 84 | 85 | -------------------------------------------------------------------------------- /docs/docs/advanced/oss-access.md: -------------------------------------------------------------------------------- 1 | # Oss Access文档说明 2 | ## Oss Endpoint 配置 3 | 4 | 本机调试时使用对应Region的外网Endpoint,提交集群需替换为VPC内网Endpoint 5 | 6 | [Region和Endpoint对照表](https://help.aliyun.com/document_detail/31837.html?spm=a2c4g.11174283.6.585.5f2d7da2svYAQx#title-qvx-r3a-xr4) 7 | 8 | ## 网络白名单配置 9 | 1. 默认情况下无需设置可以直接访问; 10 | 2. 如发现无法访问,设置 spark.hadoop.odps.cupid.trusted.services.access.list=[yourbucketname].oss-xxxxxx-internal.aliyuncs.com(yarn-cluster模式使用,该配置项必须放在配置文件或是命令行提交参数里) 11 | 12 | ## Oss Id/Key访问方式 13 | 14 | ``` 15 | spark.hadoop.fs.oss.accessKeyId = xxxxxx 16 | spark.hadoop.fs.oss.accessKeySecret = xxxxxx 17 | spark.hadoop.fs.oss.endpoint = oss-xxxxxx-internal.aliyuncs.com 18 | ``` 19 | 20 | 21 | ## OssStsToken 授权访问方式 22 | > 一般来说,Spark提供直接通过OSS AccessId以及AccessKey的方式直接访问OSS资源,但是此方式需要明文将AccessId以及AccessKey写在用户代码或者用户配置中,不是一种安全的访问方式,本文档提供一种更加安全的方式访问OSS资源 23 | 24 | ## 授权MaxCompute以StsToken的方式访问OSS 25 | 26 | 点击下方的一键授权链接,可以把当前云账号的OSS资源通过StsToken的方式授权给MaxCompute的Project直接访问,前提是,该MaxCompute的ProjectOwner也是此云账号 27 | 28 | [一键授权](https://ram.console.aliyun.com/?spm=a2c4g.11186623.2.9.3bf06a064lrBYN#/role/authorize?request=%7B%22Requests%22:%20%7B%22request1%22:%20%7B%22RoleName%22:%20%22AliyunODPSDefaultRole%22,%20%22TemplateId%22:%20%22DefaultRole%22%7D%7D,%20%22ReturnUrl%22:%20%22https:%2F%2Fram.console.aliyun.com%2F%22,%20%22Service%22:%20%22ODPS%22%7D) 29 | 30 | ## 获取roleArn 31 | 32 | 通过上述的授权后,只需要在Spark配置里加上下面的配置就可以访问OSS资源 33 | 34 | ``` 35 | # 此配置表明Spark是通过StsToken去访问OSS资源 36 | spark.hadoop.fs.oss.credentials.provider=org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider 37 | 38 | # 此配置是上述一键授权后产生的一个roleArn_ID,授权后可以去访问控制->角色管理获取AliyunODPSDefaultRole Arn信息 39 | spark.hadoop.fs.oss.ststoken.roleArn=acs:ram::xxxxxxxxxxxxxxx:role/aliyunodpsdefaultrole 40 | 41 | # 此配置是OSS资源对应的VPC访问endpoint 不同的region可能会不同 42 | # 请访问 https://oss.console.aliyun.com/index 确认对应的 endpoint 43 | spark.hadoop.fs.oss.endpoint=oss-cn-hangzhou-internal.aliyuncs.com 44 | ``` 45 | 46 | 下面讲一下如何获取roleArn 47 | 48 | * 登录 https://ram.console.aliyun.com/ 49 | * 点击角色管理 50 | * 如果已经执行过一键授权,则会有一个**AliyunODPSDefaultRole**的记录存在 51 | * 
点击管理,页面会跳转到详情页,可以看到一个这样格式的ID `acs:ram::xxxxxxxxxxxxxxx:role/aliyunodpsdefaultrole` 52 | 53 | ![](../resources/oss-1.jpg) 54 | 55 | ![](../resources/oss-2.jpg) 56 | 57 | ![](../resources/oss-3.jpg) -------------------------------------------------------------------------------- /docs/docs/advanced/zeppelin-integration.md: -------------------------------------------------------------------------------- 1 | # MaxCompute Spark支持交互式Zeppelin 2 | 由于安全原因,用户无法触达生产集群的网络,所以MaxCompute Spark一直没有放开 `yarn-client`的支持,也就是`Spark-Shell`,`Spark-SQL`以及`PYSPARK`等交互式功能一直无法支持。Zeppelin on MaxCompute Spark可以在一定程度上支持用户交互式需求。这个模式相对于local模式更有力的地方是,这个模式其实是真实用了yarn-cluster模式运行着的,local模式仅仅能验证语法是否正确,而zeppelin模式能以分布式的方式提供交互式查询,这个对于那种需要关注性能结果的debugging是有帮助的。 3 | 4 | ## 步骤说明 5 | 6 | * 一键启动脚本: 7 | * spark 2.3 见 [spark-zeppelin-public.sh](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark-zeppelin-public/spark-zeppelin-public.sh) 8 | * spark 2.4 见 [spark-zeppelin-public-2.4.sh](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark-zeppelin-public/spark-zeppelin-public-2.4.sh) 9 | * 下载脚本到本地后,运行 `sh spark-zeppelin-public.sh` 后,会自动下载相关组件如下 10 | * spark-zeppelin-public.conf 11 | * spark-zeppelin-public.jar 12 | * spark-2.3.0-odps0.32.1.tar.gz 13 | * 第一次运行脚本会出现以下错误,这是因为默认的`spark-zeppelin-public.conf`并没有配置accessId,accessKey,projectName 14 | 15 | ``` 16 | linxuewei:spark-zeppelin-public linxuewei$ sh spark-zeppelin-public.sh 17 | working dir: /Users/linxuewei/Desktop/spark-zeppelin-public 18 | download spark-zeppelin-public.conf 19 | download spark-zeppelin-public.jar 20 | download spark-2.3.0-odps0.32.1.tar.gz 21 | extract spark-2.3.0-odps0.32.1.tar.gz 22 | export SPARK_HOME 23 | spark-zeppelin-public.conf checking 24 | TBD count is 3, plz check config make sure id key project is written! 25 | config check failed, plz set id key project in spark-zeppelin-public.conf 26 | ``` 27 | 28 | * 注意Spark 2.4.5需要添加 `spark.sql.catalogImplementation = hive 和 spark.sql.sources.default = hive` 之后再运行 `sh spark-zeppelin-public.sh` 29 | * Spark 2.4.5添加 `spark.hadoop.odps.spark.libs.public.enable=true`和`spark.hadoop.odps.spark.version=spark-2.4.5-odps0.33.1` 这两个参数可以加速包上传速度 30 | 31 | * 正常配置 `spark-zeppelin-public.conf` 之后再运行 `sh spark-zeppelin-public.sh` 32 | 33 | ``` 34 | linxuewei:spark-zeppelin-public linxuewei$ sh spark-zeppelin-public.sh 35 | working dir: /Users/linxuewei/Desktop/spark-zeppelin-public 36 | export SPARK_HOME 37 | spark-zeppelin-public.conf checking 38 | config check passed, start spark-submit 39 | 40 | 就会启动一个MaxCompute Spark作业,等待作业执行结束之后,可以回溯日志,找到logview 41 | 42 | http://logview.odps.aliyun.com/logview/?h=http://service.cn.maxcompute.aliyun.com/api&p=zky_test&i=20190710044052214gy6kc292&token=eXN6eFlsNmQzOFV4dUIzVEVndm9KQUtVSlVNPSxPRFBTX09CTzpwNF8yNDcwNjM5MjQ1NDg0NDc5NzksMTU2Mjk5Mjg1Mix7IlN0YXRlbWVudCI6W3siQWN0aW9uIjpbIm9kcHM6UmVhZCJdLCJFZmZlY3QiOiJBbGxvdyIsIlJlc291cmNlIjpbImFjczpvZHBzOio6cHJvamVjdHMvemt5X3Rlc3QvaW5zdGFuY2VzLzIwMTkwNzEwMDQ0MDUyMjE0Z3k2a2MyOTIiXX1dLCJWZXJzaW9uIjoiMSJ9 43 | ``` 44 | 45 | * 打开 `logview` 点击 `master-0` 点击 `StdOut` 46 | 47 | ![image.png](https://ata2-img.cn-hangzhou.oss-pub.aliyun-inc.com/425b961b2b3074622b41068e9a78409f.png) 48 | 49 | ``` 50 | # 日志中的这个url,就是zeppelin server的地址了 51 | # 直接复制粘贴到浏览器上即可访问,弹出的url会需要云账号的登录 52 | Please visit the following url for zeppelin interaction. 
53 | http://20190710044052214gy6kc292-zeppelin.open.maxcompute.aliyun.com 54 | Log dir doesn't exist, create /worker/zeppelin_logs/ 55 | Pid dir doesn't exist, create /worker/zeppelin_pids/ 56 | Zeppelin start [ OK ] 57 | ``` 58 | 59 | * 打开 `zeppelin url` 打开 `Examples` Notebook,有时候页面会显示endpoint not exist的日志,这是因为zeppelin还没有启动完毕的情况,稍等片刻就可以 60 | 61 | ![image.png](https://ata2-img.cn-hangzhou.oss-pub.aliyun-inc.com/9f3c2496ba6f0d1cb827e5a6b81ee44a.png) 62 | 63 | * 如果页面弹出一个 `interpreter binding`的页面,直接点击Save即可,然后再点击ToolBar上的运行所有按钮即可执行Notebook上的代码的执行 64 | 65 | ![image.png](https://ata2-img.cn-hangzhou.oss-pub.aliyun-inc.com/b9b8404f9bcd49e9464074e9860c2272.png) 66 | 67 | * 从 examples 样例中我们可以看到,NoteBook支持三种语法 68 | * 以 `%spark` 开头表示 scala 执行器 如果不写就默认是这个模式 69 | * 以 `%sql` 开头表示 spark-sql 执行器,默认用ODPS External Catalog 70 | * 以 `pyspark` 开头表示 pyspark 执行器,默认用我们打包好的 python2.7 71 | 72 | ## 资源释放 73 | 74 | 本质上Zeppelin Server on MaxCompute Spark还是一个Spark作业,默认这个作业会存活三天,如果你想手动关闭这个作业的话,就请用odpscmd,用`kill ;` 命令来停止作业释放资源吧。 -------------------------------------------------------------------------------- /docs/docs/config/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "配置参考", 3 | "position": 4, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/config/configuration.md: -------------------------------------------------------------------------------- 1 | # Spark配置详解 2 | ## MaxCompute账号相关配置 3 | 4 | * `spark.hadoop.odps.project.name` 5 | + **默认值** `无` 6 | + **配置说明** `MaxCompute项目名称` 7 | * `spark.hadoop.odps.access.id` 8 | + **默认值** `无` 9 | + **配置说明** `MaxCompute项目accessId` 10 | * `spark.hadoop.odps.access.key` 11 | + **默认值** `无` 12 | + **配置说明** `MaxCompute项目accessKey` 13 | * `spark.hadoop.odps.access.security.token` 14 | + **默认值** `无` 15 | + **配置说明** `MaxCompute项目STS Token` 16 | * `spark.hadoop.odps.end.point` 17 | + **建议值** 可以采用中国公共云通用外网endpoint:`http://service.cn.maxcompute.aliyun.com/api`,也可以采用各自region独享的endpoint,参考文档[外网Endpoint](https://help.aliyun.com/document_detail/34951.html?spm=5176.11065259.1996646101.searchclickresult.58c77a0dlXCR54) 18 | + **配置说明** `MaxCompute项目endPoint` 19 | 20 | ## Spark版本配置 21 | * `spark.hadoop.odps.spark.version` 22 | + **默认值** `spark-2.3.0-odps0.33.0,如果使用spark-2.4.5, 请将该参数设置为spark-2.4.5-odps0.34.0,如果使用spark-3.1.1, 请将该参数设置为spark-3.1.1-odps0.34.1` 23 | + **配置说明** `该值指定了提交spark任务所用的spark版本` 24 | + **注意** `可以通过该配置切换到spark-2.4.5/spark-3.1.1` 25 | 26 | * `spark.hadoop.odps.spark.libs.public.enable` 27 | + **默认值** `false` 28 | + **配置说明** `设置为true之后,可以免上传jars,直接从服务端拉取,加速上传` 29 | + **注意** `需要同时配置spark.hadoop.odps.spark.version指定版本后才能生效` 30 | 31 | ## 资源申请相关配置 32 | 33 | * `spark.executor.instances` 34 | + **默认值** `1` 35 | + **配置说明** `executor worker个数` 36 | * `spark.executor.cores` 37 | + **默认值** `1` 38 | + **配置说明** `executor worker核数` 39 | * `spark.executor.memory` 40 | + **默认值** `2g` 41 | + **配置说明** `executor worker内存` 42 | * `spark.driver.cores` 43 | + **默认值** `1` 44 | + **配置说明** `driver核数` 45 | * `spark.driver.memory` 46 | + **默认值** `2g` 47 | + **配置说明** `driver内存` 48 | * `spark.master` 49 | + **默认值** `yarn-cluster` 50 | + **配置说明** `作业提交运行方式,目前支持yarn-cluster以及local[N]` 51 | * `spark.yarn.executor.memoryOverhead` 52 | + **默认值** `参考社区配置` 53 | + **配置说明** `当堆外内存使用比较多时建议提高此值避免整体内存超出被Kill` 54 | + **注意** `单个executor的内存总量是spark.executor.memory+spark.yarn.executor.memoryOverhead` 55 | * `spark.yarn.driver.memoryOverhead` 56 | + **默认值** 
`参考社区配置` 57 | + **配置说明** `当堆外内存使用比较多时建议提高此值避免整体内存超出被Kill` 58 | + **注意** `driver的内存总量是spark.driver.memory+spark.yarn.driver.memoryOverhead` 59 | * `spark.hadoop.odps.cupid.disk.driver.device_size` 60 | + **默认值** `20g` 61 | + **配置说明** `本地网盘大小,当出现No space left on device时可适当调大该值,最大支持100g` 62 | + **注意** `注意:必须配置在spark-conf文件或者dataworks的配置项中,不能配置在代码中` 63 | 64 | ## MaxCompute数据互通配置 65 | 66 | * `spark.sql.catalogImplementation` 67 | + **配置说明** `spark 2.3.0 需要设置为odps,spark 2.4.5及以上的版本需要设置hive` 68 | * `spark.hadoop.odps.cupid.resources` 69 | + **配置说明** `该配置项指定了任务运行所需要的`[Maxcompute资源](https://help.aliyun.com/document_detail/27831.html?spm=5176.11065259.1996646101.searchclickresult.d55650ea0QU1qd&aly_as=45TiiTdO2),`格式为.,可指定多个,逗号分隔` 70 | + **配置示例** spark.hadoop.odps.cupid.resources=public.python-python-2.7-ucs4.zip,public.myjar.jar 71 | + **使用说明** `指定的资源将被下载到driver和executor的当前工作目录,资源下载到工作目录后默认的名字是.` 72 | + **文件重命名** `在配置时通过.:进行重命名` 73 | + **重命名示例** spark.hadoop.odps.cupid.resources=public.myjar.jar:myjar.jar 74 | + **注意** `该配置项必须要配置在spark-default.conf中或dataworks的配置项中才能生效,而不能写在代码中` 75 | * `spark.hadoop.odps.cupid.vectorization.enable` 76 | + **建议值** `true` 77 | + **配置说明** `当设置为true时,会应用批读写优化,读写数据性能显著提升。 78 | * `spark.hadoop.odps.input.split.size` 79 | + **默认值** `256` 80 | + **配置说明** `该配置可以用来调节读Maxcompute表的并发度,默认每个分区为256MB 81 | 82 | 83 | ## OSS相关配置 84 | 85 | * `spark.hadoop.fs.oss.endpoint` 86 | + **建议值** `无` 87 | + **配置说明** `阿里云OSS控制台上可查看Bucket对应的endpoint` 88 | * `spark.hadoop.fs.oss.ststoken.roleArn` 89 | + **建议值** `无` 90 | + **配置说明** `StsToken授权方式` 91 | * `spark.hadoop.fs.oss.credentials.provider` 92 | + **建议值** `org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider` 93 | + **配置说明** `StsToken授权方式` 94 | 95 | [OSS StsToken授权步骤](https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E) 96 | 97 | ## VPC服务访问相关配置 98 | 99 | * `spark.hadoop.odps.cupid.vpc.domain.list` 100 | + **建议值** `无` 101 | + **配置说明** `参见以下JSON格式 配置值为压缩去除空格后的字符串` 102 | + **压缩为字符串网址** http://www.bejson.com/ 103 | 104 | ``` 105 | See. 
http://www.bejson.com/ 106 | 粘贴VPC Domain List内容并选择压缩得到压缩于一行的字符串作为spark.hadoop.odps.cupid.vpc.domain.list的配置值 107 | { 108 | "regionId": "cn-beijing", 109 | "vpcs": [ 110 | { 111 | "vpcId": "vpc-2zeaeq21mb1dmkqh0exox", 112 | "zones": [ 113 | { 114 | "urls": [ 115 | { 116 | "domain": "zky-test", 117 | "port": 9092 118 | } 119 | ], 120 | "zoneId": "9b7ce89c6a6090e114e0f7c415ed9fef" 121 | } 122 | ] 123 | } 124 | ] 125 | } 126 | ``` 127 | * `spark.hadoop.odps.cupid.pvtz.rolearn` 128 | + **建议值** `acs:ram::********:role/aliyunodpsdefaultrole` 129 | + **配置说明** `当spark作业需要访问云上其他VPC域内服务,比如redis、mysql、kafka等等需要配置该参数` 130 | * `spark.hadoop.odps.cupid.smartnat.enable` 131 | + **配置说明** `北京和上海region需要配置该参数为true` 132 | 133 | [VPC访问文档说明](https://github.com/aliyun/MaxCompute-Spark/wiki/09.-VPC-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E) 134 | 135 | ## 流式作业相关配置 136 | 137 | * `spark.hadoop.odps.cupid.engine.running.type` 138 | + **建议值** `longtime` 139 | + **配置说明** `普通作业3天没跑完就会被强制回收,流式作业需要设置此值` 140 | * `spark.hadoop.odps.cupid.job.capability.duration.hours` 141 | + **建议值** `8640` 142 | + **配置说明** `流式作业权限文件expired时间,单位小时` 143 | * `spark.hadoop.odps.moye.trackurl.dutation` 144 | + **建议值** `8640` 145 | + **配置说明** `流式作业jobview expired时间,单位小时` 146 | * `spark.yarn.maxAppAttempts` 147 | + **建议值** `5` 148 | + **配置说明** `流式作业failover次数限制` 149 | * `spark.yarn.am.maxAttemptValidityInterval` 150 | + **建议值** `1h` 151 | + **配置说明** `流式作业failover次数限制窗口验证` 152 | 153 | ## 灰度相关配置 154 | 155 | * `spark.hadoop.odps.task.major.version` 156 | + **建议值** `default` 157 | 158 | ## 隔离相关配置 159 | 160 | * `spark.hadoop.odps.cupid.container.image.enable` 161 | + **建议值** `true` 162 | + **配置说明** `安全隔离相关配置请保持默认值,专有云需要去掉该配置` 163 | * `spark.hadoop.odps.cupid.container.vm.engine.type` 164 | + **建议值** `hyper` 165 | + **配置说明** `安全隔离相关配置请保持默认值,专有云需要去掉该配置` 166 | -------------------------------------------------------------------------------- /docs/docs/config/resource-autoscaling.md: -------------------------------------------------------------------------------- 1 | # 动态资源伸缩问题 2 | ## Spark 2.4.5/3.1.1 支持动态资源伸缩 3 | * 首先需要切换到spark-2.4.5-odps0.34.0版本 4 | 5 | ``` 6 | * 从Dataworks提交任务,需要添加配置:spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0,从而切换到新的spark版本 7 | 8 | * 从本地提交任务,需要添加以下两个配置: 9 | spark.hadoop.odps.spark.libs.public.enable=true 10 | spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0 11 | 12 | * spark-3.1.1采用客户端提交可以直接使用动态资源伸缩功能 13 | ``` 14 | 15 | * 此外需要添加以下spark参数: 16 | ``` 17 | spark.dynamicAllocation.shuffleTracking.enabled = true (默认 false) 18 | spark.dynamicAllocation.shuffleTracking.timeout = XXXs (默认 Long.MaxValue MILLISECONDS) 19 | spark.dynamicAllocation.enabled = true 20 | 21 | 参考文档:https://spark.apache.org/docs/3.0.0/configuration.html#dynamic-allocation 22 | ``` -------------------------------------------------------------------------------- /docs/docs/development/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "开发指南", 3 | "position": 3, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/development/ops/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "运维监控", 3 | "position": 2, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/development/ops/job-diagnosis.md: 
-------------------------------------------------------------------------------- 1 | # 作业诊断 2 | 快速导航 3 | + [使用Logview工具诊断作业](#1) 4 | + [使用Spark-Web-UI诊断作业](#2) 5 | + [寻求开发介入帮助](#3) 6 | ------------------ 7 | 8 | 提交作业后,一般来说客户端会输出logview和jobview这两个Url可以帮助作业诊断,这两个Url无论是问题自查、结果查看还是寻求开发人员帮助都是十分重要的手段。此外,用户还可以通过[Cupid Console](https://developer.aliyun.com/article/745038?spm=a2c6h.12873581.0.0.ebaa95b08Drzws&groupCode=maxcompute)来获取当前project正在运行的logview和jobview信息。 9 | 下面给出一个Demo日志作为演示。 10 | 11 | ``` 12 | 19/06/11 11:56:41 INFO YarnClientImplUtil: logview url: http://logview.odps.aliyun.com/logview/?h=http://service.cn.maxcompute.aliyun.com/api&p=lightning&i=2019061103564120glypjv21&token=dUxYakFMRUFrc25oNjg5TDk1azhRZTlYVldvPSxPRFBTX09CTzpwNF8yNDcwNjM5MjQ1NDg0NDc5NzksMTU2MDQ4NDYwMSx7IlN0YXRlbWVudCI6W3siQWN0aW9uIjpbIm9kcHM6UmVhZCJdLCJFZmZlY3QiOiJBbGxvdyIsIlJlc291cmNlIjpbImFjczpvZHBzOio6cHJvamVjdHMvbGlnaHRuaW5nL2luc3RhbmNlcy8yMDE5MDYxMTAzNTY0MTIwZ2x5cGp2MjEiXX1dLCJWZXJzaW9uIjoiMSJ9 13 | 19/06/11 11:56:41 INFO CupidUtil: ready!!! 14 | 19/06/11 11:57:08 INFO YarnClientImpl: Submitted applicationType SPARK application application_1560225394361_1217133882 to ResourceManager at instanceId 2019061103564120glypjv21 15 | 19/06/11 11:57:09 INFO SubmitJobUtil: submitting CupidTask with ALIYUN type, operator: GetApplicationMeta 16 | 19/06/11 11:57:09 INFO CupidUtil: getApplicationMeta 17 | 19/06/11 11:57:11 INFO Client: Application report for application_1560225394361_1217133882 (state: RUNNING) 18 | 19/06/11 11:57:11 INFO Client: 19 | client token: N/A 20 | diagnostics: diagnostics 21 | ApplicationMaster host: 11.222.166.90 22 | ApplicationMaster RPC port: 38965 23 | queue: queue 24 | start time: 1560225401092 25 | final status: UNDEFINED 26 | tracking URL: http://jobview.odps.aliyun.com/proxyview/jobview/?h=http://service.cn.maxcompute.aliyun-inc.com/api&p=lightning&i=2019061103564120glypjv21&t=spark&id=application_1560225394361_1217133882&metaname=2019061103564120glypjv21&token=MHJISzg3OVlKZWJTZ3VCSllzUEMzVnF5KzNJPSxPRFBTX09CTzpwNF8yNDcwNjM5MjQ1NDg0NDc5NzksMTU2MDQ4NDYzMSx7IlN0YXRlbWVudCI6W3siQWN0aW9uIjpbIm9kcHM6UmVhZCJdLCJFZmZlY3QiOiJBbGxvdyIsIlJlc291cmNlIjpbImFjczpvZHBzOio6cHJvamVjdHMvbGlnaHRuaW5nL2luc3RhbmNlcy8yMDE5MDYxMTAzNTY0MTIwZ2x5cGp2MjEiXX1dLCJWZXJzaW9uIjoiMSJ9 27 | user: user 28 | ``` 29 | ## JobView 30 | 以`jobview.odps.aliyun.com`开头的Url,我们统称为Jobview,Jobview是上一代Spark UI和HistoryServer,使用时会在一些稳定性问题,因此**不再推荐**用户使用,可以直接在logview中找到Spark UI和History Server的链接来排查问题,见下文。 31 | 32 |

使用Logview工具诊断作业

33 | 34 | 以`logview.odps.aliyun.com`开头的Url,我们统称为Logview,这个MaxCompute自研的分布式作业Tracing工具,通过这个工具我们可以: 35 | 36 | * 获取该作业状态 37 | * 获取该作业各节点起停调度信息 38 | * 获取该作业各节点的标准输入输出日志 (一般Spark结果输出建议打印到StdOut,Spark的log4j日志则默认输出到StdErr) 39 | + 通过log插件打印的日志会显示在StdErr中 40 | + 通过System.out.println()输出到控制台的日志会显示在StdOut中 41 | * Logview的时效性一般是3~5天,甚至其中是StdOut、StdErr很可能会因为本地磁盘满了被清理掉 42 | 43 | * **Logview 2.0包含Sensor功能,可以查看master以及每个worker在运行时的内存和cpu使用情况** 44 | 45 | * 打开Logview,可以在浏览器看到以下页面,有作业完成状态以及时间等信息 46 | ![image1](../../resources/logview-1.jpg) 47 | 48 | * 点击Detail按钮可以查看作业进一步详情,master-0代表Spark Driver所在节点 49 | ![image2](../../resources/logview-2.jpg) 50 | 51 | * 点击master-0按钮并选择All Tab可以看到Spark Driver节点的具体信息 52 | ![image3](../../resources/logview-3.jpg) 53 | 54 | * 点击StdOut按钮可以看到节点结果输出 55 | ![image4](../../resources/logview-4.jpg) 56 | 57 | * 点击StdErr按钮可以看到节点log4j日志 58 | ![image5](../../resources/logview-5.jpg) 59 | 60 |
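结合上面对 StdOut/StdErr 的说明,下面给出一段示意代码(仅用于说明输出去向,类名与输出内容均为虚构):`println` 的结果会出现在 Logview 对应节点的 StdOut 中,而通过日志接口打印的内容会出现在 StdErr 中。

```scala
import org.slf4j.LoggerFactory

object LogviewOutputDemo {
  private val log = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    // 输出到标准输出,最终可在 Logview 对应节点的 StdOut 中看到
    println("result rows: 42")

    // 通过日志接口打印,最终可在 Logview 对应节点的 StdErr 中看到
    log.info("this message goes to StdErr")
  }
}
```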

使用Spark-Web-UI诊断作业

61 | 62 | ## Spark UI和History Server 63 | 用户可以在logivew的summary模块下找到Spark UI链接和History Server链接: 64 | ![image6](../../resources/sparkui.png) 65 | 66 | * 注意 67 | ``` 68 | 1. Spark UI仅在作业运行时才能打开 69 | 70 | 2. History Server需要等待Driver把Event传递到HistoryServer进行渲染,会有一定延迟 71 | ``` 72 | * 打开该链接,可以在浏览器看到 Spark-Web-UI 73 | ![image7](../../resources/jobview-1.jpg) 74 | 75 | * 点击environment tab确认设置的spark参数是否全部正确 76 | ![image8](../../resources/jobview-2.jpg) 77 | 78 | * 点击executors tab重点关注是否有`Dead节点`,以及Driver的StdOut和StdErr 79 | ![image9](../../resources/jobview-3.jpg) 80 | 81 | * 点击StdOut按钮可以看到节点结果输出 82 | ![image10](../../resources/jobview-4.jpg) 83 | 84 | * 点击StdErr按钮可以看到节点log4j日志 85 | ![image11](../../resources/jobview-5.jpg) 86 | 87 |

寻求开发介入帮助

88 | 89 | * 先根据[常见问题](https://github.com/aliyun/MaxCompute-Spark/wiki/06.-Spark%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98)文档做一下初步排查 90 | * 提供logview和jobview,一般logview是一定有的,jobview如果提交的时候马上报错并不会产生 91 | * 钉钉扫码加入MaxCompute Spark开发群支持 -------------------------------------------------------------------------------- /docs/docs/development/ops/logging-guide.md: -------------------------------------------------------------------------------- 1 | # MaxCompute Spark日志使用指南 2 | ## 背景 3 | Spark集群运行环境中日志使用Log4j2接口,因此原先使用了Log4j1相关接口的代码需要一定的修改。 4 | 5 | ## 适用情况 6 | - 有需要在代码中打入自己的日志,同时与Spark产生的系统日志进行区分的。 7 | - 原代码中涉及显式调用Log4j1接口进行日志配置的 8 | 9 | ## 升级步骤 10 | 参考:[log4j2 example](https://github.com/aliyun/MaxCompute-Spark/tree/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/log4j2) 11 | 具体来说: 12 | 1. 引入log4j2的包 13 | ```xml 14 | 15 | org.apache.logging.log4j 16 | log4j-core 17 | 2.12.1 18 | provided 19 | 20 | ``` 21 | 保持scope为provided和version为2.12.1,因为集群环境中有此依赖,避免出现意外的类/方法冲突问题。 22 | 23 | 2. 参考[示例](https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/java/com/aliyun/odps/spark/examples/utils/ConfigLog4j2.java)中使用log4j2的接口进行日志配置,即向集群运行环境中的log4j2配置中加入自定义的Appender和LoggerConfig 24 | 25 | 3. 使用之前调用配置方法,如下 26 | ```java 27 | ConfigLog4j2.initPackageLogger("your_package_name") 28 | ``` 29 | 即可在需要的地方使用,如下 30 | ```scala 31 | val log: log4j.Logger = LogManager.getLogger(your_class) 32 | ``` 33 | 34 | ## 效果展示 35 | 运行示例中的[SimpleWordCount](https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/log4j2/SimpleWordCount.scala),自己在代码中打的日志收在logview->detail->master->stdout中,如下 36 | ![stdout](../../resources/log4j2-stdout.jpg) 37 | 38 | logview->detail->master->stderr中仍是spark的系统日志,如下 39 | ![stderr](../../resources/log4j2-stderr.jpg) 40 | 41 | ## FAQ 42 | 1. 此示例需要开发者自行集成到代码中,涉及到的日志pattern等可自行修改。 43 | 2. 
initPackageLogger方法中传入的包名,也就是用户代码所处的包名,需要避开以下: 44 | > * com.aliyun.odps 45 | > * com.aliyun.odps.fs 46 | > * org.apache.hadoop 47 | > * org.apache.kafka 48 | > * org.apache.zookeeper 49 | > * org.apache.spark 50 | > * org.apache.flink 51 | > * akka 52 | > * com.aliyun.odps.subprocess -------------------------------------------------------------------------------- /docs/docs/development/ops/streaming-monitoring.md: -------------------------------------------------------------------------------- 1 | # Streaming作业监控报警 2 | Spark Streaming作业的特点是长时间运行,对于数据处理速度,调度延迟等有着较高的要求。因而作业在生产环境运行时,需要关注一些作业的性能、延迟等相关的指标。 3 | 4 | 目前,MaxCompute的Spark Streaming作业的监控报警提供了一个对接云监控平台的插件,可以将作业关键的指标信息推送至[云监控平台](https://www.aliyun.com/product/jiankong),进而可以进行指标查看以及配置监控报警信息,现在支持以下5种类型的监控报警。 5 | 6 | * processingDelay :处理延迟 7 | * schedulingDelay :调度延迟 8 | * totalDelay:总延迟 9 | * totalProcessedRecords:总共处理的记录条数 10 | * waitingBatches:等待执行的Batch数 11 | > 下面将对如何使用Spark Streaming的云监控插件做一个介绍。 12 | 13 | (1)spark-cloudmonitor-sink这个插件是基于[Spark Metrics System](https://spark.apache.org/docs/latest/monitoring.html)的接口开发的一个外置插件。该插件的引入不影响Spark Streaming作业的开发,这里假定我们已经开发调试好了一个Spark Streaming作业,提交方式如下: 14 | ``` 15 | bin/spark-submit --class com.aliyun.odps.spark.examples.streaming.LogHubStreamingDemo --master yarn-cluster --num-executors 1 --driver-memory 4g --executor-memory 4g --executor-cores 1 spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 16 | ``` 17 | 18 | (2) 使用spark-cloudmonitor-sink云监控插件,首先需要下载插件的jar包。 19 | ``` 20 | wget http://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark%2Fspark-cloudmonitor-sink-1.0-SNAPSHOT-shaded.jar -O spark-cloudmonitor-sink-1.0-SNAPSHOT-shaded.jar 21 | ``` 22 | 23 | (3)配置云监控相关的账号信息,需要确保云监控平台已经创建好了一个应用分组。配置文件放置在conf/metrics.properties,配置信息如下 24 | ``` 25 | *.sink.cloudmonitor.period=5 26 | 27 | *.sink.cloudmonitor.class=org.apache.spark.metrics.sink.CloudMonitorSink 28 | *.sink.cloudmonitor.endpoint=http://xxxxxx 29 | *.sink.cloudmonitor.accessid=xxxxxx 30 | *.sink.cloudmonitor.accesskey=xxxxxx 31 | *.sink.cloudmonitor.groupid=xxxxxx 32 | ``` 33 | 其中sink.cloudmonitor.endpoint,sink.cloudmonitor.accessid,sink.cloudmonitor.accesskey,sink.cloudmonitor.groupid分别是云监控的endpoint,accessid,accesskey以及metrics需要推送到的应用分组id。 34 | 另外建议在conf/spark-defaults.conf里面增加spark.metrics.namespace: xxxxx 指定为一个有意义的名字标识,否则默认是ApplicationID, 每次作业提交都会不一样。 35 | 36 | (4)带上云监控插件提交Spark Streaming作业,提交作业的命令如下: 37 | ``` 38 | bin/spark-submit --class com.aliyun.odps.spark.examples.streaming.LogHubStreamingDemo --master yarn-cluster --num-executors 1 --driver-memory 4g --executor-memory 4g --executor-cores 1 --jars spark-cloudmonitor-sink-0-SNAPSHOT-shaded.jar --files conf/metrics.properties spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 39 | ``` 40 | 和原来不带云监控插件的提交命令对比,可以看到只要新增插件的jar包和conf/metrics.properties配置即可。 41 | 42 | (5)在云监控平台查看指标数据,以及配置报警。 43 | 当作业正常跑起来后,作业的指标数据会不断推送到云监控平台,在云监控的控制台,在自定义监控下面可以看到相应的Spark Streaming作业的监控数据,并且可以针对其中的指标添加相关的报警规则。 44 | ![image1](../../resources/cloudmonitor-1.png) 45 | 46 | ![image2](../../resources/cloudmonitor-2.png) 47 | 48 | ![image3](../../resources/cloudmonitor-3.png) -------------------------------------------------------------------------------- /docs/docs/development/pyspark/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "pyspark", 3 | "position": 3, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/development/pyspark/pyspark-oss.md: 
-------------------------------------------------------------------------------- 1 | # PySpark 访问 Oss 2 | ## 参数配置 3 | - 首先需要参考[文档](https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E)配置ossid和key: 4 | ``` 5 | spark.hadoop.fs.oss.accessKeyId = xxxxxx 6 | spark.hadoop.fs.oss.accessKeySecret = xxxxxx 7 | spark.hadoop.fs.oss.endpoint = oss-xxxxxx-internal.aliyuncs.com 8 | ``` 9 | 10 | - 配置Hadoop实现类(二选一即可) 11 | ``` 12 | ### 使用jindo sdk(推荐方式,性能更优) 13 | spark.hadoop.fs.AbstractFileSystem.oss.impl=com.aliyun.emr.fs.oss.OSS 14 | spark.hadoop.fs.oss.impl=com.aliyun.emr.fs.oss.JindoOssFileSystem 15 | 16 | ### 使用hadoop-fs-oss 17 | spark.hadoop.fs.oss.impl=org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem 18 | ``` 19 | 20 | - 【公共云】需要引用hadoop oss依赖,添加以下配置(二选一即可): 21 | ``` 22 | ### 使用jindo sdk(推荐方式,性能更优) 23 | spark.hadoop.odps.cupid.resources=public.jindofs-sdk-3.7.2.jar 24 | 25 | ### 使用hadoop-fs-oss 26 | spark.hadoop.odps.cupid.resources=public.hadoop-fs-oss-shaded.jar 27 | ``` 28 | 29 | - 【专有云】需要引用hadoop-fs-oss.jar包,需要按照以下步骤上传资源并添加配置: 30 | ``` 31 | (1)下载hadoop-fs-oss.jar包,下载地址(https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/hadoop-fs-oss-shaded.jar) 32 | (2)将jar包上传为MaxCompute资源,参考文档(https://help.aliyun.com/document_detail/27831.html?spm=a2c4g.27797.0.i1#section-533-s8q-d9w) 33 | (3)添加参数:spark.hadoop.odps.cupid.resources=.hadoop-fs-oss-shaded.jar 34 | ``` 35 | 36 | - 需要注意:如果已经配置过spark.hadoop.odps.cupid.resources这个参数,则引用多个资源需要用逗号隔开,参考[文档](https://github.com/aliyun/MaxCompute-Spark/wiki/03.-Spark%E9%85%8D%E7%BD%AE%E8%AF%A6%E8%A7%A3#maxcompute%E6%95%B0%E6%8D%AE%E4%BA%92%E9%80%9A%E9%85%8D%E7%BD%AE) 37 | 38 | ## 例子1:判断oss文件是否存在 39 | ``` 40 | from pyspark.sql import SparkSession 41 | 42 | spark = SparkSession.builder.appName('testoss').getOrCreate() 43 | sc = spark.sparkContext 44 | conf = sc._jsc.hadoopConfiguration() 45 | conf.set("fs.oss.accessKeyId", "xxxx") 46 | conf.set("fs.oss.accessKeySecret", "xxx") 47 | conf.set("fs.oss.endpoint", "xxxx") 48 | conf.set("fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem") 49 | 50 | path = sc._jvm.org.apache.hadoop.fs.Path("oss://xxxxx") 51 | fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(path.toUri(), conf) 52 | exist = fs.exists(path) 53 | ``` 54 | 55 | 56 | ## 例子2:写oss 57 | ``` 58 | spark = SparkSession.builder.appName('testoss').getOrCreate() 59 | data = [i for i in range(0, 100)] 60 | df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int") 61 | df.show(n=10) 62 | ## write to oss 63 | pathout = 'oss://[替换为实际Bucket]/test.csv' 64 | df.write.csv(pathout) 65 | ``` 66 | -------------------------------------------------------------------------------- /docs/docs/development/pyspark/pyspark-thirdparty.md: -------------------------------------------------------------------------------- 1 | # PySpark 使用mmlspark和analytics zoo 2 | ## 使用mmlspark 3 | #### 背景 4 | - mmlspark开源库地址:https://github.com/microsoft/SynapseML 5 | - 由于MaxCompute Spark访问外部网络有限制,因此提供以下方案在MaxCompute Spark中使用mmlspark 6 | 7 | #### 使用方式 8 | - 第一步:下载Jar包:首先需要在本地客户端下载mmlspark的所有jar包 9 | ``` 10 | 1. 在本地下载一个spark客户端 11 | 12 | 2. 配置spark-defaults.conf,添加以下参数 13 | spark.jars.repositories=https://mmlspark.azureedge.net/maven 14 | 15 | 3. 使用local模式在本地执行以下命令: 16 | $SPARK_HOME/bin/pyspark --packages com.microsoft.ml.spark:mmlspark_2.11:1.0.0-rc1 17 | 18 | 4. jar包通常会下载到以下目录: 19 | $HOME/.ivy2/jars 20 | 21 | 5. 将所有的jar包压缩为一个zip包: 22 | cd $HOME/.ivy2/jars 23 | zip -r mml_spark.zip . 
24 | ``` 25 | 26 | - 第二步:修改spark-defaults.conf 27 | ``` 28 | spark.executor.extraClassPath=./mml_spark.zip/* 29 | spark.driver.extraClassPath=./mml_spark.zip/* 30 | ``` 31 | 32 | - 第三步:使用Yarn-cluster模式提交任务到集群中,注意需要包含 --py-files 33 | ``` 34 | ./bin/spark-submit --archives mml_spark.zip --py-files mml_spark/com.microsoft.ml.spark_mmlspark_2.11-1.0.0-rc1.jar,mml_spark/com.microsoft.ml.lightgbm_lightgbmlib-2.3.100.jar spark_mml.py 35 | ``` 36 | 37 | ## 使用analytics-zoo 38 | #### 相关开源库 39 | - https://github.com/intel-analytics/analytics-zoo 40 | - https://github.com/intel-analytics/BigDL 41 | - https://analytics-zoo.github.io/master/#release-download/ 42 | 43 | #### 参考使用方式 44 | - 注意:下文使用analytics-zoo 0.11.0版本 45 | 46 | - 第一步:Python打包 47 | ``` 48 | 1. ./bin/pip3 install analytics-zoo -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com 49 | 50 | 2. 注意:Python包比较大,可以卸载Pyspark(Maxcompute Spark中已包含PySpark) 51 | ./bin/pip3 uninstall pyspark 52 | 53 | 3. 打包为压缩包,下文使用该名称:python-3.6.14-big-dl.tar.gz 54 | 55 | ``` 56 | 57 | - 第二步:将Python包上传到Maxcompute resource中 58 | 59 | - 第三步:将需要的三个Jar包拷贝出来,路径在 60 | ``` 61 | $python_home/lib/python3.6/site-packages/zoo/share/lib 62 | ○ analytics-zoo-bigdl_0.13.0-spark_2.4.6-0.11.1-jar-with-dependencies.jar 63 | 64 | $python_home/lib/python3.6/site-packages/bigdl/share/lib 65 | ○ bigdl-0.13.0-jar-with-dependencies.jar 66 | ○ bigdl-0.13.0-python-api.zip 67 | ``` 68 | 69 | - 第四步:BigDL重新打包(解决log4j类冲突) 70 | 71 | ``` 72 | 1. 首先找到对应BigDL的版本:如0.11.0对应BigDL的版本是0.13.0 73 | 74 | 2. 下载BigDL源码 75 | 76 | git clone https://github.com/intel-analytics/BigDL.git 77 | 78 | 3. 切换到0.13分支 79 | 80 | git checkout branch-0.13 81 | 82 | 4. 编译打包 83 | cd BigDL/spark/dl/ 84 | mvn clean package -DskipTests; 85 | 86 | 5. 替换Jar包 87 | 用target目录下生成的bigdl-0.13.1-SNAPSHOT-jar-with-dependencies.jar文件来替换第三步中的bigdl-0.13.0-jar-with-dependencies.jar 88 | 89 | ``` 90 | 91 | - 第五步:在spark-defaults.conf中配置Python包 92 | ``` 93 | spark.hadoop.odps.cupid.resources = [projectname].python-3.6.14-big-dl.tar.gz 94 | spark.pyspark.python = ./[projectname].python-3.6.14-big-dl.tar.gz/python-3.6.14-big-dl/bin/python3 95 | ``` 96 | 97 | - 第六步:提交任务,需要携带第三步和第四步中生成的jar包: 98 | ``` 99 | ./bin/spark-submit --jars analytics-zoo-bigdl_0.13.0-spark_2.4.6-0.11.1-jar-with-dependencies.jar,bigdl-0.13.1-SNAPSHOT-jar-with-dependencies.jar,bigdl-0.13.0-python-api.zip spark_test.py 100 | ``` 101 | 102 | -------------------------------------------------------------------------------- /docs/docs/development/spark-streaming/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Spark Streaming", 3 | "position": 4, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/development/spark-streaming/streaming-datahub.md: -------------------------------------------------------------------------------- 1 | # Streaming读写DataHub 2 | MaxCompute支持Spark Streaming(DStream)和Spark Structured Streaming,本文介绍Streaming作业流式接收DataHub数据源的示例。 3 | 4 | ## DataHub 数据源 5 | 6 | 首先, 得在阿里云DataHub拥有数据源,[DataHub控制台传送门](https://datahub.console.aliyun.com/datahub) 7 | 8 | * 获取projectName 9 | ![image1](../../resources/datahub-1.jpg) 10 | 11 | * 获取topic 12 | ![image2](../../resources/datahub-2.jpg) 13 | 14 | * 获取subId 15 | 16 | 注意,每一个Streaming程序只能对应一个subId,如果有多个程序要读同一个topic,那么需要多个订阅 17 | 18 | ![image3](../../resources/datahub-3.jpg) 19 | 20 | * 获取endPoint 21 | 22 | 
每个region的endPoint都是不一样的,参考[如何配置EndPoint](https://help.aliyun.com/document_detail/47442.html?spm=5176.11065259.1996646101.searchclickresult.4a6e46e8r26UYT) 23 | 24 | ## Spark Streaming(DStream) 25 | 26 | ``` 27 | 28 | 29 | com.aliyun.emr 30 | emr-datahub_${scala.binary.version} 31 | 1.6.0 32 | 33 | 34 | 35 | com.aliyun.datahub 36 | aliyun-sdk-datahub 37 | 2.9.4-public 38 | 39 | 40 | ``` 41 | 42 | [Streaming Access DataHub样例代码](https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/datahub/DataHubStreamingDemo.scala) 43 | 44 | ## 配置详解 45 | 46 | ``` 47 | val dataStream = DatahubUtils.createStream( 48 | ssc, 49 | "projectName", 50 | "topic", 51 | "subId", 52 | "accessId", // 云账号accessId 53 | "accessKey", // 云账号accessKey 54 | "endPoint", 55 | transferFunc(_), // 见Demo注释 56 | StorageLevel.MEMORY_AND_DISK 57 | ) 58 | ``` 59 | 60 | ## DataHub回流到MaxCompute 61 | 利用DStream+Dataframe可以把DataHub数据回流到MaxCompute 62 | 63 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/datahub/DataHub2OdpsDemo.scala 64 | 65 | 66 | ## Spark Structured Streaming 67 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/datahub/DatahubStructuredStreamingDemo.scala 68 | 69 | source的示例如下(请参考代码): 70 | ``` 71 | val stream = spark.readStream 72 | .format("datahub") 73 | .option("datahub.endpoint", "http://....") 74 | .option("datahub.project", "project") 75 | .option("datahub.topic", "topic1") 76 | .option("datahub.accessId", "accessId") 77 | .option("datahub.accessKey", "accessKey") 78 | .option("datahub.startingoffsets", "latest") 79 | .option("datahub.maxoffsetsperTrigger", 20000) // optional 80 | .load() 81 | ``` 82 | 83 | sink的示例如下: 84 | ``` 85 | val query = spark.writeStream 86 | .format("datahub") 87 | .option("datahub.endpoint", "http://....") 88 | .option("datahub.project", "project") 89 | .option("datahub.topic", "topic1") 90 | .option("datahub.accessId", "accessId") 91 | .option("datahub.accessKey", "accessKey") 92 | .load() 93 | ``` 94 | 95 | 其中datahub.endpoint请使用**经典网络ECS Endpoint**,各region对应的endpoint参考[此文](https://help.aliyun.com/document_detail/47442.html#h2-datahub-1)。此外,需要将endpoint配置在VPC访问配置中,参考[VPC访问](https://github.com/aliyun/MaxCompute-Spark/wiki/09.-VPC-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E)。示例如下: 96 | ``` 97 | { 98 | "regionId":"cn-beijing", 99 | "vpcs":[ 100 | { 101 | "zones":[ 102 | { 103 | "urls":[ 104 | { 105 | "domain":"dh-cn-beijing.aliyun-inc.com", 106 | "port":80 107 | } 108 | ] 109 | } 110 | ] 111 | } 112 | ] 113 | } 114 | ``` 115 | 116 | **注意:** 目前所给的这个Demo,没有启用checkpoint,checkpoint需要使用oss作为checkpoint的存储,另外Spark Streaming作业处于试用阶段,**作业最长运行时间不能超过3天,如果需要投入长时间正式运行使用,请联系我们开通相关权限。** 117 | -------------------------------------------------------------------------------- /docs/docs/development/spark-streaming/streaming-kafka.md: -------------------------------------------------------------------------------- 1 | # Streaming读写kafka 2 | MaxCompute支持Spark Streaming和Spark Structured Streaming,本文介绍Streaming作业流式读写kafka的示例。 3 | 4 | ## Spark Streaming(DStream) 5 | 该示例是基于一个Kafka的Receiver,适用于DStream的接口。 6 | 7 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/kafka/KafkaStreamingDemo.scala 8 | 9 | ## Kafka回流到MaxCompute 10 | 通过DStreaming+Dataframe把Kafka数据导入MaxCompute 11 | 12 | > 
详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/kafka/Kafka2OdpsDemo.scala 13 | 14 | ## Spark Structured Streaming 15 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/kafka/KafkaStructuredStreamingDemo.scala 16 | 17 | source的示例如下(请参考代码): 18 | ``` 19 | val df = spark 20 | .readStream 21 | .format("kafka") 22 | .option("kafka.bootstrap.servers", "192.168.72.224:9202,192.168.72.225:9202,192.168.72.226:9202") 23 | .option("subscribe", "zkytest") 24 | .load() 25 | ``` 26 | 27 | 由于kafka在VPC内,需要将endpoint配置在VPC访问配置中,参考[VPC访问](https://github.com/aliyun/MaxCompute-Spark/wiki/09.-VPC-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E)。示例如下: 28 | ``` 29 | { 30 | "regionId":"cn-beijing", 31 | "vpcs":[ 32 | { 33 | "vpcId":"vpc-2zeaeq21mb1dmkqh0exox" 34 | "zones":[ 35 | { 36 | "urls":[ 37 | { 38 | "domain":"192.168.72.224", 39 | "port":9202 40 | }, 41 | { 42 | "domain":"192.168.72.225", 43 | "port":9202 44 | }, 45 | { 46 | "domain":"192.168.72.226", 47 | "port":9202 48 | } 49 | ] 50 | } 51 | ] 52 | } 53 | ] 54 | } 55 | ``` 56 | 57 | **注意:** 目前所给的这个Demo,没有启用checkpoint,checkpoint需要使用oss作为checkpoint的存储,另外Spark Streaming作业处于试用阶段,**作业最长运行时间不能超过3天,如果需要投入长时间正式运行使用,请联系我们开通相关权限。** 58 | -------------------------------------------------------------------------------- /docs/docs/development/spark-streaming/streaming-loghub.md: -------------------------------------------------------------------------------- 1 | # Streaming读写LogHub 2 | MaxCompute支持Spark Streaming和Spark Structured Streaming,本文介绍Streaming作业流式接收LogHub(日志服务的一个组件,日志服务详见[官方文档](https://www.aliyun.com/product/sls))的示例。 3 | 4 | ## Spark Streaming(DStream) 5 | 该示例是基于一个LogHub的Receiver(类似基于Spark之上接收Kafka流的Receiver),适用于DStream的接口。 6 | 7 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/loghub/LogHubStreamingDemo.scala 8 | 9 | 运行这个Demo,需要在Spark的配置中给出LogHub的如下几个配置: 10 | 11 | ``` 12 | spark.logservice.accessKeyId : loghub的accessId 13 | spark.logservice.accessKeySecret : loghub的accessKey 14 | spark.logservice.endpoint : loghub的endpoint,需要根据project所在的region进行选择 15 | spark.logservice.project : 需要读取的loghub的project名字 16 | spark.logservice.logstore : 需要读取的logstore的名字 17 | ``` 18 | 另外StreamingParam#setCursor(LogHubCursorPosition.END_CURSOR) 和 StreamingParam#setGroup("test") 这俩配置的含义可以参考[LogHub官方文档的介绍](https://help.aliyun.com/document_detail/28998.html?spm=a2c4g.11186623.6.877.2ea24bbcd6eDg5)。 19 | 20 | ## LogHub回流到MaxCompute 21 | 利用DStream+Dataframe可以把LogHub数据回流到MaxCompute。 22 | 23 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/loghub/LogHub2OdpsDemo.scala 24 | 25 | ## Spark Structured Streaming 26 | > 详细代码请参考:https://github.com/aliyun/MaxCompute-Spark/blob/master/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/loghub/LoghubStructuredStreamingDemo.scala 27 | 28 | source的示例如下(请参考代码): 29 | ``` 30 | val stream = spark.readStream 31 | .format("loghub") 32 | .option("loghub.endpoint", "http://....") 33 | .option("loghub.project", "project") 34 | .option("loghub.logstores", "store1,store2") 35 | .option("loghub.accessId", "accessId") 36 | .option("loghub.accessKey", "accessKey") 37 | .option("loghub.startingoffsets", "latest") 38 | .load() 39 | ``` 40 | 41 | sink的示例如下: 42 | ``` 43 | val query = spark.writeStream 44 | 
.format("loghub") 45 | .option("loghub.endpoint", "http://....") 46 | .option("loghub.project", "project") 47 | .option("loghub.logstores", "store1,store2") 48 | .option("loghub.accessId", "accessId") 49 | .option("loghub.accessKey", "accessKey") 50 | .start() 51 | ``` 52 | 53 | 其中loghub.endpoint请使用**经典/VPC网络服务入口**,各region对应的endpoint参考[此文](https://help.aliyun.com/document_detail/29008.html#h2-url-2)。此外,需要将endpoint配置在VPC访问配置中,参考[VPC访问](https://github.com/aliyun/MaxCompute-Spark/wiki/09.-VPC-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E)。示例如下: 54 | ``` 55 | { 56 | "regionId":"cn-beijing", 57 | "vpcs":[ 58 | { 59 | "zones":[ 60 | { 61 | "urls":[ 62 | { 63 | "domain":"cn-beijing-intranet.log.aliyuncs.com", 64 | "port":80 65 | } 66 | ] 67 | } 68 | ] 69 | } 70 | ] 71 | } 72 | ``` 73 | 74 | **注意:** 目前所给的这个Demo,没有启用checkpoint,checkpoint需要使用oss作为checkpoint的存储,另外Spark Streaming作业处于试用阶段,**作业最长运行时间不能超过3天,如果需要投入长时间正式运行使用,请联系我们开通相关权限。** 75 | -------------------------------------------------------------------------------- /docs/docs/faq/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "常见问题", 3 | "position": 6, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/faq/allocate-resource.md: -------------------------------------------------------------------------------- 1 | # 资源申请问题 2 |

# 资源申请

3 | 通常用户在提交Spark作业时需要关注以下几种资源: 4 | 5 | * **Executor 数量** 6 | * **Executor 内存** 7 | * **Executor core** 8 | * **Driver 内存** 9 | * **Driver core** 10 | * **本地网盘** 11 | 12 | ## Executor 相关参数 13 | * spark.executor.instances 14 | + 总共申请的executor数目,普通任务十几个或者几十个足够了,若是处理大量数据时可以申请多一些,100—2000+ 15 | * spark.executor.cores 16 | + 每个executor的核数,即每个executor中的可同时运行的task数目 17 | + Spark任务的最大并行度是executor数目*executor core数 18 | * spark.executor.memory 19 | + 代表申请executor的堆内内存,也就是启动jvm进程时设置的Xmx参数 20 | * spark.executor.memoryOverhead 21 | + 申请executor的堆外内存,默认单位是MB,主要用于JVM自身,字符串, NIO Buffer等开销 22 | + 默认为executor Memory*0.1,最小384M 23 | + 如果遇到Cannot allocate memory,通常是堆外内存不足,可以考虑适当增大spark.executor.memoryOverhead 24 | + 注意:单个Executor的内存总量是spark.executor.memory+spark.executor.memoryOverhead 25 | 26 | ## Driver 相关参数 27 | * spark.driver.cores 28 | * spark.driver.memory 29 | * spark.yarn.driver.memoryOverhead 30 | * spark.driver.maxResultSize 31 | + 默认1g,控制worker送回driver的数据大小,一旦超出该限制,driver会终止执行 32 | ## 本地网盘参数 33 | * spark.hadoop.odps.cupid.disk.driver.device_size 34 | + 代表本地网盘大小,默认值为20g 35 | + Spark使用网盘作为本地存储,Driver和每个Executor都有一个,Shuffle数据以及BlockManager溢出的数据均存储在网盘上 36 | + 当出现**No space left on device**时可适当调大该值,最大支持100g。如果调整到100g仍然会出现此错误,需要分析具体原因,可能是:1. 数据倾斜,在shuffle或者cache过程中数据集中分布在某些block;2. 可以缩小单个executor的并发(spark.executor.cores) 3. 增加executor的数量(spark.executor.instances) 37 | + **注意** `必须配置在spark-conf文件或者dataworks的配置项中,不能配置在代码中` 38 | 39 |
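下面给出一份示意性的资源配置片段(假设写在spark-defaults.conf或DataWorks配置项中;各参数取值仅为示例,并非推荐值,请结合上文各参数说明和作业实际数据量调整):
```
## 以下取值仅为示例,请按需调整
spark.executor.instances = 20
spark.executor.cores = 2
spark.executor.memory = 8g
## 堆外内存,默认单位为MB
spark.executor.memoryOverhead = 1024
spark.driver.cores = 2
spark.driver.memory = 8g
## 本地网盘大小,只能配置在spark-conf文件或DataWorks配置项中
spark.hadoop.odps.cupid.disk.driver.device_size = 50g
```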

# 如何合理设置资源参数

40 | 41 | * 建议按照内存/CPU 1:4来申请资源,即1 core对应4GB内存,建议单个worker core数不要超过8 42 | 43 | * 用户可以通过查看logview中Master或Worker的Sensor来获取运行中的内存和CPU使用情况 44 | ![sensor](../resources/fuxisensor.png) 45 | 46 | * 通常需要关注mem_rss,代表了executor或driver在实际使用时的内存变化曲线,用户可以根据该值变化来判断是否需要增加/减少内存 47 | ![sensor](../resources/fuxisensor2.png) 48 | 49 | 50 |

# 资源等待

51 | 52 | ## 注意事项 53 | * 用户在集群模式下必须配置spark.master=yarn-cluster才会正确的申请资源(注意local模式调试完之后要将代码中的spark.master=local配置去掉) 54 | 55 | ## 如何等待资源申请到之后提交Job 56 | * 申请资源是一个持续不断的过程,因此可能会出现拿到的资源没有达到用户请求的数量,而spark是不会等到所有的Executor都获取到之后再开始执行任务,可以通过以下参数来控制Spark提交任务的时机 57 | + spark.scheduler.maxRegisteredResourcesWaitingTime:在执行前最大等待申请资源的时间,默认30s。 58 | + spark.scheduler.minRegisteredResourcesRatio:实际注册的资源数占预期需要的资源数的比例,默认0.8 59 | 60 | ## 申请不到资源的可能原因: 61 | * 如果是预付费用户,一般是用户申请的资源超出了购买的资源数量,可以登陆管家进行确认 62 | * 如果是后付费用户,需要抢占资源 63 | 64 | 65 | ## 申请不到资源解决方案 66 | * 调整任务资源:调整用户申请的Executor总数或者单个Executor的资源数量(一般是内存) 67 | * 合理安排任务执行时间 68 | 69 | ## 没有申请不到资源的几种现象 70 | 71 | * 在driver端一般会打以下日志: 72 | WARN YarnClusterScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources 73 | ![](../resources/资源申请1.png) 74 | 75 | * 在Logview中只能看到driver,而worker数量为0 76 | * 在Spark ui中只能看到driver,而worker数量为0 77 | ![](../resources/资源申请2.png) -------------------------------------------------------------------------------- /docs/docs/faq/class-conflict.md: -------------------------------------------------------------------------------- 1 | # Java Scala类冲突问题 2 | 3 | ## 类冲突问题概述 4 | * 这类报错一般会抛出异常java.lang.NoClassDefFoundError或者方法找不到等问题,需要检查pom并将冲突的依赖排除掉 5 | * 原因在于:用户jar包中很可能打了一些依赖进去,这些依赖的jar包与spark jars目录下的jar包由于版本不一致,jvm在加载类的时候优先加载了用户的jar包 6 | 7 | ## 需要注意的问题 8 | 9 | ### 依赖为provided和compile的区别 10 | 11 | * provided:代码依赖该jar包,但只在编译的时候需要用,而运行时不需要,运行时会去集群中去寻找的相应的jar包,很多时候把jar包的类型设置为provided类型,就是因为这些jar包已经在集群中提供了(主要是spark客户端的jars目录,该目录中包含的jar包通常应该设置为provided),如果不设置为provided,某些时候可能可以正常运行,某些时候就会发生类冲突,类/方法找不到等各种问题。 12 | * compile:代码依赖该jar包,在编译、运行时候都需要,也就是说集群中不存在这些jar包,需要用户打到自己的jar包中。这种类型的jar包一般是一些三方库,且与spark运行无关,与用户代码逻辑有关。 13 | 14 | ### 主jar包必须是一个fat jar 15 | * 必须要把compile类型的依赖都打到用户jar包中,这样在代码运行时才能加载到这些依赖类 16 | 17 | 18 | ## POM自检 19 | 20 | ### 需要设置为provided的jar包 21 | * groupId为org.apache.spark的jar包 22 | + **说明** 这类jar包主要是社区版spark的jar包,已经在spark客户端的jars目录下提供,不需要打进用户的jar包,会在spark客户端提交任务时自动上传到MaxCompute集群中 23 | 24 | * cupid-sdk 25 | + **说明** 该jar包在任务提交时自动上传到MaxCompute集群中 26 | 27 | * odps-sdk 28 | + **说明** 该jar包在任务提交时自动上传到MaxCompute集群中 29 | 30 | * hadoop-yarn-client 31 | + **说明** 该jar包用于任务上传 32 | + **注意** 该jar包可能会被间接依赖,因此最好在打包之前检查并将该依赖排除 33 | 34 | ### 不能设置为provided的jar包 35 | * oss相关的jar包 36 | + **举例** hadoop-fs-oss 37 | + **说明** 该jar包属于第三方jar包,如果需要访问oss,需要打到用户jar包中 38 | 39 | * 流式相关的jar包 40 | + **举例** streaming-lib 41 | + **说明** 该jar包提供了一些spark streaming的接口来访问datahub和loghub,如果用户需要使用,则需要打到用户jar包中 42 | 43 | * 用户访问其他服务用到的jar包 44 | + **举例** 访问mysql等其他第三方服务需要用到的jar包 45 | 46 | 47 | -------------------------------------------------------------------------------- /docs/docs/faq/github-images.md: -------------------------------------------------------------------------------- 1 | # Github图片无法访问的问题 2 | 如果查看文档时发现图片无法显示,那么需要到 https://www.ipaddress.com/ 查看raw.githubusercontent.com的ipv4地址,然后使用管理员权限修改hosts文件,添加: 3 | 4 | [**ip地址**] raw.githubusercontent.com -------------------------------------------------------------------------------- /docs/docs/faq/network-access.md: -------------------------------------------------------------------------------- 1 | # 访问VPC和OSS的问题 2 |

# 访问OSS常见问题

3 | 4 | ## 线上作业time out 5 | + 用户在local模式通常需要采用公网的oss域名,提交到线上(yarn-cluster模式)运行时需要改成internal(内网)域名 6 | 7 | ## 依赖的问题 8 | + 示例中的hadoop-fs-oss必须以compile的方式打入用户主jar包 9 | + 如果用户使用PySpark,一定要把包含hadoop-fs-oss的fat jar包也上传到集群,否则会出现找不到OSS相关类的问题 10 | 11 |
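为便于对照,下面给出一个示意性的endpoint配置片段(以hadoop-fs-oss常用的fs.oss.endpoint配置项为例,region仅为假设,具体配置项名称与取值请以OSS访问相关文档为准):
```
## local模式本地调试:可使用公网endpoint(以下region仅为假设)
spark.hadoop.fs.oss.endpoint = oss-cn-hangzhou.aliyuncs.com
## 线上(yarn-cluster模式)运行:应改用internal内网endpoint
spark.hadoop.fs.oss.endpoint = oss-cn-hangzhou-internal.aliyuncs.com
```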

# 访问VPC常见问题

12 | 13 | ## 线上作业time out 14 | + vpc.domain.list 需要压缩成一行:建议通过 [网站](http://www.bejson.com/) 进行压缩,不要有空格 15 | + 如果不是使用ENI专线,则需在要访问的服务中添加ip白名单,允许100.104.0.0/16网段的访问 16 | + 如果是使用ENI专线,需要在要访问的服务中添加安全组白名单(开通ENI专线时使用的安全组) 17 | + smartnat只有北京和上海region可用,且必须设置为true 18 | + 用户要保证所有可能访问到的IP都已经加到vpc.domain.list,例如如果用户要访问位于hdfs,hbase这种多个节点的服务,一定要把所有的节点都添加进来,不然肯定会遇到time out的情况 19 | 20 | ## 访问公网 21 | 目前MaxCompute Spark运行在网络隔离环境中,如果需要访问公网,只能通过以下两种方式: 22 | 23 | * 中国公共云: 24 | + 提工单设置 project 级别白名单,如把 google.com:443 加到odps.security.outbound.internetlist 里面 25 | + 在Spark作业中配置公网访问白名单:spark.hadoop.odps.cupid.internet.access.list=google.com:443和spark.hadoop.odps.cupid.smartnat.enable=true 26 | 27 | * 开通专线 28 | + 提工单开通专线,配置专线参数spark.hadoop.odps.cupid.eni.info和spark.hadoop.odps.cupid.eni.enable=true 29 | + 在Spark作业中配置公网访问白名单:spark.hadoop.odps.cupid.internet.access.list=google.com:443 -------------------------------------------------------------------------------- /docs/docs/faq/oom-troubleshooting.md: -------------------------------------------------------------------------------- 1 | # 运行时OOM问题 2 |

# OOM的一些情况

3 | 4 | ## 如何查看worker以及master的内存使用情况 5 | Logview 2.0包含Sensor功能,可以查看master以及每个worker在运行时的内存和cpu使用情况 6 | 7 | 8 | ## Cannot allocate memory 9 | 10 | 1. 在某些Executor中出现Cannot allocate memory,一般是堆外内存不足,此时可以调整spark.yarn.executor.memoryOverhead参数 11 | 2. 在Driver中出现Cannot allocate memory,可以调整spark.yarn.driver.memoryOverhead参数 12 | ![image1](../resources/OOM1.png) 13 | 14 | ## java.lang.OutOfMemoryError: Java heap space 15 | 如果在Executor中出现该错误,通常是堆内内存不足,此时可以适当增大内存,或减少Executor core 16 | 17 | ## No route to host: workerd********* / Could not find CoarseGrainedScheduler 18 | 出现这类错误极有可能是某些Executor出现OOM 19 | 20 | 21 |
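以Cannot allocate memory为例,一种常见的处理方式是适当调大堆外内存,示意如下(取值仅为示例,需结合Sensor观察到的实际内存使用情况调整):
```
## Executor出现Cannot allocate memory时(示例值,单位MB)
spark.yarn.executor.memoryOverhead = 2048
## Driver出现Cannot allocate memory时(示例值,单位MB)
spark.yarn.driver.memoryOverhead = 2048
```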

# OOM解决方案

22 | 23 | 1. 限制executor 并行度,将cores 调小:多个同时运行的 Task 会共享一个Executor 的内存,使得单个 Task 可使用的内存减少,调小并行度能缓解内存压力 24 | 2. 增加单个Executor内存 25 | 3. 增加分区数量,减少每个executor负载 26 | 4. 考虑数据倾斜问题,因为数据倾斜导致某个 task 内存不足,其它 task 内存足够 -------------------------------------------------------------------------------- /docs/docs/faq/pyspark-faq.md: -------------------------------------------------------------------------------- 1 | # PySpark 常见问题 2 | ## Local 模式(Spark 2.4.5) 3 | 4 | - 新建一个odps.conf文件,包含以下odps参数: 5 | ``` 6 | odps.project.name=*** 7 | odps.access.id=*** 8 | odps.access.key=*** 9 | odps.end.point=*** 10 | ``` 11 | 12 | - 在PyCharm中添加以下环境变量: 13 | ``` 14 | SPARK_HOME=/path/to/spark_home 15 | PYTHONPATH=/path/to/spark_home/python 16 | ODPS_CONF_FILE=/path/to/odps.conf 17 | ``` 18 | 19 | - 在代码中添加以下配置: 20 | ``` 21 | spark = SparkSession.builder\ 22 | .appName("spark sql")\ 23 | .config("spark.eventLog.enabled", False)\ 24 | .getOrCreate() 25 | ``` 26 | 27 | - 直接运行pyspark作业即可 28 | 29 | ## Cluster 模式(Spark 2.4.5) 30 | 31 | #### 作业执行抛出异常:***.so: cannot open shared object file: No such file or directory 32 | 33 | 上述抛出的异常,提示用户作业在执行加载时缺少对应的依赖,具体解决步骤如下: 34 | ##### MaxCompute Spark客户端 35 | * 公网下载对应的依赖文件 36 | * 提交作业时通过参数 **--files /path/to/[lib名]** 将对应的依赖文件加载至driver与executor的工作目录内 37 | 38 | ##### Dataworks Spark节点 39 | * 公网下载对应的依赖文件 40 | * 通过DataWorks,添加对应的依赖资源,即,创建MaxCompute资源 41 | * 作业提交新增补充参数,spark.hadoop.odps.cupid.resources = public.python-2.7.13-ucs4.tar.gz,[project名].[resource名].so:[resource名].so, 42 | 43 | ###### 注意事项 44 | ``` 45 | 由于上传的依赖资源是以project名称为前缀,所以需要对上传的resource名称进行重命名为需要的依赖,即,去掉project名称的前缀,这样才可以正确加载依赖 46 | ``` 47 | -------------------------------------------------------------------------------- /docs/docs/faq/read-transactional-table.md: -------------------------------------------------------------------------------- 1 | # 读取ACID表问题 2 | 新版本Spark支持读取ACID表,需要添加以下参数切换到新版本 3 | 4 | ## Spark 2.3.0 5 |
 6 | spark.hadoop.odps.task.major.version = default
 7 | spark.hadoop.odps.cupid.resources = public.__spark_libs__2.3.0-odps0.34.0.zip
 8 | spark.driver.extraClassPath = ./public.__spark_libs__2.3.0-odps0.34.0.zip/* 
 9 | spark.executor.extraClassPath = ./public.__spark_libs__2.3.0-odps0.34.0.zip/*
10 | 
11 | 12 | ## Spark 2.4.5 13 |
14 | spark.hadoop.odps.task.major.version = default
15 | spark.hadoop.odps.cupid.resources = public.__spark_libs__2.4.5-odps0.34.0.zip
16 | spark.driver.extraClassPath = ./public.__spark_libs__2.4.5-odps0.34.0.zip/* 
17 | spark.executor.extraClassPath = ./public.__spark_libs__2.4.5-odps0.34.0.zip/*
18 | 
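如果不想修改spark-defaults.conf,也可以在提交作业时通过--conf传入上述参数,示意如下(以Spark 2.4.5为例,其中主类名与jar包路径均为占位符):
```
cd $SPARK_HOME
bin/spark-submit --master yarn-cluster \
  --conf spark.hadoop.odps.task.major.version=default \
  --conf spark.hadoop.odps.cupid.resources=public.__spark_libs__2.4.5-odps0.34.0.zip \
  --conf "spark.driver.extraClassPath=./public.__spark_libs__2.4.5-odps0.34.0.zip/*" \
  --conf "spark.executor.extraClassPath=./public.__spark_libs__2.4.5-odps0.34.0.zip/*" \
  --class com.example.YourMainClass \
  /path/to/your-application-shaded.jar
```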
-------------------------------------------------------------------------------- /docs/docs/faq/ref-external-file.md: -------------------------------------------------------------------------------- 1 | # 引用外部文件问题 2 | 需要引用到外部文件的场景 3 | + 用户作业需要读取一些配置文件 4 | + 用户作业需要额外的jar包/Python库 5 | 6 |

# 如何上传文件

7 | 8 | 上传文件有两种方式 9 | * 通过Spark参数上传文件 10 | * 通过MaxCompute Resource上传文件 11 | 12 | ## Spark参数 13 | MaxCompute Spark支持Spark社区版原生的--jars,--py-files等参数,可以在作业提交时通过这些参数将文件上传,这些文件在任务运行时会被上传到用户的工作目录下。 14 | 15 | 在不同的运行模式下上传文件: 16 | * 通过Spark客户端:直接使用spark-submit命令行参数 17 |
18 | **注意事项**
19 | * --jars选项,会将配置的jar包上传至Driver和Executor的当前工作目录,多个文件逗号分隔,这些jar包都会加入Driver和Executor的Classpath,Spark作业中直接"./your_jar_name"即可引用,与社区版Spark行为相同。
20 | * --files, --py-files选项,会将配置的 普通文件/python文件 上传至Driver和Executor的当前工作目录,多个文件逗号分隔,Spark作业中直接"./your_file_name"即可引用,与社区版Spark行为相同。
21 | * --archives选项,与社区版Spark行为略有不同,多个逗号分隔,配置方式为xxx#yyy,会将配置的归档文件(例如.zip)解压到Driver和Executor的当前工作目录的子目录中。举例:当配置为xx.zip#yy时,应以"./yy/xx/"引用到归档文件中的内容;当仅配置xx.zip时,应以"./xx.zip/xx/"引用到内容。若一定需要将归档内容直接解压到当前目录,即直接引用"./xxx/",请使用下面提到的spark.hadoop.odps.cupid.resources配置。
22 | 
23 | 24 | * 通过DataWorks添加任务需要的资源,参见[文档](https://github.com/aliyun/MaxCompute-Spark/wiki/02.-Spark-on-Dataworks) 25 | 26 | ## MaxCompute Resource 27 | MaxCompute Spark提供spark.hadoop.odps.cupid.resources参数,可以直接引用MaxCompute中的资源,这些资源在任务运行时会被上传到用户的工作目录下。 28 | 29 | 使用方式 30 | ``` 31 | 1. 通过MaxCompute客户端将文件上传(单个文件最大支持500MB) 32 | 2. 在Spark作业配置中添加spark.hadoop.odps.cupid.resources参数 33 | 格式为.,如果需要引用多个文件,需要用逗号隔开 34 | ``` 35 | ### spark.hadoop.odps.cupid.resources参数介绍 36 | + **配置说明** `该配置项指定了任务运行所需要的`[Maxcompute资源](https://help.aliyun.com/document_detail/27831.html?spm=5176.11065259.1996646101.searchclickresult.d55650ea0QU1qd&aly_as=45TiiTdO2) 37 | + **配置示例** spark.hadoop.odps.cupid.resources=public.python-python-2.7-ucs4.zip,public.myjar.jar 38 | + **使用说明** `指定的资源将被下载到driver和executor的当前工作目录,资源下载到工作目录后默认的名字是.` 39 | + **文件重命名** `在配置时通过.:进行重命名` 40 | + **重命名示例** spark.hadoop.odps.cupid.resources=public.myjar.jar:myjar.jar 41 | + **注意** `该配置项必须要配置在spark-default.conf中或dataworks的配置项中才能生效,而不能写在代码中` 42 | 43 | 44 |
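两种上传方式的用法示意如下(其中文件名、project名、主类名均为占位符,--archives的引用规则见上文注意事项):
```
## 方式一:通过spark-submit参数上传(示例)
cd $SPARK_HOME
bin/spark-submit --master yarn-cluster \
  --files /path/to/your.conf \
  --archives /path/to/xx.zip#yy \
  --class com.example.YourMainClass \
  /path/to/your-application-shaded.jar

## 方式二:通过MaxCompute Resource引用并重命名(需写在spark-defaults.conf或DataWorks配置项中,示例)
spark.hadoop.odps.cupid.resources = your_project.your.conf:your.conf
```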

# 如何在代码中引用文件

45 | 通过上述两种方式可以将文件上传到任务的当前工作目录,文件读取示例: 46 | 47 | ``` 48 | val targetFile = "文件名" 49 | val file = Source.fromFile(targetFile) 50 | for (line <- file.getLines) 51 | println(line) 52 | file.close 53 | ``` -------------------------------------------------------------------------------- /docs/docs/faq/spark-24-notes.md: -------------------------------------------------------------------------------- 1 | # Spark 2.4.5 使用注意事项 2 | ## 如何使用Spark 2.4.5提交作业 3 | * 直接使用Yarn-cluster模式在本地提交任务, 添加 spark.hadoop.odps.spark.libs.public.enable=true和spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0 这两个参数可以加速包上传速度 4 | 5 | * 或在Dataworks中配置参数 spark.hadoop.odps.spark.version=spark-2.4.5-odps0.34.0,注意,若Dataworks独享资源组尚未升级到Spark 2.4.5,用户可以采用公共资源组进行调度,或联系Dataworks平台官方人员进行升级 6 | 7 | ## Spark 2.4.5 使用变化 8 | * 如果使用Yarn-cluster模式在本地提交任务,需要新增环境变量 export HADOOP_CONF_DIR=$SPARK_HOME/conf 9 | 10 | * 如果使用local模式进行调试,需要在$SPARK_HOME/conf目录下新建odps.conf文件,并添加以下配置: 11 | ``` 12 | odps.project.name = 13 | odps.access.id = 14 | odps.access.key = 15 | odps.end.point = 16 | ``` 17 | 18 | ## Spark 2.4.5 参数配置变化 19 | 20 | * `spark.sql.catalogImplementation` 21 | + **配置值** `hive` 22 | * `spark.sql.sources.default` 23 | + **配置值** `hive` 24 | * `spark.sql.odps.columnarReaderBatchSize` 25 | + **默认值** `4096` 26 | + **配置说明** `向量化读每个batch包含的行数` 27 | * `spark.sql.odps.enableVectorizedReader` 28 | + **默认值** `true` 29 | + **配置说明** `开启向量化读` 30 | * `spark.sql.odps.enableVectorizedWriter` 31 | + **默认值** `true` 32 | + **配置说明** `开启向量化写` 33 | * `spark.sql.odps.split.size` 34 | + **默认值** `256m` 35 | + **配置说明** `该配置可以用来调节读Maxcompute表的并发度,默认每个分区为256MB` 36 | * `spark.hadoop.odps.cupid.vnet.capacity` 37 | + **默认值** `802` 38 | + **配置说明** `该配置用于设置最大的instance数量,建议配置值为spark.executor.instances + 2,否则可能会遇到create virtual net failed错误。该参数需要设置到spark-defaults.conf或Dataworks配置项中` 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /docs/docs/faq/spark-31-notes.md: -------------------------------------------------------------------------------- 1 | # Spark 3.1.1 使用注意事项 2 | ## 如何使用Spark 3.1.1提交作业 3 | * 直接使用Yarn-cluster模式在本地提交任务 4 | 5 | * 通过DataWorks平台选择Spark 3.x选项。若提交任务报错,则需要提单升级独享资源组版本。 6 | 7 | ## Spark 3.1.1 使用变化 8 | * 如果使用Yarn-cluster模式从本地提交任务,需要新增环境变量 export HADOOP_CONF_DIR=$SPARK_HOME/conf 9 | 10 | * 如果使用Yarn-cluster模式提交Pyspark作业,需要添加以下参数使用Python3 11 | ``` 12 | spark.hadoop.odps.cupid.resources = public.python-3.7.9-ucs4.tar.gz 13 | spark.pyspark.python = ./public.python-3.7.9-ucs4.tar.gz/python-3.7.9-ucs4/bin/python3 14 | ``` 15 | 16 | * 如果使用local模式进行调试,需要在类路径下新建odps.conf文件,并添加以下配置: 17 | ``` 18 | odps.project.name = 19 | odps.access.id = 20 | odps.access.key = 21 | odps.end.point = 22 | ``` 23 | 24 | * 如果使用local模式进行调试,需要添加spark.hadoop.fs.defaultFS = file:/// 25 | ``` 26 | val spark = SparkSession 27 | .builder() 28 | .config("spark.hadoop.fs.defaultFS", "file:///") 29 | .enableHiveSupport() 30 | .getOrCreate() 31 | ``` 32 | 33 | ## Spark 3.1.1 参数配置 34 | 35 | * `spark.sql.defaultCatalog` 36 | + **配置值** `odps` 37 | * `spark.sql.catalog.odps` 38 | + **配置值** `org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog` 39 | * `spark.sql.sources.partitionOverwriteMode` 40 | + **配置值** `dynamic` 41 | * `spark.sql.extensions` 42 | + **配置值** `org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions` 43 | * `spark.sql.catalog.odps.enableVectorizedReader` 44 | + **默认值** `true` 45 | + **配置说明** `开启向量化读` 46 | * `spark.sql.catalog.odps.enableVectorizedWriter` 47 | + **默认值** `true` 48 | + **配置说明** 
`开启向量化写` 49 | * `spark.sql.catalog.odps.splitSizeInMB` 50 | + **默认值** `256` 51 | + **配置说明** `该配置可以用来调节读Maxcompute表的并发度,默认每个分区为256MB` -------------------------------------------------------------------------------- /docs/docs/overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | --- 4 | 5 | # MaxCompute Spark概述 6 | MaxCompute Spark是MaxCompute提供的兼容开源的Spark计算服务。它在统一的计算资源和数据集权限体系之上,提供Spark计算框架,支持用户以熟悉的开发使用方式提交运行Spark作业,以满足更丰富的数据处理分析场景。 7 | 8 | ## 关键特性 9 | 10 | * 支持原生多版本Spark作业 11 | > 社区原生Spark运行在MaxCompute里,完全兼容Spark的API,支持多个Spark版本同时运行。MaxCompute Spark提供原生的Spark WebUI供用户查看。 12 | 13 | * 统一的计算资源 14 | > MaxCompute Spark像MaxCompute SQL/MR等任务类型一样,运行在MaxCompute项目开通的统一计算资源中。 15 | 16 | * 统一的数据和权限管理 17 | > 完全遵循MaxCompute项目的权限体系,在访问用户权限范围内安全地查询数据。 18 | 19 | * 与开源系统相同的使用体验 20 | > MaxCompute Spark与社区开源Spark保持相同的体验(例如开源应用的UI界面、在线交互等),完全符合Spark用户使用习惯。开源应用的调试过程中需要使用开源UI,MaxCompute Spark提供原生的开源实时UI和查询历史日志的功能。其中,对于部分开源应用还支持交互式体验,在后台引擎运行后即可进行实时交互。 21 | 22 | ## 系统结构 23 | 24 | MaxCompute Spark是阿里云通过Spark on MaxCompute的解决方案,让原生Spark能够运行在MaxCompute当中。 25 | 26 | ![cupid架构图](resources/cupid_arch.png) 27 | 28 | 左侧是原生Spark的架构图,右边Spark on MaxCompute运行在阿里云自研的Cupid的平台之上,该平台可以原生支持开源社区Yarn所支持的计算框架,如Spark等。 29 | 30 | ## 约束与限制 31 | 32 | 目前MaxCompute Spark支持以下适用场景: 33 | 34 | * 离线计算场景:GraphX、Mllib、RDD、Spark-SQL、PySpark等 35 | * 读写MaxCompute Table 36 | * 引用MaxCompute中的文件资源 37 | * 读写VPC环境下的服务,如RDS、Redis、HBase、ECS上部署的服务等 38 | * 读写OSS非结构化存储 39 | 40 | 暂不支持以下场景: 41 | 42 | * 不支持交互式类需求Spark-Shell、Spark-SQL-Shell、PySpark-Shell等 43 | * 不支持访问Maxcompute外部表,函数和UDF 44 | * 只支持Local模式和Yarn-cluster模式运行 -------------------------------------------------------------------------------- /docs/docs/quickstart/_category_.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "快速开始", 3 | "position": 2, 4 | "link": { 5 | "type": "generated-index" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/docs/quickstart/dataworks-integration.md: -------------------------------------------------------------------------------- 1 | # Spark on Dataworks 2 |

# Dataworks节点使用流程

3 | 4 | * 创建资源 5 | 6 | ![image1](../resources/dataworks-1.jpg) 7 | 8 | * 上传模板项目工程编译出来的jar包 9 | 10 | ![image2](../resources/dataworks-2.jpg) 11 | 12 | * 上传之后一定要commit,也就是红色方框的按钮 13 | 14 | ![image3](../resources/dataworks-3.jpg) 15 | 16 | * 创建ODPS Spark节点 17 | 18 | ![image4](../resources/dataworks-4.jpg) 19 | 20 | * 选择刚才上传的资源并且按照spark-defaults.conf里面的配置填页面上的配置项,并提交 21 | 22 | ![image5](../resources/dataworks-5.jpg) 23 | 24 | * 点击红色方框冒烟测试,冒烟测试按钮旁边的按钮可以查看运行日志 25 | 26 | ![image6](../resources/dataworks-6.jpg) 27 | 28 | * 运行日志 29 | 30 | ![image7](../resources/dataworks-7.jpg) 31 | 32 | 33 |

# Dataworks Spark节点配置

34 | 35 | ## ODPS SPARK节点介绍 36 | 37 | 本质上ODPS SPARK节点的配置对应于spark-submit命令的参数和选项。具体来说 38 | 39 | | 节点 | spark-submit | 40 | | --- | --- | 41 | | 主java/python资源 | app jar or python file | 42 | | 配置项 | --conf PROP=VALUE | 43 | | main class | --class CLASS_NAME | 44 | | 参数 | [app arguments] | 45 | | 选择jar资源 | --jars JARS | 46 | | 选择python资源 | --py-files PY_FILES | 47 | | 选择file资源 | --files FILES | 48 | | 选择archives资源 | --archives ARCHIVES | 49 | 50 | ## 配置项 51 | 52 | 配置项对应于spark-submit命令的--conf选项,其中: 53 | 54 | * accessid,accesskey,projectname,endpoint无需配置,默认是生产账号(有特殊原因可显式配置,将覆盖默认值) 55 | 56 | * 除此之外,需要将spark-default.conf中的配置逐条加到dataworks的配置项中 57 | 58 | ## 如何传参数(如bizdate) 59 | 60 | * 同SQL节点,首先在调度->参数中添加参数 61 | 62 | ![image8](http://ata2-img.cn-hangzhou.img-pub.aliyun-inc.com/e9bcf652514ef95f463039c224d22771.png#alt=image.png)
63 | 64 | * 然后在Spark节点“参数”栏引用该参数,该参数会传给用户主类,用户在代码中解析该参数即可 65 | 66 | Java/Scala: 67 | 68 | ![image9](http://ata2-img.cn-hangzhou.img-pub.aliyun-inc.com/68c282f810d83d3efb9cf2ce2654ad10.png#alt=image.png)
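以Scala为例,一个解析该参数的最小示意如下(假设“参数”栏只配置了${bizdate}一个参数,它会作为第一个命令行参数传入;对象名与变量名仅为示例):
```
object SparkJobWithBizdate {
  def main(args: Array[String]): Unit = {
    // 假设DataWorks“参数”栏配置为 ${bizdate},则args(0)即为调度传入的业务日期
    val bizdate = if (args.nonEmpty) args(0) else ""
    println(s"bizdate = $bizdate")
    // 后续可将bizdate用于SQL拼接、分区过滤等业务逻辑
  }
}
```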
69 | 70 | Python: 71 | ![image10](http://ata2-img.cn-hangzhou.img-pub.aliyun-inc.com/42b4f596eac16f2eb55186f98d02352c.png#alt=image.png) 72 | 73 | 74 | ## 资源上传 75 | 76 | * 用户需要在DataWorks中添加任务需要的资源,这些资源在任务运行时会被上传到用户的工作目录下,资源可能包括: 77 | ``` 78 | 1. jar资源/python资源:对应于spark-submit命令的--jars,--py-files, 79 | 2. file资源,对应于spark-submit命令的--files 80 | 3. archive资源:对应于spark-submit命令的--archives,archives会默认被解压,解压后的文件名等同于资源名去掉后缀。例如上传的资源名是mnist.zip,则解压名为mnist 81 | ``` 82 | 83 | * DataWorks中上传资源限制最大为50MB,如果需要使用更大的资源,用户需要将该资源通过[MaxCompute客户端](https://help.aliyun.com/document_detail/27971.html?spm=a2c4g.11174283.6.990.3158590eUSc7JU)上传为MaxCompute资源,然后将该资源添加到数据开发中,详见[文档](https://help.aliyun.com/document_detail/137575.html?spm=a2c4g.11186623.6.813.665b1861iN9oa8) 84 | 85 | -------------------------------------------------------------------------------- /docs/docs/quickstart/runtime-mode/client-mode.md: -------------------------------------------------------------------------------- 1 | # Client 模式 2 | 为了让Spark Session作为业务框架的后端数据处理服务,MaxCompute Spark 团队开发了“Client”模式来覆盖业务框架向同一个Spark Session 动态提交多个作业、实时获取作业状态的场景。 3 | 4 | 5 | ## Client模式开发初衷 6 | 社区spark生产主要使用"yarn-cluster"、"yarn-client"两种模式。“yarn-cluster”模式将spark作业提交到集群运行,运行完毕客户端打印状态日志;这种模式无法向一个Spark 7 | Session动态多次提交作业,且客户端无法获取每个job的状态及结果。“yarn-client”模式,主要解决spark交互式场景问题,需要在客户端机器启动Driver,无法将Spark 8 | Session作为一个服务。因此我们基于Spark On MaxCompute开发了"Client"模式来解决上面的问题,该模式具有以下特点: 9 | 10 | - 客户端轻量级,不用再启动spark的Driver; 11 | - 客户端有一套API向MaxCompute集群的同一个Spark Session动态提交作业并监控状态; 12 | - 客户端可以通过监控作业状态及结果构建作业之间的依赖关系; 13 | - 用户可以动态编译应用程序jar通过客户端提交到原有的Spark Session运行; 14 | - 客户端可以集成在业务的WebServer中,且可进行水平扩展; 15 | 16 | 17 | ## Client模式简介 18 | client模式是为了解决交互式/在线任务需求。由于cluster模式必须把Driver放置在MaxCompute集群里面,如果我们有在线查询或者交互式的需求,由于网络隔离的原因,无法直接访问到Driver。 19 | 20 | 自研client模式同yarn-cluster模式一样也是把作业提交到MaxCompute集群,跟cluster模式最大的区别是client模式是由用户client端驱动,而cluster模式是由提交到计算集群的应用程序驱动。client模式把spark引擎作为一个在线服务来用,用户可以把client嵌入到在线业务系统进行实时分析。 21 | 22 | client模式提供如下接口,允许多个Spark Job串行/并行执行,并提供多个spark作业共享的Context,允许多个spark作业共享数据。 23 | ``` 24 | /** 25 | * Add the local jar file ,which contains user SparkJobs 26 | * @param localFilePath the local jar file path 27 | * @return return the jarName ,the startjob() will use 28 | */ 29 | def addJar(localFilePath: String): String 30 | 31 | /** 32 | * After add the jar,can start the sparkjob in the jar 33 | * @param className the class name in the jar 34 | * @param jarName jar name return from the addJar() 35 | * @param conf the conf when sparkjob run need 36 | * @return the jobId, getJobStatus/killJob will use 37 | */ 38 | def startJob(className: String, jarName: String, conf: Map[String,String]): String 39 | 40 | /** 41 | * get the jobstatus after the job start 42 | * @param jobId jobId return from the startJob() 43 | * @return the job status ,eg: JobStart,JobSuccess,JobFailed,JobKilled 44 | */ 45 | def getJobStatus(jobId: String): Any 46 | 47 | /** 48 | * stop the remote driver,then can not submit sparkjob 49 | */ 50 | def stopRemoteDriver() 51 | 52 | /** 53 | * kill the sparkjob running 54 | * @param jobId the jobid will kill 55 | */ 56 | def killJob(jobId: String) 57 | ``` 58 | 59 | 60 | 61 | ## Client模式作业提交方式 62 | Client模式与传统 spark-submit 命令行提交方式的最大不同在于再依赖Spark客户端。这带来了两大优势: 63 | 64 | 1. 由于摆脱了Spark客户端的依赖,用户不再需要下载配置Spark环境,大大增加了Client模式的易用性,同时降低了用户的学习成本 65 | 66 | 2. 
由于不再需要上传Spark libraries,启动Client时不再需要上传200M左右的spark libs,既节省了时间又节省了网络开销,真正做到了让用户随时随地都可以提交Spark作业 67 | 68 | 69 | Client模式提供了非常直观的提交参数接口,将在下文详细介绍。 70 | 71 | 72 | ## [](#lg4goi)提交参数接口 73 | ```java 74 | public class SubmitParam { 75 | 76 | // Primary resource 77 | private String file; 78 | 79 | // This field is for Livy, don't have to care if you're using new Client Mode 80 | private String proxyUser; 81 | 82 | // --classname, your driver's classname 83 | private String className; 84 | 85 | // --args, arguments for your spark application 86 | private List args; 87 | 88 | // --jars, extra jars to distribute to driver & executors 89 | private List jars; 90 | 91 | // --py-files, extra python files to distribute to driver & executors 92 | private List pyFiles; 93 | 94 | // --files, extra files to distribute to driver & executors 95 | private List files; 96 | 97 | // --archives, extra archives to distribute to driver & executors 98 | private List archives; 99 | 100 | // --driver-memory 101 | private String driverMemory; 102 | 103 | // --driver-cores 104 | private String driverCores; 105 | 106 | // --executor-memory 107 | private String executorMemory; 108 | 109 | // --executor-cores 110 | private String executorCores; 111 | 112 | // --num-executors 113 | private String numExecutors; 114 | 115 | // --queue, you can ignore it 116 | private String queue; 117 | 118 | // --name, name of the spark application 119 | private String name; 120 | 121 | // --conf, other spark configurations 122 | private Map conf; 123 | ... 124 | } 125 | ``` 126 | * [使用示例](https://github.com/aliyun/MaxCompute-Spark/blob/clientmode-snapshot/spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/clientmode/ClientModeDemo.scala) 127 | 128 | * 由于提交时不再依赖Spark客户端,因此提交参数的接口有一定变化。现在参数统一通过SubmitParam这个接口传递,有两种传参方式: 129 | 130 | 1. 在代码中传递参数: 131 | 132 | ``` 133 | SubmitParam param = new SubmitParam(); 134 | param.setFile("/path/to/primary/resource"); 135 | param.setClassName("classname"); 136 | ``` 137 | 138 | 2. 使用配置文件: 139 | 140 | ``` 141 | SubmitParam param = new SubmitParam(); 142 | param.loadConfFromFile("/path/to/submitparam.conf"); 143 | ``` 144 | 3. 
Demo 145 | checkout到clientmode-snapshot分支 146 | ``` 147 | git checkout clientmode-snapshot 148 | ``` 149 | 编译 150 | ``` 151 | cd spark-2.x 152 | mvn clean package 153 | ``` 154 | 提交执行 155 | ``` 156 | java -cp ./odps-spark-client_2.11-0.0.1-DEV-SNAPSHOT-jar-with-dependencies.jar:./target/spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar com.aliyun.odps.spark.examples.clientmode.ClientModeDemo 157 | ``` -------------------------------------------------------------------------------- /docs/docs/quickstart/runtime-mode/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | slug: /mode 3 | sidebar_position: 1 4 | --- 5 | 6 | # 运行模式 7 | 目前MaxCompute Spark支持以下几种运行方式:client 模式,local模式,cluster模式,以及支持在DataWorks中执行。 8 | 9 | ## Local模式 10 | local模式可用于小批量数据以及计算本地验证,local模式验证通过后再提交到yarn-cluster模式 11 | 12 | **说明** 13 | 具体使用可参考[Local模式](./local-mode.md) 14 | 15 | ``` 16 | ## /path/to/MaxCompute-Spark 请指向正确的编译出来后的application jar包 17 | cd $SPARK_HOME 18 | bin/spark-submit --master local[4] --class com.aliyun.odps.spark.examples.SparkPi \ 19 | /path/to/MaxCompute-Spark/spark-2.x/target/spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 20 | ``` 21 | 22 | ## Cluster模式 23 | **说明** 24 | 具体使用可参考[Yarn Cluster模式](./yarn-cluster.md) 25 | 26 | ``` 27 | ## /path/to/MaxCompute-Spark 请指向正确的编译出来后的application jar包 28 | cd $SPARK_HOME 29 | bin/spark-submit --master yarn-cluster --class com.aliyun.odps.spark.examples.SparkPi \ 30 | /path/to/MaxCompute-Spark/spark-2.x/target/spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 31 | ``` 32 | 33 | ## 在 DataWorks 上执行 34 | Spark作业可以在DataWorks中进行调度,本质上也是采用了Yarn Cluster模式进行任务提交 35 | 36 | **说明** 37 | 具体使用可参考[Spark on Dataworks](../dataworks-integration.md) 38 | -------------------------------------------------------------------------------- /docs/docs/quickstart/runtime-mode/local-mode.md: -------------------------------------------------------------------------------- 1 | # Local 模式 2 |

# Local模式介绍

3 | 4 | * MaxCompute Spark支持用户以原生的Spark Local模式进行任务调试 5 | 6 | * 与Yarn Cluster模式类似,用户首先需要做以下准备工作 7 | ``` 8 | 1. 准备MaxCompute项目以及对应的accessId,accessKey 9 | 2. 下载MaxCompute Spark客户端 10 | 3. 环境变量准备 11 | 4. spark-defaults.conf配置 12 | 5. 下载工程模版并编译 13 | ``` 14 | 15 | * 进行任务提交 16 | ``` 17 | # Java/Scala 18 | cd $SPARK_HOME 19 | ./bin/spark-submit --master local[4] --class com.aliyun.odps.spark.examples.SparkPi \ 20 | /path/to/odps-spark-examples/spark-examples/target/spark-examples-2.0.0-SNAPSHOT-shaded.jar 21 | 22 | # PySpark 23 | cd $SPARK_HOME 24 | ./bin/spark-submit --master local[4] \ 25 | /path/to/odps-spark-examples/spark-examples/src/main/python/odps_table_rw.py 26 | ``` 27 |

# Local模式注意事项

28 | 29 | ``` 30 | 1. Local模式读写Maxcompute表慢,原因是local模式通过Tunnel来读写,读写速度相比于yarn-cluster模式要慢 31 | 32 | 2. Local模式在本地执行,有的用户会遇到local模式下可以访问通VPC,但在yarn-cluster模式下不行。 33 | 原因是local模式处于用户本机环境,网络没有隔离;而yarn-cluster模式处于Maxcompute的网络隔离环境中, 34 | 必须要配置vpc访问的相关参数才行。 35 | 36 | 3. Local模式下访问vpc的endpoint通常是外网endpoint,而yarn-cluster模式下访问vpc的endpoint通常是vpc网络endpoint 37 | 38 | 4. IDEA Local模式下需要将相关配置写在代码中,而在Yarn-Cluster模式运行时一定要将这些配置从代码中去掉 39 | ``` 40 | 41 |

# IDEA Local模式执行

42 | 43 | * Spark可以支持用户在IDEA里支持以Local[N]的模式直接运行代码,而不需要通过命令行提交,用户需要注意以下两点: 44 | ``` 45 | 1. IDEA运行Local模式是不能直接引用spark-defaults.conf里的配置,需要手动在代码里指定相关配置 46 | 47 | 2. 一定要注意需要在IDEA里手动添加MaxCompute Spark客户端的相关依赖(jars目录),否则会出现以下报错: 48 | the value of spark.sql.catalogimplementation should be one of hive in-memory but was odps 49 | ``` 50 | 51 | # 1. 在代码需要手动设置spark config 52 | 53 | ``` 54 | # spark 2.3版本 55 | val spark = SparkSession 56 | .builder() 57 | .appName("SparkPi") 58 | .config("spark.master", "local[4]") // 需设置spark.master为local[N]才能直接运行,N为并发数 59 | .config("spark.hadoop.odps.project.name", "****") 60 | .config("spark.hadoop.odps.access.id", "****") 61 | .config("spark.hadoop.odps.access.key", "****") 62 | .config("spark.sql.catalogImplementation", "odps") 63 | .config("spark.hadoop.odps.end.point", "http://service.cn.maxcompute.aliyun.com/api") 64 | .getOrCreate() 65 | 66 | # 注意,如果使用spark 2.4.5及以上的版本,需要在代码中配置spark.sql.catalogImplementation=hive,不再需要在代码中配置spark.hadoop.odps.project.name,spark.hadoop.odps.access.id,spark.hadoop.odps.access.key,spark.hadoop.odps.end.point这几个参数 67 | 只要在代码的resources目录下(类加载器能加载的目录)创建一个名为odps.conf的文件,然后添加以下配置,注意在集群模式中需要将该文件删除: 68 | 69 | odps.project.name=*** 70 | odps.access.id=*** 71 | odps.access.key=*** 72 | odps.end.point=*** 73 | 74 | 75 | 76 | 77 | 78 | ``` 79 | 80 | # 2. 在IDEA里手动添加MaxCompute Spark客户端的相关依赖(下图无法显示请参考[文档](https://github.com/aliyun/MaxCompute-Spark/wiki/12.-Github%E5%9B%BE%E7%89%87%E6%97%A0%E6%B3%95%E8%AE%BF%E9%97%AE%E7%9A%84%E9%97%AE%E9%A2%98)) 81 | 82 | ![image1](../../resources/idea-local-1.jpg) 83 | 84 | ![image2](../../resources/idea-local-2.jpg) 85 | 86 | ![image3](../../resources/idea-local-3.jpg) 87 | 88 | ![image4](../../resources/idea-local-4.jpg) 89 | 90 | ![image5](../../resources/idea-local-5.jpg) -------------------------------------------------------------------------------- /docs/docs/quickstart/runtime-mode/yarn-cluster.md: -------------------------------------------------------------------------------- 1 | # Yarn Cluster 模式 2 | 快速导航 3 | + [下载MaxCompute Spark客户端](#1) 4 | + [设置环境变量](#2) 5 | + [配置spark-defaults.conf](#3) 6 | + [准备项目工程](#4) 7 | + [SparkPi 冒烟测试](#5) 8 | ----------------- 9 | 10 |

# 下载MaxCompute Spark客户端

11 | 12 | MaxCompute Spark发布包集成了MaxCompute认证功能。作为客户端工具,它用于通过spark-submit方式提交作业到MaxCompute项目中运行。 13 | 14 | 目前Spark版本支持如下,请优先使用Spark 2以上的版本! 15 | * [spark-1.6.3](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/1.6.3-public/spark-1.6.3-public.tar.gz) 16 | 17 | 专有云: 18 | * [spark-2.3.0](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/2.3.0-odps0.33.0/spark-2.3.0-odps0.33.0.tar.gz) 19 | * [spark-2.4.5](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/2.4.5-odps0.33.4/spark-2.4.5-odps0.33.4.tar.gz) 20 | * [spark-3.1.1](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/3.1.1-odps0.33.0/spark-3.1.1-odps0.33.0.tar.gz) 21 | 22 | 公共云: 23 | * [spark-2.3.0](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/2.3.0-odps0.34.0/spark-2.3.0-odps0.34.0.tar.gz) 24 | * [spark-2.4.5](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/2.4.5-odps0.34.0/spark-2.4.5-odps0.34.0.tar.gz) 25 | * [spark-3.1.1](https://odps-repo.oss-cn-hangzhou.aliyuncs.com/spark/3.1.1-odps0.34.1/spark-3.1.1-odps0.34.1.tar.gz) 26 | 27 |

# 设置环境变量

28 | 29 | * JAVA_HOME设置 30 | 31 | ``` 32 | ## 推荐使用JDK 1.8 33 | export JAVA_HOME=/path/to/jdk 34 | export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar 35 | export PATH=$JAVA_HOME/bin:$PATH 36 | ``` 37 | 38 | * SPARK_HOME设置 39 | 40 | ``` 41 | ## 下载上文提到的MaxCompute Spark客户端并解压到本地任意路径 42 | ## 请不要直接设置SPARK_HOME等于以下路径下述路径仅做展示用途 43 | ## 请指向正确的路径 44 | export SPARK_HOME=/path/to/spark_extracted_package 45 | export PATH=$SPARK_HOME/bin:$PATH 46 | ``` 47 | 48 | * PySpark的用户请安装Python2.7版本,并设置PATH 49 | 50 | ``` 51 | export PATH=/path/to/python/bin/:$PATH 52 | ``` 53 | 54 | * HADOOP_CONF_DIR设置:注意Spark 2.4.5和Spark 3必须要设置该参数 55 | 56 | ``` 57 | export HADOOP_CONF_DIR=$SPARK_HOME/conf 58 | ``` 59 | 60 |

# 配置spark-defaults.conf

61 | 62 | + 第一次下载MaxCompute Spark客户端后,需要配置spark-defaults.conf 63 | + 在 $SPARK_HOME/conf/ 下面有一个文件名称为 spark-defaults.conf.template。请将其重命名为 spark-defaults.conf 后再进行相关配置(很多人会忽略这一步,导致配置无法生效) 64 | 65 | ``` 66 | ## spark-defaults.conf 67 | ## 一般来说默认的template只需要再填上MaxCompute相关的账号信息就可以使用Spark 68 | spark.hadoop.odps.project.name = 69 | spark.hadoop.odps.access.id = 70 | spark.hadoop.odps.access.key = 71 | 72 | ## 其他的配置直接采用以下参数即可 73 | spark.hadoop.odps.end.point = http://service.cn.maxcompute.aliyun.com/api 74 | spark.hadoop.odps.runtime.end.point = http://service.cn.maxcompute.aliyun-inc.com/api 75 | 76 | ##########-------注意catalog设置-------########## 77 | ### spark 2.3.0请将该参数设置为odps 78 | spark.sql.catalogImplementation=odps 79 | 80 | ### spark 2.4.5请将该参数设置为hive 81 | spark.sql.catalogImplementation=hive 82 | 83 | ### spark 3.1.1参数变化 84 | spark.sql.defaultCatalog=odps 85 | spark.sql.catalog.odps=org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog 86 | spark.sql.sources.partitionOverwriteMode=dynamic 87 | spark.sql.extensions=org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions 88 | 89 | ``` 90 | 91 | 如果有一些特殊的场景还有功能,还可以开启另外的一些配置,见[Spark配置详解](https://github.com/aliyun/MaxCompute-Spark/wiki/07.-Spark%E9%85%8D%E7%BD%AE%E8%AF%A6%E8%A7%A3) 92 | 93 | Spark 2.4.5的参数变化,详见[Spark 2.4.5使用注意事项](https://github.com/aliyun/MaxCompute-Spark/wiki/06.-Spark-2.4.5-%E4%BD%BF%E7%94%A8%E6%B3%A8%E6%84%8F%E4%BA%8B%E9%A1%B9) 94 | 95 | Spark 3.1.1的参数变化,详见[Spark 3.1.1使用注意事项](https://github.com/aliyun/MaxCompute-Spark/wiki/06.-Spark-3.1.1-%E4%BD%BF%E7%94%A8%E6%B3%A8%E6%84%8F%E4%BA%8B%E9%A1%B9) 96 | 97 |

# 准备项目工程

98 | 99 | + MaxCompute Spark提供了项目工程模版,建议开发者下载模版复制后直接在模版里开发 100 | 101 | + 可以看到模版工程里的关于spark的依赖的scope都是provided的,这个请务必不要更改,否则提交的作业无法正常运行 102 | 103 | spark-1.x 模板及编译 104 | 105 | ``` 106 | git clone https://github.com/aliyun/MaxCompute-Spark.git 107 | cd spark-1.x 108 | mvn clean package 109 | ``` 110 | 111 | spark-2.x 模板及编译 112 | 113 | ``` 114 | git clone https://github.com/aliyun/MaxCompute-Spark.git 115 | cd spark-2.x 116 | mvn clean package 117 | ``` 118 | 119 | spark-3.x 模板及编译 120 | 121 | ``` 122 | git clone https://github.com/aliyun/MaxCompute-Spark.git 123 | cd spark-3.x 124 | mvn clean package 125 | ``` 126 | 127 |

# SparkPi 冒烟测试

128 | 129 | 在完成了以上的工作后,可以来进行冒烟测试,验证MaxCompute Spark是否E2E走通,需要以下前提: 130 | 131 | * 准备MaxCompute项目以及对应的accessId,accessKey 132 | * 下载MaxCompute Spark客户端 133 | * 环境变量准备 134 | * spark-defaults.conf配置 135 | * 下载工程模版并编译 136 | 137 | 以 spark-2.x 为例,我们可以提交一个SparkPi来验证功能是否正常,提交命令如下: 138 | 139 | ``` 140 | ## /path/to/MaxCompute-Spark 请指向正确的编译出来后的application jar包 141 | 142 | ## bash环境 143 | cd $SPARK_HOME 144 | bin/spark-submit --master yarn-cluster --class com.aliyun.odps.spark.examples.SparkPi \ 145 | /path/to/MaxCompute-Spark/spark-2.x/target/spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 146 | 147 | ## 在windows环境提交 148 | cd $SPARK_HOME/bin 149 | spark-submit.cmd --master yarn-cluster --class com.aliyun.odps.spark.examples.SparkPi 150 | \path\to\MaxCompute-Spark\spark-2.x\target\spark-examples_2.11-1.0.0-SNAPSHOT-shaded.jar 151 | 152 | ## 当看到以下日志则表明冒烟作业成功 153 | 19/06/11 11:57:30 INFO Client: 154 | client token: N/A 155 | diagnostics: N/A 156 | ApplicationMaster host: 11.222.166.90 157 | ApplicationMaster RPC port: 38965 158 | queue: queue 159 | start time: 1560225401092 160 | final status: SUCCEEDED 161 | ``` -------------------------------------------------------------------------------- /docs/docs/resources/ENI-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/ENI-1.png -------------------------------------------------------------------------------- /docs/docs/resources/ENI-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/ENI-2.png -------------------------------------------------------------------------------- /docs/docs/resources/ENI-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/ENI-3.png -------------------------------------------------------------------------------- /docs/docs/resources/ENI-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/ENI-4.png -------------------------------------------------------------------------------- /docs/docs/resources/ENI-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/ENI-5.png -------------------------------------------------------------------------------- /docs/docs/resources/OOM1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/OOM1.png -------------------------------------------------------------------------------- /docs/docs/resources/cloudmonitor-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/cloudmonitor-1.png -------------------------------------------------------------------------------- /docs/docs/resources/cloudmonitor-2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/cloudmonitor-2.png -------------------------------------------------------------------------------- /docs/docs/resources/cloudmonitor-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/cloudmonitor-3.png -------------------------------------------------------------------------------- /docs/docs/resources/cupid_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/cupid_arch.png -------------------------------------------------------------------------------- /docs/docs/resources/datahub-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/datahub-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/datahub-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/datahub-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/datahub-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/datahub-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-4.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-5.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-5.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-6.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dataworks-7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dataworks-7.jpg -------------------------------------------------------------------------------- /docs/docs/resources/dingtalk-share.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/dingtalk-share.jpg -------------------------------------------------------------------------------- /docs/docs/resources/fuxisensor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/fuxisensor.png -------------------------------------------------------------------------------- /docs/docs/resources/fuxisensor2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/fuxisensor2.png -------------------------------------------------------------------------------- /docs/docs/resources/idea-local-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/idea-local-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/idea-local-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/idea-local-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/idea-local-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/idea-local-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/idea-local-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/idea-local-4.jpg -------------------------------------------------------------------------------- /docs/docs/resources/idea-local-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/idea-local-5.jpg 
-------------------------------------------------------------------------------- /docs/docs/resources/jobview-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/jobview-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/jobview-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/jobview-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/jobview-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/jobview-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/jobview-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/jobview-4.jpg -------------------------------------------------------------------------------- /docs/docs/resources/jobview-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/jobview-5.jpg -------------------------------------------------------------------------------- /docs/docs/resources/log4j2-stderr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/log4j2-stderr.jpg -------------------------------------------------------------------------------- /docs/docs/resources/log4j2-stdout.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/log4j2-stdout.jpg -------------------------------------------------------------------------------- /docs/docs/resources/logview-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/logview-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/logview-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/logview-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/logview-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/logview-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/logview-4.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/logview-4.jpg -------------------------------------------------------------------------------- /docs/docs/resources/logview-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/logview-5.jpg -------------------------------------------------------------------------------- /docs/docs/resources/oss-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/oss-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/oss-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/oss-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/oss-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/oss-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/sparkui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/sparkui.png -------------------------------------------------------------------------------- /docs/docs/resources/vpc-access-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/vpc-access-1.jpg -------------------------------------------------------------------------------- /docs/docs/resources/vpc-access-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/vpc-access-2.jpg -------------------------------------------------------------------------------- /docs/docs/resources/vpc-access-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/vpc-access-3.jpg -------------------------------------------------------------------------------- /docs/docs/resources/资源申请1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/资源申请1.png -------------------------------------------------------------------------------- /docs/docs/resources/资源申请2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/docs/resources/资源申请2.png -------------------------------------------------------------------------------- /docs/docusaurus.config.js: 
-------------------------------------------------------------------------------- 1 | // @ts-check 2 | // `@type` JSDoc annotations allow editor autocompletion and type checking 3 | // (when paired with `@ts-check`). 4 | // There are various equivalent ways to declare your Docusaurus config. 5 | // See: https://docusaurus.io/docs/api/docusaurus-config 6 | 7 | import {themes as prismThemes} from 'prism-react-renderer'; 8 | 9 | /** @type {import('@docusaurus/types').Config} */ 10 | const config = { 11 | title: 'MaxCompute Spark', 12 | // Set the // pathname under which your site is served 13 | // For GitHub pages deployment, it is often '//' 14 | url: 'https://aliyun.github.io', 15 | baseUrl: '/MaxCompute-Spark/', 16 | 17 | // GitHub pages deployment config. 18 | // If you aren't using GitHub pages, you don't need these. 19 | organizationName: 'aliyun', // Usually your GitHub org/user name. 20 | projectName: 'MaxCompute-Spark', // Usually your repo name. 21 | trailingSlash: 'true', 22 | 23 | onBrokenAnchors: 'ignore', 24 | onBrokenLinks: 'ignore', 25 | onBrokenMarkdownLinks: 'ignore', 26 | 27 | markdown: { 28 | mermaid: true, 29 | }, 30 | themes: ['@docusaurus/theme-mermaid'], 31 | 32 | // Even if you don't use internationalization, you can use this field to set 33 | // useful metadata like html lang. For example, if your site is Chinese, you 34 | // may want to replace "en" with "zh-Hans". 35 | i18n: { 36 | defaultLocale: 'zh-Hans', 37 | locales: ['zh-Hans'], 38 | }, 39 | 40 | presets: [ 41 | [ 42 | '@docusaurus/preset-classic', 43 | { 44 | docs: { 45 | routeBasePath: '/', // Serve the docs at the site's root 46 | sidebarPath: './sidebars.js', 47 | }, 48 | blog: false, 49 | theme: { 50 | customCss: './src/css/custom.css', 51 | }, 52 | }, 53 | ], 54 | ], 55 | 56 | themeConfig: 57 | /** @type {import('@docusaurus/preset-classic').ThemeConfig} */ 58 | ({ 59 | docs: { 60 | sidebar: { 61 | hideable: true, 62 | autoCollapseCategories: true, 63 | }, 64 | }, 65 | image: 'img/logo.svg', 66 | navbar: { 67 | title: 'MaxCompute Spark', 68 | logo: { 69 | alt: 'MaxCompute Logo', 70 | src: 'img/logo.svg', 71 | }, 72 | items: [ 73 | { 74 | type: 'docSidebar', 75 | sidebarId: 'docs', 76 | position: 'left', 77 | label: '文档', 78 | }, 79 | // { 80 | // href: 'https://github.com/aliyun/aliyun-odps-jdbc', 81 | // position: 'right', 82 | // label: '使用 JDBC 链接 MaxCompute', 83 | // }, 84 | // { 85 | // type: 'docsVersionDropdown', 86 | // sidebarId: 'version', 87 | // position: 'left', 88 | // dropdownActiveClassDisabled: true, 89 | // }, 90 | // { 91 | // href: 'https://github.com/aliyun/aliyun-odps-java-sdk', 92 | // label: 'GitHub', 93 | // position: 'right', 94 | // }, 95 | ], 96 | }, 97 | prism: { 98 | theme: prismThemes.github, 99 | darkTheme: prismThemes.dracula, 100 | additionalLanguages: ['java'], 101 | }, 102 | }), 103 | }; 104 | 105 | export default config; 106 | 107 | -------------------------------------------------------------------------------- /docs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "odps-sdk-doc", 3 | "version": "0.0.0", 4 | "private": true, 5 | "scripts": { 6 | "docusaurus": "docusaurus", 7 | "start": "docusaurus start", 8 | "build": "docusaurus build", 9 | "swizzle": "docusaurus swizzle", 10 | "deploy": "docusaurus deploy", 11 | "clear": "docusaurus clear", 12 | "serve": "docusaurus serve", 13 | "write-translations": "docusaurus write-translations", 14 | "write-heading-ids": "docusaurus write-heading-ids" 15 | }, 16 | 
"dependencies": { 17 | "@docusaurus/core": "3.5.2", 18 | "@docusaurus/preset-classic": "3.5.2", 19 | "@docusaurus/theme-mermaid": "3.5.2", 20 | "@mdx-js/react": "^3.0.0", 21 | "clsx": "^2.0.0", 22 | "prism-react-renderer": "^2.3.0", 23 | "react": "^18.0.0", 24 | "react-dom": "^18.0.0" 25 | }, 26 | "devDependencies": { 27 | "@docusaurus/module-type-aliases": "3.0.1", 28 | "@docusaurus/types": "3.0.1", 29 | "eslint": "^8.56.0", 30 | "eslint-plugin-react": "^7.33.2" 31 | }, 32 | "browserslist": { 33 | "production": [ 34 | ">0.5%", 35 | "not dead", 36 | "not op_mini all" 37 | ], 38 | "development": [ 39 | "last 3 chrome version", 40 | "last 3 firefox version", 41 | "last 5 safari version" 42 | ] 43 | }, 44 | "engines": { 45 | "node": ">=18.0" 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /docs/sidebars.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Creating a sidebar enables you to: 3 | - create an ordered group of docs 4 | - render a sidebar for each doc of that group 5 | - provide next/previous navigation 6 | 7 | The sidebars can be generated from the filesystem, or explicitly defined here. 8 | 9 | Create as many sidebars as you want. 10 | */ 11 | 12 | // @ts-check 13 | 14 | /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */ 15 | const sidebars = { 16 | // By default, Docusaurus generates a sidebar from the docs folder structure 17 | docs: [{type: 'autogenerated', dirName: '.'}], 18 | 19 | // But you can create a sidebar manually 20 | /* 21 | tutorialSidebar: [ 22 | 'intro', 23 | 'hello', 24 | { 25 | type: 'category', 26 | label: 'Tutorial', 27 | items: ['tutorial-basics/create-a-document'], 28 | }, 29 | ], 30 | */ 31 | }; 32 | 33 | export default sidebars; 34 | -------------------------------------------------------------------------------- /docs/src/components/HomepageFeatures/index.js: -------------------------------------------------------------------------------- 1 | import clsx from 'clsx'; 2 | import Heading from '@theme/Heading'; 3 | import styles from './styles.module.css'; 4 | 5 | const FeatureList = [ 6 | { 7 | title: 'Easy to Use', 8 | Svg: require('@site/static/img/undraw_docusaurus_mountain.svg').default, 9 | description: ( 10 | <> 11 | Docusaurus was designed from the ground up to be easily installed and 12 | used to get your website up and running quickly. 13 | 14 | ), 15 | }, 16 | { 17 | title: 'Focus on What Matters', 18 | Svg: require('@site/static/img/undraw_docusaurus_tree.svg').default, 19 | description: ( 20 | <> 21 | Docusaurus lets you focus on your docs, and we'll do the chores. Go 22 | ahead and move your docs into the docs directory. 23 | 24 | ), 25 | }, 26 | { 27 | title: 'Powered by React', 28 | Svg: require('@site/static/img/undraw_docusaurus_react.svg').default, 29 | description: ( 30 | <> 31 | Extend or customize your website layout by reusing React. Docusaurus can 32 | be extended while reusing the same header and footer. 33 | 34 | ), 35 | }, 36 | ]; 37 | 38 | function Feature({Svg, title, description}) { 39 | return ( 40 |
<div className={clsx('col col--4')}> 41 | <div className="text--center"> 42 | <Svg className={styles.featureSvg} role="img" /> 43 | </div> 44 | <div className="text--center padding-horiz--md"> 45 | <Heading as="h3">{title}</Heading> 46 | <p>{description}</p> 47 | </div> 48 | </div>
49 | ); 50 | } 51 | 52 | export default function HomepageFeatures() { 53 | return ( 54 |
<section className={styles.features}> 55 | <div className="container"> 56 | <div className="row"> 57 | {FeatureList.map((props, idx) => ( 58 | <Feature key={idx} {...props} /> 59 | ))} 60 | </div> 61 | </div> 62 | </section>
63 | ); 64 | } 65 | -------------------------------------------------------------------------------- /docs/src/components/HomepageFeatures/styles.module.css: -------------------------------------------------------------------------------- 1 | .features { 2 | display: flex; 3 | align-items: center; 4 | padding: 2rem 0; 5 | width: 100%; 6 | } 7 | 8 | .featureSvg { 9 | height: 200px; 10 | width: 200px; 11 | } 12 | -------------------------------------------------------------------------------- /docs/src/css/custom.css: -------------------------------------------------------------------------------- 1 | /** 2 | * Any CSS included here will be global. The classic template 3 | * bundles Infima by default. Infima is a CSS framework designed to 4 | * work well for content-centric websites. 5 | */ 6 | 7 | /* You can override the default Infima variables here. */ 8 | :root { 9 | --ifm-color-primary: #2e8555; 10 | --ifm-color-primary-dark: #29784c; 11 | --ifm-color-primary-darker: #277148; 12 | --ifm-color-primary-darkest: #205d3b; 13 | --ifm-color-primary-light: #33925d; 14 | --ifm-color-primary-lighter: #359962; 15 | --ifm-color-primary-lightest: #3cad6e; 16 | --ifm-code-font-size: 95%; 17 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); 18 | } 19 | 20 | /* For readability concerns, you should choose a lighter palette in dark mode. */ 21 | [data-theme='dark'] { 22 | --ifm-color-primary: #25c2a0; 23 | --ifm-color-primary-dark: #21af90; 24 | --ifm-color-primary-darker: #1fa588; 25 | --ifm-color-primary-darkest: #1a8870; 26 | --ifm-color-primary-light: #29d5b0; 27 | --ifm-color-primary-lighter: #32d8b4; 28 | --ifm-color-primary-lightest: #4fddbf; 29 | --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3); 30 | } 31 | -------------------------------------------------------------------------------- /docs/src/locales.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "zh-cn", 4 | "sourceDir": "docs/zh" 5 | }, 6 | { 7 | "id": "en-us", 8 | "sourceDir": "docs/en" 9 | } 10 | ] -------------------------------------------------------------------------------- /docs/src/pages/index.js: -------------------------------------------------------------------------------- 1 | import clsx from 'clsx'; 2 | import Link from '@docusaurus/Link'; 3 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; 4 | import Layout from '@theme/Layout'; 5 | import Heading from '@theme/Heading'; 6 | import styles from './index.module.css'; 7 | 8 | 9 | /** 10 | * FIXME: 理论上有办法让用户直接进入到文档界面,而不是进入一个只有“进入文档”入口的标题界面。 11 | */ 12 | function HomepageHeader() { 13 | const {siteConfig} = useDocusaurusContext(); 14 | return ( 15 |
16 |
17 | 18 | MaxCompute Spark 使用文档 19 | 20 |
21 | 24 | 进入文档 📚 25 | 26 |
27 |
28 |
29 | ); 30 | } 31 | 32 | export default function Home() { 33 | const {siteConfig} = useDocusaurusContext(); 34 | return ( 35 | 38 | 39 | 40 | ); 41 | } 42 | -------------------------------------------------------------------------------- /docs/src/pages/index.module.css: -------------------------------------------------------------------------------- 1 | /** 2 | * CSS files with the .module.css suffix will be treated as CSS modules 3 | * and scoped locally. 4 | */ 5 | 6 | .heroBanner { 7 | padding: 4rem 0; 8 | text-align: center; 9 | position: relative; 10 | overflow: hidden; 11 | } 12 | 13 | @media screen and (max-width: 996px) { 14 | .heroBanner { 15 | padding: 2rem; 16 | } 17 | } 18 | 19 | .buttons { 20 | display: flex; 21 | align-items: center; 22 | justify-content: center; 23 | } 24 | -------------------------------------------------------------------------------- /docs/static/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/static/.nojekyll -------------------------------------------------------------------------------- /docs/static/img/docusaurus-social-card.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/static/img/docusaurus-social-card.jpg -------------------------------------------------------------------------------- /docs/static/img/docusaurus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/static/img/docusaurus.png -------------------------------------------------------------------------------- /docs/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/docs/static/img/favicon.ico -------------------------------------------------------------------------------- /docs/static/img/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hook/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | STAGE_FILES=$(git diff --cached --name-only --diff-filter=ACM) 4 | 5 | echo 'check sensitive information ...' 6 | FAIL=0 7 | for FILE in $STAGE_FILES 8 | do 9 | grep --color -Hni -E "(ssh-rsa|authorized_keys|id_dsa|ssh-keygen)" $FILE && FAIL=1 10 | grep --color -Hni -E "(private key|secret|signature|accessid|access_id|access_key|accesskey|access_|password)(.*?)(\=|\:)(\s*)(\'|\")[^\$^%][^)]+(\'|\")[^)]*$" $FILE && FAIL=1 11 | done 12 | 13 | if [ ${FAIL} == 0 ]; then 14 | echo 'check sensitive information ... passed' 15 | exit 0 16 | else 17 | echo 'check sensitive information ... failed' 18 | exit 1 19 | fi 20 | -------------------------------------------------------------------------------- /spark-1.x/src/main/java/com/aliyun/odps/spark/examples/sparksql/JavaSparkSQL.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.sparksql; 20 | 21 | import com.aliyun.odps.Odps; 22 | import com.aliyun.odps.cupid.CupidSession; 23 | import org.apache.spark.SparkConf; 24 | import org.apache.spark.api.java.JavaSparkContext; 25 | import org.apache.spark.api.java.JavaRDD; 26 | import org.apache.spark.api.java.function.Function; 27 | import org.apache.spark.sql.odps.OdpsContext; 28 | import org.apache.spark.sql.DataFrame; 29 | import org.apache.spark.sql.Row; 30 | import org.apache.spark.sql.RowFactory; 31 | 32 | import org.apache.spark.sql.types.*; 33 | 34 | import java.util.ArrayList; 35 | import java.util.List; 36 | 37 | import org.apache.spark.sql.types.StructField; 38 | 39 | public class JavaSparkSQL { 40 | 41 | public static void main(String[] args) { 42 | SparkConf conf = new SparkConf() 43 | .set("spark.hadoop.odps.exec.dynamic.partition.mode", "nonstrict") 44 | .setAppName("sparkSQL"); 45 | JavaSparkContext sc = new JavaSparkContext(conf); 46 | OdpsContext odpsContext = new OdpsContext(sc); 47 | 48 | String project = sc.getConf().get("odps.project.name"); 49 | String tableName = "mc_test_table"; 50 | String tableNameCopy = "mc_test_table_copy"; 51 | String ptTableName = "mc_test_pt_table"; 52 | 53 | 54 | odpsContext.sql("DROP TABLE IF EXISTS " + tableName); 55 | odpsContext.sql("DROP TABLE IF EXISTS " + tableNameCopy); 56 | odpsContext.sql("DROP TABLE IF EXISTS " + ptTableName); 57 | 58 | 59 | odpsContext.sql("CREATE TABLE " + tableName + " (name STRING, num BIGINT)"); 60 | odpsContext.sql("CREATE TABLE " + ptTableName+ " (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)"); 61 | 62 | odpsContext.sql("DESCRIBE " + tableName); 63 | odpsContext.sql("DESCRIBE " + ptTableName); 64 | 65 | List data = new ArrayList(); 66 | for (int i = 0; i < 100 ; i++) { 67 | data.add(i); 68 | } 69 | 70 | JavaRDD dfRDD = sc.parallelize(data, 2).map(new Function() { 71 | public Row call(Integer i) { 72 | return RowFactory.create( 73 | "name-" + i.toString(), 74 | Long.valueOf(i)); 75 | } 76 | }); 77 | 78 | JavaRDD ptDfRDD = sc.parallelize(data, 2).map(new Function() { 79 | public Row call(Integer i) { 80 | return RowFactory.create( 81 | "name-" + i.toString(), 82 | Long.valueOf(i), 83 | "2018", 84 | "0601"); 85 | } 86 | }); 87 | 88 | List structFilelds = new ArrayList(); 89 | structFilelds.add(DataTypes.createStructField("name", DataTypes.StringType, true)); 90 | structFilelds.add(DataTypes.createStructField("num", DataTypes.LongType, true)); 91 | DataFrame df = odpsContext.createDataFrame(dfRDD, DataTypes.createStructType(structFilelds)); 92 | 93 | structFilelds.add(DataTypes.createStructField("pt1", DataTypes.StringType, true)); 94 | structFilelds.add(DataTypes.createStructField("pt2", DataTypes.StringType, true)); 95 | DataFrame ptDf = odpsContext.createDataFrame(ptDfRDD, DataTypes.createStructType(structFilelds)); 96 | 97 | // 写 普通表 98 | df.write().insertInto(tableName); // insertInto语义 99 | df.write().mode("overwrite").insertInto(tableName);// insertOverwrite语义 100 | 101 | // 读 普通表 102 | DataFrame rdf =odpsContext.sql("select name, num from "+ tableName); 103 | System.out.println("rdf count: "+ rdf.count()); 104 | 
rdf.printSchema(); 105 | 106 | //create table as select 107 | odpsContext.sql("CREATE TABLE " + tableNameCopy +" AS SELECT name, num FROM " + tableName); 108 | odpsContext.sql("SELECT * FROM " + tableNameCopy).show(); 109 | 110 | // 写 分区表 111 | // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 112 | df.registerTempTable(ptTableName +"_tmp_view"); 113 | odpsContext.sql("insert into table " + ptTableName + " partition (pt1='2018', pt2='0601') select * from " + ptTableName + "_tmp_view"); 114 | odpsContext.sql("insert overwrite table " + ptTableName+ " partition (pt1='2018', pt2='0601') select * from " + ptTableName+ "_tmp_view"); 115 | 116 | ptDf.write().partitionBy("pt1", "pt2").insertInto(ptTableName);// 动态分区 insertInto语义 117 | ptDf.write().partitionBy("pt1", "pt2").mode("overwrite").insertInto(ptTableName); // 动态分区 insertOverwrite语义 118 | 119 | // 读 分区表 120 | DataFrame rptdf = odpsContext.sql("select name, num, pt1, pt2 from " + ptTableName + " where pt1 = '2018' and pt2 = '0601'"); 121 | System.out.println("rptdf count: "+ rptdf.count()); 122 | rptdf.printSchema(); 123 | 124 | 125 | Odps odps = CupidSession.get().odps(); 126 | System.out.println(odps.tables().get(ptTableName).getPartitions().size()); 127 | System.out.println(odps.tables().get(ptTableName).getPartitions().get(0).getPartitionSpec()); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /spark-1.x/src/main/python/spark_sql.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext, SparkConf 2 | from pyspark.sql import OdpsContext 3 | 4 | if __name__ == '__main__': 5 | conf = SparkConf().setAppName("odps_pyspark") 6 | sc = SparkContext(conf=conf) 7 | sql_context = OdpsContext(sc) 8 | sql_context.sql("DROP TABLE IF EXISTS spark_sql_test_table") 9 | sql_context.sql("CREATE TABLE spark_sql_test_table(name STRING, num BIGINT)") 10 | sql_context.sql("INSERT INTO TABLE spark_sql_test_table SELECT 'abc', 100000") 11 | sql_context.sql("SELECT * FROM spark_sql_test_table").show() 12 | sql_context.sql("SELECT COUNT(*) FROM spark_sql_test_table").show() -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import scala.math.random 22 | 23 | import org.apache.spark._ 24 | 25 | object SparkPi { 26 | def main(args: Array[String]) { 27 | val conf = new SparkConf().setAppName("Spark Pi") 28 | val sc = new SparkContext(conf) 29 | try { 30 | val slices = if (args.length > 0) args(0).toInt else 2 31 | val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow 32 | val count = sc.parallelize(1 until n, slices).map { i => 33 | val x = random * 2 - 1 34 | val y = random * 2 - 1 35 | if (x * x + y * y < 1) 1 else 0 36 | }.reduce(_ + _) 37 | println("Pi is roughly " + 4.0 * count / n) 38 | } finally { 39 | sc.stop() 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/WordCount.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.SparkConf 23 | 24 | object WordCount { 25 | def main(args: Array[String]) { 26 | val conf = new SparkConf().setAppName("WordCount") 27 | val sc = new SparkContext(conf) 28 | try { 29 | sc.parallelize(1 to 100, 10).map(word => (word, 1)).reduceByKey(_ + _, 10).take(100).foreach(println) 30 | } finally { 31 | sc.stop() 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/graphx/PageRank.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.graphx 20 | 21 | import org.apache.spark.graphx.{Edge, Graph, VertexId} 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.{SparkConf, SparkContext} 24 | 25 | object PageRank { 26 | def main(args: Array[String]): Unit = { 27 | val conf = new SparkConf().setAppName("PageRank") 28 | val sc = new SparkContext(conf) 29 | 30 | // build vertices 31 | val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( 32 | "1,BarackObama,Barack Obama", 33 | "2,ladygaga,Goddess of Love", 34 | "3,jeresig,John Resig", 35 | "4,justinbieber,Justin Bieber", 36 | "6,matei_zaharia,Matei Zaharia", 37 | "7,odersky,Martin Odersky", 38 | "8,anonsys" 39 | ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) 40 | 41 | // build edges 42 | val followers: RDD[Edge[Double]] = sc.parallelize(Array( 43 | Edge(2L, 1L, 1.0), 44 | Edge(4L, 1L, 1.0), 45 | Edge(1L, 2L, 1.0), 46 | Edge(6L, 3L, 1.0), 47 | Edge(7L, 3L, 1.0), 48 | Edge(7L, 6L, 1.0), 49 | Edge(6L, 7L, 1.0), 50 | Edge(3L, 7L, 1.0) 51 | )) 52 | 53 | // build graph 54 | val followerGraph: Graph[Array[String], Double] = Graph(users, followers) 55 | 56 | // restrict the graph to users with usernames and names 57 | val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) 58 | 59 | // compute PageRank 60 | val pageRankGraph = subgraph.pageRank(0.001) 61 | 62 | // get attributes of the top pagerank users 63 | val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { 64 | case (uid, attrList, Some(pr)) => (pr, attrList.toList) 65 | case (uid, attrList, None) => (0.0, attrList.toList) 66 | } 67 | 68 | println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/mllib/KmeansModelSaveToOss.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.mllib 20 | 21 | import org.apache.spark.mllib.clustering.KMeans._ 22 | import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} 23 | import org.apache.spark.mllib.linalg.Vectors 24 | import org.apache.spark.{SparkConf, SparkContext} 25 | 26 | object KmeansModelSaveToOss { 27 | val modelOssDir = "oss://bucket/kmeans-model" 28 | 29 | def main(args: Array[String]) { 30 | 31 | //1. train and save the model 32 | val conf = new SparkConf().setAppName("KmeansModelSaveToOss") 33 | conf.set("spark.hadoop.fs.oss.credentials.provider", "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider") 34 | conf.set("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole") 35 | conf.set("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com") 36 | 37 | val sc = new SparkContext(conf) 38 | val points = Seq( 39 | Vectors.dense(0.0, 0.0), 40 | Vectors.dense(0.0, 0.1), 41 | Vectors.dense(0.1, 0.0), 42 | Vectors.dense(9.0, 0.0), 43 | Vectors.dense(9.0, 0.2), 44 | Vectors.dense(9.2, 0.0) 45 | ) 46 | val rdd = sc.parallelize(points, 3) 47 | val initMode = K_MEANS_PARALLEL 48 | val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode) 49 | val predictResult1 = rdd.map(feature => "cluster id: " + model.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 50 | println("modelOssDir=" + modelOssDir) 51 | model.save(sc, modelOssDir) 52 | 53 | //2. predict from the oss model 54 | val modelLoadOss = KMeansModel.load(sc, modelOssDir) 55 | val predictResult2 = rdd.map(feature => "cluster id: " + modelLoadOss.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 56 | assert(predictResult1.size == predictResult2.size) 57 | predictResult2.foreach(result2 => assert(predictResult1.contains(result2))) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/oss/SparkUnstructuredDataCompute.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.oss 20 | 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | object SparkUnstructuredDataCompute { 24 | def main(args: Array[String]) { 25 | val conf = new SparkConf().setAppName("SparkUnstructuredDataCompute") 26 | conf.set("spark.hadoop.fs.oss.credentials.provider", "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider") 27 | conf.set("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole") 28 | conf.set("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com") 29 | val sc = new SparkContext(conf) 30 | try { 31 | val pathIn = "oss://bucket/inputdata/" 32 | val inputData = sc.textFile(pathIn, 5) 33 | val cnt = inputData.count 34 | println(s"count: $cnt") 35 | } finally { 36 | sc.stop() 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/sparksql/SparkSQL.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.sparksql 20 | 21 | import org.apache.spark.sql.odps.OdpsContext 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | 24 | object SparkSQL { 25 | def main(args: Array[String]) { 26 | val conf = new SparkConf().setAppName("sparkSQL") 27 | val sc = new SparkContext(conf) 28 | val sqlContext = new OdpsContext(sc) 29 | import sqlContext._ 30 | 31 | val project = sc.getConf.get("odps.project.name") 32 | import sqlContext.implicits._ 33 | val tableName = "mc_test_table" 34 | val ptTableName = "mc_test_pt_table" 35 | // Drop Create 36 | sql(s"DROP TABLE IF EXISTS ${tableName}") 37 | sql(s"DROP TABLE IF EXISTS ${ptTableName}") 38 | 39 | sql(s"CREATE TABLE ${tableName} (name STRING, num BIGINT)") 40 | sql(s"CREATE TABLE ${ptTableName} (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)") 41 | 42 | val df = sc.parallelize(0 to 99, 2).map(f => { 43 | (s"name-$f", f) 44 | }).toDF("name", "num") 45 | 46 | val ptDf = sc.parallelize(0 to 99, 2).map(f => { 47 | (s"name-$f", f, "2018", "0601") 48 | }).toDF("name", "num", "pt1", "pt2") 49 | 50 | // 写 普通表 51 | df.write.insertInto(tableName) // insertInto语义 52 | df.write.mode("overwrite").insertInto(tableName) // insertOverwrite语义 53 | 54 | // 写 分区表 55 | // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 56 | df.registerTempTable(s"${ptTableName}_tmp_view") 57 | sql(s"insert into table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 58 | sql(s"insert overwrite table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 59 | 60 | ptDf.write.partitionBy("pt1", "pt2").insertInto(ptTableName) // 动态分区 insertInto语义 61 | ptDf.write.partitionBy("pt1", "pt2").mode("overwrite").insertInto(ptTableName) // 动态分区 insertOverwrite语义 62 | 63 | // 读 普通表 64 | val rdf = sql(s"select name, num from $tableName") 65 | println(s"rdf count, ${rdf.count()}") 66 | rdf.printSchema() 67 | 68 | // 读 分区表 69 | val rptdf = sql(s"select name, num, pt1, pt2 from $ptTableName where pt1 = '2018' and pt2 = '0601'") 70 | println(s"rptdf count, ${rptdf.count()}") 71 | rptdf.printSchema() 72 | } 73 | } -------------------------------------------------------------------------------- /spark-1.x/src/main/scala/com/aliyun/odps/spark/examples/udf/SparkUDF.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.udf 2 | 3 | import org.apache.spark.sql.odps.OdpsContext 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | object SparkUDF { 7 | def main(args: Array[String]) { 8 | val conf = new SparkConf().setAppName("sparkUDF") 9 | val sc = new SparkContext(conf) 10 | 11 | val sqlContext = new OdpsContext(sc) 12 | import sqlContext._ 13 | 14 | sql("DROP TABLE IF EXISTS spark_sql_test_partition_table") 15 | sql("CREATE TABLE spark_sql_test_partition_table(name STRING, num BIGINT) PARTITIONED BY (p1 STRING, p2 STRING)") 16 | 17 | sql("INSERT INTO TABLE spark_sql_test_partition_table PARTITION (p1='2020',p2='hangzhou') SELECT 'hz', 400") 18 | sql("INSERT INTO TABLE spark_sql_test_partition_table PARTITION (p1='2020',p2='shanghai') SELECT 'sh', 
500") 19 | sql("INSERT INTO TABLE spark_sql_test_partition_table PARTITION (p1='2020',p2='hangzhou') SELECT 'hz', 600") 20 | 21 | try { 22 | udf.register("myUpper", (input: String) => input.toUpperCase) 23 | val funcs = sql("SHOW FUNCTIONS myupper").collect() 24 | funcs foreach println 25 | assert(funcs.length == 1) 26 | val data = sql("SELECT myupper(name) FROM spark_sql_test_partition_table WHERE name = 'hz'").collect() 27 | assert(data(0).get(0) == "HZ") 28 | println("======= test register udf success ======") 29 | } catch { 30 | case e: Throwable => 31 | e.printStackTrace(System.out) 32 | throw e 33 | } 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /spark-2.x/libs/jindofs-sdk-3.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/spark-2.x/libs/jindofs-sdk-3.7.2.jar -------------------------------------------------------------------------------- /spark-2.x/src/main/java/com/aliyun/odps/spark/examples/utils/ConfigLog4j2.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.utils; 2 | 3 | import org.apache.logging.log4j.Level; 4 | import org.apache.logging.log4j.LogManager; 5 | import org.apache.logging.log4j.core.Appender; 6 | import org.apache.logging.log4j.core.LoggerContext; 7 | import org.apache.logging.log4j.core.appender.ConsoleAppender; 8 | import org.apache.logging.log4j.core.config.AppenderRef; 9 | import org.apache.logging.log4j.core.config.Configuration; 10 | import org.apache.logging.log4j.core.config.LoggerConfig; 11 | import org.apache.logging.log4j.core.layout.PatternLayout; 12 | 13 | public class ConfigLog4j2 { 14 | 15 | private static final LoggerContext CONTEXT; 16 | public static final String DEFAULT_APPENDER = "MY_STDOUT"; 17 | public static final String 18 | DEFAULT_PATTERN = 19 | "%d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %-5level %logger - %msg %ex %n"; 20 | 21 | static { 22 | CONTEXT = (LoggerContext) LogManager.getContext(false); 23 | } 24 | 25 | /** 26 | * @Description: add specific logger for specific package 27 | * @Param: packageName, such as com.xx.yy 28 | * @return: void 29 | * @Author: lcj265802@alibaba-inc.com 30 | * @Date: 2020/12/29 31 | */ 32 | public static void initPackageLogger(String packageName) { 33 | LoggerContext loggerContext = CONTEXT; 34 | Configuration config = loggerContext.getConfiguration(); 35 | 36 | ConsoleAppender.Builder builder = ConsoleAppender.newBuilder(); 37 | builder.setName(DEFAULT_APPENDER); 38 | builder.setLayout(PatternLayout.newBuilder().withPattern(DEFAULT_PATTERN).build()); 39 | Appender stdoutAppender = builder.setTarget(ConsoleAppender.Target.SYSTEM_OUT).build(); 40 | stdoutAppender.start(); 41 | 42 | config.addAppender(stdoutAppender); 43 | 44 | AppenderRef ref = AppenderRef.createAppenderRef(DEFAULT_APPENDER, null, null); 45 | AppenderRef[] refs = new AppenderRef[]{ref}; 46 | 47 | LoggerConfig 48 | loggerConfig = 49 | LoggerConfig.createLogger(false, Level.INFO, packageName, 50 | "true", refs, null, config, null); 51 | loggerConfig.addAppender(stdoutAppender, null, null); 52 | config.addLogger(packageName, loggerConfig); 53 | 54 | loggerContext.updateLoggers(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark-2.x/src/main/python/spark_oss.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | try: 6 | # for python 2 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | except: 10 | # python 3 not needed 11 | pass 12 | 13 | if __name__ == '__main__': 14 | spark = SparkSession.builder\ 15 | .config("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS")\ 16 | .config("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem")\ 17 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com")\ 18 | .config("spark.hadoop.fs.oss.accessKeyId", "xxx")\ 19 | .config("spark.hadoop.fs.oss.accessKeySecret", "xxx")\ 20 | .appName("spark write df to oss")\ 21 | 22 | .getOrCreate() 23 | 24 | data = [i for i in range(0, 100)] 25 | 26 | df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int") 27 | 28 | df.show(n=10) 29 | 30 | # write to oss 31 | pathout = 'oss://[bucket]/test.csv' 32 | df.write.csv(pathout) 33 | -------------------------------------------------------------------------------- /spark-2.x/src/main/python/spark_sql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | try: 6 | # for python 2 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | except: 10 | # python 3 not needed 11 | pass 12 | 13 | if __name__ == '__main__': 14 | spark = SparkSession.builder\ 15 | .appName("spark sql")\ 16 | .config("spark.sql.broadcastTimeout", 20 * 60)\ 17 | .config("spark.sql.crossJoin.enabled", True)\ 18 | .config("odps.exec.dynamic.partition.mode", "nonstrict")\ 19 | .getOrCreate() 20 | 21 | tableName = "mc_test_table" 22 | ptTableName = "mc_test_pt_table" 23 | data = [i for i in range(0, 100)] 24 | 25 | # Drop Create 26 | spark.sql("DROP TABLE IF EXISTS %s" % tableName) 27 | spark.sql("DROP TABLE IF EXISTS %s" % ptTableName) 28 | 29 | spark.sql("CREATE TABLE %s (name STRING, num BIGINT)" % tableName) 30 | spark.sql("CREATE TABLE %s (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)" % ptTableName) 31 | 32 | df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int") 33 | pt_df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s, "2018", "0601")).toDF("name: string, num: int, pt1: string, pt2: string") 34 | 35 | # 写 普通表 36 | df.write.insertInto(tableName) # insertInto语义 37 | df.write.insertInto(tableName, True) # insertOverwrite语义 38 | 39 | # 写 分区表 40 | # DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 41 | df.createOrReplaceTempView("%s_tmp_view" % ptTableName) 42 | spark.sql("insert into table %s partition (pt1='2018', pt2='0601') select * from %s_tmp_view" % (ptTableName, ptTableName)) 43 | spark.sql("insert overwrite table %s partition (pt1='2018', pt2='0601') select * from %s_tmp_view" % (ptTableName, ptTableName)) 44 | 45 | pt_df.write.insertInto(ptTableName) # 动态分区 insertInto语义 46 | pt_df.write.insertInto(ptTableName, True) # 动态分区 insertOverwrite语义 47 | 48 | # 读 普通表 49 | rdf = spark.sql("select name, num from %s" % tableName) 50 | print("rdf count, %s\n" % rdf.count()) 51 | rdf.printSchema() 52 | 53 | # 读 分区表 54 | rptdf = spark.sql("select name, num, pt1, pt2 from %s where pt1 = '2018' and pt2 = '0601'" % ptTableName) 55 | print("rptdf count, %s" % (rptdf.count())) 56 | rptdf.printSchema() 57 | 58 | 59 | 
-------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | import scala.math.random 24 | 25 | object SparkPi { 26 | def main(args: Array[String]) { 27 | val spark = SparkSession 28 | .builder() 29 | .appName("SparkPi") 30 | .getOrCreate() 31 | val sc = spark.sparkContext 32 | 33 | try { 34 | val slices = if (args.length > 0) args(0).toInt else 2 35 | val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow 36 | val count = sc.parallelize(1 until n, slices).map { i => 37 | val x = random * 2 - 1 38 | val y = random * 2 - 1 39 | if (x * x + y * y < 1) 1 else 0 40 | }.reduce(_ + _) 41 | println("Pi is roughly " + 4.0 * count / n) 42 | } finally { 43 | sc.stop() 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/WordCount.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | object WordCount { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .appName("WordCount") 28 | .getOrCreate() 29 | val sc = spark.sparkContext 30 | 31 | try { 32 | sc.parallelize(1 to 100, 10).map(word => (word, 1)).reduceByKey(_ + _, 10).take(100).foreach(println) 33 | } finally { 34 | sc.stop() 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/graphx/PageRank.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.graphx 20 | 21 | import org.apache.spark.graphx.{Edge, Graph, VertexId} 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | object PageRank { 26 | def main(args: Array[String]): Unit = { 27 | val spark = SparkSession 28 | .builder() 29 | .appName("PageRank") 30 | .getOrCreate() 31 | val sc = spark.sparkContext 32 | 33 | // build vertices 34 | val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( 35 | "1,BarackObama,Barack Obama", 36 | "2,ladygaga,Goddess of Love", 37 | "3,jeresig,John Resig", 38 | "4,justinbieber,Justin Bieber", 39 | "6,matei_zaharia,Matei Zaharia", 40 | "7,odersky,Martin Odersky", 41 | "8,anonsys" 42 | ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) 43 | 44 | // build edges 45 | val followers: RDD[Edge[Double]] = sc.parallelize(Array( 46 | Edge(2L, 1L, 1.0), 47 | Edge(4L, 1L, 1.0), 48 | Edge(1L, 2L, 1.0), 49 | Edge(6L, 3L, 1.0), 50 | Edge(7L, 3L, 1.0), 51 | Edge(7L, 6L, 1.0), 52 | Edge(6L, 7L, 1.0), 53 | Edge(3L, 7L, 1.0) 54 | )) 55 | 56 | // build graph 57 | val followerGraph: Graph[Array[String], Double] = Graph(users, followers) 58 | 59 | // restrict the graph to users with usernames and names 60 | val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) 61 | 62 | // compute PageRank 63 | val pageRankGraph = subgraph.pageRank(0.001) 64 | 65 | // get attributes of the top pagerank users 66 | val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { 67 | case (uid, attrList, Some(pr)) => (pr, attrList.toList) 68 | case (uid, attrList, None) => (0.0, attrList.toList) 69 | } 70 | 71 | println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/log4j2/Logger.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.log4j2 2 | 3 | import org.apache.logging.log4j 4 | import org.apache.logging.log4j.LogManager 5 | 6 | trait Logger { 7 | val log: log4j.Logger = LogManager.getLogger(this.getClass) 8 | log 9 | } 10 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/log4j2/SimpleWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.log4j2 2 | 3 | import com.aliyun.odps.spark.examples.utils.ConfigLog4j2 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object SimpleWordCount extends Logger { 8 | def main(args: Array[String]): Unit = { 9 | 10 | ConfigLog4j2.initPackageLogger("com.aliyun.odps.spark.examples.log4j2") 11 | val spark: SparkSession = SparkSession 12 | .builder() 13 | .appName("WordCount") 14 | .getOrCreate() 15 | 16 | log.info("My Test!") 17 | val wordList = List("Hello", "World", "Hello") 18 | val rdd: RDD[String] = spark.sparkContext.parallelize(Seq(wordList: _*)).cache() 19 | val resultRDD: 
RDD[(String, Int)] = rdd.map(w => (w, 1)).reduceByKey(_ + _) 20 | resultRDD.collect().foreach(v => { 21 | log.info(s"${v._1} has num ${v._2}") 22 | }) 23 | 24 | spark.stop() 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/mllib/KmeansModelSaveToOss.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.mllib 20 | 21 | import org.apache.spark.mllib.clustering.KMeans._ 22 | import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} 23 | import org.apache.spark.mllib.linalg.Vectors 24 | import org.apache.spark.sql.SparkSession 25 | 26 | object KmeansModelSaveToOss { 27 | val modelOssDir = "oss://[bucket]/kmeans-model" 28 | 29 | def main(args: Array[String]) { 30 | 31 | //1. train and save the model 32 | val spark = SparkSession 33 | .builder() 34 | .config("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 35 | .config("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 36 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 37 | .config("spark.hadoop.fs.oss.accessKeyId", "xxx") 38 | .config("spark.hadoop.fs.oss.accessKeySecret", "xxx") 39 | .appName("KmeansModelSaveToOss") 40 | .getOrCreate() 41 | 42 | val sc = spark.sparkContext 43 | val points = Seq( 44 | Vectors.dense(0.0, 0.0), 45 | Vectors.dense(0.0, 0.1), 46 | Vectors.dense(0.1, 0.0), 47 | Vectors.dense(9.0, 0.0), 48 | Vectors.dense(9.0, 0.2), 49 | Vectors.dense(9.2, 0.0) 50 | ) 51 | val rdd = sc.parallelize(points, 3) 52 | val initMode = K_MEANS_PARALLEL 53 | val model = KMeans.train(rdd, k = 2, maxIterations = 2, runs = 1, initMode) 54 | val predictResult1 = rdd.map(feature => "cluster id: " + model.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 55 | println("modelOssDir=" + modelOssDir) 56 | model.save(sc, modelOssDir) 57 | 58 | //2. predict from the oss model 59 | val modelLoadOss = KMeansModel.load(sc, modelOssDir) 60 | val predictResult2 = rdd.map(feature => "cluster id: " + modelLoadOss.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 61 | assert(predictResult1.size == predictResult2.size) 62 | predictResult2.foreach(result2 => assert(predictResult1.contains(result2))) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/oss/JindoFsDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.oss 20 | 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | object JindoFsDemo { 24 | def main(args: Array[String]): Unit = { 25 | val bucket : String = args(0) 26 | val ossPath : String = args(1) 27 | 28 | //using access-key-id/access-key-secret 29 | val conf = new SparkConf() 30 | .setAppName("jindo-fs-demo") 31 | .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 32 | .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 33 | .set("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 34 | .set("spark.hadoop.fs.oss.accessKeyId", "xxx") 35 | .set("spark.hadoop.fs.oss.accessKeySecret", "xxx") 36 | 37 | val sc = new SparkContext(conf) 38 | 39 | try { 40 | read_oss_dir(sc, "demo", s"oss://${bucket}/${ossPath}") 41 | } finally { 42 | sc.stop() 43 | } 44 | } 45 | 46 | /** 47 | * compute cost time using jindo sdk 48 | */ 49 | def read_oss_dir(sc: SparkContext, job_des:String, ossPath: String): Unit = { 50 | val startTime: Long = System.currentTimeMillis() 51 | val inputData = sc.textFile(ossPath, 20) 52 | val cnt = inputData.count 53 | val endTime:Long = System.currentTimeMillis() 54 | val cost:Long = endTime - startTime 55 | println(s"job:$job_des, count:$cnt, consume:$cost") 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/oss/SparkUnstructuredDataCompute.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.oss 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | object SparkUnstructuredDataCompute { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .config("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 28 | .config("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 29 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 30 | .config("spark.hadoop.fs.oss.accessKeyId", "xxx") 31 | .config("spark.hadoop.fs.oss.accessKeySecret", "xxx") 32 | .appName("SparkUnstructuredDataCompute") 33 | .getOrCreate() 34 | 35 | val sc = spark.sparkContext 36 | try { 37 | val pathIn = "oss://bucket/inputdata/" 38 | val inputData = sc.textFile(pathIn, 5) 39 | val cnt = inputData.count 40 | println(s"count: $cnt") 41 | } finally { 42 | sc.stop() 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/sparksql/SparkSQL.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.sparksql 20 | 21 | import org.apache.spark.sql.{SaveMode, SparkSession} 22 | 23 | object SparkSQL { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .appName("SparkSQL-on-MaxCompute") 28 | .config("spark.sql.broadcastTimeout", 20 * 60) 29 | .config("spark.sql.crossJoin.enabled", true) 30 | .config("odps.exec.dynamic.partition.mode", "nonstrict") 31 | .getOrCreate() 32 | 33 | // val project = spark.conf.get("odps.project.name") 34 | 35 | import spark._ 36 | import sqlContext.implicits._ 37 | val tableName = "mc_test_table" 38 | val ptTableName = "mc_test_pt_table" 39 | // Drop Create 40 | sql(s"DROP TABLE IF EXISTS ${tableName}") 41 | sql(s"DROP TABLE IF EXISTS ${ptTableName}") 42 | 43 | sql(s"CREATE TABLE ${tableName} (name STRING, num BIGINT)") 44 | sql(s"CREATE TABLE ${ptTableName} (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)") 45 | 46 | val df = spark.sparkContext.parallelize(0 to 99, 2).map(f => { 47 | (s"name-$f", f) 48 | }).toDF("name", "num") 49 | 50 | val ptDf = spark.sparkContext.parallelize(0 to 99, 2).map(f => { 51 | (s"name-$f", f, "2018", "0601") 52 | }).toDF("name", "num", "pt1", "pt2") 53 | 54 | // 写 普通表 55 | df.write.insertInto(tableName) // insertInto语义 56 | df.write.mode("overwrite").insertInto(tableName) // insertOverwrite语义 57 | 58 | // 写 分区表 59 | // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 60 | df.createOrReplaceTempView(s"${ptTableName}_tmp_view") 61 | sql(s"insert into table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 62 | sql(s"insert overwrite table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 63 | 64 | ptDf.write.insertInto(ptTableName) // 动态分区 insertInto语义 65 | ptDf.write.mode("overwrite").insertInto(ptTableName) // 动态分区 insertOverwrite语义 66 | 67 | // 读 普通表 68 | val rdf = sql(s"select name, num from $tableName") 69 | println(s"rdf count, ${rdf.count()}") 70 | rdf.printSchema() 71 | 72 | // 读 分区表 73 | val rptdf = sql(s"select name, num, pt1, pt2 from $ptTableName where pt1 = '2018' and pt2 = '0601'") 74 | println(s"rptdf count, ${rptdf.count()}") 75 | rptdf.printSchema() 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/common/SparkSessionSingleton.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.streaming.common 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object SparkSessionSingleton { 7 | @transient private var instance: SparkSession = _ 8 | 9 | def getInstance(sparkConf: SparkConf): SparkSession = { 10 | if (instance == null) { 11 | instance = SparkSession 12 | .builder 13 | .config(sparkConf) 14 | .getOrCreate() 15 | } 16 | instance 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/datahub/DataHub2OdpsDemo.scala: 
-------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.streaming.datahub 2 | 3 | import com.aliyun.datahub.model.RecordEntry 4 | import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton 5 | 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.storage.StorageLevel 8 | import org.apache.spark.streaming.{Seconds, StreamingContext} 9 | import org.apache.spark.streaming.aliyun.datahub.DatahubUtils 10 | 11 | object DataHub2OdpsDemo { 12 | 13 | def transferFunc(record: RecordEntry): String = { 14 | // 这个转化函数目前只支持把DataHub Record转成String 15 | // 如果是需要多个字段的话, 那么需要处理一下拼接的逻辑 16 | record.getString(1) 17 | } 18 | 19 | def main(args: Array[String]): Unit = { 20 | val spark = SparkSession 21 | .builder() 22 | .appName("DataHubStreamingDemo") 23 | .config("spark.hadoop.fs.oss.credentials.provider", "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider") 24 | .config("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole") 25 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com") 26 | .getOrCreate() 27 | 28 | // 设置Batch间隔时间 29 | val ssc = new StreamingContext(spark.sparkContext, Seconds(10)) 30 | 31 | // checkpoint dir to oss 32 | ssc.checkpoint("oss://bucket/inputdata/") 33 | 34 | val dataStream = DatahubUtils.createStream( 35 | ssc, 36 | "projectName", 37 | "topic", 38 | "subId", 39 | "accessId", 40 | "accessKey", 41 | "endPoint", 42 | transferFunc(_), 43 | StorageLevel.MEMORY_AND_DISK 44 | ) 45 | 46 | dataStream.map(x => new String(x)).foreachRDD(rdd => { 47 | val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf) 48 | import spark.implicits._ 49 | 50 | rdd.toDF("id").write.mode("append").saveAsTable("test_table") 51 | }) 52 | 53 | ssc.start() 54 | ssc.awaitTermination() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/datahub/DataHubStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.streaming.datahub 2 | 3 | import com.aliyun.datahub.model.RecordEntry 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.storage.StorageLevel 6 | import org.apache.spark.streaming.aliyun.datahub.DatahubUtils 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | 9 | object DataHubStreamingDemo { 10 | 11 | def transferFunc(record: RecordEntry): String = { 12 | // 这个转化函数目前只支持把DataHub Record转成String 13 | // 如果是需要多个字段的话, 那么需要处理一下拼接的逻辑 14 | record.getString(1) 15 | } 16 | 17 | def main(args: Array[String]): Unit = { 18 | val spark = SparkSession 19 | .builder() 20 | .appName("DataHubStreamingDemo") 21 | .config("spark.hadoop.fs.oss.credentials.provider", "org.apache.hadoop.fs.aliyun.oss.AliyunStsTokenCredentialsProvider") 22 | .config("spark.hadoop.fs.oss.ststoken.roleArn", "acs:ram::****:role/aliyunodpsdefaultrole") 23 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-zmf.aliyuncs.com") 24 | .getOrCreate() 25 | 26 | // 设置Batch间隔时间 27 | val ssc = new StreamingContext(spark.sparkContext, Seconds(5)) 28 | 29 | // checkpoint dir to oss 30 | ssc.checkpoint("oss://bucket/inputdata/") 31 | 32 | val dataStream = DatahubUtils.createStream( 33 | ssc, 34 | "projectName", 35 | "topic", 36 | "subId", 37 | "accessId", 38 | "accessKey", 39 | "endPoint", 40 | transferFunc(_), 41 | 
StorageLevel.MEMORY_AND_DISK 42 | ) 43 | 44 | dataStream.count().print() 45 | 46 | ssc.start() 47 | ssc.awaitTermination() 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/kafka/Kafka2OdpsDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.streaming.kafka 20 | 21 | import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton 22 | import org.apache.kafka.clients.consumer.ConsumerRecord 23 | import org.apache.kafka.common.serialization.StringDeserializer 24 | 25 | import org.apache.spark.SparkConf 26 | import org.apache.spark.streaming.{Seconds, StreamingContext} 27 | import org.apache.spark.streaming.dstream.{DStream, InputDStream} 28 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 29 | 30 | object Kafka2OdpsDemo { 31 | def main(args: Array[String]): Unit = { 32 | val sparkConf = new SparkConf().setAppName("test") 33 | val ssc = new StreamingContext(sparkConf, Seconds(10)) 34 | 35 | // 请使用OSS作为Checkpoint存储,修改为有效OSS路径。OSS访问文档请参考 https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E 36 | ssc.checkpoint("oss://bucket/checkpointdir") 37 | 38 | // kafka配置参数 39 | val kafkaParams = Map[String, Object]( 40 | "bootstrap.servers" -> "localhost:9092", 41 | "key.deserializer" -> classOf[StringDeserializer], 42 | "value.deserializer" -> classOf[StringDeserializer], 43 | "group.id" -> "testGroupId", 44 | "auto.offset.reset" -> "latest", 45 | "enable.auto.commit" -> (false: java.lang.Boolean) 46 | ) 47 | 48 | // 创建kafka dstream 49 | val topics = Set("test") 50 | val recordDstream: InputDStream[ConsumerRecord[String, String]] = 51 | KafkaUtils.createDirectStream[String, String]( 52 | ssc, 53 | LocationStrategies.PreferConsistent, 54 | ConsumerStrategies.Subscribe[String, String](topics, kafkaParams) 55 | ) 56 | val dstream = recordDstream.map(f => (f.key(), f.value())) 57 | // 解析kafka数据并写入odps 58 | val data: DStream[String] = dstream.map(_._2) 59 | val wordsDStream: DStream[String] = data.flatMap(_.split(" ")) 60 | wordsDStream.foreachRDD(rdd => { 61 | val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf) 62 | import spark.implicits._ 63 | 64 | rdd.toDF("id").write.mode("append").saveAsTable("test_table") 65 | }) 66 | 67 | ssc.start() 68 | ssc.awaitTermination() 69 | } 70 | } 71 | 72 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/kafka/KafkaStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.streaming.kafka 20 | 21 | import org.apache.kafka.clients.consumer.ConsumerRecord 22 | import org.apache.kafka.common.serialization.StringDeserializer 23 | import org.apache.spark.sql.SparkSession 24 | import org.apache.spark.streaming.dstream.{DStream, InputDStream} 25 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 26 | import org.apache.spark.streaming.{Seconds, StreamingContext} 27 | 28 | object KafkaStreamingDemo { 29 | def main(args: Array[String]): Unit = { 30 | val spark = SparkSession 31 | .builder() 32 | .appName("KafkaStreamingDemo") 33 | .getOrCreate() 34 | 35 | val ssc = new StreamingContext(spark.sparkContext, Seconds(5)) 36 | 37 | // 请使用OSS作为Checkpoint存储 38 | ssc.checkpoint("oss://bucket/checkpointDir/") 39 | 40 | // kafka配置参数 41 | val kafkaParams = Map[String, Object]( 42 | "bootstrap.servers" -> "192.168.1.1:9200,192.168.1.2:9200,192.168.1.3:9200", 43 | "key.deserializer" -> classOf[StringDeserializer], 44 | "value.deserializer" -> classOf[StringDeserializer], 45 | "group.id" -> "testGroupId", 46 | "auto.offset.reset" -> "latest", 47 | "enable.auto.commit" -> (false: java.lang.Boolean) 48 | ) 49 | 50 | val topics = Set("event_topic") 51 | val recordDstream: InputDStream[ConsumerRecord[String, String]] = 52 | KafkaUtils.createDirectStream[String, String]( 53 | ssc, 54 | LocationStrategies.PreferConsistent, 55 | ConsumerStrategies.Subscribe[String, String](topics, kafkaParams) 56 | ) 57 | 58 | 59 | val dstream = recordDstream.map(f => (f.key(), f.value())) 60 | val data: DStream[String] = dstream.map(_._2) 61 | val wordsDStream: DStream[String] = data.flatMap(_.split(" ")) 62 | val wordAndOneDstream: DStream[(String, Int)] = wordsDStream.map((_, 1)) 63 | val result: DStream[(String, Int)] = wordAndOneDstream.reduceByKey(_ + _) 64 | result.print() 65 | 66 | ssc.start() 67 | ssc.awaitTermination() 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/loghub/LogHub2OdpsDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.streaming.loghub 20 | 21 | import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton 22 | import com.aliyun.openservices.loghub.client.config.LogHubCursorPosition 23 | 24 | import org.apache.spark.{SparkConf, SparkContext} 25 | import org.apache.spark.storage.StorageLevel 26 | import org.apache.spark.streaming.{Durations, StreamingContext} 27 | import org.apache.spark.streaming.loghub.{LoghubUtils, StreamingParam} 28 | 29 | object LogHub2OdpsDemo { 30 | 31 | def buildParam(conf: SparkConf): StreamingParam = { 32 | val sp = new StreamingParam() 33 | sp.setId(conf.get("spark.logservice.accessKeyId")) 34 | sp.setSecret(conf.get("spark.logservice.accessKeySecret")) 35 | sp.setEndpoint(conf.get("spark.logservice.endpoint")) 36 | sp.setProject(conf.get("spark.logservice.project")) 37 | sp.setLogstore(conf.get("spark.logservice.logstore")) 38 | sp.setCursor(LogHubCursorPosition.END_CURSOR) 39 | sp.setGroup("test") 40 | sp.setLevel(StorageLevel.MEMORY_AND_DISK) 41 | 42 | sp 43 | } 44 | 45 | def main(args: Array[String]) { 46 | val conf = new SparkConf(true).setAppName("LogHubStreamingDemo") 47 | val sc = new SparkContext(conf) 48 | 49 | val ssc = new StreamingContext(sc, Durations.seconds(5)) 50 | 51 | val lines = LoghubUtils.createStream(ssc, buildParam(conf), 1).map(line => { 52 | val str = new String(line) 53 | str 54 | }) 55 | 56 | val words = lines.flatMap(_.split(" ")) 57 | words.foreachRDD(rdd => { 58 | val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf) 59 | import spark.implicits._ 60 | 61 | rdd.toDF("id").write.mode("append").saveAsTable("test_table") 62 | }) 63 | 64 | ssc.start() // Start the computation 65 | ssc.awaitTermination() // Wait for the computation to terminate 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/streaming/loghub/LogHubStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.streaming.loghub 20 | 21 | import com.aliyun.openservices.loghub.client.config.LogHubCursorPosition 22 | import org.apache.spark.storage.StorageLevel 23 | import org.apache.spark.streaming.loghub.{LoghubUtils, StreamingParam} 24 | import org.apache.spark.streaming.{Durations, StreamingContext} 25 | import org.apache.spark.{SparkConf, SparkContext} 26 | 27 | object LogHubStreamingDemo { 28 | 29 | def buildParam(conf: SparkConf): StreamingParam = { 30 | val sp = new StreamingParam() 31 | sp.setId(conf.get("spark.logservice.accessKeyId")) 32 | sp.setSecret(conf.get("spark.logservice.accessKeySecret")) 33 | sp.setEndpoint(conf.get("spark.logservice.endpoint")) 34 | sp.setProject(conf.get("spark.logservice.project")) 35 | sp.setLogstore(conf.get("spark.logservice.logstore")) 36 | sp.setCursor(LogHubCursorPosition.END_CURSOR) 37 | sp.setGroup("test") 38 | sp.setLevel(StorageLevel.MEMORY_AND_DISK) 39 | 40 | sp 41 | } 42 | 43 | def main(args: Array[String]) { 44 | val conf = new SparkConf(true).setAppName("LogHubStreamingDemo") 45 | val sc = new SparkContext(conf) 46 | 47 | val ssc = new StreamingContext(sc, Durations.seconds(5)) 48 | 49 | val lines = LoghubUtils.createStream(ssc, buildParam(conf), 1).map(line => { 50 | val str = new String(line) 51 | str 52 | }) 53 | 54 | val words = lines.flatMap(_.split(" ")) 55 | val pairs = words.map(word => (word, 1)) 56 | val wordCounts = pairs.reduceByKey(_ + _) 57 | 58 | // Print the first ten elements of each RDD generated in this DStream to the console 59 | wordCounts.print() 60 | 61 | ssc.start() // Start the computation 62 | ssc.awaitTermination() // Wait for the computation to terminate 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/datahub/DatahubStructuredStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.structuredStreaming.datahub 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object DatahubStructuredStreamingDemo { 6 | def main(args: Array[String]): Unit = { 7 | val spark = SparkSession 8 | .builder() 9 | .appName("DatahubStructuredStreamingDemo") 10 | .getOrCreate() 11 | 12 | import spark.implicits._ 13 | 14 | val df = spark 15 | .readStream 16 | .format("datahub") 17 | .option("datahub.endpoint", "http://dh-cn-beijing.aliyun-inc.com") 18 | .option("datahub.project", "zkytest") 19 | .option("datahub.topic", "zkytest") 20 | .option("datahub.AccessId", "******") 21 | .option("datahub.AccessKey", "******") 22 | .option("StartingOffsets", "earliest") 23 | .load() 24 | 25 | /** * 26 | * WordCount Demo 27 | */ 28 | // 请使用OSS作为Checkpoint存储 29 | val checkpointLocation = "oss://bucket/checkpoint/" 30 | val lines = df.select($"id").as[String] 31 | val wordCounts = lines.flatMap(_.split(" ")).toDF("word").groupBy("word").count() 32 | 33 | val query = wordCounts.writeStream 34 | .outputMode("complete") 35 | .format("console") 36 | .option("checkpointLocation", checkpointLocation) 37 | .start() 38 | 39 | query.awaitTermination() 40 | } 
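  // A sketch of an alternative sink (assumes Spark 2.4+): foreachBatch could land each micro-batch
  // of the aggregation in a MaxCompute table instead of printing to the console. The target table
  // name "datahub_wordcount" is hypothetical.
  //
  // val tableQuery = wordCounts.writeStream
  //   .outputMode("complete")
  //   .option("checkpointLocation", checkpointLocation)
  //   .foreachBatch { (batchDF: org.apache.spark.sql.DataFrame, batchId: Long) =>
  //     batchDF.write.mode("overwrite").saveAsTable("datahub_wordcount")
  //   }
  //   .start()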
41 | } 42 | 43 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/kafka/KafkaStructuredStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.structuredStreaming.kafka 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions.window 7 | 8 | object KafkaStructuredStreamingDemo{ 9 | def main(args: Array[String]): Unit = { 10 | val spark = SparkSession 11 | .builder() 12 | .appName("KafkaStreamingDemo") 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | 17 | val df = spark 18 | .readStream 19 | .format("kafka") 20 | .option("kafka.bootstrap.servers", "localhost:9092") 21 | .option("subscribe", "topic") 22 | .load() 23 | 24 | /** * 25 | * WordCount Demo 26 | */ 27 | // 请使用OSS作为Checkpoint存储 28 | val checkpointLocation = "oss://bucket/checkpoint/" 29 | val lines = df.selectExpr("cast(value as string)").as[String] 30 | val wordCounts = lines.flatMap(_.split(" ")).toDF("word").groupBy("word").count() 31 | 32 | val query = wordCounts.writeStream 33 | .outputMode("complete") 34 | .format("console") 35 | .option("checkpointLocation", checkpointLocation) 36 | .option("path", "query1") 37 | .start() 38 | 39 | query.awaitTermination() 40 | 41 | /** * 42 | * Windowed WordCount Demo 43 | */ 44 | val wordsWithTimestamp = df.selectExpr("cast(value as string)").as[String] 45 | .flatMap(x => { 46 | val Array(ts, data) = x.split(",") 47 | data.split(" ").map((new Timestamp(ts.toLong), _)) 48 | }).as[(Timestamp, String)].toDF("timestamp", "word") 49 | 50 | // 请使用OSS作为Checkpoint存储 51 | val checkpointLocation2 = "oss://bucket/checkpoint2/" 52 | val windowedCounts = wordsWithTimestamp 53 | .groupBy( 54 | window($"timestamp", "10 seconds", "5 seconds"), 55 | $"word" 56 | ).count() 57 | 58 | val query2 = windowedCounts.writeStream 59 | .outputMode("complete") 60 | .format("console") 61 | .option("checkpointLocation", checkpointLocation2) 62 | .start() 63 | 64 | query2.awaitTermination() 65 | 66 | /** * 67 | * Windowed WordCount with Watermark Demo 68 | */ 69 | // 请使用OSS作为Checkpoint存储 70 | val checkpointLocation3 = "oss://bucket/checkpoint3/" 71 | 72 | val windowedCountsWithWatermark = wordsWithTimestamp 73 | .withWatermark("timestamp", "5 seconds") 74 | .groupBy( 75 | window($"timestamp", "6 seconds", "3 seconds"), 76 | $"word" 77 | ).count() 78 | 79 | val query3 = windowedCountsWithWatermark.writeStream 80 | .outputMode("append") 81 | .format("console") 82 | .option("checkpointLocation", checkpointLocation3) 83 | .start() 84 | 85 | query3.awaitTermination() 86 | } 87 | } 88 | 89 | -------------------------------------------------------------------------------- /spark-2.x/src/main/scala/com/aliyun/odps/spark/examples/structuredStreaming/loghub/LoghubStructuredStreamingDemo.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark.examples.structuredStreaming.loghub 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object LoghubStructuredStreamingDemo { 6 | def main(args: Array[String]): Unit = { 7 | val spark = SparkSession 8 | .builder() 9 | .appName("LoghubStructuredStreamingDemo") 10 | .getOrCreate() 11 | 12 | import spark.implicits._ 13 | 14 | val df = spark 15 | .readStream 16 | .format("loghub") 17 | .option("Loghub.Endpoint", "cn-beijing-intranet.log.aliyuncs.com") 18 | 
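      // The Loghub endpoint above and the project/logstore/AccessKey options below are the author's
      // example values; replace them with your own Log Service configuration before running.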
.option("Loghub.Project", "zkytest") 19 | .option("Loghub.AccessId", "******") 20 | .option("Loghub.AccessKey", "******") 21 | .option("Loghub.Logstores", "zkytest") 22 | .option("StartingOffsets", "latest") 23 | .load() 24 | 25 | /** * 26 | * WordCount Demo 27 | */ 28 | // 请使用OSS作为Checkpoint存储 29 | val checkpointLocation = "oss://bucket/checkpoint" 30 | val lines = df.select($"contents").as[String] 31 | val wordCounts = lines.flatMap(_.split(" ")).toDF("word").groupBy("word").count() 32 | 33 | val query = wordCounts.writeStream 34 | .outputMode("complete") 35 | .format("console") 36 | .option("checkpointLocation", checkpointLocation) 37 | .start() 38 | 39 | query.awaitTermination() 40 | } 41 | } 42 | 43 | -------------------------------------------------------------------------------- /spark-3.x/libs/jindofs-sdk-3.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/spark-3.x/libs/jindofs-sdk-3.7.2.jar -------------------------------------------------------------------------------- /spark-3.x/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 18 | 4.0.0 19 | 20 | 21 | 3.1.1 22 | 3.3.8-public 23 | 2.12.10 24 | 2.12 25 | 26 | 27 | com.aliyun.odps 28 | spark-examples_${scala.binary.version} 29 | 1.0.0-SNAPSHOT 30 | jar 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-core_${scala.binary.version} 36 | ${spark.version} 37 | provided 38 | 39 | 40 | org.apache.spark 41 | spark-sql_${scala.binary.version} 42 | ${spark.version} 43 | provided 44 | 45 | 46 | org.apache.spark 47 | spark-mllib_${scala.binary.version} 48 | ${spark.version} 49 | provided 50 | 51 | 52 | org.apache.spark 53 | spark-streaming_${scala.binary.version} 54 | ${spark.version} 55 | provided 56 | 57 | 58 | 59 | com.aliyun.odps 60 | cupid-sdk 61 | ${cupid.sdk.version} 62 | provided 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | org.apache.maven.plugins 71 | maven-shade-plugin 72 | 2.4.3 73 | 74 | 75 | package 76 | 77 | shade 78 | 79 | 80 | false 81 | true 82 | 83 | 84 | 86 | *:* 87 | 88 | 89 | 90 | 91 | *:* 92 | 93 | META-INF/*.SF 94 | META-INF/*.DSA 95 | META-INF/*.RSA 96 | **/log4j.properties 97 | 98 | 99 | 100 | 101 | 103 | reference.conf 104 | 105 | 107 | 108 | META-INF/services/org.apache.spark.sql.sources.DataSourceRegister 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | net.alchim31.maven 118 | scala-maven-plugin 119 | 3.3.2 120 | 121 | 122 | scala-compile-first 123 | process-resources 124 | 125 | compile 126 | 127 | 128 | 129 | scala-test-compile-first 130 | process-test-resources 131 | 132 | testCompile 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /spark-3.x/src/main/java/com/aliyun/odps/spark/examples/sparksql/JavaSparkSQL.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.sparksql; 20 | 21 | import com.aliyun.odps.Odps; 22 | import com.aliyun.odps.cupid.CupidSession; 23 | import org.apache.spark.sql.SparkSession; 24 | import org.apache.spark.api.java.JavaRDD; 25 | import org.apache.spark.api.java.JavaSparkContext; 26 | import org.apache.spark.api.java.function.Function; 27 | import org.apache.spark.sql.Row; 28 | import org.apache.spark.sql.Dataset; 29 | import org.apache.spark.sql.RowFactory; 30 | import org.apache.spark.sql.types.*; 31 | 32 | import java.util.ArrayList; 33 | import java.util.List; 34 | 35 | import org.apache.spark.sql.types.StructField; 36 | 37 | public class JavaSparkSQL { 38 | 39 | public static void main(String[] args) throws Exception { 40 | SparkSession spark = SparkSession 41 | .builder() 42 | .appName("SparkSQL-on-MaxCompute") 43 | .config("spark.sql.defaultCatalog","odps") 44 | .config("spark.sql.catalog.odps", "org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog") 45 | .config("spark.sql.sources.partitionOverwriteMode", "dynamic") 46 | .config("spark.sql.extensions", "org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions") 47 | .config("spark.sql.catalogImplementation","hive") 48 | .getOrCreate(); 49 | JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext()); 50 | 51 | 52 | String tableName = "mc_test_table"; 53 | String tableNameCopy = "mc_test_table_copy"; 54 | String ptTableName = "mc_test_pt_table"; 55 | 56 | 57 | spark.sql("DROP TABLE IF EXISTS " + tableName); 58 | spark.sql("DROP TABLE IF EXISTS " + tableNameCopy); 59 | spark.sql("DROP TABLE IF EXISTS " + ptTableName); 60 | 61 | spark.sql("CREATE TABLE " + tableName + " (name STRING, num BIGINT)"); 62 | spark.sql("CREATE TABLE " + ptTableName + " (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)"); 63 | 64 | List data = new ArrayList(); 65 | for (int i = 0; i < 100; i++) { 66 | data.add(i); 67 | } 68 | 69 | JavaRDD dfRDD = sparkContext.parallelize(data, 2).map(new Function() { 70 | public Row call(Integer i) { 71 | return RowFactory.create( 72 | "name-" + i.toString(), 73 | Long.valueOf(i)); 74 | } 75 | }); 76 | 77 | JavaRDD ptDfRDD = sparkContext.parallelize(data, 2).map(new Function() { 78 | public Row call(Integer i) { 79 | return RowFactory.create( 80 | "name-" + i.toString(), 81 | Long.valueOf(i), 82 | "2018", 83 | "0601"); 84 | } 85 | }); 86 | 87 | List structFilelds = new ArrayList(); 88 | structFilelds.add(DataTypes.createStructField("name", DataTypes.StringType, true)); 89 | structFilelds.add(DataTypes.createStructField("num", DataTypes.LongType, true)); 90 | Dataset df = spark.createDataFrame(dfRDD, DataTypes.createStructType(structFilelds)); 91 | 92 | structFilelds.add(DataTypes.createStructField("pt1", DataTypes.StringType, true)); 93 | structFilelds.add(DataTypes.createStructField("pt2", DataTypes.StringType, true)); 94 | Dataset ptDf = spark.createDataFrame(ptDfRDD, DataTypes.createStructType(structFilelds)); 95 | 96 | // 写 普通表 97 | df.write().insertInto(tableName); // insertInto语义 98 | df.writeTo(tableName).overwritePartitions(); // insertOverwrite use datasourcev2 99 | 100 | // 读 普通表 101 | Dataset 
rdf = spark.sql("select name, num from " + tableName); 102 | System.out.println("rdf count: " + rdf.count()); 103 | rdf.printSchema(); 104 | 105 | //create table as select 106 | spark.sql("CREATE TABLE " + tableNameCopy + " AS SELECT name, num FROM " + tableName); 107 | spark.sql("SELECT * FROM " + tableNameCopy).show(); 108 | 109 | // 写 分区表 110 | // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 111 | df.registerTempTable(ptTableName + "_tmp_view"); 112 | spark.sql("insert into table " + ptTableName + " partition (pt1='2018', pt2='0601') select * from " + ptTableName + "_tmp_view"); 113 | spark.sql("insert overwrite table " + ptTableName + " partition (pt1='2018', pt2='0601') select * from " + ptTableName + "_tmp_view"); 114 | 115 | ptDf.write().insertInto(ptTableName);// 动态分区 insertInto语义 116 | ptDf.write().mode("overwrite").insertInto(ptTableName); // 动态分区 insertOverwrite语义 117 | 118 | // 读 分区表 119 | Dataset rptdf = spark.sql("select name, num, pt1, pt2 from " + ptTableName + " where pt1 = '2018' and pt2 = '0601'"); 120 | System.out.println("rptdf count: " + rptdf.count()); 121 | rptdf.printSchema(); 122 | 123 | // example for use odps 124 | Odps odps = CupidSession.get().odps(); 125 | System.out.println(odps.tables().get(ptTableName).getPartitions().size()); 126 | System.out.println(odps.tables().get(ptTableName).getPartitions().get(0).getPartitionSpec()); 127 | } 128 | } -------------------------------------------------------------------------------- /spark-3.x/src/main/python/spark_oss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | try: 6 | # for python 2 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | except: 10 | # python 3 not needed 11 | pass 12 | 13 | if __name__ == '__main__': 14 | spark = SparkSession.builder\ 15 | .appName("spark write df to oss")\ 16 | .getOrCreate() 17 | 18 | data = [i for i in range(0, 100)] 19 | 20 | df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int") 21 | 22 | df.show(n=10) 23 | 24 | # write to oss 25 | pathout = 'oss://yeshan01/test.csv' 26 | df.write.csv(pathout) 27 | -------------------------------------------------------------------------------- /spark-3.x/src/main/python/spark_sql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | try: 6 | # for python 2 7 | reload(sys) 8 | sys.setdefaultencoding('utf8') 9 | except: 10 | # python 3 not needed 11 | pass 12 | 13 | if __name__ == '__main__': 14 | spark = SparkSession.builder\ 15 | .appName("spark sql")\ 16 | .config("spark.sql.broadcastTimeout", 20 * 60)\ 17 | .config("spark.sql.crossJoin.enabled", True)\ 18 | .getOrCreate() 19 | 20 | tableName = "mc_test_table" 21 | ptTableName = "mc_test_pt_table" 22 | data = [i for i in range(0, 100)] 23 | 24 | # Drop Create 25 | spark.sql("DROP TABLE IF EXISTS %s" % tableName) 26 | spark.sql("DROP TABLE IF EXISTS %s" % ptTableName) 27 | 28 | spark.sql("CREATE TABLE %s (name STRING, num BIGINT)" % tableName) 29 | spark.sql("CREATE TABLE %s (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)" % ptTableName) 30 | 31 | df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s)).toDF("name: string, num: int") 32 | pt_df = spark.sparkContext.parallelize(data, 2).map(lambda s: ("name-%s" % s, s, "2018", "0601")).toDF("name: 
string, num: int, pt1: string, pt2: string") 33 | 34 | # 写 普通表 35 | df.write.insertInto(tableName) # insertInto语义 36 | df.writeTo(tableName).overwritePartitions() # insertOverwrite use datasourcev2 37 | 38 | # 写 分区表 39 | # DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 40 | df.createOrReplaceTempView("%s_tmp_view" % ptTableName) 41 | spark.sql("insert into table %s partition (pt1='2018', pt2='0601') select * from %s_tmp_view" % (ptTableName, ptTableName)) 42 | spark.sql("insert overwrite table %s partition (pt1='2018', pt2='0601') select * from %s_tmp_view" % (ptTableName, ptTableName)) 43 | 44 | pt_df.write.insertInto(ptTableName) # 动态分区 insertInto语义 45 | pt_df.write.insertInto(ptTableName, True) # 动态分区 insertOverwrite语义 46 | 47 | # 读 普通表 48 | rdf = spark.sql("select name, num from %s" % tableName) 49 | print("rdf count, %s\n" % rdf.count()) 50 | rdf.printSchema() 51 | 52 | # 读 分区表 53 | rptdf = spark.sql("select name, num, pt1, pt2 from %s where pt1 = '2018' and pt2 = '0601'" % ptTableName) 54 | print("rptdf count, %s" % (rptdf.count())) 55 | rptdf.printSchema() -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | import scala.math.random 24 | 25 | object SparkPi { 26 | def main(args: Array[String]) { 27 | val spark = SparkSession 28 | .builder() 29 | .appName("SparkPi") 30 | .getOrCreate() 31 | val sc = spark.sparkContext 32 | 33 | try { 34 | val slices = if (args.length > 0) args(0).toInt else 2 35 | val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow 36 | val count = sc.parallelize(1 until n, slices).map { i => 37 | val x = random * 2 - 1 38 | val y = random * 2 - 1 39 | if (x * x + y * y < 1) 1 else 0 40 | }.reduce(_ + _) 41 | println("Pi is roughly " + 4.0 * count / n) 42 | } finally { 43 | sc.stop() 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/WordCount.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | object WordCount { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .appName("WordCount") 28 | .getOrCreate() 29 | val sc = spark.sparkContext 30 | 31 | try { 32 | sc.parallelize(1 to 100, 10).map(word => (word, 1)).reduceByKey(_ + _, 10).take(100).foreach(println) 33 | } finally { 34 | sc.stop() 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/graphx/PageRank.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.graphx 20 | 21 | import org.apache.spark.graphx.{Edge, Graph, VertexId} 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | object PageRank { 26 | def main(args: Array[String]): Unit = { 27 | val spark = SparkSession 28 | .builder() 29 | .appName("PageRank") 30 | .getOrCreate() 31 | val sc = spark.sparkContext 32 | 33 | // build vertices 34 | val users: RDD[(VertexId, Array[String])] = sc.parallelize(List( 35 | "1,BarackObama,Barack Obama", 36 | "2,ladygaga,Goddess of Love", 37 | "3,jeresig,John Resig", 38 | "4,justinbieber,Justin Bieber", 39 | "6,matei_zaharia,Matei Zaharia", 40 | "7,odersky,Martin Odersky", 41 | "8,anonsys" 42 | ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail))) 43 | 44 | // build edges 45 | val followers: RDD[Edge[Double]] = sc.parallelize(Array( 46 | Edge(2L, 1L, 1.0), 47 | Edge(4L, 1L, 1.0), 48 | Edge(1L, 2L, 1.0), 49 | Edge(6L, 3L, 1.0), 50 | Edge(7L, 3L, 1.0), 51 | Edge(7L, 6L, 1.0), 52 | Edge(6L, 7L, 1.0), 53 | Edge(3L, 7L, 1.0) 54 | )) 55 | 56 | // build graph 57 | val followerGraph: Graph[Array[String], Double] = Graph(users, followers) 58 | 59 | // restrict the graph to users with usernames and names 60 | val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2) 61 | 62 | // compute PageRank 63 | val pageRankGraph = subgraph.pageRank(0.001) 64 | 65 | // get attributes of the top pagerank users 66 | val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) { 67 | case (uid, attrList, Some(pr)) => (pr, attrList.toList) 68 | case (uid, attrList, None) => (0.0, attrList.toList) 69 | } 70 | 71 | println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n")) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/mllib/KmeansModelSaveToOss.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.mllib 20 | 21 | import org.apache.spark.mllib.clustering.KMeans._ 22 | import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} 23 | import org.apache.spark.mllib.linalg.Vectors 24 | import org.apache.spark.sql.SparkSession 25 | 26 | object KmeansModelSaveToOss { 27 | val modelOssDir = "oss://[bucket]/kmeans-model" 28 | 29 | def main(args: Array[String]) { 30 | 31 | //1. train and save the model 32 | val spark = SparkSession 33 | .builder() 34 | .config("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 35 | .config("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 36 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 37 | .config("spark.hadoop.fs.oss.accessKeyId", "xxx") 38 | .config("spark.hadoop.fs.oss.accessKeySecret", "xxx") 39 | .appName("KmeansModelSaveToOss") 40 | .getOrCreate() 41 | 42 | val sc = spark.sparkContext 43 | val points = Seq( 44 | Vectors.dense(0.0, 0.0), 45 | Vectors.dense(0.0, 0.1), 46 | Vectors.dense(0.1, 0.0), 47 | Vectors.dense(9.0, 0.0), 48 | Vectors.dense(9.0, 0.2), 49 | Vectors.dense(9.2, 0.0) 50 | ) 51 | val rdd = sc.parallelize(points, 3) 52 | val initMode = K_MEANS_PARALLEL 53 | val model = KMeans.train(rdd, k = 2, maxIterations = 2, initMode) 54 | val predictResult1 = rdd.map(feature => "cluster id: " + model.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 55 | println("modelOssDir=" + modelOssDir) 56 | model.save(sc, modelOssDir) 57 | 58 | //2. predict from the oss model 59 | val modelLoadOss = KMeansModel.load(sc, modelOssDir) 60 | val predictResult2 = rdd.map(feature => "cluster id: " + modelLoadOss.predict(feature) + " feature:" + feature.toArray.mkString(",")).collect 61 | assert(predictResult1.size == predictResult2.size) 62 | predictResult2.foreach(result2 => assert(predictResult1.contains(result2))) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/oss/JindoFsDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.oss 20 | 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | object JindoFsDemo { 24 | def main(args: Array[String]): Unit = { 25 | val bucket : String = args(0) 26 | val ossPath : String = args(1) 27 | 28 | //using access-key-id/access-key-secret 29 | val conf = new SparkConf() 30 | .setAppName("jindo-fs-demo") 31 | .set("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 32 | .set("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 33 | .set("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 34 | .set("spark.hadoop.fs.oss.accessKeyId", "xxx") 35 | .set("spark.hadoop.fs.oss.accessKeySecret", "xxx") 36 | 37 | val sc = new SparkContext(conf) 38 | 39 | try { 40 | read_oss_dir(sc, "demo", s"oss://${bucket}/${ossPath}") 41 | } finally { 42 | sc.stop() 43 | } 44 | } 45 | 46 | /** 47 | * compute cost time using jindo sdk 48 | */ 49 | def read_oss_dir(sc: SparkContext, job_des:String, ossPath: String): Unit = { 50 | val startTime: Long = System.currentTimeMillis() 51 | val inputData = sc.textFile(ossPath, 20) 52 | val cnt = inputData.count 53 | val endTime:Long = System.currentTimeMillis() 54 | val cost:Long = endTime - startTime 55 | println(s"job:$job_des, count:$cnt, consume:$cost") 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/oss/SparkUnstructuredDataCompute.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.oss 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | object SparkUnstructuredDataCompute { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .config("spark.hadoop.fs.AbstractFileSystem.oss.impl", "com.aliyun.emr.fs.oss.OSS") 28 | .config("spark.hadoop.fs.oss.impl", "com.aliyun.emr.fs.oss.JindoOssFileSystem") 29 | .config("spark.hadoop.fs.oss.endpoint", "oss-cn-hangzhou-internal.aliyuncs.com") 30 | .config("spark.hadoop.fs.oss.accessKeyId", "xxx") 31 | .config("spark.hadoop.fs.oss.accessKeySecret", "xxx") 32 | .appName("SparkUnstructuredDataCompute") 33 | .getOrCreate() 34 | 35 | val sc = spark.sparkContext 36 | try { 37 | val pathIn = "oss://bucket/inputdata/" 38 | val inputData = sc.textFile(pathIn, 5) 39 | val cnt = inputData.count 40 | println(s"count: $cnt") 41 | } finally { 42 | sc.stop() 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /spark-3.x/src/main/scala/com/aliyun/odps/spark/examples/sparksql/SparkSQL.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | *
12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.aliyun.odps.spark.examples.sparksql 20 | 21 | import org.apache.spark.sql.{SaveMode, SparkSession} 22 | 23 | object SparkSQL { 24 | def main(args: Array[String]) { 25 | val spark = SparkSession 26 | .builder() 27 | .appName("SparkSQL-on-MaxCompute") 28 | .config("spark.sql.broadcastTimeout", 20 * 60) 29 | .config("spark.sql.crossJoin.enabled", true) 30 | .config("spark.sql.defaultCatalog","odps") 31 | .config("spark.sql.catalog.odps", "org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog") 32 | .config("spark.sql.sources.partitionOverwriteMode", "dynamic") 33 | .config("spark.sql.extensions", "org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions") 34 | .config("spark.sql.catalogImplementation","hive") 35 | .getOrCreate() 36 | 37 | import spark._ 38 | import sqlContext.implicits._ 39 | val tableName = "mc_test_table" 40 | val ptTableName = "mc_test_pt_table" 41 | // Drop Create 42 | sql(s"DROP TABLE IF EXISTS ${tableName}") 43 | sql(s"DROP TABLE IF EXISTS ${ptTableName}") 44 | 45 | sql(s"CREATE TABLE ${tableName} (name STRING, num BIGINT)") 46 | sql(s"CREATE TABLE ${ptTableName} (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)") 47 | 48 | val df = spark.sparkContext.parallelize(0 to 99, 2).map(f => { 49 | (s"name-$f", f) 50 | }).toDF("name", "num") 51 | 52 | val ptDf = spark.sparkContext.parallelize(0 to 99, 2).map(f => { 53 | (s"name-$f", f, "2018", "0601") 54 | }).toDF("name", "num", "pt1", "pt2") 55 | 56 | // 写 普通表 57 | df.write.insertInto(tableName) // insertInto语义 58 | df.writeTo(tableName).overwritePartitions() // insertOverwrite use datasourceV2 59 | 60 | // 写 分区表 61 | // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 62 | df.createOrReplaceTempView(s"${ptTableName}_tmp_view") 63 | sql(s"insert into table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 64 | sql(s"insert overwrite table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") 65 | 66 | ptDf.write.insertInto(ptTableName) // 动态分区 insertInto语义 67 | ptDf.write.mode("overwrite").insertInto(ptTableName) // 动态分区 insertOverwrite语义 68 | 69 | // 读 普通表 70 | val rdf = sql(s"select name, num from $tableName") 71 | println(s"rdf show, ${rdf.count()}") 72 | rdf.show() 73 | rdf.printSchema() 74 | 75 | // 读 分区表 76 | val rptdf = sql(s"select name, num, pt1, pt2 from $ptTableName where pt1 = '2018' and pt2 = '0601'") 77 | println(s"rptdf show, ${rptdf.count()}") 78 | rptdf.show() 79 | rptdf.printSchema() 80 | } 81 | } 82 | 83 | -------------------------------------------------------------------------------- /spark-utils/libs/cupid-sdk-3.3.14.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/spark-utils/libs/cupid-sdk-3.3.14.jar -------------------------------------------------------------------------------- /spark-utils/libs/hadoop-yarn-client-3.3.12.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aliyun/MaxCompute-Spark/85ac99435e30566215f7fde03a9778fa387d0175/spark-utils/libs/hadoop-yarn-client-3.3.12.jar -------------------------------------------------------------------------------- /spark-utils/src/main/java/com/aliyun/odps/spark/CupidApplicationMetaExample.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark; 2 | import apsara.odps.cupid.protocol.CupidTaskParamProtos; 3 | import com.aliyun.odps.cupid.CupidConf; 4 | import com.aliyun.odps.cupid.CupidSession; 5 | import com.aliyun.odps.cupid.requestcupid.ApplicationMetaUtil; 6 | import com.aliyun.odps.cupid.requestcupid.CupidProxyTokenUtil; 7 | 8 | import java.util.List; 9 | import java.util.stream.Collectors; 10 | 11 | public class CupidApplicationMetaExample { 12 | 13 | // cd target 14 | // java -cp ../libs/cupid-sdk-3.3.14.jar:spark-utils-1.0.0-shaded.jar com.aliyun.odps.spark.CupidApplicationMetaExample 15 | public static void main(String[] args) throws Exception { 16 | CupidConf conf = new CupidConf(); 17 | conf.set("odps.access.id", ""); 18 | conf.set("odps.access.key", ""); 19 | conf.set("odps.project.name", ""); 20 | conf.set("odps.end.point", ""); 21 | CupidSession session = new CupidSession(conf); 22 | 23 | /* 24 | * list application metas 25 | * yarnApplicationStates: https://hadoop.apache.org/docs/r2.7.3/api/org/apache/hadoop/yarn/api/records/YarnApplicationState.html 26 | * 注意:list开销较大,调用频率不建议太高 27 | */ 28 | CupidTaskParamProtos.ApplicationMetaList applicationMetaList = ApplicationMetaUtil.listApplicationMeta( 29 | "SPARK", 30 | "5", 31 | session); 32 | List applicationMetas = applicationMetaList.getApplicationMetaListList() 33 | .stream() 34 | .collect(Collectors.toList()); 35 | if (applicationMetas.size() > 0) { 36 | applicationMetas.forEach(System.out::println); 37 | } 38 | 39 | /* 40 | * get application meta by instanceid 41 | */ 42 | String instanceId = "20211214074136554gqpk7659"; 43 | CupidTaskParamProtos.ApplicationMeta applicationMeta= ApplicationMetaUtil.getCupidInstanceMeta(instanceId, session); 44 | System.out.println(applicationMeta.toString()); 45 | } 46 | } 47 | 48 | 49 | -------------------------------------------------------------------------------- /spark-utils/src/main/java/com/aliyun/odps/spark/SparkLauncherTest.java: -------------------------------------------------------------------------------- 1 | package com.aliyun.odps.spark; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.concurrent.CountDownLatch; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.yarn.api.records.ApplicationId; 9 | import org.apache.hadoop.yarn.api.records.ApplicationReport; 10 | import org.apache.hadoop.yarn.api.records.YarnApplicationState; 11 | import org.apache.hadoop.yarn.client.api.YarnClient; 12 | import org.apache.hadoop.yarn.util.ConverterUtils; 13 | import org.apache.spark.launcher.SparkLauncher; 14 | import org.apache.spark.launcher.SparkAppHandle; 15 | 16 | public class SparkLauncherTest { 17 | 18 | private static String accessId = ""; 19 | private static String accessKey = ""; 20 | private static String projectName = ""; 21 | private static String endPoint = ""; 22 | 23 | // cd target 24 | // java -cp ../libs/*:spark-utils-1.0.0-shaded.jar com.aliyun.odps.spark.SparkLauncherTest 25 | public static void main(String[] args) throws Exception { 26 | Map env = new HashMap<>(); 27 | // relace here 28 | env.put("SPARK_HOME", 
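        // The SPARK_HOME value on the next line is the author's local example; point it at your own
        // MaxCompute Spark client directory.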
"/Users/wusj/software/spark/spark-2.3.0-odps0.33.0"); 29 | 30 | CountDownLatch countDownLatch = new CountDownLatch(1); 31 | SparkLauncher launcher = new SparkLauncher(env); 32 | launcher.setConf(SparkLauncher.DRIVER_EXTRA_CLASSPATH, System.getProperty("java.class.path")) 33 | .setConf("spark.hadoop.odps.access.id", accessId) 34 | .setConf("spark.hadoop.odps.access.key", accessKey) 35 | .setConf("spark.hadoop.odps.project.name", projectName) 36 | .setConf("spark.hadoop.odps.end.point", endPoint) 37 | .setMainClass("JavaSparkPi") 38 | // relace here 39 | .setAppResource("/Users/wusj/code/spark/test.jar") 40 | .setMaster("yarn") 41 | .setDeployMode("cluster") 42 | .startApplication(new SparkAppHandle.Listener(){ 43 | @Override 44 | public void stateChanged(SparkAppHandle handle){ 45 | System.out.println("State changed to:" + handle.getState().toString()); 46 | if (handle.getState().equals(SparkAppHandle.State.RUNNING)) { 47 | // Test kill application 48 | killApplication(handle.getAppId()); 49 | } 50 | if (handle.getState().isFinal()) { 51 | countDownLatch.countDown(); 52 | } 53 | } 54 | @Override 55 | public void infoChanged(SparkAppHandle handle) { 56 | } 57 | }); 58 | countDownLatch.await(); 59 | } 60 | 61 | public static void killApplication(String applicationId) { 62 | YarnClient client = YarnClient.createYarnClient(); 63 | Configuration conf = new Configuration(); 64 | conf.set("odps.access.id", accessId); 65 | conf.set("odps.access.key", accessKey); 66 | conf.set("odps.project.name", projectName); 67 | conf.set("odps.end.point", endPoint); 68 | client.init(conf); 69 | client.start(); 70 | 71 | ApplicationId appId = ConverterUtils.toApplicationId(applicationId); 72 | try { 73 | ApplicationReport appReport = client.getApplicationReport(appId); 74 | if (appReport.getYarnApplicationState() == YarnApplicationState.FINISHED 75 | || appReport.getYarnApplicationState() == YarnApplicationState.KILLED 76 | || appReport.getYarnApplicationState() == YarnApplicationState.FAILED) { 77 | System.out.println("Application " + applicationId + " has already finished "); 78 | } else { 79 | System.out.println("Killing application " + applicationId); 80 | client.killApplication(appId); 81 | } 82 | } catch (Exception e) { 83 | System.out.println("Kill application with id '" + applicationId + "' failed: " + e.getMessage()); 84 | } 85 | } 86 | } 87 | 88 | 89 | --------------------------------------------------------------------------------