├── .Rbuildignore ├── .github └── workflows │ └── building.yaml ├── .gitignore ├── DESCRIPTION ├── Dockerfile ├── INSTALL ├── License ├── NAMESPACE ├── R ├── init.R ├── odps_version.R ├── rodps.R ├── rodps_base_function.R ├── rodps_misc.R ├── rodps_predict.R ├── rodps_project.R ├── rodps_sql.R ├── rodps_str.R ├── rodps_table.R ├── rodps_table_hist.R ├── rodps_table_sample.R ├── rodps_table_summary.R └── rodps_version.R ├── README.md ├── configure ├── docs ├── mindmap-thumb.png └── mindmap.pdf ├── examples └── odps_config.ini.template ├── hooks └── pre-commit ├── java ├── check_style.xml ├── eclipse-java-google-style.xml ├── intellij-java-google-style.xml ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── aliyun │ │ │ └── odps │ │ │ └── rodps │ │ │ ├── DataTunnel │ │ │ ├── Context.java │ │ │ ├── DTProcess.java │ │ │ ├── DataFrameItem.java │ │ │ ├── DownloadWorker.java │ │ │ ├── MiddleStorage.java │ │ │ ├── RDTDownloader.java │ │ │ ├── RDTUploader.java │ │ │ ├── ROdpsException.java │ │ │ ├── SqliteMiddleStorage.java │ │ │ └── UploadWorker.java │ │ │ └── ROdps.java │ └── resources │ │ └── log4j2.properties │ └── test │ ├── java │ └── com │ │ └── aliyun │ │ └── odps │ │ └── rodps │ │ └── ROdpsTest.java │ └── resources │ └── log4j.properties ├── man ├── RODPS.Rd ├── error.Rd ├── head.rodps.data.Rd ├── na.omit.rodps.data.Rd ├── rodps.bizid.Rd ├── rodps.change.types.Rd ├── rodps.data.Rd ├── rodps.generate.DDL.Rd ├── rodps.help.Rd ├── rodps.init.Rd ├── rodps.predict.Rd ├── rodps.predict.fda.Rd ├── rodps.predict.rpart.Rd ├── rodps.project.Rd ├── rodps.project.current.Rd ├── rodps.project.use.Rd ├── rodps.set.Rd ├── rodps.split.ftn.Rd ├── rodps.sql.Rd ├── rodps.str.Rd ├── rodps.table.Rd ├── rodps.table.desc.Rd ├── rodps.table.drop.Rd ├── rodps.table.exist.Rd ├── rodps.table.head.Rd ├── rodps.table.hist.Rd ├── rodps.table.list.Rd ├── rodps.table.na.omit.Rd ├── rodps.table.partitions.Rd ├── rodps.table.read.Rd ├── rodps.table.rows.Rd ├── rodps.table.sample.srs.Rd ├── rodps.table.sample.strat.Rd ├── rodps.table.size.Rd ├── rodps.table.summary.Rd ├── rodps.table.write.Rd ├── rodps.tmpdir.Rd ├── rodps.unset.Rd ├── rodps.vector.Rd └── summary.rodps.data.Rd ├── tests ├── rodpstest.R ├── test_rodps_advanced.R ├── test_rodps_basics.R └── test_rodps_table.R └── tools ├── format_code.sh ├── gendoc.sh ├── package.sh ├── release.sh └── test_all.sh /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.github$ 4 | ^\.gitignore$ 5 | ^\.java-version$ 6 | ^Dockerfile$ 7 | ^docs$ 8 | ^examples$ 9 | ^hooks$ 10 | ^tests$ 11 | ^tools$ 12 | ^.*\.ini$ 13 | ^\.idea$ 14 | ^DESCRIPTION-E -------------------------------------------------------------------------------- /.github/workflows/building.yaml: -------------------------------------------------------------------------------- 1 | name: Building RODPS 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | - uses: r-lib/actions/setup-r@v2 18 | - name: Install deps 19 | run: | 20 | sudo apt install build-essential libcurl4-gnutls-dev libxml2-dev libssl-dev libharfbuzz-dev libfribidi-dev 21 | R --no-save -e "install.packages(c('devtools', 'formatR'), repo='https://mirrors.nju.edu.cn/CRAN')" 22 | - name: Building package 23 | run: | 24 | ./tools/package.sh 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .idea 3 | target/ 4 | .DS_Store 5 | *.jar 6 | *.tar.gz 7 | *.tgz 8 | *.log* 9 | RODPS.Rcheck/ 10 | *.ini 11 | *~ 12 | build/ 13 | .R* 14 | rodps.log 15 | rodps.log.* 16 | *.versionsBackup 17 | *.Rproj 18 | .vscode/ 19 | .java-version 20 | .python-version 21 | .Rproj.user 22 | inst/ 23 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: RODPS 2 | Version: 2.1.6.3 3 | Title: R interface to interact with MaxCompute/ODPS 4 | Description: This package is developed for R to interact with 5 | MaxCompute/ODPS, the platform of Alibaba to process big data. 6 | Author: ruibo.lirb@alibaba-inc.com 7 | Maintainer: Xiaming Chen 8 | URL: http://github.com/aliyun/aliyun-odps-r-plugin 9 | License: file LICENSE 10 | Depends: R (>= 1.8.0), utils, rJava, DBI, RSQLite 11 | Encoding: UTF-8 12 | SystemRequirements: Java (>= 8) 13 | Imports: methods 14 | RoxygenNote: 7.2.3 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rocker/rstudio:3.4.3 2 | RUN apt-get -y update && apt-get install -y rjava 3 | CMD /bin/bash 4 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | # Install RODPS 2 | 3 | -------------------------------------------------------------------------------- /License: -------------------------------------------------------------------------------- 1 | Copyright 1999-2023 Alibaba Group Holding Ltd. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(hist,rodps.vector) 4 | S3method(na.omit,rodps.data) 5 | S3method(rodps.predict,fda) 6 | S3method(rodps.predict,rpart) 7 | S3method(summary,rodps.data) 8 | export(error) 9 | export(fatal) 10 | export(head.rodps.data) 11 | export(info) 12 | export(init.odps.ext) 13 | export(rodps.bizid) 14 | export(rodps.change.types) 15 | export(rodps.current.project) 16 | export(rodps.data) 17 | export(rodps.desc.table) 18 | export(rodps.drop.table) 19 | export(rodps.exist.table) 20 | export(rodps.generate.DDL) 21 | export(rodps.help) 22 | export(rodps.hist) 23 | export(rodps.init) 24 | export(rodps.list.table) 25 | export(rodps.list.tables) 26 | export(rodps.load.table) 27 | export(rodps.partitions.table) 28 | export(rodps.predict) 29 | export(rodps.project.current) 30 | export(rodps.project.use) 31 | export(rodps.query) 32 | export(rodps.read.table) 33 | export(rodps.rows.table) 34 | export(rodps.sample.srs) 35 | export(rodps.sample.strat) 36 | export(rodps.set) 37 | export(rodps.size.table) 38 | export(rodps.split.ftn) 39 | export(rodps.sql) 40 | export(rodps.str) 41 | export(rodps.table.desc) 42 | export(rodps.table.drop) 43 | export(rodps.table.exist) 44 | export(rodps.table.head) 45 | export(rodps.table.hist) 46 | export(rodps.table.list) 47 | export(rodps.table.na.omit) 48 | export(rodps.table.partitions) 49 | export(rodps.table.read) 50 | export(rodps.table.rows) 51 | export(rodps.table.sample.srs) 52 | export(rodps.table.sample.strat) 53 | export(rodps.table.size) 54 | export(rodps.table.summary) 55 | export(rodps.table.write) 56 | export(rodps.tmpdir) 57 | export(rodps.unset) 58 | export(rodps.vector) 59 | export(rodps.write.table) 60 | export(str.rodps.data) 61 | export(warn) 62 | import(rJava) 63 | importFrom(stats,df) 64 | importFrom(stats,na.omit) 65 | importFrom(stats,runif) 66 | importFrom(utils,read.table) 67 | -------------------------------------------------------------------------------- /R/init.R: -------------------------------------------------------------------------------- 1 | .onLoad <- function(libname, pkgname) { 2 | init.java.env(libname, pkgname) 3 | rodpsTmpdir <<- tempdir() 4 | maxRecord <<- 10000 5 | errormsg <<- load.errormsg() 6 | conf <- rodps.loadconf() 7 | odpsOperator <<- NULL 8 | if (!is.null(conf)) { 9 | .init_odps_operator(conf) 10 | } 11 | } 12 | 13 | .check.init <- function() { 14 | if (length(ls(envir = .GlobalEnv, pattern = "odpsOperator")) == 0 || is.null(odpsOperator)) { 15 | stop(print("RODPS uninitialized or session timeout, please exectue rodps.init(path), path for the path of odps_config.ini")) 16 | } 17 | } 18 | 19 | #' Init ODPS with configs 20 | #' 21 | #' @param path File path string indicating odps_config 22 | #' @param access.id Access ID string 23 | #' @param access.key Access Key string 24 | #' @export 25 | rodps.init <- function(path = NULL, access.id = NULL, access.key = NULL) { 26 | # in case of isolating global variable by using rm(list=ls(all=TRUE)) + 27 | # rodps.init 28 | rodpsTmpdir <<- tempdir() 29 | maxRecord <<- 10000 30 | errormsg <<- load.errormsg() 31 | conf <- rodps.loadconf(path) 32 | if (!is.null(access.id)) { 33 | conf["access_id"] <- access.id 34 | } 35 | if (!is.null(access.key)) { 36 | conf["access_key"] <- access.key 37 | } 38 | odpsOperator <<- NULL 39 | if (!is.null(conf)) { 40 | .init_odps_operator(conf) 41 | } 42 | } 43 | 44 | #' Change RODPS TempDir 45 | #' 46 | #' @param path Target path string 47 | #' @export 48 | rodps.tmpdir <- function(path) { 49 | rodpsTmpdir <<- path 50 | } 51 | 52 | #' @noRd 53 | .init_odps_operator <- function(conf) { 54 | # tunnel endpoint should be explicitly specified 55 | if (!is.na(conf["tunnel_endpoint"])) { 56 | tunnel_endpoint <- conf["tunnel_endpoint"] 57 | } else if (!is.na(conf["dt_end_point"])) { 58 | tunnel_endpoint <- conf["dt_end_point"] 59 | } else { 60 | tunnel_endpoint <- "NA" 61 | writeLines("WARN: tunnel_endpoint not set, auto-routed tunnel endpoint might not work") 62 | } 63 | # init tmp dir 64 | if (!is.na(conf["rodps_tmpdir"])) { 65 | rodpsTmpdir <<- conf["rodps_tmpdir"] 66 | } 67 | # java odps object 68 | odpsOperator <<- .jnew("com/aliyun/odps/rodps/ROdps", conf["project_name"], conf["access_id"], 69 | conf["access_key"], conf["sts_token"], conf["end_point"], tunnel_endpoint, 70 | conf["logview_host"], conf["log4j_properties"]) 71 | rodps.init.type() 72 | } 73 | 74 | #' @noRd 75 | init.java.env <- function(libname, pkgname) { 76 | library(rJava) 77 | if ("windows" == .Platform$OS.type) { 78 | .jinit(parameters = c("-Xmx512m", "-Xms512m")) 79 | } else { 80 | .jinit() 81 | } 82 | jarPath <- paste(libname, pkgname, "java", sep = .Platform$file.sep) 83 | jarFiles <- list.files(jarPath) 84 | .jaddClassPath(paste(jarPath, "/.", sep = "")) 85 | for (i in 1:length(jarFiles)) { 86 | jarFiles[i] <- paste(jarPath, jarFiles[i], sep = .Platform$file.sep) 87 | if (0 == length(grep("DT", jarFiles[i]))) 88 | .jaddClassPath(jarFiles[i]) 89 | } 90 | init.dtsdk.env(libname, pkgname) 91 | } 92 | 93 | #' @noRd 94 | init.dtsdk.env <- function(libname, pkgname) { 95 | jarPath <- paste(libname, pkgname, "java", "DT", sep = .Platform$file.sep) 96 | jarFiles <- list.files(jarPath) 97 | for (i in 1:length(jarFiles)) { 98 | jarFiles[i] <- paste(jarPath, jarFiles[i], sep = .Platform$file.sep) 99 | .jaddClassPath(jarFiles[i]) 100 | } 101 | } 102 | 103 | #' @noRd 104 | rodps.loadconf <- function(path = NULL) { 105 | if (is.null(path) || path == "" || !file.exists(path)) { 106 | path <- Sys.getenv("RODPS_CONFIG") 107 | writeLines("checking RODPS_CONFIG ...") 108 | } 109 | if (is.null(path) || path == "" || !file.exists(path)) { 110 | path <- Sys.getenv("ODPS_CONFIG") 111 | writeLines("checking ODPS_CONFIG ...") 112 | } 113 | if (is.null(path) || path == "" || !file.exists(path)) { 114 | path <- paste(Sys.getenv("HOME"), .Platform$file.sep, "odps_config.ini", 115 | sep = "") 116 | writeLines(paste("checking", path, "...")) 117 | } 118 | if (is.null(path) || path == "" || !file.exists(path)) { 119 | path <- paste(Sys.getenv("HOME"), .Platform$file.sep, ".odpscmd", .Platform$file.sep, 120 | "odps_config.ini", sep = "") 121 | writeLines(paste("checking", path, "...")) 122 | } 123 | if (is.null(path) || path == "" || !file.exists(path)) { 124 | writeLines("RODPS_CONFIG environment variable is not set or the configuration file does not exist.\n- rodps.init(path) to manually init RODPS\n- add RODPS_CONFIG as environment variable") 125 | return(NULL) 126 | } 127 | writeLines(paste("using config file", path)) 128 | conf <- read.table(path, stringsAsFactors = FALSE) 129 | keys <- c() 130 | values <- c() 131 | for (i in 1:nrow(conf)) { 132 | row <- conf[i, ] 133 | if (nchar(row) > 0 && substr(row, 1, 1) != "#") { 134 | idx <- grep("=", strsplit(row, "")[[1]])[[1]] 135 | if (length(idx) < 1) { 136 | warn("config_error", row) 137 | } else { 138 | key = substr(row, 1, idx - 1) 139 | value = substr(row, idx + 1, nchar(row)) 140 | keys[i] <- key 141 | values[i] <- value 142 | } 143 | } 144 | } 145 | names(values) <- keys 146 | 147 | # add access_id/access_key check 148 | if (is.na(values["access_id"])) { 149 | values["access_id"] <- readline("Please input access_id:") 150 | } 151 | if (is.na(values["access_key"])) { 152 | values["access_key"] <- readline("Please input access_key:") 153 | } 154 | return(values) 155 | } 156 | 157 | #' @noRd 158 | rodps.init.type <- function() { 159 | type.map <- c("integer=int,tinyint,smallint", "numeric=double,float,long,bigint", 160 | "POSIXct=datetime", "Date=date", "character=string", "logical=boolean", "factor=string") 161 | keys1 <- c() 162 | values1 <- c() 163 | keys2 <- c() 164 | values2 <- c() 165 | i <- 1 166 | j <- 1 167 | for (m in unlist(type.map)) { 168 | mp <- strsplit(m, "=") 169 | vs <- strsplit(unlist(mp)[2], ",") 170 | keys1[i] <- unlist(mp)[1] 171 | values1[i] <- unlist(vs)[1] 172 | for (v in unlist(vs)) { 173 | keys2[j] <- v 174 | values2[j] <- unlist(mp)[1] 175 | j <- j + 1 176 | } 177 | i <- i + 1 178 | } 179 | names(values1) <- keys1 180 | rodps.type.r2java <<- values1 181 | names(values2) <- keys2 182 | rodps.type.java2r <<- values2 183 | } 184 | 185 | #' Change RODPS Java types into R 186 | #' @export 187 | rodps.change.types <- function(types) { 188 | newtypes <- c() 189 | i <- 1 190 | for (t in types) { 191 | newtypes[i] <- rodps.type.java2r[t] 192 | i <- i + 1 193 | } 194 | return(newtypes) 195 | } 196 | 197 | #' @noRd 198 | load.errormsg <- function() { 199 | keys <- c() 200 | values <- c() 201 | names(values) <- keys 202 | values <- add.errormsg(values, "table_not_found", "table not found.") 203 | values <- add.errormsg(values, "odps_config_ini_missing", paste("Can not find odps_config.ini with env variable RODPS_CONFIG, init ODPS environment fail!\n", 204 | "Please check your odps_config.ini file then re-load RODPS again. \n", "\n")) 205 | values <- add.errormsg(values, "invalid_value", "invalid value") 206 | values <- add.errormsg(values, "config_error", "odps conf error on row ") 207 | values <- add.errormsg(values, "input_query_error", "input query error ") 208 | values <- add.errormsg(values, "argument_type_error", "argument type is wrong ") 209 | values <- add.errormsg(values, "invalid_project_name", "The project name can not be empty ") 210 | return(values) 211 | } 212 | 213 | #' @noRd 214 | add.errormsg <- function(errormap, newkey, newvalue) { 215 | idx <- length(errormap) + 1 216 | errormap[idx] <- newvalue 217 | names(errormap)[idx] <- newkey 218 | return(errormap) 219 | } 220 | 221 | #' Print Messages 222 | #' @export 223 | error <- function(error_name, msg = NULL) { 224 | print(geterror(error_name, msg)) 225 | } 226 | 227 | #' @rdname error 228 | #' @export 229 | fatal <- function(error_name, msg = NULL) { 230 | print(geterror(error_name, msg)) 231 | } 232 | 233 | #' @rdname error 234 | #' @export 235 | info <- function(error_name, msg = NULL) { 236 | print(geterror(error_name, msg)) 237 | } 238 | 239 | #' @rdname error 240 | #' @export 241 | warn <- function(error_name, msg = NULL) { 242 | print(geterror(error_name, msg)) 243 | } 244 | 245 | #' @noRd 246 | geterror <- function(error_name, msg = NULL) { 247 | if (is.null(error_name) || error_name == "") { 248 | output = "" 249 | } else if (is.null(errormsg[error_name])) { 250 | output = error_name 251 | } else { 252 | output = errormsg[error_name] 253 | } 254 | 255 | if (!is.null(msg) && msg != "") { 256 | output <- paste(output, "-", msg) 257 | } 258 | return(output) 259 | } 260 | -------------------------------------------------------------------------------- /R/odps_version.R: -------------------------------------------------------------------------------- 1 | rodps.version <- function() { 2 | print("RODPS 2.1.6.3") 3 | } 4 | -------------------------------------------------------------------------------- /R/rodps.R: -------------------------------------------------------------------------------- 1 | 2 | #' @title RODPS: R interface to interact with ODPS 3 | #' 4 | #' @description RODPS is an R extension to enable R to interact with ODPS 5 | #' system, also support other related algorithm packages. 6 | #' 7 | #' @name RODPS 8 | #' @docType package 9 | #' @concept RODPS 10 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 11 | #' @details The RODPS package supplies functions to interact with ODPS from 12 | #' within R. There are functions for exporting and connecting as well as 13 | #' querying ODPS. Please make sure the environment variable ODPS_CONFIG is set, 14 | #' it's in the same format as used in odpscmd, this file is required when 15 | #' connecting to ODPS. 16 | #' 17 | #' @seealso \code{\link{rodps.sql}}, \code{\link{rodps.set}}, 18 | #' \code{\link{rodps.table}},\code{\link{rodps.project}} 19 | #' 20 | #' @import rJava 21 | #' @importFrom stats df 22 | #' @importFrom stats runif 23 | #' @importFrom utils read.table 24 | #' @importFrom stats na.omit 25 | NULL 26 | -------------------------------------------------------------------------------- /R/rodps_base_function.R: -------------------------------------------------------------------------------- 1 | #' Show help information 2 | #' 3 | #' @return NULL 4 | #' @export 5 | rodps.help <- function() { 6 | cat("Please try help(rodps) \n") 7 | } 8 | 9 | #' Set business ID 10 | #' 11 | #' @param bizid business id, e.g. 012345^. 12 | #' @author \email{ruibo.lirb@alibaba-inc.com} 13 | #' @seealso \code{\link{RODPS}}, \code{\link{rodps.sql}} 14 | #' @examples 15 | #' ## set business id to 012345 16 | #' \dontrun{rodps.bizid('012345^')} 17 | #' @export 18 | rodps.bizid <- function(bizid) { 19 | .check.init() 20 | odpsOperator$setBizId(bizid) 21 | } 22 | 23 | #' Set task properties 24 | #' 25 | #' Set properties for SQL task 26 | #' 27 | #' @param key setting name, e.g. odps.sql.allow.fullscan. 28 | #' @param value setting value. 29 | #' @author \email{ruibo.lirb@alibaba-inc.com} 30 | #' @seealso \code{\link{RODPS}}, \code{\link{rodps.sql}}, 31 | #' \code{\link{rodps.unset}}, 32 | #' @examples 33 | #' ## enable full table scan 34 | #' \dontrun{rodps.set('odps.sql.allow.fullscan', 'true')} 35 | #' @export 36 | rodps.set <- function(key, value) { 37 | .check.init() 38 | odpsOperator$set(key, value) 39 | } 40 | 41 | #' Unset task properties 42 | #' 43 | #' Unset properties for SQL task 44 | #' 45 | #' @param key setting name, e.g. odps.sql.allow.fullscan. 46 | #' @author \email{ruibo.lirb@alibaba-inc.com} 47 | #' @seealso \code{\link{RODPS}}, \code{\link{rodps.sql}}, 48 | #' \code{\link{rodps.set}}, 49 | #' @examples 50 | #' ## set full table scan to its default value 51 | #' \dontrun{rodps.unset('odps.sql.allow.fullscan')} 52 | #' @export 53 | rodps.unset <- function(key) { 54 | .check.init() 55 | odpsOperator$unset(key) 56 | } 57 | 58 | .dataframe.to.sqlite <- function(dataframe, thread, filename, tablename, isdebug) { 59 | if (!require(DBI, quietly = TRUE)) { 60 | stop("DBI library not available") 61 | } 62 | if (!require(RSQLite, quietly = TRUE)) { 63 | stop("RSQLite library not available") 64 | } 65 | dl <- list() 66 | recordNumPerThread <- nrow(dataframe)%/%thread 67 | dbNames = c() 68 | for (i in (0:(thread - 1))) { 69 | startPos <- i * recordNumPerThread + 1 70 | endPos <- startPos + recordNumPerThread - 1 71 | if (i == thread - 1) { 72 | endPos <- nrow(dataframe) 73 | } 74 | dl[[length(dl) + 1]] <- as.data.frame(dataframe[startPos:endPos, ]) 75 | } 76 | 77 | for (i in (1:length(dl))) { 78 | dbName <- paste(filename, "_", i - 1, sep = "") 79 | if (file.exists(dbName)) { 80 | print(paste("warning:upload middle file", dbName, "already exist, now delete it.")) 81 | file.remove(dbName) 82 | } 83 | 84 | con <- dbConnect(SQLite(), dbname = dbName) 85 | dbWriteTable(con, tablename, dl[[i]], row.names = FALSE) 86 | if (isdebug) { 87 | print(paste("write file", i, ":", dbName)) 88 | } 89 | dbDisconnect(con) 90 | dbNames <- append(dbNames, dbName) 91 | } 92 | return(dbNames) 93 | } 94 | 95 | .sqlite.to.dataframe <- function(dbs, coltype, tablename, isdebug) { 96 | if (!require(DBI, quietly = TRUE)) { 97 | stop("DBI library not available") 98 | } 99 | if (!require(RSQLite, quietly = TRUE)) { 100 | stop("RSQLite library not available") 101 | } 102 | 103 | filenum <- dbs$size() 104 | if (filenum == 0) { 105 | stop("Internal error: no middle file return.") 106 | } 107 | data <- data.frame() 108 | for (i in 0:(filenum - 1)) { 109 | filename <- dbs$get(i) 110 | if (!file.exists(filename)) { 111 | stop(paste("file not exists:", filename)) 112 | } 113 | con <- dbConnect(SQLite(), dbname = filename) 114 | sql = paste("select * from [", tablename, "]", sep = "") 115 | tmp_data <- dbGetQuery(con, sql) 116 | data <- rbind(data, tmp_data) 117 | dbDisconnect(con) 118 | if (isdebug) { 119 | print(paste("download temp file:", filename)) 120 | } else { 121 | file.remove(filename) 122 | } 123 | } 124 | 125 | for (i in 0:(coltype$size() - 1)) { 126 | if (coltype$get(i) == "datetime" || coltype$get(i) == "date" || coltype$get(i) == 127 | "timestamp") { 128 | data[[i + 1]] = as.POSIXct(as.POSIXlt(data[[i + 1]], origin = "1970-01-01")) 129 | } else if (coltype$get(i) == "boolean") { 130 | data[[i + 1]] = as.logical(data[[i + 1]]) 131 | } else if (coltype$get(i) == "decimal") { 132 | data[[i + 1]] = as.numeric(data[[i + 1]]) 133 | } 134 | } 135 | return(data) 136 | } 137 | 138 | .data.frame.get.namelist <- function(dataframe) { 139 | if (!is.data.frame(dataframe)) 140 | stop("input data is not data frame") 141 | retlist <- .jnew("java/util/ArrayList") 142 | columnnum <- length(dataframe) 143 | for (i in 1:columnnum) { 144 | retlist$add(names(dataframe)[i]) 145 | } 146 | return(retlist) 147 | } 148 | 149 | .data.frame.get.typelist <- function(dataframe) { 150 | if (!is.data.frame(dataframe)) 151 | stop("input data is not data frame") 152 | retlist <- .jnew("java/util/ArrayList") 153 | columnnum <- length(dataframe) 154 | for (i in 1:columnnum) { 155 | retlist$add(.get.object.type(dataframe[[i]])) 156 | } 157 | return(retlist) 158 | } 159 | 160 | .data.frame.to.arraylist <- function(dataframe) { 161 | if (!is.data.frame(dataframe)) 162 | stop("input data is not data frame") 163 | 164 | retlist <- .jnew("java/util/ArrayList") 165 | columnnum <- length(dataframe) 166 | for (i in 1:columnnum) { 167 | name <- names(dataframe)[i] 168 | type <- .get.object.type(dataframe[[i]]) 169 | dataframeitem <- .jnew("com/aliyun/openservices/odps/roperator/DataFrameItem", 170 | name, type) 171 | for (j in 1:length(dataframe[[i]])) { 172 | dataframeitem$getData()$add(as.character(dataframe[[i]][j])) 173 | } 174 | retlist$add(dataframeitem) 175 | } 176 | return(retlist) 177 | } 178 | 179 | .get.object.type <- function(obj) { 180 | type <- is(obj)[1] 181 | return(rodps.type.r2java[type]) 182 | } 183 | 184 | .dataframe.code.conv <- function(dataframe, fromcode, tocode) { 185 | collen <- ncol(dataframe) 186 | for (i in 1:collen) { 187 | type <- is(dataframe[[i]])[1] 188 | if (type == "character" || type == "factor") { 189 | dataframe[[i]] = iconv(dataframe[[i]], fromcode, tocode) 190 | } 191 | } 192 | return(dataframe) 193 | } 194 | 195 | .check.tablename <- function(tablename) { 196 | if (is.null(tablename) || tablename == "") { 197 | stop(error("invalid_value", "table name is null")) 198 | } 199 | if (!is.character(tablename)) { 200 | stop(error("argument_type_error", "tablename must be string type.")) 201 | } 202 | } 203 | 204 | .change.data <- function(ret) { 205 | rdata <- list() 206 | if (!is.null(ret) && ret$size() > 0) { 207 | data <- .jcast(ret, new.class = "java/util/List", check = FALSE, convert.array = FALSE) 208 | vlist <- c(0:(data$size() - 1)) 209 | for (pos in vlist) { 210 | dfitem <- data$get(as.integer(pos)) 211 | rdata[[pos + 1]] <- .change.type(dfitem$getType()) 212 | names(rdata)[pos + 1] <- dfitem$getName() 213 | values <- dfitem$getData() 214 | if (values$size() > 0) { 215 | for (i in (0:(values$size() - 1))) { 216 | v <- values$get(as.integer(i)) 217 | if (is.null(v)) { 218 | v <- NA 219 | } 220 | rdata[[pos + 1]][i + 1] <- .change.value(dfitem$getType(), v) 221 | } 222 | } 223 | } 224 | } 225 | return(as.data.frame(rdata, stringsAsFactors = FALSE)) 226 | } 227 | 228 | .change.value <- function(type, value) { 229 | type <- tolower(type) 230 | if (is.null(type)) { 231 | return(as.character(value)) 232 | } 233 | rtype <- rodps.type.java2r[type] 234 | if (!is.null(rtype) && rtype != "character") { 235 | return(eval(parse(text = paste("as.", rtype, "('", value, "')", sep = "")))) 236 | } else { 237 | return(as.character(value)) 238 | } 239 | } 240 | 241 | .change.type <- function(type) { 242 | type <- tolower(type) 243 | rtype <- rodps.type.java2r[type] 244 | if (is.null(rtype)) { 245 | return(character()) 246 | } 247 | if (type == "datetime") { 248 | return(Sys.time()) 249 | } 250 | if (type == "date") { 251 | return(date()) 252 | } 253 | return(eval(parse(text = paste(rtype, "()", sep = "")))) 254 | } 255 | 256 | 257 | .change.to.list <- function(ret) { 258 | lst <- list() 259 | if (!is.null(ret)) { 260 | data <- .jcast(ret, new.class = "java/util/List", check = FALSE, convert.array = FALSE) 261 | if (!data$isEmpty()) { 262 | vlist <- c(0:(data$size() - 1)) 263 | for (pos in vlist) { 264 | dfitem <- data$get(as.integer(pos)) 265 | values <- dfitem$getData() 266 | if (values$size() > 1) { 267 | vs <- c() 268 | for (i in (0:(values$size() - 1))) { 269 | v <- values$get(as.integer(i)) 270 | if (is.null(v)) { 271 | vs[i + 1] <- NA 272 | } else { 273 | vs[i + 1] <- .change.value(dfitem$getType(), v) 274 | } 275 | } 276 | lst[[dfitem$getName()]] <- vs 277 | } else { 278 | v <- values$get(as.integer(0)) 279 | if (is.null(v)) { 280 | lst[[dfitem$getName()]] <- NA 281 | } else { 282 | lst[[dfitem$getName()]] <- .change.value(dfitem$getType(), values$get(as.integer(0))) 283 | } 284 | } 285 | } 286 | } 287 | } 288 | return(lst) 289 | } 290 | 291 | # 将java中返回的List转化成对象 292 | .change.to.obj <- function(ret) { 293 | obj <- "object" 294 | if (!is.null(ret)) { 295 | data <- .jcast(ret, new.class = "java/util/List", check = FALSE, convert.array = FALSE) 296 | if (!data$isEmpty()) { 297 | vlist <- c(0:(data$size() - 1)) 298 | for (pos in vlist) { 299 | dfitem <- data$get(as.integer(pos)) 300 | values <- dfitem$getData() 301 | if (values$size() > 1) { 302 | vs <- c() 303 | for (i in (0:(values$size() - 1))) { 304 | v <- values$get(as.integer(i)) 305 | if (is.null(v)) { 306 | vs[i + 1] <- NA 307 | } else { 308 | vs[i + 1] <- .change.value(dfitem$getType(), v) 309 | } 310 | } 311 | attr(obj, dfitem$getName()) <- vs 312 | } else { 313 | v <- values$get(as.integer(0)) 314 | if (is.null(v)) { 315 | attr(obj, dfitem$getname()) <- NA 316 | } else { 317 | attr(obj, dfitem$getName()) <- .change.value(dfitem$getType(), 318 | values$get(as.integer(0))) 319 | } 320 | } 321 | } 322 | } 323 | } 324 | return(obj) 325 | } 326 | -------------------------------------------------------------------------------- /R/rodps_misc.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | init.odps.ext <- function() { 3 | # tbl: table name, does not accept partition load.time: when object is 4 | # created, to determine if the buffer data is outdated data: local buffer, 5 | # if data is small enough, load it into buffer 6 | setClass("rodps.data", representation(tbl = "character", load.time = "POSIXct", 7 | data = "data.frame")) 8 | setClass("rodps.vector", representation(tbl = "character", col = "character", 9 | load.time = "POSIXct", data = "data.frame")) 10 | } 11 | 12 | #' Set up the odps.data class 13 | #' @export 14 | rodps.data <- function(tblname) { 15 | rt <- new("rodps.data", tbl = tblname, load.time = Sys.time()) 16 | return(rt) 17 | } 18 | 19 | #' Set up odps.vector class, a vector is nothing but a column in table 20 | #' @export 21 | rodps.vector <- function(tblname, colname) { 22 | rt <- new("rodps.vector", tbl = tblname, col = colname, load.time = Sys.time()) 23 | return(rt) 24 | } 25 | 26 | #' Remove NULL values from a table 27 | #' @export 28 | na.omit.rodps.data <- function(rd) { 29 | rodps.table.na.omit(rd@tbl) 30 | } 31 | 32 | #' Remove NULL values from a table 33 | #' @export 34 | rodps.table.na.omit <- function(tbl, tgttbl) { 35 | des <- rodps.table.desc(tbl) 36 | rows <- rodps.table.rows(tbl) 37 | cols <- des$columns 38 | cond <- paste(cols$names, " is not null ", sep = "", collapse = " \n and ") 39 | sql <- sprintf("create table %s as \n select * from %s \n where %s ", tgttbl, 40 | tbl, cond) 41 | } 42 | -------------------------------------------------------------------------------- /R/rodps_predict.R: -------------------------------------------------------------------------------- 1 | #' Extend predict function 2 | #' @export 3 | rodps.predict <- function(x, ...) { 4 | UseMethod("rodps.predict", x) 5 | } 6 | 7 | #' Extend Recursive Partitioning 8 | #' 9 | #' @param object Rpart model 10 | #' @param srctbl Data source table 11 | #' @param tgttbl Target table of prediction results 12 | #' @param dryrun Return the prediction SQL string instead of running the query 13 | #' @export 14 | rodps.predict.rpart <- function(object, srctbl, tgttbl, inc.col = NULL, dryrun = FALSE) { 15 | if (!require("rpart")) { 16 | stop("rpart package required in rodps.predict.rpart") 17 | } 18 | 19 | if (class(object) != "rpart") { 20 | stop("object is not rpart model") 21 | } 22 | if ((object$method != "class") && (object$method != "anova")) { 23 | stop("this model method is not class or anova, not supported yet") 24 | } 25 | yvar = as.character(attr(object$terms, "variables"))[-1][attr(object$terms, "response")] 26 | if (length(yvar) > 1) { 27 | stop("Multiple response variable found in formula") 28 | } 29 | yvar.p = sprintf("%s_predict", yvar) 30 | sql = sprintf("CREATE TABLE IF NOT EXISTS %s AS\n", tgttbl) 31 | 32 | srccol <- as.character(object$frame$var[object$frame$var != ""]) 33 | ylevels <- attr(object, "ylevels") 34 | ylabels <- labels(object) 35 | leafidx <- which(object$frame$var == "") 36 | cw = " CASE " 37 | sidx = which(object$frame$var != "") 38 | nodes = as.numeric(row.names(object$frame)) 39 | 40 | for (i in leafidx) { 41 | pos = nodes[i] 42 | cond = ylabels[i] 43 | # get parent condition 44 | parentidx = pos%/%2 45 | while (parentidx > 1) { 46 | pcond = ylabels[which(nodes == parentidx)] 47 | cond = sprintf("%s AND %s", pcond, cond) 48 | parentidx = parentidx%/%2 49 | } 50 | 51 | if (object$method == "class") { 52 | cond = sprintf(" WHEN %s THEN '%s'", cond, ylevels[object$frame$yval[i]]) 53 | } else if (object$method == "anova") { 54 | cond = sprintf(" WHEN %s THEN %.5f", cond, object$frame$yval[i]) 55 | } else { 56 | stop("Invalid method") 57 | } 58 | cw = sprintf("%s \n %s", cw, cond) 59 | } 60 | cw = sprintf("%s \n END AS %s\n", cw, yvar.p) 61 | sel = " SELECT " 62 | if (!is.null(inc.col)) { 63 | sel = sprintf("%s\n %s,", sel, paste(inc.col, sep = "", collapse = ",\n ")) 64 | } 65 | for (col in unique(srccol)) { 66 | sel = sprintf("%s \n %s,", sel, col) 67 | } 68 | sel = sprintf("%s \n %s,", sel, yvar) 69 | sql = sprintf("%s %s \n%s FROM %s;\n", sql, sel, cw, srctbl) 70 | 71 | if (dryrun) { 72 | return(sql) 73 | } else { 74 | cat(sql) 75 | if (!rodps.table.exist(srctbl)) { 76 | stop(sprintf("source table %s does not exist", srctbl)) 77 | } 78 | 79 | if (rodps.table.exist(tgttbl)) { 80 | stop(sprintf("target table %s already exists", tgttbl)) 81 | } 82 | rodps.sql(sql) 83 | } 84 | } 85 | 86 | #' Extend FDA 87 | 88 | #' @param object FDA model 89 | #' @param srctbl Data source table 90 | #' @param tgttbl Target table of prediction results 91 | #' @param dryrun Return the prediction SQL string instead of running the query 92 | #' @export 93 | rodps.predict.fda <- function(object, srctbl, tgttbl, prior, type = "class", dimension = 2) { 94 | if (!require(mda)) { 95 | stop("mda library not available") 96 | } 97 | if (class(object) != "fda") { 98 | stop("Invalid object class, only support fda model") 99 | } 100 | dist <- function(x, mean, m = ncol(mean)) (scale(x, mean, FALSE)^2) %*% rep(1, 101 | m) 102 | 103 | type <- match.arg(type) 104 | if (type != "class") { 105 | stop("type is not class, not supported yet") 106 | } 107 | if (object$fit$monomial != FALSE) { 108 | stop("unsupported monomial class") 109 | } 110 | if (attr(object$fit, "class") != "polyreg") { 111 | stop("unsupported fit class, only polyreg works") 112 | } 113 | if (object$fit$degree != 1) { 114 | stop("unsupported object$fit$degree") 115 | } 116 | means <- object$means 117 | Jk <- dim(means) 118 | J <- Jk[1] 119 | k <- Jk[2] 120 | if (k > 2) { 121 | stop("k is not 2, unsupported dimension <=2") 122 | } 123 | if (type == "hierarchical") { 124 | if (missing(dimension)) 125 | dimension.set <- seq(k) else { 126 | dimension.set <- dimension[dimension <= k] 127 | if (!length(dimension.set)) 128 | dimension.set <- k 129 | dimension <- max(dimension.set) 130 | } 131 | } else { 132 | dimension <- min(max(dimension), k) 133 | } 134 | # y <- predict(object$fit, newdata)  #假设object$fit$degree=1, 135 | # object$fit$monomial=FALSE 136 | yvar.name = paste(all.vars(object$terms)[attr(object$terms, "response")], seq(1:ncol(object$fit$coefficients)), 137 | sep = "_ln") 138 | # 线性变换intercept + a1x1+a2x2+... 139 | vars = rownames(object$fit$coefficients) 140 | names(vars) = rownames(object$fit$coefficients) 141 | vars[which(vars == "Intercept")] = "1" 142 | var.exp = c() 143 | for (i in seq(1:ncol(object$fit$coefficients))) { 144 | var.exp[i] = paste(object$fit$coefficients[, i], vars, sep = "*", collapse = "+") 145 | } 146 | sql.linear.exp = paste(var.exp, yvar.name, sep = " AS ", collapse = ",\n ") 147 | sql.linear.exp = gsub("\\+-", "-", sql.linear.exp) 148 | sql.linear = sprintf(" SELECT * ,\n %s \n FROM %s\n ", sql.linear.exp, srctbl) 149 | 150 | # y <- y %*% object$theta[, seq(dimension), drop = FALSE] #这里转为SQL 151 | 152 | lambda <- object$values 153 | alpha <- sqrt(lambda[seq(dimension)]) 154 | sqima <- sqrt(1 - lambda[seq(dimension)]) 155 | # 根据alpha值缩放,合并到上一步,sql.tran中 newdata <- scale(y, FALSE, 156 | # sqima * alpha) 157 | sa = sqima * alpha 158 | # 投影+缩放 159 | theta = object$theta[, seq(dimension), drop = FALSE] 160 | ytran.name = paste(all.vars(object$terms)[attr(object$terms, "response")], seq(1:ncol(object$fit$coefficients)), 161 | sep = "_tr") 162 | var.exp = c() 163 | for (i in seq(1:ncol(theta))) { 164 | var.exp[i] = sprintf(" (%s)/%f ", paste(theta[, i], yvar.name, sep = "*", 165 | collapse = "+"), sa[i]) 166 | } 167 | sql.tran.exp = paste(var.exp, ytran.name, sep = " AS ", collapse = " ,\n ") 168 | sql.tran.exp = gsub("\\+-", "-", sql.tran.exp) 169 | sql.tran = sprintf(" SELECT *, \n %s \n FROM (\n %s ) sub1 \n", sql.tran.exp, 170 | sql.linear) 171 | if (missing(prior)) 172 | prior <- object$prior else { 173 | if (any(prior < 0) | round(sum(prior), 5) != 1) 174 | stop("innappropriate prior") 175 | } 176 | means <- means[, seq(dimension), drop = FALSE] 177 | 178 | prior <- 2 * log(prior) 179 | dist_list = c() 180 | for (i in seq(1:nrow(means))) { 181 | dist_i = paste(ytran.name, "-", means[i, ], sep = "") 182 | dist_i = paste(" pow(", dist_i, ",2)", sep = "", collapse = "+") 183 | # 加上prior 184 | dist_i = paste(dist_i, prior[i], sep = "-") 185 | dist_list[i] = dist_i 186 | } 187 | dist_exp = paste(dist_list, sep = " , ", collapse = ",") 188 | dist_exp = gsub("--", "+", dist_exp) 189 | dist_exp = gsub("\\+-", "-", dist_exp) 190 | label_exp = paste(" WHEN ", seq(nrow(means)), " THEN '", rownames(means), "'", 191 | sep = "", collapse = "") 192 | case_exp = sprintf(" CASE least_index(%s) \n %s \n END AS %s\n", dist_exp, label_exp, 193 | "predict_v") 194 | sql = sprintf(" CREATE TABLE %s AS \nSELECT *, \n%s \n FROM (\n %s \n) sub_2", 195 | tgttbl, case_exp, sql.tran) 196 | } 197 | -------------------------------------------------------------------------------- /R/rodps_project.R: -------------------------------------------------------------------------------- 1 | #' @name rodps.project 2 | #' @title Project functions 3 | #' @description Provide functions to operate project. 4 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 5 | #' @seealso \code{\link{rodps.project.use}}, 6 | #' \code{\link{rodps.project.current}} 7 | NULL 8 | 9 | #' Change current project. 10 | #' 11 | #' @param projectname target projectname; make sure that you have the authority to access this Project. 12 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 13 | #' @seealso \code{\link{rodps.project.current}} 14 | #' @examples 15 | #' ## change project to prjb 16 | #' \dontrun{rodps.project.use('prjb')} 17 | #' @export 18 | rodps.project.use <- function(projectname) { 19 | .check.init() 20 | if (is.null(projectname) || projectname == "") { 21 | stop(error("invalid_project_name")) 22 | } 23 | odpsOperator$useProject(projectname) 24 | } 25 | 26 | #' Show current project name. 27 | #' 28 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 29 | #' @seealso \code{\link{rodps.project.use}} 30 | #' @examples 31 | #' ## get current project name 32 | #' \dontrun{rodps.project.current()} 33 | #' @export 34 | rodps.project.current <- function() { 35 | .check.init() 36 | return(odpsOperator$getProjectName("")) 37 | } 38 | 39 | #' @rdname rodps.project.current 40 | #' @export 41 | rodps.current.project <- rodps.project.current 42 | -------------------------------------------------------------------------------- /R/rodps_sql.R: -------------------------------------------------------------------------------- 1 | .rodps.bigSql <- function(query, mcqa = FALSE, memsize = 10737518240, thread = 8) { 2 | .check.init() 3 | if (is.null(query) || query == "") { 4 | stop(error("input_query_error", "query is null")) 5 | } 6 | postfix <- paste(sample(c(letters[1:6], 0:9), 30, replace = TRUE), collapse = "") 7 | tmptable <- paste("rodps_result_", postfix, sep = "") 8 | query <- paste("CREATE TABLE ", tmptable, " LIFECYCLE 3 AS ", query, sep = "") 9 | 10 | odpsOperator$runSqlTask(query, mcqa) 11 | length <- rodps.table.size(tmptable) 12 | 13 | if (length > memsize) { 14 | x <- tmptable 15 | attr(x, "result:size") <- length 16 | return(x) 17 | } else { 18 | result <- try(rodps.table.read(tmptable, memsize = memsize, thread = thread)) 19 | odpsOperator$runSqlTask(paste("DROP TABLE ", tmptable, sep = ""), mcqa) 20 | if ("try-error" == class(result)) { 21 | stop(paste("Exception ocurred when loading table:", tmptable, sep = "")) 22 | } else { 23 | return(result) 24 | } 25 | } 26 | } 27 | 28 | #' SQL Command 29 | #' 30 | #' Run SQL command and return result(in data.frame type). 31 | #' 32 | #' @param query SQL string 33 | #' @param mcqa Whether enable MCQA or not 34 | #' @param result.table.limit The size limit of resulted table as engine side table or fetched data frame. 35 | #' @param thread The threading number to read table data when the table size is larger than `result.table.limit`. 36 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 37 | #' @seealso \code{\link{RODPS}}, \code{\link{rodps.table}}, 38 | #' \code{\link{rodps.project}} 39 | #' @examples 40 | #' ## select the data of 'sales' in January ,and store the result in data.frame 41 | #' \dontrun{ data <- rodps.sql('select * from sales where month=1')} 42 | #' @export 43 | rodps.sql <- function(query, mcqa = FALSE, result.table.limit = 10737518240, thread = 8) { 44 | .check.init() 45 | if (is.null(query) || query == "") { 46 | stop(error("input_query_error", "query is null")) 47 | } 48 | type <- blacklist(query) 49 | if (as.logical(type) && type == TRUE) { 50 | stop(error("input_query_error", paste("rodps.sql does not support '", query, 51 | "' command", sep = ""))) 52 | } 53 | 54 | # set odps.instance.priority 55 | if (grepl("set", query) && grepl("odps.instance.priority", query) && grepl("[", 56 | query, fixed = TRUE) && grepl("]", query, fixed = TRUE)) { 57 | query_1 <- strsplit(query, "[", fixed = TRUE) 58 | query_1[[1]][1] 59 | query_1[[1]][2] 60 | query_2 <- strsplit(query_1[[1]][2], "]", fixed = TRUE) 61 | if (!is.na(query_2[[1]][2])) { 62 | query <- paste(query_1[[1]][1], query_2[[1]][2], sep = "") 63 | } else { 64 | query <- query_1[[1]][1] 65 | } 66 | if (nchar(query_2) > 0) { 67 | odpsOperator$runSqlTask(query_2[[1]], mcqa) 68 | } 69 | if (nchar(query) < 1) { 70 | return(TRUE) 71 | } 72 | } 73 | 74 | if (type == "select") { 75 | return(.rodps.bigSql(query, mcqa = mcqa, memsize = result.table.limit, thread = thread)) 76 | } 77 | ret <- odpsOperator$runSqlTask(query, mcqa) 78 | if (is.null(ret) || ret$size() == 0) { 79 | return(NULL) 80 | } 81 | if (ret$size() == 1) { 82 | return(strsplit(ret$get(as.integer(0)), "\n")) 83 | } 84 | vlist <- list() 85 | for (i in c(0:(ret$size() - 1))) { 86 | vlist[i + 1] <- ret$get(as.integer(i)) 87 | } 88 | return(vlist) 89 | } 90 | 91 | #' @rdname rodps.sql 92 | #' @export 93 | rodps.query <- rodps.sql 94 | 95 | # 不支持运行的query 96 | blacklist <- function(query) { 97 | tokens <- strsplit(tolower(query), "\\s+", fixed = FALSE) 98 | if (length(tokens[[1]]) == 0) { 99 | return(FALSE) 100 | } 101 | if (length(tokens[[1]]) >= 2 && tokens[[1]][1] == "") { 102 | head <- tokens[[1]][2] 103 | } else { 104 | head <- tokens[[1]][1] 105 | } 106 | if (head == "use" || head == "read") { 107 | return(TRUE) 108 | } 109 | return(head) 110 | } 111 | -------------------------------------------------------------------------------- /R/rodps_str.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | str.rodps.data <- function(rd) { 3 | rodps.str(rd@tbl) 4 | } 5 | 6 | #' Display Table 7 | #' 8 | #' Print table as formatted string. 9 | #' 10 | #' @param tbl RODPS Table object 11 | #' @return Formatted string. 12 | #' @seealso [str.rodps.data()] 13 | #' @export 14 | rodps.str <- function(tbl) { 15 | obs <- rodps.table.rows(tbl) 16 | des <- rodps.table.desc(tbl) 17 | vars <- nrow(des$columns) 18 | 19 | # load 10 records to display 20 | sql <- sprintf(" select * from %s limit 10;", tbl) 21 | d <- rodps.sql(sql) 22 | 23 | cat(sprintf("'rodps.data':\t%d obs. of %d variables:\n", obs, vars)) 24 | for (i in seq(1:length(d))) { 25 | if (class(names(d)[i]) == "character") { 26 | rows <- min(obs, 4) 27 | } else { 28 | rows <- min(obs, 10) 29 | } 30 | cat(sprintf(" $ %s: %s %s", format(names(d)[i], width = max(nchar(names(d)))), 31 | format(class(d[, i]), width = max(nchar(sapply(df, "class")))), paste(d[, 32 | i][1:rows], collapse = " "))) 33 | if (obs > 10) { 34 | cat(" ... \n") 35 | } else { 36 | cat("\n") 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /R/rodps_table.R: -------------------------------------------------------------------------------- 1 | #' @name rodps.table 2 | #' @title RODPS Table Functions 3 | #' @description Provide functions to operate table. 4 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 5 | #' @seealso \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 6 | #' \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 7 | #' \code{\link{rodps.table.list}}, \code{\link{rodps.table.rows}}, 8 | #' \code{\link{rodps.table.size}}, \code{\link{rodps.table.read}}, 9 | #' \code{\link{rodps.table.write}} 10 | NULL 11 | 12 | #' Table Head 13 | #' 14 | #' Create odps.data and odps.vector in S4. 15 | #' Store the head result in a temp table 16 | #' 17 | #' @export 18 | head.rodps.data <- function(rd, n = 6L) { 19 | rodps.table.head(rd@tbl, n) 20 | } 21 | 22 | #' Table Head 23 | #' 24 | #' Show a few of head rows of table. 25 | #' 26 | #' @param tbl Table name 27 | #' @param n The number of head rows 28 | #' @export 29 | rodps.table.head <- function(tbl, n = 6L) { 30 | # could be optimized by identify the tbl/partiton tbl/view 31 | sql <- sprintf("select * from %s limit %d;", tbl, n) 32 | df <- rodps.sql(sql) 33 | df 34 | } 35 | 36 | #' Split full table name into table name and project name 37 | #' @param ftn Full table name. 38 | #' @export 39 | rodps.split.ftn <- function(ftn) { 40 | if (is.null(ftn) || !is.character(ftn) || nchar(ftn) == 0 || length(ftn) > 1) { 41 | stop("Invalid table name ") 42 | } 43 | p.t <- unlist(strsplit(ftn, "[.]")) 44 | if (length(p.t) > 2 || length(p.t) < 1) { 45 | stop("Invalid table name ") 46 | } 47 | ret <- list() 48 | if (length(p.t) == 1) { 49 | ret$tablename <- p.t[1] 50 | } else { 51 | ret$projectname <- p.t[1] 52 | ret$tablename <- p.t[2] 53 | } 54 | return(ret) 55 | } 56 | 57 | #' Table Existence 58 | #' 59 | #' Check whether a table exists. 60 | #' 61 | #' @param full.tablename table name. 62 | #' @param partition partition spec, default NULL. 63 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 64 | #' @examples 65 | #' \dontrun{rodps.table.exist('mytable')} 66 | #' @seealso \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 67 | #' \code{\link{rodps.table.partitions}}, \code{\link{rodps.table.list}}, 68 | #' \code{\link{rodps.table.rows}}, \code{\link{rodps.table.size}}, 69 | #' \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 70 | #' @export 71 | rodps.table.exist <- function(full.tablename, partition = NULL) { 72 | .check.init() 73 | p.t <- rodps.split.ftn(full.tablename) 74 | projectname <- p.t$projectname 75 | tablename <- p.t$tablename 76 | 77 | if (is.null(projectname)) { 78 | projectname <- rodps.project.current() 79 | } 80 | 81 | .check.tablename(tablename) 82 | tableExist <- odpsOperator$isTableExist(.jnew("java/lang/String", projectname), 83 | tablename, partition) 84 | return(tableExist) 85 | } 86 | 87 | #' @rdname rodps.table.exist 88 | #' @export 89 | rodps.exist.table <- rodps.table.exist 90 | 91 | #' List Tables 92 | #' 93 | #' List all tables in the project, default in current project. 94 | #' 95 | #' @param pattern Partition pattern, use '*' or specific PartitionName. 96 | #' @param projectname Specific project to query,default is current project. 97 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 98 | #' @examples 99 | #' ##list the tables in current project 100 | #' \dontrun{rodps.table.list()} 101 | #' @seealso \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 102 | #' \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 103 | #' \code{\link{rodps.table.rows}}, \code{\link{rodps.table.size}}, 104 | #' \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 105 | #' @export 106 | rodps.table.list <- function(pattern = NULL, projectname = NULL) { 107 | .check.init() 108 | 109 | if (is.null(projectname)) { 110 | projectname <- rodps.project.current() 111 | } else { 112 | rodps.project.use(projectname) 113 | } 114 | 115 | tables <- try(odpsOperator$getTables(projectname, pattern)) 116 | if ("try-error" %in% class(tables)) { 117 | stop("Exception occured when listing tables") 118 | } 119 | return(.change.data(tables)) 120 | } 121 | 122 | #' @rdname rodps.table.list 123 | #' @export 124 | rodps.list.table <- rodps.table.list 125 | 126 | #' @rdname rodps.table.list 127 | #' @export 128 | rodps.list.tables <- rodps.table.list 129 | 130 | #' List Partitions 131 | #' 132 | #' List partitions of a table. Raise ERROR if the table has no partition. 133 | #' 134 | #' @param full.tablename, Table name, in format of 'ProjectName.TableName' or 135 | #' 'TableName' (using current project). 136 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 137 | #' @examples 138 | #' ## list partitions of 'sales' 139 | #' \dontrun{rodps.table.partitions('sales')} 140 | #' @seealso \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 141 | #' \code{\link{rodps.table.exist}}, \code{\link{rodps.table.list}}, 142 | #' \code{\link{rodps.table.rows}}, \code{\link{rodps.table.size}}, 143 | #' \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 144 | #' @export 145 | rodps.table.partitions <- function(full.tablename) { 146 | .check.init() 147 | df <- rodps.sql(paste("show partitions", full.tablename)) 148 | return(df) 149 | } 150 | 151 | #' @rdname rodps.table.partitions 152 | #' @export 153 | rodps.partitions.table <- rodps.table.partitions 154 | 155 | #' Drop Table 156 | #' 157 | #' Delete table if it exists. 158 | #' 159 | #' @param full.tablename Table name. 160 | #' @param partition Partition spec. 161 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 162 | #' @examples 163 | #' \dontrun{rodps.table.drop('sales_backup')} 164 | #' @seealso \code{\link{rodps.table.desc}}, \code{\link{rodps.table.exist}}, 165 | #' \code{\link{rodps.table.partitions}}, \code{\link{rodps.table.list}}, 166 | #' \code{\link{rodps.table.rows}}, \code{\link{rodps.table.size}}, 167 | #' \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 168 | #' @export 169 | rodps.table.drop <- function(full.tablename, partition = NULL) { 170 | .check.init() 171 | p.t <- rodps.split.ftn(full.tablename) 172 | 173 | projectname <- p.t$projectname 174 | tablename <- p.t$tablename 175 | 176 | .check.tablename(tablename) 177 | if (!is.null(projectname)) { 178 | ftn <- paste(projectname, ".", tablename, sep = "") 179 | } else { 180 | ftn <- tablename 181 | } 182 | if (is.null(partition)) { 183 | sql <- paste("drop table if exists", ftn) 184 | } else { 185 | sql <- paste("alter table", ftn, "drop partition(", partition, ")") 186 | } 187 | rodps.sql(sql) 188 | return(TRUE) 189 | } 190 | 191 | #' @rdname rodps.table.drop 192 | #' @export 193 | rodps.drop.table <- rodps.table.drop 194 | 195 | #' Convert pt|string| into dataframe 196 | #' @noRd 197 | .column.to.dataframe <- function(cols) { 198 | len <- length(cols) 199 | names <- c() 200 | types <- c() 201 | comments <- c() 202 | for (i in 1:len) { 203 | items <- strsplit(cols[i], "|", fixed = TRUE) 204 | names[i] <- items[[1]][1] 205 | types[i] <- items[[1]][2] 206 | if (length(items[[1]]) > 2) { 207 | comments[i] <- items[[1]][3] 208 | } else { 209 | comments[i] <- NA 210 | } 211 | } 212 | return(data.frame(names, types, comments, stringsAsFactors = FALSE)) 213 | } 214 | 215 | #' Table Description 216 | #' 217 | #' Show description of a table, including metadata of 218 | #' Owner, Project, Comment, Create_time, Last_modified_time, Size, Columns. 219 | #' 220 | #' @param full.tablename Table name, in format 'ProjectName.TableName',or 221 | #' 'TableName' (using current project). 222 | #' @param partition Partition spec 223 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 224 | #' @examples 225 | #' ## show description of 'dual' 226 | #' \dontrun{rodps.table.desc('dual')} 227 | #' @seealso \code{\link{rodps.table.drop}}, \code{\link{rodps.table.exist}}, 228 | #' \code{\link{rodps.table.partitions}}, \code{\link{rodps.table.list}}, 229 | #' \code{\link{rodps.table.rows}}, \code{\link{rodps.table.size}}, 230 | #' \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 231 | #' @export 232 | rodps.table.desc <- function(full.tablename, partition = NULL) { 233 | .check.init() 234 | p.t <- rodps.split.ftn(full.tablename) 235 | projectname <- p.t$projectname 236 | tablename <- p.t$tablename 237 | 238 | if (is.null(projectname)) { 239 | projectname <- rodps.project.current() 240 | } 241 | 242 | .check.tablename(tablename) 243 | tableMeta <- odpsOperator$describeTable(.jnew("java/lang/String", projectname), 244 | tablename, partition) 245 | ret <- .change.to.list(tableMeta) 246 | ret$columns = .column.to.dataframe(ret$columns) 247 | if (length(ret$partition_keys) > 0) { 248 | ret$partition_keys <- .column.to.dataframe(ret$partition_keys) 249 | } 250 | if ("windows" == .Platform$OS.type) { 251 | ret$comment <- iconv(ret$comment, "utf-8", "gbk") 252 | ret$columns$comments <- iconv(ret$columns$comments, "utf-8", "gbk") 253 | } 254 | return(ret) 255 | } 256 | 257 | #' @rdname rodps.table.desc 258 | #' @export 259 | rodps.desc.table <- rodps.table.desc 260 | 261 | #' Table Size 262 | #' 263 | #' Get the size of table in Bytes. 264 | #' 265 | #' @param full.tablename Table name, in format 'ProjectName.TableName',or 266 | #' 'TableName' (using current project). 267 | #' @param partition Partition spec 268 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 269 | #' @examples 270 | #' ## get the size of 'sales' 271 | #' \dontrun{rodps.table.size('sales')} 272 | #' @seealso \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 273 | #' \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 274 | #' \code{\link{rodps.table.list}}, \code{\link{rodps.table.rows}}, 275 | #' \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 276 | #' @export 277 | rodps.table.size <- function(full.tablename, partition = NULL) { 278 | .check.init() 279 | p.t <- rodps.split.ftn(full.tablename) 280 | projectname <- p.t$projectname 281 | tablename <- p.t$tablename 282 | 283 | .check.tablename(tablename) 284 | size <- odpsOperator$getTableSize(.jnew("java/lang/String", projectname), tablename, 285 | partition) 286 | 287 | return(size) 288 | } 289 | 290 | #' @rdname rodps.table.size 291 | #' @export 292 | rodps.size.table <- rodps.table.size 293 | 294 | #' @noRd 295 | .check.column.name <- function(colname) { 296 | if (length(grep("[.]|[$]", colname)) > 0 || nchar(colname) > 128 || substr(colname, 297 | 1, 1) == "_") 298 | stop(paste("Invalid column name", colname)) 299 | } 300 | 301 | #' DDL Generation 302 | #' 303 | #' Generate SQL DDL from dataframe. 304 | #' 305 | #' @param full.tablename Table name, in format 'ProjectName.TableName',or 306 | #' 'TableName' (using current project). 307 | #' @param dataframe Source data frame. 308 | #' @param tablecomment DDL comment string. 309 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 310 | #' @export 311 | rodps.generate.DDL <- function(full.tablename, dataframe, tablecomment = NULL) { 312 | p.t <- rodps.split.ftn(full.tablename) 313 | projectname <- p.t$projectname 314 | tablename <- p.t$tablename 315 | 316 | .check.tablename(tablename) 317 | if (!is.data.frame(dataframe)) { 318 | stop("dataframe should be data.frame type") 319 | } 320 | 321 | namelist <- names(dataframe) 322 | if (length(namelist) == 0) { 323 | stop("Zero columes in dataframe") 324 | } 325 | for (n in namelist) .check.column.name(n) 326 | 327 | typelist <- sapply(dataframe, .get.object.type) 328 | 329 | sql <- paste("CREATE TABLE ", full.tablename, " (\n", sep = "") 330 | ncol <- length(namelist) 331 | ntype <- length(typelist) 332 | 333 | for (i in seq(1, ncol)) { 334 | if (i != ncol) { 335 | sql <- paste(sql, " ", namelist[i], "\t", typelist[i], ",\n", sep = "") 336 | } else { 337 | sql <- paste(sql, " ", namelist[i], "\t", typelist[i], ")", sep = "") 338 | } 339 | } 340 | if (!is.null(tablecomment)) { 341 | sql <- paste(sql, "\nCOMMENT '", tablecomment, "'", sep = "") 342 | } 343 | sql <- paste(sql, ";", sep = "") 344 | return(sql) 345 | } 346 | 347 | #' Write Table 348 | #' 349 | #' Write 'dataframe' into 'full.tablename' of ODPS, make sure the target table 350 | #' 'full.tablename' is not exist. Dataframe can be written to a non-exist table 351 | #' or partition. 352 | #' 353 | #' @param dataframe Data in data.frame type, make sure the ColumnName is 354 | #' allowable in ODPS. 355 | #' @param full.tablename Table name, in format 'ProjectName.TableName',or 356 | #' 'TableName' (using current project). 357 | #' @param partition Partition spec. 358 | #' @param tablecomment Table comment. 359 | #' @param isdebug Boolean value, if debugging is enabled. 360 | #' @param thread Thread number. 361 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 362 | #' @examples 363 | #' ## write data.frame into 'mytable' 364 | #' \dontrun{ x<-data.frame(c1=1:10,c2=1:10)} 365 | #' \dontrun{ rodps.table.write(x,'mytable')} 366 | #' @seealso \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 367 | #' \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 368 | #' \code{\link{rodps.table.list}}, \code{\link{rodps.table.rows}}, 369 | #' \code{\link{rodps.table.size}}, \code{\link{rodps.table.read}} 370 | #' @export 371 | rodps.table.write <- function(dataframe, full.tablename, partition = NULL, tablecomment = NULL, 372 | isdebug = FALSE, thread = 8) { 373 | .check.init() 374 | p.t <- rodps.split.ftn(full.tablename) 375 | projectname <- p.t$projectname 376 | tablename <- p.t$tablename 377 | 378 | if (is.null(projectname)) { 379 | projectname <- rodps.project.current() 380 | } 381 | 382 | .check.tablename(tablename) 383 | if (!is.data.frame(dataframe)) { 384 | stop("dataframe should be class of data.frame") 385 | } 386 | 387 | if (length(colnames(dataframe)) == 0) { 388 | stop("dataframe should have at least one column") 389 | } 390 | 391 | if (!is.null(partition) && !rodps.table.exist(full.tablename)) { 392 | stop(sprintf("Table not exists,table=%s partition=%s", full.tablename, partition)) 393 | } 394 | sql <- NULL 395 | if (!rodps.table.exist(full.tablename)) { 396 | sql <- rodps.generate.DDL(full.tablename, dataframe, tablecomment) 397 | } 398 | if (!is.null(partition) && !rodps.table.exist(full.tablename, partition)) { 399 | sql <- paste("alter table", full.tablename, "add partition(", odpsOperator$formatPartition(partition, 400 | "'", ","), ")") 401 | } 402 | if (!is.null(sql)) { 403 | ret <- try(rodps.sql(sql)) 404 | if ("try-error" %in% class(ret)) { 405 | cat("Exception occured when creating table\n") 406 | cat(sql) 407 | cat("\n") 408 | } 409 | } 410 | 411 | if (nrow(dataframe) == 0) { 412 | return(TRUE) 413 | } 414 | 415 | tempprefix <- paste("rodps", ceiling(runif(1, 1, 1e+06)), "_", sep = "") 416 | filename <- tempfile(tempprefix, rodpsTmpdir) 417 | 418 | actual_thread <- as.integer(thread) 419 | if (nrow(dataframe) < thread * 100) { 420 | actual_thread <- as.integer(1) 421 | } 422 | if ("windows" == .Platform$OS.type) { 423 | dataframe <- .dataframe.code.conv(dataframe, "", "UTF-8") 424 | } 425 | dbNames <- .dataframe.to.sqlite(dataframe, actual_thread, filename, tablename, 426 | isdebug) 427 | odpsOperator$writeTableFromDT(projectname, tablename, partition, filename, NULL, 428 | NULL, .jlong(length(dataframe[[1]])), actual_thread) 429 | if (!isdebug) { 430 | for (i in 1:length(dbNames)) { 431 | file.remove(dbNames[i]) 432 | } 433 | } 434 | return(TRUE) 435 | } 436 | 437 | #' @rdname rodps.table.write 438 | #' @export 439 | rodps.write.table <- rodps.table.write 440 | 441 | #' Reading Table 442 | #' 443 | #' Read data from ODPS and store in R data frame. 444 | #' 445 | #' @param full.tablename Table name 446 | #' @param partition Partition spec 447 | #' @param limit Limit the rows to read, '-1' for not limit. 448 | #' @param memsize Maximum data capacity. 449 | #' @param isdebug Boolean value, if debugging is enabled. 450 | #' @param thread Thread number. 451 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 452 | #' @examples 453 | #' ## show description of 'dual' 454 | #' \dontrun{ x<-rodps.table.read('sales',partition='ds=20180124',limit=100) } 455 | #' @seealso \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 456 | #' \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 457 | #' \code{\link{rodps.table.list}}, \code{\link{rodps.table.rows}}, 458 | #' \code{\link{rodps.table.size}}, \code{\link{rodps.table.write}} 459 | #' @export 460 | rodps.table.read <- function(full.tablename, partition = NULL, limit = -1, memsize = 10737518240, 461 | isdebug = FALSE, thread = 8) { 462 | .check.init() 463 | p.t <- rodps.split.ftn(full.tablename) 464 | projectname <- p.t$projectname 465 | tablename <- p.t$tablename 466 | 467 | .check.tablename(tablename) 468 | tablesize <- rodps.table.size(full.tablename, partition = partition) 469 | if ((tablesize > memsize) && (limit == -1)) { 470 | msg <- paste("whole table size (", tablesize, ") is larger than memsize (", 471 | memsize, "), can not be loaded.") 472 | stop(msg) 473 | } 474 | 475 | tempprefix <- paste("rodps", ceiling(runif(1, 1, 1e+05)), "_", sep = "") 476 | filename <- tempfile(tempprefix, rodpsTmpdir) 477 | 478 | results <- odpsOperator$loadTableFromDT(projectname, tablename, partition, filename, 479 | NULL, NULL, as.integer(limit), as.integer(thread)) 480 | 481 | if (3 != results$size()) { 482 | stop("Internal error with load table") 483 | } 484 | res <- .sqlite.to.dataframe(results$get(as.integer(2)), results$get(as.integer(1)), 485 | tablename, isdebug) 486 | if ("windows" == .Platform$OS.type) { 487 | res <- .dataframe.code.conv(res, "UTF-8", "") 488 | } 489 | return(res) 490 | } 491 | 492 | #' @rdname rodps.table.read 493 | #' @export 494 | rodps.read.table <- rodps.table.read 495 | 496 | #' @rdname rodps.table.read 497 | #' @export 498 | rodps.load.table <- rodps.table.read 499 | 500 | #' Table Rows 501 | #' 502 | #' Get the number of rows in a table. 503 | #' 504 | #' @param full.tablename Table name , in format of 'ProjectName.TableName' or 505 | #' 'TableName' (using current project) 506 | #' @param partition Partition spec. 507 | #' @author \email{yunyuan.zhangyy@alibaba-inc.com} 508 | #' @examples 509 | #' ## get the number of rows 510 | #' \dontrun{rodps.table.rows('sales')} 511 | #' @seealso \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 512 | #' \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 513 | #' \code{\link{rodps.table.list}}, \code{\link{rodps.table.size}}, 514 | #' \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 515 | #' @export 516 | rodps.table.rows <- function(full.tablename, partition = NULL) { 517 | .check.init() 518 | p.t <- rodps.split.ftn(full.tablename) 519 | projectname <- p.t$projectname 520 | tablename <- p.t$tablename 521 | 522 | .check.tablename(tablename) 523 | sz <- rodps.table.size(full.tablename) 524 | 525 | if (sz < 10 * 1024 * 1024 * 1024 || !is.null(partition) && partition != "") { 526 | sql <- sprintf(" count %s ", full.tablename) 527 | if (!is.null(partition) && partition != "") { 528 | sql <- paste(sql, "partition(", partition, ")") 529 | } 530 | v <- rodps.sql(sql) 531 | ret <- as.numeric(v[[1]]) 532 | } else { 533 | sql <- sprintf("select count(*) from %s", full.tablename) 534 | v <- rodps.sql(sql) 535 | ret <- as.numeric(v[1, 1]) 536 | } 537 | return(ret) 538 | } 539 | 540 | #' @rdname rodps.table.rows 541 | #' @export 542 | rodps.rows.table <- rodps.table.rows 543 | -------------------------------------------------------------------------------- /R/rodps_table_hist.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | hist.rodps.vector <- function(ov) { 3 | rodps.table.hist(ov@tbl, ov@col) 4 | } 5 | 6 | #' Table Histogram 7 | #' 8 | #' Extend hist. This function returns a list of breaks, counts, density, mids, xname, 9 | #' equidist, class attr from tbl; then plot with the list 10 | #' 11 | #' @export 12 | rodps.table.hist <- function(tblname, colname, breaks = NULL, freq = TRUE, include.lowest = TRUE, 13 | right = TRUE, main = paste("Histogram of ", colname), xlab = colname, ...) { 14 | 15 | if (!is.null(breaks) && !is.numeric(breaks)) { 16 | stop("Invalid breaks") 17 | } 18 | 19 | sql <- paste("select count(*), min(", colname, "), ", " max(", colname, ") from ", 20 | tblname, ";", sep = " ") 21 | cat("\n") 22 | mm <- rodps.sql(sql) 23 | 24 | # nclass.Sturges 25 | nbin <- ceiling(log2(mm[1, 1]) + 1) 26 | if (is.null(breaks)) { 27 | breaks <- pretty(c(mm[1, 3], mm[1, 2]), n = nbin) 28 | } else { 29 | if (length(breaks) <= 1) { 30 | stop("Invalid breaks length") 31 | } 32 | if (breaks[1] > mm[1, 2] || breaks[length(breaks)] < mm[1, 3]) { 33 | stop("Invalid breaks range") 34 | } 35 | } 36 | 37 | breaks <- sort(breaks) 38 | 39 | sql <- "select " 40 | lb <- breaks[1:(length(breaks) - 1)] 41 | ub <- breaks[2:length(breaks)] 42 | if (right) { 43 | cnt <- paste(" count (case when ", colname, ">", lb, " and ", colname, "<=", 44 | ub, " then 1 end)", sep = "") 45 | cntb <- paste(" count (case when ", colname, ">=", lb[1], " and ", colname, 46 | "<=", ub[1], " then 1 end)", sep = "") 47 | cnt[1] <- cntb 48 | } else { 49 | cnt <- paste(" count (case when ", colname, ">=", lb, " and ", colname, 50 | "<", ub, " then 1 end)", sep = "") 51 | cnte <- paste(" count (case when ", colname, ">=", lb[length(cnt)], " and ", 52 | colname, "<=", ub[length(cnt)], " then 1 end)", sep = "") 53 | cnt[length(cnt)] <- cnte 54 | } 55 | 56 | cnt <- paste(cnt, collapse = ",\n") 57 | 58 | sql <- sprintf("%s \n%s \n from %s", sql, cnt, tblname) 59 | cat(sql) 60 | 61 | cnt <- rodps.sql(sql) 62 | cnt <- t(unlist(cnt[1, ])) 63 | mids <- (lb + ub)/2 64 | dens <- cnt/(mm[1, 1] * diff(breaks)) 65 | equi <- diff(range(diff(breaks))) < 1e-07 66 | 67 | h <- list() 68 | h$breaks <- breaks 69 | h$counts <- cnt 70 | h$mids <- mids 71 | h$density <- dens 72 | h$xname <- colname 73 | h$equidist <- T 74 | class(h) <- "histogram" 75 | 76 | plot(h, freq = freq, main = main, xlab = xlab, ...) 77 | invisible(h) 78 | } 79 | 80 | #' @rdname rodps.table.hist 81 | #' @export 82 | rodps.hist <- rodps.table.hist 83 | -------------------------------------------------------------------------------- /R/rodps_table_sample.R: -------------------------------------------------------------------------------- 1 | #' Sample table 2 | #' 3 | #' @seealso \code{\link{rodps.table.sample.strat}} 4 | #' @export 5 | rodps.table.sample.srs <- function(srctable, tgttable, samplerate, cond = NULL, select = NULL) { 6 | rv <- round(runif(3) * 100) 7 | 8 | .check.tablename(srctable) 9 | .check.tablename(tgttable) 10 | if (!is.numeric(samplerate)) { 11 | stop("samplerate should be numeric") 12 | } 13 | 14 | if (is.null(select)) { 15 | sel = "*" 16 | } else { 17 | if (!is.character(select)) { 18 | stop("Select should be character") 19 | } 20 | sel = paste(select, sep = ",", collapse = ",") 21 | } 22 | 23 | if (!rodps.table.exist(srctable)) { 24 | stop(paste("Table not exists ", srctable)) 25 | } 26 | if (rodps.table.exist(tgttable)) { 27 | stop(paste("Target table already exists", tgttable)) 28 | } 29 | 30 | sql <- sprintf(" SELECT %s FROM %s", sel, srctable) 31 | if (!is.null(cond)) { 32 | if (!is.character(cond)) { 33 | stop("Invalid condition expression") 34 | } else { 35 | sql <- paste(sql, " WHERE ", cond) 36 | } 37 | } 38 | 39 | distby = sprintf(" DISTRIBUTE BY rand(%d)*10 SORT BY rand(%d)", rv[1], rv[2]) 40 | if (samplerate < 1) { 41 | # sample by percentage 42 | sql <- paste(sql, distby) 43 | sql <- paste(" CREATE TABLE ", tgttable, " AS \n SELECT * FROM (", sql, 44 | " ) sub \n WHERE rand(", rv[3], ")<= ", samplerate) 45 | } else { 46 | # sample by abs value 47 | sql <- paste(" CREATE TABLE ", tgttable, " AS \n SELECT * FROM ( ", sql, 48 | distby, " ) sub \n LIMIT ", samplerate) 49 | } 50 | 51 | ret <- try(rodps.sql(sql)) 52 | if ("try-error" %in% class(ret)) { 53 | cat("Exception occurred when executing sql \n") 54 | cat(sql) 55 | cat("\n") 56 | return(FALSE) 57 | } 58 | return(T) 59 | } 60 | 61 | #' @rdname rodps.table.sample.srs 62 | #' @export 63 | rodps.sample.srs <- rodps.table.sample.srs 64 | 65 | #' Sample Table 66 | #' 67 | #' The sample strategy is as similar as: 68 | #' 69 | #' select abc from ( *, row_number() over( partition by g order by 70 | #' rand()) r_rn, rand() as r_select ) sub 71 | #' 1. by percent sub where r_select < rate 72 | #' 2. by number sub where rn <= rate 73 | #' 74 | #' @seealso \code{\link{rodps.table.sample.srs}} 75 | #' @export 76 | rodps.table.sample.strat <- function(srctable, tgttable, samplerate, strat, select = NULL) { 77 | .check.tablename(srctable) 78 | .check.tablename(tgttable) 79 | 80 | if (!is.numeric(samplerate)) { 81 | stop("sample rate should be numeric ") 82 | } 83 | if (!is.character(strat)) { 84 | stop("strat should be character") 85 | } 86 | if (!is.null(select) && !is.character(select)) { 87 | stop("select should be character") 88 | } 89 | if (rodps.table.exist(tgttable)) { 90 | stop("target table already exists") 91 | } 92 | 93 | rv <- round(runif(3) * 100) 94 | if (is.null(select)) { 95 | des <- rodps.table.desc(srctable) 96 | cols <- paste(des$columns$names, collapse = ",") 97 | } else { 98 | cols <- paste(select, collapse = ",") 99 | } 100 | pcols <- paste(strat, collapse = ",") 101 | 102 | temp <- "CREATE TABLE %s AS \n SELECT %s FROM ( \n SELECT %s, \n row_number() OVER (PARTITION BY %s ORDER BY rand(%d)) sel_rownumber, \n rand(%d) sel_random FROM %s) sub" 103 | sql <- sprintf(temp, tgttable, cols, cols, pcols, rv[1], rv[2], srctable) 104 | 105 | if (samplerate < 1) { 106 | sql <- paste(sql, " WHERE sel_random <= ", samplerate) 107 | } else { 108 | sql <- paste(sql, " WHERE sel_rownumber <= ", samplerate) 109 | } 110 | 111 | ret <- try(rodps.sql(sql)) 112 | if ("try-error" %in% class(ret)) { 113 | cat("Exception occurred when executing sql\n") 114 | cat(sql) 115 | cat("\n") 116 | } 117 | return(TRUE) 118 | } 119 | 120 | #' @rdname rodps.table.sample.strat 121 | #' @export 122 | rodps.sample.strat <- rodps.table.sample.strat 123 | -------------------------------------------------------------------------------- /R/rodps_table_summary.R: -------------------------------------------------------------------------------- 1 | #' Store unique result in a temp table 2 | #' 3 | #' @export 4 | summary.rodps.data <- function(rd) { 5 | rodps.table.summary(rd@tbl) 6 | } 7 | 8 | #' Table Summary 9 | #' 10 | #' Get statistical summaries of a table. 11 | #' @export 12 | rodps.table.summary <- function(tbl) { 13 | stblname <- paste("rodps_", paste(sample(c(letters[1:6], 0:9), 30, replace = TRUE), 14 | collapse = "", sep = ""), sep = "") 15 | rtblname <- paste("rodps_", paste(sample(c(letters[1:6], 0:9), 30, replace = TRUE), 16 | collapse = "", sep = ""), sep = "") 17 | 18 | des <- rodps.table.desc(tbl) 19 | rows <- rodps.table.rows(tbl) 20 | dblcols <- des$columns[which(des$columns$types == "double"), ] 21 | cols <- paste(which(des$columns$types == "double") - 1, collapse = ",") 22 | 23 | if (nrow(dblcols) > 0) { 24 | sql <- sprintf("sort_rank -i %s -o %s,%s -c %s ", tbl, stblname, rtblname, 25 | cols) 26 | rodps.sql(sql) 27 | 28 | sql <- sprintf("select colname, percentile,pctvalue from %s where percentile in (0.0, 25.0, 50.0, 75.0, 100.0);", 29 | rtblname) 30 | df <- rodps.sql(sql) 31 | 32 | rodps.table.drop(stblname) 33 | rodps.table.drop(rtblname) 34 | } else { 35 | df <- data.frame() 36 | } 37 | 38 | # get mean value 39 | meancols <- paste("avg(", des$columns[which(des$columns$types == "double"), ]$names, 40 | ")", sep = "", collapse = ",") 41 | nacols <- paste(" count(case when ", des$columns$names, " is null then 1 end)", 42 | sep = "", collapse = ",") 43 | if (nrow(dblcols) > 0) { 44 | sql <- sprintf("select %s , %s from %s;", meancols, nacols, tbl) 45 | } else { 46 | sql <- sprintf("select %s from %s;", nacols, tbl) 47 | } 48 | df2 <- rodps.sql(sql) 49 | 50 | # assemble output 51 | tmp <- list() 52 | allcols = des$columns 53 | dbli <- 0 54 | for (i in seq(1:nrow(allcols))) { 55 | coltype = allcols[i, ]$types 56 | if (coltype == "double") { 57 | dbli <- dbli + 1 58 | # min, 1st qu, median, mean, 3rd qu. max. na 59 | minv <- df[which(df$colname == allcols[i, ]$names & df$percentile == 60 | 0), ]$pctvalue 61 | fstqv <- df[which(df$colname == allcols[i, ]$names & df$percentile == 62 | 25), ]$pctvalue 63 | medianv <- df[which(df$colname == allcols[i, ]$names & df$percentile == 64 | 50), ]$pctvalue 65 | meanv <- df2[1, dbli] 66 | trdqv <- df[which(df$colname == allcols[i, ]$names & df$percentile == 67 | 75), ]$pctvalue 68 | maxv <- df[which(df$colname == allcols[i, ]$names & df$percentile == 69 | 100), ]$pctvalue 70 | nav <- df2[1, nrow(dblcols) + i] 71 | tmpcol <- c(sprintf("Min. :%.2f", minv), sprintf("1st Qu.:%.2f", fstqv), 72 | sprintf("Median :%.2f", medianv), sprintf("Mean :%.2f", meanv), 73 | sprintf("3rd Qu.:%.2f", trdqv), sprintf("Max. :%.2f", maxv), ifelse(nav > 74 | 0, sprintf("NA's : %d", nav), NA)) 75 | } else { 76 | lenv <- sprintf("Length:%d", rows) 77 | classv <- sprintf("Class :%s", allcols[i, ]$types) 78 | nav <- df2[1, nrow(dblcols) + i] 79 | tmpcol <- c(lenv, classv, ifelse(nav > 0, sprintf("NA's : %d", nav), 80 | NA), NA, NA, NA, NA) 81 | } 82 | tmp[[i]] <- tmpcol 83 | } 84 | 85 | tmp <- unlist(tmp) 86 | dim(tmp) <- c(7, nrow(allcols)) 87 | dimnames(tmp)[[1]] <- rep("", 7) 88 | dimnames(tmp)[[2]] <- allcols$names 89 | class(tmp) <- "table" 90 | tmp 91 | } 92 | -------------------------------------------------------------------------------- /R/rodps_version.R: -------------------------------------------------------------------------------- 1 | rodps.version <- function() { 2 | print("RODPS 2.1.5") 3 | } 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RODPS: ODPS Plugin for R 2 | 3 | [![Building RODPS](https://github.com/aliyun/aliyun-odps-r-plugin/actions/workflows/building.yaml/badge.svg?branch=master)](https://github.com/aliyun/aliyun-odps-r-plugin/actions/workflows/building.yaml) 4 | 5 | 6 | ## Features 7 | 8 | - Read/write dataframe from/to ODPS. 9 | - Convert some of the R models to SQL command. 10 | - The large data set can be processed by using the distributed algorithm. 11 | - The small data set can be processed directly in R. 12 | 13 | ## Requirements 14 | 15 | System dependencies: 16 | 17 | - Java 8+ 18 | - R 1.8+ 19 | 20 | R libraries: 21 | 22 | - [rJava](https://cran.r-project.org/web/packages/rJava/index.html) 23 | - [DBI](https://cran.r-project.org/web/packages/DBI/index.html) 24 | - [RSQLite](https://cran.r-project.org/web/packages/RSQLite/index.html) 25 | 26 | ## Installation 27 | 28 | 1. Install the R dependencies: 29 | 30 | ```R 31 | install.packages('DBI') 32 | install.packages('rJava') 33 | install.packages('RSQLite') 34 | ``` 35 | 2. Install RODPS 36 | 37 | 2.1. Install from release package 38 | 39 | Check out the latest version on [release page](https://github.com/aliyun/aliyun-odps-r-plugin/releases). As for version 2.1.3, for example: 40 | 41 | ```R 42 | install.packages('https://github.com/aliyun/aliyun-odps-r-plugin/releases/download/v2.1.3/RODPS_2.1.3.tar.gz', type="source", repos=NULL) 43 | ``` 44 | 45 | 2.2. Install with `devtools` packages 46 | 47 | This method requires JDK and Maven executables to build java module. 48 | 49 | ```R 50 | install_github("aliyun/aliyun-odps-r-plugin") 51 | ``` 52 | 53 | 2.3 Install from CRAN (**Under development**) 54 | 55 | ## Getting Started 56 | 57 | 1. Please make sure the environment variable `RODPS_CONFIG` is set to `/path/to/odps_config.ini` 58 | 59 | ```bash 60 | export RODPS_CONFIG=/path/to/odps_config.ini 61 | ``` 62 | 63 | See the configuration template: [odps_config.ini.template](examples/odps_config.ini.template) 64 | 65 | 2. Basic Usage 66 | 67 | * [Basic project and SQL functions](https://github.com/aliyun/aliyun-odps-r-plugin/blob/master/tests/test_rodps_basics.R) 68 | * [Basic table functions](https://github.com/aliyun/aliyun-odps-r-plugin/blob/master/tests/test_rodps_table.R) 69 | 70 | ## Under the Hood 71 | 72 | ### Design Architecture 73 | 74 | For the mind map of related concepts, please refer to the [MindMapDoc](docs/mindmap.pdf) 75 | 76 | ### Type System 77 | 78 | **All numeric in R have possibility of precision loss.** 79 | 80 | | MaxCompute/ODPS | R | Notes | 81 | |-----------------|---|-------| 82 | | BOOLEAN | logical | | 83 | | BIGINT | numeric | \[-9223372036854774784, 9223372036854774784\] * | 84 | | INT | numeric | | 85 | | TINYINT | numeric | | 86 | | SMALLINT | numeric | | 87 | | DOUBLE | numeric | | 88 | | FLOAT | numeric | | 89 | | DATETIME | numeric | POSIXct POSIXlt, in second | 90 | | DATE | numeric | POSIXct POSIXlt, in second | 91 | | TIMESTAMP | numeric | POSIXct POSIXlt, in second | 92 | | INTERVAL_YEAR_MONTH | numeric | in month | 93 | | INTERVAL_DATE_TIME | numeric | in second | 94 | | DECIMAL | numeric | | 95 | | STRING | character | | 96 | | CHAR | character | | 97 | | VARCHAR | character | | 98 | | BINARY | character | | 99 | | MAP | - | unsupport | 100 | | ARRAY | - | unsupport | 101 | | STRUCT | - | unsupport | 102 | 103 | * BIGINT(64bit) from MaxCompute is stored and calculated as double(64bit) in RODPS. Precision loss might happen when casting BIGINT to double, which shrinks the min/max value could be written back to MaxCompute/ODPS. 104 | 105 | ### Trouble shooting 106 | 107 | - For Windows users: DO NOT install BOTH 32bit and 64bit R on your system, which will introduce compiling issues in the installation of `rJava`. 108 | 109 | ## License 110 | 111 | Licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0.html) 112 | -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | basepath=$( 5 | cd "$(dirname "$0")" 6 | pwd 7 | ) 8 | 9 | RVERSION=2.1.6.3 10 | VERSIONDATE=$(date +"%Y-%m-%d %H:%M:%S") 11 | 12 | # Constants for colors 13 | RED='\033[0;31m' 14 | GREEN='\033[0;32m' 15 | YELLOW='\033[0;33m' 16 | NC='\033[0m' # No Color 17 | 18 | function print_message { 19 | local color=$1 20 | shift 21 | printf "${color}[%s] %s${NC}\n" "$(date '+%Y-%m-%dT%H:%M:%S')" "$@" 22 | } 23 | 24 | function INFO { 25 | print_message "$GREEN" "$@" 26 | } 27 | 28 | function WARN { 29 | print_message "$YELLOW" "$@" 30 | } 31 | 32 | function ERROR { 33 | print_message "$RED" "$@" 34 | } 35 | 36 | # Check for Java and Maven dependencies 37 | function check_dependencies { 38 | local missing_deps=() 39 | 40 | command -v java >/dev/null 2>&1 || missing_deps+=("Java") 41 | command -v mvn >/dev/null 2>&1 || missing_deps+=("Maven") 42 | 43 | if [ ${#missing_deps[@]} -gt 0 ]; then 44 | ERROR "Missing dependencies: ${missing_deps[*]}" && exit 1 45 | fi 46 | } 47 | 48 | # Check sed command compatibility 49 | SED_CMD="sed -i" 50 | if sed -i '' /dev/null 2>/dev/null; then 51 | SED_CMD="sed -i ''" 52 | fi 53 | 54 | INFO "Configuring RODPS package, version ${RVERSION}..." 55 | 56 | # Update description version 57 | $SED_CMD -E "s|Version: .*|Version: ${RVERSION}|g" "$basepath/DESCRIPTION" 58 | 59 | cat >$basepath/R/odps_version.R <<__EOF__ 60 | rodps.version <- function() { 61 | print("RODPS ${RVERSION}") 62 | } 63 | __EOF__ 64 | 65 | function build_java_src { 66 | INFO "Building Java lib located at $basepath/java" 67 | 68 | libpath=${basepath}/inst/java 69 | if [ -e ${libpath} ]; then rm -rf $libpath; fi 70 | mkdir -p ${libpath} 71 | 72 | # mvn package 73 | cd $basepath/java 74 | mvn clean 75 | mvn versions:set -DnewVersion=${RVERSION} 76 | mvn package -DskipTests 77 | cd $basepath 78 | 79 | # copy jars & log4j config to libpath 80 | cp $basepath/java/target/lib/*.jar $libpath 81 | cp $basepath/java/target/lib/../*.jar $libpath 82 | cp $basepath/java/src/main/resources/log4j2.properties $libpath 83 | rm -rf $basepath/java/target 84 | 85 | INFO "Java building success!" 86 | } 87 | 88 | function check_java_building_env { 89 | check_dependencies 90 | 91 | java_version=$(java -version 2>&1 | awk -F '"' '/version/ {print $2}') 92 | java_major_version=$(echo "$java_version" | awk -F '.' '{print $1}') 93 | INFO "Java version: ${java_version}" 94 | 95 | maven_version=$(mvn -v | grep "Apache Maven" | awk '{print $3}') 96 | maven_major_version=$(echo "$maven_version" | awk -F '.' '{print $1}') 97 | INFO "Maven version: ${maven_version}" 98 | 99 | if [ "x$maven_version" == "x" ]; then 100 | ERROR "Unknown maven version, exit" && exit 1 101 | fi 102 | } 103 | 104 | targetJar="rodps-${RVERSION}.jar" 105 | if [ ! -e "$basepath/inst/java/${targetJar}" ]; then 106 | WARN "Java lib not found in 'inst/java', try to run local building." 107 | check_java_building_env 108 | build_java_src 109 | else 110 | INFO "Java lib found at $targetJar" 111 | fi 112 | -------------------------------------------------------------------------------- /docs/mindmap-thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-odps-r-plugin/896b20bb6866aaf2ea7832df842f4dc5974a1e9e/docs/mindmap-thumb.png -------------------------------------------------------------------------------- /docs/mindmap.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/aliyun-odps-r-plugin/896b20bb6866aaf2ea7832df842f4dc5974a1e9e/docs/mindmap.pdf -------------------------------------------------------------------------------- /examples/odps_config.ini.template: -------------------------------------------------------------------------------- 1 | access_id= 2 | access_key= 3 | end_point= 4 | tunnel_endpoint= 5 | project_name= 6 | # sts_token= 7 | # rodps_tmpdir= 8 | -------------------------------------------------------------------------------- /hooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | STAGE_FILES=$(git diff --cached --name-only --diff-filter=ACM) 4 | 5 | echo 'check sensitive information ...' 6 | FAIL=0 7 | for FILE in $STAGE_FILES 8 | do 9 | grep --color -Hni -E "(ssh-rsa|authorized_keys|id_dsa|ssh-keygen)" $FILE && FAIL=1 10 | grep --color -Hni -E "(private key|secret|signature|accessid|access_id|access_key|accesskey|access_|password)(.*?)(\=|\:)(\s*)(\'|\")[^\$^%][^)]+(\'|\")[^)]*$" $FILE && FAIL=1 11 | done 12 | 13 | if [ ${FAIL} == 0 ]; then 14 | echo 'check sensitive information ... passed' 15 | exit 0 16 | else 17 | echo 'check sensitive information ... failed' 18 | exit 1 19 | fi 20 | -------------------------------------------------------------------------------- /java/check_style.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 79 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 104 | 105 | 106 | 108 | 109 | 110 | 111 | 113 | 114 | 115 | 116 | 118 | 119 | 120 | 121 | 122 | 123 | 125 | 126 | 127 | 128 | 130 | 131 | 132 | 133 | 135 | 136 | 137 | 138 | 140 | 142 | 144 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 166 | 167 | 168 | 169 | 170 | 172 | 173 | 174 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 186 | 187 | 188 | 189 | 190 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | -------------------------------------------------------------------------------- /java/intellij-java-google-style.xml: -------------------------------------------------------------------------------- 1 | 2 | 14 | 34 | 250 | -------------------------------------------------------------------------------- /java/pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | com.aliyun.odps 7 | rodps 8 | 2.1.6.3 9 | jar 10 | rodps 11 | https://github.com/aliyun/aliyun-odps-r-plugin 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.aliyun.odps 20 | odps-sdk-core 21 | 0.45.5-public 22 | shaded 23 | 24 | 25 | org.everit.osgi.bundles 26 | org.everit.osgi.bundles.org.json 27 | 1.0.0-v20140107 28 | 29 | 30 | org.xerial 31 | sqlite-jdbc 32 | 3.42.0.0 33 | 34 | 35 | junit 36 | junit 37 | 4.13.1 38 | test 39 | 40 | 41 | org.apache.logging.log4j 42 | log4j-core 43 | 2.20.0 44 | 45 | 46 | org.apache.logging.log4j 47 | log4j-api 48 | 2.20.0 49 | 50 | 51 | 52 | 53 | 54 | 55 | org.apache.maven.plugins 56 | maven-dependency-plugin 57 | 58 | 59 | copy-dependencies 60 | package 61 | 62 | copy-dependencies 63 | 64 | 65 | runtime 66 | ${project.build.directory}/lib 67 | true 68 | true 69 | true 70 | false 71 | pom 72 | 73 | 74 | 75 | 76 | 77 | org.apache.maven.plugins 78 | maven-compiler-plugin 79 | 3.1 80 | 81 | 1.8 82 | 1.8 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /java/src/main/java/com/aliyun/odps/rodps/DataTunnel/Context.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more contributor license 3 | * agreements. See the NOTICE file distributed with this work for additional information regarding 4 | * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance with the License. You may obtain a 6 | * copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software distributed under the License 11 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 | * or implied. See the License for the specific language governing permissions and limitations under 13 | * the License. 14 | */ 15 | 16 | package com.aliyun.odps.rodps.DataTunnel; 17 | 18 | import com.aliyun.odps.Odps; 19 | import com.aliyun.odps.TableSchema; 20 | 21 | /** 22 | * @Title: RContext.java 23 | * @Package com.aliyun.odps.rodps.DataTunnel 24 | * @Description: TODO(添加描述) 维护状态信息(连接、总行数、要读取的行数、启动的线程数) 25 | * @author dendi.ywd 26 | * @date 2015-8-7 17:52:19 27 | * @version V1.0 28 | */ 29 | public class Context { 30 | 31 | private Odps odps; 32 | private final String dtEndpoint; 33 | private final String project; 34 | private final String table; 35 | private final String partition; 36 | private TableSchema schema; 37 | private String actionId; 38 | 39 | private long recordCount; 40 | 41 | private final long limit; 42 | private final String colDim; 43 | private final String rowDim; 44 | private T action; 45 | 46 | private int threadNumber; 47 | 48 | public Context(Odps odps, String dtEndpoint, String project, String table, String partition, 49 | long limit, String colDim, String rowDim, int threadNum) { 50 | this.odps = odps; 51 | this.dtEndpoint = dtEndpoint; 52 | this.project = project; 53 | this.table = table; 54 | this.partition = partition; 55 | this.limit = limit; 56 | this.colDim = colDim; 57 | this.rowDim = rowDim; 58 | this.threadNumber = threadNum; 59 | } 60 | 61 | public String getColDim() { 62 | return colDim; 63 | } 64 | 65 | public int getThreadNumber() { 66 | return threadNumber; 67 | } 68 | 69 | public void setThreadNumber(int threadNumber) { 70 | this.threadNumber = threadNumber; 71 | } 72 | 73 | public String getRowDim() { 74 | return rowDim; 75 | } 76 | 77 | 78 | public long getDownloadRecords() { 79 | if (limit <= 0L) { 80 | return this.recordCount; 81 | } 82 | if (this.recordCount > limit) { 83 | return limit; 84 | } 85 | return this.recordCount; 86 | } 87 | 88 | public int getActualThreads() { 89 | if (getDownloadRecords() < getThreadNumber() * 100) { 90 | return 1; 91 | } else { 92 | return this.threadNumber; 93 | } 94 | } 95 | 96 | public String getDtEndpoint() { 97 | return dtEndpoint; 98 | } 99 | 100 | public String getProject() { 101 | return project; 102 | } 103 | 104 | public String getTable() { 105 | return table; 106 | } 107 | 108 | public String getPartition() { 109 | return partition; 110 | } 111 | 112 | public String getActionId() { 113 | return actionId; 114 | } 115 | 116 | public void setActionId(String actionId) { 117 | this.actionId = actionId; 118 | } 119 | 120 | public long getRecordCount() { 121 | return recordCount; 122 | } 123 | 124 | public void setRecordCount(long recordCount) { 125 | this.recordCount = recordCount; 126 | } 127 | 128 | public void setAction(T action) { 129 | this.action = action; 130 | } 131 | 132 | public T getAction() { 133 | return this.action; 134 | } 135 | 136 | public Odps getOdps() { 137 | return this.odps; 138 | } 139 | 140 | public void setOdps(Odps odps) { 141 | this.odps = odps; 142 | } 143 | 144 | public TableSchema getSchema() { 145 | return this.schema; 146 | } 147 | 148 | public void setSchema(TableSchema schema) { 149 | this.schema = schema; 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /java/src/main/java/com/aliyun/odps/rodps/DataTunnel/DTProcess.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more contributor license 3 | * agreements. See the NOTICE file distributed with this work for additional information regarding 4 | * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance with the License. You may obtain a 6 | * copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software distributed under the License 11 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 | * or implied. See the License for the specific language governing permissions and limitations under 13 | * the License. 14 | */ 15 | 16 | package com.aliyun.odps.rodps.DataTunnel; 17 | 18 | import java.util.ArrayList; 19 | import java.util.List; 20 | import org.apache.logging.log4j.LogManager; 21 | import org.apache.logging.log4j.Logger; 22 | 23 | /** 24 | * @Title: DTProcess.java 25 | * @Package com.aliyun.odps.rodps.DataTunnel 26 | * @Description: TODO(添加描述) 27 | * @author dendi.ywd 28 | * @date 2015-8-7 17:54:59 29 | * @version V1.0 30 | */ 31 | public abstract class DTProcess { 32 | private static Logger LOG = LogManager.getLogger(DTProcess.class.getSuperclass()); 33 | protected Context context; 34 | 35 | public DTProcess(Context context) { 36 | this.context = context; 37 | } 38 | 39 | public List createWorkerList(String fileName) throws ROdpsException { 40 | int threadNum = context.getActualThreads(); 41 | 42 | LOG.debug(String.format("start to create %d processing workers", threadNum)); 43 | 44 | long recordNumPerThread = this.context.getDownloadRecords() / threadNum; 45 | LOG.debug("record number per thread:" + String.valueOf(recordNumPerThread)); 46 | List workers = new ArrayList(); 47 | for (int i = 0; i < threadNum; ++i) { 48 | T worker; 49 | Long records = 50 | (i == threadNum - 1 ? (this.context.getDownloadRecords() - i * recordNumPerThread) 51 | : recordNumPerThread); 52 | worker = 53 | createWorker(i, context, i * recordNumPerThread, records, createTempFileName(fileName, i)); 54 | workers.add(worker); 55 | } 56 | LOG.debug("finish creating processing workers"); 57 | return workers; 58 | } 59 | 60 | public abstract T createWorker(int threadId, Context context, long startRecordNumber, 61 | long downloadRecordNumber, String fileName) throws ROdpsException; 62 | 63 | protected static String createTempFileName(String fileName, int index) { 64 | // Should keep consistent with function `.dataframe.to.sqlite()` 65 | return fileName + "_" + index; 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /java/src/main/java/com/aliyun/odps/rodps/DataTunnel/DataFrameItem.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more contributor license 3 | * agreements. See the NOTICE file distributed with this work for additional information regarding 4 | * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance with the License. You may obtain a 6 | * copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software distributed under the License 11 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 | * or implied. See the License for the specific language governing permissions and limitations under 13 | * the License. 14 | */ 15 | 16 | package com.aliyun.odps.rodps.DataTunnel; 17 | 18 | import java.util.ArrayList; 19 | import java.util.List; 20 | 21 | 22 | /* 23 | * * 用于描述DataFrame的数据结构,表示DataFrame中的某一列 24 | */ 25 | public class DataFrameItem { 26 | 27 | private String name; 28 | private String type; 29 | private List data; 30 | 31 | public DataFrameItem() {} 32 | 33 | public DataFrameItem(String name, String type) { 34 | this.name = name; 35 | this.type = type; 36 | this.data = new ArrayList(); 37 | } 38 | 39 | public String getName() { 40 | return name; 41 | } 42 | 43 | public void setName(String name) { 44 | this.name = name; 45 | } 46 | 47 | public String getType() { 48 | return type; 49 | } 50 | 51 | public void setType(String type) { 52 | this.type = type; 53 | } 54 | 55 | public List getData() { 56 | return data; 57 | } 58 | 59 | public void setData(List data) { 60 | this.data = data; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /java/src/main/java/com/aliyun/odps/rodps/DataTunnel/DownloadWorker.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more contributor license 3 | * agreements. See the NOTICE file distributed with this work for additional information regarding 4 | * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance with the License. You may obtain a 6 | * copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software distributed under the License 11 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 | * or implied. See the License for the specific language governing permissions and limitations under 13 | * the License. 14 | */ 15 | package com.aliyun.odps.rodps.DataTunnel; 16 | 17 | import java.io.PrintWriter; 18 | import java.io.StringWriter; 19 | 20 | import org.apache.logging.log4j.LogManager; 21 | import org.apache.logging.log4j.Logger; 22 | 23 | import com.aliyun.odps.data.RecordReader; 24 | import com.aliyun.odps.tunnel.TableTunnel.DownloadSession; 25 | 26 | /** 27 | * @Title: DownloadWorker.java 28 | * @Package com.aliyun.odps.rodps.DataTunnel 29 | * @Description: TODO(添加描述) 30 | * @author dendi.ywd 31 | * @date 2015-8-7 17:53:55 32 | * @version V1.0 33 | */ 34 | public class DownloadWorker implements Runnable { 35 | private static Logger LOG = LogManager.getLogger(DownloadWorker.class); 36 | private final long startRecordNumber; 37 | private final long downloadRecordNumber; 38 | private String errorMessage; 39 | private boolean isSuccessful; 40 | private final String savePath; 41 | private long loadedRecordNum; 42 | private final Context context; 43 | public Thread t; 44 | private int threadId; 45 | private MiddleStorage midStorage; 46 | private int maxRetries = 5; 47 | 48 | DownloadWorker(int threadId, Context context, long startRecordNumber, 49 | long downloadRecordNumber, String savePath) throws ROdpsException { 50 | this.threadId = threadId; 51 | this.startRecordNumber = startRecordNumber; 52 | this.downloadRecordNumber = downloadRecordNumber; 53 | this.loadedRecordNum = 0; 54 | this.isSuccessful = false; 55 | this.savePath = savePath; 56 | this.context = context; 57 | this.midStorage = new SqliteMiddleStorage(this.savePath, context); 58 | LOG.debug(threadId + ":" + String.valueOf(startRecordNumber) + " " 59 | + String.valueOf(downloadRecordNumber)); 60 | t = new Thread(this, String.valueOf(threadId)); 61 | t.start(); 62 | } 63 | 64 | public void run() { 65 | LOG.info("start to download threadId=" + this.threadId); 66 | int retries = 1; 67 | while (retries <= maxRetries && !isSuccessful) { 68 | try { 69 | RecordReader reader = null; 70 | if (downloadRecordNumber > 0) { 71 | reader = context.getAction().openRecordReader(startRecordNumber, downloadRecordNumber); 72 | } 73 | loadedRecordNum = midStorage.readDataTunnel(reader, downloadRecordNumber); 74 | LOG.info("threadId=" + this.threadId + " download finished, record=" 75 | + this.loadedRecordNum); 76 | isSuccessful = true; 77 | } catch (Exception e) { 78 | StringWriter sw = new StringWriter(); 79 | e.printStackTrace(new PrintWriter(sw)); 80 | this.errorMessage = sw.toString(); 81 | if (retries <= maxRetries) { 82 | LOG.warn("download failed in attempt " + retries + ", threadId=" + threadId + ", stack=" + sw.toString()); 83 | try { 84 | Thread.sleep(1000); 85 | } catch (InterruptedException e1) { 86 | LOG.error("Sleep interrupted!", e1); 87 | } 88 | } else { 89 | LOG.error("download failed, threadId=" + threadId + ", stack=" + sw.toString()); 90 | } 91 | } 92 | retries++; 93 | } 94 | if (this.midStorage != null) { 95 | this.midStorage.close(); 96 | } 97 | } 98 | 99 | public boolean IsSuccessful() { 100 | return isSuccessful; 101 | } 102 | 103 | public String getErrorMessage() { 104 | return errorMessage; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /java/src/main/java/com/aliyun/odps/rodps/DataTunnel/MiddleStorage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more contributor license 3 | * agreements. See the NOTICE file distributed with this work for additional information regarding 4 | * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance with the License. You may obtain a 6 | * copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software distributed under the License 11 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 | * or implied. See the License for the specific language governing permissions and limitations under 13 | * the License. 14 | */ 15 | package com.aliyun.odps.rodps.DataTunnel; 16 | 17 | import com.aliyun.odps.data.RecordReader; 18 | import com.aliyun.odps.data.RecordWriter; 19 | 20 | public interface MiddleStorage { 21 | long readDataTunnel(RecordReader reader, long downloadRecordNumber) throws Exception; 22 | 23 | long writeDataTunnel(RecordWriter writer) throws Exception; 24 | 25 | void close(); 26 | } 27 | -------------------------------------------------------------------------------- /java/src/main/java/com/aliyun/odps/rodps/DataTunnel/RDTDownloader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more contributor license 3 | * agreements. See the NOTICE file distributed with this work for additional information regarding 4 | * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance with the License. You may obtain a 6 | * copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software distributed under the License 11 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 | * or implied. See the License for the specific language governing permissions and limitations under 13 | * the License. 14 | */ 15 | 16 | package com.aliyun.odps.rodps.DataTunnel; 17 | 18 | import java.io.File; 19 | import java.io.IOException; 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | import org.apache.logging.log4j.LogManager; 24 | import org.apache.logging.log4j.Logger; 25 | 26 | import com.aliyun.odps.PartitionSpec; 27 | import com.aliyun.odps.tunnel.TableTunnel; 28 | import com.aliyun.odps.tunnel.TunnelException; 29 | import com.aliyun.odps.tunnel.TableTunnel.DownloadSession; 30 | 31 | public class RDTDownloader extends DTProcess { 32 | private static Logger LOG = LogManager.getLogger(RDTDownloader.class); 33 | 34 | public RDTDownloader(Context context) throws IOException { 35 | super(context); 36 | } 37 | 38 | public List> downloadTable(String tempDataFile) throws ROdpsException, IOException { 39 | if (null == tempDataFile) { 40 | throw new ROdpsException("Internal Error: temp data file is null"); 41 | } 42 | TableTunnel tunnel = new TableTunnel(context.getOdps()); 43 | if (context.getDtEndpoint() != null) { 44 | tunnel.setEndpoint(context.getDtEndpoint()); 45 | } 46 | LOG.info("start to download table"); 47 | DownloadSession downloadSession; 48 | try { 49 | if (context.getPartition() != null && !context.getPartition().isEmpty()) { 50 | PartitionSpec partitionSpec = new PartitionSpec(context.getPartition()); 51 | downloadSession = 52 | tunnel.createDownloadSession(context.getProject(), context.getTable(), partitionSpec); 53 | } else { 54 | downloadSession = tunnel.createDownloadSession(context.getProject(), context.getTable()); 55 | } 56 | context.setAction(downloadSession); 57 | LOG.debug("start to create download"); 58 | context.setRecordCount(downloadSession.getRecordCount()); 59 | context.setSchema(downloadSession.getSchema()); 60 | LOG.debug("end to init RDTDownloader"); 61 | } catch (TunnelException e) { 62 | LOG.error(e.getMessage(), e); 63 | throw new ROdpsException(e.getErrorCode() + e.getErrorMsg()); 64 | } 65 | List ptkv = this.genPartitionCell(); 66 | List> ret = genTableSchema(ptkv == null ? null : (List) ptkv.get(0)); 67 | try { 68 | List workers = this.createWorkerList(tempDataFile); 69 | String errorMessage = new String(); 70 | LOG.info("wait for download end"); 71 | for (DownloadWorker worker : workers) { 72 | worker.t.join(); // TODO: add time out here 73 | if (!worker.IsSuccessful()) { 74 | errorMessage += worker.getErrorMessage(); 75 | } 76 | } 77 | if (0 < errorMessage.length()) { 78 | throw new ROdpsException(errorMessage); 79 | } 80 | ret.add(getFiles(tempDataFile, workers.size())); 81 | } catch (Exception e) { 82 | throw new IOException("download table failed: ", e); 83 | } 84 | return ret; 85 | } 86 | 87 | @Override 88 | public DownloadWorker createWorker(int threadId, Context context, 89 | long startRecordNumber, long downloadRecordNumber, String savePath) throws ROdpsException { 90 | File file = new File(savePath); 91 | if (file.exists()) { 92 | LOG.warn("download file: " + savePath + "already exist, now delete it."); 93 | file.delete(); 94 | } 95 | return new DownloadWorker(threadId, context, startRecordNumber, downloadRecordNumber, savePath); 96 | } 97 | 98 | /** 99 | * 返回到R的schema 100 | * 101 | * @Title: genTableSchema 102 | * @Description: TODO 103 | * @return 104 | * @return LinkedHashMap 105 | * @throws 106 | */ 107 | private List> genTableSchema(List keys) { 108 | List> ret = new ArrayList>(); 109 | ret.add(new ArrayList()); 110 | ret.add(new ArrayList()); 111 | int columnNumber = this.context.getSchema().getColumns().size(); 112 | for (int i = 0; i < columnNumber; ++i) { 113 | ret.get(0).add(this.context.getSchema().getColumn(i).getName()); 114 | ret.get(1).add( 115 | this.context.getSchema().getColumn(i).getTypeInfo().getTypeName() 116 | .replace("ODPS_", "").toLowerCase()); 117 | } 118 | if (keys != null) { 119 | for (String k : keys) { 120 | ret.get(0).add(k); 121 | ret.get(1).add("string"); 122 | } 123 | } 124 | return ret; 125 | } 126 | 127 | 128 | /** 129 | * 生成partition值cell 130 | * 131 | * @Title: genPartitionCell 132 | * @Description: TODO 133 | * @return 134 | * @throws ROdpsException 135 | * @return String 136 | * @throws 137 | */ 138 | public List genPartitionCell() throws ROdpsException { 139 | if (this.context.getPartition() == null || this.context.getPartition().isEmpty()) { 140 | return null; 141 | } 142 | String[] ptcols = this.context.getPartition().split(","); 143 | StringBuffer vs = new StringBuffer(); 144 | List keys = new ArrayList(); 145 | List ret = new ArrayList(); 146 | for (String p : ptcols) { 147 | String[] items = p.split("="); 148 | if (items.length != 2) { 149 | throw new ROdpsException("Partition express error:" + p); 150 | } 151 | if (vs.length() > 0) { 152 | vs.append(this.context.getColDim()); 153 | } 154 | keys.add(items[0].trim().toLowerCase()); 155 | vs.append("\"" + items[1].trim() + "\""); 156 | } 157 | ret.add(keys); 158 | ret.add(vs.toString()); 159 | return ret; 160 | } 161 | 162 | public List getFiles(String savePath, int fileNum) { 163 | ArrayList fileNames = new ArrayList(); 164 | for (int i = 0; i < fileNum; i++) { 165 | fileNames.add(createTempFileName(savePath, i)); 166 | } 167 | return fileNames; 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /java/src/main/java/com/aliyun/odps/rodps/DataTunnel/RDTUploader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more contributor license 3 | * agreements. See the NOTICE file distributed with this work for additional information regarding 4 | * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance with the License. You may obtain a 6 | * copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software distributed under the License 11 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 | * or implied. See the License for the specific language governing permissions and limitations under 13 | * the License. 14 | */ 15 | 16 | package com.aliyun.odps.rodps.DataTunnel; 17 | 18 | 19 | import java.io.IOException; 20 | import java.util.List; 21 | import org.apache.logging.log4j.LogManager; 22 | import org.apache.logging.log4j.Logger; 23 | 24 | import com.aliyun.odps.PartitionSpec; 25 | import com.aliyun.odps.tunnel.TableTunnel; 26 | import com.aliyun.odps.tunnel.TunnelException; 27 | import com.aliyun.odps.tunnel.TableTunnel.UploadSession; 28 | 29 | 30 | public class RDTUploader extends DTProcess { 31 | private static Logger LOG = LogManager.getLogger(RDTUploader.class); 32 | 33 | public RDTUploader(Context context) throws IOException, ROdpsException { 34 | super(context); 35 | } 36 | 37 | public void upload(String dataFilePath) throws ROdpsException, IOException { 38 | TableTunnel tunnel = new TableTunnel(context.getOdps()); 39 | if (context.getDtEndpoint() != null) { 40 | tunnel.setEndpoint(context.getDtEndpoint()); 41 | } 42 | UploadSession uploadSession; 43 | try { 44 | if (context.getPartition() != null && !context.getPartition().isEmpty()) { 45 | PartitionSpec partitionSpec = new PartitionSpec(context.getPartition()); 46 | uploadSession = 47 | tunnel.createUploadSession(context.getProject(), context.getTable(), partitionSpec); 48 | } else { 49 | uploadSession = tunnel.createUploadSession(context.getProject(), context.getTable()); 50 | } 51 | context.setAction(uploadSession); 52 | context.setSchema(context.getAction().getSchema()); 53 | } catch (TunnelException e) { 54 | throw new ROdpsException(e.getErrorCode() + e.getErrorMsg()); 55 | } 56 | 57 | LOG.info("upload session ID: " + uploadSession.getId()); 58 | 59 | List workers = this.createWorkerList(dataFilePath); 60 | try { 61 | String errorMessage = ""; 62 | LOG.debug("wait for upload end"); 63 | for (UploadWorker worker : workers) { 64 | worker.t.join(); 65 | if (!worker.isSuccessful()) { 66 | LOG.error("thread fail met!"); 67 | errorMessage += worker.getErrorMessage(); 68 | } 69 | } 70 | Long[] blockList = new Long[workers.size()]; 71 | for (int i = 0; i < workers.size(); i++) 72 | blockList[i] = Long.valueOf(i); 73 | uploadSession.commit(blockList); 74 | if (!errorMessage.isEmpty()) { 75 | throw new IOException(errorMessage); 76 | } 77 | LOG.info("commit success"); 78 | } catch (InterruptedException e) { 79 | throw new ROdpsException(e); 80 | 81 | } catch (TunnelException e) { 82 | throw new ROdpsException(e); 83 | } 84 | } 85 | 86 | @Override 87 | public UploadWorker createWorker(int threadId, Context context, 88 | long startRecordNumber, long downloadRecordNumber, String fileName) throws ROdpsException { 89 | return new UploadWorker(threadId, context, fileName); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /java/src/main/java/com/aliyun/odps/rodps/DataTunnel/ROdpsException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more contributor license 3 | * agreements. See the NOTICE file distributed with this work for additional information regarding 4 | * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance with the License. You may obtain a 6 | * copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software distributed under the License 11 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 | * or implied. See the License for the specific language governing permissions and limitations under 13 | * the License. 14 | */ 15 | 16 | package com.aliyun.odps.rodps.DataTunnel; 17 | 18 | /** 19 | * @Title: ROdpsException.java 20 | * @Package com.aliyun.odps.rodps.DataTunnel 21 | * @Description: TODO(添加描述) 22 | * @author dendi.ywd 23 | * @date 2015-8-7 17:57:05 24 | * @version V1.0 25 | */ 26 | public class ROdpsException extends Exception { 27 | public ROdpsException(String msg) { 28 | super(msg); 29 | } 30 | 31 | public ROdpsException(Exception e) { 32 | super(e.getMessage()); 33 | this.initCause(e); 34 | } 35 | 36 | public ROdpsException(Exception e, String msg) { 37 | super(msg); 38 | this.initCause(e); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /java/src/main/java/com/aliyun/odps/rodps/DataTunnel/SqliteMiddleStorage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more contributor license 3 | * agreements. See the NOTICE file distributed with this work for additional information regarding 4 | * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance with the License. You may obtain a 6 | * copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software distributed under the License 11 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 | * or implied. See the License for the specific language governing permissions and limitations under 13 | * the License. 14 | */ 15 | package com.aliyun.odps.rodps.DataTunnel; 16 | 17 | import java.math.BigDecimal; 18 | import java.sql.Connection; 19 | import java.sql.DriverManager; 20 | import java.sql.PreparedStatement; 21 | import java.sql.ResultSet; 22 | import java.sql.SQLException; 23 | import java.sql.Statement; 24 | import java.sql.Timestamp; 25 | import java.sql.Types; 26 | import java.util.ArrayList; 27 | 28 | import org.sqlite.SQLiteConfig; 29 | 30 | import com.aliyun.odps.TableSchema; 31 | import com.aliyun.odps.data.ArrayRecord; 32 | import com.aliyun.odps.data.Binary; 33 | import com.aliyun.odps.data.Char; 34 | import com.aliyun.odps.data.IntervalDayTime; 35 | import com.aliyun.odps.data.IntervalYearMonth; 36 | import com.aliyun.odps.data.RecordReader; 37 | import com.aliyun.odps.data.RecordWriter; 38 | import com.aliyun.odps.data.Varchar; 39 | import com.aliyun.odps.tunnel.TableTunnel.UploadSession; 40 | import com.aliyun.odps.type.TypeInfo; 41 | 42 | public class SqliteMiddleStorage implements MiddleStorage { 43 | private final Connection conn; 44 | private final Context context; 45 | private ArrayList ptks; 46 | private ArrayList ptvs; 47 | 48 | public SqliteMiddleStorage(String dbName, Context context) throws ROdpsException { 49 | this.context = context; 50 | try { 51 | Class.forName("org.sqlite.JDBC"); 52 | SQLiteConfig config = new SQLiteConfig(); 53 | config.setSynchronous(SQLiteConfig.SynchronousMode.OFF); 54 | this.conn = DriverManager.getConnection("jdbc:sqlite:" + dbName, config.toProperties()); 55 | } catch (Exception e) { 56 | throw new ROdpsException(e, "Initial Sqlite Connection fail!"); 57 | } 58 | 59 | initialPtkvs(context.getPartition()); 60 | } 61 | 62 | /** 63 | * Read odps data using DT, and persist these data into sqlite 64 | * 65 | * @param reader DT reader 66 | * @param downloadRecordNumber download record number 67 | * @throws Exception 68 | */ 69 | public long readDataTunnel(RecordReader reader, long downloadRecordNumber) throws Exception { 70 | ArrayRecord record; 71 | long loadedRecordNum = 0; 72 | int batchSize = 10000; 73 | int columnNumber = context.getSchema().getColumns().size(); 74 | int allColNumber = columnNumber; 75 | if (ptks != null) { 76 | allColNumber += ptks.size(); 77 | } 78 | createTable(); 79 | if (downloadRecordNumber == 0) 80 | return 0L; 81 | 82 | // create insert sql 83 | String insSql = "insert into [" + context.getTable() + "] values("; 84 | for (int i = 0; i < allColNumber; i++) { 85 | insSql += "?,"; 86 | } 87 | insSql = insSql.substring(0, insSql.length() - 1) + ");"; 88 | PreparedStatement insPreStmt = null; 89 | 90 | try { 91 | insPreStmt = this.conn.prepareStatement(insSql); 92 | 93 | while (loadedRecordNum < downloadRecordNumber) { 94 | record = (ArrayRecord) reader.read(); 95 | int i = 0; 96 | for (; i < columnNumber; i++) { 97 | TypeInfo colType = context.getSchema().getColumn(i).getTypeInfo(); 98 | switch (colType.getOdpsType()) { 99 | case BOOLEAN: { 100 | Boolean v = record.getBoolean(i); 101 | if (v == null) 102 | insPreStmt.setNull(i + 1, Types.BOOLEAN); 103 | else 104 | insPreStmt.setBoolean(i + 1, v); 105 | break; 106 | } 107 | case BIGINT: { 108 | // XXX: int of r is 32bit,so convert int to double 109 | Long v = record.getBigint(i); 110 | if (v == null) 111 | insPreStmt.setNull(i + 1, Types.DOUBLE); 112 | else 113 | insPreStmt.setDouble(i + 1, v); 114 | break; 115 | } 116 | case INT: { 117 | Integer v = record.getInt(i); 118 | if (v == null) 119 | insPreStmt.setNull(i + 1, Types.DOUBLE); 120 | else 121 | insPreStmt.setDouble(i + 1, v); 122 | break; 123 | } 124 | case TINYINT: { 125 | Byte v = record.getTinyint(i); 126 | if (v == null) 127 | insPreStmt.setNull(i + 1, Types.DOUBLE); 128 | else 129 | insPreStmt.setDouble(i + 1, v); 130 | break; 131 | } 132 | case SMALLINT: { 133 | Short v = record.getSmallint(i); 134 | if (v == null) 135 | insPreStmt.setNull(i + 1, Types.DOUBLE); 136 | else 137 | insPreStmt.setDouble(i + 1, v); 138 | break; 139 | } 140 | case DOUBLE: { 141 | Double v = record.getDouble(i); 142 | if (v == null) 143 | insPreStmt.setNull(i + 1, Types.DOUBLE); 144 | else 145 | insPreStmt.setDouble(i + 1, v); 146 | break; 147 | } 148 | case FLOAT: { 149 | Float v = record.getFloat(i); 150 | if (v == null) 151 | insPreStmt.setNull(i + 1, Types.DOUBLE); 152 | else 153 | insPreStmt.setDouble(i + 1, v); 154 | break; 155 | } 156 | case DATETIME: { 157 | java.util.Date v = record.getDatetime(i); 158 | if (v == null) 159 | insPreStmt.setNull(i + 1, Types.DOUBLE); 160 | else 161 | insPreStmt.setDouble(i + 1, v.getTime() / 1000.0); 162 | break; 163 | } 164 | case DATE: { 165 | java.sql.Date v = record.getDate(i); 166 | if (v == null) 167 | insPreStmt.setNull(i + 1, Types.DOUBLE); 168 | else 169 | insPreStmt.setDouble(i + 1, v.getTime() / 1000.0); 170 | break; 171 | } 172 | case TIMESTAMP: { 173 | Timestamp v = record.getTimestamp(i); 174 | if (v == null) 175 | insPreStmt.setNull(i + 1, Types.DOUBLE); 176 | else 177 | insPreStmt.setDouble(i + 1, v.getTime() / 1000.0); 178 | break; 179 | } 180 | case DECIMAL: { 181 | BigDecimal v = record.getDecimal(i); 182 | if (v == null) 183 | insPreStmt.setNull(i + 1, Types.NULL); 184 | else 185 | insPreStmt.setString(i + 1, v.toPlainString()); 186 | break; 187 | } 188 | case STRING: { 189 | String v = record.getString(i); 190 | if (v == null) 191 | insPreStmt.setNull(i + 1, Types.NULL); 192 | else 193 | insPreStmt.setString(i + 1, v); 194 | break; 195 | } 196 | case CHAR: { 197 | Char v = record.getChar(i); 198 | if (v == null) 199 | insPreStmt.setNull(i + 1, Types.NULL); 200 | else 201 | insPreStmt.setString(i + 1, v.getValue()); 202 | break; 203 | } 204 | case VARCHAR: { 205 | Varchar v = record.getVarchar(i); 206 | if (v == null) 207 | insPreStmt.setNull(i + 1, Types.NULL); 208 | else 209 | insPreStmt.setString(i + 1, v.getValue()); 210 | break; 211 | } 212 | case BINARY: { 213 | byte[] v = record.getBytes(i); 214 | if (v == null) 215 | insPreStmt.setNull(i + 1, Types.NULL); 216 | else 217 | insPreStmt.setBytes(i + 1, v); 218 | break; 219 | } 220 | case INTERVAL_YEAR_MONTH: { 221 | IntervalYearMonth v = record.getIntervalYearMonth(i); 222 | if (v == null) 223 | insPreStmt.setNull(i + 1, Types.DOUBLE); 224 | else 225 | insPreStmt.setDouble(i + 1, v.getTotalMonths()); 226 | break; 227 | } 228 | case INTERVAL_DAY_TIME: { 229 | IntervalDayTime v = record.getIntervalDayTime(i); 230 | if (v == null) 231 | insPreStmt.setNull(i + 1, Types.DOUBLE); 232 | else 233 | insPreStmt.setDouble(i + 1, v.getTotalSeconds()); 234 | break; 235 | } 236 | case MAP: 237 | case STRUCT: 238 | case ARRAY: 239 | default: 240 | throw new ROdpsException("Unsupported type " + colType.getTypeName()); 241 | } 242 | } 243 | 244 | // add partition values 245 | if (ptvs != null) { 246 | for (String v : ptvs) { 247 | i++; 248 | insPreStmt.setString(i, v); 249 | } 250 | } 251 | insPreStmt.addBatch(); 252 | loadedRecordNum++; 253 | if (loadedRecordNum % batchSize == 0) { 254 | this.conn.setAutoCommit(false); 255 | insPreStmt.executeBatch(); 256 | this.conn.commit(); 257 | insPreStmt.clearBatch(); 258 | } 259 | } 260 | 261 | this.conn.setAutoCommit(false); 262 | insPreStmt.executeBatch(); 263 | this.conn.commit(); 264 | return loadedRecordNum; 265 | } finally { 266 | if (insPreStmt != null) { 267 | insPreStmt.close(); 268 | } 269 | } 270 | } 271 | 272 | /** 273 | * Read data from sqlite, and write these data into odps using DT 274 | * 275 | * @param writer DT writer 276 | * @throws Exception 277 | */ 278 | public long writeDataTunnel(RecordWriter writer) throws Exception { 279 | Statement stmt = null; 280 | ResultSet rs = null; 281 | try { 282 | stmt = this.conn.createStatement(); 283 | String sql = "select * from [" + context.getTable() + "]"; 284 | rs = stmt.executeQuery(sql); 285 | long i = 0; 286 | while (rs.next()) { 287 | ArrayRecord bufRecord = (ArrayRecord) ((UploadSession) (context.getAction())).newRecord(); 288 | for (int j = 0; j < this.context.getSchema().getColumns().size(); j++) { 289 | if (rs.getObject(j + 1) == null) { 290 | bufRecord.set(j, null); 291 | continue; 292 | } 293 | TypeInfo colType = this.context.getSchema().getColumn(j).getTypeInfo(); 294 | switch (colType.getOdpsType()) { 295 | case BOOLEAN: 296 | bufRecord.setBoolean(j, rs.getBoolean(j + 1)); 297 | break; 298 | case BIGINT: 299 | bufRecord.setBigint(j, (long) rs.getDouble(j + 1)); 300 | break; 301 | case INT: 302 | bufRecord.setInt(j, (int) rs.getDouble(j + 1)); 303 | break; 304 | case TINYINT: 305 | bufRecord.setTinyint(j, (byte) rs.getDouble(j + 1)); 306 | break; 307 | case SMALLINT: 308 | bufRecord.setSmallint(j, (short) rs.getShort(j + 1)); 309 | break; 310 | case DOUBLE: 311 | bufRecord.setDouble(j, rs.getDouble(j + 1)); 312 | break; 313 | case FLOAT: 314 | bufRecord.setFloat(j, (float) rs.getDouble(j + 1)); 315 | break; 316 | case DATETIME: 317 | bufRecord.setDatetime(j, new java.util.Date((long) (rs.getDouble(j + 1) * 1000.0))); 318 | break; 319 | case DATE: 320 | bufRecord.setDate(j, new java.sql.Date((long) (rs.getDouble(j + 1) * 1000.0))); 321 | break; 322 | case TIMESTAMP: 323 | bufRecord.setTimestamp(j, new Timestamp((long) (rs.getDouble(j + 1) * 1000.0))); 324 | break; 325 | case DECIMAL: 326 | bufRecord.setDecimal(j, new BigDecimal(rs.getString(j + 1))); 327 | break; 328 | case STRING: 329 | bufRecord.setString(j, rs.getString(j + 1)); 330 | break; 331 | case CHAR: 332 | bufRecord.setChar(j, new Char(rs.getString(j + 1))); 333 | break; 334 | case VARCHAR: 335 | bufRecord.setVarchar(j, new Varchar(rs.getString(j + 1))); 336 | break; 337 | case BINARY: 338 | bufRecord.setBinary(j, new Binary(rs.getBytes(j + 1))); 339 | break; 340 | case INTERVAL_YEAR_MONTH: 341 | bufRecord.setIntervalYearMonth(j, new IntervalYearMonth((int) rs.getDouble(j + 1))); 342 | break; 343 | case INTERVAL_DAY_TIME: 344 | bufRecord.setIntervalDayTime(j, new IntervalDayTime((int) rs.getDouble(j + 1), 0)); 345 | break; 346 | case MAP: 347 | case ARRAY: 348 | case STRUCT: 349 | default: 350 | throw new ROdpsException("Unsupported type " + colType.getTypeName()); 351 | } 352 | } 353 | i++; 354 | writer.write(bufRecord); 355 | } 356 | return i; 357 | } finally { 358 | if (rs != null) { 359 | rs.close(); 360 | } 361 | if (stmt != null) { 362 | stmt.close(); 363 | } 364 | } 365 | } 366 | 367 | /** 368 | * close database connection 369 | */ 370 | public void close() { 371 | if (this.conn != null) { 372 | try { 373 | this.conn.close(); 374 | } catch (SQLException e) { 375 | } 376 | } 377 | } 378 | 379 | /** 380 | * Create table in sqlite. The schema is as same as odps table. 381 | * 382 | * @throws Exception 383 | */ 384 | private void createTable() throws Exception { 385 | TableSchema schema = context.getSchema(); 386 | int columnNumber = schema.getColumns().size(); 387 | StringBuffer sb = new StringBuffer("create table [" + context.getTable() + "] ("); 388 | for (int i = 0; i < columnNumber; ++i) { 389 | String colName = "[" + context.getSchema().getColumn(i).getName() + "]"; 390 | sb.append(colName); 391 | sb.append(" "); 392 | TypeInfo colType = context.getSchema().getColumn(i).getTypeInfo(); 393 | String type; 394 | switch (colType.getOdpsType()) { 395 | case BOOLEAN: 396 | type = "boolean"; 397 | break; 398 | case BIGINT: 399 | case INT: 400 | case TINYINT: 401 | case SMALLINT: 402 | case DOUBLE: 403 | case FLOAT: 404 | case DATETIME: 405 | case DATE: 406 | case TIMESTAMP: 407 | type = "double"; 408 | break; 409 | case DECIMAL: 410 | case STRING: 411 | case CHAR: 412 | case VARCHAR: 413 | case BINARY: 414 | type = "text"; 415 | break; 416 | case MAP: 417 | case STRUCT: 418 | case ARRAY: 419 | default: 420 | throw new ROdpsException("Unsupported type " + colType.getTypeName()); 421 | } 422 | sb.append(type); 423 | sb.append(","); 424 | } 425 | 426 | // contain partition columns 427 | if (this.ptks != null) { 428 | for (String key : this.ptks) { 429 | sb.append(key + " text,"); 430 | } 431 | } 432 | String sql = sb.toString(); 433 | sql = sql.substring(0, sql.length() - 1) + ")"; 434 | // create table in sqlite 435 | Statement stmt = this.conn.createStatement(); 436 | stmt.executeUpdate(sql); 437 | stmt.close(); 438 | } 439 | 440 | /** 441 | * parse partition to key list and value list 442 | * 443 | * @param part partition, format:key=value,... 444 | */ 445 | private void initialPtkvs(String part) throws ROdpsException { 446 | if (part == null) { 447 | return; 448 | } 449 | 450 | this.ptks = new ArrayList(); 451 | this.ptvs = new ArrayList(); 452 | String[] pts = part.split(","); 453 | for (String p : pts) { 454 | String[] kv = p.split("="); 455 | if (kv.length != 2) { 456 | throw new ROdpsException("Partition expression error:" + part); 457 | } 458 | this.ptks.add(kv[0]); 459 | this.ptvs.add(kv[1]); 460 | } 461 | return; 462 | } 463 | } 464 | -------------------------------------------------------------------------------- /java/src/main/java/com/aliyun/odps/rodps/DataTunnel/UploadWorker.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more contributor license 3 | * agreements. See the NOTICE file distributed with this work for additional information regarding 4 | * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the 5 | * "License"); you may not use this file except in compliance with the License. You may obtain a 6 | * copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software distributed under the License 11 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 12 | * or implied. See the License for the specific language governing permissions and limitations under 13 | * the License. 14 | */ 15 | package com.aliyun.odps.rodps.DataTunnel; 16 | 17 | import java.io.IOException; 18 | import java.io.PrintWriter; 19 | import java.io.StringWriter; 20 | 21 | import org.apache.logging.log4j.LogManager; 22 | import org.apache.logging.log4j.Logger; 23 | 24 | import com.aliyun.odps.data.RecordWriter; 25 | import com.aliyun.odps.rodps.ROdps; 26 | import com.aliyun.odps.tunnel.TableTunnel.UploadSession; 27 | 28 | /** 29 | * @Title: UpdateWorker.java 30 | * @Package com.aliyun.odps.rodps 31 | * @Description: TODO(添加描述) 32 | * @author dendi.ywd 33 | * @date 2015-8-7 17:59:40 34 | * @version V1.0 35 | */ 36 | public class UploadWorker implements Runnable { 37 | private static Logger LOG = LogManager.getLogger(UploadWorker.class); 38 | 39 | private int threadId; 40 | private String errorMessage; 41 | private boolean isSuccessful = false; 42 | private Context context; 43 | private final String fileName; 44 | public Thread t; 45 | private RecordWriter writer; 46 | private long uploaded; 47 | private MiddleStorage midStorage; 48 | private int maxRetries = 5; 49 | 50 | public UploadWorker(int threadId, Context context, String fileName) 51 | throws ROdpsException { 52 | this.context = context; 53 | this.fileName = fileName; 54 | this.threadId = threadId; 55 | this.midStorage = new SqliteMiddleStorage(this.fileName, context); 56 | t = new Thread(this, String.valueOf(threadId)); 57 | t.start(); 58 | } 59 | 60 | public void run() { 61 | LOG.info("start to upload threadId=" + this.threadId); 62 | long blockID = (long) threadId; 63 | int retries = 1; 64 | while (retries <= maxRetries && !isSuccessful) { 65 | try { 66 | // The last opened writer would be valid only 67 | writer = context.getAction().openRecordWriter(blockID); 68 | long cnt = midStorage.writeDataTunnel(writer); 69 | if (this.writer != null) { 70 | this.writer.close(); // Occasional timeout due to server flunctation 71 | } 72 | isSuccessful = true; 73 | LOG.info("upload finish threadId=" + threadId + ", record=" + cnt); 74 | } catch (Exception e) { 75 | StringWriter sw = new StringWriter(); 76 | e.printStackTrace(new PrintWriter(sw)); 77 | this.errorMessage = sw.toString(); 78 | if (retries <= maxRetries) { 79 | LOG.warn("upload failed in attempt " + retries + ", threadId=" + threadId + ", stack=" + sw.toString()); 80 | try { 81 | Thread.sleep(1000); 82 | } catch (InterruptedException e1) { 83 | LOG.error("Sleep interrupted!", e1); 84 | } 85 | } else { 86 | LOG.error("upload failed finally, threadId=" + threadId + ", stack=" + sw.toString()); 87 | } 88 | } 89 | retries++; 90 | } 91 | if (this.midStorage != null) { 92 | this.midStorage.close(); 93 | } 94 | } 95 | 96 | public String getErrorMessage() { 97 | return errorMessage; 98 | } 99 | 100 | public void setErrorMessage(String errorMessage) { 101 | this.errorMessage = errorMessage; 102 | } 103 | 104 | public boolean isSuccessful() { 105 | return isSuccessful; 106 | } 107 | 108 | public void setSuccessful(boolean isSuccessful) { 109 | this.isSuccessful = isSuccessful; 110 | } 111 | 112 | public void closeWriter() throws IOException { 113 | if (this.writer != null) { 114 | this.writer.close(); 115 | } 116 | } 117 | 118 | public long getUploaded() { 119 | return this.uploaded; 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /java/src/main/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | # Define the root logger with a file appender 2 | status = error 3 | name = PropertiesConfig 4 | 5 | appenders = console,file 6 | 7 | # Root logger configuration 8 | rootLogger.level = info 9 | rootLogger.appenderRefs = console,file 10 | rootLogger.appenderRef.console.ref = STDOUT 11 | rootLogger.appenderRef.file.ref = LOGFILE 12 | 13 | # Console appender configuration 14 | appender.console.type = Console 15 | appender.console.name = STDOUT 16 | appender.console.layout.type = PatternLayout 17 | appender.console.layout.pattern = [%d{yyyy-MM-dd HH:mm:ss}] [%p] [%t] [%C:%L] %m%n 18 | 19 | # File appender configuration 20 | appender.file.type = File 21 | appender.file.name = LOGFILE 22 | appender.file.fileName = /tmp/rodps.log 23 | appender.file.layout.type = PatternLayout 24 | appender.file.datePattern='.'yyyy-MM-dd 25 | appender.file.layout.pattern = [%d{yyyy-MM-dd HH:mm:ss}] [%p] [%t] [%C:%L] %m%n 26 | -------------------------------------------------------------------------------- /java/src/test/java/com/aliyun/odps/rodps/ROdpsTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1999-2015 Alibaba Group Holding Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | 15 | /** 16 | * @Title: ROdpsTest.java 17 | * @Package com.aliyun.odps.rodps.UnitTest 18 | * @Description: TODO(用一句话描述该文件做什么) 19 | * @author dendi.ywd 20 | * @date 2015-8-10 09:11:38 21 | * @version V1.0 22 | */ 23 | package com.aliyun.odps.rodps; 24 | 25 | import java.io.BufferedReader; 26 | import java.io.File; 27 | import java.io.FileReader; 28 | import java.util.HashMap; 29 | import java.util.List; 30 | import java.util.Map; 31 | 32 | import org.junit.Test; 33 | 34 | import com.aliyun.odps.OdpsException; 35 | import com.aliyun.odps.rodps.DataTunnel.DataFrameItem; 36 | import com.aliyun.odps.rodps.DataTunnel.ROdpsException; 37 | 38 | import junit.framework.TestCase; 39 | 40 | /** 41 | * @Title: ROdpsTest.java 42 | * @Package com.aliyun.odps.rodps.UnitTest 43 | * @Description: TODO(添加描述) 44 | * @author dendi.ywd 45 | * @date 2015-8-10 09:11:38 46 | * @version V1.0 47 | */ 48 | 49 | public class ROdpsTest extends TestCase { 50 | final static String table = "odps_r_operator"; 51 | static String file; 52 | static ROdps rodps; 53 | 54 | protected void setUp() throws ROdpsException, OdpsException, ClassNotFoundException { 55 | String odps_config_path = ROdpsTest.class.getClassLoader().getResource("odps_config.ini").getPath(); 56 | Map conf = loadConfig(odps_config_path); 57 | if (conf == null) { 58 | System.exit(-1); 59 | } 60 | file = conf.get("sqlite_temp") + table; 61 | rodps = new ROdps(conf.get("project_name"), 62 | conf.get("access_id"), 63 | conf.get("access_key"), 64 | conf.get("end_point"), 65 | conf.get("dt_end_point"), 66 | conf.get("logview_host"), 67 | ""); 68 | rodps.setBizId("012345^"); 69 | assertNotNull(rodps); 70 | rodps.runSqlTask("create table if not exists " + table + "(id int) comment 'This is the test table for ROdps';"); 71 | } 72 | 73 | protected void tearDown() throws ROdpsException { 74 | assertTrue(rodps.dropTable(null, table)); 75 | } 76 | 77 | private static Map loadConfig(String file) { 78 | try { 79 | Map ret = new HashMap(); 80 | FileReader fileReader = new FileReader(new File(file)); 81 | BufferedReader br = new BufferedReader(fileReader); 82 | String line; 83 | while ((line = br.readLine()) != null) { 84 | if (line.startsWith("#")) { 85 | continue; 86 | } 87 | int idx = line.indexOf("="); 88 | if (idx < 0) { 89 | System.out.println("odps_config.ini error line:" + line); 90 | continue; 91 | } 92 | ret.put(line.substring(0, idx).trim(), line.substring(idx + 1, line.length()).trim()); 93 | } 94 | return ret; 95 | } catch (Exception e) { 96 | e.printStackTrace(); 97 | return null; 98 | } 99 | } 100 | 101 | @Test 102 | public void testGetTablesList() throws ROdpsException, OdpsException { 103 | List ret = rodps.getTables(null, null); 104 | 105 | for (DataFrameItem d : ret) { 106 | List res = d.getData(); 107 | System.out.println(res); 108 | } 109 | 110 | assertNotNull(ret); 111 | assertEquals(2, ret.size()); 112 | System.out.println("testN1GetTableList:" + ret); 113 | } 114 | 115 | @Test 116 | public void testIsTableExists() throws ROdpsException { 117 | assertTrue(rodps.isTableExist(null, table, null)); 118 | assertFalse(rodps.isTableExist(null, table + "not_exists", null)); 119 | } 120 | 121 | @Test 122 | public void testParsePartition() throws ROdpsException { 123 | assertTrue(rodps.formatPartition("pt=1/ds=2", "'", " and ").equals("pt='1' and ds='2'")); 124 | assertTrue(rodps.formatPartition("pt=1,ds=2", "'", " and ").equals("pt='1' and ds='2'")); 125 | assertTrue(rodps.formatPartition("pt='1',ds='2'", "'", " and ").equals("pt='1' and ds='2'")); 126 | assertTrue(rodps.formatPartition("pt=\"1\",ds=\"2\"", "'", " and ").equals( 127 | "pt='1' and ds='2'")); 128 | assertTrue(rodps.formatPartition("pt=\"1\",ds=\"2\"", "", ",").equals("pt=1,ds=2")); 129 | assertTrue(rodps.formatPartition("pt='1',ds='2012-01-01 00:11:11'", "", ",").equals( 130 | "pt=1,ds=2012-01-01 00:11:11")); 131 | try { 132 | assertTrue(rodps.formatPartition("pt='1',ds='a,b'", "", ",").equals( 133 | "pt=1,ds=2012-01-01 00:11:11")); 134 | assertTrue(false); 135 | } catch (Exception e) { 136 | assertTrue(true); 137 | } 138 | try { 139 | assertTrue(rodps.formatPartition("pt='1',ds='a=b'", "", ",").equals( 140 | "pt=1,ds=2012-01-01 00:11:11")); 141 | assertTrue(false); 142 | } catch (Exception e) { 143 | assertTrue(true); 144 | } 145 | try { 146 | assertTrue(rodps.formatPartition("pt='1',ds=',a=b'", "", ",").equals( 147 | "pt=1,ds=2012-01-01 00:11:11")); 148 | assertTrue(false); 149 | } catch (Exception e) { 150 | assertTrue(true); 151 | } 152 | } 153 | 154 | @Test 155 | public void test_DescribeTable() throws ROdpsException, OdpsException { 156 | List ret = rodps.describeTable(null, table, null); 157 | for (DataFrameItem d : ret) { 158 | List res = d.getData(); 159 | System.out.println(d.getName()); 160 | System.out.println(res); 161 | } 162 | assertNotNull(ret); 163 | } 164 | 165 | @Test 166 | public void testTableSize() throws ROdpsException { 167 | Long ret = rodps.getTableSize(null, table, null); 168 | System.out.println(ret); 169 | } 170 | 171 | @Test 172 | public void testDtLoad() throws ROdpsException { 173 | String project = null; 174 | String partition = null; 175 | String colDelimiter = "\u0001"; 176 | String rowDelimiter = "\n"; 177 | int limit = 863; 178 | List> ret = 179 | rodps.loadTableFromDT(project, table, partition, file, colDelimiter, rowDelimiter, limit, 8); 180 | System.out.println(ret); 181 | } 182 | 183 | @Test 184 | public void testUpload() throws ROdpsException { 185 | String project = null; 186 | String partition = null; 187 | String colDelimiter = "\u0001"; 188 | String rowDelimiter = "\n"; 189 | rodps.writeTableFromDT(project, table, partition, file, colDelimiter, rowDelimiter, 1, 8); 190 | } 191 | 192 | @Test 193 | public void testRunSqlTask() throws ROdpsException { 194 | List ret = rodps.runSqlTask("create table if not exists odps_r_operator(id int);"); 195 | assertNotNull(ret); 196 | assertEquals(0, ret.size()); 197 | 198 | ret = rodps.runSqlTask("insert into table odps_r_operator select 1 from dual;"); 199 | assertNotNull(ret); 200 | assertEquals(0, ret.size()); 201 | } 202 | 203 | } 204 | -------------------------------------------------------------------------------- /java/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=DEBUG, STDOUT 2 | log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender 3 | log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.STDOUT.layout.ConversionPattern=%5p [%15t] %m (%F:%L) %n 5 | -------------------------------------------------------------------------------- /man/RODPS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps.R 3 | \docType{package} 4 | \name{RODPS} 5 | \alias{RODPS} 6 | \title{RODPS: R interface to interact with ODPS} 7 | \description{ 8 | RODPS is an R extension to enable R to interact with ODPS 9 | system, also support other related algorithm packages. 10 | } 11 | \details{ 12 | The RODPS package supplies functions to interact with ODPS from 13 | within R. There are functions for exporting and connecting as well as 14 | querying ODPS. Please make sure the environment variable ODPS_CONFIG is set, 15 | it's in the same format as used in odpscmd, this file is required when 16 | connecting to ODPS. 17 | } 18 | \seealso{ 19 | \code{\link{rodps.sql}}, \code{\link{rodps.set}}, 20 | \code{\link{rodps.table}},\code{\link{rodps.project}} 21 | } 22 | \author{ 23 | \email{yunyuan.zhangyy@alibaba-inc.com} 24 | } 25 | \concept{RODPS} 26 | -------------------------------------------------------------------------------- /man/error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/init.R 3 | \name{error} 4 | \alias{error} 5 | \alias{fatal} 6 | \alias{info} 7 | \alias{warn} 8 | \title{Print Messages} 9 | \usage{ 10 | error(error_name, msg = NULL) 11 | 12 | fatal(error_name, msg = NULL) 13 | 14 | info(error_name, msg = NULL) 15 | 16 | warn(error_name, msg = NULL) 17 | } 18 | \description{ 19 | Print Messages 20 | } 21 | -------------------------------------------------------------------------------- /man/head.rodps.data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{head.rodps.data} 4 | \alias{head.rodps.data} 5 | \title{Table Head} 6 | \usage{ 7 | head.rodps.data(rd, n = 6L) 8 | } 9 | \description{ 10 | Create odps.data and odps.vector in S4. 11 | Store the head result in a temp table 12 | } 13 | -------------------------------------------------------------------------------- /man/na.omit.rodps.data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_misc.R 3 | \name{na.omit.rodps.data} 4 | \alias{na.omit.rodps.data} 5 | \title{Remove NULL values from a table} 6 | \usage{ 7 | \method{na.omit}{rodps.data}(rd) 8 | } 9 | \description{ 10 | Remove NULL values from a table 11 | } 12 | -------------------------------------------------------------------------------- /man/rodps.bizid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_base_function.R 3 | \name{rodps.bizid} 4 | \alias{rodps.bizid} 5 | \title{Set business ID} 6 | \usage{ 7 | rodps.bizid(bizid) 8 | } 9 | \arguments{ 10 | \item{bizid}{business id, e.g. 012345^.} 11 | } 12 | \description{ 13 | Set business ID 14 | } 15 | \examples{ 16 | ## set business id to 012345 17 | \dontrun{rodps.bizid('012345^')} 18 | } 19 | \seealso{ 20 | \code{\link{RODPS}}, \code{\link{rodps.sql}} 21 | } 22 | \author{ 23 | \email{ruibo.lirb@alibaba-inc.com} 24 | } 25 | -------------------------------------------------------------------------------- /man/rodps.change.types.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/init.R 3 | \name{rodps.change.types} 4 | \alias{rodps.change.types} 5 | \title{Change RODPS Java types into R} 6 | \usage{ 7 | rodps.change.types(types) 8 | } 9 | \description{ 10 | Change RODPS Java types into R 11 | } 12 | -------------------------------------------------------------------------------- /man/rodps.data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_misc.R 3 | \name{rodps.data} 4 | \alias{rodps.data} 5 | \title{Set up the odps.data class} 6 | \usage{ 7 | rodps.data(tblname) 8 | } 9 | \description{ 10 | Set up the odps.data class 11 | } 12 | -------------------------------------------------------------------------------- /man/rodps.generate.DDL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.generate.DDL} 4 | \alias{rodps.generate.DDL} 5 | \title{DDL Generation} 6 | \usage{ 7 | rodps.generate.DDL(full.tablename, dataframe, tablecomment = NULL) 8 | } 9 | \arguments{ 10 | \item{full.tablename}{Table name, in format 'ProjectName.TableName',or 11 | 'TableName' (using current project).} 12 | 13 | \item{dataframe}{Source data frame.} 14 | 15 | \item{tablecomment}{DDL comment string.} 16 | } 17 | \description{ 18 | Generate SQL DDL from dataframe. 19 | } 20 | \author{ 21 | \email{yunyuan.zhangyy@alibaba-inc.com} 22 | } 23 | -------------------------------------------------------------------------------- /man/rodps.help.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_base_function.R 3 | \name{rodps.help} 4 | \alias{rodps.help} 5 | \title{Show help information} 6 | \usage{ 7 | rodps.help() 8 | } 9 | \description{ 10 | Show help information 11 | } 12 | -------------------------------------------------------------------------------- /man/rodps.init.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/init.R 3 | \name{rodps.init} 4 | \alias{rodps.init} 5 | \title{Init ODPS with configs} 6 | \usage{ 7 | rodps.init(path = NULL, access.id = NULL, access.key = NULL) 8 | } 9 | \arguments{ 10 | \item{path}{File path string indicating odps_config} 11 | 12 | \item{access.id}{Access ID string} 13 | 14 | \item{access.key}{Access Key string} 15 | } 16 | \description{ 17 | Init ODPS with configs 18 | } 19 | -------------------------------------------------------------------------------- /man/rodps.predict.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_predict.R 3 | \name{rodps.predict} 4 | \alias{rodps.predict} 5 | \title{Extend predict function} 6 | \usage{ 7 | rodps.predict(x, ...) 8 | } 9 | \description{ 10 | Extend predict function 11 | } 12 | -------------------------------------------------------------------------------- /man/rodps.predict.fda.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_predict.R 3 | \name{rodps.predict.fda} 4 | \alias{rodps.predict.fda} 5 | \title{Extend FDA} 6 | \usage{ 7 | \method{rodps.predict}{fda}(object, srctbl, tgttbl, prior, type = "class", dimension = 2) 8 | } 9 | \arguments{ 10 | \item{object}{FDA model} 11 | 12 | \item{srctbl}{Data source table} 13 | 14 | \item{tgttbl}{Target table of prediction results} 15 | 16 | \item{dryrun}{Return the prediction SQL string instead of running the query} 17 | } 18 | \description{ 19 | Extend FDA 20 | } 21 | -------------------------------------------------------------------------------- /man/rodps.predict.rpart.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_predict.R 3 | \name{rodps.predict.rpart} 4 | \alias{rodps.predict.rpart} 5 | \title{Extend Recursive Partitioning} 6 | \usage{ 7 | \method{rodps.predict}{rpart}(object, srctbl, tgttbl, inc.col = NULL, dryrun = FALSE) 8 | } 9 | \arguments{ 10 | \item{object}{Rpart model} 11 | 12 | \item{srctbl}{Data source table} 13 | 14 | \item{tgttbl}{Target table of prediction results} 15 | 16 | \item{dryrun}{Return the prediction SQL string instead of running the query} 17 | } 18 | \description{ 19 | Extend Recursive Partitioning 20 | } 21 | -------------------------------------------------------------------------------- /man/rodps.project.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_project.R 3 | \name{rodps.project} 4 | \alias{rodps.project} 5 | \title{Project functions} 6 | \description{ 7 | Provide functions to operate project. 8 | } 9 | \seealso{ 10 | \code{\link{rodps.project.use}}, 11 | \code{\link{rodps.project.current}} 12 | } 13 | \author{ 14 | \email{yunyuan.zhangyy@alibaba-inc.com} 15 | } 16 | -------------------------------------------------------------------------------- /man/rodps.project.current.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_project.R 3 | \name{rodps.project.current} 4 | \alias{rodps.project.current} 5 | \alias{rodps.current.project} 6 | \title{Show current project name.} 7 | \usage{ 8 | rodps.project.current() 9 | 10 | rodps.current.project() 11 | } 12 | \description{ 13 | Show current project name. 14 | } 15 | \examples{ 16 | ## get current project name 17 | \dontrun{rodps.project.current()} 18 | } 19 | \seealso{ 20 | \code{\link{rodps.project.use}} 21 | } 22 | \author{ 23 | \email{yunyuan.zhangyy@alibaba-inc.com} 24 | } 25 | -------------------------------------------------------------------------------- /man/rodps.project.use.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_project.R 3 | \name{rodps.project.use} 4 | \alias{rodps.project.use} 5 | \title{Change current project.} 6 | \usage{ 7 | rodps.project.use(projectname) 8 | } 9 | \arguments{ 10 | \item{projectname}{target projectname; make sure that you have the authority to access this Project.} 11 | } 12 | \description{ 13 | Change current project. 14 | } 15 | \examples{ 16 | ## change project to prjb 17 | \dontrun{rodps.project.use('prjb')} 18 | } 19 | \seealso{ 20 | \code{\link{rodps.project.current}} 21 | } 22 | \author{ 23 | \email{yunyuan.zhangyy@alibaba-inc.com} 24 | } 25 | -------------------------------------------------------------------------------- /man/rodps.set.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_base_function.R 3 | \name{rodps.set} 4 | \alias{rodps.set} 5 | \title{Set task properties} 6 | \usage{ 7 | rodps.set(key, value) 8 | } 9 | \arguments{ 10 | \item{key}{setting name, e.g. odps.sql.allow.fullscan.} 11 | 12 | \item{value}{setting value.} 13 | } 14 | \description{ 15 | Set properties for SQL task 16 | } 17 | \examples{ 18 | ## enable full table scan 19 | \dontrun{rodps.set('odps.sql.allow.fullscan', 'true')} 20 | } 21 | \seealso{ 22 | \code{\link{RODPS}}, \code{\link{rodps.sql}}, 23 | \code{\link{rodps.unset}}, 24 | } 25 | \author{ 26 | \email{ruibo.lirb@alibaba-inc.com} 27 | } 28 | -------------------------------------------------------------------------------- /man/rodps.split.ftn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.split.ftn} 4 | \alias{rodps.split.ftn} 5 | \title{Split full table name into table name and project name} 6 | \usage{ 7 | rodps.split.ftn(ftn) 8 | } 9 | \arguments{ 10 | \item{ftn}{Full table name.} 11 | } 12 | \description{ 13 | Split full table name into table name and project name 14 | } 15 | -------------------------------------------------------------------------------- /man/rodps.sql.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_sql.R 3 | \name{rodps.sql} 4 | \alias{rodps.sql} 5 | \alias{rodps.query} 6 | \title{SQL Command} 7 | \usage{ 8 | rodps.sql(query, mcqa = FALSE, result.table.limit = 10737518240, thread = 8) 9 | 10 | rodps.query(query, mcqa = FALSE, result.table.limit = 10737518240, thread = 8) 11 | } 12 | \arguments{ 13 | \item{query}{SQL string} 14 | 15 | \item{mcqa}{Whether enable MCQA or not} 16 | 17 | \item{result.table.limit}{The size limit of resulted table as engine side table or fetched data frame.} 18 | 19 | \item{thread}{The threading number to read table data when the table size is larger than `result.table.limit`.} 20 | } 21 | \description{ 22 | Run SQL command and return result(in data.frame type). 23 | } 24 | \examples{ 25 | ## select the data of 'sales' in January ,and store the result in data.frame 26 | \dontrun{ data <- rodps.sql('select * from sales where month=1')} 27 | } 28 | \seealso{ 29 | \code{\link{RODPS}}, \code{\link{rodps.table}}, 30 | \code{\link{rodps.project}} 31 | } 32 | \author{ 33 | \email{yunyuan.zhangyy@alibaba-inc.com} 34 | } 35 | -------------------------------------------------------------------------------- /man/rodps.str.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_str.R 3 | \name{rodps.str} 4 | \alias{rodps.str} 5 | \title{Display Table} 6 | \usage{ 7 | rodps.str(tbl) 8 | } 9 | \arguments{ 10 | \item{tbl}{RODPS Table object} 11 | } 12 | \value{ 13 | Formatted string. 14 | } 15 | \description{ 16 | Print table as formatted string. 17 | } 18 | \seealso{ 19 | [str.rodps.data()] 20 | } 21 | -------------------------------------------------------------------------------- /man/rodps.table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table} 4 | \alias{rodps.table} 5 | \title{RODPS Table Functions} 6 | \description{ 7 | Provide functions to operate table. 8 | } 9 | \seealso{ 10 | \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 11 | \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 12 | \code{\link{rodps.table.list}}, \code{\link{rodps.table.rows}}, 13 | \code{\link{rodps.table.size}}, \code{\link{rodps.table.read}}, 14 | \code{\link{rodps.table.write}} 15 | } 16 | \author{ 17 | \email{yunyuan.zhangyy@alibaba-inc.com} 18 | } 19 | -------------------------------------------------------------------------------- /man/rodps.table.desc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table.desc} 4 | \alias{rodps.table.desc} 5 | \alias{rodps.desc.table} 6 | \title{Table Description} 7 | \usage{ 8 | rodps.table.desc(full.tablename, partition = NULL) 9 | 10 | rodps.desc.table(full.tablename, partition = NULL) 11 | } 12 | \arguments{ 13 | \item{full.tablename}{Table name, in format 'ProjectName.TableName',or 14 | 'TableName' (using current project).} 15 | 16 | \item{partition}{Partition spec} 17 | } 18 | \description{ 19 | Show description of a table, including metadata of 20 | Owner, Project, Comment, Create_time, Last_modified_time, Size, Columns. 21 | } 22 | \examples{ 23 | ## show description of 'dual' 24 | \dontrun{rodps.table.desc('dual')} 25 | } 26 | \seealso{ 27 | \code{\link{rodps.table.drop}}, \code{\link{rodps.table.exist}}, 28 | \code{\link{rodps.table.partitions}}, \code{\link{rodps.table.list}}, 29 | \code{\link{rodps.table.rows}}, \code{\link{rodps.table.size}}, 30 | \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 31 | } 32 | \author{ 33 | \email{yunyuan.zhangyy@alibaba-inc.com} 34 | } 35 | -------------------------------------------------------------------------------- /man/rodps.table.drop.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table.drop} 4 | \alias{rodps.table.drop} 5 | \alias{rodps.drop.table} 6 | \title{Drop Table} 7 | \usage{ 8 | rodps.table.drop(full.tablename, partition = NULL) 9 | 10 | rodps.drop.table(full.tablename, partition = NULL) 11 | } 12 | \arguments{ 13 | \item{full.tablename}{Table name.} 14 | 15 | \item{partition}{Partition spec.} 16 | } 17 | \description{ 18 | Delete table if it exists. 19 | } 20 | \examples{ 21 | \dontrun{rodps.table.drop('sales_backup')} 22 | } 23 | \seealso{ 24 | \code{\link{rodps.table.desc}}, \code{\link{rodps.table.exist}}, 25 | \code{\link{rodps.table.partitions}}, \code{\link{rodps.table.list}}, 26 | \code{\link{rodps.table.rows}}, \code{\link{rodps.table.size}}, 27 | \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 28 | } 29 | \author{ 30 | \email{yunyuan.zhangyy@alibaba-inc.com} 31 | } 32 | -------------------------------------------------------------------------------- /man/rodps.table.exist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table.exist} 4 | \alias{rodps.table.exist} 5 | \alias{rodps.exist.table} 6 | \title{Table Existence} 7 | \usage{ 8 | rodps.table.exist(full.tablename, partition = NULL) 9 | 10 | rodps.exist.table(full.tablename, partition = NULL) 11 | } 12 | \arguments{ 13 | \item{full.tablename}{table name.} 14 | 15 | \item{partition}{partition spec, default NULL.} 16 | } 17 | \description{ 18 | Check whether a table exists. 19 | } 20 | \examples{ 21 | \dontrun{rodps.table.exist('mytable')} 22 | } 23 | \seealso{ 24 | \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 25 | \code{\link{rodps.table.partitions}}, \code{\link{rodps.table.list}}, 26 | \code{\link{rodps.table.rows}}, \code{\link{rodps.table.size}}, 27 | \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 28 | } 29 | \author{ 30 | \email{yunyuan.zhangyy@alibaba-inc.com} 31 | } 32 | -------------------------------------------------------------------------------- /man/rodps.table.head.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table.head} 4 | \alias{rodps.table.head} 5 | \title{Table Head} 6 | \usage{ 7 | rodps.table.head(tbl, n = 6L) 8 | } 9 | \arguments{ 10 | \item{tbl}{Table name} 11 | 12 | \item{n}{The number of head rows} 13 | } 14 | \description{ 15 | Show a few of head rows of table. 16 | } 17 | -------------------------------------------------------------------------------- /man/rodps.table.hist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table_hist.R 3 | \name{rodps.table.hist} 4 | \alias{rodps.table.hist} 5 | \alias{rodps.hist} 6 | \title{Table Histogram} 7 | \usage{ 8 | rodps.table.hist( 9 | tblname, 10 | colname, 11 | breaks = NULL, 12 | freq = TRUE, 13 | include.lowest = TRUE, 14 | right = TRUE, 15 | main = paste("Histogram of ", colname), 16 | xlab = colname, 17 | ... 18 | ) 19 | 20 | rodps.hist( 21 | tblname, 22 | colname, 23 | breaks = NULL, 24 | freq = TRUE, 25 | include.lowest = TRUE, 26 | right = TRUE, 27 | main = paste("Histogram of ", colname), 28 | xlab = colname, 29 | ... 30 | ) 31 | } 32 | \description{ 33 | Extend hist. This function returns a list of breaks, counts, density, mids, xname, 34 | equidist, class attr from tbl; then plot with the list 35 | } 36 | -------------------------------------------------------------------------------- /man/rodps.table.list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table.list} 4 | \alias{rodps.table.list} 5 | \alias{rodps.list.table} 6 | \alias{rodps.list.tables} 7 | \title{List Tables} 8 | \usage{ 9 | rodps.table.list(pattern = NULL, projectname = NULL) 10 | 11 | rodps.list.table(pattern = NULL, projectname = NULL) 12 | 13 | rodps.list.tables(pattern = NULL, projectname = NULL) 14 | } 15 | \arguments{ 16 | \item{pattern}{Partition pattern, use '*' or specific PartitionName.} 17 | 18 | \item{projectname}{Specific project to query,default is current project.} 19 | } 20 | \description{ 21 | List all tables in the project, default in current project. 22 | } 23 | \examples{ 24 | ##list the tables in current project 25 | \dontrun{rodps.table.list()} 26 | } 27 | \seealso{ 28 | \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 29 | \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 30 | \code{\link{rodps.table.rows}}, \code{\link{rodps.table.size}}, 31 | \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 32 | } 33 | \author{ 34 | \email{yunyuan.zhangyy@alibaba-inc.com} 35 | } 36 | -------------------------------------------------------------------------------- /man/rodps.table.na.omit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_misc.R 3 | \name{rodps.table.na.omit} 4 | \alias{rodps.table.na.omit} 5 | \title{Remove NULL values from a table} 6 | \usage{ 7 | rodps.table.na.omit(tbl, tgttbl) 8 | } 9 | \description{ 10 | Remove NULL values from a table 11 | } 12 | -------------------------------------------------------------------------------- /man/rodps.table.partitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table.partitions} 4 | \alias{rodps.table.partitions} 5 | \alias{rodps.partitions.table} 6 | \title{List Partitions} 7 | \usage{ 8 | rodps.table.partitions(full.tablename) 9 | 10 | rodps.partitions.table(full.tablename) 11 | } 12 | \arguments{ 13 | \item{full.tablename, }{Table name, in format of 'ProjectName.TableName' or 14 | 'TableName' (using current project).} 15 | } 16 | \description{ 17 | List partitions of a table. Raise ERROR if the table has no partition. 18 | } 19 | \examples{ 20 | ## list partitions of 'sales' 21 | \dontrun{rodps.table.partitions('sales')} 22 | } 23 | \seealso{ 24 | \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 25 | \code{\link{rodps.table.exist}}, \code{\link{rodps.table.list}}, 26 | \code{\link{rodps.table.rows}}, \code{\link{rodps.table.size}}, 27 | \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 28 | } 29 | \author{ 30 | \email{yunyuan.zhangyy@alibaba-inc.com} 31 | } 32 | -------------------------------------------------------------------------------- /man/rodps.table.read.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table.read} 4 | \alias{rodps.table.read} 5 | \alias{rodps.read.table} 6 | \alias{rodps.load.table} 7 | \title{Reading Table} 8 | \usage{ 9 | rodps.table.read( 10 | full.tablename, 11 | partition = NULL, 12 | limit = -1, 13 | memsize = 10737518240, 14 | isdebug = FALSE, 15 | thread = 8 16 | ) 17 | 18 | rodps.read.table( 19 | full.tablename, 20 | partition = NULL, 21 | limit = -1, 22 | memsize = 10737518240, 23 | isdebug = FALSE, 24 | thread = 8 25 | ) 26 | 27 | rodps.load.table( 28 | full.tablename, 29 | partition = NULL, 30 | limit = -1, 31 | memsize = 10737518240, 32 | isdebug = FALSE, 33 | thread = 8 34 | ) 35 | } 36 | \arguments{ 37 | \item{full.tablename}{Table name} 38 | 39 | \item{partition}{Partition spec} 40 | 41 | \item{limit}{Limit the rows to read, '-1' for not limit.} 42 | 43 | \item{memsize}{Maximum data capacity.} 44 | 45 | \item{isdebug}{Boolean value, if debugging is enabled.} 46 | 47 | \item{thread}{Thread number.} 48 | } 49 | \description{ 50 | Read data from ODPS and store in R data frame. 51 | } 52 | \examples{ 53 | ## show description of 'dual' 54 | \dontrun{ x<-rodps.table.read('sales',partition='ds=20180124',limit=100) } 55 | } 56 | \seealso{ 57 | \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 58 | \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 59 | \code{\link{rodps.table.list}}, \code{\link{rodps.table.rows}}, 60 | \code{\link{rodps.table.size}}, \code{\link{rodps.table.write}} 61 | } 62 | \author{ 63 | \email{yunyuan.zhangyy@alibaba-inc.com} 64 | } 65 | -------------------------------------------------------------------------------- /man/rodps.table.rows.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table.rows} 4 | \alias{rodps.table.rows} 5 | \alias{rodps.rows.table} 6 | \title{Table Rows} 7 | \usage{ 8 | rodps.table.rows(full.tablename, partition = NULL) 9 | 10 | rodps.rows.table(full.tablename, partition = NULL) 11 | } 12 | \arguments{ 13 | \item{full.tablename}{Table name , in format of 'ProjectName.TableName' or 14 | 'TableName' (using current project)} 15 | 16 | \item{partition}{Partition spec.} 17 | } 18 | \description{ 19 | Get the number of rows in a table. 20 | } 21 | \examples{ 22 | ## get the number of rows 23 | \dontrun{rodps.table.rows('sales')} 24 | } 25 | \seealso{ 26 | \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 27 | \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 28 | \code{\link{rodps.table.list}}, \code{\link{rodps.table.size}}, 29 | \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 30 | } 31 | \author{ 32 | \email{yunyuan.zhangyy@alibaba-inc.com} 33 | } 34 | -------------------------------------------------------------------------------- /man/rodps.table.sample.srs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table_sample.R 3 | \name{rodps.table.sample.srs} 4 | \alias{rodps.table.sample.srs} 5 | \alias{rodps.sample.srs} 6 | \title{Sample table} 7 | \usage{ 8 | rodps.table.sample.srs( 9 | srctable, 10 | tgttable, 11 | samplerate, 12 | cond = NULL, 13 | select = NULL 14 | ) 15 | 16 | rodps.sample.srs(srctable, tgttable, samplerate, cond = NULL, select = NULL) 17 | } 18 | \description{ 19 | Sample table 20 | } 21 | \seealso{ 22 | \code{\link{rodps.table.sample.strat}} 23 | } 24 | -------------------------------------------------------------------------------- /man/rodps.table.sample.strat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table_sample.R 3 | \name{rodps.table.sample.strat} 4 | \alias{rodps.table.sample.strat} 5 | \alias{rodps.sample.strat} 6 | \title{Sample Table} 7 | \usage{ 8 | rodps.table.sample.strat(srctable, tgttable, samplerate, strat, select = NULL) 9 | 10 | rodps.sample.strat(srctable, tgttable, samplerate, strat, select = NULL) 11 | } 12 | \description{ 13 | The sample strategy is as similar as: 14 | } 15 | \details{ 16 | select abc from ( *, row_number() over( partition by g order by 17 | rand()) r_rn, rand() as r_select ) sub 18 | 1. by percent sub where r_select < rate 19 | 2. by number sub where rn <= rate 20 | } 21 | \seealso{ 22 | \code{\link{rodps.table.sample.srs}} 23 | } 24 | -------------------------------------------------------------------------------- /man/rodps.table.size.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table.size} 4 | \alias{rodps.table.size} 5 | \alias{rodps.size.table} 6 | \title{Table Size} 7 | \usage{ 8 | rodps.table.size(full.tablename, partition = NULL) 9 | 10 | rodps.size.table(full.tablename, partition = NULL) 11 | } 12 | \arguments{ 13 | \item{full.tablename}{Table name, in format 'ProjectName.TableName',or 14 | 'TableName' (using current project).} 15 | 16 | \item{partition}{Partition spec} 17 | } 18 | \description{ 19 | Get the size of table in Bytes. 20 | } 21 | \examples{ 22 | ## get the size of 'sales' 23 | \dontrun{rodps.table.size('sales')} 24 | } 25 | \seealso{ 26 | \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 27 | \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 28 | \code{\link{rodps.table.list}}, \code{\link{rodps.table.rows}}, 29 | \code{\link{rodps.table.read}}, \code{\link{rodps.table.write}} 30 | } 31 | \author{ 32 | \email{yunyuan.zhangyy@alibaba-inc.com} 33 | } 34 | -------------------------------------------------------------------------------- /man/rodps.table.summary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table_summary.R 3 | \name{rodps.table.summary} 4 | \alias{rodps.table.summary} 5 | \title{Table Summary} 6 | \usage{ 7 | rodps.table.summary(tbl) 8 | } 9 | \description{ 10 | Get statistical summaries of a table. 11 | } 12 | -------------------------------------------------------------------------------- /man/rodps.table.write.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table.R 3 | \name{rodps.table.write} 4 | \alias{rodps.table.write} 5 | \alias{rodps.write.table} 6 | \title{Write Table} 7 | \usage{ 8 | rodps.table.write( 9 | dataframe, 10 | full.tablename, 11 | partition = NULL, 12 | tablecomment = NULL, 13 | isdebug = FALSE, 14 | thread = 8 15 | ) 16 | 17 | rodps.write.table( 18 | dataframe, 19 | full.tablename, 20 | partition = NULL, 21 | tablecomment = NULL, 22 | isdebug = FALSE, 23 | thread = 8 24 | ) 25 | } 26 | \arguments{ 27 | \item{dataframe}{Data in data.frame type, make sure the ColumnName is 28 | allowable in ODPS.} 29 | 30 | \item{full.tablename}{Table name, in format 'ProjectName.TableName',or 31 | 'TableName' (using current project).} 32 | 33 | \item{partition}{Partition spec.} 34 | 35 | \item{tablecomment}{Table comment.} 36 | 37 | \item{isdebug}{Boolean value, if debugging is enabled.} 38 | 39 | \item{thread}{Thread number.} 40 | } 41 | \description{ 42 | Write 'dataframe' into 'full.tablename' of ODPS, make sure the target table 43 | 'full.tablename' is not exist. Dataframe can be written to a non-exist table 44 | or partition. 45 | } 46 | \examples{ 47 | ## write data.frame into 'mytable' 48 | \dontrun{ x<-data.frame(c1=1:10,c2=1:10)} 49 | \dontrun{ rodps.table.write(x,'mytable')} 50 | } 51 | \seealso{ 52 | \code{\link{rodps.table.desc}}, \code{\link{rodps.table.drop}}, 53 | \code{\link{rodps.table.exist}}, \code{\link{rodps.table.partitions}}, 54 | \code{\link{rodps.table.list}}, \code{\link{rodps.table.rows}}, 55 | \code{\link{rodps.table.size}}, \code{\link{rodps.table.read}} 56 | } 57 | \author{ 58 | \email{yunyuan.zhangyy@alibaba-inc.com} 59 | } 60 | -------------------------------------------------------------------------------- /man/rodps.tmpdir.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/init.R 3 | \name{rodps.tmpdir} 4 | \alias{rodps.tmpdir} 5 | \title{Change RODPS TempDir} 6 | \usage{ 7 | rodps.tmpdir(path) 8 | } 9 | \arguments{ 10 | \item{path}{Target path string} 11 | } 12 | \description{ 13 | Change RODPS TempDir 14 | } 15 | -------------------------------------------------------------------------------- /man/rodps.unset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_base_function.R 3 | \name{rodps.unset} 4 | \alias{rodps.unset} 5 | \title{Unset task properties} 6 | \usage{ 7 | rodps.unset(key) 8 | } 9 | \arguments{ 10 | \item{key}{setting name, e.g. odps.sql.allow.fullscan.} 11 | } 12 | \description{ 13 | Unset properties for SQL task 14 | } 15 | \examples{ 16 | ## set full table scan to its default value 17 | \dontrun{rodps.unset('odps.sql.allow.fullscan')} 18 | } 19 | \seealso{ 20 | \code{\link{RODPS}}, \code{\link{rodps.sql}}, 21 | \code{\link{rodps.set}}, 22 | } 23 | \author{ 24 | \email{ruibo.lirb@alibaba-inc.com} 25 | } 26 | -------------------------------------------------------------------------------- /man/rodps.vector.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_misc.R 3 | \name{rodps.vector} 4 | \alias{rodps.vector} 5 | \title{Set up odps.vector class, a vector is nothing but a column in table} 6 | \usage{ 7 | rodps.vector(tblname, colname) 8 | } 9 | \description{ 10 | Set up odps.vector class, a vector is nothing but a column in table 11 | } 12 | -------------------------------------------------------------------------------- /man/summary.rodps.data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rodps_table_summary.R 3 | \name{summary.rodps.data} 4 | \alias{summary.rodps.data} 5 | \title{Store unique result in a temp table} 6 | \usage{ 7 | \method{summary}{rodps.data}(rd) 8 | } 9 | \description{ 10 | Store unique result in a temp table 11 | } 12 | -------------------------------------------------------------------------------- /tests/rodpstest.R: -------------------------------------------------------------------------------- 1 | test.dir = "testout" 2 | base.dir = "testbase" 3 | cur.test = NULL 4 | current.prj = NULL 5 | other.prj = NULL 6 | 7 | 8 | mark <- function(case.title) { 9 | print(paste("#################case:", case.title, "##########################")) 10 | sink(sprintf("%s/%s.out", test.dir, case.title), type = c("output", "message")) 11 | cat(sprintf("########### RODPS %s###########\n", case.title)) 12 | } 13 | 14 | check.case <- function() { 15 | sink() 16 | } 17 | 18 | 19 | library("RODPS") 20 | rodps.init("./odps_config.ini_newdailyrun") 21 | current.prj <- rodps.project.current() 22 | rodps.bizid("012345^") 23 | rodps.set("hello", "world") 24 | rodps.set("foo", "bar") 25 | rodps.unset("foo") 26 | # 27 | mark("listtable") 28 | rodps.table.list() 29 | check.case() 30 | 31 | # 32 | mark("list_partition") 33 | rodps.sql("drop table test_partition") 34 | rodps.sql("create table test_partition(id string) partitioned by(pt string)") 35 | rodps.sql("alter table test_partition add partition(pt='20140531')") 36 | rodps.table.partitions("test_partition") 37 | check.case() 38 | 39 | # 40 | mark("create_drop_table") 41 | rodps.sql("create table if not exists rodps_drop_table(a string)") 42 | rodps.table.desc("rodps_drop_table") 43 | rodps.table.drop("rodps_drop_table") 44 | rodps.sql("create table if not exists rodps_drop_table(a string)") 45 | rodps.table.drop(paste(current.prj, ".", "rodps_drop_table", sep = "")) 46 | rodps.table.exist(paste(current.prj, "rodps_drop_table", sep = ".")) 47 | check.case() 48 | 49 | ## creat table[dual] for test rodps.sql('create table if not exists dual(a 50 | ## string)') check.case() 51 | 52 | # 53 | mark("desc_table") 54 | rodps.table.desc("dual") 55 | rodps.table.desc(paste(current.prj, ".", "dual", sep = "")) 56 | check.case() 57 | 58 | # 59 | mark("exist_table") 60 | rodps.table.exist("dual") 61 | rodps.table.exist(paste(current.prj, ".", "dual", sep = "")) 62 | check.case() 63 | 64 | # 65 | mark("select_table") 66 | x <- rodps.sql("select * from dual") 67 | x 68 | sapply(x, class) 69 | sapply(x, typeof) 70 | check.case() 71 | 72 | 73 | # 74 | mark("size_table") 75 | rodps.table.size("dual") 76 | rodps.table.size(paste(current.prj, ".", "dual", sep = "")) 77 | check.case() 78 | 79 | # 80 | mark("write_table") 81 | x <- data.frame(c1 = 1:10, c2 = 1:10) 82 | rodps.table.write(x, "rodps_write_table") 83 | 84 | names(iris) = gsub("\\.", "_", names(iris)) 85 | rodps.table.drop("iris_test") 86 | rodps.table.write(iris, "iris_test") 87 | check.case() 88 | 89 | # 90 | mark("load_table") 91 | rodps.table.read("dual") 92 | rodps.table.read(paste(current.prj, ".", "dual", sep = "")) 93 | 94 | rodps.table.drop("rodps_load_write_table") 95 | rodps.sql("create table rodps_load_write_table(c_couble double, c_string string, c_boolean boolean, c_datetime datetime, c_bigint bigint )") 96 | rodps.sql("insert into table rodps_load_write_table select 1.1, \"abc\",true,to_date(\"20130101\",\"yyyymmdd\"), 10000 from (select count(*) from rodps_load_write_table) a") 97 | rodps.sql("insert into table rodps_load_write_table select -1.1, \"ab\\nc\",false,to_date(\"99991231\",\"yyyymmdd\"), -10000 from (select count(*) from rodps_load_write_table) a") 98 | 99 | x <- rodps.table.read("rodps_load_write_table") 100 | x 101 | check.case() 102 | 103 | 104 | # 105 | mark("sample_srs") 106 | # test sample.srs 107 | rodps.table.drop("rodps_sample_src") 108 | rodps.table.drop("rodps_sample_tgt") 109 | 110 | x <- data.frame(c1 = 1:1000, c2 = rep(1:10, 100)) 111 | 112 | rodps.table.write(x, "rodps_sample_src") 113 | rodps.table.sample.srs("rodps_sample_src", "rodps_sample_tgt", 0.5) 114 | x <- rodps.table.read("rodps_sample_tgt") 115 | 116 | names(x) 117 | nrow(x) 118 | 119 | rodps.table.drop("rodps_sample_tgt") 120 | rodps.table.sample.srs("rodps_sample_src", "rodps_sample_tgt", 20, select = c("c1")) 121 | x <- rodps.table.read("rodps_sample_tgt") 122 | 123 | names(x) 124 | nrow(x) 125 | check.case() 126 | 127 | # test sample.strat 128 | mark("sample_strat") 129 | rodps.table.drop("rodps_sample_src") 130 | rodps.table.drop("rodps_sample_tgt") 131 | 132 | x <- data.frame(c1 = 1:1000, c2 = rep(1:10, 100), c3 = rep(1:2, 500)) 133 | rodps.table.write(x, "rodps_sample_src") 134 | 135 | rodps.table.sample.strat("rodps_sample_src", "rodps_sample_tgt", 0.5, strat = c("c3")) 136 | y <- rodps.table.read("rodps_sample_tgt") 137 | names(y) 138 | nrow(y) 139 | check.case() 140 | 141 | # test datetime loading 142 | mark("test_datetime") 143 | d <- rodps.table.read("rodps_load_write_table") 144 | summary(d) 145 | check.case() 146 | 147 | 148 | # 149 | mark("rodps_predict_rpart") 150 | library(rpart) 151 | names(iris) <- gsub("\\.", "_", names(iris)) 152 | rodps.table.drop("iris_tbl") 153 | rodps.table.drop("iris_predict") 154 | rodps.table.write(iris, "iris_tbl") 155 | fit = rpart(Species ~ ., data = iris) 156 | sql = rodps.predict.rpart(fit, "iris_tbl", "iris_predict", run = F) 157 | rodps.sql(sql) 158 | d = rodps.table.read("iris_predict") 159 | v = predict(fit) 160 | max_index <- function(v) { 161 | names(v)[which(v == max(v))] 162 | } 163 | v1 = apply(v, 1, max_index) 164 | any(d$species_predict != v1) 165 | check.case() 166 | 167 | -------------------------------------------------------------------------------- /tests/test_rodps_advanced.R: -------------------------------------------------------------------------------- 1 | library("RODPS") 2 | library(assertthat) 3 | 4 | # Init rodps config 5 | rodps.init("~/.config/odps_config.ini") 6 | 7 | # Upload iris dataset 8 | rodps.table.drop("iris") 9 | names(iris) <- gsub("\\.", "_", names(iris)) 10 | rodps.table.write(iris, "iris") 11 | 12 | # Table histgram 13 | rodps.table.hist(tblname = "iris", colname = "sepal_length", col = rainbow(10), freq = F) 14 | 15 | # Rpart prediction 16 | library(rpart) 17 | fit <- rpart(Species ~ ., data = iris) 18 | rodps.table.drop("iris_p") 19 | rodps.predict(fit, srctbl = "iris", tgttbl = "iris_p") 20 | rodps.table.read("iris_p") 21 | 22 | # Drop test table 23 | rodps.table.drop("iris") 24 | -------------------------------------------------------------------------------- /tests/test_rodps_basics.R: -------------------------------------------------------------------------------- 1 | library("RODPS") 2 | library(assertthat) 3 | 4 | # Init rodps config 5 | rodps.init("~/.config/odps_config.ini") 6 | 7 | # Upload iris dataset 8 | rodps.table.drop("iris") 9 | names(iris) <- gsub("\\.", "_", names(iris)) 10 | rodps.table.write(iris, "iris") 11 | 12 | # Show current project 13 | rodps.project.current() 14 | 15 | # Run plain SQL 16 | sql_str <- "select species, count(1) from iris group by species" 17 | result <- rodps.sql(sql_str) 18 | assert_that(is.data.frame(result)) # Resulted data frame fetched locally 19 | assert_that(nrow(result) == 3) 20 | result 21 | 22 | # Run plain SQL and return remote table 23 | result <- rodps.sql(sql_str, result.table.limit = 0L) 24 | assert_that(is.character(result)) 25 | result_table <- result[1] # Char vector whose first element is the resulted table name 26 | assert_that(rodps.table.rows(result_table) == 3) 27 | rodps.table.read(result_table) 28 | rodps.table.drop(result_table) 29 | 30 | # Run plain SQL with MCQA 31 | result <- rodps.sql(sql_str, mcqa = TRUE) 32 | assert_that(nrow(result) == 3) 33 | result 34 | 35 | # Drop test table 36 | rodps.table.drop("iris") 37 | -------------------------------------------------------------------------------- /tests/test_rodps_table.R: -------------------------------------------------------------------------------- 1 | library("RODPS") 2 | library(assertthat) 3 | 4 | # Init rodps config 5 | rodps.init("~/.config/odps_config.ini") 6 | 7 | # Reset test data 8 | if (rodps.table.exist("iris")) { 9 | rodps.table.drop("iris") 10 | } 11 | 12 | # Upload iris dataset 13 | names(iris) <- gsub("\\.", "_", names(iris)) 14 | rodps.table.write(iris, "iris") 15 | 16 | # DDL generation 17 | sqlddl <- rodps.generate.DDL("testiris", iris, "Iris dataset") 18 | sqlddl2 <- rodps.generate.DDL("testiris", iris) 19 | print(sqlddl) 20 | print(sqlddl2) 21 | 22 | # Table reading 23 | tbl1 <- rodps.table.read("iris") 24 | head(tbl1) 25 | 26 | # Basic table dimensions 27 | rodps.table.head("iris", n = 3) 28 | 29 | assert_that(rodps.table.exist("iris")) 30 | assert_that(!rodps.table.exist("iris-non-existed")) 31 | 32 | rodps.table.list() 33 | rodps.table.desc("iris") 34 | 35 | assert_that(rodps.table.rows("iris") == 150) 36 | assert_that(rodps.table.size("iris") >= 2380) 37 | assert_that(rodps.table.size("iris") <= 2400) 38 | 39 | # Table sampling 40 | if (rodps.table.exist("iris_sampled")) { 41 | rodps.table.drop("iris_sampled") 42 | } 43 | rodps.table.sample.srs("iris", "iris_sampled", 0.5) 44 | rodps.table.rows("iris_sampled") 45 | rodps.table.drop("iris_sampled") 46 | 47 | # Drop test table 48 | rodps.table.drop("iris") 49 | -------------------------------------------------------------------------------- /tools/format_code.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | basepath=$(cd "$(dirname "$0")";pwd)/.. 3 | R -e "library(formatR);tidy_dir('${basepath}/R', recursive=TRUE)" 4 | R -e "library(formatR);tidy_dir('${basepath}/tests', recursive=TRUE)" 5 | -------------------------------------------------------------------------------- /tools/gendoc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Regenerate documentations..." 3 | R --no-save -e "library(devtools);devtools::document()" -------------------------------------------------------------------------------- /tools/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | basepath=$(cd "$(dirname "$0")";pwd)/.. 3 | 4 | # fresh building java lib 5 | rm -rf $basepath/inst/java/rodps-*.jar >> /dev/null 2>&1 6 | 7 | sh $basepath/configure 8 | sh $basepath/tools/format_code.sh 9 | sh $basepath/tools/gendoc.sh 10 | 11 | echo "Check RODPS package..." 12 | R --no-save -e "library(devtools);devtools::check()" 13 | 14 | echo "Build RODPS package..." 15 | R --no-save -e "library(devtools);devtools::build(path = '.')" 16 | -------------------------------------------------------------------------------- /tools/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | basepath=$(cd "$(dirname "$0")";pwd)/.. 3 | 4 | sh $basepath/tools/package.sh 5 | 6 | echo "Release RODPS to CRAN..." 7 | R --no-save -e "library(devtools);devtools::release()" -------------------------------------------------------------------------------- /tools/test_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | basepath=$(cd "$(dirname "$0")";pwd)/.. 4 | R -f $basepath/tests/test_rodps_basics.R 5 | R -f $basepath/tests/test_rodps_table.R 6 | R -f $basepath/tests/test_rodps_advanced.R 7 | 8 | --------------------------------------------------------------------------------