├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
│   ├── append.R
│   ├── create.R
│   ├── internal.R
│   ├── replace.R
│   ├── table_definition.R
│   ├── upsert.R
│   └── upsert2.R
├── README.Rmd
├── README.md
├── cran-comments.md
├── man
│   ├── rs_append_table.Rd
│   ├── rs_cols_upsert_table.Rd
│   ├── rs_create_statement.Rd
│   ├── rs_create_table.Rd
│   ├── rs_replace_table.Rd
│   └── rs_upsert_table.Rd
└── redshiftTools.Rproj
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README.Rmd
4 | ^cran-comments\.md$
5 | ^LICENSE\.md$
6 | ^CRAN-RELEASE$
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: redshiftTools
2 | Type: Package
3 | Title: Amazon Redshift Tools
4 | Version: 1.0.1.900
5 | Authors@R: person("Pablo", "Seibelt", email = "pabloseibelt@sicarul.com",
6 | role = c("aut", "cre"))
7 | Depends:
8 | R (>= 3.3.0)
9 | Imports:
10 | DBI,
11 | aws.s3,
12 | purrr,
13 | progress
14 | Suggests:
15 | RJDBC,
16 | RPostgres
17 | Description: Efficiently upload data into an Amazon Redshift database using the approach recommended by Amazon.
18 | URL: https://github.com/sicarul/redshiftTools
19 | BugReports: https://github.com/sicarul/redshiftTools/issues
20 | License: MIT + file LICENSE
21 | LazyData: TRUE
22 | RoxygenNote: 6.1.1
23 | Collate:
24 | 'append.R'
25 | 'create.R'
26 | 'internal.R'
27 | 'replace.R'
28 | 'table_definition.R'
29 | 'upsert.R'
30 | 'upsert2.R'
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2016-2019
2 | COPYRIGHT HOLDER: Pablo Seibelt
3 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016-2019 Pablo Alejandro Seibelt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(rs_append_table)
4 | export(rs_cols_upsert_table)
5 | export(rs_create_statement)
6 | export(rs_create_table)
7 | export(rs_replace_table)
8 | export(rs_upsert_table)
9 | importFrom("aws.s3","bucket_exists")
10 | importFrom("aws.s3","delete_object")
11 | importFrom("aws.s3","put_object")
12 | importFrom("progress","progress_bar")
13 | importFrom("purrr","map")
14 | importFrom("purrr","map2")
15 | importFrom("utils","head")
16 | importFrom("utils","write.csv")
17 | importFrom(DBI,dbExecute)
18 | importFrom(DBI,dbGetQuery)
19 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | ## redshiftTools 1.0.1
2 |
3 | Fixed overly eager parallelization when uploading files, which caused issues when many processes were run at once.
4 |
5 | ## redshiftTools 1.0
6 |
7 | Initial CRAN release
8 |
--------------------------------------------------------------------------------
/R/append.R:
--------------------------------------------------------------------------------
1 | #' Append redshift table
2 | #'
3 | #' Upload a table to S3 and then append it to a Redshift Table.
4 | #' The table on redshift has to have the same structure and column ordering to work correctly.
5 | #'
6 | #' @param df a data frame
7 | #' @param dbcon an RPostgres/RJDBC connection to the redshift server
8 | #' @param table_name the name of the table to append to
9 | #' @param split_files optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.
10 | #' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME on environment if not specified.
11 | #' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION on environment if not specified.
12 | #' @param access_key the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID on environment if not specified.
13 | #' @param secret_key the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY on environment if not specified.
14 | #' @param session_token the session key with permissions for the bucket, this will be used instead of the access/secret keys if specified. Will look for AWS_SESSION_TOKEN on environment if not specified.
15 | #' @param iam_role_arn an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN on environment if not specified. If set, access_key and secret_key are ignored.
16 | #' @param wlm_slots number of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html
17 | #' @param additional_params Additional params to send to the COPY statement in Redshift
18 | #'
19 | #' @examples
20 | #' library(DBI)
21 | #'
22 | #' a=data.frame(a=seq(1,10000), b=seq(10000,1))
23 | #'
24 | #'\dontrun{
25 | #' con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
26 | #' host='my-redshift-url.amazon.com', port='5439',
27 | #' user='myuser', password='mypassword',sslmode='require')
28 | #'
29 | #' rs_append_table(df=a, dbcon=con, table_name='testTable',
30 | #' bucket="my-bucket", split_files=4)
31 | #'
32 | #' }
33 | #' @export
34 | rs_append_table = function(
35 | df,
36 | dbcon,
37 | table_name,
38 | split_files,
39 | bucket=Sys.getenv('AWS_BUCKET_NAME'),
40 | region=Sys.getenv('AWS_DEFAULT_REGION'),
41 | access_key=Sys.getenv('AWS_ACCESS_KEY_ID'),
42 | secret_key=Sys.getenv('AWS_SECRET_ACCESS_KEY'),
43 | session_token=Sys.getenv('AWS_SESSION_TOKEN'),
44 | iam_role_arn=Sys.getenv('AWS_IAM_ROLE_ARN'),
45 | wlm_slots=1,
46 | additional_params=''
47 | )
48 | {
49 |
50 | message('Initiating Redshift table append for table ',table_name)
51 |
52 | if(!inherits(df, 'data.frame')){
53 | warning("The df parameter must be a data.frame or an object compatible with it's interface")
54 | return(FALSE)
55 | }
56 | numRows = nrow(df)
57 | numCols = ncol(df)
58 |
59 | if(numRows == 0){
60 | warning("Empty dataset provided, will not try uploading")
61 | return(FALSE)
62 | }
63 |
64 | message(paste0("The provided data.frame has ", numRows, ' rows and ', numCols, ' columns'))
65 |
66 |
67 | if(missing(split_files)){
68 | split_files = splitDetermine(dbcon, numRows, as.numeric(object.size(df[1,])))
69 | }
70 | split_files = pmin(split_files, numRows)
71 |
72 | # Upload data to S3
73 | prefix = uploadToS3(df, bucket, split_files, access_key, secret_key, session_token, region)
74 |
75 | if(wlm_slots>1){
76 | queryStmt(dbcon,paste0("set wlm_query_slot_count to ", wlm_slots));
77 | }
78 |
79 | result = tryCatch({
80 | stageTable=s3ToRedshift(dbcon, table_name, bucket, prefix, region, access_key, secret_key, session_token, iam_role_arn, additional_params)
81 |
82 | # Use a single transaction
83 | queryStmt(dbcon, 'begin')
84 |
85 | message("Insert new rows")
86 | queryStmt(dbcon, sprintf('insert into %s select * from %s', table_name, stageTable))
87 |
88 | message("Drop staging table")
89 | queryStmt(dbcon, sprintf("drop table %s", stageTable))
90 |
91 | message("Committing changes")
92 | queryStmt(dbcon, "COMMIT;")
93 |
94 | return(TRUE)
95 | }, error = function(e) {
96 | warning(e$message)
97 | queryStmt(dbcon, 'ROLLBACK;')
98 | return(FALSE)
99 | }, finally = {
100 | message("Deleting temporary files from S3 bucket")
101 | deletePrefix(prefix, bucket, split_files, access_key, secret_key, session_token, region)
102 | })
103 |
104 | return (result)
105 | }
106 |
--------------------------------------------------------------------------------
/R/create.R:
--------------------------------------------------------------------------------
1 | #' Create a table from scratch, guessing the table schema
2 | #'
3 | #'
4 | #' @param df a data frame
5 | #' @param dbcon an RPostgres/RJDBC connection to the redshift server
6 | #' @param table_name the name of the table to create
7 | #' @param split_files optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.
8 | #' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME on environment if not specified.
9 | #' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION on environment if not specified.
10 | #' @param access_key the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID on environment if not specified.
11 | #' @param secret_key the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY on environment if not specified.
12 | #' @param session_token the session key with permissions for the bucket, this will be used instead of the access/secret keys if specified. Will look for AWS_SESSION_TOKEN on environment if not specified.
13 | #' @param iam_role_arn an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN on environment if not specified. If set, access_key and secret_key are ignored.
14 | #' @param wlm_slots number of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html
15 | #' @param sortkeys Column or columns to sort the table by
16 | #' @param sortkey_style Sortkey style, can be compound or interleaved http://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data-compare-sort-styles.html
17 | #' @param distkey Distkey column, can only be one; if chosen, the table is distributed across the cluster's nodes according to a hash of this column's values.
18 | #' @param distkey_style Distkey style, can be even or all, for the key distribution use the distkey parameter. http://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
19 | #' @param compression Add encoding for columns whose compression algorithm is easy to guess, for the rest you should upload it to Redshift and run ANALYZE COMPRESSION
20 | #' @param additional_params Additional params to send to the COPY statement in Redshift
21 | #'
22 | #' @examples
23 | #' library(DBI)
24 | #'
25 | #' a=data.frame(a=seq(1,10000), b=seq(10000,1))
26 | #'
27 | #'\dontrun{
28 | #' con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
29 | #' host='my-redshift-url.amazon.com', port='5439',
30 | #' user='myuser', password='mypassword',sslmode='require')
31 | #'
32 | #' rs_create_table(df=a, dbcon=con, table_name='testTable',
33 | #' bucket="my-bucket", split_files=4)
34 | #'
35 | #' }
36 | #' @export
37 | rs_create_table = function(
38 | df,
39 | dbcon,
40 | table_name=deparse(substitute(df)),
41 | split_files,
42 | bucket=Sys.getenv('AWS_BUCKET_NAME'),
43 | region=Sys.getenv('AWS_DEFAULT_REGION'),
44 | access_key=Sys.getenv('AWS_ACCESS_KEY_ID'),
45 | secret_key=Sys.getenv('AWS_SECRET_ACCESS_KEY'),
46 | session_token=Sys.getenv('AWS_SESSION_TOKEN'),
47 | iam_role_arn=Sys.getenv('AWS_IAM_ROLE_ARN'),
48 | wlm_slots=1,
49 | sortkeys,
50 | sortkey_style='compound',
51 | distkey,
52 | distkey_style='even',
53 | compression=T,
54 | additional_params=''
55 | )
56 | {
57 |
58 | message('Initiating Redshift table creation for table ',table_name)
59 |
60 | tableSchema = rs_create_statement(df, table_name = table_name, sortkeys=sortkeys,
61 | sortkey_style = sortkey_style, distkey=distkey, distkey_style = distkey_style,
62 | compression = compression)
63 |
64 | queryStmt(dbcon, tableSchema)
65 |
66 | return(rs_replace_table(df, dbcon, table_name, split_files, bucket, region, access_key, secret_key, session_token, iam_role_arn, wlm_slots, additional_params))
67 |
68 | }
69 |
--------------------------------------------------------------------------------
/R/internal.R:
--------------------------------------------------------------------------------
1 | # Internal utility functions used by the redshift tools
2 |
3 | if(getRversion() >= "2.15.1") utils::globalVariables(c("i", "obj"))
4 |
5 | #' @importFrom "aws.s3" "put_object" "bucket_exists"
6 | #' @importFrom "utils" "write.csv"
7 | #' @importFrom "purrr" "map2"
8 | #' @importFrom "progress" "progress_bar"
9 | uploadToS3 = function(data, bucket, split_files, key, secret, session, region){
10 | prefix=paste0(sample(rep(letters, 10),50),collapse = "")
11 | if(!bucket_exists(bucket, key=key, secret=secret, session=session, region=region)){
12 | stop("Bucket does not exist")
13 | }
14 |
15 | splitted = suppressWarnings(split(data, seq(1:split_files)))
16 |
17 | message(paste("Uploading", split_files, "files with prefix", prefix, "to bucket", bucket))
18 |
19 |
20 | pb <- progress_bar$new(total = split_files, format='Uploading file :current/:total [:bar]')
21 | pb$tick(0)
22 |
23 | upload_part = function(part, i){
24 | tmpFile = tempfile()
25 | s3Name=paste(bucket, "/", prefix, ".", formatC(i, width = 4, format = "d", flag = "0"), sep="")
26 | write.csv(part, gzfile(tmpFile, encoding="UTF-8"), na='', row.names=F, quote=T)
27 |
28 | r=put_object(file = tmpFile, object = s3Name, bucket = "", key=key, secret=secret,
29 | session=session, region=region)
30 | pb$tick()
31 | return(r)
32 | }
33 |
34 | res = map2 (splitted, 1:split_files, upload_part)
35 |
36 | if(length(which(!unlist(res))) > 0){
37 | warning("Error uploading data!")
38 | return(NA)
39 | }else{
40 | message("Upload to S3 complete!")
41 | return(prefix)
42 | }
43 | }
44 |
45 | #' @importFrom "aws.s3" "delete_object"
46 | #' @importFrom "purrr" "map"
47 | deletePrefix = function(prefix, bucket, split_files, key, secret, session, region){
48 |
49 | s3Names=paste(prefix, ".", formatC(1:split_files, width = 4, format = "d", flag = "0"), sep="")
50 |
51 | message(paste("Deleting", split_files, "files with prefix", prefix, "from bucket", bucket))
52 |
53 | pb <- progress_bar$new(total = split_files, format='Deleting file :current/:total [:bar]')
54 | pb$tick(0)
55 |
56 | deleteObj = function(obj){
57 | delete_object(obj, bucket, key=key, secret=secret, session=session, region=region)
58 | pb$tick()
59 | }
60 |
61 | res = map(s3Names, deleteObj)
62 | }
63 |
64 | #' @importFrom DBI dbGetQuery
65 | queryDo = function(dbcon, query){
66 | dbGetQuery(dbcon, query)
67 | }
68 |
69 | #' @importFrom DBI dbExecute
70 | queryStmt = function(dbcon, query){
71 | if(inherits(dbcon, 'JDBCConnection')){
72 | RJDBC::dbSendUpdate(dbcon, query)
73 | }else{
74 | dbExecute(dbcon, query)
75 | }
76 | }
77 |
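# Decide how many files to split the upload into: roughly one file per Redshift
# slice, a multiple of the slice count when the data is large (~256MB per file),
# and fewer files when the data is very small (~10MB per file).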
78 | splitDetermine = function(dbcon, numRows, rowSize){
79 | message("Getting number of slices from Redshift")
80 | slices = queryDo(dbcon,"select count(*) from stv_slices")
81 | slices_num = pmax(as.integer(round(slices[1,'count'])), 1)
82 | split_files = slices_num
83 |
84 | bigSplit = pmin(floor((numRows*rowSize)/(256*1024*1024)), 5000) # 256MB per file, up to 5000 files
85 | smallSplit = pmax(ceiling((numRows*rowSize)/(10*1024*1024)), 1) #10MB per file, very small files
86 |
87 | if(bigSplit > slices_num){
88 | split_files=slices_num*round(bigSplit/slices_num) # Round to nearest multiple of slices, optimizes the load
89 | }else if(smallSplit < slices_num){
90 | split_files=smallSplit
91 | }else{
92 | split_files=slices_num
93 | }
94 |
95 | message(sprintf("%s slices detected, will split into %s files", slices_num, split_files))
96 | return(split_files)
97 | }
98 |
99 |
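# Stage the uploaded S3 files into Redshift: create a temp table with the same
# structure as the target (LIKE), COPY the gzipped CSVs into it using either an
# IAM role or key-based credentials, and return the staging table name.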
100 | s3ToRedshift = function(dbcon, table_name, bucket, prefix, region, access_key, secret_key, session, iam_role_arn, additional_params){
101 | stageTable=paste0(sample(letters,16),collapse = "")
102 | # Create temporary table for staging data
103 | queryStmt(dbcon, sprintf("create temp table %s (like %s)", stageTable, table_name))
104 | copyStr = "copy %s from 's3://%s/%s.' region '%s' csv gzip ignoreheader 1 emptyasnull COMPUPDATE FALSE STATUPDATE FALSE %s %s"
105 | # Use IAM Role if available
106 | if (nchar(iam_role_arn) > 0) {
107 | credsStr = sprintf("iam_role '%s'", iam_role_arn)
108 | } else {
109 | # creds string now includes a token in case it is needed.
110 | if (session != '') {
111 | credsStr = sprintf("credentials 'aws_access_key_id=%s;aws_secret_access_key=%s;token=%s'", access_key, secret_key, session)
112 | } else {
113 | credsStr = sprintf("credentials 'aws_access_key_id=%s;aws_secret_access_key=%s'", access_key, secret_key)
114 | }
115 | }
116 | statement = sprintf(copyStr, stageTable, bucket, prefix, region, additional_params, credsStr)
117 | queryStmt(dbcon, statement)
118 |
119 | return(stageTable)
120 | }
121 |
--------------------------------------------------------------------------------
/R/replace.R:
--------------------------------------------------------------------------------
1 | #' Replace redshift table
2 | #'
3 | #' Upload a table to S3 and then load it with redshift, replacing the contents of that table.
4 | #' The table on redshift has to have the same structure and column ordering to work correctly.
5 | #'
6 | #' @param df a data frame
7 | #' @param dbcon an RPostgres/RJDBC connection to the redshift server
8 | #' @param table_name the name of the table to replace
9 | #' @param split_files optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.
10 | #' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME on environment if not specified.
11 | #' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION on environment if not specified.
12 | #' @param access_key the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID on environment if not specified.
13 | #' @param secret_key the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY on environment if not specified.
14 | #' @param session_token the session key with permissions for the bucket, this will be used instead of the access/secret keys if specified. Will look for AWS_SESSION_TOKEN on environment if not specified.
15 | #' @param iam_role_arn an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN on environment if not specified. If set, access_key and secret_key are ignored.
16 | #' @param wlm_slots number of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html
17 | #' @param additional_params Additional params to send to the COPY statement in Redshift
18 | #'
19 | #' @examples
20 | #' library(DBI)
21 | #'
22 | #' a=data.frame(a=seq(1,10000), b=seq(10000,1))
23 | #'
24 | #'\dontrun{
25 | #' con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
26 | #' host='my-redshift-url.amazon.com', port='5439',
27 | #' user='myuser', password='mypassword',sslmode='require')
28 | #'
29 | #' rs_replace_table(df=a, dbcon=con, table_name='testTable',
30 | #' bucket="my-bucket", split_files=4)
31 | #'
32 | #' }
33 | #' @export
34 | rs_replace_table = function(
35 | df,
36 | dbcon,
37 | table_name,
38 | split_files,
39 | bucket=Sys.getenv('AWS_BUCKET_NAME'),
40 | region=Sys.getenv('AWS_DEFAULT_REGION'),
41 | access_key=Sys.getenv('AWS_ACCESS_KEY_ID'),
42 | secret_key=Sys.getenv('AWS_SECRET_ACCESS_KEY'),
43 | session_token=Sys.getenv('AWS_SESSION_TOKEN'),
44 | iam_role_arn=Sys.getenv('AWS_IAM_ROLE_ARN'),
45 | wlm_slots=1,
46 | additional_params=''
47 | )
48 | {
49 |
50 | message('Initiating Redshift table replacement for table ',table_name)
51 |
52 | if(!inherits(df, 'data.frame')){
53 | warning("The df parameter must be a data.frame or an object compatible with it's interface")
54 | return(FALSE)
55 | }
56 | numRows = nrow(df)
57 | numCols = ncol(df)
58 |
59 | if(numRows == 0){
60 | warning("Empty dataset provided, will not try uploading")
61 | return(FALSE)
62 | }
63 |
64 | message(paste0("The provided data.frame has ", numRows, ' rows and ', numCols, ' columns'))
65 |
66 |
67 | if(missing(split_files)){
68 | split_files = splitDetermine(dbcon, numRows, as.numeric(object.size(df[1,])))
69 | }
70 | split_files = pmin(split_files, numRows)
71 |
72 | # Upload data to S3
73 | prefix = uploadToS3(df, bucket, split_files, access_key, secret_key, session_token, region)
74 |
75 | if(wlm_slots>1){
76 | queryStmt(dbcon,paste0("set wlm_query_slot_count to ", wlm_slots));
77 | }
78 |
79 | result = tryCatch({
80 | stageTable=s3ToRedshift(dbcon, table_name, bucket, prefix, region, access_key, secret_key, session_token, iam_role_arn, additional_params)
81 |
82 | # Use a single transaction
83 | queryStmt(dbcon, 'begin')
84 |
85 | message("Deleting target table for replacement")
86 | queryStmt(dbcon, sprintf("delete from %s", table_name))
87 |
88 | message("Insert new rows")
89 | queryStmt(dbcon, sprintf('insert into %s select * from %s', table_name, stageTable))
90 |
91 | message("Drop staging table")
92 | queryStmt(dbcon, sprintf("drop table %s", stageTable))
93 |
94 | message("Committing changes")
95 | queryStmt(dbcon, "COMMIT;")
96 |
97 | return(TRUE)
98 | }, error = function(e) {
99 | warning(e$message)
100 | queryStmt(dbcon, 'ROLLBACK;')
101 | return(FALSE)
102 | }, finally = {
103 | message("Deleting temporary files from S3 bucket")
104 | deletePrefix(prefix, bucket, split_files, access_key, secret_key, session_token, region)
105 | })
106 |
107 | return (result)
108 | }
109 |
--------------------------------------------------------------------------------
/R/table_definition.R:
--------------------------------------------------------------------------------
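# Pick the smallest power-of-two VARCHAR length (capped at Redshift's 65535
# maximum) that fits the longest value found in a character column.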
1 | #' @importFrom "utils" "head"
2 | calculateCharSize <- function(col){
3 | col=as.character(col)
4 | maxChar = max(nchar(col), na.rm=T)
5 | if(is.infinite(maxChar)){
6 | maxChar=1000
7 | warning('Empty column found, setting to 1024 length')
8 | }
9 |
10 | sizes = c(2^c(3:15),65535) # From 8 to 65535, max varchar size in redshift
11 | fsizes = sizes[ifelse(sizes>maxChar, T, F)]
12 | if(length(fsizes)==0){
13 | warning("Character column over maximum size of 65535, set to that value but will fail if not trimmed before uploading!")
14 | warning(paste0('Example offending value: ', head(col[nchar(col) > 65535], 1)))
15 | return(max(sizes, na.rm=T))
16 | }else{
17 | return(min(fsizes, na.rm=T))
18 | }
19 | }
20 |
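# Map an R column class to a Redshift column type; classes not handled by the
# switch fall through to the VARCHAR sizing logic below.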
21 | colToRedshiftType <- function(col, compression) {
22 | class = class(col)[[1]]
23 | switch(class,
24 | logical = {
25 | return('boolean')
26 | },
27 | numeric = {
28 | return('float8')
29 | },
30 | integer = {
31 | if(all(is.na(col))){ #Unknown column, all null
32 | return('int')
33 | }
34 | if(max(col, na.rm = T) < 2000000000){ # Max int is 2147483647 in Redshift
35 | return('int')
36 | } else if (max(col, na.rm=T) < 9200000000000000000){ #Max bigint is 9223372036854775807 in redshift, if bigger treat as numeric
37 | return('bigint')
38 | } else{
39 | return('numeric(38,0)')
40 | }
41 |
42 | },
43 | Date = {
44 | return('date')
45 | },
46 | POSIXct = {
47 | return('timestamp')
48 | },
49 | POSIXlt = {
50 | return('timestamp')
51 | }
52 |
53 | )
54 | charSize = calculateCharSize(col)
55 | if(compression==T){
56 | return(paste0('VARCHAR(', charSize, ') encode zstd'))
57 | }else{
58 | return(paste0('VARCHAR(', charSize, ')'))
59 | }
60 |
61 | }
62 |
63 |
64 | getRedshiftTypesForDataFrame <- function(df, compression) {
65 | return(
66 | sapply(
67 | df,
68 | FUN = colToRedshiftType,
69 | compression
70 | )
71 | )
72 | }
73 |
74 | #' Generate create table statement for Amazon Redshift
75 | #'
76 | #' This lets you easily generate a table schema from a data.frame, which allows for easily uploading to redshift afterwards.
77 | #'
78 | #' @param df the data.frame you want to upload to Amazon Redshift
79 | #' @param table_name the name of the table to create, if not specified it'll use the data.frame name
80 | #' @param sortkeys Column or columns to sort the table by
81 | #' @param sortkey_style Sortkey style, can be compound or interleaved http://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data-compare-sort-styles.html
82 | #' @param distkey Distkey column, can only be one; if chosen, the table is distributed across the cluster's nodes according to a hash of this column's values.
83 | #' @param distkey_style Distkey style, can be even or all, for the key distribution use the distkey parameter. http://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
84 | #' @param compression Add encoding for columns whose compression algorithm is easy to guess, for the rest you should upload it to Redshift and run ANALYZE COMPRESSION
85 | #' @examples
86 | #'
87 | #'n=1000
88 | #'testdf = data.frame(
89 | #'a=rep('a', n),
90 | #'b=c(1:n),
91 | #'c=rep(as.Date('2017-01-01'), n),
92 | #'d=rep(as.POSIXct('2017-01-01 20:01:32'), n),
93 | #'e=rep(as.POSIXlt('2017-01-01 20:01:32'), n),
94 | #'f=rep(paste0(rep('a', 4000), collapse=''), n) )
95 | #'
96 | #'cat(rs_create_statement(testdf, table_name='dm_great_table'))
97 | #'
98 | #' @export
99 | rs_create_statement <- function(
100 | df,
101 | table_name = deparse(substitute(df)),
102 | sortkeys,
103 | sortkey_style='compound',
104 | distkey,
105 | distkey_style='even',
106 | compression=T
107 | ){
108 | definitions = getRedshiftTypesForDataFrame(df, compression)
109 | fields = paste(names(definitions), definitions, collapse=',\n')
110 | sortkey_style=tolower(sortkey_style)
111 | distkey_style=tolower(distkey_style)
112 |
113 | if(ncol(df) > 1600){
114 | warning("Redshift doesn't support tables of more than 1600 columns")
115 | }
116 |
117 | dkey=''
118 | if(!missing(distkey)){
119 | dkey=paste0('diststyle key distkey(', distkey, ')\n')
120 | }else if (distkey_style=='all'){
121 | dkey=paste0('diststyle all\n')
122 | }else if (distkey_style!='even'){
123 | warning('Unknown distkey style: ', distkey_style)
124 | }
125 |
126 | skey=''
127 | if(!missing(sortkeys)){
128 | if(length(sortkeys) > 1){
129 | skeyvals = paste(sortkeys, collapse=', ')
130 | if(!sortkey_style %in% c('interleaved', 'compound')){
131 | warning('Unknown sortkey style: ', sortkey_style)
132 | }
133 | skey=paste0(sortkey_style, ' sortkey (', skeyvals, ')\n')
134 | }else{
135 | skey=paste0('sortkey(', sortkeys,')\n')
136 | }
137 | }
138 |
139 | return(paste0('CREATE TABLE ', table_name, ' (\n', fields, '\n)', dkey, skey,';'))
140 | }
141 |
--------------------------------------------------------------------------------
/R/upsert.R:
--------------------------------------------------------------------------------
1 | #' Upsert redshift table
2 | #'
3 | #' Upload a table to S3 and then load it with redshift, replacing rows with the same
4 | #' keys, and inserting rows with new keys.
5 | #' The table on redshift has to have the same structure and column ordering to work correctly.
6 | #'
7 | #' @param df a data frame
8 | #' @param dbcon an RPostgres/RJDBC connection to the redshift server
9 | #' @param table_name the name of the table to update/insert
10 | #' @param split_files optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.
11 | #' @param keys this optional vector contains the variables by which to upsert. If not defined, the upsert becomes an append.
12 | #' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME on environment if not specified.
13 | #' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION on environment if not specified.
14 | #' @param access_key the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID on environment if not specified.
15 | #' @param secret_key the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY on environment if not specified.
16 | #' @param session_token the session key with permissions for the bucket, this will be used instead of the access/secret keys if specified. Will look for AWS_SESSION_TOKEN on environment if not specified.
17 | #' @param iam_role_arn an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN on environment if not specified. If set, access_key and secret_key are ignored.
18 | #' @param wlm_slots number of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html
19 | #' @param additional_params Additional params to send to the COPY statement in Redshift
20 | #'
21 | #' @examples
22 | #' library(DBI)
23 | #'
24 | #' a=data.frame(a=seq(1,10000), b=seq(10000,1))
25 | #' n=head(a,n=5000)
26 | #' n$b=n$a
27 | #' nx=rbind(n, data.frame(a=seq(99999,104000), b=seq(104000,99999)))
28 | #'
29 | #'\dontrun{
30 | #' con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
31 | #' host='my-redshift-url.amazon.com', port='5439',
32 | #' user='myuser', password='mypassword',sslmode='require')
33 | #'
34 | #' rs_upsert_table(df=nx, dbcon=con, table_name='testTable',
35 | #' bucket="my-bucket", split_files=4, keys=c('a'))
36 | #'
37 | #'}
38 | #' @export
39 | rs_upsert_table = function(
40 | df,
41 | dbcon,
42 | table_name,
43 | keys,
44 | split_files,
45 | bucket=Sys.getenv('AWS_BUCKET_NAME'),
46 | region=Sys.getenv('AWS_DEFAULT_REGION'),
47 | access_key=Sys.getenv('AWS_ACCESS_KEY_ID'),
48 | secret_key=Sys.getenv('AWS_SECRET_ACCESS_KEY'),
49 | session_token=Sys.getenv('AWS_SESSION_TOKEN'),
50 | iam_role_arn=Sys.getenv('AWS_IAM_ROLE_ARN'),
51 | wlm_slots=1,
52 | additional_params=''
53 | )
54 | {
55 |
56 | message('Initiating Redshift table upsert for table ',table_name)
57 |
58 | if(!inherits(df, 'data.frame')){
59 | warning("The df parameter must be a data.frame or an object compatible with it's interface")
60 | return(FALSE)
61 | }
62 | numRows = nrow(df)
63 | numCols = ncol(df)
64 |
65 | if(numRows == 0){
66 | warning("Empty dataset provided, will not try uploading")
67 | return(FALSE)
68 | }
69 |
70 | message(paste0("The provided data.frame has ", numRows, ' rows and ', numCols, ' columns'))
71 |
72 | if(missing(split_files)){
73 | split_files = splitDetermine(dbcon, numRows, as.numeric(object.size(df[1,])))
74 | }
75 | split_files = pmin(split_files, numRows)
76 |
77 | # Upload data to S3
78 | prefix = uploadToS3(df, bucket, split_files, access_key, secret_key, session_token, region)
79 |
80 | if(wlm_slots>1){
81 | queryStmt(dbcon,paste0("set wlm_query_slot_count to ", wlm_slots));
82 | }
83 |
84 | result = tryCatch({
85 | stageTable=s3ToRedshift(dbcon, table_name, bucket, prefix, region, access_key, secret_key, session_token, iam_role_arn, additional_params)
86 |
87 | # Use a single transaction
88 | queryStmt(dbcon, 'begin')
89 |
90 |
91 | if(!missing(keys)){
92 | # where stage.key = table.key and...
93 | keysCond = paste(stageTable,".",keys, "=", table_name,".",keys, sep="")
94 | keysWhere = sub(" and $", "", paste0(keysCond, collapse="", sep=" and "))
95 |
96 | queryStmt(dbcon, sprintf('delete from %s using %s where %s',
97 | table_name,
98 | stageTable,
99 | keysWhere
100 | ))
101 | }
102 |
103 | message("Insert new rows")
104 | queryStmt(dbcon, sprintf('insert into %s select * from %s', table_name, stageTable))
105 |
106 | message("Drop staging table")
107 | queryStmt(dbcon, sprintf("drop table %s", stageTable))
108 |
109 | message("Commiting")
110 | queryStmt(dbcon, "COMMIT;")
111 |
112 | return(TRUE)
113 | }, warning = function(w) {
114 | warning(w)
115 | }, error = function(e) {
116 | warning(e$message)
117 | queryStmt(dbcon, 'ROLLBACK;')
118 | return(FALSE)
119 | }, finally = {
120 | message("Deleting temporary files from S3 bucket")
121 | deletePrefix(prefix, bucket, split_files, access_key, secret_key, session_token, region)
122 | })
123 |
124 | return (result)
125 | }
126 |
--------------------------------------------------------------------------------
/R/upsert2.R:
--------------------------------------------------------------------------------
1 | #' Upsert Redshift table by specifying a column list
2 | #'
3 | #' Upload a table to S3 and then load to Redshift, replacing the target value
4 | #' in existing rows that have the same keys, and inserting rows with new keys.
5 | #' New rows must match structure and column ordering of existing Redshift table.
6 | #'
7 | #' @param dat a data frame
8 | #' @param dbcon an RPostgres/RJDBC connection to the redshift server
9 | #' @param table_name the name of the table to update/insert
10 | #' @param split_files optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.
11 | #' @param values the columns that will be updated
12 | #' @param keys this optional vector contains the variables by which to upsert. If not defined, the upsert becomes an append.
13 | #' @param bucket the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME on environment if not specified.
14 | #' @param region the region of the bucket. Will look for AWS_DEFAULT_REGION on environment if not specified.
15 | #' @param access_key the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID on environment if not specified.
16 | #' @param secret_key the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY on environment if not specified.
17 | #' @param session_token the session key with permissions for the bucket, this will be used instead of the access/secret keys if specified. Will look for AWS_SESSION_TOKEN on environment if not specified.
18 | #' @param iam_role_arn an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN on environment if not specified. If set, access_key and secret_key are ignored.
19 | #' @param wlm_slots number of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html
20 | #' @param additional_params Additional params to send to the COPY statement in Redshift
21 | #'
22 | #' @examples
23 | #' library(DBI)
24 | #'
25 | #' a=data.frame(a=seq(1,10000), b=seq(10000,1))
26 | #' n=head(a,n=5000)
27 | #' n$b=n$a
28 | #' nx=rbind(n, data.frame(a=seq(99999,104000), b=seq(104000,99999)))
29 | #'
30 | #'\dontrun{
31 | #' con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
32 | #' host='my-redshift-url.amazon.com', port='5439',
33 | #' user='myuser', password='mypassword',sslmode='require')
34 | #'
35 | #' rs_cols_upsert_table(dat=nx, dbcon=con, table_name='testTable',
36 | #' bucket="my-bucket", split_files=4, values=c('b'), keys=c('a'))
37 | #'}
38 | #' @seealso \url{https://docs.aws.amazon.com/redshift/latest/dg/merge-specify-a-column-list.html}
39 | #' @seealso \url{http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html}
40 | #' @export
41 | rs_cols_upsert_table = function(dat,
42 | dbcon,
43 | table_name,
44 | values,
45 | keys,
46 | split_files,
47 | bucket = Sys.getenv('AWS_BUCKET_NAME'),
48 | region = Sys.getenv('AWS_DEFAULT_REGION'),
49 | access_key = Sys.getenv('AWS_ACCESS_KEY_ID'),
50 | secret_key = Sys.getenv('AWS_SECRET_ACCESS_KEY'),
51 | session_token=Sys.getenv('AWS_SESSION_TOKEN'),
52 | iam_role_arn = Sys.getenv('AWS_IAM_ROLE_ARN'),
53 | wlm_slots = 1,
54 | additional_params = '') {
55 |
56 | message('Initiating Redshift table upsert for table ',table_name)
57 |
58 | if (!inherits(dat, 'data.frame')) {
59 | warning("dat must be a data.frame or inherit from data.frame")
60 | return(FALSE)
61 | }
62 | numRows = nrow(dat)
63 | numCols = ncol(dat)
64 |
65 | if (numRows == 0) {
66 | warning("Empty dataset provided, will not try uploading")
67 | return(FALSE)
68 | }
69 |
70 | message(paste0("The provided data.frame has ", numRows, ' rows and ', numCols, ' columns'))
71 |
72 | if (missing(split_files)) {
73 | split_files = splitDetermine(dbcon, numRows, as.numeric(object.size(dat[1,])))
74 | }
75 | split_files = pmin(split_files, numRows)
76 |
77 | # Upload data to S3
78 | prefix = uploadToS3(dat, bucket, split_files, access_key, secret_key, session_token, region)
79 |
80 | if (wlm_slots > 1) {
81 | queryStmt(dbcon, paste0("set wlm_query_slot_count to ", wlm_slots))
82 | }
83 |
84 | result = tryCatch({
85 | stageTable = s3ToRedshift(
86 | dbcon,
87 | table_name,
88 | bucket,
89 | prefix,
90 | region,
91 | access_key,
92 | secret_key,
93 | session_token,
94 | iam_role_arn,
95 | additional_params
96 | )
97 |
98 | # Use a single transaction
99 | queryStmt(dbcon, 'begin')
100 |
101 | # values to update
102 | setValues = paste(values, "=", stageTable, ".", values,
103 | sep="", collapse=", ")
104 | # check that the values actually differ from existing ones
105 | changedValues = paste(table_name, ".", values, "!=",
106 | stageTable, ".", values,
107 | sep="", collapse=" OR ")
108 |
109 | # keys aren't optional
110 | # where stage.key = table.key and...
111 | keysWhere = paste(stageTable, ".", keys, "=", table_name, ".", keys,
112 | sep="", collapse=" and ")
113 | qu <- sprintf('UPDATE %s \nSET %s \nFROM %s \nWHERE %s \nAND (%s)',
114 | table_name,
115 | setValues,
116 | stageTable,
117 | keysWhere,
118 | changedValues
119 | )
120 | message(qu)
121 | res <- queryStmt(dbcon, qu)
122 | message(res, " rows affected")
123 |
124 | qu <- sprintf('DELETE FROM %s \nUSING %s \nWHERE %s',
125 | stageTable,
126 | table_name,
127 | keysWhere
128 | )
129 | message(qu)
130 | res <- queryStmt(dbcon, qu)
131 | message(res, " rows affected")
132 |
133 | message("Insert new rows")
134 | qu <- sprintf('INSERT INTO %s \nSELECT * FROM %s',
135 | table_name,
136 | stageTable)
137 | message(qu)
138 | res <- queryStmt(dbcon, qu)
139 | message(res, " rows affected")
140 |
141 | message("Commiting")
142 | queryStmt(dbcon, "COMMIT;")
143 |
144 | qu <- sprintf("DROP TABLE %s", stageTable)
145 | message(qu)
146 | res <- queryStmt(dbcon, qu)
147 | message(res, " rows affected")
148 |
149 | return(TRUE)
150 | }, warning = function(w) {
151 | warning(w)
152 | }, error = function(e) {
153 | warning(e$message)
154 | queryStmt(dbcon, 'ROLLBACK;')
155 | return(FALSE)
156 | }, finally = {
157 | message("Deleting temporary files from S3 bucket")
158 | deletePrefix(prefix, bucket, split_files, access_key, secret_key, session_token, region)
159 | })
160 |
161 | return (result)
162 | }
163 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output:
3 | md_document:
4 | variant: markdown_github
5 | ---
6 |
7 |
8 |
9 | ```{r setup, include=FALSE}
10 | knitr::opts_chunk$set(echo = TRUE)
11 | ```
12 |
13 |
14 | 
15 | 
16 |
17 |
18 |
19 | # redshiftTools
20 |
21 | This is an R package meant to ease common operations with Amazon Redshift. The first motivation for this package was making bulk uploads easier: the procedure for uploading data consists of generating several CSV files, uploading them to an S3 bucket and then running a COPY command on the server. This package encapsulates all of those tasks in simple functions.
22 |
23 |
24 | ## Installation
25 |
26 | To install the latest CRAN version, you'll need to execute:
27 |
28 | ``` r
29 | install.packages('redshiftTools')
30 | ```
31 |
32 | If instead you want to install the latest github master version:
33 |
34 | ``` r
35 | devtools::install_github("sicarul/redshiftTools")
36 | ```
37 |
38 |
39 | ## Drivers
40 |
41 | This library supports two official ways of connecting to Amazon Redshift (others may work, but are untested):
42 |
43 | ### RPostgres
44 | This Postgres library is great, and it works even with Amazon Redshift servers with SSL enabled. It previously didn't support transactions, but it does now, and it is the recommended way to work with redshiftTools.
45 |
46 | To use it, please configure like this:
47 |
48 | ``` r
49 | devtools::install_github("r-dbi/RPostgres")
50 | library(RPostgres)
51 |
52 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
53 | host='my-redshift-url.amazon.com', port='5439',
54 | user='myuser', password='mypassword',sslmode='require')
55 | test=dbGetQuery(con, 'select 1')
56 | ```
57 |
58 | ### RJDBC
59 | If you download the official Redshift driver .jar, you can use it with this R library. It's not ideal in the sense that it doesn't implement all the standard DBI interfaces (so you can't use it with dplyr, for example), but it works fine for uploading data.
60 |
61 | To use it, please configure like this:
62 |
63 | ``` r
64 | install.packages('RJDBC')
65 | library(RJDBC)
66 |
67 | # Save the driver into a directory
68 | dir.create('~/.redshiftTools')
69 | # - Check your AWS Dashboard to get the latest URL instead of this version -
70 | download.file('http://s3.amazonaws.com/redshift-downloads/drivers/RedshiftJDBC41-1.1.9.1009.jar','~/.redshiftTools/redshift-driver.jar')
71 |
72 | # Use Redshift driver
73 | driver <- JDBC("com.amazon.redshift.jdbc41.Driver", "~/.redshiftTools/redshift-driver.jar", identifier.quote="`")
74 |
75 | # Create connection, in production, you may want to move these variables to a .env file with library dotenv, or other methods.
76 | dbname="dbname"
77 | host='my-redshift-url.amazon.com'
78 | port='5439'
79 | user='myuser'
80 | password='mypassword'
81 | ssl='true'
82 | url <- sprintf("jdbc:redshift://%s:%s/%s?tcpKeepAlive=true&ssl=%s&sslfactory=com.amazon.redshift.ssl.NonValidatingFactory", host, port, dbname, ssl)
83 | conn <- dbConnect(driver, url, user, password)
84 |
85 | ```
86 |
87 | ## Usage
88 |
89 | ### Creating tables
90 |
91 | For creating tables, there is a support function, `rs_create_statement`, which receives a data.frame and returns the query for creating the same table in Amazon Redshift.
92 |
93 | ``` r
94 | n=1000
95 | testdf = data.frame(
96 | a=rep('a', n),
97 | b=c(1:n),
98 | c=rep(as.Date('2017-01-01'), n),
99 | d=rep(as.POSIXct('2017-01-01 20:01:32'), n),
100 | e=rep(as.POSIXlt('2017-01-01 20:01:32'), n),
101 | f=rep(paste0(rep('a', 4000), collapse=''), n) )
102 |
103 | cat(rs_create_statement(testdf, table_name='dm_great_table'))
104 |
105 | ```
106 |
107 | This returns:
108 |
109 | ``` sql
110 | CREATE TABLE dm_great_table (
111 | a VARCHAR(8),
112 | b int,
113 | c date,
114 | d timestamp,
115 | e timestamp,
116 | f VARCHAR(4096)
117 | );
118 | ```
119 |
120 | The cat is only there to display the statement properly in the console; it isn't called inside the function, in case you need to pass the string to another function (like a query runner).
121 |
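For instance, assuming you already have an open connection `con` like the one created in the Drivers section above, a minimal sketch of executing the generated statement directly would be:

``` r
library(DBI)

# Build the CREATE TABLE statement from the data.frame and run it on Redshift
dbExecute(con, rs_create_statement(testdf, table_name='dm_great_table'))
```
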
122 | ### Uploading data
123 |
124 | For uploading data, two functions are now available: `rs_replace_table` and `rs_upsert_table`. Both are called with almost the same parameters, except that for upserts you can specify the keys used to match existing rows.
125 |
126 | For example, suppose we have a table with 2 integer columns to load; we could use the following code:
127 |
128 | ``` r
129 | library("aws.s3")
130 | library(RPostgres)
131 | library(redshiftTools)
132 |
133 | a=data.frame(a=seq(1,10000), b=seq(10000,1))
134 | n=head(a,n=10)
135 | n$b=n$a
136 | nx=rbind(n, data.frame(a=seq(5,10), b=seq(10,5)))
137 |
138 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
139 | host='my-redshift-url.amazon.com', port='5439',
140 | user='myuser', password='mypassword',sslmode='require')
141 |
142 | b=rs_replace_table(a, dbcon=con, table_name='mytable', bucket="mybucket", split_files=4)
143 | c=rs_upsert_table(nx, dbcon=con, table_name = 'mytable', split_files=4, bucket="mybucket", keys=c('a'))
144 |
145 | ```
146 |
147 |
148 | ### Creating tables with data
149 |
150 | A combination of `rs_create_statement` and `rs_replace_table` can be found in `rs_create_table`. You can create a table from scratch from R and upload the contents of the data frame, without needing to write any SQL code at all.
151 |
152 |
153 | ``` r
154 | library("aws.s3")
155 | library(RPostgres)
156 | library(redshiftTools)
157 |
158 | a=data.frame(a=seq(1,10000), b=seq(10000,1))
159 |
160 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
161 | host='my-redshift-url.amazon.com', port='5439',
162 | user='myuser', password='mypassword',sslmode='require')
163 |
164 | b=rs_create_table(a, dbcon=con, table_name='mytable', bucket="mybucket", split_files=4)
165 |
166 |
167 | ```
168 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | redshiftTools
3 | =============
4 |
5 | This is an R package meant to ease common operations with Amazon
6 | Redshift. The first motivation for this package was making bulk
7 | uploads easier: the procedure for uploading data consists of
8 | generating several CSV files, uploading them to an S3 bucket and
9 | then running a COPY command on the server. This package encapsulates
10 | all of those tasks in simple functions.
11 |
12 | # WARNING
13 |
14 | This package is not being maintained, however this fork is being maintained instead:
15 | https://github.com/RedOakStrategic/redshiftTools
16 |
17 |
18 | Installation
19 | ------------
20 |
21 | To install the latest CRAN version, you’ll need to execute:
22 |
23 | ``` r
24 | install.packages('redshiftTools')
25 | ```
26 |
27 | If instead you want to install the latest github master version:
28 |
29 | ``` r
30 | devtools::install_github("sicarul/redshiftTools")
31 | ```
32 |
33 | Drivers
34 | -------
35 |
36 | This library supports two official ways of connecting to Amazon Redshift
37 | (others may work, but are untested):
38 |
39 | ### RPostgres
40 |
41 | This Postgres library is great, and it works even with Amazon Redshift
42 | servers with SSL enabled. It previously didn’t support transactions, but
43 | it does now, and it is the recommended way to work with redshiftTools.
44 |
45 | To use it, please configure like this:
46 |
47 | ``` r
48 | devtools::install_github("r-dbi/RPostgres")
49 | library(RPostgres)
50 |
51 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
52 | host='my-redshift-url.amazon.com', port='5439',
53 | user='myuser', password='mypassword',sslmode='require')
54 | test=dbGetQuery(con, 'select 1')
55 | ```
56 |
57 | ### RJDBC
58 |
59 | If you download the official Redshift driver .jar, you can use it with
60 | this R library. It’s not ideal in the sense that it doesn’t implement
61 | all the standard DBI interfaces (so you can’t use it with dplyr, for
62 | example), but it works fine for uploading data.
63 |
64 | To use it, please configure like this:
65 |
66 | ``` r
67 | install.packages('RJDBC')
68 | library(RJDBC)
69 |
70 | # Save the driver into a directory
71 | dir.create('~/.redshiftTools')
72 | # - Check your AWS Dashboard to get the latest URL instead of this version -
73 | download.file('http://s3.amazonaws.com/redshift-downloads/drivers/RedshiftJDBC41-1.1.9.1009.jar','~/.redshiftTools/redshift-driver.jar')
74 |
75 | # Use Redshift driver
76 | driver <- JDBC("com.amazon.redshift.jdbc41.Driver", "~/.redshiftTools/redshift-driver.jar", identifier.quote="`")
77 |
78 | # Create connection, in production, you may want to move these variables to a .env file with library dotenv, or other methods.
79 | dbname="dbname"
80 | host='my-redshift-url.amazon.com'
81 | port='5439'
82 | user='myuser'
83 | password='mypassword'
84 | ssl='true'
85 | url <- sprintf("jdbc:redshift://%s:%s/%s?tcpKeepAlive=true&ssl=%s&sslfactory=com.amazon.redshift.ssl.NonValidatingFactory", host, port, dbname, ssl)
86 | conn <- dbConnect(driver, url, user, password)
87 | ```
88 |
89 | Usage
90 | -----
91 |
92 | ### Creating tables
93 |
94 | For creating tables, there is a support function, `rs_create_statement`,
95 | which receives a data.frame and returns the query for creating the same
96 | table in Amazon Redshift.
97 |
98 | ``` r
99 | n=1000
100 | testdf = data.frame(
101 | a=rep('a', n),
102 | b=c(1:n),
103 | c=rep(as.Date('2017-01-01'), n),
104 | d=rep(as.POSIXct('2017-01-01 20:01:32'), n),
105 | e=rep(as.POSIXlt('2017-01-01 20:01:32'), n),
106 | f=rep(paste0(rep('a', 4000), collapse=''), n) )
107 |
108 | cat(rs_create_statement(testdf, table_name='dm_great_table'))
109 | ```
110 |
111 | This returns:
112 |
113 | ``` sql
114 | CREATE TABLE dm_great_table (
115 | a VARCHAR(8),
116 | b int,
117 | c date,
118 | d timestamp,
119 | e timestamp,
120 | f VARCHAR(4096)
121 | );
122 | ```
123 |
124 | The cat is only there to display the statement properly in the console;
125 | it isn’t called inside the function, in case you need to pass the string
126 | to another function (like a query runner).
127 |
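For instance, assuming you already have an open connection `con` like the
one created in the Drivers section above, a minimal sketch of executing
the generated statement directly would be:

``` r
library(DBI)

# Build the CREATE TABLE statement from the data.frame and run it on Redshift
dbExecute(con, rs_create_statement(testdf, table_name='dm_great_table'))
```
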
128 | ### Uploading data
129 |
130 | For uploading data, two functions are now available:
131 | `rs_replace_table` and `rs_upsert_table`. Both are called with almost
132 | the same parameters, except that for upserts you can specify the keys
133 | used to match existing rows.
134 |
135 | For example, suppose we have a table with 2 integer columns to load; we
136 | could use the following code:
137 |
138 | ``` r
139 | library("aws.s3")
140 | library(RPostgres)
141 | library(redshiftTools)
142 |
143 | a=data.frame(a=seq(1,10000), b=seq(10000,1))
144 | n=head(a,n=10)
145 | n$b=n$a
146 | nx=rbind(n, data.frame(a=seq(5,10), b=seq(10,5)))
147 |
148 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
149 | host='my-redshift-url.amazon.com', port='5439',
150 | user='myuser', password='mypassword',sslmode='require')
151 |
152 | b=rs_replace_table(a, dbcon=con, table_name='mytable', bucket="mybucket", split_files=4)
153 | c=rs_upsert_table(nx, dbcon=con, table_name = 'mytable', split_files=4, bucket="mybucket", keys=c('a'))
154 | ```
155 |
156 | ### Creating tables with data
157 |
158 | A combination of `rs_create_statement` and `rs_replace_table` can be
159 | found in `rs_create_table`. You can create a table from scratch from R
160 | and upload the contents of the data frame, without needing to write any
161 | SQL code at all.
162 |
163 | ``` r
164 | library("aws.s3")
165 | library(RPostgres)
166 | library(redshiftTools)
167 |
168 | a=data.frame(a=seq(1,10000), b=seq(10000,1))
169 |
170 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
171 | host='my-redshift-url.amazon.com', port='5439',
172 | user='myuser', password='mypassword',sslmode='require')
173 |
174 | b=rs_create_table(a, dbcon=con, table_name='mytable', bucket="mybucket", split_files=4)
175 |
176 | ```
177 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## Test environments
2 | * local OS X install, R 3.4.3
3 | * ubuntu 12.04 (on travis-ci), R 3.4.3
4 | * win-builder (devel and release)
5 |
6 | ## R CMD check results
7 |
8 | 0 errors | 0 warnings | 1 note
9 |
10 | * This is a new release.
11 |
12 | ## Reverse dependencies
13 |
14 | There are no reverse dependencies.
15 |
16 | ---
17 |
18 | * Examples are all wrapped in \dontrun{} because they require a connection to an actual Amazon Redshift database and S3 bucket to work; otherwise they would fail every time a check is run.
19 |
20 | * Updating to fix an issue with excessive parallelization.
21 |
--------------------------------------------------------------------------------
/man/rs_append_table.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/append.R
3 | \name{rs_append_table}
4 | \alias{rs_append_table}
5 | \title{Append redshift table}
6 | \usage{
7 | rs_append_table(df, dbcon, table_name, split_files,
8 | bucket = Sys.getenv("AWS_BUCKET_NAME"),
9 | region = Sys.getenv("AWS_DEFAULT_REGION"),
10 | access_key = Sys.getenv("AWS_ACCESS_KEY_ID"),
11 | secret_key = Sys.getenv("AWS_SECRET_ACCESS_KEY"),
12 | session_token = Sys.getenv("AWS_SESSION_TOKEN"),
13 | iam_role_arn = Sys.getenv("AWS_IAM_ROLE_ARN"), wlm_slots = 1,
14 | additional_params = "")
15 | }
16 | \arguments{
17 | \item{df}{a data frame}
18 |
19 | \item{dbcon}{an RPostgres/RJDBC connection to the redshift server}
20 |
21 | \item{table_name}{the name of the table to append to}
22 |
23 | \item{split_files}{optional parameter to specify amount of files to split into. If not specified will look at amount of slices in Redshift to determine an optimal amount.}
24 |
25 | \item{bucket}{the name of the temporary bucket to load the data. Will look for AWS_BUCKET_NAME on environment if not specified.}
26 |
27 | \item{region}{the region of the bucket. Will look for AWS_DEFAULT_REGION on environment if not specified.}
28 |
29 | \item{access_key}{the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID on environment if not specified.}
30 |
31 | \item{secret_key}{the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY on environment if not specified.}
32 |
33 | \item{session_token}{the session key with permissions for the bucket, this will be used instead of the access/secret keys if specified. Will look for AWS_SESSION_TOKEN on environment if not specified.}
34 |
35 | \item{iam_role_arn}{an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN on environment if not specified. If set, access_key and secret_key are ignored.}
36 |
37 | \item{wlm_slots}{amount of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html}
38 |
39 | \item{additional_params}{Additional params to send to the COPY statement in Redshift}
40 | }
41 | \description{
42 | Upload a table to S3 and then append it to a Redshift Table.
43 | The table on redshift has to have the same structure and column ordering to work correctly.
44 | }
45 | \examples{
46 | library(DBI)
47 |
48 | a=data.frame(a=seq(1,10000), b=seq(10000,1))
49 |
50 | \dontrun{
51 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
52 | host='my-redshift-url.amazon.com', port='5439',
53 | user='myuser', password='mypassword',sslmode='require')
54 |
55 | rs_append_table(df=a, dbcon=con, table_name='testTable',
56 | bucket="my-bucket", split_files=4)
57 |
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/man/rs_cols_upsert_table.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/upsert2.R
3 | \name{rs_cols_upsert_table}
4 | \alias{rs_cols_upsert_table}
5 | \title{Upsert Redshift table by specifying a column list}
6 | \usage{
7 | rs_cols_upsert_table(dat, dbcon, table_name, values, keys, split_files,
8 | bucket = Sys.getenv("AWS_BUCKET_NAME"),
9 | region = Sys.getenv("AWS_DEFAULT_REGION"),
10 | access_key = Sys.getenv("AWS_ACCESS_KEY_ID"),
11 | secret_key = Sys.getenv("AWS_SECRET_ACCESS_KEY"),
12 | session_token = Sys.getenv("AWS_SESSION_TOKEN"),
13 | iam_role_arn = Sys.getenv("AWS_IAM_ROLE_ARN"), wlm_slots = 1,
14 | additional_params = "")
15 | }
16 | \arguments{
17 | \item{dat}{a data frame}
18 |
19 | \item{dbcon}{an RPostgres/RJDBC connection to the Redshift server}
20 |
21 | \item{table_name}{the name of the table to update/insert}
22 |
23 | \item{values}{the columns that will be updated}
24 |
25 | \item{keys}{this optional vector contains the variables by which to upsert. If not defined, the upsert becomes an append.}
26 |
27 | \item{split_files}{optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.}
28 |
29 | \item{bucket}{the name of the temporary bucket used to load the data. Will look for AWS_BUCKET_NAME in the environment if not specified.}
30 |
31 | \item{region}{the region of the bucket. Will look for AWS_DEFAULT_REGION in the environment if not specified.}
32 |
33 | \item{access_key}{the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID in the environment if not specified.}
34 |
35 | \item{secret_key}{the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY in the environment if not specified.}
36 |
37 | \item{session_token}{the session token with permissions for the bucket; if specified, it is used instead of the access/secret keys. Will look for AWS_SESSION_TOKEN in the environment if not specified.}
38 |
39 | \item{iam_role_arn}{an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN in the environment if not specified. If set, access_key and secret_key are ignored.}
40 |
41 | \item{wlm_slots}{number of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html}
42 |
43 | \item{additional_params}{additional parameters to send to the COPY statement in Redshift}
44 | }
45 | \description{
46 | Upload a table to S3 and then load it into Redshift, replacing the target values
47 | in existing rows that have the same keys, and inserting rows with new keys.
48 | New rows must match the structure and column ordering of the existing Redshift table.
49 | }
50 | \examples{
51 | library(DBI)
52 |
53 | a=data.frame(a=seq(1,10000), b=seq(10000,1))
54 | n=head(a,n=5000)
55 | n$b=n$a
56 | nx=rbind(n, data.frame(a=seq(99999,104000), b=seq(104000,99999)))
57 |
58 | \dontrun{
59 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
60 | host='my-redshift-url.amazon.com', port='5439',
61 | user='myuser', password='mypassword',sslmode='require')
62 |
63 | rs_cols_upsert_table(dat=nx, dbcon=con, table_name='testTable',
64 | bucket="my-bucket", split_files=4, values=c('b'), keys=c('a'))
65 | }
66 | }
67 | \seealso{
68 | \url{https://docs.aws.amazon.com/redshift/latest/dg/merge-specify-a-column-list.html}
69 |
70 | \url{http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html}
71 | }
72 |
--------------------------------------------------------------------------------
/man/rs_create_statement.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/table_definition.R
3 | \name{rs_create_statement}
4 | \alias{rs_create_statement}
5 | \title{Generate create table statement for Amazon Redshift}
6 | \usage{
7 | rs_create_statement(df, table_name = deparse(substitute(df)), sortkeys,
8 | sortkey_style = "compound", distkey, distkey_style = "even",
9 | compression = T)
10 | }
11 | \arguments{
12 | \item{df}{the data.frame you want to upload to Amazon Redshift}
13 |
14 | \item{table_name}{the name of the table to create, if not specified it'll use the data.frame name}
15 |
16 | \item{sortkeys}{Column or columns to sort the table by}
17 |
18 | \item{sortkey_style}{Sortkey style, can be compound or interleaved http://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data-compare-sort-styles.html}
19 |
20 | \item{distkey}{Distkey column; only one may be specified. If chosen, the table is distributed across slices according to a hash of this column's values.}
21 |
22 | \item{distkey_style}{Distkey style, can be even or all; for key-based distribution, use the distkey parameter. http://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html}
23 |
24 | \item{compression}{Add encoding for columns whose compression algorithm is easy to guess; for the rest, upload the data to Redshift and run ANALYZE COMPRESSION.}
25 | }
26 | \description{
27 | This lets you generate a table schema from a data.frame, making it easy to upload the data to Redshift afterwards.
28 | }
29 | \examples{
30 |
31 | n=1000
32 | testdf = data.frame(
33 | a=rep('a', n),
34 | b=c(1:n),
35 | c=rep(as.Date('2017-01-01'), n),
36 | d=rep(as.POSIXct('2017-01-01 20:01:32'), n),
37 | e=rep(as.POSIXlt('2017-01-01 20:01:32'), n),
38 | f=rep(paste0(rep('a', 4000), collapse=''), n) )
39 |
40 | cat(rs_create_statement(testdf, table_name='dm_great_table'))
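41 |
42 | # The sort and distribution arguments documented above can be supplied too;
43 | # the column choices below are purely illustrative.
44 | cat(rs_create_statement(testdf, table_name='dm_great_table',
45 |                         sortkeys=c('b','c'), sortkey_style='compound', distkey='b'))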
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/man/rs_create_table.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/create.R
3 | \name{rs_create_table}
4 | \alias{rs_create_table}
5 | \title{Create a table from scratch, guessing the table schema}
6 | \usage{
7 | rs_create_table(df, dbcon, table_name = deparse(substitute(df)),
8 | split_files, bucket = Sys.getenv("AWS_BUCKET_NAME"),
9 | region = Sys.getenv("AWS_DEFAULT_REGION"),
10 | access_key = Sys.getenv("AWS_ACCESS_KEY_ID"),
11 | secret_key = Sys.getenv("AWS_SECRET_ACCESS_KEY"),
12 | session_token = Sys.getenv("AWS_SESSION_TOKEN"),
13 | iam_role_arn = Sys.getenv("AWS_IAM_ROLE_ARN"), wlm_slots = 1,
14 | sortkeys, sortkey_style = "compound", distkey,
15 | distkey_style = "even", compression = T, additional_params = "")
16 | }
17 | \arguments{
18 | \item{df}{a data frame}
19 |
20 | \item{dbcon}{an RPostgres/RJDBC connection to the Redshift server}
21 |
22 | \item{table_name}{the name of the table to create}
23 |
24 | \item{split_files}{optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.}
25 |
26 | \item{bucket}{the name of the temporary bucket used to load the data. Will look for AWS_BUCKET_NAME in the environment if not specified.}
27 |
28 | \item{region}{the region of the bucket. Will look for AWS_DEFAULT_REGION in the environment if not specified.}
29 |
30 | \item{access_key}{the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID in the environment if not specified.}
31 |
32 | \item{secret_key}{the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY in the environment if not specified.}
33 |
34 | \item{session_token}{the session token with permissions for the bucket; if specified, it is used instead of the access/secret keys. Will look for AWS_SESSION_TOKEN in the environment if not specified.}
35 |
36 | \item{iam_role_arn}{an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN in the environment if not specified. If set, access_key and secret_key are ignored.}
37 |
38 | \item{wlm_slots}{number of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html}
39 |
40 | \item{sortkeys}{Column or columns to sort the table by}
41 |
42 | \item{sortkey_style}{Sortkey style, can be compound or interleaved http://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data-compare-sort-styles.html}
43 |
44 | \item{distkey}{Distkey column; only one may be specified. If chosen, the table is distributed across slices according to a hash of this column's values.}
45 |
46 | \item{distkey_style}{Distkey style, can be even or all; for key-based distribution, use the distkey parameter. http://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html}
47 |
48 | \item{compression}{Add encoding for columns whose compression algorithm is easy to guess; for the rest, upload the data to Redshift and run ANALYZE COMPRESSION.}
49 |
50 | \item{additional_params}{additional parameters to send to the COPY statement in Redshift}
51 | }
52 | \description{
53 | Create a table from scratch, guessing the table schema
54 | }
55 | \examples{
56 | library(DBI)
57 |
58 | a=data.frame(a=seq(1,10000), b=seq(10000,1))
59 |
60 | \dontrun{
61 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
62 | host='my-redshift-url.amazon.com', port='5439',
63 | user='myuser', password='mypassword',sslmode='require')
64 |
65 | rs_create_table(df=a, dbcon=con, table_name='testTable',
66 | bucket="my-bucket", split_files=4)
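67 |
68 | # The schema-related arguments documented above (sortkeys, distkey, ...) can
69 | # be passed as well; the choices below are illustrative only.
70 | rs_create_table(df=a, dbcon=con, table_name='testTable',
71 |     bucket="my-bucket", sortkeys=c('a'), distkey='a')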
72 |
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/man/rs_replace_table.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/replace.R
3 | \name{rs_replace_table}
4 | \alias{rs_replace_table}
5 | \title{Replace redshift table}
6 | \usage{
7 | rs_replace_table(df, dbcon, table_name, split_files,
8 | bucket = Sys.getenv("AWS_BUCKET_NAME"),
9 | region = Sys.getenv("AWS_DEFAULT_REGION"),
10 | access_key = Sys.getenv("AWS_ACCESS_KEY_ID"),
11 | secret_key = Sys.getenv("AWS_SECRET_ACCESS_KEY"),
12 | session_token = Sys.getenv("AWS_SESSION_TOKEN"),
13 | iam_role_arn = Sys.getenv("AWS_IAM_ROLE_ARN"), wlm_slots = 1,
14 | additional_params = "")
15 | }
16 | \arguments{
17 | \item{df}{a data frame}
18 |
19 | \item{dbcon}{an RPostgres/RJDBC connection to the Redshift server}
20 |
21 | \item{table_name}{the name of the table to replace}
22 |
23 | \item{split_files}{optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.}
24 |
25 | \item{bucket}{the name of the temporary bucket used to load the data. Will look for AWS_BUCKET_NAME in the environment if not specified.}
26 |
27 | \item{region}{the region of the bucket. Will look for AWS_DEFAULT_REGION in the environment if not specified.}
28 |
29 | \item{access_key}{the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID in the environment if not specified.}
30 |
31 | \item{secret_key}{the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY in the environment if not specified.}
32 |
33 | \item{session_token}{the session token with permissions for the bucket; if specified, it is used instead of the access/secret keys. Will look for AWS_SESSION_TOKEN in the environment if not specified.}
34 |
35 | \item{iam_role_arn}{an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN in the environment if not specified. If set, access_key and secret_key are ignored.}
36 |
37 | \item{wlm_slots}{number of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html}
38 |
39 | \item{additional_params}{additional parameters to send to the COPY statement in Redshift}
40 | }
41 | \description{
42 | Upload a table to S3 and then load it into Redshift, replacing the contents of the target table.
43 | The table on Redshift must have the same structure and column ordering for the load to work correctly.
44 | }
45 | \examples{
46 | library(DBI)
47 |
48 | a=data.frame(a=seq(1,10000), b=seq(10000,1))
49 |
50 | \dontrun{
51 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
52 | host='my-redshift-url.amazon.com', port='5439',
53 | user='myuser', password='mypassword',sslmode='require')
54 |
55 | rs_replace_table(df=a, dbcon=con, table_name='testTable',
56 | bucket="my-bucket", split_files=4)
57 |
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/man/rs_upsert_table.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/upsert.R
3 | \name{rs_upsert_table}
4 | \alias{rs_upsert_table}
5 | \title{Upsert redshift table}
6 | \usage{
7 | rs_upsert_table(df, dbcon, table_name, keys, split_files,
8 | bucket = Sys.getenv("AWS_BUCKET_NAME"),
9 | region = Sys.getenv("AWS_DEFAULT_REGION"),
10 | access_key = Sys.getenv("AWS_ACCESS_KEY_ID"),
11 | secret_key = Sys.getenv("AWS_SECRET_ACCESS_KEY"),
12 | session_token = Sys.getenv("AWS_SESSION_TOKEN"),
13 | iam_role_arn = Sys.getenv("AWS_IAM_ROLE_ARN"), wlm_slots = 1,
14 | additional_params = "")
15 | }
16 | \arguments{
17 | \item{df}{a data frame}
18 |
19 | \item{dbcon}{an RPostgres/RJDBC connection to the Redshift server}
20 |
21 | \item{table_name}{the name of the table to update/insert}
22 |
23 | \item{keys}{this optional vector contains the variables by which to upsert. If not defined, the upsert becomes an append.}
24 |
25 | \item{split_files}{optional parameter to specify the number of files to split into. If not specified, the number of slices in Redshift is used to determine an optimal number.}
26 |
27 | \item{bucket}{the name of the temporary bucket used to load the data. Will look for AWS_BUCKET_NAME in the environment if not specified.}
28 |
29 | \item{region}{the region of the bucket. Will look for AWS_DEFAULT_REGION in the environment if not specified.}
30 |
31 | \item{access_key}{the access key with permissions for the bucket. Will look for AWS_ACCESS_KEY_ID in the environment if not specified.}
32 |
33 | \item{secret_key}{the secret key with permissions for the bucket. Will look for AWS_SECRET_ACCESS_KEY in the environment if not specified.}
34 |
35 | \item{session_token}{the session token with permissions for the bucket; if specified, it is used instead of the access/secret keys. Will look for AWS_SESSION_TOKEN in the environment if not specified.}
36 |
37 | \item{iam_role_arn}{an IAM role ARN with permissions for the bucket. Will look for AWS_IAM_ROLE_ARN in the environment if not specified. If set, access_key and secret_key are ignored.}
38 |
39 | \item{wlm_slots}{number of WLM slots to use for this bulk load http://docs.aws.amazon.com/redshift/latest/dg/tutorial-configuring-workload-management.html}
40 |
41 | \item{additional_params}{additional parameters to send to the COPY statement in Redshift}
42 | }
43 | \description{
44 | Upload a table to S3 and then load it into Redshift, replacing rows that have the same
45 | keys, and inserting rows with new keys.
46 | The table on Redshift must have the same structure and column ordering for the upsert to work correctly.
47 | }
48 | \examples{
49 | library(DBI)
50 |
51 | a=data.frame(a=seq(1,10000), b=seq(10000,1))
52 | n=head(a,n=5000)
53 | n$b=n$a
54 | nx=rbind(n, data.frame(a=seq(99999,104000), b=seq(104000,99999)))
55 |
56 | \dontrun{
57 | con <- dbConnect(RPostgres::Postgres(), dbname="dbname",
58 | host='my-redshift-url.amazon.com', port='5439',
59 | user='myuser', password='mypassword',sslmode='require')
60 |
61 | rs_upsert_table(df=nx, dbcon=con, table_name='testTable',
62 | bucket="my-bucket", split_files=4, keys=c('a'))
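63 |
64 | # Rows of testTable whose key 'a' also appears in nx now carry nx's value of
65 | # 'b', and rows with keys not present in testTable have been inserted.
66 | # An illustrative sanity check on the resulting row count:
67 | dbGetQuery(con, 'select count(1) from testTable')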
68 |
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/redshiftTools.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageInstallArgs: --no-multiarch --with-keep.source
20 | PackageRoxygenize: rd,collate,namespace
21 |
--------------------------------------------------------------------------------