├── .editorconfig
├── .gitignore
├── .travis.yml
├── LICENSE
├── Pipfile
├── README.md
├── rsdf
│   ├── __init__.py
│   ├── redshift.py
│   └── s3.py
├── setup.py
└── tests
    └── test_engine.py

--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
root = true

[*]
indent_style = space
indent_size = 4
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.md]
trim_trailing_whitespace = true

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
Pipfile.lock

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
.pytest_cache

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# OSX
.DS_Store

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python

python: 3.6

install:
  - pip install pipenv
  - pipenv install --dev

script: pipenv run pytest

deploy:
  provider: pypi
  user: $PYPI_USERNAME
  password: $PYPI_PASSWORD

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Buffer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"

[packages]
boto3 = "*"
psycopg2-binary = "*"
pandas = "*"
sqlalchemy = "*"
sqlalchemy-redshift = "*"

[dev-packages]
pytest = "*"
"rsdf" = {path = ".", editable = true}

[requires]
python_version = "3.6"

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# RSDF

[![Build Status](https://travis-ci.com/bufferapp/rsdf.svg?branch=master)](https://travis-ci.com/bufferapp/rsdf)
[![PyPI version](https://badge.fury.io/py/rsdf.svg)](https://badge.fury.io/py/rsdf)
[![License](https://img.shields.io/github/license/mashape/apistatus.svg)](LICENSE)

A set of utilities to connect Pandas DataFrames and Redshift. Importing this
module adds new methods to the `DataFrame` class. Inspired by [josepablog's gist](https://gist.github.com/josepablog/1ce154a45dc20348b6718804ac8ad0a5).

## Installation

To install `rsdf`, simply use pip:

```bash
$ pip install rsdf
```

If you were using the older version, you can also install it from GitHub with `pip`:

```bash
$ pip install git+git://github.com/bufferapp/rsdf.git@d1a5feca220cef9ba7da16da57a746dfb24ee8d7
```

## Usage

Once `rsdf` is imported, `DataFrame` objects gain two new methods, `to_redshift` and `to_s3`:

```python
import pandas as pd
import rsdf


engine_string = 'redshift://user:password@endpoint:port/db'

users = pd.read_sql_query('select * from users limit 10', engine_string)

users['money'] = users['money'] * 42

# Write it back to Redshift
users.to_redshift(
    table_name='users',
    schema='public',
    engine=engine_string,
    s3_bucket='users-data',
    s3_key='rich_users.gzip',
    if_exists='update',
    primary_key='id'
)
```

The `if_exists` parameter accepts `fail` (the default), `append`, `replace`, or
`update`; `update` performs an upsert and requires a `primary_key`.

Alternatively, if no `engine` is provided, `rsdf` will build the engine string
from the following environment variables:

- `REDSHIFT_USER`
- `REDSHIFT_PASSWORD`
- `REDSHIFT_ENDPOINT`
- `REDSHIFT_DB_NAME`
- `REDSHIFT_DB_PORT`

Since `rsdf` uploads the data to S3 and then runs a `COPY` command to load it
into Redshift, you'll also need to provide these two AWS variables (or have
them loaded in the environment):

- `AWS_ACCESS_KEY_ID`
- `AWS_SECRET_ACCESS_KEY`
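
For example, with those variables exported you can skip the `engine` argument
entirely. A minimal sketch (the DataFrame, bucket, and key names below are
placeholders):

```python
import pandas as pd
import rsdf

events = pd.DataFrame({'id': [1, 2, 3], 'money': [10, 20, 30]})

# engine is omitted, so it's built from the REDSHIFT_* variables; the S3
# upload picks up AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
events.to_redshift(
    table_name='events',
    schema='public',
    s3_bucket='events-data',
    s3_key='events.gzip',
    if_exists='replace'
)
```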
## License

MIT © Buffer

--------------------------------------------------------------------------------
/rsdf/__init__.py:
--------------------------------------------------------------------------------
from pandas import DataFrame

from .redshift import to_redshift
from .s3 import to_s3

# Attach the functions to DataFrame as methods
setattr(DataFrame, to_redshift.__name__, to_redshift)
setattr(DataFrame, to_s3.__name__, to_s3)

--------------------------------------------------------------------------------
/rsdf/redshift.py:
--------------------------------------------------------------------------------
import os

from pandas.io.sql import SQLTable, pandasSQL_builder, _engine_builder
from sqlalchemy.sql import table as table_clause
from sqlalchemy_redshift.commands import CopyCommand


def generate_redshift_engine_string():
    """Build a SQLAlchemy engine string from the REDSHIFT_* environment variables."""
    redshift_db_name = os.getenv("REDSHIFT_DB_NAME")
    redshift_user = os.getenv("REDSHIFT_USER")
    redshift_password = os.getenv("REDSHIFT_PASSWORD")
    redshift_endpoint = os.getenv("REDSHIFT_ENDPOINT")
    redshift_db_port = int(os.getenv("REDSHIFT_DB_PORT", 0))

    return "postgres+psycopg2://{}:{}@{}:{}/{}".format(
        redshift_user,
        redshift_password,
        redshift_endpoint,
        redshift_db_port,
        redshift_db_name,
    )


def to_redshift(
    self,
    table_name,
    s3_bucket,
    s3_key,
    engine=None,
    schema=None,
    if_exists="fail",
    index=False,
    compress=True,
    primary_key=None,
    aws_access_key_id=None,
    aws_secret_access_key=None,
    **kwargs
):
    """Write the DataFrame to a Redshift table by uploading it to S3 and running COPY."""

    if not engine:
        engine = generate_redshift_engine_string()

    # Fall back to AWS credentials from the environment
    if not aws_access_key_id:
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    if not aws_secret_access_key:
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    # Get Pandas SQLTable object
    table = SQLTable(
        table_name,
        pandasSQL_builder(engine, schema=schema),
        self,
        if_exists=if_exists,
        schema=schema,
        index=index,
    )

    def quote(s):
        return '"' + str(s) + '"'

    # Full table name with schema
    if schema:
        full_table_name = quote(schema) + "." + quote(table_name)
    else:
        full_table_name = quote(table_name)
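
    # Each branch below assembles a queue of SQL strings and COPY commands
    # that is executed in one transaction after the DataFrame has landed on
    # S3. As an illustration, the "update" (upsert) branch produces roughly
    # this sequence (table and key names are placeholders):
    #
    #   begin;
    #   drop table if exists users_staging;
    #   create temporary table users_staging (like "public"."users");
    #   copy users_staging from 's3://bucket/key' ... format csv gzip;
    #   delete from "public"."users" where id in
    #       (select id from users_staging);
    #   insert into "public"."users" (select * from users_staging);
    #   end;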
    # Check the target table and build the statement queue
    if table.exists():
        if if_exists == "fail":
            raise ValueError("Table {} already exists.".format(table_name))
        elif if_exists == "append":
            queue = [
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                )
            ]
        elif if_exists == "replace":
            queue = [
                "drop table {};".format(full_table_name),
                table.sql_schema() + ";",
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
            ]
        elif if_exists == "update":
            staging_table = "{}_staging".format(table_name)

            if not primary_key:
                raise ValueError("Expected a primary key to update the existing table")

            queue = [
                "begin;",
                "drop table if exists {};".format(staging_table),
                "create temporary table {} (like {});".format(
                    staging_table, full_table_name
                ),
                # COPY into the staging table so the delete/insert below can
                # merge the new rows into the target table
                CopyCommand(
                    to=table_clause(staging_table),
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
                "delete from {full_table_name} where {primary_key} in (select {primary_key} from {staging_table});".format(
                    full_table_name=full_table_name,
                    primary_key=primary_key,
                    staging_table=staging_table,
                ),
                "insert into {} (select * from {});".format(
                    full_table_name, staging_table
                ),
                "end;",
            ]
        else:
            raise ValueError("{} is not a valid value for if_exists".format(if_exists))
    else:
        queue = [
            table.sql_schema() + ";",
            CopyCommand(
                to=table,
                data_location="s3://{}/{}".format(s3_bucket, s3_key),
                access_key_id=aws_access_key_id,
                secret_access_key=aws_secret_access_key,
                format="CSV",
                compression="GZIP" if compress else None,
            ),
        ]

    # Save DataFrame to S3
    self.to_s3(bucket=s3_bucket, key=s3_key, index=index, compress=compress)

    # Execute the queued statements in a single transaction
    engine = _engine_builder(engine)
    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)

--------------------------------------------------------------------------------
/rsdf/s3.py:
--------------------------------------------------------------------------------
import boto3
from io import StringIO
from io import BytesIO
from gzip import GzipFile

s3 = boto3.client("s3")


def gzip_stringio(input_object):
    """Compress a StringIO's contents and return them as a rewound BytesIO."""
    compressed_object = BytesIO()

    with GzipFile(fileobj=compressed_object, mode="wb") as gzf:
        gzf.write(input_object.getvalue().encode("utf-8"))

    # Rewind so upload_fileobj reads the buffer from the start
    compressed_object.seek(0)

    return compressed_object


def to_s3(self, bucket, key, compress=True, index=False, header=False, **kwargs):
    """Serialize the DataFrame to an (optionally gzipped) CSV and upload it to S3."""
    df_csv_object = StringIO()

    # No header row by default, matching the COPY command in redshift.py,
    # which doesn't skip one
    self.to_csv(df_csv_object, index=index, header=header, **kwargs)

    if compress:
        df_csv_object = gzip_stringio(df_csv_object)
    else:
        # upload_fileobj expects a rewound binary file object
        df_csv_object = BytesIO(df_csv_object.getvalue().encode("utf-8"))

    return s3.upload_fileobj(df_csv_object, bucket, key)
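
# Usage sketch (bucket and key are placeholders), assuming rsdf has been
# imported so DataFrame.to_s3 is attached and AWS credentials are available
# to boto3:
#
#   import pandas as pd
#   import rsdf
#
#   df = pd.DataFrame({"id": [1, 2]})
#   df.to_s3(bucket="my-bucket", key="df.csv.gz")  # gzipped CSV by default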
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from setuptools import setup

setup(
    name="rsdf",
    packages=["rsdf"],
    version="0.9.2",
    description="Redshift interface to Pandas DataFrames",
    author="Michael Erasmus",
    author_email="michael@buffer.com",
    url="https://github.com/bufferapp/rsdf",
    license="MIT",
    keywords=["redshift", "pandas", "upsert"],
    install_requires=[
        "boto3",
        "psycopg2-binary",
        "pandas",
        "sqlalchemy",
        "sqlalchemy-redshift",
    ],
)

--------------------------------------------------------------------------------
/tests/test_engine.py:
--------------------------------------------------------------------------------
from rsdf.redshift import generate_redshift_engine_string


def test_engine():
    engine = generate_redshift_engine_string()
    assert engine.startswith("postgres+psycopg2")

--------------------------------------------------------------------------------