├── .editorconfig
├── .gitignore
├── .travis.yml
├── LICENSE
├── Pipfile
├── README.md
├── rsdf
│   ├── __init__.py
│   ├── redshift.py
│   └── s3.py
├── setup.py
└── tests
    └── test_engine.py

--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
root = true

[*]
indent_style = space
indent_size = 4
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.md]
trim_trailing_whitespace = true

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
Pipfile.lock

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
.pytest_cache

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# OSX
.DS_Store

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python

python: 3.6

install:
  - pip install pipenv
  - pipenv install --dev

script: pipenv run pytest

deploy:
  provider: pypi
  user: $PYPI_USERNAME
  password: $PYPI_PASSWORD

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Buffer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"

[packages]
boto3 = "*"
psycopg2-binary = "*"
pandas = "*"
sqlalchemy = "*"
sqlalchemy-redshift = "*"

[dev-packages]
pytest = "*"
"rsdf" = {path = ".", editable = true}

[requires]
python_version = "3.6"

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# RSDF

[![Build Status](https://travis-ci.com/bufferapp/rsdf.svg?branch=master)](https://travis-ci.com/bufferapp/rsdf)
[![PyPI version](https://badge.fury.io/py/rsdf.svg)](https://badge.fury.io/py/rsdf)
[![License](https://img.shields.io/github/license/mashape/apistatus.svg)](LICENSE)

A set of utilities to connect Pandas DataFrames and Redshift. Importing this
module adds new methods to the `DataFrame` class. Inspired by [josepablog's gist](https://gist.github.com/josepablog/1ce154a45dc20348b6718804ac8ad0a5).

## Installation

To install `rsdf`, simply use pip:

```bash
$ pip install rsdf
```

If you were using the older version, you can also install it from GitHub with `pip`:

```bash
$ pip install git+git://github.com/bufferapp/rsdf.git@d1a5feca220cef9ba7da16da57a746dfb24ee8d7
```

## Usage

Once `rsdf` is imported, `DataFrame` objects gain two new methods, `to_redshift` and `to_s3`:

```python
import pandas as pd
import rsdf


engine_string = 'redshift://user:password@endpoint:port/db'

users = pd.read_sql_query('select * from users limit 10', engine_string)

users['money'] = users['money'] * 42

# Write it back to Redshift
users.to_redshift(
    table_name='users',
    schema='public',
    engine=engine_string,
    s3_bucket='users-data',
    s3_key='rich_users.gzip',
    if_exists='update',
    primary_key='id'
)
```

The `if_exists` parameter accepts `fail` (the default), `append`, `replace`, or
`update`; `update` performs an upsert and requires a `primary_key`.

Alternatively, if no `engine` is provided, `rsdf` will build the engine string
from the following environment variables:

- `REDSHIFT_USER`
- `REDSHIFT_PASSWORD`
- `REDSHIFT_ENDPOINT`
- `REDSHIFT_DB_NAME`
- `REDSHIFT_DB_PORT`

Since `rsdf` uploads the data to S3 and then runs a `COPY` command to load it
into Redshift, you'll also need to provide these two AWS variables (or have
them loaded in the environment):

- `AWS_ACCESS_KEY_ID`
- `AWS_SECRET_ACCESS_KEY`
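
For example, with those variables exported you can skip the `engine` argument
entirely. A minimal sketch (the DataFrame, bucket, and key names below are
placeholders):

```python
import pandas as pd
import rsdf

events = pd.DataFrame({'id': [1, 2, 3], 'money': [10, 20, 30]})

# engine is omitted, so it's built from the REDSHIFT_* variables; the S3
# upload picks up AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
events.to_redshift(
    table_name='events',
    schema='public',
    s3_bucket='events-data',
    s3_key='events.gzip',
    if_exists='replace'
)
```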
## License

MIT © Buffer

--------------------------------------------------------------------------------
/rsdf/__init__.py:
--------------------------------------------------------------------------------
from pandas import DataFrame

from .redshift import to_redshift
from .s3 import to_s3

# Attach the functions to DataFrame as methods
setattr(DataFrame, to_redshift.__name__, to_redshift)
setattr(DataFrame, to_s3.__name__, to_s3)

--------------------------------------------------------------------------------
/rsdf/redshift.py:
--------------------------------------------------------------------------------
import os

from pandas.io.sql import SQLTable, pandasSQL_builder, _engine_builder
from sqlalchemy.sql import table as table_clause
from sqlalchemy_redshift.commands import CopyCommand


def generate_redshift_engine_string():
    """Build a SQLAlchemy engine string from the REDSHIFT_* environment variables."""
    redshift_db_name = os.getenv("REDSHIFT_DB_NAME")
    redshift_user = os.getenv("REDSHIFT_USER")
    redshift_password = os.getenv("REDSHIFT_PASSWORD")
    redshift_endpoint = os.getenv("REDSHIFT_ENDPOINT")
    redshift_db_port = int(os.getenv("REDSHIFT_DB_PORT", 0))

    return "postgres+psycopg2://{}:{}@{}:{}/{}".format(
        redshift_user,
        redshift_password,
        redshift_endpoint,
        redshift_db_port,
        redshift_db_name,
    )


def to_redshift(
    self,
    table_name,
    s3_bucket,
    s3_key,
    engine=None,
    schema=None,
    if_exists="fail",
    index=False,
    compress=True,
    primary_key=None,
    aws_access_key_id=None,
    aws_secret_access_key=None,
    **kwargs
):
    """Write the DataFrame to a Redshift table by uploading it to S3 and running COPY."""

    if not engine:
        engine = generate_redshift_engine_string()

    # Fall back to AWS credentials from the environment
    if not aws_access_key_id:
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    if not aws_secret_access_key:
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    # Get Pandas SQLTable object
    table = SQLTable(
        table_name,
        pandasSQL_builder(engine, schema=schema),
        self,
        if_exists=if_exists,
        schema=schema,
        index=index,
    )

    def quote(s):
        return '"' + str(s) + '"'

    # Full table name with schema
    if schema:
        full_table_name = quote(schema) + "." + quote(table_name)
    else:
        full_table_name = quote(table_name)
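
    # Each branch below assembles a queue of SQL strings and COPY commands
    # that is executed in one transaction after the DataFrame has landed on
    # S3. As an illustration, the "update" (upsert) branch produces roughly
    # this sequence (table and key names are placeholders):
    #
    #   begin;
    #   drop table if exists users_staging;
    #   create temporary table users_staging (like "public"."users");
    #   copy users_staging from 's3://bucket/key' ... format csv gzip;
    #   delete from "public"."users" where id in
    #       (select id from users_staging);
    #   insert into "public"."users" (select * from users_staging);
    #   end;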
    # Check the target table and build the statement queue
    if table.exists():
        if if_exists == "fail":
            raise ValueError("Table {} already exists.".format(table_name))
        elif if_exists == "append":
            queue = [
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                )
            ]
        elif if_exists == "replace":
            queue = [
                "drop table {};".format(full_table_name),
                table.sql_schema() + ";",
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
            ]
        elif if_exists == "update":
            staging_table = "{}_staging".format(table_name)

            if not primary_key:
                raise ValueError("Expected a primary key to update the existing table")

            queue = [
                "begin;",
                "drop table if exists {};".format(staging_table),
                "create temporary table {} (like {});".format(
                    staging_table, full_table_name
                ),
                # COPY into the staging table so the delete/insert below can
                # merge the new rows into the target table
                CopyCommand(
                    to=table_clause(staging_table),
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
                "delete from {full_table_name} where {primary_key} in (select {primary_key} from {staging_table});".format(
                    full_table_name=full_table_name,
                    primary_key=primary_key,
                    staging_table=staging_table,
                ),
                "insert into {} (select * from {});".format(
                    full_table_name, staging_table
                ),
                "end;",
            ]
        else:
            raise ValueError("{} is not a valid value for if_exists".format(if_exists))
    else:
        queue = [
            table.sql_schema() + ";",
            CopyCommand(
                to=table,
                data_location="s3://{}/{}".format(s3_bucket, s3_key),
                access_key_id=aws_access_key_id,
                secret_access_key=aws_secret_access_key,
                format="CSV",
                compression="GZIP" if compress else None,
            ),
        ]

    # Save DataFrame to S3
    self.to_s3(bucket=s3_bucket, key=s3_key, index=index, compress=compress)

    # Execute the queued statements in a single transaction
    engine = _engine_builder(engine)
    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)

--------------------------------------------------------------------------------
/rsdf/s3.py:
--------------------------------------------------------------------------------
import boto3
from io import StringIO
from io import BytesIO
from gzip import GzipFile

s3 = boto3.client("s3")


def gzip_stringio(input_object):
    """Compress a StringIO's contents and return them as a rewound BytesIO."""
    compressed_object = BytesIO()

    with GzipFile(fileobj=compressed_object, mode="wb") as gzf:
        gzf.write(input_object.getvalue().encode("utf-8"))

    # Rewind so upload_fileobj reads the buffer from the start
    compressed_object.seek(0)

    return compressed_object


def to_s3(self, bucket, key, compress=True, index=False, header=False, **kwargs):
    """Serialize the DataFrame to an (optionally gzipped) CSV and upload it to S3."""
    df_csv_object = StringIO()

    # No header row by default, matching the COPY command in redshift.py,
    # which doesn't skip one
    self.to_csv(df_csv_object, index=index, header=header, **kwargs)

    if compress:
        df_csv_object = gzip_stringio(df_csv_object)
    else:
        # upload_fileobj expects a rewound binary file object
        df_csv_object = BytesIO(df_csv_object.getvalue().encode("utf-8"))

    return s3.upload_fileobj(df_csv_object, bucket, key)
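
# Usage sketch (bucket and key are placeholders), assuming rsdf has been
# imported so DataFrame.to_s3 is attached and AWS credentials are available
# to boto3:
#
#   import pandas as pd
#   import rsdf
#
#   df = pd.DataFrame({"id": [1, 2]})
#   df.to_s3(bucket="my-bucket", key="df.csv.gz")  # gzipped CSV by default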
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from setuptools import setup

setup(
    name="rsdf",
    packages=["rsdf"],
    version="0.9.2",
    description="Redshift interface to Pandas DataFrames",
    author="Michael Erasmus",
    author_email="michael@buffer.com",
    url="https://github.com/bufferapp/rsdf",
    license="MIT",
    keywords=["redshift", "pandas", "upsert"],
    install_requires=[
        "boto3",
        "psycopg2-binary",
        "pandas",
        "sqlalchemy",
        "sqlalchemy-redshift",
    ],
)

--------------------------------------------------------------------------------
/tests/test_engine.py:
--------------------------------------------------------------------------------
from rsdf.redshift import generate_redshift_engine_string


def test_engine():
    engine = generate_redshift_engine_string()
    assert engine.startswith("postgres+psycopg2")

--------------------------------------------------------------------------------