├── pandas_to_sql ├── engine │ ├── __init__.py │ ├── columns │ │ ├── __init__.py │ │ ├── column.py │ │ ├── bool_column.py │ │ ├── str_column.py │ │ ├── datetime_column.py │ │ ├── common.py │ │ └── numeric_columns.py │ ├── grouped_table.py │ └── table.py ├── testing │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── asserters.py │ │ └── fake_data_creation.py │ ├── tests │ │ ├── test_operations_base.py │ │ ├── test_table_operations.py │ │ ├── test_select.py │ │ ├── test_pandas_dataframe_intercepter.py │ │ ├── test_concat.py │ │ ├── test_assignment.py │ │ ├── test_groupby.py │ │ ├── test_merge.py │ │ ├── test_datetime.py │ │ ├── test_str.py │ │ ├── test_operations_numeric.py │ │ └── test_operations_compare.py │ └── conftest.py ├── utils │ ├── __init__.py │ ├── helpers.py │ ├── pandas_interceptor.py │ └── pandas_dataframe_intercepter.py ├── conventions │ ├── __init__.py │ └── groupby_conventions.py └── __init__.py ├── .gitignore ├── environment.yml ├── .github └── workflows │ ├── tests.yml │ └── publish-to-pypi.yml ├── setup.py ├── LICENSE ├── example_runner.py ├── README.md └── pandas_to_sql_colab_example.ipynb /pandas_to_sql/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pandas_to_sql/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pandas_to_sql/engine/columns/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pandas_to_sql/conventions/__init__.py: -------------------------------------------------------------------------------- 1 | from pandas_to_sql.conventions.groupby_conventions import flatten_grouped_dataframe -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/* 2 | **/.pytest_cache/* 3 | .vscode/ 4 | **/env/* 5 | local/* 6 | build/* 7 | dist/* 8 | **.egg-info 9 | **.db -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pandas-to-sql-dev 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | # required 7 | - setuptools 8 | - python=3.7 9 | - numpy>=1.19 10 | - pandas>=1.1 11 | # testing 12 | - pytest>=5.0.1 13 | - sqlalchemy 14 | 15 | -------------------------------------------------------------------------------- /pandas_to_sql/engine/columns/column.py: -------------------------------------------------------------------------------- 1 | 2 | class Column: 3 | dtype = None 4 | sql_string = None 5 | 6 | def __init__(self, dtype=None, sql_string=None): 7 | self.dtype = dtype 8 | self.sql_string = sql_string 9 | 10 | def __copy__(self): 11 | return type(self)(self.sql_string) 12 | 
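A quick sketch of how this base class is used: each `Column` just carries a SQL fragment in `sql_string`, and the subclasses defined later in this repo (`FloatColumn`, `StrColumn`, etc.) build new fragments out of old ones via operator overloads. Note that `__copy__` passes `sql_string` as the first positional argument, which matches the subclass constructors but not this base class's own `__init__`. A minimal illustration, assuming a hypothetical source column named `price`:

```python
from pandas_to_sql.engine.columns.numeric_columns import FloatColumn

price = FloatColumn(sql_string='price')  # wraps a raw column reference
total = price * 1.2                      # operators compose nested SQL fragments
print(total.dtype)       # FLOAT
print(total.sql_string)  # (price * 1.2)
```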
-------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_operations_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas_to_sql.testing.utils.asserters import assert_ 3 | from copy import copy 4 | 5 | def test_copy(): 6 | df = pytest.df1 7 | df2 = copy(df) 8 | df['new_value'] = df.random_float > 10 # some unrelated operation 9 | assert_(df2) 10 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_table_operations.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import pytest 3 | from pandas_to_sql.testing.utils.asserters import assert_ 4 | 5 | 6 | def test_rename(): 7 | df = pytest.df1 8 | df = df.rename(columns={'random_int': 'random_int_2', 9 | 'random_str': 'random_str_2'}) 10 | assert_(df) 11 | 12 | def test_drop(): 13 | df = pytest.df1 14 | df = df.drop(columns=['random_int', 'random_str']) 15 | assert_(df) 16 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_select.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas_to_sql.testing.utils.asserters import assert_ 3 | 4 | 5 | def test_select_inline(): 6 | assert_(pytest.df1[['random_int', 'random_float']]) 7 | 8 | 9 | def test_select_not_inline(): 10 | df = pytest.df1[['random_int', 'random_float']] 11 | assert_(df) 12 | 13 | 14 | def test_select_multiple_times(): 15 | df = pytest.df1[['random_int', 'random_datetime','random_bool']] 16 | df = df[['random_datetime']] 17 | assert_(df) 18 | -------------------------------------------------------------------------------- /pandas_to_sql/__init__.py: -------------------------------------------------------------------------------- 1 | from pandas_to_sql.engine.table import create_table_from_schema 2 | from pandas_to_sql.utils.helpers import create_schema_from_df 3 | from pandas_to_sql.utils.pandas_dataframe_intercepter import PandasDataFrameIntercepter 4 | from pandas_to_sql.utils.pandas_interceptor import PandasIntercepter 5 | 6 | 7 | def wrap_df(df, table_name): 8 | t = create_table_from_schema(table_name=table_name, schema=create_schema_from_df(df)) 9 | return PandasDataFrameIntercepter(df, t) 10 | 11 | 12 | def wrap_pd(pd): 13 | return PandasIntercepter(pd) -------------------------------------------------------------------------------- /pandas_to_sql/engine/columns/bool_column.py: -------------------------------------------------------------------------------- 1 | from pandas_to_sql.engine.columns.column import Column 2 | from pandas_to_sql.engine.columns.common import value_to_sql_string, add_common_operators_to_class 3 | 4 | 5 | class BoolColumn(Column): 6 | def __init__(self, sql_string): 7 | super().__init__(dtype='BOOL', sql_string=sql_string) 8 | 9 | def __neg__(self): 10 | return BoolColumn(sql_string=f'(NOT({value_to_sql_string(self)}))') 11 | 12 | def __invert__(self): 13 | return BoolColumn(sql_string=f'(NOT({value_to_sql_string(self)}))') 14 | 15 | 16 | add_common_operators_to_class(BoolColumn) 17 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_pandas_dataframe_intercepter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas_to_sql.testing.conftest 
import TABLE_NAME_1 3 | 4 | def test_columns_attribute(): 5 | expected = pytest.df1.df_pandas.columns 6 | actual = pytest.df1.columns 7 | assert type(expected) == type(actual) 8 | assert set(expected) == set(actual) 9 | 10 | 11 | def test_get_sql_string_attribute(): 12 | expected = '''SELECT (random_int) AS random_int, (random_float) AS random_float, (random_bool) AS random_bool, (random_datetime) AS random_datetime, (random_str) AS random_str FROM random_data_1''' 13 | assert expected == pytest.df1.get_sql_string() -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_concat.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas_to_sql.testing.utils.asserters import assert_ 3 | from pandas_to_sql.conventions import flatten_grouped_dataframe 4 | from copy import copy 5 | import pandas as pd 6 | import pandas_to_sql 7 | 8 | def test_concat_simple(): 9 | df = pytest.df1 10 | 11 | pd_wrapped = pandas_to_sql.wrap_pd(pd) 12 | 13 | df2 = pd_wrapped.concat([df, df, df]) 14 | 15 | assert_(df2) 16 | 17 | 18 | def test_concat_simple_with_copy(): 19 | df = pytest.df1 20 | 21 | pd_wrapped = pandas_to_sql.wrap_pd(pd) 22 | 23 | df2 = pd_wrapped.concat([df, copy(df), copy(df)]) 24 | 25 | assert_(df2) 26 | 27 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | max-parallel: 5 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 3.8 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.8 17 | - name: Install dependencies 18 | run: | 19 | # $CONDA is an environment variable pointing to the root of the miniconda directory 20 | $CONDA/bin/conda env update --file environment.yml --name base 21 | - name: Test with pytest 22 | run: | 23 | conda install pytest 24 | $CONDA/bin/pytest 25 | -------------------------------------------------------------------------------- /pandas_to_sql/utils/helpers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | ## Types 5 | def convert_df_type(col_type): 6 | if pd.api.types.is_bool_dtype(col_type): return 'BOOL' 7 | elif pd.api.types.is_integer_dtype(col_type): return 'INT' 8 | elif pd.api.types.is_numeric_dtype(col_type): return 'FLOAT' 9 | elif pd.api.types.is_string_dtype(col_type): return 'VARCHAR' 10 | elif pd.api.types.is_datetime64_any_dtype(col_type): return 'DATETIME' 11 | else: raise Exception(f"could not convert column type. 
got: {str(col_type)}") 12 | 13 | 14 | def create_schema_from_df(df): 15 | schema = {} 16 | for col_name, col_type in df.dtypes.items(): 17 | schema[col_name] = convert_df_type(col_type) 18 | return schema -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import setuptools 3 | 4 | 5 | v = os.environ['RELEASE_VERSION'] 6 | print('Version: ', v) 7 | 8 | with open("README.md", "r") as fh: 9 | long_description = fh.read() 10 | 11 | setuptools.setup( 12 | name="pandas-to-sql", # Replace with your own username 13 | version=v, 14 | author="Amir", 15 | author_email="amirpupko@gmail.com", 16 | description="Convert pandas dataframe manipulations to sql query string", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | url="https://github.com/AmirPupko/pandas-to-sql", 20 | packages=setuptools.find_packages(), 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "License :: OSI Approved :: MIT License", 24 | "Operating System :: OS Independent", 25 | ], 26 | python_requires='>=3.6', 27 | ) -------------------------------------------------------------------------------- /pandas_to_sql/conventions/groupby_conventions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas_to_sql.utils.pandas_dataframe_intercepter import PandasDataFrameIntercepter 3 | from copy import copy 4 | 5 | def flatten_grouped_dataframe(df): 6 | if not isinstance(df, PandasDataFrameIntercepter): 7 | raise Exception(f"can only get type {str(type(PandasDataFrameIntercepter))}") 8 | 9 | df_c = copy(df.df_pandas) 10 | if isinstance(df_c, pd.core.series.Series): 11 | series_name = df_c.name 12 | new_col_name = list(filter(lambda k: k.startswith(series_name), df.df_sql_convert_table.columns.keys()))[0] 13 | df_c = df_c.reset_index().rename(columns={series_name: new_col_name}) 14 | else: 15 | df_c.columns = df_c.columns.map('_'.join) 16 | df_c = df_c.reset_index() 17 | return PandasDataFrameIntercepter(df_c, copy(df.df_sql_convert_table)) 18 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Publish to PyPI 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | RELEASE_VERSION: ${{ github.event.release.tag_name }} 30 | run: | 31 | python setup.py sdist bdist_wheel 32 | twine upload dist/* 33 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/conftest.py: -------------------------------------------------------------------------------- 1 | 2 | 
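# Shared test setup, in short: this conftest seeds a throwaway sqlite file with
# two randomly generated tables, then re-wraps fresh copies as pytest.df1 /
# pytest.df2 before every test, so mutations in one test never leak into the next.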
from copy import copy 3 | import sqlite3 4 | import pytest 5 | from pandas_to_sql import wrap_df 6 | from pandas_to_sql.testing.utils import fake_data_creation 7 | 8 | sql_connection = sqlite3.connect('./example.db') #create db 9 | 10 | TABLE_NAME_1 = 'random_data_1' 11 | DF1, SCHEMA_1 = fake_data_creation.create_fake_dataset() 12 | DF1.to_sql(TABLE_NAME_1, sql_connection, if_exists='replace', index=False, dtype=SCHEMA_1) 13 | 14 | TABLE_NAME_2 = 'random_data_2' 15 | DF2, SCHEMA_2 = fake_data_creation.create_fake_dataset() 16 | DF2.columns = DF2.columns.map(lambda c: c + '_2') 17 | DF2.to_sql(TABLE_NAME_2, sql_connection, if_exists='replace', index=False, dtype=SCHEMA_2) 18 | 19 | def pytest_configure(): 20 | pytest.df1 = None 21 | pytest.df2 = None 22 | pytest.sql_connection = sql_connection 23 | 24 | @pytest.fixture(scope="function", autouse=True) 25 | def run_around_tests(): 26 | # print('\nhere\n') 27 | pytest.df1 = wrap_df(copy(DF1), TABLE_NAME_1) 28 | pytest.df2 = wrap_df(copy(DF2), TABLE_NAME_2) 29 | yield 30 | # run after function 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 AmirPupko 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_assignment.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import pytest 3 | from pandas_to_sql.testing.utils.asserters import assert_ 4 | 5 | 6 | def test_assign(): 7 | df = pytest.df1 8 | df['new_value'] = df.random_float + 10 9 | assert_(df) 10 | 11 | 12 | def test_assign2(): 13 | df = pytest.df1 14 | df['new_value'] = df.random_bool 15 | assert_(df) 16 | 17 | def test_assign3(): 18 | df = pytest.df1 19 | df['new_value'] = df.random_bool 20 | df2 = df[['new_value','random_float']] 21 | assert_(df2) 22 | 23 | def test_assignment_int(): 24 | df = pytest.df1 25 | df['new_value2'] = 4 26 | assert_(df) 27 | 28 | def test_assignment_float(): 29 | df = pytest.df1 30 | df['new_value2'] = 23.132 31 | assert_(df) 32 | 33 | def test_assignment_bool(): 34 | df = pytest.df1 35 | df['new_value2'] = True 36 | assert_(df) 37 | 38 | def test_assignment_str(): 39 | df = pytest.df1 40 | df['new_value2'] = 'some_str' 41 | assert_(df) 42 | 43 | def test_assignment_datetime(): 44 | df = pytest.df1 45 | df['new_value'] = datetime(1970, 1, 1) 46 | assert_(df) 47 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/utils/asserters.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas.testing import assert_frame_equal 3 | import pytest 4 | 5 | 6 | def assert_dataframes_equals(expected, actual): 7 | assert expected.shape==actual.shape 8 | assert set(expected.columns) == set(actual.columns) 9 | columns_order = list(expected.columns) 10 | a = actual[columns_order].sort_values(by=list(actual.columns)).reset_index(drop=True) 11 | e = expected[columns_order].sort_values(by=list(actual.columns)).reset_index(drop=True) 12 | assert_frame_equal(e, a, check_dtype=False) 13 | 14 | 15 | def get_expected_and_actual(df): 16 | actual_query_string = df.df_sql_convert_table.get_sql_string() 17 | actual_columns = df.df_sql_convert_table.columns 18 | datetime_columns = [c for c in actual_columns.keys() if actual_columns[c].dtype == 'DATETIME'] 19 | 20 | df_actual = pd.read_sql_query(actual_query_string, pytest.sql_connection, parse_dates=datetime_columns) 21 | df_expected = df.df_pandas 22 | 23 | return df_expected, df_actual 24 | 25 | def assert_(df): 26 | df_expected, df_actual = get_expected_and_actual(df) 27 | 28 | # i = df_expected.new_value != df_actual.new_value 29 | # a=df_expected[i][:3] 30 | # b=df_expected[i][:3] 31 | 32 | assert_dataframes_equals(df_expected, df_actual) 33 | -------------------------------------------------------------------------------- /example_runner.py: -------------------------------------------------------------------------------- 1 | from copy import copy 2 | import sqlite3 3 | import pandas as pd 4 | import pandas_to_sql 5 | from pandas_to_sql.testing.utils.fake_data_creation import create_fake_dataset 6 | from pandas_to_sql.conventions import flatten_grouped_dataframe 7 | 8 | # table_name = 'random_data' 9 | # df, _ = create_fake_dataset() 10 | # df_ = pandas_to_sql.wrap_df(df, table_name) 11 | # df2 = df_.groupby('random_int').agg({'random_float':['mean','sum','count'], 'random_str':', '.join}) 12 | # df2 = flatten_grouped_dataframe(df2) 13 | # print(df2.get_sql_string()) 14 | 15 | iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') 16 | 
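# What follows round-trips the iris frame through sqlite: wrap the DataFrame,
# apply pandas-style transformations, render the accumulated SQL, execute it
# against the database, and assert that both paths yield the same result.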
table_name = 'iris' 17 | sql_connection = sqlite3.connect('./iris.db') #create db 18 | iris.to_sql(table_name, sql_connection, if_exists='replace', index=False) 19 | 20 | df = pandas_to_sql.wrap_df(iris, table_name) 21 | pd_wrapped = pandas_to_sql.wrap_pd(pd) 22 | 23 | df_ = copy(df) 24 | df_['sepal_width_rounded'] = df_.sepal_width.round() 25 | df_1 = df_[df_.species=='setosa'].reset_index(drop=True) 26 | df_2 = df_[df_.species=='versicolor'].reset_index(drop=True) 27 | 28 | some_df = pd_wrapped.concat([df_1, df_2]).reset_index(drop=True) 29 | 30 | sql_string = some_df.get_sql_string() 31 | 32 | df_from_sql_database = pd.read_sql_query(sql_string, sql_connection) 33 | df_pandas = some_df.df_pandas 34 | 35 | from pandas_to_sql.testing.utils.asserters import assert_dataframes_equals 36 | assert_dataframes_equals(df_pandas, df_from_sql_database) 37 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_groupby.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas_to_sql.testing.utils.asserters import assert_ 3 | from pandas_to_sql.conventions import flatten_grouped_dataframe 4 | 5 | 6 | 7 | def test_groupby_mean(): 8 | df2 = pytest.df1.groupby('random_int').random_float.mean() 9 | assert_(flatten_grouped_dataframe(df2)) 10 | 11 | def test_groupby_sum(): 12 | df2 = pytest.df1.groupby('random_int').random_float.sum() 13 | assert_(flatten_grouped_dataframe(df2)) 14 | 15 | def test_groupby_count(): 16 | df2 = pytest.df1.groupby('random_int').random_float.count() 17 | assert_(flatten_grouped_dataframe(df2)) 18 | 19 | 20 | def test_groupby_agg_1(): 21 | df2 = pytest.df1 \ 22 | .groupby('random_int') \ 23 | .agg({'random_float':['mean','sum','count'], 'random_str':', '.join}) 24 | assert_(flatten_grouped_dataframe(df2)) 25 | 26 | def test_groupby_agg_2(): 27 | df2 = pytest.df1 \ 28 | .groupby('random_bool') \ 29 | .agg({'random_int':['mean','sum','count'], 'random_str':[', '.join]}) 30 | assert_(flatten_grouped_dataframe(df2)) 31 | 32 | 33 | def test_groupby_multiple_on(): 34 | df2 = pytest.df1 \ 35 | .groupby(['random_bool', 'random_int']).random_float.mean() 36 | assert_(flatten_grouped_dataframe(df2)) 37 | 38 | def test_groupby_multiple_on_agg(): 39 | df2 = pytest.df1 \ 40 | .groupby(['random_bool', 'random_int']) \ 41 | .agg({'random_float': ['count','sum']}) 42 | assert_(flatten_grouped_dataframe(df2)) -------------------------------------------------------------------------------- /pandas_to_sql/utils/pandas_interceptor.py: -------------------------------------------------------------------------------- 1 | from copy import copy 2 | import operator 3 | from pandas_to_sql.utils.pandas_dataframe_intercepter import PandasDataFrameIntercepter 4 | from pandas_to_sql.engine.table import create_table, Table 5 | 6 | class PandasIntercepter: 7 | def __init__(self, pandas): 8 | self.pandas = pandas 9 | 10 | def concat(self, objs, axis=0): 11 | objs_pandas = list(map(lambda x: x.df_pandas, objs)) 12 | a = self.pandas.concat(objs_pandas, axis=axis) 13 | objs_sql_convert = list(map(lambda x: x.df_sql_convert_table, objs)) 14 | b = concat(objs_sql_convert, axis=axis) 15 | return PandasDataFrameIntercepter(a,b) 16 | 17 | 18 | def concat(objs, axis=0): 19 | if axis != 0: 20 | raise Exception(f"supporting only axis==0") 21 | for df in objs: 22 | if not isinstance(df, Table): 23 | raise Exception(f'expected Table. 
got: {str(type(df))}') 24 | 25 | first = None 26 | for columns in list(map(lambda t: set(t.columns.keys()), objs)): 27 | if not first: 28 | first = columns 29 | else: 30 | if columns != first: 31 | raise Exception(f"expected all dataframes to have same columns") 32 | 33 | all_tables_sql_string = list(map(lambda x: x.get_sql_string(), objs)) 34 | new_table_sql_string = ' UNION ALL '.join(all_tables_sql_string) 35 | return create_table(table_name='Temp', 36 | columns=copy(objs[0]).columns, 37 | from_sql_string=new_table_sql_string) 38 | 39 | 40 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_merge.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas_to_sql.testing.utils.asserters import assert_ 3 | from pandas_to_sql.conventions import flatten_grouped_dataframe 4 | from copy import copy 5 | 6 | 7 | def test_merge_inner(): 8 | df = pytest.df1 9 | df2 = copy(df) 10 | df2['random_int_plus_3'] = df2.random_int + 3 11 | df2 = df2[df2.random_int < 3] 12 | df2 = df2[['random_int_plus_3','random_str']] 13 | df3 = df.merge(df2, on='random_str', how='inner') 14 | assert_(df3) 15 | 16 | 17 | def test_merge_left(): 18 | df = pytest.df1 19 | df2 = copy(df) 20 | df2['random_int_plus_3'] = df2.random_int + 3 21 | df2 = df2[df2.random_int < 3] 22 | df2 = df2[['random_int_plus_3','random_str']] 23 | df3 = df.merge(df2, on='random_str', how='left') 24 | assert_(df3) 25 | 26 | 27 | def test_merge_left_on_right_on_how_inner(): 28 | df = pytest.df1 29 | df2 = copy(df) 30 | df2['random_int_plus_3'] = df2.random_int + 3 31 | df2['random_str_2'] = df2.random_str 32 | df2 = df2[df2.random_int < 3] 33 | df2 = df2[['random_int_plus_3','random_str_2']] 34 | df3 = df.merge(df2, left_on='random_str', right_on='random_str_2', how='inner') 35 | assert_(df3) 36 | 37 | 38 | def test_merge_left_on_right_on_how_left(): 39 | df = pytest.df1 40 | df2 = copy(df) 41 | df2['random_int_plus_3'] = df2.random_int + 3 42 | df2['random_str_2'] = df2.random_str 43 | df2 = df2[df2.random_int < 3] 44 | df2 = df2[['random_int_plus_3','random_str_2']] 45 | df3 = df.merge(df2, left_on='random_str', right_on='random_str_2', how='left') 46 | assert_(df3) 47 | 48 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/utils/fake_data_creation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def random_datetimes_or_dates(start, end, out_format='datetime', n=10): 6 | ''' 7 | unix timestamp is in ns by default. 8 | I divide the unix time value by 10**9 to make it seconds (or 24*60*60*10**9 to make it days). 9 | The corresponding unit variable is passed to the pd.to_datetime function. 10 | Values for the (divide_by, unit) pair to select is defined by the out_format parameter. 
11 | for 1 -> out_format='datetime' 12 | for 2 -> out_format=anything else 13 | ''' 14 | (divide_by, unit) = ( 15 | 10**9, 's') if out_format == 'datetime' else (24*60*60*10**9, 'D') 16 | 17 | start_u = start.value//divide_by 18 | end_u = end.value//divide_by 19 | 20 | return pd.to_datetime(np.random.randint(start_u, end_u, n), unit=unit) 21 | 22 | 23 | def random_timedelta(start, end, n, unit='D', seed=None): 24 | if not seed: # from piR's answer 25 | np.random.seed(0) 26 | 27 | ndays = (end - start).days + 1 28 | return pd.to_timedelta(np.random.rand(n) * ndays, unit=unit) 29 | 30 | 31 | def create_fake_dataset(start=pd.to_datetime('2015-01-01'), end=pd.to_datetime('2018-01-01')): 32 | df = pd.DataFrame() 33 | df_size = 1000 34 | df_random_columns = { 35 | 'random_int': 'INT', 36 | 'random_float': 'FLOAT', 37 | 'random_bool': 'BOOL', 38 | 'random_datetime': 'DATETIME', 39 | 'random_str': 'VARCHAR', 40 | } 41 | df['random_int'] = np.random.randint(1, 6, df_size) 42 | df['random_float'] = np.random.randn(df_size) 43 | df['random_bool'] = np.random.randn(df_size) > 0 44 | df['random_datetime'] = random_datetimes_or_dates(start, end, n=df_size) 45 | df['random_str'] = pd.util.testing.rands_array(10, df_size) 46 | return df, df_random_columns 47 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_datetime.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta, datetime 2 | import pytest 3 | from pandas_to_sql.testing.utils.asserters import assert_, get_expected_and_actual 4 | from copy import copy 5 | import pandas as pd 6 | import pandas_to_sql 7 | 8 | 9 | def test_add_days(): 10 | df = pytest.df1 11 | df['new_value'] = df.random_datetime + timedelta(days=20) 12 | assert_(df) 13 | 14 | def test_radd_days(): 15 | df = pytest.df1 16 | df['new_value'] = timedelta(days=40) + df.random_datetime 17 | assert_(df) 18 | 19 | def test_sub_days(): 20 | df = pytest.df1 21 | df['new_value'] = df.random_datetime - timedelta(days=40) 22 | assert_(df) 23 | 24 | 25 | def test_add_zero_time_dateoffset(): 26 | df = pytest.df1 27 | df['new_value'] = df.random_datetime + pd.offsets.DateOffset(minutes=0, years=0) 28 | assert_(df) 29 | 30 | 31 | def test_dt_second(): 32 | df = pytest.df1 33 | df['seconds'] = df.random_datetime.dt.second 34 | assert_(df) 35 | 36 | def test_dt_month(): 37 | df = pytest.df1 38 | df['month'] = df.random_datetime.dt.month 39 | assert_(df) 40 | 41 | def test_dt_day(): 42 | df = pytest.df1 43 | df['day'] = df.random_datetime.dt.day 44 | assert_(df) 45 | 46 | def test_dt_hour(): 47 | df = pytest.df1 48 | df['hour'] = df.random_datetime.dt.hour 49 | assert_(df) 50 | 51 | def test_dt_year(): 52 | df = pytest.df1 53 | df['y'] = df.random_datetime.dt.year 54 | assert_(df) 55 | 56 | def test_dt_dayofweek(): 57 | df = pytest.df1 58 | df['dayofweek'] = df.random_datetime.dt.dayofweek 59 | assert_(df) 60 | 61 | def test_dt_week(): 62 | df = pytest.df1 63 | df['week'] = df.random_datetime.dt.week 64 | df_expected, df_actual = get_expected_and_actual(df) 65 | 66 | week_diff = (df_expected.week - df_actual.week).value_counts() 67 | 68 | # asserting week error <= 2. 
52 and 53 appear when the week number wraps around a year boundary (a modulo effect) 69 | assert (df_expected.week - df_actual.week).isin([0,1,2,52,53]).all() 70 | -------------------------------------------------------------------------------- README.md: -------------------------------------------------------------------------------- 1 | 2 | # pandas-to-sql 3 | **This library is not production ready!!** 4 | 5 | ## Intro 6 | Convert [pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) manipulations to sql query string. 7 | 8 | ![Tests](https://github.com/AmirPupko/pandas-to-sql/workflows/Tests/badge.svg) 9 | 10 | ![Publish to PyPI](https://github.com/AmirPupko/pandas-to-sql/workflows/Publish%20to%20PyPI/badge.svg) 11 | 12 | Supports: 13 | - [sqlite](https://sqlite.org/) 14 | 15 | ### Try it yourself 16 | 17 | ```python 18 | >>> import pandas as pd 19 | >>> import pandas_to_sql 20 | >>> iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') 21 | >>> df = pandas_to_sql.wrap_df(iris, table_name='iris') 22 | >>> df.get_sql_string() 23 | 'SELECT (sepal_length) AS sepal_length, (sepal_width) AS sepal_width, (petal_length) AS petal_length, (petal_width) AS petal_width, (species) AS species FROM iris' 24 | ``` 25 | 26 | ```python 27 | >>> df[df.species == 'setosa'].get_sql_string() 28 | "SELECT (sepal_length) AS sepal_length, (sepal_width) AS sepal_width, (petal_length) AS petal_length, (petal_width) AS petal_width, (species) AS species FROM iris WHERE ((species = 'setosa')) " 29 | ``` 30 | 31 | [Here are some more examples](https://github.com/AmirPupko/pandas-to-sql/blob/main/pandas_to_sql_colab_example.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AmirPupko/pandas-to-sql/blob/main/pandas_to_sql_colab_example.ipynb) 32 | 33 | 34 | ## Installation 35 | `pip install pandas-to-sql` 36 | 37 | 38 | ## Development 39 | 40 | ### Run example 41 | `python example_runner.py` 42 | 43 | ### Tests 44 | `pytest ./pandas_to_sql` 45 | 46 | ### Environment 47 | `conda env create -f environment.yml --prefix ./env` 48 | `conda activate ./env` 49 | `conda env update --prefix ./env -f environment.yml` 50 | `conda remove --prefix ./env --all` 51 | 52 | ### New release 53 | `python setup.py sdist bdist_wheel` 54 | `python -m twine upload --repository pypi --skip-existing dist/*` 55 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_str.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta, datetime 2 | import pytest 3 | from pandas_to_sql.testing.utils.asserters import assert_, get_expected_and_actual 4 | from copy import copy 5 | import pandas as pd 6 | import pandas_to_sql 7 | 8 | 9 | def test_replace(): 10 | df = pytest.df1 11 | df['new_value'] = df.random_str.str.replace('m','v').str.replace('z','_3') 12 | assert_(df) 13 | 14 | def test_lower(): 15 | df = pytest.df1 16 | df['new_value'] = df.random_str.str.lower() 17 | assert_(df) 18 | 19 | def test_upper(): 20 | df = pytest.df1 21 | df['new_value'] = df.random_str.str.upper() 22 | assert_(df) 23 | 24 | def test_slice1(): 25 | df = pytest.df1 26 | df['new_value'] = df.random_str.str.slice(1,3) 27 | assert_(df) 28 | 29 | def test_slice2(): 30 | df = pytest.df1 31 | df['new_value'] = df.random_str.str.slice(2) 32 | assert_(df) 33 | 34 | def test_slice3(): 35 | df = pytest.df1 36 | df['new_value'] = df.random_str.str.slice(stop=4) 37 |
assert_(df) 38 | 39 | def test_slice4(): 40 | df = pytest.df1 41 | df['new_value'] = df.random_str.str.slice(-1,-3) 42 | assert_(df) 43 | 44 | def test_strip(): 45 | df = pytest.df1 46 | df['new_value'] = df.random_str.str.strip('ABCKSLFjadkj') 47 | assert_(df) 48 | 49 | def test_strip_none_chars(): 50 | df = pytest.df1 51 | df['new_value1'] = df.random_str + ' ' 52 | df['new_value2'] = df.random_str.str.strip() 53 | assert_(df) 54 | 55 | def test_lstrip(): 56 | df = pytest.df1 57 | df['new_value'] = df.random_str.str.lstrip('ABCKSLFjadkj') 58 | assert_(df) 59 | 60 | 61 | def test_rstrip(): 62 | df = pytest.df1 63 | df['new_value'] = df.random_str.str.rstrip('ABCKSLFjadkj') 64 | assert_(df) 65 | 66 | def test_len(): 67 | df = pytest.df1 68 | df['new_value'] = df.random_str.str.len() 69 | assert_(df) 70 | 71 | def test_contains(): 72 | df = pytest.df1 73 | df['new_value1'] = df.random_str.str.contains('a') 74 | df['new_value2'] = df.random_str.str.contains('B') 75 | assert_(df) 76 | 77 | def test_contains_case_false(): 78 | df = pytest.df1 79 | df['new_value1'] = df.random_str.str.contains('a', case=False) 80 | df['new_value2'] = df.random_str.str.contains('B', case=False) 81 | assert_(df) -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_operations_numeric.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas_to_sql.testing.utils.asserters import assert_ 3 | 4 | 5 | def test_add(): 6 | df = pytest.df1 7 | df['new_value'] = df.random_float + 10 8 | assert_(df) 9 | 10 | def test_radd(): 11 | df = pytest.df1 12 | df['new_value'] = 10 + df.random_float 13 | assert_(df) 14 | 15 | def test_add_str(): 16 | df = pytest.df1 17 | df['new_value'] = df.random_str + '_some_other_str' 18 | assert_(df) 19 | 20 | def test_add_str_to_str(): 21 | df = pytest.df1 22 | df['new_value'] = df.random_str + '_' + df.random_str 23 | assert_(df) 24 | 25 | 26 | def test_sub(): 27 | df = pytest.df1 28 | df['new_value'] = df.random_float - 10 29 | assert_(df) 30 | 31 | def test_rsub(): 32 | df = pytest.df1 33 | df['new_value'] = 10 - df.random_float 34 | assert_(df) 35 | 36 | 37 | def test_mul(): 38 | df = pytest.df1 39 | df['new_value'] = df.random_float * 2 40 | assert_(df) 41 | 42 | def test_rmul(): 43 | df = pytest.df1 44 | df['new_value'] = 2.5 * df.random_int 45 | assert_(df) 46 | 47 | def test_truediv(): 48 | df = pytest.df1 49 | df['new_value'] = df.random_int / 2.0 50 | assert_(df) 51 | 52 | def test_truediv2(): 53 | df = pytest.df1 54 | df['new_value'] = df.random_float / 2 55 | assert_(df) 56 | 57 | def test_truediv_int_int(): 58 | df = pytest.df1 59 | df['new_value'] = df.random_int / 2 60 | assert_(df) 61 | 62 | def test_rtruediv(): 63 | df = pytest.df1 64 | df['new_value'] = 2 / df.random_float 65 | assert_(df) 66 | 67 | def test_floordiv(): 68 | df = pytest.df1 69 | df['new_value'] = df.random_float // 2.0 70 | assert_(df) 71 | 72 | def test_rfloordiv(): 73 | df = pytest.df1 74 | df['new_value'] = 1 // df.random_float 75 | assert_(df) 76 | 77 | def test_round(): 78 | df = pytest.df1 79 | df['new_value'] = df.random_float.round() 80 | assert_(df) 81 | 82 | def test_round_with_half_values(): 83 | df = pytest.df1 84 | df['a'] = 0.5 85 | df['b'] = 1.5 86 | df['c'] = 2.5 87 | df['d'] = 3.5 88 | df['e'] = -0.5 89 | df['f'] = -1.5 90 | df['g'] = -2.5 91 | df['h'] = -3.5 92 | 93 | for c in ['a','b','c','d','e','f','g','h']: 94 | df[c + '_new'] = df[c].round() 95 | 96 | assert_(df) 97 
| 98 | 99 | def test_abs(): 100 | df = pytest.df1 101 | df['new_value'] = df.random_float.abs() 102 | assert_(df) 103 | 104 | -------------------------------------------------------------------------------- /pandas_to_sql/testing/tests/test_operations_compare.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pandas_to_sql import wrap_df 4 | from pandas_to_sql.testing.utils.asserters import assert_ 5 | 6 | 7 | def test_gt1(): 8 | df = pytest.df1 9 | df['new_value'] = df.random_float > 10 10 | assert_(df) 11 | 12 | def test_gt2(): 13 | df = pytest.df1 14 | df['new_value'] = df.random_int > 3 15 | assert_(df) 16 | 17 | def test_abs_float(): 18 | df = pytest.df1 19 | df['new_value'] = abs(df.random_float) 20 | assert_(df) 21 | 22 | def test_abs_int(): 23 | df = pytest.df1 24 | df['new_value'] = abs(df.random_int) 25 | assert_(df) 26 | 27 | def test_ge(): 28 | df = pytest.df1 29 | df['new_value'] = df.random_int >= 3 30 | assert_(df) 31 | 32 | def test_ge_float(): 33 | df = pytest.df1 34 | df['new_value'] = df.random_float >= 0 35 | assert_(df) 36 | 37 | def test_ge2(): 38 | df = pytest.df1 39 | df['new_value'] = df.random_int >= 3 40 | assert_(df) 41 | 42 | def test_lt(): 43 | df = pytest.df1 44 | df['new_value'] = df.random_int < 3 45 | assert_(df) 46 | 47 | def test_le(): 48 | df = pytest.df1 49 | df['new_value'] = df.random_int <= 3 50 | assert_(df) 51 | 52 | def test_eq(): 53 | df = pytest.df1 54 | df['new_value'] = df.random_int == 3 55 | assert_(df) 56 | 57 | def test_ne(): 58 | df = pytest.df1 59 | df['new_value'] = df.random_int != 3 60 | assert_(df) 61 | 62 | def test_tilde(): 63 | df = pytest.df1 64 | df['new_value'] = ~df.random_bool 65 | assert_(df) 66 | 67 | def test_neg_bool(): 68 | df = pytest.df1 69 | df['new_value'] = -df.random_bool 70 | assert_(df) 71 | 72 | def test_neg_numeric(): 73 | df = pytest.df1 74 | df['new_value'] = -df.random_int 75 | assert_(df) 76 | 77 | 78 | def test_two_conds_and(): 79 | df = pytest.df1 80 | df['new_value'] = (df.random_float > 1) & (df.random_float <=2) 81 | assert_(df) 82 | 83 | def test_two_conds_or(): 84 | df = pytest.df1 85 | df['new_value'] = (df.random_float > 1) or True 86 | assert_(df) 87 | 88 | def test_fillna(): 89 | df = pd.DataFrame({'col':[1,None,.3,-20,None]}) 90 | table_name = 'some_fillna_table_name' 91 | df.to_sql(table_name, pytest.sql_connection, if_exists='replace', index=False) 92 | df_ = wrap_df(df, table_name) 93 | 94 | df_['new_value'] = df_.col.fillna(2) 95 | 96 | assert_(df_) 97 | 98 | def test_fillna2(): 99 | df = pd.DataFrame({'col':[1,None,.3,-20,None]}) 100 | table_name = 'some_fillna_table_name' 101 | df.to_sql(table_name, pytest.sql_connection, if_exists='replace', index=False) 102 | df_ = wrap_df(df, table_name) 103 | 104 | df_['new_value'] = df_.col.fillna('f') 105 | 106 | assert_(df_) 107 | 108 | def test_astype(): 109 | df = pytest.df1 110 | df['new_value'] = df.random_float.astype(int) 111 | assert_(df) 112 | 113 | -------------------------------------------------------------------------------- /pandas_to_sql/engine/columns/str_column.py: -------------------------------------------------------------------------------- 1 | from pandas_to_sql.engine.columns.column import Column 2 | from pandas_to_sql.engine.columns.numeric_columns import IntColumn 3 | from pandas_to_sql.engine.columns.common import add_common_operators_to_class, value_to_sql_string, create_column_from_operation 4 | 5 | 6 | class StrColumn(Column): 7 | def
__init__(self, sql_string): 8 | super().__init__(dtype='VARCHAR', sql_string=sql_string) 9 | 10 | def __getattribute__(self, attr): 11 | if attr == 'str': 12 | return self 13 | return object.__getattribute__(self, attr) 14 | 15 | def __add__(self, r): 16 | return create_column_from_operation(self, r, StrColumn, '||') 17 | 18 | def __radd__(self, l): 19 | return create_column_from_operation(l, self, StrColumn, '||') 20 | 21 | 22 | add_common_operators_to_class(StrColumn) 23 | 24 | StrColumn.lower = lambda self: StrColumn(sql_string=f'(LOWER({value_to_sql_string(self)}))') 25 | StrColumn.upper = lambda self: StrColumn(sql_string=f'(UPPER({value_to_sql_string(self)}))') 26 | 27 | StrColumn.replace = lambda self, old, new: \ 28 | StrColumn(sql_string=f'(REPLACE({value_to_sql_string(self)}, {value_to_sql_string(old)}, {value_to_sql_string(new)}))') 29 | 30 | 31 | def slice_(self, start=None, stop=None, step=None): 32 | if step: raise Exception('slice "step" not supported') 33 | 34 | start = start if start else 0 35 | start+=1 36 | 37 | if stop: 38 | stop +=1 39 | length = stop - start 40 | s = f'(SUBSTR({value_to_sql_string(self)}, {start}, {length}))' 41 | else: 42 | s = f'(SUBSTR({value_to_sql_string(self)}, {start}))' 43 | 44 | return StrColumn(sql_string=s) 45 | 46 | 47 | StrColumn.slice = slice_ 48 | 49 | 50 | 51 | 52 | def strip_(self, op, chars=None): 53 | if not chars: 54 | chars = ' ' 55 | if not isinstance(chars, str): 56 | raise TypeError(f'"chars" must be str. got {str(type(chars))}') 57 | 58 | s = f"({op}({value_to_sql_string(self)}, {value_to_sql_string(chars)}))" 59 | return StrColumn(sql_string=s) 60 | 61 | 62 | StrColumn.strip = lambda self, chars=None: strip_(self, 'TRIM', chars) 63 | StrColumn.lstrip = lambda self, chars=None: strip_(self, 'LTRIM', chars) 64 | StrColumn.rstrip = lambda self, chars=None: strip_(self, 'RTRIM', chars) 65 | 66 | StrColumn.len = lambda self: IntColumn(sql_string=f'(LENGTH({value_to_sql_string(self)}))') 67 | 68 | 69 | 70 | def contains(self, s, case=True): 71 | if not isinstance(s, str): 72 | raise TypeError(f'"s" must be str. got {str(type(s))}') 73 | 74 | if case==False: 75 | sql_string = f"(INSTR(LOWER({value_to_sql_string(self)}), LOWER({value_to_sql_string(s)})))" 76 | else: 77 | sql_string = f"(INSTR({value_to_sql_string(self)}, {value_to_sql_string(s)}))" 78 | 79 | # sql_string = f"(INSTR({value_to_sql_string(self)}, {value_to_sql_string(s)}))" 80 | sql_string = f"(CAST({sql_string} > 0 AS BOOL))" 81 | return StrColumn(sql_string=sql_string) 82 | 83 | 84 | StrColumn.contains = contains 85 | -------------------------------------------------------------------------------- /pandas_to_sql/engine/columns/datetime_column.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from dateutil.relativedelta import relativedelta 3 | import pandas as pd 4 | from pandas_to_sql.engine.columns.column import Column 5 | from pandas_to_sql.engine.columns.common import add_common_operators_to_class, value_to_sql_string 6 | from pandas_to_sql.engine.columns.numeric_columns import IntColumn, FloatColumn 7 | 8 | time_unit_to_format = { 9 | 'second': '%S', 10 | 'month': '%m', 11 | 'minute': '%M', 12 | 'hour': '%H', 13 | 'week': '%W', 14 | 'year': '%Y', 15 | 'day': '%d', 16 | 'dayofweek': '%w'} 17 | 18 | 19 | class DatetimeColumn(Column): 20 | 21 | def __init__(self, sql_string): 22 | super().__init__(dtype='DATETIME', sql_string=sql_string) 23 | 24 | def __getattribute__(self, attr): 25 | if attr == 'dt': 26 | return self 27 | if attr == 'dayofweek': 28 | return self.extract_dayofweek() 29 | if attr in time_unit_to_format.keys(): 30 | return self.extract_time_unit(time_unit_to_format[attr]) 31 | 32 | return object.__getattribute__(self, attr) 33 | 34 | 35 | def get_sql_for_format(self, format): 36 | return f"(CAST(strftime('{format}', {value_to_sql_string(self)}) AS INT))" 37 | 38 | def extract_time_unit(self, format): 39 | sql_string = self.get_sql_for_format(format) 40 | return IntColumn(sql_string=sql_string) 41 | 42 | def extract_dayofweek(self): 43 | sql_string = self.get_sql_for_format(time_unit_to_format['dayofweek']) 44 | sql_string = f"( ({sql_string} + 6) % 7)" 45 | return IntColumn(sql_string=sql_string) 46 | 47 | 48 | def __my_add__(col, v): 49 | if isinstance(v, timedelta): 50 | # https://docs.python.org/3/library/datetime.html#datetime.timedelta 51 | sign = '+' if v.days >= 0 else '-' 52 | added_days = f"'{sign}{abs(v.days)} days'" 53 | 54 | sign = '+' if v.seconds >= 0 else '-' 55 | added_seconds = f"'{sign}{abs(v.seconds)} seconds'" 56 | 57 | sql_string = f"(datetime({value_to_sql_string(col)}, {added_days}, {added_seconds}))" 58 | return DatetimeColumn(sql_string=sql_string) 59 | elif isinstance(v, relativedelta): 60 | s = [] 61 | for t_type, t_value in v.kwds.items(): 62 | sign = '+' if t_value >= 0 else '-' 63 | s.append(f"'{sign}{abs(t_value)} {t_type}'") 64 | sql_string = f"(datetime({value_to_sql_string(col)}, {', '.join(s)}))" 65 | return DatetimeColumn(sql_string=sql_string) 66 | elif isinstance(v, pd.offsets.DateOffset): 67 | s = [] 68 | for t_type, t_value in v.kwds.items(): 69 | sign = '+' if t_value >= 0 else '-' 70 | s.append(f"'{sign}{abs(t_value)} {t_type}'") 71 | sql_string = f"(datetime({value_to_sql_string(col)}, {', '.join(s)}))" 72 | return DatetimeColumn(sql_string=sql_string) 73 | else: 74 | raise Exception(f'Supporting only timedelta, relativedelta and DateOffset, got {str(type(v))}') 75 | 76 | 77 | add_common_operators_to_class(DatetimeColumn) 78 | 79 | DatetimeColumn.__add__ = __my_add__ 80 | DatetimeColumn.__radd__ = lambda self, l: __my_add__(self,
l) 81 | DatetimeColumn.__sub__ = lambda self, r: __my_add__(self, -r) 82 | 83 | -------------------------------------------------------------------------------- /pandas_to_sql/engine/columns/common.py: -------------------------------------------------------------------------------- 1 | 2 | import numbers 3 | import operator 4 | from datetime import datetime 5 | from pandas_to_sql.engine.columns.column import Column 6 | from pandas_to_sql.utils.helpers import convert_df_type 7 | 8 | def get_column_class_from_type(col_type): 9 | from pandas_to_sql.engine.columns.bool_column import BoolColumn 10 | from pandas_to_sql.engine.columns.numeric_columns import IntColumn, FloatColumn 11 | from pandas_to_sql.engine.columns.str_column import StrColumn 12 | from pandas_to_sql.engine.columns.datetime_column import DatetimeColumn 13 | if col_type == 'BOOL': return BoolColumn 14 | elif col_type == 'INT': return IntColumn 15 | elif col_type == 'FLOAT': return FloatColumn 16 | elif col_type == 'VARCHAR': return StrColumn 17 | elif col_type == 'DATETIME': return DatetimeColumn 18 | else: raise Exception(f"could not convert column type. got: {str(col_type)}") 19 | 20 | 21 | def value_to_sql_string(value): 22 | if isinstance(value, numbers.Number): 23 | return str(value) 24 | elif isinstance(value, str): 25 | return "'" + value + "'" 26 | elif isinstance(value, datetime): 27 | return f"datetime('{value.strftime('%Y-%m-%d %H:%M:%S')}')" 28 | elif isinstance(value, Column): 29 | return value.sql_string 30 | raise Exception(f"Value not supported. supporting: primitives and {str(Column)}. got {str(type(value))}") 31 | 32 | 33 | def create_column_from_value(v): 34 | from pandas_to_sql.engine.columns.bool_column import BoolColumn 35 | from pandas_to_sql.engine.columns.str_column import StrColumn 36 | from pandas_to_sql.engine.columns.datetime_column import DatetimeColumn 37 | from pandas_to_sql.engine.columns.numeric_columns import IntColumn, FloatColumn 38 | sql_string = value_to_sql_string(v) 39 | if isinstance(v, bool): return BoolColumn(sql_string)  # check bool before int: bool is a subclass of int 40 | if isinstance(v, int): return IntColumn(sql_string) 41 | if isinstance(v, float): return FloatColumn(sql_string) 42 | if isinstance(v, str): return StrColumn(sql_string) 43 | if isinstance(v, datetime): return DatetimeColumn(sql_string) 44 | 45 | raise Exception(f'trying to set table column with unsupported type. expected types are Column or primitives. got type: {str(type(v))}') 46 | 47 | def create_column_from_operation(l, r, dtype, op): 48 | return dtype(sql_string=f'({value_to_sql_string(l)} {op} {value_to_sql_string(r)})') 49 | 50 | 51 | def add_common_operators_to_class(class_type): 52 | from pandas_to_sql.engine.columns.bool_column import BoolColumn 53 | 54 | def __lt__(self,other): 55 | return create_column_from_operation(self, other, BoolColumn, '<') 56 | 57 | def __le__(self,other): 58 | return create_column_from_operation(self, other, BoolColumn, '<=') 59 | 60 | def __gt__(self,other): 61 | return create_column_from_operation(self, other, BoolColumn, '>') 62 | 63 | def __ge__(self,other): 64 | return create_column_from_operation(self, other, BoolColumn, '>=') 65 | 66 | def __eq__(self,other): 67 | return create_column_from_operation(self, other, BoolColumn, '=') 68 | 69 | def __ne__(self,other): 70 | return create_column_from_operation(self, other, BoolColumn, '<>') 71 | 72 | def __and__(self,other): 73 | return create_column_from_operation(self, other, BoolColumn, 'AND') 74 | 75 | def __or__(self,other): 76 | return create_column_from_operation(self, other, BoolColumn, 'OR') 77 | 78 | def fillna(self, v): 79 | dtype = type(self) 80 | return dtype(sql_string=f'(IFNULL({value_to_sql_string(self)}, {value_to_sql_string(v)}))') 81 | 82 | def astype(self, t): 83 | tt = convert_df_type(t) 84 | dtype = get_column_class_from_type(tt) 85 | return dtype(sql_string=f'(CAST({value_to_sql_string(self)} AS {tt}))') 86 | 87 | 88 | class_type.__lt__ = __lt__ 89 | class_type.__gt__ = __gt__ 90 | class_type.__le__ = __le__ 91 | class_type.__ge__ = __ge__ 92 | class_type.__eq__ = __eq__ 93 | class_type.__ne__ = __ne__ 94 | class_type.__and__ = __and__ 95 | class_type.__or__ = __or__ 96 | class_type.fillna = fillna 97 | class_type.astype = astype 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /pandas_to_sql/engine/columns/numeric_columns.py: -------------------------------------------------------------------------------- 1 | from pandas_to_sql.engine.columns.column import Column 2 | from pandas_to_sql.engine.columns.common import add_common_operators_to_class, value_to_sql_string, create_column_from_operation 3 | 4 | 5 | class FloatColumn(Column): 6 | def __init__(self, sql_string): 7 | super().__init__(dtype='FLOAT', sql_string=sql_string) 8 | 9 | 10 | class IntColumn(Column): 11 | def __init__(self, sql_string): 12 | super().__init__(dtype='INT', sql_string=sql_string) 13 | 14 | 15 | def __floordiv__(self, r): 16 | # http://sqlite.1065341.n5.nabble.com/floor-help-td46158.html 17 | return FloatColumn(sql_string=f'( ROUND(({value_to_sql_string(self)} / {value_to_sql_string(r)}) - 0.5) )') 18 | 19 | def __rfloordiv__(self, l): 20 | # http://sqlite.1065341.n5.nabble.com/floor-help-td46158.html 21 | return FloatColumn(sql_string=f'( ROUND(({value_to_sql_string(l)} / {value_to_sql_string(self)}) - 0.5) )') 22 | 23 | def is_int(v): 24 | return isinstance(v, int) or isinstance(v, IntColumn) 25 | 26 | def numeric_op_result_from_types(l, r): 27 | x = IntColumn if is_int(l) and is_int(r) else FloatColumn 28 | return x 29 | 30 | def __add__(self, r): 31 | res_column_type = numeric_op_result_from_types(self, r) 32 | return create_column_from_operation(self, r, res_column_type, '+') 33 | 34 | def __radd__(self, l): 35 | res_column_type = numeric_op_result_from_types(l, self) 36 | return create_column_from_operation(l, self, res_column_type, '+') 37 | 38 | def __sub__(self, r): 39 |
res_column_type = numeric_op_result_from_types(self, r) 40 | return create_column_from_operation(self, r, res_column_type, '-') 41 | 42 | def __rsub__(self, l): 43 | res_column_type = numeric_op_result_from_types(l, self) 44 | return create_column_from_operation(l, self, res_column_type, '-') 45 | 46 | def __mul__(self, r): 47 | res_column_type = numeric_op_result_from_types(self, r) 48 | return create_column_from_operation(self, r, res_column_type, '*') 49 | 50 | def __rmul__(self, l): 51 | res_column_type = numeric_op_result_from_types(l, self) 52 | return create_column_from_operation(l, self, res_column_type, '*') 53 | 54 | def __truediv__(self, r): 55 | return FloatColumn(sql_string=f'(({value_to_sql_string(self)} + 0.0) / {value_to_sql_string(r)})') 56 | 57 | def __rtruediv__(self, l): 58 | return FloatColumn(sql_string=f'(({value_to_sql_string(l)} + 0.0) / {value_to_sql_string(self)})') 59 | 60 | def __abs__(self): 61 | return type(self)(sql_string=f'ABS({value_to_sql_string(self)})') 62 | 63 | def __neg__(self): 64 | return type(self)(sql_string=f'(-({value_to_sql_string(self)}))') 65 | 66 | 67 | def round_(self): 68 | # https://docs.python.org/3/library/functions.html#round 69 | v = value_to_sql_string(self) 70 | integer_part = f'(CAST({v} AS INT))' 71 | fractional_part = f'(ABS({v}) - ROUND(ABS({v})-0.5))' 72 | 73 | is_integer_part_even = f'({integer_part}%2 == 0)' 74 | is_fractional_part_exactly_half = f'({fractional_part}==.5)' 75 | 76 | simple_round = f'(ROUND({v}))' 77 | round_with_change = f'(CASE WHEN {v}>0 THEN ROUND({v}-0.001) ELSE ROUND({v}+0.001) END)' 78 | 79 | s = f'(CASE WHEN {is_fractional_part_exactly_half} AND {is_integer_part_even} THEN {round_with_change} ELSE {simple_round} END)' 80 | return FloatColumn(sql_string=s) 81 | 82 | def abs_(self): 83 | return type(self)(sql_string=f'ABS({value_to_sql_string(self)})') 84 | 85 | 86 | 87 | add_common_operators_to_class(FloatColumn) 88 | FloatColumn.__add__ = __add__ 89 | FloatColumn.__radd__ = __radd__ 90 | FloatColumn.__sub__ = __sub__ 91 | FloatColumn.__rsub__ = __rsub__ 92 | FloatColumn.__mul__ = __mul__ 93 | FloatColumn.__rmul__ = __rmul__ 94 | FloatColumn.__floordiv__ = __floordiv__ 95 | FloatColumn.__rfloordiv__ = __rfloordiv__ 96 | FloatColumn.__truediv__ = __truediv__ 97 | FloatColumn.__rtruediv__ = __rtruediv__ 98 | FloatColumn.__abs__ = __abs__ 99 | FloatColumn.__neg__ = __neg__ 100 | FloatColumn.round = round_ 101 | FloatColumn.abs = abs_ 102 | 103 | 104 | add_common_operators_to_class(IntColumn) 105 | IntColumn.__add__ = __add__ 106 | IntColumn.__radd__ = __radd__ 107 | IntColumn.__sub__ = __sub__ 108 | IntColumn.__rsub__ = __rsub__ 109 | IntColumn.__mul__ = __mul__ 110 | IntColumn.__rmul__ = __rmul__ 111 | IntColumn.__floordiv__ = __floordiv__ 112 | IntColumn.__rfloordiv__ = __rfloordiv__ 113 | IntColumn.__truediv__ = __truediv__ 114 | IntColumn.__rtruediv__ = __rtruediv__ 115 | IntColumn.__abs__ = __abs__ 116 | IntColumn.__neg__ = __neg__ 117 | IntColumn.round = round_ 118 | IntColumn.abs = abs_ 119 | 120 | -------------------------------------------------------------------------------- /pandas_to_sql/engine/grouped_table.py: -------------------------------------------------------------------------------- 1 | from copy import copy 2 | from pandas_to_sql.engine.columns.column import Column 3 | from pandas_to_sql.engine.columns.common import get_column_class_from_type 4 | 5 | class GroupedTable: 6 | table = None 7 | groupings = None 8 | 9 | def __init__(self, table, groupings): 10 | self.table = table 11 | 
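# 'groupings' maps each group-by column name to its Column expression;
# agg() re-emits these both in the SELECT list and in the GROUP BY clause.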
self.groupings = groupings 12 | 13 | def __copy__(self): 14 | return GroupedTable(copy(self.table), copy(self.groupings)) 15 | 16 | def __getitem__(self, key): 17 | if isinstance(key, Column): 18 | raise Exception('Cant filter/where GroupedTable') 19 | if isinstance(key, list): 20 | return GroupedTable(self.table[key], copy(self.groupings)) 21 | if isinstance(key, str): 22 | return GroupedTable(self.table[[key]], copy(self.groupings)) 23 | raise Exception(f'GroupedTable __getitem__ key type not supported. type: {str(type(key))}') 24 | 25 | def __setitem__(self, key, newvalue): 26 | raise Exception(f'GroupedTable __setitem__ not supported') 27 | 28 | def __getattr__(self, attribute_name): 29 | return self[attribute_name] 30 | 31 | def mean(self): 32 | return self.agg(dict(map(lambda k: (k,'mean'),self.table.columns.keys()))) 33 | 34 | def count(self): 35 | return self.agg(dict(map(lambda k: (k,'count'),self.table.columns.keys()))) 36 | 37 | def sum(self): 38 | return self.agg(dict(map(lambda k: (k,'sum'),self.table.columns.keys()))) 39 | 40 | def agg(self, v): 41 | if isinstance(v, str): 42 | return self.agg(dict(zip(self.table.columns.keys(), v))) 43 | elif isinstance(v, list): 44 | return self.agg(dict(zip(self.table.columns.keys(), v))) 45 | elif isinstance(v, dict): 46 | if len( set(v.keys()) & set(self.groupings.keys()) ) > 0: 47 | raise Exception("grouped table doesnt support same column in 'on' and 'select'") 48 | self_table_copy = copy(self.table) 49 | # create groupby columns query 50 | groupby_select_columns = {} 51 | for column_name in v.keys(): 52 | column = self_table_copy[column_name] 53 | operations = v[column_name] if isinstance(v[column_name], list) else [v[column_name]] 54 | for operation in operations: 55 | join_str_seperator = None 56 | operation_column_name_override = None 57 | dtype = None 58 | 59 | if callable(operation) and operation.__qualname__=='str.join': 60 | join_str_seperator = operation.__self__ 61 | operation_column_name_override = 'join' 62 | operation = 'group_concat' 63 | # if not isinstance(operation, str): 64 | # raise Exception(f"groupby agg support only str name for operations or ','.join. got: {type(operation)}") 65 | # SUPPORTED_OPERATIONS = ['count','sum','mean','avg'] 66 | # if operation not in SUPPORTED_OPERATIONS: 67 | # raise Exception(f"groupby operation '{operation}' is not supported. 
/pandas_to_sql/utils/pandas_dataframe_intercepter.py:
--------------------------------------------------------------------------------
1 | from copy import copy
2 | import operator
3 | # Proxy that mirrors every operation onto both a real pandas object and its SQL Table twin.
4 | class PandasDataFrameIntercepter:
5 |     def __init__(self, df_pandas, df_sql_convert_table):
6 |         self.df_pandas = df_pandas
7 |         self.df_sql_convert_table = df_sql_convert_table
8 | 
9 |     def __repr__(self):
10 |         return self.df_pandas.__repr__()
11 | 
12 |     def __format__(self, fmt):
13 |         return self.df_pandas.__format__(fmt)
14 | 
15 |     def __str__(self):
16 |         return self.df_pandas.__str__()
17 | 
18 |     @staticmethod
19 |     def get_attr_for_df_pandas_if_needed(obj):
20 |         if isinstance(obj, PandasDataFrameIntercepter):
21 |             return object.__getattribute__(obj, 'df_pandas')
22 |         else:
23 |             return obj
24 | 
25 |     @staticmethod
26 |     def get_attr_for_df_sql_convert_table_if_needed(obj):
27 |         if isinstance(obj, PandasDataFrameIntercepter):
28 |             return object.__getattribute__(obj, 'df_sql_convert_table')
29 |         else:
30 |             return obj
31 | 
32 |     def __getattribute__(self, name):
33 |         if name in ['df_pandas', 'df_sql_convert_table']:
34 |             return object.__getattribute__(self, name)
35 | 
36 |         df_sql_convert_table_attr = self.df_sql_convert_table.__getattribute__(name)
37 |         if name=='get_sql_string' and hasattr(df_sql_convert_table_attr, '__call__'):
38 |             return lambda *args, **kwargs: df_sql_convert_table_attr(*args, **kwargs)
39 | 
40 |         df_pandas_attr = self.df_pandas.__getattribute__(name)
41 |         if name=='columns' and not hasattr(df_pandas_attr, '__call__'):
42 |             return df_pandas_attr
43 | 
44 |         if hasattr(df_sql_convert_table_attr, '__call__'):
45 |             def _(*args, **kwargs):
46 |                 def __dictionary_map_values(d, func):
47 |                     return {k: func(v) for k, v in d.items()}
48 | 
49 |                 args_df_pandas = tuple(map(PandasDataFrameIntercepter.get_attr_for_df_pandas_if_needed, args))
50 |                 
args_obj_new = tuple(map(PandasDataFrameIntercepter.get_attr_for_df_sql_convert_table_if_needed, args)) 51 | 52 | kwargs_df_pandas = __dictionary_map_values(kwargs, PandasDataFrameIntercepter.get_attr_for_df_pandas_if_needed) 53 | kwargs_obj_new = __dictionary_map_values(kwargs, PandasDataFrameIntercepter.get_attr_for_df_sql_convert_table_if_needed) 54 | 55 | a = df_pandas_attr(*args_df_pandas, **kwargs_df_pandas) 56 | b = df_sql_convert_table_attr(*args_obj_new, **kwargs_obj_new) 57 | return PandasDataFrameIntercepter(a, b) 58 | return _ 59 | else: 60 | return PandasDataFrameIntercepter(df_pandas_attr, df_sql_convert_table_attr) 61 | 62 | def __getitem__(self, key): 63 | a = self.df_pandas[PandasDataFrameIntercepter.get_attr_for_df_pandas_if_needed(key)] 64 | b = self.df_sql_convert_table[PandasDataFrameIntercepter.get_attr_for_df_sql_convert_table_if_needed(key)] 65 | return PandasDataFrameIntercepter(a, b) 66 | 67 | def __setitem__(self, key, newvalue): 68 | self.df_pandas[key] = PandasDataFrameIntercepter.get_attr_for_df_pandas_if_needed(newvalue) 69 | self.df_sql_convert_table[key] = PandasDataFrameIntercepter.get_attr_for_df_sql_convert_table_if_needed(newvalue) 70 | return PandasDataFrameIntercepter(self.df_pandas, self.df_sql_convert_table) 71 | 72 | def __getattr__(self, attribute_name): 73 | a = self.df_pandas[attribute_name] 74 | b = self.df_sql_convert_table[attribute_name] 75 | return PandasDataFrameIntercepter(a, b) 76 | 77 | def __copy__(self): 78 | return PandasDataFrameIntercepter(copy(self.df_pandas), copy(self.df_sql_convert_table)) 79 | 80 | @staticmethod 81 | def run_operation_and_return(left, right, op): 82 | left_ = PandasDataFrameIntercepter.get_attr_for_df_pandas_if_needed(left) 83 | right_ = PandasDataFrameIntercepter.get_attr_for_df_pandas_if_needed(right) 84 | a = op(left_, right_) 85 | 86 | left_ = PandasDataFrameIntercepter.get_attr_for_df_sql_convert_table_if_needed(left) 87 | right_ = PandasDataFrameIntercepter.get_attr_for_df_sql_convert_table_if_needed(right) 88 | b = op(left_, right_) 89 | return PandasDataFrameIntercepter(a, b) 90 | 91 | @staticmethod 92 | def run_operation_single_and_return(obj, op): 93 | a = PandasDataFrameIntercepter.get_attr_for_df_pandas_if_needed(obj) 94 | b = PandasDataFrameIntercepter.get_attr_for_df_sql_convert_table_if_needed(obj) 95 | a = op(a) 96 | b = op(b) 97 | return PandasDataFrameIntercepter(a, b) 98 | 99 | # comparisons 100 | def __lt__(self,other): 101 | return PandasDataFrameIntercepter.run_operation_and_return(self, other, operator.lt) 102 | 103 | def __le__(self,other): 104 | return PandasDataFrameIntercepter.run_operation_and_return(self, other, operator.le) 105 | 106 | def __gt__(self,other): 107 | return PandasDataFrameIntercepter.run_operation_and_return(self, other, operator.gt) 108 | 109 | def __ge__(self,other): 110 | return PandasDataFrameIntercepter.run_operation_and_return(self, other, operator.ge) 111 | 112 | def __eq__(self,other): 113 | return PandasDataFrameIntercepter.run_operation_and_return(self, other, operator.eq) 114 | 115 | def __ne__(self,other): 116 | return PandasDataFrameIntercepter.run_operation_and_return(self, other, operator.ne) 117 | 118 | def __abs__(self): 119 | return PandasDataFrameIntercepter.run_operation_single_and_return(self, operator.abs) 120 | 121 | def __neg__(self): 122 | return PandasDataFrameIntercepter.run_operation_single_and_return(self, operator.neg) 123 | 124 | def __invert__(self): 125 | return PandasDataFrameIntercepter.run_operation_single_and_return(self, 
operator.invert)
126 | 
127 |     def __contains__(self, r):
128 |         return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.contains)
129 | 
130 |     # numeric
131 |     def __add__(self, r):
132 |         return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.add)
133 | 
134 |     def __sub__(self, r):
135 |         return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.sub)
136 | 
137 |     def __mul__(self, r):
138 |         return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.mul)
139 | 
140 |     # def __matmul__(self, r):
141 |     #     return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.matmul)
142 | 
143 |     def __truediv__(self, r):
144 |         return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.truediv)
145 | 
146 |     def __floordiv__(self, r):
147 |         return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.floordiv)
148 | 
149 |     def __mod__(self, r):
150 |         return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.mod)
151 | 
152 |     def __pow__(self, r):
153 |         return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.pow)
154 | 
155 |     def __and__(self, r):
156 |         return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.and_)
157 | 
158 |     def __or__(self, r):
159 |         return PandasDataFrameIntercepter.run_operation_and_return(self, r, operator.or_)
160 | 
161 |     # numeric r
162 |     def __radd__(self, l):
163 |         return PandasDataFrameIntercepter.run_operation_and_return(l, self, operator.add)
164 | 
165 |     def __rsub__(self, l):
166 |         return PandasDataFrameIntercepter.run_operation_and_return(l, self, operator.sub)
167 | 
168 |     def __rmul__(self, l):
169 |         return PandasDataFrameIntercepter.run_operation_and_return(l, self, operator.mul)
170 | 
171 |     def __rmatmul__(self, l):
172 |         return PandasDataFrameIntercepter.run_operation_and_return(l, self, operator.matmul)
173 | 
174 |     def __rtruediv__(self, l):
175 |         return PandasDataFrameIntercepter.run_operation_and_return(l, self, operator.truediv)
176 | 
177 |     def __rfloordiv__(self, l):
178 |         return PandasDataFrameIntercepter.run_operation_and_return(l, self, operator.floordiv)
179 | 
180 |     def __rmod__(self, l):
181 |         return PandasDataFrameIntercepter.run_operation_and_return(l, self, operator.mod)
182 | 
183 |     def __rpow__(self, l):
184 |         return PandasDataFrameIntercepter.run_operation_and_return(l, self, operator.pow)
185 | 
186 |     def __rand__(self, l):
187 |         return PandasDataFrameIntercepter.run_operation_and_return(l, self, operator.and_)
188 | 
189 |     def __ror__(self, l):
190 |         return PandasDataFrameIntercepter.run_operation_and_return(l, self, operator.or_)
191 | 
192 | 
193 | 
194 | 
195 | 
--------------------------------------------------------------------------------
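The intercepter above is the library's core trick: every attribute access, method call, and operator is applied to both the live pandas object and its Table twin, so the pandas result and the accumulated SQL stay in lockstep. A minimal sketch (names invented for this example):

import pandas as pd
import pandas_to_sql

numbers = pd.DataFrame({'x': [1, 2, 3]})
t = pandas_to_sql.wrap_df(numbers, 'numbers')

filtered = t[t.x > 1]             # pandas boolean indexing and Table.where run together
print(filtered.df_pandas)         # the real, already-filtered pandas DataFrame
print(filtered.get_sql_string())  # prints something like: SELECT (x) AS x FROM numbers WHERE ((x > 1))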
/pandas_to_sql/engine/table.py:
--------------------------------------------------------------------------------
1 | from copy import copy
2 | from pandas_to_sql.engine.columns.column import Column
3 | from pandas_to_sql.engine.grouped_table import GroupedTable
4 | from pandas_to_sql.engine.columns.common import get_column_class_from_type, create_column_from_value
5 | 
6 | 
7 | class Table:
8 |     table_name = None
9 |     columns = None
10 |     filters = None
11 |     from_sql_string = None
12 |     had_changed = None
13 | 
14 |     def __init__(self, table_name, columns, from_sql_string, filters, had_changed):
15 |         self.table_name = table_name
16 |         self.columns = columns
17 |         self.filters = filters
18 |         self.from_sql_string = from_sql_string
19 |         self.had_changed = had_changed
20 | 
21 |     def __getitem__(self, key):
22 |         if isinstance(key, Column):
23 |             if key.dtype != 'BOOL':
24 |                 raise Exception('Can only filter/where using a column of type BOOL, got %s' % (key.dtype))
25 |             return self.where(key)
26 |         if isinstance(key, list):
27 |             if not all(map(lambda x: isinstance(x, str), key)):
28 |                 raise Exception('List must be all strings, got %s' % (key))
29 |             if not all(map(lambda x: x in self.columns, key)):
30 |                 raise Exception('All column names must be columns in the table, got %s' % (key))
31 |             return self.select(key)
32 | 
33 |         c = copy(self.columns[key])
34 |         return c
35 | 
36 |     def __setitem__(self, key, newvalue):
37 |         if isinstance(newvalue, Column):  # isinstance already covers Column subclasses
38 |             self.columns[key] = newvalue
39 |             self.had_changed = True
40 |         else:
41 |             self.columns[key] = create_column_from_value(newvalue)
42 |             self.had_changed = True
43 | 
44 |     def __getattr__(self, attribute_name):
45 |         return self[attribute_name]
46 | 
47 |     def __copy__(self):
48 |         columns_copy = {}
49 |         for c in self.columns.keys():
50 |             columns_copy[c] = self[c] # column deep copy will occur in __getitem__
51 | 
52 |         filters_copy = []
53 |         for f in self.filters: filters_copy.append(copy(f))
54 | 
55 |         result_table = create_table(table_name=self.table_name,
56 |                                     from_sql_string=self.from_sql_string,
57 |                                     had_changed=self.had_changed,
58 |                                     columns=columns_copy,
59 |                                     filters=filters_copy)
60 |         return result_table
61 | 
62 |     def reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill=''):
63 |         return copy(self)
64 | 
65 |     def to_frame(self):
66 |         return copy(self)
67 | 
68 |     def rename(self, columns):
69 |         new_table = copy(self)
70 |         new_table.had_changed = True  # mark the copy, not self, so the original table is untouched
71 |         new_columns = {}
72 |         for col_name, col_value in new_table.columns.items():
73 |             if col_name in columns.keys():
74 |                 new_columns[columns[col_name]] = col_value
75 |             else:
76 |                 new_columns[col_name] = col_value
77 | 
78 |         new_table.columns = new_columns
79 |         return new_table
80 | 
81 |     def drop(self, columns):
82 |         new_table = copy(self)
83 |         new_table.had_changed = True
84 |         new_columns = { col_name: col_value
85 |                         for col_name, col_value in new_table.columns.items()
86 |                         if col_name not in columns }
87 |         new_table.columns = new_columns
88 |         return new_table
89 | 
90 |     def where(self, cond_column):
91 |         new_table = copy(self)
92 |         new_table.had_changed = True
93 |         new_table.filters.append(cond_column)
94 |         return new_table
95 | 
96 |     def select(self, columns_names):
97 |         new_table = copy(self)
98 |         new_table.had_changed = True
99 |         # filter only selected columns from columns dictionary
100 |         new_table.columns = \
101 |             {col_name:col_val for (col_name, col_val) in new_table.columns.items() if col_name in columns_names}
102 |         return new_table
103 | 
104 |     def merge(self, right, how='inner', on=None, left_on=None, right_on=None):
105 |         if not isinstance(right, Table):
106 |             raise Exception("merge expects right to be of type: %s, got: %s" % (str(Table), str(type(right))))
107 |         if how not in ['left', 'inner']:
108 |             raise Exception("merge 'how' value must be in ['left', 'inner']")
109 | 
110 |         left = copy(self)
111 |         right = copy(right)
112 |         if len(set(left.columns.keys()) & set(right.columns.keys())) > 1:
113 |             raise Exception("merge got duplicates columns in both tables (except 'on' value)")
114 | 
115 |         left_on_column = None
116 |         right_on_column = None
117 |         if on and not left_on and not right_on:
118 |             left_on_column = on
119 |             right_on_column = on
120 |         elif left_on and right_on and not on:
121 |             
left_on_column = left_on
122 |             right_on_column = right_on
123 |         else:
124 |             raise Exception("got unexpected on/left_on/right_on values.")
125 | 
126 |         if not isinstance(left_on_column, str) or \
127 |             not isinstance(right_on_column, str):
128 |             raise Exception("'on/left_on/right_on' must be str")
129 | 
130 |         if left_on_column not in left.columns or right_on_column not in right.columns:
131 |             raise Exception("merge 'on/left_on/right_on' value must be in both tables as column")
132 | 
133 |         left_columns = dict(zip(left.columns.keys(), map(lambda x: left[x], left.columns.keys())))
134 |         right_columns = dict(zip(right.columns.keys(), map(lambda x: right[x], right.columns.keys())))
135 | 
136 |         # creating new table columns
137 |         if left_on_column == right_on_column:
138 |             right_columns.pop(left_on_column)  # 'on' may be None when left_on/right_on were used
139 |         new_table_columns = {**left_columns, **right_columns}
140 | 
141 |         # creating new table sql string
142 |         single_select_field_format = 't1.%s AS %s'
143 |         selected_fields_left = ', '.join(list(map(lambda x: single_select_field_format % (x, x), left_columns.keys())))
144 | 
145 |         single_select_field_format = 't2.%s AS %s'
146 |         selected_fields_right = ', '.join(list(map(lambda x: single_select_field_format % (x, x), right_columns.keys())))
147 | 
148 |         selected_fields = selected_fields_left
149 |         if selected_fields_right:
150 |             selected_fields += ', ' + selected_fields_right
151 | 
152 |         new_table_sql_string = f'SELECT {selected_fields} FROM ({left.get_sql_string()}) AS t1 {how.upper()} JOIN ({right.get_sql_string()}) AS t2 ON t1.{left_on_column}=t2.{right_on_column}'
153 | 
154 |         return create_table(table_name='Temp',
155 |                             columns=new_table_columns,
156 |                             from_sql_string=new_table_sql_string)
157 | 
158 |     def groupby(self, by):
159 |         def __get_column_key(col):
160 |             for k in self.columns.keys():
161 |                 if self.columns[k].sql_string==col.sql_string: return k
162 |             raise Exception('groupby got column that is not in table')
163 | 
164 |         groupings = None
165 |         if isinstance(by, str):
166 |             groupings = {by:self[by]}
167 |         elif isinstance(by, Column):
168 |             groupings = {__get_column_key(by): copy(by)}
169 |         elif isinstance(by, list):
170 |             groupings = {}
171 |             for b in by:
172 |                 if isinstance(b, str): groupings[b] = self[b]
173 |                 elif isinstance(b, Column): groupings[__get_column_key(b)] = copy(b)
174 |                 else: raise Exception(f'groupby got unexpected type. expected str or Column, got: {str(type(b))}')
175 |         else:
176 |             raise Exception("groupby 'by' value must be str OR list[str] OR Column OR list[Column]")
177 | 
178 |         return GroupedTable(copy(self), groupings=groupings)
179 | 
180 |     def get_sql_string(self):
181 |         if self.from_sql_string and not self.had_changed:
182 |             return self.from_sql_string
183 | 
184 |         from_field = None
185 |         selected_fields = None
186 |         if self.from_sql_string:
187 |             from_field = f'({self.from_sql_string}) AS {self.table_name}'
188 |         else:
189 |             from_field = self.table_name
190 | 
191 |         single_select_field_format = '(%s) AS %s'
192 |         selected_fields = ', '.join(list(map(lambda x: single_select_field_format % (self[x].sql_string, x), self.columns.keys())))
193 | 
194 |         single_where_field_format = '(%s)'
195 |         where_cond = ' AND '.join(list(map(lambda c: single_where_field_format % (c.sql_string), self.filters)))
196 | 
197 |         if where_cond:
198 |             return f'SELECT {selected_fields} FROM {from_field} WHERE {where_cond} '
199 |         else:
200 |             return f'SELECT {selected_fields} FROM {from_field}'
201 | 
202 | 
203 | 
204 | 
205 | def create_table_from_schema(table_name, schema) -> Table:
206 |     columns = {}
207 |     for column_name in schema.keys():
208 |         columns[column_name] = get_column_class_from_type(schema[column_name])(sql_string=column_name)
209 |     return create_table(table_name=table_name, columns=columns)
210 | 
211 | def create_table(table_name, columns=None, from_sql_string=None, filters=None, had_changed=False) -> Table:
212 |     return Table(
213 |         table_name=table_name,
214 |         columns=columns if columns is not None else {},  # avoid a shared mutable default
215 |         from_sql_string=from_sql_string,
216 |         filters=filters if filters is not None else [],  # avoid a shared mutable default
217 |         had_changed=had_changed)
--------------------------------------------------------------------------------
/pandas_to_sql_colab_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |   "nbformat": 4,
3 |   "nbformat_minor": 0,
4 |   "metadata": {
5 |     "colab": {
6 |       "name": "pandas_to_sql_colab_example.ipynb",
7 |       "provenance": [],
8 |       "collapsed_sections": [],
9 |       "authorship_tag": "ABX9TyPTBsf7gZggRD828S1nx250",
10 |       "include_colab_link": true
11 |     },
12 |     "kernelspec": {
13 |       "name": "python3",
14 |       "display_name": "Python 3"
15 |     }
16 |   },
17 |   "cells": [
18 |     {
19 |       "cell_type": "markdown",
20 |       "metadata": {
21 |         "id": "view-in-github",
22 |         "colab_type": "text"
23 |       },
24 |       "source": [
25 |         "Open In Colab"
26 |       ]
27 |     },
28 |     {
29 |       "cell_type": "code",
30 |       "metadata": {
31 |         "id": "5oTOIl8oHBhe",
32 |         "colab": {
33 |           "base_uri": "https://localhost:8080/"
34 |         },
35 |         "outputId": "1f6d0bea-3f84-43d3-d43a-ddb55596e920"
36 |       },
37 |       "source": [
38 |         "!pip install pandas-to-sql -U"
39 |       ],
40 |       "execution_count": 1,
41 |       "outputs": [
42 |         {
43 |           "output_type": "stream",
44 |           "text": [
45 |             "Requirement already up-to-date: pandas-to-sql in /usr/local/lib/python3.6/dist-packages (0.0.546)\n"
46 |           ],
47 |           "name": "stdout"
48 |         }
49 |       ]
50 |     },
51 |     {
52 |       "cell_type": "code",
53 |       "metadata": {
54 |         "id": "sGSsvHC8HaQ0"
55 |       },
56 |       "source": [
57 |         "from copy import copy\r\n",
58 |         "import sqlite3\r\n",
59 |         "import pandas as pd\r\n",
60 |         "import pandas_to_sql\r\n",
61 |         "from pandas_to_sql import conventions"
62 |       ],
63 |       "execution_count": 2,
64 |       "outputs": []
65 |     },
66 |     {
67 |       "cell_type": "code",
68 |       "metadata": {
69 |         "id": "NexlwrknMQGS",
70 |         "colab": {
71 |           "base_uri": "https://localhost:8080/",
72 |           "height": 110
73 |         },
74 |         "outputId": "3e8f6560-0d38-4ca2-b728-15c0b44dbe69"
75 |       },
76 |       "source": [
77 |         "iris = 
pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')\r\n", 78 | "table_name = 'iris'\r\n", 79 | "sql_connection = sqlite3.connect('./iris.db') #create db\r\n", 80 | "iris.to_sql(table_name, sql_connection, if_exists='replace', index=False)\r\n", 81 | "iris[:2]" 82 | ], 83 | "execution_count": 3, 84 | "outputs": [ 85 | { 86 | "output_type": "execute_result", 87 | "data": { 88 | "text/html": [ 89 | "
\n", 90 | "\n", 103 | "\n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
\n", 133 | "
" 134 | ], 135 | "text/plain": [ 136 | " sepal_length sepal_width petal_length petal_width species\n", 137 | "0 5.1 3.5 1.4 0.2 setosa\n", 138 | "1 4.9 3.0 1.4 0.2 setosa" 139 | ] 140 | }, 141 | "metadata": { 142 | "tags": [] 143 | }, 144 | "execution_count": 3 145 | } 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "metadata": { 151 | "id": "dlK3PNWBMPXc" 152 | }, 153 | "source": [ 154 | "df = pandas_to_sql.wrap_df(iris, table_name)\r\n", 155 | "pd_wrapped = pandas_to_sql.wrap_pd(pd)" 156 | ], 157 | "execution_count": 4, 158 | "outputs": [] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "metadata": { 163 | "id": "cUwu1LtnVCXY", 164 | "colab": { 165 | "base_uri": "https://localhost:8080/", 166 | "height": 53 167 | }, 168 | "outputId": "6463e2a7-f4b2-4fda-a064-658e4b52b4a9" 169 | }, 170 | "source": [ 171 | "df.get_sql_string()" 172 | ], 173 | "execution_count": 5, 174 | "outputs": [ 175 | { 176 | "output_type": "execute_result", 177 | "data": { 178 | "application/vnd.google.colaboratory.intrinsic+json": { 179 | "type": "string" 180 | }, 181 | "text/plain": [ 182 | "'SELECT (sepal_length) AS sepal_length, (sepal_width) AS sepal_width, (petal_length) AS petal_length, (petal_width) AS petal_width, (species) AS species FROM iris'" 183 | ] 184 | }, 185 | "metadata": { 186 | "tags": [] 187 | }, 188 | "execution_count": 5 189 | } 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "metadata": { 195 | "id": "0tQZAfyLMtDB", 196 | "colab": { 197 | "base_uri": "https://localhost:8080/", 198 | "height": 53 199 | }, 200 | "outputId": "b68d70a9-fd51-4fe7-ee68-4b56830762ab" 201 | }, 202 | "source": [ 203 | "species_petal_length_stats_df = df.groupby('species').agg({'petal_length':['mean','sum','count']})\r\n", 204 | "species_petal_length_stats_df = conventions.flatten_grouped_dataframe(species_petal_length_stats_df)\r\n", 205 | "\r\n", 206 | "species_petal_length_stats_df.get_sql_string()\r\n" 207 | ], 208 | "execution_count": 6, 209 | "outputs": [ 210 | { 211 | "output_type": "execute_result", 212 | "data": { 213 | "application/vnd.google.colaboratory.intrinsic+json": { 214 | "type": "string" 215 | }, 216 | "text/plain": [ 217 | "'SELECT (avg(petal_length)) AS petal_length_mean, (sum(petal_length)) AS petal_length_sum, (count(petal_length)) AS petal_length_count, (species) AS species FROM iris GROUP BY species'" 218 | ] 219 | }, 220 | "metadata": { 221 | "tags": [] 222 | }, 223 | "execution_count": 6 224 | } 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "metadata": { 230 | "id": "p9YLqZ7EMs9V", 231 | "colab": { 232 | "base_uri": "https://localhost:8080/", 233 | "height": 53 234 | }, 235 | "outputId": "fcf3e396-007c-48f4-815b-e611b5628da8" 236 | }, 237 | "source": [ 238 | "df[(df.petal_length>1.4) & (df.petal_width<.2)].get_sql_string()" 239 | ], 240 | "execution_count": 7, 241 | "outputs": [ 242 | { 243 | "output_type": "execute_result", 244 | "data": { 245 | "application/vnd.google.colaboratory.intrinsic+json": { 246 | "type": "string" 247 | }, 248 | "text/plain": [ 249 | "'SELECT (sepal_length) AS sepal_length, (sepal_width) AS sepal_width, (petal_length) AS petal_length, (petal_width) AS petal_width, (species) AS species FROM iris WHERE (((petal_length > 1.4) AND (petal_width < 0.2))) '" 250 | ] 251 | }, 252 | "metadata": { 253 | "tags": [] 254 | }, 255 | "execution_count": 7 256 | } 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "metadata": { 262 | "id": "9DfYy1SeMs40", 263 | "colab": { 264 | "base_uri": "https://localhost:8080/", 265 | "height": 141 266 | }, 
267 | "outputId": "821d7ea9-6f30-4fea-f1d3-4fcad1e23613" 268 | }, 269 | "source": [ 270 | "df_ = copy(df)\r\n", 271 | "df_['sepal_width_rounded'] = df_.sepal_width.round()\r\n", 272 | "df_1 = df_[df_.species=='setosa'].reset_index(drop=True)\r\n", 273 | "df_2 = df_[df_.species=='versicolor'].reset_index(drop=True)\r\n", 274 | "pd_wrapped.concat([df_1, df_2]).get_sql_string()" 275 | ], 276 | "execution_count": 8, 277 | "outputs": [ 278 | { 279 | "output_type": "execute_result", 280 | "data": { 281 | "application/vnd.google.colaboratory.intrinsic+json": { 282 | "type": "string" 283 | }, 284 | "text/plain": [ 285 | "\"SELECT (sepal_length) AS sepal_length, (sepal_width) AS sepal_width, (petal_length) AS petal_length, (petal_width) AS petal_width, (species) AS species, ((CASE WHEN ((ABS(sepal_width) - ROUND(ABS(sepal_width)-0.5))==.5) AND ((CAST(sepal_width AS INT))%2 == 0) THEN (CASE WHEN sepal_width>0 THEN ROUND(sepal_width-0.001) ELSE ROUND(sepal_width+0.001) END) ELSE (ROUND(sepal_width)) END)) AS sepal_width_rounded FROM iris WHERE ((species = 'setosa')) UNION ALL SELECT (sepal_length) AS sepal_length, (sepal_width) AS sepal_width, (petal_length) AS petal_length, (petal_width) AS petal_width, (species) AS species, ((CASE WHEN ((ABS(sepal_width) - ROUND(ABS(sepal_width)-0.5))==.5) AND ((CAST(sepal_width AS INT))%2 == 0) THEN (CASE WHEN sepal_width>0 THEN ROUND(sepal_width-0.001) ELSE ROUND(sepal_width+0.001) END) ELSE (ROUND(sepal_width)) END)) AS sepal_width_rounded FROM iris WHERE ((species = 'versicolor')) \"" 286 | ] 287 | }, 288 | "metadata": { 289 | "tags": [] 290 | }, 291 | "execution_count": 8 292 | } 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "metadata": { 298 | "id": "N5tjEM2tMszV" 299 | }, 300 | "source": [ 301 | "df_ = copy(df)\r\n", 302 | "df_['sepal_width_rounded'] = df_.sepal_width.round()\r\n", 303 | "df_1 = df_[df_.species=='setosa'].reset_index(drop=True)\r\n", 304 | "df_2 = df_[df_.species=='versicolor'].reset_index(drop=True)\r\n", 305 | "\r\n", 306 | "some_df = pd_wrapped.concat([df_1, df_2])\r\n", 307 | "\r\n", 308 | "sql_string = some_df.get_sql_string()\r\n", 309 | "\r\n", 310 | "df_from_sql_database = pd.read_sql_query(sql_string, sql_connection)\r\n", 311 | "df_pandas = some_df.df_pandas\r\n", 312 | "\r\n", 313 | "from pandas_to_sql.testing.utils.asserters import assert_dataframes_equals\r\n", 314 | "assert_dataframes_equals(df_pandas, df_from_sql_database)" 315 | ], 316 | "execution_count": 9, 317 | "outputs": [] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "metadata": { 322 | "id": "Xdq5qc-ZMssZ" 323 | }, 324 | "source": [ 325 | "" 326 | ], 327 | "execution_count": 9, 328 | "outputs": [] 329 | } 330 | ] 331 | } --------------------------------------------------------------------------------
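One operation the example notebook does not show is merge; here is a short sketch of the JOIN it generates (illustrative only; the tables and column names are invented for this example):

import pandas as pd
import pandas_to_sql

users = pandas_to_sql.wrap_df(pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']}), 'users')
orders = pandas_to_sql.wrap_df(pd.DataFrame({'id': [1, 1, 2], 'total': [5.0, 7.0, 9.0]}), 'orders')

# Both sides are wrapped as subqueries and joined on the shared 'on' column.
merged = users.merge(orders, how='inner', on='id')
print(merged.get_sql_string())
# Prints something like:
# SELECT t1.id AS id, t1.name AS name, t2.total AS total
# FROM (SELECT ... FROM users) AS t1 INNER JOIN (SELECT ... FROM orders) AS t2 ON t1.id=t2.id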