├── test
│   ├── __init__.py
│   ├── test_group.py
│   ├── test_base.py
│   ├── test_summarize.py
│   ├── test_transform.py
│   ├── test_vector.py
│   ├── test_join.py
│   ├── test_subset.py
│   ├── test_reshape.py
│   ├── test_window_functions.py
│   ├── test_select.py
│   └── test_summary_functions.py
├── dfply.egg-info
│   ├── dependency_links.txt
│   ├── requires.txt
│   ├── top_level.txt
│   ├── PKG-INFO
│   └── SOURCES.txt
├── requirements.txt
├── MANIFEST.in
├── .travis.yml~
├── dfply
│   ├── data
│   │   └── __init__.py
│   ├── group.py
│   ├── __init__.py
│   ├── summarize.py
│   ├── subset.py
│   ├── transform.py
│   ├── summary_functions.py
│   ├── select.py
│   ├── window_functions.py
│   ├── set_ops.py
│   ├── vector.py
│   ├── join.py
│   ├── base.py
│   └── reshape.py
├── .gitignore
├── setup.py
├── .travis.yml
├── RELEASES.txt
└── examples
    ├── basics-extending-functionality.ipynb
    └── .ipynb_checkpoints
        └── basics-extending-functionality-checkpoint.ipynb
/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dfply.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dfply.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | -------------------------------------------------------------------------------- /dfply.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | dfply 2 | test 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.11.1 2 | pandas>=0.18.1 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the license file 2 | include LICENSE.md 3 | -------------------------------------------------------------------------------- /.travis.yml~: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.6 4 | - 2.7 5 | install: 6 | - pip install .
7 | - pip install -r requirements.txt 8 | script: python -m pytest test/ 9 | -------------------------------------------------------------------------------- /dfply/data/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | root = os.path.abspath(os.path.dirname(__file__)) 5 | diamonds = pd.read_csv(os.path.join(root, "diamonds.csv")) 6 | -------------------------------------------------------------------------------- /dfply/group.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | 3 | 4 | @pipe 5 | @symbolic_evaluation(eval_as_label=True) 6 | def group_by(df, *args): 7 | df._grouped_by = list(args) 8 | return df 9 | 10 | 11 | @pipe 12 | def ungroup(df): 13 | df._grouped_by = None 14 | return df 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Unit test / coverage reports 7 | .cache 8 | 9 | # Annoying Mac File 10 | .DS_Store 11 | 12 | # workbook test files 13 | test/feature_workbook.ipynb 14 | test/.ipynb_checkpoints/* 15 | test/worksheet.py 16 | 17 | # distribution 18 | dist 19 | build 20 | 21 | # egg_info 22 | dfply.egg-info 23 | -------------------------------------------------------------------------------- /test/test_group.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | ##============================================================================== 6 | ## grouping test functions 7 | ##============================================================================== 8 | 9 | 10 | def test_group_attributes(): 11 | d = diamonds >> group_by('cut') 12 | assert hasattr(d, '_grouped_by') 13 | assert d._grouped_by == ['cut',] 14 | -------------------------------------------------------------------------------- /dfply.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: dfply 3 | Version: 0.3.0 4 | Summary: dplyr-style piping operations for pandas dataframes 5 | Home-page: https://github.com/kieferk/dfply 6 | Author: Kiefer Katovich 7 | Author-email: kiefer.katovich@gmail.com 8 | License: GNU General Public License v3.0 9 | Description: See https://github.com/kieferk/dfply/blob/master/README.md for details.
10 | Keywords: pandas dplyr 11 | Platform: UNKNOWN 12 | -------------------------------------------------------------------------------- /dfply/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .group import * 3 | from .join import * 4 | from .reshape import * 5 | from .select import * 6 | from .set_ops import * 7 | from .subset import * 8 | from .summarize import * 9 | from .transform import * 10 | from .data import diamonds 11 | from .summary_functions import * 12 | from .window_functions import * 13 | from .vector import * 14 | 15 | for verb in dir(): 16 | if 'ize' in verb: 17 | exec(verb.replace('ize', 'ise') + '=' + verb) 18 | -------------------------------------------------------------------------------- /test/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | 6 | ##============================================================================== 7 | ## pipe tests 8 | ##============================================================================== 9 | 10 | @dfpipe 11 | def blank_function(df): 12 | return df 13 | 14 | 15 | def test_pipe(): 16 | d = diamonds >> blank_function() 17 | assert diamonds.equals(d) 18 | d = diamonds >> blank_function() >> blank_function() 19 | assert diamonds.equals(d) 20 | 21 | 22 | def test_inplace_pipe(): 23 | df = diamonds[['price','carat']].head(5) 24 | d = diamonds.copy() 25 | d >>= select(X.price, X.carat) >> head(5) 26 | print(df) 27 | print(d) 28 | assert df.equals(d) 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name = 'dfply', 5 | version = '0.3.3', 6 | author = 'Kiefer Katovich', 7 | author_email = 'kiefer.katovich@gmail.com', 8 | keywords = 'pandas dplyr', 9 | packages = find_packages(), 10 | include_package_data=True, 11 | package_data={'dfply': ['data/diamonds.csv']}, 12 | package_dir={'dfply':'dfply'}, 13 | install_requires=['numpy', 'pandas'], 14 | description = 'dplyr-style piping operations for pandas dataframes', 15 | long_description = 'See https://github.com/kieferk/dfply/blob/master/README.md for details.', 16 | license = 'GNU General Public License v3.0', 17 | url = 'https://github.com/kieferk/dfply', 18 | test_suite='test', 19 | ) 20 | -------------------------------------------------------------------------------- /dfply/summarize.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | 3 | 4 | @dfpipe 5 | def summarize(df, **kwargs): 6 | return pd.DataFrame({k: [v] for k, v in kwargs.items()}) 7 | 8 | 9 | @dfpipe 10 | def summarize_each(df, functions, *args): 11 | columns, values = [], [] 12 | for arg in args: 13 | if isinstance(arg, pd.Series): 14 | varname = arg.name 15 | col = arg 16 | elif isinstance(arg, str): 17 | varname = arg 18 | col = df[varname] 19 | elif isinstance(arg, int): 20 | varname = df.columns[arg] 21 | col = df.iloc[:, arg] 22 | 23 | for f in functions: 24 | fname = f.__name__ 25 | columns.append('_'.join([varname, fname])) 26 | values.append(f(col)) 27 | 28 | return pd.DataFrame([values], columns=columns) 29 | -------------------------------------------------------------------------------- /dfply.egg-info/SOURCES.txt:
-------------------------------------------------------------------------------- 1 | LICENSE.md 2 | MANIFEST.in 3 | setup.py 4 | dfply/__init__.py 5 | dfply/base.py 6 | dfply/group.py 7 | dfply/join.py 8 | dfply/reshape.py 9 | dfply/select.py 10 | dfply/set_ops.py 11 | dfply/subset.py 12 | dfply/summarize.py 13 | dfply/summary_functions.py 14 | dfply/transform.py 15 | dfply/vector.py 16 | dfply/window_functions.py 17 | dfply.egg-info/PKG-INFO 18 | dfply.egg-info/SOURCES.txt 19 | dfply.egg-info/dependency_links.txt 20 | dfply.egg-info/requires.txt 21 | dfply.egg-info/top_level.txt 22 | dfply/data/__init__.py 23 | test/__init__.py 24 | test/test_base.py 25 | test/test_group.py 26 | test/test_join.py 27 | test/test_reshape.py 28 | test/test_select.py 29 | test/test_subset.py 30 | test/test_summarize.py 31 | test/test_summary_functions.py 32 | test/test_transform.py 33 | test/test_vector.py 34 | test/test_window_functions.py -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.6' 4 | - '3.7' 5 | install: 6 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 7 | - bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda 8 | - export PATH="$HOME/miniconda/bin:$PATH" 9 | - hash -r 10 | - conda config --set always_yes yes --set changeps1 no 11 | - conda update -q conda 12 | - conda info -a 13 | - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pip pytest numpy 14 | pandas>=0.18.1 15 | - source activate test-environment 16 | - python setup.py install 17 | script: python -m pytest test/ 18 | deploy: 19 | provider: pypi 20 | user: TokenNobody 21 | password: 22 | secure: XazMtrRpb6i/jtdeBIDV5mWZNQr2dPlmspgF/qqt9AbZRCu/Y28DaI/12KGSFgVJc2lzREp+cxKNq60bDT8mB3t0+YtYeHsmQXawInyXAFACfmRI5/nigiYLMhQ1OV/RHtXQcXeHJF1MbKeF2WjWdBKh9m9cBi5NVxGot/knGOALkwyiPG4Ykf5fVD4bCeJTkdrBav/XLYqYPntpw6GT0PA8yvt3E1lQfL+uTV8+ZcwsqXh8ebWNI0aU86lurE6b1cJn6xpTZYzSqiJqHuikCZC7alqd311kpm/sKuHMb2V9tKiHiJFN7fcKfdaVuAjQE22Tc7R7uC2ph9tBvL8xHnzi48Wj9Ri5QYLATN2u28d3rkCS+zN+tC3MT9bjDcyuqPdbx3Sx5bFJC6P0HFcof5lpnan80TW4VQSM2GV8rqwPgm0kLi0k/DG5yvRWecNdlvvCDZ5e6M9eiOcer9guimDYITtQCfuiUZLUbzgw+u7QE3jY9Exnv7Ekdi150Zd+ubPS+yU1ZG5tgB2ijw7n2bTxEy77d6Zm0quDnQ6gVBi7STp2si3397TTQH/nV+eaX51VOxTufZXDW0eiaVoRhH32xUllhFeAzJSezAVJ0WuLEuSLXGkxep7VNofK0Kyjxg4S2ED41lV7LtucdQe7L/LlGTfmYCgzSDaDW98CqIM= 23 | on: 24 | tags: true 25 | distributions: sdist bdist_wheel 26 | repo: kieferk/dfply 27 | branch: master 28 | -------------------------------------------------------------------------------- /dfply/subset.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | import warnings 3 | import numpy as np 4 | 5 | 6 | # ------------------------------------------------------------------------------ 7 | # `head` and `tail` 8 | # ------------------------------------------------------------------------------ 9 | 10 | @dfpipe 11 | def head(df, n=5): 12 | return df.head(n) 13 | 14 | 15 | @dfpipe 16 | def tail(df, n=5): 17 | return df.tail(n) 18 | 19 | 20 | # ------------------------------------------------------------------------------ 21 | # Sampling 22 | # ------------------------------------------------------------------------------ 23 | 24 | 25 | @dfpipe 26 | def sample(df, *args, **kwargs): 27 | return df.sample(*args, **kwargs) 28 | 29 | 30 | @pipe 31 | @group_delegation 32 | 
@symbolic_evaluation(eval_as_label=['*']) 33 | def distinct(df, *args, **kwargs): 34 | if not args: 35 | return df.drop_duplicates(**kwargs) 36 | return df.drop_duplicates(list(args), **kwargs) 37 | 38 | 39 | @dfpipe 40 | def row_slice(df, indices): 41 | if isinstance(indices, (tuple, list)): 42 | indices = np.array(indices) 43 | if isinstance(indices, int): 44 | indices = np.array([indices]) 45 | if isinstance(indices, pd.Series): 46 | indices = indices.values 47 | 48 | if indices.dtype == bool: 49 | return df.loc[indices, :] 50 | else: 51 | return df.iloc[indices, :] 52 | 53 | 54 | # ------------------------------------------------------------------------------ 55 | # Filtering/masking 56 | # ------------------------------------------------------------------------------ 57 | 58 | @dfpipe 59 | def mask(df, *args): 60 | mask = pd.Series(np.ones(df.shape[0], dtype=bool)) 61 | for arg in args: 62 | if arg.dtype != bool: 63 | raise Exception("Arguments must be boolean.") 64 | mask = mask & arg.reset_index(drop=True) 65 | return df[mask.values] 66 | 67 | 68 | filter_by = mask # alias for mask() 69 | 70 | 71 | @dfpipe 72 | def top_n(df, n=None, ascending=True, col=None): 73 | if not n: 74 | raise ValueError('n must be specified') 75 | if not isinstance(col, pd.Series): 76 | col = df.columns[-1] 77 | else: 78 | col = col._name 79 | index = df[[col]].copy() 80 | index['ranks'] = index[col].rank(ascending=ascending) 81 | index = index[index['ranks'] >= index['ranks'].nlargest(n).min()] 82 | return df.reindex(index.index) 83 | 84 | 85 | @dfpipe 86 | def pull(df, column=-1): 87 | return df.iloc[:, column] if isinstance(column, int) else df.loc[:, column] 88 | -------------------------------------------------------------------------------- /test/test_summarize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | 6 | ##============================================================================== 7 | ## summarization test functions 8 | ##============================================================================== 9 | 10 | def test_summarize(): 11 | p = pd.DataFrame({ 12 | 'price_mean':[diamonds.price.mean()], 13 | 'price_std':[diamonds.price.std()] 14 | }) 15 | assert p.equals(diamonds >> summarize(price_mean=X.price.mean(), 16 | price_std=X.price.std())) 17 | 18 | pcut = pd.DataFrame({ 19 | 'cut':['Fair','Good','Ideal','Premium','Very Good'] 20 | }) 21 | pcut['price_mean'] = [diamonds[diamonds.cut == c].price.mean() for c in pcut.cut.values] 22 | pcut['price_std'] = [diamonds[diamonds.cut == c].price.std() for c in pcut.cut.values] 23 | assert pcut.equals(diamonds >> group_by('cut') >> 24 | summarize(price_mean=X.price.mean(), price_std=X.price.std())) 25 | 26 | 27 | def test_summarize_each(): 28 | to_match = pd.DataFrame({ 29 | 'price_mean':[np.mean(diamonds.price)], 30 | 'price_var':[np.var(diamonds.price)], 31 | 'depth_mean':[np.mean(diamonds.depth)], 32 | 'depth_var':[np.var(diamonds.depth)] 33 | }) 34 | to_match = to_match[['price_mean','price_var','depth_mean','depth_var']] 35 | 36 | test1 = diamonds >> summarize_each([np.mean, np.var], X.price, 4) 37 | test2 = diamonds >> summarize_each([np.mean, np.var], X.price, 'depth') 38 | assert to_match.equals(test1) 39 | assert to_match.equals(test2) 40 | 41 | group = pd.DataFrame({ 42 | 'cut':['Fair','Good','Ideal','Premium','Very Good'] 43 | }) 44 | group['price_mean'] = [np.mean(diamonds[diamonds.cut == c].price) for c in group.cut.values] 45 | group['price_var'] = [np.var(diamonds[diamonds.cut == c].price) for c
in group.cut.values] 46 | group['depth_mean'] = [np.mean(diamonds[diamonds.cut == c].depth) for c in group.cut.values] 47 | group['depth_var'] = [np.var(diamonds[diamonds.cut == c].depth) for c in group.cut.values] 48 | 49 | group = group[['cut','price_mean','price_var','depth_mean','depth_var']] 50 | 51 | test1 = (diamonds >> group_by(X.cut) >> 52 | summarize_each([np.mean, np.var], X.price, 4)) 53 | test2 = (diamonds >> group_by('cut') >> 54 | summarize_each([np.mean, np.var], X.price, 'depth')) 55 | 56 | assert group.equals(test1) 57 | assert group.equals(test2) 58 | -------------------------------------------------------------------------------- /test/test_transform.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | 6 | ##============================================================================== 7 | ## transform test functions 8 | ##============================================================================== 9 | 10 | def test_mutate(): 11 | df = diamonds.copy() 12 | df['testcol'] = 1 13 | assert df.equals(diamonds >> mutate(testcol=1)) 14 | df['testcol'] = df['x'] 15 | assert df.equals(diamonds >> mutate(testcol=X.x)) 16 | df['testcol'] = df['x'] * df['y'] 17 | assert df.equals(diamonds >> mutate(testcol=X.x * X.y)) 18 | df['testcol'] = df['x'].mean() 19 | assert df.equals(diamonds >> mutate(testcol=np.mean(X.x))) 20 | 21 | 22 | def group_mutate_helper(df): 23 | df['testcol'] = df['x']*df.shape[0] 24 | return df 25 | 26 | 27 | def test_group_mutate(): 28 | df = diamonds.copy() 29 | df = df.groupby('cut').apply(group_mutate_helper) 30 | d = diamonds >> group_by('cut') >> mutate(testcol=X.x*X.shape[0]) >> ungroup() 31 | assert df.equals(d.sort_index()) 32 | 33 | 34 | def test_transmute(): 35 | df = diamonds.copy() 36 | df['testcol'] = df['x'] * df['y'] 37 | df = df[['testcol']] 38 | assert df.equals(diamonds >> transmute(testcol=X.x * X.y)) 39 | 40 | 41 | def test_group_transmute(): 42 | df = diamonds.copy() 43 | df = df.groupby('cut').apply(group_mutate_helper).reset_index(drop=True) 44 | df = df[['cut','testcol']] 45 | d = diamonds >> group_by('cut') >> transmute(testcol=X.x*X.shape[0]) 46 | print(d.head()) 47 | print(df.head()) 48 | assert df.equals(d.sort_index()) 49 | 50 | 51 | def test_mutate_if(): 52 | df = diamonds.copy() 53 | for col in df: 54 | try: 55 | if max(df[col]) < 10: 56 | df[col] *= 2 57 | except: 58 | pass 59 | assert df.equals(diamonds >> mutate_if(lambda col: max(col) < 10, lambda row: row * 2)) 60 | df = diamonds.copy() 61 | for col in df: 62 | try: 63 | if any(df[col].str.contains('.')): 64 | df[col] = df[col].str.lower() 65 | except: 66 | pass 67 | assert df.equals(diamonds >> mutate_if(lambda col: any(col.str.contains('.')), lambda row: row.str.lower())) 68 | df = diamonds.copy() 69 | for col in df: 70 | try: 71 | if min(df[col]) < 1 and mean(df[col]) < 4: 72 | df[col] *= -1 73 | except: 74 | pass 75 | assert df.equals(diamonds >> mutate_if(lambda col: min(col) < 1 and mean(col) < 4, lambda row: -row)) 76 | -------------------------------------------------------------------------------- /dfply/transform.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | 3 | 4 | @dfpipe 5 | def mutate(df, **kwargs): 6 | """ 7 | Creates new variables (columns) in the DataFrame specified by keyword 8 | argument pairs, where the key is the column name and the value is the 9 | new column value(s). 
10 | 11 | Args: 12 | df (pandas.DataFrame): data passed in through the pipe. 13 | 14 | Kwargs: 15 | **kwargs: keys are the names of the new columns, values indicate 16 | what the new column values will be. 17 | 18 | Example: 19 | diamonds >> mutate(x_plus_y=X.x + X.y) >> select_from('x') >> head(3) 20 | 21 | x y z x_plus_y 22 | 0 3.95 3.98 2.43 7.93 23 | 1 3.89 3.84 2.31 7.73 24 | 2 4.05 4.07 2.31 8.12 25 | """ 26 | 27 | return df.assign(**kwargs) 28 | 29 | 30 | @dfpipe 31 | def mutate_if(df, predicate, fun): 32 | """ 33 | Modifies columns in place if the specified predicate is true. 34 | Args: 35 | df (pandas.DataFrame): data passed in through the pipe. 36 | predicate: a function applied to columns that returns a boolean value 37 | fun: a function that will be applied to columns where predicate returns True 38 | 39 | Example: 40 | diamonds >> mutate_if(lambda col: min(col) < 1 and mean(col) < 4, lambda row: 2 * row) >> head(3) 41 | carat cut color clarity depth table price x y z 42 | 0 0.46 Ideal E SI2 61.5 55.0 326 3.95 3.98 4.86 43 | 1 0.42 Premium E SI1 59.8 61.0 326 3.89 3.84 4.62 44 | 2 0.46 Good E VS1 56.9 65.0 327 4.05 4.07 4.62 45 | (columns 'carat' and 'z', both having a min < 1 and mean < 4, are doubled, while the 46 | other columns remain as they were) 47 | """ 48 | cols = list() 49 | for col in df: 50 | try: 51 | if predicate(df[col]): 52 | cols.append(col) 53 | except: 54 | pass 55 | df[cols] = df[cols].apply(fun) 56 | return df 57 | 58 | # df2 = df.copy() 59 | # df2[cols] = df2[cols].apply(fun) 60 | # return df2 61 | 62 | 63 | @dfpipe 64 | def transmute(df, *keep_columns, **kwargs): 65 | """ 66 | Creates columns and then returns those new columns and optionally specified 67 | original columns from the DataFrame. 68 | 69 | This works like `mutate`, but is designed to discard the original columns used 70 | to create the new ones. 71 | 72 | Args: 73 | *keep_columns: Column labels to keep. Can be string, symbolic, or 74 | integer position. 75 | 76 | Kwargs: 77 | **kwargs: keys are the names of the new columns, values indicate 78 | what the new column values will be.
79 | 80 | Example: 81 | diamonds >> transmute(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> head(3) 82 | 83 | y_div_z x_plus_y 84 | 0 1.637860 7.93 85 | 1 1.662338 7.73 86 | 2 1.761905 8.12 87 | """ 88 | 89 | keep_cols = [] 90 | for col in flatten(keep_columns): 91 | try: 92 | keep_cols.append(col.name) 93 | except: 94 | if isinstance(col, str): 95 | keep_cols.append(col) 96 | elif isinstance(col, int): 97 | keep_cols.append(df.columns[col]) 98 | 99 | df = df.assign(**kwargs) 100 | columns = [k for k in kwargs.keys()] + list(keep_cols) 101 | return df[columns] 102 | -------------------------------------------------------------------------------- /test/test_vector.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dfply import * 3 | 4 | ##============================================================================== 5 | ## desc, order by tests 6 | ##============================================================================== 7 | 8 | def test_desc(): 9 | 10 | df = diamonds >> select(X.cut, X.x) >> head(10) 11 | t = df >> summarize(last=nth(X.x, -1, order_by=[desc(X.cut), desc(X.x)])) 12 | 13 | series_num = pd.Series([4,1,3,2]) 14 | series_bool = pd.Series([True,False,True,False]) 15 | series_str = pd.Series(['d','a','c','b']) 16 | 17 | num_truth = series_num.rank(method='min',ascending=False) 18 | bool_truth = series_bool.rank(method='min',ascending=False) 19 | str_truth = series_str.rank(method='min',ascending=False) 20 | 21 | assert desc(series_num).equals(num_truth) 22 | assert desc(series_bool).equals(bool_truth) 23 | assert desc(series_str).equals(str_truth) 24 | 25 | 26 | def test_order_series_by(): 27 | series = pd.Series([1,2,3,4,5,6,7,8]) 28 | order1 = pd.Series(['A','B','A','B','A','B','A','B']) 29 | ordered1 = order_series_by(series, order1).reset_index(drop=True) 30 | true1 = pd.Series([1,3,5,7,2,4,6,8]) 31 | assert ordered1.equals(true1) 32 | 33 | order2 = pd.Series([2,2,2,2,1,1,1,1]) 34 | ordered2 = order_series_by(series, [order1, order2]).reset_index(drop=True) 35 | true2 = pd.Series([5,7,1,3,6,8,2,4]) 36 | assert ordered2.equals(true2) 37 | 38 | 39 | ##============================================================================== 40 | ## coalesce test 41 | ##============================================================================== 42 | 43 | def test_coalesce(): 44 | df = pd.DataFrame({ 45 | 'a':[1,np.nan,np.nan,np.nan,np.nan], 46 | 'b':[2,3,np.nan,np.nan,np.nan], 47 | 'c':[np.nan,np.nan,4,5,np.nan], 48 | 'd':[6,7,8,9,np.nan] 49 | }) 50 | truth_df = df.assign(coal=[1,3,4,5,np.nan]) 51 | d = df >> mutate(coal=coalesce(X.a, X.b, X.c, X.d)) 52 | assert truth_df.equals(d) 53 | 54 | 55 | ##============================================================================== 56 | ## case_when test 57 | ##============================================================================== 58 | 59 | def test_case_when(): 60 | df = pd.DataFrame({ 61 | 'num':np.arange(31) 62 | }) 63 | df_truth = df.assign(strnum=['fizzbuzz' if (i % 15 == 0) else 64 | 'fizz' if (i % 3 == 0) else 65 | 'buzz' if (i % 5 == 0) else 66 | str(i) for i in np.arange(31)]) 67 | d = df >> mutate(strnum=case_when([X.num % 15 == 0, 'fizzbuzz'], 68 | [X.num % 3 == 0, 'fizz'], 69 | [X.num % 5 == 0, 'buzz'], 70 | [True, X.num.astype(str)])) 71 | print(df_truth) 72 | print(d) 73 | assert df_truth.equals(d) 74 | 75 | 76 | ##============================================================================== 77 | ## if_else test 78 | 
##============================================================================== 79 | 80 | def test_if_else(): 81 | df = pd.DataFrame({ 82 | 'a':[1,2,3,4,5,6,7,8,9] 83 | }) 84 | b_truth = ['odd','even','odd','even','odd','even','odd','even','odd'] 85 | d = df >> mutate(b=if_else(X.a % 2 == 0, 'even', 'odd')) 86 | assert d.equals(df.assign(b=b_truth)) 87 | 88 | df = pd.DataFrame({ 89 | 'a':[0,0,0,1,1,1,2,2,2] 90 | }) 91 | b_truth = [5,5,5,5,5,5,9,9,9] 92 | d = df >> mutate(b=if_else(X.a < 2, [5,5,5,5,5,5,5,5,5], [9,9,9,9,9,9,9,9,9])) 93 | assert d.equals(df.assign(b=b_truth)) 94 | 95 | 96 | ##============================================================================== 97 | ## na_if test 98 | ##============================================================================== 99 | 100 | def test_na_if(): 101 | df = pd.DataFrame({ 102 | 'a':[1,2,3,4,5] 103 | }) 104 | d = df >> mutate(b=na_if(X.a, 3), c=na_if(X.a,1,2,3)) 105 | d = d[['a','b','c']] 106 | df_true = df.assign(b=[1,2,np.nan,4,5], c=[np.nan,np.nan,np.nan,4,5]) 107 | assert df_true.equals(d) 108 | -------------------------------------------------------------------------------- /test/test_join.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | ##============================================================================== 6 | ## join test functions 7 | ##============================================================================== 8 | 9 | @pytest.fixture(scope='module') 10 | def dfA(): 11 | a = pd.DataFrame({ 12 | 'x1':['A','B','C'], 13 | 'x2':[1,2,3] 14 | }) 15 | return a 16 | 17 | 18 | @pytest.fixture(scope='module') 19 | def dfB(): 20 | b = pd.DataFrame({ 21 | 'x1':['A','B','D'], 22 | 'x3':[True,False,True] 23 | }) 24 | return b 25 | 26 | @pytest.fixture(scope='module') 27 | def dfC(): 28 | c = pd.DataFrame({ 29 | 'x1':['B','C','D'], 30 | 'x2':[2,3,4] 31 | }) 32 | return c 33 | 34 | 35 | def test_inner_join(dfA, dfB): 36 | ab = pd.DataFrame({ 37 | 'x1':['A','B'], 38 | 'x2':[1,2], 39 | 'x3':[True, False] 40 | }) 41 | 42 | c = dfA >> inner_join(dfB, by='x1') 43 | assert c.equals(ab) 44 | 45 | 46 | def test_outer_join(dfA, dfB): 47 | ab = pd.DataFrame({ 48 | 'x1':['A','B','C','D'], 49 | 'x2':[1,2,3,np.nan], 50 | 'x3':[True, False,np.nan,True] 51 | }) 52 | 53 | c = dfA >> outer_join(dfB, by='x1') 54 | assert c.equals(ab) 55 | c = dfA >> full_join(dfB, by='x1') 56 | assert c.equals(ab) 57 | 58 | 59 | def test_left_join(dfA, dfB): 60 | ab = pd.DataFrame({ 61 | 'x1':['A','B','C'], 62 | 'x2':[1,2,3], 63 | 'x3':[True, False, np.nan] 64 | }) 65 | 66 | c = dfA >> left_join(dfB, by='x1') 67 | assert c.equals(ab) 68 | 69 | 70 | def test_right_join(dfA, dfB): 71 | ab = pd.DataFrame({ 72 | 'x1':['A','B','D'], 73 | 'x2':[1,2,np.nan], 74 | 'x3':[True, False, True] 75 | }) 76 | 77 | c = dfA >> right_join(dfB, by='x1') 78 | assert c.equals(ab) 79 | 80 | def test_semi_join(dfA, dfB): 81 | ab = pd.DataFrame({ 82 | 'x1':['A', 'B'], 83 | 'x2':[1, 2] 84 | }) 85 | 86 | c = dfA >> semi_join(dfB, by='x1') 87 | assert c.equals(ab) 88 | 89 | 90 | def test_anti_join(dfA, dfB): 91 | ab = pd.DataFrame({ 92 | 'x1':['C'], 93 | 'x2':[3] 94 | }, index=[2]) 95 | 96 | c = dfA >> anti_join(dfB, by='x1') 97 | assert c.equals(ab) 98 | 99 | 100 | ##============================================================================== 101 | ## set operation (row join) test functions 102 | ##============================================================================== 103 | 104 | def test_union(dfA,
dfC): 105 | ac = pd.DataFrame({ 106 | 'x1': ['A', 'B', 'C', 'D'], 107 | 'x2': [1, 2, 3, 4] 108 | }, index=[0, 1, 2, 2]) 109 | 110 | d = dfA >> union(dfC) 111 | assert d.equals(ac) 112 | 113 | 114 | def test_intersect(dfA, dfC): 115 | ac = pd.DataFrame({ 116 | 'x1': ['B', 'C'], 117 | 'x2': [2, 3] 118 | }) 119 | 120 | d = dfA >> intersect(dfC) 121 | assert d.equals(ac) 122 | 123 | 124 | def test_set_diff(dfA, dfC): 125 | ac = pd.DataFrame({ 126 | 'x1': ['A'], 127 | 'x2': [1] 128 | }) 129 | 130 | d = dfA >> set_diff(dfC) 131 | assert d.equals(ac) 132 | 133 | 134 | ##============================================================================== 135 | ## bind rows, cols 136 | ##============================================================================== 137 | 138 | def test_bind_rows(dfA, dfB): 139 | inner = pd.DataFrame({ 140 | 'x1':['A','B','C','A','B','D'] 141 | }) 142 | outer = pd.DataFrame({ 143 | 'x1':['A','B','C','A','B','D'], 144 | 'x2':[1,2,3,np.nan,np.nan,np.nan], 145 | 'x3':[np.nan,np.nan,np.nan,True,False,True] 146 | }) 147 | ab_inner = dfA >> bind_rows(dfB, join='inner') 148 | ab_outer = dfA >> bind_rows(dfB, join='outer') 149 | assert inner.equals(ab_inner.reset_index(drop=True)) 150 | assert outer.equals(ab_outer.reset_index(drop=True)) 151 | 152 | 153 | def test_bind_cols(dfA, dfB): 154 | dfB.columns = ['x3','x4'] 155 | df = pd.DataFrame({ 156 | 'x1':['A','B','C'], 157 | 'x2':[1,2,3], 158 | 'x3':['A','B','D'], 159 | 'x4':[True,False,True] 160 | }) 161 | d = dfA >> bind_cols(dfB) 162 | assert df.equals(d) 163 | -------------------------------------------------------------------------------- /test/test_subset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | 6 | ##============================================================================== 7 | ## subset test functions 8 | ##============================================================================== 9 | 10 | def test_head(): 11 | df = diamonds.head(2) 12 | d = diamonds >> head(2) 13 | assert df.equals(d) 14 | 15 | 16 | def test_grouped_head(): 17 | df = diamonds.groupby(['cut','color']).apply(lambda x: x.head(2)).reset_index(drop=True) 18 | d = diamonds >> group_by('cut','color') >> head(2) 19 | assert df.equals(d.reset_index(drop=True)) 20 | 21 | 22 | def test_tail(): 23 | df = diamonds.tail(2) 24 | d = diamonds >> tail(2) 25 | assert df.equals(d) 26 | 27 | 28 | def test_grouped_tail(): 29 | df = diamonds.groupby(['cut','color']).apply(lambda x: x.tail(2)).reset_index(drop=True) 30 | d = diamonds >> group_by('cut','color') >> tail(2) 31 | assert df.equals(d.reset_index(drop=True)) 32 | 33 | 34 | def test_distinct(): 35 | d = diamonds >> distinct('depth') 36 | df = diamonds.drop_duplicates('depth') 37 | assert df.equals(d) 38 | 39 | d = diamonds >> distinct(X.cut, 'depth') 40 | df = diamonds.drop_duplicates(['cut','depth']) 41 | assert df.equals(d) 42 | 43 | df = diamonds[['carat', 'cut']].drop_duplicates() 44 | d = diamonds >> select(X.carat, X.cut) >> distinct() 45 | assert df.equals(d) 46 | 47 | df = diamonds[['carat', 'cut']].drop_duplicates(keep='last') 48 | d = diamonds >> select(X.carat, X.cut) >> distinct(keep='last') 49 | assert df.equals(d) 50 | 51 | 52 | def test_sample(): 53 | random_state = 55 54 | 55 | d = diamonds >> sample(n=10, random_state=random_state) 56 | df = diamonds.sample(n=10, random_state=random_state) 57 | assert df.equals(d) 58 | 59 | d = diamonds >> sample(frac=0.001, random_state=random_state) 60 | df = 
diamonds.sample(frac=0.001, random_state=random_state) 61 | assert df.equals(d) 62 | 63 | d = diamonds >> group_by(X.cut) >> sample(n=10, random_state=random_state) 64 | d = d.reset_index(drop=True) 65 | df = diamonds.groupby('cut').apply(lambda x: x.sample(n=10, random_state=random_state)) 66 | df = df.reset_index(drop=True) 67 | assert df.equals(d) 68 | 69 | 70 | def test_row_slice(): 71 | df = diamonds.iloc[[0,1],:] 72 | assert df.equals(diamonds >> row_slice([0,1])) 73 | df = diamonds.groupby('cut').apply(lambda df: df.iloc[0,:]).reset_index(drop=True) 74 | d = diamonds >> group_by(X.cut) >> row_slice(0) 75 | assert df.equals(d.reset_index(drop=True)) 76 | df = diamonds.loc[diamonds.table > 61, :] 77 | assert df.equals(diamonds >> row_slice(X.table > 61)) 78 | 79 | 80 | def test_mask(): 81 | test1 = diamonds >> mask(X.cut == 'Ideal') 82 | df = diamonds[diamonds.cut == 'Ideal'] 83 | assert df.equals(test1) 84 | 85 | test2 = diamonds >> mask(X.cut == 'Ideal', X.color == 'E', 86 | X.table < 55, X.price < 500) 87 | df_mask = (diamonds.cut == 'Ideal') & (diamonds.color == 'E') 88 | df_mask = df_mask & (diamonds.table < 55) & (diamonds.price < 500) 89 | df = diamonds[df_mask] 90 | assert df.equals(test2) 91 | 92 | 93 | # def test_mask_small(): 94 | # a = (diamonds >> group_by(X.cut) >> arrange(X.price) >> 95 | # head(3) >> ungroup() >> mask(X.carat < 0.23)) 96 | # print(a) 97 | # assert False 98 | 99 | # d = diamonds >> group_by(X.cut) >> mutate(price_lag=lag(X.price)) >> head(2) >> select(X.cut, X.price_lag) 100 | 101 | def test_top_n(): 102 | with pytest.raises(ValueError): 103 | diamonds >> top_n() 104 | test2 = diamonds >> top_n(n=6) 105 | df2 = diamonds.sort_values('z', ascending=False).head(6).sort_index() 106 | assert test2.equals(df2) 107 | test3 = diamonds >> top_n(col=X.x, n=5) 108 | df3 = diamonds.sort_values('x', ascending=False).head(5).sort_index() 109 | assert test3.equals(df3) 110 | test4 = diamonds >> top_n(col=X.cut, n=1) 111 | df4 = diamonds[diamonds.cut == 'Very Good'] 112 | assert test4.equals(df4) 113 | test5 = diamonds >> group_by(X.cut) >> top_n(n=2) 114 | df5 = diamonds.loc[[27415, 27630, 23539, 27517, 27518, 24297, 24328, 24067, 25999, 26444, 48410]] 115 | assert test5.equals(df5) 116 | test6 = diamonds >> top_n(col=X.x, ascending=False, n=5) 117 | df6 = diamonds.sort_values('x', ascending=True).head(8).sort_index() 118 | assert test6.equals(df6) 119 | -------------------------------------------------------------------------------- /dfply/summary_functions.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .vector import * 3 | 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Series summary functions 7 | # ------------------------------------------------------------------------------ 8 | 9 | 10 | @make_symbolic 11 | def mean(series): 12 | """ 13 | Returns the mean of a series. 14 | 15 | Args: 16 | series (pandas.Series): column to summarize. 17 | """ 18 | 19 | if np.issubdtype(series.dtype, np.number): 20 | return series.mean() 21 | else: 22 | return np.nan 23 | 24 | 25 | @make_symbolic 26 | def first(series, order_by=None): 27 | """ 28 | Returns the first value of a series. 29 | 30 | Args: 31 | series (pandas.Series): column to summarize. 32 | 33 | Kwargs: 34 | order_by: a pandas.Series or list of series (can be symbolic) to order 35 | the input series by before summarization.
36 | """ 37 | 38 | if order_by is not None: 39 | series = order_series_by(series, order_by) 40 | first_s = series.iloc[0] 41 | return first_s 42 | 43 | 44 | @make_symbolic 45 | def last(series, order_by=None): 46 | """ 47 | Returns the last value of a series. 48 | 49 | Args: 50 | series (pandas.Series): column to summarize. 51 | 52 | Kwargs: 53 | order_by: a pandas.Series or list of series (can be symbolic) to order 54 | the input series by before summarization. 55 | """ 56 | 57 | if order_by is not None: 58 | series = order_series_by(series, order_by) 59 | last_s = series.iloc[series.size - 1] 60 | return last_s 61 | 62 | 63 | @make_symbolic 64 | def nth(series, n, order_by=None): 65 | """ 66 | Returns the nth value of a series. 67 | 68 | Args: 69 | series (pandas.Series): column to summarize. 70 | n (integer): position of desired value. Returns `NaN` if out of range. 71 | 72 | Kwargs: 73 | order_by: a pandas.Series or list of series (can be symbolic) to order 74 | the input series by before summarization. 75 | """ 76 | 77 | if order_by is not None: 78 | series = order_series_by(series, order_by) 79 | try: 80 | return series.iloc[n] 81 | except: 82 | return np.nan 83 | 84 | 85 | @make_symbolic 86 | def n(series): 87 | """ 88 | Returns the length of a series. 89 | 90 | Args: 91 | series (pandas.Series): column to summarize. 92 | """ 93 | 94 | n_s = series.size 95 | return n_s 96 | 97 | 98 | @make_symbolic 99 | def n_distinct(series): 100 | """ 101 | Returns the number of distinct values in a series. 102 | 103 | Args: 104 | series (pandas.Series): column to summarize. 105 | """ 106 | 107 | n_distinct_s = series.unique().size 108 | return n_distinct_s 109 | 110 | 111 | @make_symbolic 112 | def IQR(series): 113 | """ 114 | Returns the inter-quartile range (IQR) of a series. 115 | 116 | The IRQ is defined as the 75th quantile minus the 25th quantile values. 117 | 118 | Args: 119 | series (pandas.Series): column to summarize. 120 | """ 121 | 122 | iqr_s = series.quantile(.75) - series.quantile(.25) 123 | return iqr_s 124 | 125 | 126 | @make_symbolic 127 | def colmin(series): 128 | """ 129 | Returns the minimum value of a series. 130 | 131 | Args: 132 | series (pandas.Series): column to summarize. 133 | """ 134 | 135 | min_s = series.min() 136 | return min_s 137 | 138 | 139 | @make_symbolic 140 | def colmax(series): 141 | """ 142 | Returns the maximum value of a series. 143 | 144 | Args: 145 | series (pandas.Series): column to summarize. 146 | """ 147 | 148 | max_s = series.max() 149 | return max_s 150 | 151 | 152 | @make_symbolic 153 | def median(series): 154 | """ 155 | Returns the median value of a series. 156 | 157 | Args: 158 | series (pandas.Series): column to summarize. 159 | """ 160 | 161 | if np.issubdtype(series.dtype, np.number): 162 | return series.median() 163 | else: 164 | return np.nan 165 | 166 | 167 | @make_symbolic 168 | def var(series): 169 | """ 170 | Returns the variance of values in a series. 171 | 172 | Args: 173 | series (pandas.Series): column to summarize. 174 | """ 175 | if np.issubdtype(series.dtype, np.number): 176 | return series.var() 177 | else: 178 | return np.nan 179 | 180 | 181 | @make_symbolic 182 | def sd(series): 183 | """ 184 | Returns the standard deviation of values in a series. 185 | 186 | Args: 187 | series (pandas.Series): column to summarize. 
188 | """ 189 | 190 | if np.issubdtype(series.dtype, np.number): 191 | return series.std() 192 | else: 193 | return np.nan 194 | -------------------------------------------------------------------------------- /RELEASES.txt: -------------------------------------------------------------------------------- 1 | v0.3.3 2 | - Hotfix for parsing left_on and right_on 3 | 4 | TODO: Need to figure out fix to the inversion of symbol issue. Somewhat complicated. 5 | 6 | v0.3.2 7 | Various PRs added fixing bugs, etc. 8 | 9 | v0.3.1 10 | This update is almost solely the pull requests by @bleearmstrong that were sitting 11 | in the repo waiting. There were some minor bug-fixes and changes too. 12 | 13 | - `select_if` and `drop_if` are now available to perform selection according to a function 14 | - `mutate_if` allows variable creation if a criterion function is evaluated as True 15 | - `row_number` window function is available (same as rank(method='first')) 16 | - `distinct` can take no arguments, making it equivalent to `drop_duplicates` with no arguments 17 | 18 | v0.3.0 19 | Lots and lots of big changes here. Total reworking of the internal functionality. 20 | The good news is that it should (basically) work the same as before, but 21 | hopefully better. Obviously keep in mind that this is still beta and there will 22 | be plenty of bugs to work out on the horizon, but the preexisting tests pass for now... 23 | There is not backward compatibility with old versions as the decorator names 24 | have changed, but again, the functionality is otherwise the same. 25 | 26 | Some major things (see readme for details): 27 | - Moved entirely to python 3 support only. It may still work in python 2, but don't count on it. 28 | - pandas-ply is no longer required; It was brittle and so rolled my own stuff. 29 | - Selection "subfunctions" are now working and the selection functions have been changed in light of this. 30 | 31 | 32 | v0.2.4 33 | - Bug fixed in semi-join and anti-join 34 | - top_n added 35 | 36 | v0.2.3 37 | Inplace piping added using the `>>=` operator. The `pipe` decorator internals 38 | have been changed to make this possible through the addition of an `__rshift__` 39 | implementation and chaining pipes together until evaluated against a 40 | DataFrame. 41 | 42 | 43 | v0.2.2 44 | - Added docstrings to functions and classes. 45 | - Added the `case_when` function. 46 | - Fixed `arrange` to work with symbolic functions like `desc` in the function 47 | call. 48 | - Added `cumany` and `cumall` window functions. 49 | - Added `if_else` function. 50 | - Added `na_if` function. 51 | - Added `percent_rank` function. 52 | - Reorganization of decorator functions (better subclassing). 53 | 54 | 55 | v0.2.1 56 | Fixed an issue with the way the decorators were structured, particularly 57 | the @make_symbolic, that would cause problems with returning Call objects 58 | that would not evaluate properly. Hopefully this is now resolved. 59 | 60 | The "coalesce" function was added from dplyr. 61 | 62 | Some code was moved from base.py to the new vector.py file. The vector.py 63 | file now contains functions that specifically perform operations on 64 | series or numpy arrays (such as coalesce, desc, etc.). Test files have been 65 | reorganized accordingly. 66 | 67 | 68 | v0.2.0 69 | This release now introduces the @make_symbolic decorator, which can wrap 70 | functions to allow them to evaluate later. This is particularly (and perhaps 71 | only) useful when you embed functions as arguments to other functions. 
For 72 | example, the summary and window functions. 73 | 74 | The code for the symbolic handling decorators has been reworked. They now 75 | inherit from a common class since they shared patterns in their code. 76 | 77 | - @make_symbolic decorator 78 | - README updates 79 | - desc() and order_series_by() functions 80 | - re-imagining of the code for @symbolic_evaluation, @symbolic_reference 81 | (the functionality remains unchanged) 82 | - window and summary functions, along with their tests, have been moved around 83 | to other files. 84 | 85 | 86 | v0.1.10 87 | - `separate` and `unite` functions added. 88 | - Summary functions added for series operations. 89 | - README improved dramatically. 90 | - Function docstrings added to more functions (still not all). 91 | 92 | v0.1.9 93 | Moved unit tests into individual files that reflect the categories of the 94 | functions/features they are testing. Some small bugs have been fixed as well. 95 | 96 | v0.1.8 97 | The pipe decorator now copies the dataframe upon each chained function, along 98 | with the `_grouped_by` attribute, if any. Before, operations with the pipe 99 | functions were modifying the original dataframe (such as `mutate`). 100 | 101 | v0.1.7 102 | Restructuring of package to include `diamonds.csv` with pip installation 103 | and require `six` and `pandas-ply` rather than coming pre-packaged with them. 104 | 105 | v0.1.6 106 | Added window functions: 107 | `dense_rank` 108 | `min_rank` 109 | `cumsum` 110 | `cummean` 111 | `cummax` 112 | `cummin` 113 | `cumprod` 114 | 115 | 116 | v0.1.5 117 | dplyr set operations added thanks to bleearmstrong. 118 | 119 | `df >> union(other)` 120 | Rows that appear in either `df` or `other`. 121 | 122 | `df >> intersect(other)` 123 | Rows that appear in both `df` and `other`. 124 | 125 | `df >> set_diff(other)` 126 | Rows that appear in `df` but not `other`.
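The v0.2.3 entry above describes the `>>` / `>>=` piping mechanic in prose only. Below is a minimal, hypothetical sketch of how such an operator can work; it is not dfply's actual `pipe` decorator (which additionally handles grouping and symbolic argument evaluation), and the `Pipe` class and `head2` name are invented here for illustration.

import pandas as pd


class Pipe:
    """Defer a DataFrame -> DataFrame function until a DataFrame is piped in."""

    def __init__(self, function):
        self.function = function

    def __rrshift__(self, df):
        # Handles `df >> pipe` (and therefore `df >>= pipe`). pandas DataFrames
        # do not define `>>`, so Python falls back to the right operand's
        # __rrshift__, which evaluates the deferred function against a copy.
        return self.function(df.copy())

    def __rshift__(self, other):
        # Handles `pipe >> pipe`: compose into a single deferred pipe instead
        # of evaluating, so a chain only runs once a real DataFrame arrives.
        return Pipe(lambda df: other.function(self.function(df)))


head2 = Pipe(lambda df: df.head(2))
df = pd.DataFrame({'a': [1, 2, 3, 4]})
result = df >> head2 >> head2  # evaluates left to right
df >>= head2                   # the in-place form simply rebinds the name

Copying the frame in `__rrshift__` mirrors the v0.1.8 note above: each piped step operates on a copy, so chained operations do not mutate the original DataFrame.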
127 | -------------------------------------------------------------------------------- /dfply/select.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .base import * 4 | 5 | 6 | # ------------------------------------------------------------------------------ 7 | # Select and drop operators 8 | # ------------------------------------------------------------------------------ 9 | 10 | def selection_context(arg, context): 11 | if isinstance(arg, Intention): 12 | arg = arg.evaluate(context) 13 | if isinstance(arg, pd.Index): 14 | arg = list(arg) 15 | if isinstance(arg, pd.Series): 16 | arg = arg.name 17 | return arg 18 | 19 | 20 | def selection_filter(f): 21 | def wrapper(*args, **kwargs): 22 | return Intention(lambda x: f(list(x.columns), 23 | *(selection_context(a, x) for a in args), 24 | **{k: selection_context(v, x) for k, v in kwargs.items()})) 25 | 26 | return wrapper 27 | 28 | 29 | def resolve_selection(df, *args, drop=False): 30 | if len(args) > 0: 31 | args = [a for a in flatten(args)] 32 | ordering = [] 33 | column_indices = np.zeros(df.shape[1]) 34 | for selector in args: 35 | visible = np.where(selector != 0)[0] 36 | if not drop: 37 | column_indices[visible] = selector[visible] 38 | else: 39 | column_indices[visible] = selector[visible] * -1 40 | for selection in np.where(selector == 1)[0]: 41 | if not df.columns[selection] in ordering: 42 | ordering.append(df.columns[selection]) 43 | else: 44 | ordering = list(df.columns) 45 | column_indices = np.ones(df.shape[1]) 46 | return ordering, column_indices 47 | 48 | 49 | @pipe 50 | @group_delegation 51 | @symbolic_evaluation(eval_as_selector=True) 52 | def select(df, *args): 53 | ordering, column_indices = resolve_selection(df, *args) 54 | if (column_indices == 0).all(): 55 | return df[[]] 56 | selection = np.where((column_indices == np.max(column_indices)) & 57 | (column_indices >= 0))[0] 58 | df = df.iloc[:, selection] 59 | if all([col in ordering for col in df.columns]): 60 | ordering = [c for c in ordering if c in df.columns] 61 | return df[ordering] 62 | else: 63 | return df 64 | 65 | 66 | @pipe 67 | @group_delegation 68 | @symbolic_evaluation(eval_as_selector=True) 69 | def drop(df, *args): 70 | _, column_indices = resolve_selection(df, *args, drop=True) 71 | if (column_indices == 0).all(): 72 | return df[[]] 73 | selection = np.where((column_indices == np.max(column_indices)) & 74 | (column_indices >= 0))[0] 75 | return df.iloc[:, selection] 76 | 77 | 78 | @pipe 79 | def select_if(df, fun): 80 | """Selects columns where fun(ction) is true 81 | Args: 82 | fun: a function that will be applied to columns 83 | """ 84 | 85 | def _filter_f(col): 86 | try: 87 | return fun(df[col]) 88 | except: 89 | return False 90 | 91 | cols = list(filter(_filter_f, df.columns)) 92 | return df[cols] 93 | 94 | 95 | @pipe 96 | def drop_if(df, fun): 97 | """Drops columns where fun(ction) is true 98 | Args: 99 | fun: a function that will be applied to columns 100 | """ 101 | 102 | def _filter_f(col): 103 | try: 104 | return fun(df[col]) 105 | except: 106 | return False 107 | 108 | cols = list(filter(_filter_f, df.columns)) 109 | return df.drop(cols, axis=1) 110 | 111 | 112 | @selection_filter 113 | def starts_with(columns, prefix): 114 | return [c for c in columns if c.startswith(prefix)] 115 | 116 | 117 | @selection_filter 118 | def ends_with(columns, suffix): 119 | return [c for c in columns if c.endswith(suffix)] 120 | 121 | 122 | @selection_filter 123 | def contains(columns, substr): 124 | 
return [c for c in columns if substr in c] 125 | 126 | 127 | @selection_filter 128 | def matches(columns, pattern): 129 | return [c for c in columns if re.search(pattern, c)] 130 | 131 | 132 | @selection_filter 133 | def everything(columns): 134 | return columns 135 | 136 | 137 | @selection_filter 138 | def num_range(columns, prefix, range): 139 | colnames = [prefix + str(i) for i in range] 140 | return [c for c in columns if c in colnames] 141 | 142 | 143 | @selection_filter 144 | def one_of(columns, specified): 145 | return [c for c in columns if c in specified] 146 | 147 | 148 | @selection_filter 149 | def columns_between(columns, start_col, end_col, inclusive=True): 150 | if isinstance(start_col, str): 151 | start_col = columns.index(start_col) 152 | if isinstance(end_col, str): 153 | end_col = columns.index(end_col) 154 | return columns[start_col:end_col + int(inclusive)] 155 | 156 | 157 | @selection_filter 158 | def columns_from(columns, start_col): 159 | if isinstance(start_col, str): 160 | start_col = columns.index(start_col) 161 | return columns[start_col:] 162 | 163 | 164 | @selection_filter 165 | def columns_to(columns, end_col, inclusive=False): 166 | if isinstance(end_col, str): 167 | end_col = columns.index(end_col) 168 | return columns[:end_col + int(inclusive)] 169 | -------------------------------------------------------------------------------- /dfply/window_functions.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | 3 | 4 | # ------------------------------------------------------------------------------ 5 | # Window functions 6 | # ------------------------------------------------------------------------------ 7 | 8 | @make_symbolic 9 | def lead(series, i=1): 10 | """ 11 | Returns a series shifted forward by a value. `NaN` values will be filled 12 | in at the end. 13 | 14 | Same as a call to `series.shift(-i)` 15 | 16 | Args: 17 | series: column to shift forward. 18 | i (int): number of positions to shift forward. 19 | """ 20 | 21 | shifted = series.shift(i * -1) 22 | return shifted 23 | 24 | 25 | @make_symbolic 26 | def lag(series, i=1): 27 | """ 28 | Returns a series shifted backwards by a value. `NaN` values will be filled 29 | in at the beginning. 30 | 31 | Same as a call to `series.shift(i)` 32 | 33 | Args: 34 | series: column to shift backward. 35 | i (int): number of positions to shift backward. 36 | """ 37 | 38 | shifted = series.shift(i) 39 | return shifted 40 | 41 | 42 | @make_symbolic 43 | def between(series, a, b, inclusive=False): 44 | """ 45 | Returns a boolean series specifying whether rows of the input series 46 | are between values `a` and `b`. 47 | 48 | Args: 49 | series: column to compare, typically symbolic. 50 | a: value series must be greater than (or equal to if `inclusive=True`) 51 | for the output series to be `True` at that position. 52 | b: value series must be less than (or equal to if `inclusive=True`) for 53 | the output series to be `True` at that position. 54 | 55 | Kwargs: 56 | inclusive (bool): If `True`, comparison is done with `>=` and `<=`. 57 | If `False` (the default), comparison uses `>` and `<`. 58 | """ 59 | 60 | if inclusive: 61 | met_condition = (series >= a) & (series <= b) 62 | else: 63 | met_condition = (series > a) & (series < b) 64 | return met_condition 65 | 66 | 67 | @make_symbolic 68 | def dense_rank(series, ascending=True): 69 | """ 70 | Equivalent to `series.rank(method='dense', ascending=ascending)`.
71 | 72 | Args: 73 | series: column to rank. 74 | 75 | Kwargs: 76 | ascending (bool): whether to rank in ascending order (default is `True`). 77 | """ 78 | 79 | ranks = series.rank(method='dense', ascending=ascending) 80 | return ranks 81 | 82 | 83 | @make_symbolic 84 | def min_rank(series, ascending=True): 85 | """ 86 | Equivalent to `series.rank(method='min', ascending=ascending)`. 87 | 88 | Args: 89 | series: column to rank. 90 | 91 | Kwargs: 92 | ascending (bool): whether to rank in ascending order (default is `True`). 93 | """ 94 | 95 | ranks = series.rank(method='min', ascending=ascending) 96 | return ranks 97 | 98 | 99 | @make_symbolic 100 | def cumsum(series): 101 | """ 102 | Calculates cumulative sum of values. Equivalent to `series.cumsum()`. 103 | 104 | Args: 105 | series: column to compute cumulative sum for. 106 | """ 107 | 108 | sums = series.cumsum() 109 | return sums 110 | 111 | 112 | @make_symbolic 113 | def cummean(series): 114 | """ 115 | Calculates cumulative mean of values. Equivalent to 116 | `series.expanding().mean()`. 117 | 118 | Args: 119 | series: column to compute cumulative mean for. 120 | """ 121 | 122 | means = series.expanding().mean() 123 | return means 124 | 125 | 126 | @make_symbolic 127 | def cummax(series): 128 | """ 129 | Calculates cumulative maximum of values. Equivalent to 130 | `series.expanding().max()`. 131 | 132 | Args: 133 | series: column to compute cumulative maximum for. 134 | """ 135 | 136 | maxes = series.expanding().max() 137 | return maxes 138 | 139 | 140 | @make_symbolic 141 | def cummin(series): 142 | """ 143 | Calculates cumulative minimum of values. Equivalent to 144 | `series.expanding().min()`. 145 | 146 | Args: 147 | series: column to compute cumulative minimum for. 148 | """ 149 | 150 | mins = series.expanding().min() 151 | return mins 152 | 153 | 154 | @make_symbolic 155 | def cumprod(series): 156 | """ 157 | Calculates cumulative product of values. Equivalent to 158 | `series.cumprod()`. 159 | 160 | Args: 161 | series: column to compute cumulative product for. 162 | """ 163 | 164 | prods = series.cumprod() 165 | return prods 166 | 167 | 168 | @make_symbolic 169 | def cumany(series): 170 | """ 171 | Calculates cumulative any of values. Equivalent to 172 | `series.expanding().apply(np.any).astype(bool)`. 173 | 174 | Args: 175 | series: column to compute cumulative any for. 176 | """ 177 | 178 | anys = series.expanding().apply(np.any).astype(bool) 179 | return anys 180 | 181 | 182 | @make_symbolic 183 | def cumall(series): 184 | """ 185 | Calculates cumulative all of values. Equivalent to 186 | `series.expanding().apply(np.all).astype(bool)`. 187 | 188 | Args: 189 | series: column to compute cumulative all for. 190 | """ 191 | 192 | alls = series.expanding().apply(np.all).astype(bool) 193 | return alls 194 | 195 | 196 | @make_symbolic 197 | def percent_rank(series, ascending=True): 198 | if series.size == 1: 199 | return 0 200 | percents = (series.rank(method='min', ascending=ascending) - 1) / (series.size - 1) 201 | return percents 202 | 203 | 204 | @make_symbolic 205 | def row_number(series, ascending=True): 206 | """ 207 | Returns row number based on column rank 208 | Equivalent to `series.rank(method='first', ascending=ascending)`. 209 | 210 | Args: 211 | series: column to rank. 212 | 213 | Kwargs: 214 | ascending (bool): whether to rank in ascending order (default is `True`). 
215 | 216 | Usage: 217 | diamonds >> head() >> mutate(rn=row_number(X.x)) 218 | 219 | carat cut color clarity depth table price x y z rn 220 | 0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 2.0 221 | 1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 1.0 222 | 2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 3.0 223 | 3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 4.0 224 | 4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 5.0 225 | """ 226 | 227 | series_rank = series.rank(method='first', ascending=ascending) 228 | return series_rank 229 | -------------------------------------------------------------------------------- /dfply/set_ops.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | import warnings 3 | import pandas as pd 4 | 5 | 6 | def validate_set_ops(df, other): 7 | """ 8 | Helper function to ensure that DataFrames are valid for set operations. 9 | Columns must be the same name in the same order, and indices must be of the 10 | same dimension with the same names. 11 | """ 12 | 13 | if df.columns.values.tolist() != other.columns.values.tolist(): 14 | not_in_df = [col for col in other.columns if col not in df.columns] 15 | not_in_other = [col for col in df.columns if col not in other.columns] 16 | error_string = 'Error: not compatible.' 17 | if len(not_in_df): 18 | error_string += ' Cols in y but not x: ' + str(not_in_df) + '.' 19 | if len(not_in_other): 20 | error_string += ' Cols in x but not y: ' + str(not_in_other) + '.' 21 | raise ValueError(error_string) 22 | if len(df.index.names) != len(other.index.names): 23 | raise ValueError('Index dimension mismatch') 24 | if df.index.names != other.index.names: 25 | raise ValueError('Index mismatch') 26 | else: 27 | return 28 | 29 | 30 | # ------------------------------------------------------------------------------ 31 | # `union` 32 | # ------------------------------------------------------------------------------ 33 | 34 | @pipe 35 | def union(df, other, index=False, keep='first'): 36 | """ 37 | Returns rows that appear in either DataFrame. 38 | 39 | Args: 40 | df (pandas.DataFrame): data passed in through the pipe. 41 | other (pandas.DataFrame): other DataFrame to use for set operation with 42 | the first. 43 | 44 | Kwargs: 45 | index (bool): Boolean indicating whether to consider the pandas index 46 | as part of the set operation (default `False`). 47 | keep (str): Indicates which duplicate should be kept. Options are `'first'` 48 | and `'last'`. 49 | """ 50 | validate_set_ops(df, other) 51 | stacked = pd.concat([df, other]) 52 | if index: 53 | stacked_reset_indexes = stacked.reset_index() 54 | index_cols = [col for col in stacked_reset_indexes.columns if col not in df.columns] 55 | index_name = df.index.names 56 | return_df = stacked_reset_indexes.drop_duplicates(keep=keep).set_index(index_cols) 57 | return_df.index.names = index_name 58 | return return_df 59 | else: 60 | return stacked.drop_duplicates(keep=keep) 61 | 62 | 63 | # ------------------------------------------------------------------------------ 64 | # `intersect` 65 | # ------------------------------------------------------------------------------ 66 | 67 | 68 | @pipe 69 | def intersect(df, other, index=False, keep='first'): 70 | """ 71 | Returns rows that appear in both DataFrames. 72 | 73 | Args: 74 | df (pandas.DataFrame): data passed in through the pipe. 75 | other (pandas.DataFrame): other DataFrame to use for set operation with 76 | the first.
77 | 78 | Kwargs: 79 | index (bool): Boolean indicating whether to consider the pandas index 80 | as part of the set operation (default `False`). 81 | keep (str): Indicates which duplicate should be kept. Options are `'first'` 82 | and `'last'`. 83 | """ 84 | 85 | validate_set_ops(df, other) 86 | if index: 87 | df_reset_index = df.reset_index() 88 | other_reset_index = other.reset_index() 89 | index_cols = [col for col in df_reset_index.columns if col not in df.columns] 90 | df_index_names = df.index.names 91 | return_df = (pd.merge(df_reset_index, other_reset_index, 92 | how='inner', 93 | left_on=df_reset_index.columns.values.tolist(), 94 | right_on=df_reset_index.columns.values.tolist()) 95 | .set_index(index_cols)) 96 | return_df.index.names = df_index_names 97 | return_df = return_df.drop_duplicates(keep=keep) 98 | return return_df 99 | else: 100 | return_df = pd.merge(df, other, 101 | how='inner', 102 | left_on=df.columns.values.tolist(), 103 | right_on=df.columns.values.tolist()) 104 | return_df = return_df.drop_duplicates(keep=keep) 105 | return return_df 106 | 107 | 108 | # ------------------------------------------------------------------------------ 109 | # `set_diff` 110 | # ------------------------------------------------------------------------------ 111 | 112 | 113 | @pipe 114 | def set_diff(df, other, index=False, keep='first'): 115 | """ 116 | Returns rows that appear in the first DataFrame but not the second. 117 | 118 | Args: 119 | df (pandas.DataFrame): data passed in through the pipe. 120 | other (pandas.DataFrame): other DataFrame to use for set operation with 121 | the first. 122 | 123 | Kwargs: 124 | index (bool): Boolean indicating whether to consider the pandas index 125 | as part of the set operation (default `False`). 126 | keep (str): Indicates which duplicate should be kept. Options are `'first'` 127 | and `'last'`. 
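Example (an illustrative sketch; `df` and `other` are hypothetical frames sharing the same columns):

    df = pd.DataFrame({'x': [1, 1, 2], 'y': ['a', 'b', 'a']})
    other = pd.DataFrame({'x': [1, 2], 'y': ['b', 'c']})
    df >> set_diff(other)

       x  y
    0  1  a
    2  2  a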
128 | """ 129 | 130 | validate_set_ops(df, other) 131 | if index: 132 | df_reset_index = df.reset_index() 133 | other_reset_index = other.reset_index() 134 | index_cols = [col for col in df_reset_index.columns if col not in df.columns] 135 | df_index_names = df.index.names 136 | return_df = (pd.merge(df_reset_index, other_reset_index, 137 | how='left', 138 | left_on=df_reset_index.columns.values.tolist(), 139 | right_on=other_reset_index.columns.values.tolist(), 140 | indicator=True) 141 | .set_index(index_cols)) 142 | return_df = return_df[return_df._merge == 'left_only'] 143 | return_df.index.names = df_index_names 144 | return_df = return_df.drop_duplicates(keep=keep)[df.columns] 145 | return return_df 146 | else: 147 | return_df = pd.merge(df, other, 148 | how='left', 149 | left_on=df.columns.values.tolist(), 150 | right_on=df.columns.values.tolist(), 151 | indicator=True) 152 | return_df = return_df[return_df._merge == 'left_only'] 153 | return_df = return_df.drop_duplicates(keep=keep)[df.columns] 154 | return return_df 155 | -------------------------------------------------------------------------------- /test/test_reshape.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | ##============================================================================== 6 | ## reshape test functions 7 | ##============================================================================== 8 | 9 | 10 | def arrange_apply_helperfunc(df): 11 | df = df.sort_values('depth', ascending=False) 12 | df = df.head(5) 13 | return df 14 | 15 | # def test_arrange_small(): 16 | # d = diamonds >> arrange(desc(X.cut), desc(X.price)) 17 | # print(d.head(25)) 18 | # assert False 19 | 20 | 21 | def test_arrange(): 22 | df = diamonds.groupby('cut').apply(arrange_apply_helperfunc).reset_index(drop=True) 23 | d = (diamonds >> group_by('cut') >> arrange('depth', ascending=False) >> 24 | head(5) >> ungroup()).reset_index(drop=True) 25 | #print('df', df, df.shape) 26 | #print('d', d, d.shape) 27 | assert df.equals(d) 28 | 29 | d = (diamonds >> group_by('cut') >> arrange(X.depth, ascending=False) >> 30 | head(5) >> ungroup()).reset_index(drop=True) 31 | assert df.equals(d) 32 | 33 | print(type(d), type(df), type(diamonds)) 34 | 35 | df = diamonds.sort_values(['cut','price'], ascending=False) 36 | d = diamonds >> arrange(desc(X.cut), desc(X.price)) 37 | print('df', df >> head(5)) 38 | print('d', d >> head(5)) 39 | assert df.equals(d) 40 | 41 | 42 | def test_rename(): 43 | df = diamonds.rename(columns={'cut':'Cut','table':'Table','carat':'Carat'}) 44 | d = diamonds >> rename(Cut=X.cut, Table=X.table, Carat='carat') 45 | assert df.equals(d) 46 | 47 | 48 | @pytest.fixture 49 | def elongated(): 50 | elongated = diamonds >> gather('variable', 'value', add_id=True) 51 | return elongated 52 | 53 | 54 | def test_gather(elongated): 55 | d = diamonds >> gather('variable', 'value', ['price', 'depth','x','y','z']) 56 | 57 | variables = ['price','depth','x','y','z'] 58 | id_vars = [c for c in diamonds.columns if c not in variables] 59 | df = pd.melt(diamonds, id_vars, variables, 'variable', 'value') 60 | 61 | assert df.equals(d) 62 | 63 | d = diamonds >> gather('variable', 'value') 64 | 65 | variables = diamonds.columns.tolist() 66 | id_vars = [] 67 | df = pd.melt(diamonds, id_vars, variables, 'variable', 'value') 68 | 69 | assert df.equals(d) 70 | 71 | df = diamonds.copy() 72 | df['_ID'] = np.arange(df.shape[0]) 73 | df = pd.melt(df, ['_ID'], variables, 
'variable', 'value') 74 | 75 | assert df.equals(elongated) 76 | 77 | 78 | def test_spread(elongated): 79 | 80 | columns = elongated.columns.tolist() 81 | id_cols = ['_ID'] 82 | 83 | df = elongated.copy() 84 | df['temp_index'] = df['_ID'].values 85 | df = df.set_index('temp_index') 86 | spread_data = df[['variable','value']] 87 | 88 | spread_data = spread_data.pivot(columns='variable', values='value') 89 | converted_spread = spread_data.copy() 90 | 91 | columns_to_convert = [col for col in spread_data if col not in columns] 92 | converted_spread = convert_type(converted_spread, columns_to_convert) 93 | 94 | df = df[['_ID']].drop_duplicates() 95 | 96 | df_spread = df.merge(spread_data, left_index=True, right_index=True).reset_index(drop=True) 97 | df_conv = df.merge(converted_spread, left_index=True, right_index=True).reset_index(drop=True) 98 | 99 | d_spread = elongated >> spread('variable', 'value') 100 | d_spread_conv = elongated >> spread('variable', 'value', convert=True) 101 | 102 | assert df_spread.equals(d_spread) 103 | assert df_conv.equals(d_spread_conv) 104 | 105 | 106 | def test_separate(): 107 | 108 | d = pd.DataFrame({ 109 | 'a':['1-a-3','1-b','1-c-3-4','9-d-1','10'] 110 | }) 111 | 112 | test1 = d >> separate(X.a, ['a1','a2','a3'], 113 | remove=True, convert=False, 114 | extra='merge', fill='right') 115 | 116 | true1 = pd.DataFrame({ 117 | 'a1':['1','1','1','9','10'], 118 | 'a2':['a','b','c','d',np.nan], 119 | 'a3':['3',np.nan,'3-4','1',np.nan] 120 | }) 121 | print(test1) 122 | print(true1) 123 | assert true1.equals(test1) 124 | 125 | test2 = d >> separate(X.a, ['a1','a2','a3'], 126 | remove=True, convert=False, 127 | extra='merge', fill='left') 128 | 129 | true2 = pd.DataFrame({ 130 | 'a1':['1',np.nan,'1','9',np.nan], 131 | 'a2':['a','1','c','d',np.nan], 132 | 'a3':['3','b','3-4','1','10'] 133 | }) 134 | assert true2.equals(test2) 135 | 136 | test3 = d >> separate(X.a, ['a1','a2','a3'], 137 | remove=True, convert=True, 138 | extra='merge', fill='right') 139 | 140 | true3 = pd.DataFrame({ 141 | 'a1':[1,1,1,9,10], 142 | 'a2':['a','b','c','d',np.nan], 143 | 'a3':['3',np.nan,'3-4','1',np.nan] 144 | }) 145 | assert true3.equals(test3) 146 | 147 | test4 = d >> separate(X.a, ['col1','col2'], sep=[1,3], 148 | remove=True, convert=False, extra='drop', fill='left') 149 | 150 | true4 = pd.DataFrame({ 151 | 'col1':['1','1','1','9','1'], 152 | 'col2':['-a','-b','-c','-d','0'] 153 | }) 154 | assert true4.equals(test4) 155 | 156 | test5 = d >> separate(X.a, ['col1','col2'], sep=[1,3], 157 | remove=False, convert=False, extra='drop', fill='left') 158 | 159 | true5 = pd.DataFrame({ 160 | 'a':['1-a-3','1-b','1-c-3-4','9-d-1','10'], 161 | 'col1':['1','1','1','9','1'], 162 | 'col2':['-a','-b','-c','-d','0'] 163 | }) 164 | assert true5.equals(test5) 165 | 166 | test6 = d >> separate(X.a, ['col1','col2','col3'], sep=[30], 167 | remove=True, convert=False, extra='drop', fill='left') 168 | 169 | true6 = pd.DataFrame({ 170 | 'col1':['1-a-3','1-b','1-c-3-4','9-d-1','10'], 171 | 'col2':[np.nan,np.nan,np.nan,np.nan,np.nan], 172 | 'col3':[np.nan,np.nan,np.nan,np.nan,np.nan] 173 | }) 174 | assert true6.equals(test6) 175 | 176 | 177 | def test_unite(): 178 | d = pd.DataFrame({ 179 | 'a':[1,2,3], 180 | 'b':['a','b','c'], 181 | 'c':[True, False, np.nan] 182 | }) 183 | 184 | test1 = d >> unite('united', X.a, 'b', 2, remove=True, na_action='maintain') 185 | true1 = pd.DataFrame({ 186 | 'united':['1_a_True','2_b_False',np.nan] 187 | }) 188 | assert true1.equals(test1) 189 | 190 | test2 = d >> unite('united', 
['a','b','c'], remove=True, na_action='ignore', 191 | sep='*') 192 | true2 = pd.DataFrame({ 193 | 'united':['1*a*True','2*b*False','3*c'] 194 | }) 195 | assert test2.equals(true2) 196 | 197 | test3 = d >> unite('united', d.columns, remove=True, na_action='as_string') 198 | true3 = pd.DataFrame({ 199 | 'united':['1_a_True','2_b_False','3_c_nan'] 200 | }) 201 | assert true3.equals(test3) 202 | 203 | test4 = d >> unite('united', d.columns, remove=False, na_action='as_string') 204 | true4 = pd.DataFrame({ 205 | 'a':[1,2,3], 206 | 'b':['a','b','c'], 207 | 'c':[True, False, np.nan], 208 | 'united':['1_a_True','2_b_False','3_c_nan'] 209 | }) 210 | 211 | print(true4) 212 | print(test4) 213 | assert true4.equals(test4) 214 | -------------------------------------------------------------------------------- /test/test_window_functions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | ##============================================================================== 6 | ## window function tests 7 | ##============================================================================== 8 | 9 | 10 | def test_lead(): 11 | d = diamonds >> mutate(price_lag = lead(X.price, i=2)) 12 | df = diamonds.assign(price_lag = diamonds.price.shift(-2)) 13 | assert df.equals(d) 14 | 15 | 16 | def test_lag(): 17 | d = diamonds >> mutate(price_lag = lag(X.price, i=2)) 18 | df = diamonds.assign(price_lag = diamonds.price.shift(2)) 19 | assert df.equals(d) 20 | 21 | 22 | def test_between(): 23 | d = diamonds >> mutate(z_btwn_x_y = between(X.z, X.x, X.y)) 24 | df = diamonds.copy() 25 | df['z_btwn_x_y'] = (df.z > df.x) & (df.z < df.y) 26 | assert df.equals(d) 27 | 28 | 29 | def test_dense_rank(): 30 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 31 | df_dr = df >> mutate(dr=dense_rank(X.x)) 32 | df_truth = df 33 | df_truth['dr'] = pd.Series([2.0, 1.0, 3.0, 4.0, 5.0]) 34 | assert df_dr.equals(df_truth) 35 | df_dr = df >> mutate(dr=dense_rank(X.cut)) 36 | df_truth['dr'] = pd.Series([2.0, 3.0, 1.0, 3.0, 1.0]) 37 | assert df_dr.equals(df_truth) 38 | df_dr = df >> group_by(X.cut) >> mutate(dr=dense_rank(X.x)) 39 | df_truth['dr'] = pd.Series([1.0, 1.0, 1.0, 2.0, 2.0]) 40 | assert df_dr.sort_index().equals(df_truth) 41 | df_dr = df >> mutate(dr=dense_rank(X.x, ascending=False)) 42 | df_truth['dr'] = pd.Series([4.0, 5.0, 3.0, 2.0, 1.0]) 43 | assert df_dr.equals(df_truth) 44 | 45 | 46 | def test_min_rank(): 47 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 48 | df_mr = df >> mutate(mr=min_rank(X.x)) 49 | df_truth = df 50 | df_truth['mr'] = pd.Series([2.0, 1.0, 3.0, 4.0, 5.0]) 51 | assert df_mr.equals(df_truth) 52 | df_mr = df >> mutate(mr=min_rank(X.cut)) 53 | df_truth['mr'] = pd.Series([3.0, 4.0, 1.0, 4.0, 1.0]) 54 | assert df_mr.equals(df_truth) 55 | df_mr = df >> group_by(X.cut) >> mutate(mr=min_rank(X.x)) 56 | df_truth['mr'] = pd.Series([1.0, 1.0, 1.0, 2.0, 2.0]) 57 | assert df_mr.sort_index().equals(df_truth) 58 | df_mr = df >> mutate(mr=min_rank(X.x, ascending=False)) 59 | df_truth['mr'] = pd.Series([4.0, 5.0, 3.0, 2.0, 1.0]) 60 | assert df_mr.equals(df_truth) 61 | 62 | 63 | def test_cumsum(): 64 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 65 | df_cs = df >> mutate(cs=cumsum(X.x)) 66 | df_truth = df 67 | df_truth['cs'] = pd.Series([3.95, 7.84, 11.89, 16.09, 20.43]) 68 | pd.util.testing.assert_frame_equal(df_cs, df_truth) 69 | #assert df_cs.equals(df_truth) 70 | df_cs = df >> group_by(X.cut) >> mutate(cs=cumsum(X.x)) 71 | 
df_truth['cs'] = pd.Series([3.95, 3.89, 4.05, 8.09, 8.39]) 72 | pd.util.testing.assert_frame_equal(df_cs.sort_index(), df_truth) 73 | #assert df_cs.equals(df_truth) 74 | 75 | 76 | def test_cummean(): 77 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 78 | df_cm = df >> mutate(cm=cummean(X.x)) 79 | df_truth = df 80 | df_truth['cm'] = pd.Series([3.950000, 3.920000, 3.963333, 4.022500, 4.086000]) 81 | pd.util.testing.assert_frame_equal(df_cm, df_truth) 82 | #assert df_cm.equals(df_truth) 83 | df_cm = df >> group_by(X.cut) >> mutate(cm=cummean(X.x)) 84 | df_truth['cm'] = pd.Series([3.950, 3.890, 4.050, 4.045, 4.195]) 85 | pd.util.testing.assert_frame_equal(df_cm.sort_index(), df_truth) 86 | #assert df_cm.equals(df_truth) 87 | 88 | 89 | def test_cummax(): 90 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 91 | df_cm = df >> mutate(cm=cummax(X.x)) 92 | df_truth = df 93 | df_truth['cm'] = pd.Series([3.95, 3.95, 4.05, 4.20, 4.34]) 94 | pd.util.testing.assert_frame_equal(df_cm, df_truth) 95 | #assert df_cm.equals(df_truth) 96 | df_cm = df >> group_by(X.cut) >> mutate(cm=cummax(X.x)) 97 | df_truth['cm'] = pd.Series([3.95, 3.89, 4.05, 4.20, 4.34]) 98 | pd.util.testing.assert_frame_equal(df_cm.sort_index(), df_truth) 99 | #assert df_cm.equals(df_truth) 100 | 101 | 102 | def test_cummin(): 103 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 104 | df_cm = df >> mutate(cm=cummin(X.x)) 105 | df_truth = df 106 | df_truth['cm'] = pd.Series([3.95, 3.89, 3.89, 3.89, 3.89]) 107 | pd.util.testing.assert_frame_equal(df_cm, df_truth) 108 | #assert df_cm.equals(df_truth) 109 | df_cm = df >> group_by(X.cut) >> mutate(cm=cummin(X.x)) 110 | df_truth['cm'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05]) 111 | pd.util.testing.assert_frame_equal(df_cm.sort_index(), df_truth) 112 | #assert df_cm.equals(df_truth) 113 | 114 | 115 | def test_cumprod(): 116 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 117 | df_cp = df >> mutate(cp=cumprod(X.x)) 118 | df_truth = df.copy() 119 | df_truth['cp'] = pd.Series([3.950000, 15.365500, 62.230275, 261.367155, 1134.333453]) 120 | pd.util.testing.assert_frame_equal(df_cp, df_truth) 121 | #assert df_cp.equals(df_truth) 122 | df_cp = df >> group_by(X.cut) >> mutate(cp=cumprod(X.x)) 123 | df_truth['cp'] = pd.Series([3.950, 3.890, 4.050, 16.338, 17.577]) 124 | # some tricky floating point stuff going on here 125 | diffs = df_cp.sort_index().cp - df_truth.cp 126 | assert all(diffs < .0000001) 127 | 128 | 129 | def test_cumany(): 130 | df = pd.DataFrame({ 131 | 'a':[False,False,True,True,False,True], 132 | 'b':['x','x','x','x','y','y'] 133 | }) 134 | 135 | d = df >> mutate(ca=cumany(X.a)) 136 | assert d.equals(df.assign(ca=[False,False,True,True,True,True])) 137 | 138 | d = df >> group_by(X.b) >> mutate(ca=cumany(X.a)) 139 | assert d.sort_index().equals(df.assign(ca=[False,False,True,True,False,True])) 140 | 141 | 142 | def test_cumall(): 143 | df = pd.DataFrame({ 144 | 'a':[True,True,False,True,False,True], 145 | 'b':['x','x','x','y','y','y'] 146 | }) 147 | 148 | d = df >> mutate(ca=cumall(X.a)) 149 | assert d.equals(df.assign(ca=[True,True,False,False,False,False])) 150 | 151 | d = df >> group_by(X.b) >> mutate(ca=cumall(X.a)) 152 | assert d.sort_index().equals(df.assign(ca=[True,True,False,True,False,False])) 153 | 154 | 155 | def test_percent_rank(): 156 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 157 | df_pr = df >> mutate(pr=percent_rank(X.x)) 158 | df_truth = df.copy() 159 | assert df_pr.equals(df_truth.assign(pr=[.25, 0.00, 0.50, 0.75, 1.00])) 
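    # A sketch of the arithmetic behind the assertion above: percent_rank
    # rescales min-rank to [0, 1] via (min_rank - 1) / (n - 1). The five x
    # values [3.95, 3.89, 4.05, 4.20, 4.34] have min ranks [2, 1, 3, 4, 5],
    # so (rank - 1) / 4 gives [.25, 0.00, 0.50, 0.75, 1.00].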
160 | df_pr = df >> mutate(pr=percent_rank(X.cut)) 161 | assert df_pr.equals(df_truth.assign(pr=[0.50, 0.75, 0.00, 0.75, 0.00])) 162 | df_pr = df >> group_by(X.cut) >> mutate(pr=percent_rank(X.x)) 163 | assert df_pr.sort_index().equals(df_truth.assign(pr=[0.0, 0.0, 0.0, 1.0, 1.0])) 164 | df_pr = df >> mutate(pr=percent_rank(X.x, ascending=False)) 165 | assert df_pr.equals(df_truth.assign(pr=[0.75, 1.0, 0.50, 0.25, 0.00])) 166 | 167 | 168 | def test_row_number(): 169 | df = diamonds.copy().head(5).sort_values(by='x') 170 | df['rn'] = range(1, df.shape[0] + 1) 171 | df['rn'] = df['rn'].astype(float) 172 | df.sort_index(inplace=True) 173 | assert df.equals(diamonds >> head(5) >> mutate(rn=row_number(X.x))) 174 | # test 2: row number with desc() option 175 | df = diamonds.copy().head(5).sort_values(by='x', ascending=False) 176 | df['rn'] = range(1, df.shape[0] + 1) 177 | df['rn'] = df['rn'].astype(float) 178 | df.sort_index(inplace=True) 179 | assert df.equals(diamonds >> head(5) >> mutate(rn=row_number(desc(X.x)))) 180 | # test 3: row number with ascending keyword 181 | df = diamonds.copy().head(5).sort_values(by='x', ascending=False) 182 | df['rn'] = range(1, df.shape[0] + 1) 183 | df['rn'] = df['rn'].astype(float) 184 | df.sort_index(inplace=True) 185 | assert df.equals(diamonds >> head(5) >> mutate(rn=row_number(X.x, ascending=False))) 186 | # test 4: with a group by 187 | df = diamonds.copy().head(5) 188 | df['rn'] = [1, 1, 1, 2, 2] 189 | df['rn'] = df['rn'].astype(float) 190 | assert df.equals((diamonds >> head(5) >> group_by(X.cut) >> mutate(rn=row_number(X.x))).sort_index()) 191 | -------------------------------------------------------------------------------- /dfply/vector.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | import collections 3 | 4 | 5 | # ------------------------------------------------------------------------------ 6 | # series ordering 7 | # ------------------------------------------------------------------------------ 8 | 9 | @make_symbolic 10 | def order_series_by(series, order_series): 11 | """ 12 | Orders one series according to another series, or a list of other 13 | series. If a list of other series are specified, ordering is done hierarchically 14 | like when a list of columns is supplied to `.sort_values()`. 15 | 16 | Args: 17 | series (:obj:`pandas.Series`): the pandas Series object to be reordered. 18 | order_series: either a pandas Series object or a list of pandas Series 19 | objects. These will be sorted using `.sort_values()` with 20 | `ascending=True`, and the new order will be used to reorder the 21 | Series supplied in the first argument. 22 | 23 | Returns: 24 | reordered `pandas.Series` object 25 | """ 26 | 27 | if isinstance(order_series, (list, tuple)): 28 | sorter = pd.concat(order_series, axis=1) 29 | sorter_columns = ['_sorter' + str(i) for i in range(len(order_series))] 30 | sorter.columns = sorter_columns 31 | sorter['series'] = series.values 32 | sorted_series = sorter.sort_values(sorter_columns)['series'] 33 | return sorted_series 34 | else: 35 | sorted_series = pd.DataFrame({ 36 | 'series': series.values, 37 | 'order': order_series.values 38 | }).sort_values('order', ascending=True)['series'] 39 | return sorted_series 40 | 41 | 42 | @make_symbolic 43 | def desc(series): 44 | """ 45 | Mimics the functionality of the R desc function. Essentially inverts a 46 | series object to make ascending sort act like descending sort. 
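Because the inversion is implemented with `series.rank(method='min', ascending=False)`, the result can be passed anywhere an ascending sort is later applied, such as inside `arrange` or as an `order_by` argument.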
47 | 
48 | Args:
49 | series (:obj:`pandas.Series`): pandas series to be inverted prior to
50 | ordering/sorting.
51 | 
52 | Returns:
53 | inverted `pandas.Series`. The returned series holds numeric ranks (floats),
54 | regardless of the type of the original series.
55 | 
56 | Example:
57 | 
58 | First group by cut, then find the first value of price when ordering by
59 | price ascending, and when ordering by price descending using the `desc` function.
60 | 
61 | diamonds >> group_by(X.cut) >> summarize(price_low=first(X.price, order_by=X.price),
62 | price_high=first(X.price, order_by=desc(X.price)))
63 | 
64 | cut price_high price_low
65 | 0 Fair 18574 337
66 | 1 Good 18788 327
67 | 2 Ideal 18806 326
68 | 3 Premium 18823 326
69 | 4 Very Good 18818 336
70 | """
71 | 
72 | return series.rank(method='min', ascending=False)
73 | 
74 | 
75 | # ------------------------------------------------------------------------------
76 | # coalesce
77 | # ------------------------------------------------------------------------------
78 | 
79 | @make_symbolic
80 | def coalesce(*series):
81 | """
82 | Takes the first non-NaN value in order across the specified series,
83 | returning a new series. Mimics the coalesce function in dplyr and SQL.
84 | 
85 | Args:
86 | *series: Series objects, typically represented in their symbolic form
87 | (like X.series).
88 | 
89 | Example:
90 | df = pd.DataFrame({
91 | 'a':[1,np.nan,np.nan,np.nan,np.nan],
92 | 'b':[2,3,np.nan,np.nan,np.nan],
93 | 'c':[np.nan,np.nan,4,5,np.nan],
94 | 'd':[6,7,8,9,np.nan]
95 | })
96 | df >> transmute(coal=coalesce(X.a, X.b, X.c, X.d))
97 | 
98 | coal
99 | 0 1.0
100 | 1 3.0
101 | 2 4.0
102 | 3 5.0
103 | 4 NaN
104 | """
105 | 
106 | series = [pd.Series(s) for s in series]
107 | coalescer = pd.concat(series, axis=1)
108 | min_nonna = np.argmin(pd.isnull(coalescer).values, axis=1)
109 | min_nonna = [coalescer.columns[i] for i in min_nonna]
110 | return coalescer.lookup(np.arange(coalescer.shape[0]), min_nonna)
111 | 
112 | 
113 | # ------------------------------------------------------------------------------
114 | # case_when
115 | # ------------------------------------------------------------------------------
116 | 
117 | @make_symbolic
118 | def case_when(*conditions):
119 | """
120 | Functions as a switch statement, creating a new series out of logical
121 | conditions specified by 2-item lists where the left-hand item is the
122 | logical condition and the right-hand item is the value where that
123 | condition is true.
124 | 
125 | Conditions should go from the most specific to the most general. A
126 | conditional that appears earlier in the argument list will "overwrite" one that
127 | appears later. Think of it like a series of if-else statements.
128 | 
129 | The logicals and values of the condition pairs must all be the same
130 | length, or length 1. Logicals can be vectors of booleans or a single
131 | boolean (`True`, for example, can be the logical statement for the
132 | final conditional to catch all remaining cases).
133 | 
134 | Args:
135 | *conditions: Each condition should be a list with two values. The first
136 | value is a boolean or vector of booleans that specify indices in
137 | which the condition is met. The second value is a vector of values
138 | or single value specifying the outcome where that condition is met.
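Returns: a series the same length as the condition vectors. Rows that satisfy none of the conditions come out as `np.nan`, since each outcome is masked with `np.nan` where its logical is false and the masked outcomes are then combined with `coalesce`.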
139 | 
140 | Example:
141 | df = pd.DataFrame({
142 | 'num':np.arange(16)
143 | })
144 | df >> mutate(strnum=case_when([X.num % 15 == 0, 'fizzbuzz'],
145 | [X.num % 3 == 0, 'fizz'],
146 | [X.num % 5 == 0, 'buzz'],
147 | [True, X.num.astype(str)]))
148 | 
149 | num strnum
150 | 0 0 fizzbuzz
151 | 1 1 1
152 | 2 2 2
153 | 3 3 fizz
154 | 4 4 4
155 | 5 5 buzz
156 | 6 6 fizz
157 | 7 7 7
158 | 8 8 8
159 | 9 9 fizz
160 | 10 10 buzz
161 | 11 11 11
162 | 12 12 fizz
163 | 13 13 13
164 | 14 14 14
165 | 15 15 fizzbuzz
166 | """
167 | 
168 | lengths = []
169 | for logical, outcome in conditions:
170 | if isinstance(logical, collections.Iterable):
171 | lengths.append(len(logical))
172 | if isinstance(outcome, collections.Iterable) and not isinstance(outcome, str):
173 | lengths.append(len(outcome))
174 | unique_lengths = np.unique(lengths)
175 | assert len(unique_lengths) == 1
176 | output_len = unique_lengths[0]
177 | 
178 | output = []
179 | for logical, outcome in conditions:
180 | if isinstance(logical, bool):
181 | logical = np.repeat(logical, output_len)
182 | if isinstance(logical, pd.Series):
183 | logical = logical.values
184 | if not isinstance(outcome, collections.Iterable) or isinstance(outcome, str):
185 | outcome = pd.Series(np.repeat(outcome, output_len))
186 | outcome[~logical] = np.nan
187 | output.append(outcome)
188 | 
189 | return coalesce(*output)
190 | 
191 | 
192 | # ------------------------------------------------------------------------------
193 | # if_else
194 | # ------------------------------------------------------------------------------
195 | 
196 | @make_symbolic
197 | def if_else(condition, when_true, otherwise):
198 | """
199 | Wraps creation of a series based on if-else conditional logic into a function
200 | call.
201 | 
202 | Provide a boolean vector condition, value(s) when true, and value(s)
203 | when false, and a vector the same length as the condition will be
204 | returned, filled element-wise according to the logical statement.
205 | 
206 | Args:
207 | condition: A boolean vector representing the condition. This is often
208 | a logical statement with a symbolic series.
209 | when_true: A vector the same length as the condition vector or a single
210 | value to apply when the condition is `True`.
211 | otherwise: A vector the same length as the condition vector or a single
212 | value to apply when the condition is `False`.
213 | 
214 | Example:
215 | diamonds >> mutate(size=if_else(X.carat > 1.0, 'big', 'small'))
216 | """
217 | 
218 | if not isinstance(when_true, collections.Iterable) or isinstance(when_true, str):
219 | when_true = np.repeat(when_true, len(condition))
220 | if not isinstance(otherwise, collections.Iterable) or isinstance(otherwise, str):
221 | otherwise = np.repeat(otherwise, len(condition))
222 | assert (len(condition) == len(when_true)) and (len(condition) == len(otherwise))
223 | 
224 | if isinstance(when_true, pd.Series):
225 | when_true = when_true.values
226 | if isinstance(otherwise, pd.Series):
227 | otherwise = otherwise.values
228 | 
229 | output = np.array([when_true[i] if c else otherwise[i]
230 | for i, c in enumerate(condition)])
231 | return output
232 | 
233 | 
234 | # ------------------------------------------------------------------------------
235 | # na_if
236 | # ------------------------------------------------------------------------------
237 | 
238 | @make_symbolic
239 | def na_if(series, *values):
240 | """
241 | If values in a series match a specified value, change them to `np.nan`.
242 | 
243 | Args:
244 | series: Series or vector, often symbolic.
245 | *values: Value(s) to convert to `np.nan` in the series.
246 | """
247 | 
248 | series = pd.Series(series)
249 | series[series.isin(values)] = np.nan
250 | return series
251 | 
--------------------------------------------------------------------------------
/dfply/join.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | 
3 | 
4 | # ------------------------------------------------------------------------------
5 | # SQL-style joins
6 | # ------------------------------------------------------------------------------
7 | 
8 | def get_join_parameters(join_kwargs):
9 | """
10 | Convenience function to determine the columns to join the right and
11 | left DataFrames on, as well as any suffixes for the columns.
12 | """
13 | 
14 | by = join_kwargs.get('by', None)
15 | suffixes = join_kwargs.get('suffixes', ('_x', '_y'))
16 | if isinstance(by, tuple):
17 | left_on, right_on = by
18 | elif isinstance(by, list):
19 | by = [x if isinstance(x, tuple) else (x, x) for x in by]
20 | left_on, right_on = (list(x) for x in zip(*by))
21 | else:
22 | left_on, right_on = by, by
23 | return left_on, right_on, suffixes
24 | 
25 | 
26 | @pipe
27 | def inner_join(df, other, **kwargs):
28 | """
29 | Joins on values present in both DataFrames.
30 | 
31 | Args:
32 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
33 | other (pandas.DataFrame): Right DataFrame
34 | 
35 | Kwargs:
36 | by (str or list): Columns to join on. If a single string, will join
37 | on that column. If a list of strings and/or `(left, right)` tuples,
38 | the corresponding left/right columns to join on.
39 | suffixes (list): String suffixes to append to column names in left
40 | and right DataFrames.
41 | 
42 | Example:
43 | a >> inner_join(b, by='x1')
44 | 
45 | x1 x2 x3
46 | 0 A 1 True
47 | 1 B 2 False
48 | """
49 | 
50 | left_on, right_on, suffixes = get_join_parameters(kwargs)
51 | joined = df.merge(other, how='inner', left_on=left_on,
52 | right_on=right_on, suffixes=suffixes)
53 | return joined
54 | 
55 | 
56 | @pipe
57 | def full_join(df, other, **kwargs):
58 | """
59 | Joins on values present in either DataFrame. (Alternate to `outer_join`)
60 | 
61 | Args:
62 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
63 | other (pandas.DataFrame): Right DataFrame
64 | 
65 | Kwargs:
66 | by (str or list): Columns to join on. If a single string, will join
67 | on that column. If a list of strings and/or `(left, right)` tuples,
68 | the corresponding left/right columns to join on.
69 | suffixes (list): String suffixes to append to column names in left
70 | and right DataFrames.
71 | 
72 | Example:
73 | a >> full_join(b, by='x1')
74 | 
75 | x1 x2 x3
76 | 0 A 1.0 True
77 | 1 B 2.0 False
78 | 2 C 3.0 NaN
79 | 3 D NaN True
80 | """
81 | 
82 | left_on, right_on, suffixes = get_join_parameters(kwargs)
83 | joined = df.merge(other, how='outer', left_on=left_on,
84 | right_on=right_on, suffixes=suffixes)
85 | return joined
86 | 
87 | 
88 | @pipe
89 | def outer_join(df, other, **kwargs):
90 | """
91 | Joins on values present in either DataFrame. (Alternate to `full_join`)
92 | 
93 | Args:
94 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
95 | other (pandas.DataFrame): Right DataFrame
96 | 
97 | Kwargs:
98 | by (str or list): Columns to join on. If a single string, will join
99 | on that column. If a list of strings and/or `(left, right)` tuples,
100 | the corresponding left/right columns to join on.
101 | suffixes (list): String suffixes to append to column names in left
102 | and right DataFrames.
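Note: a 2-tuple such as `by=('x1', 'x2')` joins the left DataFrame on `x1` and the right DataFrame on `x2` (see `get_join_parameters`).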
103 | 
104 | Example:
105 | a >> outer_join(b, by='x1')
106 | 
107 | x1 x2 x3
108 | 0 A 1.0 True
109 | 1 B 2.0 False
110 | 2 C 3.0 NaN
111 | 3 D NaN True
112 | """
113 | 
114 | left_on, right_on, suffixes = get_join_parameters(kwargs)
115 | joined = df.merge(other, how='outer', left_on=left_on,
116 | right_on=right_on, suffixes=suffixes)
117 | return joined
118 | 
119 | 
120 | @pipe
121 | def left_join(df, other, **kwargs):
122 | """
123 | Joins on values present in the left DataFrame.
124 | 
125 | Args:
126 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
127 | other (pandas.DataFrame): Right DataFrame
128 | 
129 | Kwargs:
130 | by (str or list): Columns to join on. If a single string, will join
131 | on that column. If a list of strings and/or `(left, right)` tuples,
132 | the corresponding left/right columns to join on.
133 | suffixes (list): String suffixes to append to column names in left
134 | and right DataFrames.
135 | 
136 | Example:
137 | a >> left_join(b, by='x1')
138 | 
139 | x1 x2 x3
140 | 0 A 1 True
141 | 1 B 2 False
142 | 2 C 3 NaN
143 | """
144 | 
145 | left_on, right_on, suffixes = get_join_parameters(kwargs)
146 | joined = df.merge(other, how='left', left_on=left_on,
147 | right_on=right_on, suffixes=suffixes)
148 | return joined
149 | 
150 | 
151 | @pipe
152 | def right_join(df, other, **kwargs):
153 | """
154 | Joins on values present in the right DataFrame.
155 | 
156 | Args:
157 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
158 | other (pandas.DataFrame): Right DataFrame
159 | 
160 | Kwargs:
161 | by (str or list): Columns to join on. If a single string, will join
162 | on that column. If a list of strings and/or `(left, right)` tuples,
163 | the corresponding left/right columns to join on.
164 | suffixes (list): String suffixes to append to column names in left
165 | and right DataFrames.
166 | 
167 | Example:
168 | a >> right_join(b, by='x1')
169 | 
170 | x1 x2 x3
171 | 0 A 1.0 True
172 | 1 B 2.0 False
173 | 2 D NaN True
174 | """
175 | 
176 | left_on, right_on, suffixes = get_join_parameters(kwargs)
177 | joined = df.merge(other, how='right', left_on=left_on,
178 | right_on=right_on, suffixes=suffixes)
179 | return joined
180 | 
181 | 
182 | @pipe
183 | def semi_join(df, other, **kwargs):
184 | """
185 | Returns all of the rows in the left DataFrame that have a match
186 | in the right DataFrame.
187 | 
188 | Args:
189 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
190 | other (pandas.DataFrame): Right DataFrame
191 | 
192 | Kwargs:
193 | by (str or list): Columns to join on. If a single string, will join
194 | on that column. If a list of strings and/or `(left, right)` tuples,
195 | the corresponding left/right columns to join on.
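If `by` is omitted, the join is performed on all columns shared by the two DataFrames.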
196 | 
197 | Example:
198 | a >> semi_join(b, by='x1')
199 | 
200 | x1 x2
201 | 0 A 1
202 | 1 B 2
203 | """
204 | 
205 | left_on, right_on, suffixes = get_join_parameters(kwargs)
206 | if not right_on:
207 | right_on = [col_name for col_name in df.columns.values.tolist() if col_name in other.columns.values.tolist()]
208 | left_on = right_on
209 | elif not isinstance(right_on, (list, tuple)):
210 | right_on = [right_on]
211 | other_reduced = other[right_on].drop_duplicates()
212 | joined = df.merge(other_reduced, how='inner', left_on=left_on,
213 | right_on=right_on, suffixes=('', '_y'),
214 | indicator=True).query('_merge=="both"')[df.columns.values.tolist()]
215 | return joined
216 | 
217 | 
218 | @pipe
219 | def anti_join(df, other, **kwargs):
220 | """
221 | Returns all of the rows in the left DataFrame that do not have a
222 | match in the right DataFrame.
223 | 
224 | Args:
225 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
226 | other (pandas.DataFrame): Right DataFrame
227 | 
228 | Kwargs:
229 | by (str or list): Columns to join on. If a single string, will join
230 | on that column. If a list of strings and/or `(left, right)` tuples,
231 | the corresponding left/right columns to join on.
232 | 
233 | Example:
234 | a >> anti_join(b, by='x1')
235 | 
236 | x1 x2
237 | 2 C 3
238 | """
239 | 
240 | left_on, right_on, suffixes = get_join_parameters(kwargs)
241 | if not right_on:
242 | right_on = [col_name for col_name in df.columns.values.tolist() if col_name in other.columns.values.tolist()]
243 | left_on = right_on
244 | elif not isinstance(right_on, (list, tuple)):
245 | right_on = [right_on]
246 | other_reduced = other[right_on].drop_duplicates()
247 | joined = df.merge(other_reduced, how='left', left_on=left_on,
248 | right_on=right_on, suffixes=('', '_y'),
249 | indicator=True).query('_merge=="left_only"')[df.columns.values.tolist()]
250 | return joined
251 | 
252 | 
253 | # ------------------------------------------------------------------------------
254 | # Binding
255 | # ------------------------------------------------------------------------------
256 | 
257 | @pipe
258 | def bind_rows(df, other, join='outer', ignore_index=False):
259 | """
260 | Binds DataFrames "vertically", stacking them together. This is equivalent
261 | to `pd.concat` with `axis=0`.
262 | 
263 | Args:
264 | df (pandas.DataFrame): Top DataFrame (passed in via pipe).
265 | other (pandas.DataFrame): Bottom DataFrame.
266 | 
267 | Kwargs:
268 | join (str): One of `"outer"` or `"inner"`. Outer join will preserve
269 | columns not present in both DataFrames, whereas inner joining will
270 | drop them.
271 | ignore_index (bool): If `True`, relabel the result with a fresh 0..n-1
272 | index rather than keeping the original indices (defaults to `False`).
273 | """
274 | 
275 | df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=0)
276 | return df
277 | 
278 | 
279 | @pipe
280 | def bind_cols(df, other, join='outer', ignore_index=False):
281 | """
282 | Binds DataFrames "horizontally". This is equivalent to `pd.concat` with
283 | `axis=1`.
284 | 
285 | Args:
286 | df (pandas.DataFrame): Left DataFrame (passed in via pipe).
287 | other (pandas.DataFrame): Right DataFrame.
288 | 
289 | Kwargs:
290 | join (str): One of `"outer"` or `"inner"`. Outer join will preserve
291 | rows not present in both DataFrames, whereas inner joining will
292 | drop them.
293 | ignore_index (bool): If `True`, relabel the result with a fresh 0..n-1
294 | index rather than keeping the original indices (defaults to `False`).
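Example (an illustrative sketch; `a` and `b` are hypothetical frames with aligned indices):

    a = pd.DataFrame({'x1': ['A', 'B'], 'x2': [1, 2]})
    b = pd.DataFrame({'x3': [True, False]})
    a >> bind_cols(b)

      x1  x2     x3
    0  A   1   True
    1  B   2  False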
295 | """ 296 | 297 | df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=1) 298 | return df 299 | -------------------------------------------------------------------------------- /test/test_select.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | ##============================================================================== 6 | ## select and drop test functions 7 | ##============================================================================== 8 | 9 | # 0 1 2 3 4 5 6 7 8 9 10 | # carat cut color clarity depth table price x y z 11 | # 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 12 | 13 | def test_select(): 14 | df = diamonds[['carat','cut','price']] 15 | assert df.equals(diamonds >> select('carat','cut','price')) 16 | assert df.equals(diamonds >> select(0, 1, 6)) 17 | assert df.equals(diamonds >> select(0, 1, 'price')) 18 | assert df.equals(diamonds >> select([0, X.cut], X.price)) 19 | assert df.equals(diamonds >> select(X.carat, X['cut'], X.price)) 20 | assert df.equals(diamonds >> select(X[['carat','cut','price']])) 21 | assert df.equals(diamonds >> select(X[['carat','cut']], X.price)) 22 | assert df.equals(diamonds >> select(X.iloc[:,[0,1,6]])) 23 | assert df.equals(diamonds >> select([X.loc[:, ['carat','cut','price']]])) 24 | 25 | 26 | def test_select_inversion(): 27 | df = diamonds.iloc[:, 3:] 28 | d = diamonds >> select(~X.carat, ~X.cut, ~X.color) 29 | print(df.head()) 30 | print(d.head()) 31 | assert df.equals(d) 32 | 33 | 34 | def test_drop(): 35 | df = diamonds.drop(['carat','cut','price'], axis=1) 36 | assert df.equals(diamonds >> drop('carat','cut','price')) 37 | assert df.equals(diamonds >> drop(0, 1, 6)) 38 | assert df.equals(diamonds >> drop(0, 1, 'price')) 39 | assert df.equals(diamonds >> drop([0, X.cut], X.price)) 40 | assert df.equals(diamonds >> drop(X.carat, X['cut'], X.price)) 41 | assert df.equals(diamonds >> drop(X[['carat','cut','price']])) 42 | assert df.equals(diamonds >> drop(X[['carat','cut']], X.price)) 43 | assert df.equals(diamonds >> drop(X.iloc[:,[0,1,6]])) 44 | assert df.equals(diamonds >> drop([X.loc[:, ['carat','cut','price']]])) 45 | 46 | 47 | def test_select_containing(): 48 | df = diamonds[['carat','cut','color','clarity','price']] 49 | assert df.equals(diamonds >> select(contains('c'))) 50 | 51 | 52 | def test_drop_containing(): 53 | df = diamonds[['depth','table','x','y','z']] 54 | assert df.equals(diamonds >> drop(contains('c'))) 55 | 56 | 57 | def test_select_matches(): 58 | df = diamonds[['carat','cut','color','clarity','price']] 59 | assert df.equals(diamonds >> select(matches('^c[auol]|pri'))) 60 | 61 | 62 | def test_drop_matches(): 63 | df = diamonds[['depth','table','x','y','z']] 64 | assert df.equals(diamonds >> drop(matches('^c[auol]|p.i'))) 65 | 66 | 67 | def test_select_startswith(): 68 | df = diamonds[['carat','cut','color','clarity']] 69 | assert df.equals(diamonds >> select(starts_with('c'))) 70 | 71 | 72 | def test_drop_startswith(): 73 | df = diamonds[['depth','table','price','x','y','z']] 74 | assert df.equals(diamonds >> drop(starts_with('c'))) 75 | 76 | 77 | def test_select_endswith(): 78 | df = diamonds[['table','price']] 79 | assert df.equals(diamonds >> select(ends_with('e'))) 80 | 81 | 82 | def test_drop_endswith(): 83 | df = diamonds.drop('z', axis=1) 84 | assert df.equals(diamonds >> drop(ends_with('z'))) 85 | 86 | 87 | def test_select_between(): 88 | df = diamonds[['cut','color','clarity']] 89 | assert 
df.equals(diamonds >> select(columns_between(X.cut, X.clarity))) 90 | assert df.equals(diamonds >> select(columns_between('cut', 'clarity'))) 91 | assert df.equals(diamonds >> select(columns_between(1, 3))) 92 | 93 | df = diamonds[['x','y','z']] 94 | assert df.equals(diamonds >> select(columns_between('x', 20))) 95 | 96 | 97 | 98 | def test_drop_between(): 99 | df = diamonds[['carat','z']] 100 | assert df.equals(diamonds >> drop(columns_between('cut','y'))) 101 | assert df.equals(diamonds >> drop(columns_between(X.cut, 8))) 102 | 103 | df = diamonds[['carat','cut']] 104 | assert df.equals(diamonds >> drop(columns_between(X.color, 20))) 105 | 106 | 107 | def test_select_from(): 108 | df = diamonds[['x','y','z']] 109 | assert df.equals(diamonds >> select(columns_from('x'))) 110 | assert df.equals(diamonds >> select(columns_from(X.x))) 111 | assert df.equals(diamonds >> select(columns_from(7))) 112 | 113 | assert diamonds[[]].equals(diamonds >> select(columns_from(100))) 114 | 115 | 116 | def test_drop_from(): 117 | df = diamonds[['carat','cut']] 118 | assert df.equals(diamonds >> drop(columns_from('color'))) 119 | assert df.equals(diamonds >> drop(columns_from(X.color))) 120 | assert df.equals(diamonds >> drop(columns_from(2))) 121 | 122 | #print(diamonds >> drop(columns_from(0))) 123 | assert diamonds[[]].equals(diamonds >> drop(columns_from(0))) 124 | 125 | 126 | def test_select_to(): 127 | df = diamonds[['carat','cut']] 128 | assert df.equals(diamonds >> select(columns_to('color'))) 129 | assert df.equals(diamonds >> select(columns_to(X.color))) 130 | assert df.equals(diamonds >> select(columns_to(2))) 131 | 132 | 133 | def test_drop_to(): 134 | df = diamonds[['x','y','z']] 135 | assert df.equals(diamonds >> drop(columns_to('x'))) 136 | assert df.equals(diamonds >> drop(columns_to(X.x))) 137 | assert df.equals(diamonds >> drop(columns_to(7))) 138 | 139 | 140 | def select_through(): 141 | df = diamonds[['carat','cut','color']] 142 | assert df.equals(diamonds >> select(columns_to('color', inclusive=True))) 143 | assert df.equals(diamonds >> select(columns_to(X.color, inclusive=True))) 144 | assert df.equals(diamonds >> select(columns_to(2, inclusive=True))) 145 | 146 | 147 | def drop_through(): 148 | df = diamonds[['y','z']] 149 | assert df.equals(diamonds >> drop(columns_to('x', inclusive=True))) 150 | assert df.equals(diamonds >> drop(columns_to(X.x, inclusive=True))) 151 | assert df.equals(diamonds >> drop(columns_to(7, inclusive=True))) 152 | 153 | 154 | 155 | def test_select_if(): 156 | # test 1: manually build diamonds subset where columns are numeric and 157 | # mean is greater than 3 158 | cols = list() 159 | for col in diamonds: 160 | try: 161 | if mean(diamonds[col]) > 3: 162 | cols.append(col) 163 | except: 164 | pass 165 | df_if = diamonds[cols] 166 | assert df_if.equals(diamonds >> select_if(lambda col: mean(col) > 3)) 167 | # test 2: use and 168 | cols = list() 169 | for col in diamonds: 170 | try: 171 | if mean(diamonds[col]) > 3 and max(diamonds[col]) < 50: 172 | cols.append(col) 173 | except: 174 | pass 175 | df_if = diamonds[cols] 176 | assert df_if.equals(diamonds >> select_if(lambda col: mean(col) > 3 and max(col) < 50)) 177 | # test 3: use or 178 | cols = list() 179 | for col in diamonds: 180 | try: 181 | if mean(diamonds[col]) > 3 or max(diamonds[col]) < 6: 182 | cols.append(col) 183 | except: 184 | pass 185 | df_if = diamonds[cols] 186 | assert df_if.equals(diamonds >> select_if(lambda col: mean(col) > 3 or max(col) < 6)) 187 | # test 4: string operations - contain a 
specific string 188 | cols = list() 189 | for col in diamonds: 190 | try: 191 | if any(diamonds[col].str.contains('Ideal')): 192 | cols.append(col) 193 | except: 194 | pass 195 | df_if = diamonds[cols] 196 | assert df_if.equals(diamonds >> select_if(lambda col: any(col.str.contains('Ideal')))) 197 | # test 5: get any text columns 198 | # uses the special '.' regex symbol to find any text value 199 | cols = list() 200 | for col in diamonds: 201 | try: 202 | if any(diamonds[col].str.contains('.')): 203 | cols.append(col) 204 | except: 205 | pass 206 | df_if = diamonds[cols] 207 | assert df_if.equals(diamonds >> select_if(lambda col: any(col.str.contains('.')))) 208 | 209 | 210 | def test_drop_if(): 211 | # test 1: returns a dataframe where any column does not have a mean greater than 3 212 | # this means numeric columns with mean less than 3, and also any non-numeric column 213 | # (since it does not have a mean) 214 | cols = list() 215 | for col in diamonds: 216 | try: 217 | if mean(diamonds[col]) > 3: 218 | cols.append(col) 219 | except: 220 | pass 221 | inverse_cols = [col for col in diamonds if col not in cols] 222 | df_if = diamonds[inverse_cols] 223 | assert df_if.equals(diamonds >> drop_if(lambda col: mean(col) > 3)) 224 | # test 2: use and 225 | # return colums where both conditions are false: 226 | # the mean greater than 3, and max < 50 227 | # again, this will include non-numeric columns 228 | cols = list() 229 | for col in diamonds: 230 | try: 231 | if mean(diamonds[col]) > 3 and max(diamonds[col]) < 50: 232 | cols.append(col) 233 | except: 234 | pass 235 | inverse_cols = [col for col in diamonds if col not in cols] 236 | df_if = diamonds[inverse_cols] 237 | assert df_if.equals(diamonds >> drop_if(lambda col: mean(col) > 3 and max(col) < 50)) 238 | # test 3: use or 239 | # this will return a dataframe where either of the two conditions are false: 240 | # the mean is greater than 3, or the max < 6 241 | cols = list() 242 | for col in diamonds: 243 | try: 244 | if mean(diamonds[col]) > 3 or max(diamonds[col]) < 6: 245 | cols.append(col) 246 | except: 247 | pass 248 | inverse_cols = [col for col in diamonds if col not in cols] 249 | df_if = diamonds[inverse_cols] 250 | assert df_if.equals(diamonds >> drop_if(lambda col: mean(col) > 3 or max(col) < 6)) 251 | # test 4: string operations - contain a specific string 252 | # this will drop any columns if they contain the word 'Ideal' 253 | cols = list() 254 | for col in diamonds: 255 | try: 256 | if any(diamonds[col].str.contains('Ideal')): 257 | cols.append(col) 258 | except: 259 | pass 260 | inverse_cols = [col for col in diamonds if col not in cols] 261 | df_if = diamonds[inverse_cols] 262 | assert df_if.equals(diamonds >> drop_if(lambda col: any(col.str.contains('Ideal')))) 263 | # test 5: drop any text columns 264 | # uses the special '.' 
regex symbol to find any text value 265 | cols = list() 266 | for col in diamonds: 267 | try: 268 | if any(diamonds[col].str.contains('.')): 269 | cols.append(col) 270 | except: 271 | pass 272 | inverse_cols = [col for col in diamonds if col not in cols] 273 | df_if = diamonds[inverse_cols] 274 | assert df_if.equals(diamonds >> drop_if(lambda col: any(col.str.contains('.')))) 275 | -------------------------------------------------------------------------------- /dfply/base.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import warnings 4 | from functools import partial, wraps 5 | 6 | 7 | def _recursive_apply(f, l): 8 | if isinstance(l, (list, tuple)): 9 | out = [_recursive_apply(f, l_) for l_ in l] 10 | if isinstance(l, tuple): 11 | out = tuple(out) 12 | return out 13 | else: 14 | return f(l) 15 | 16 | 17 | def contextualize(arg, context): 18 | if isinstance(arg, Intention): 19 | arg = arg.evaluate(context) 20 | return arg 21 | 22 | 23 | def flatten(l): 24 | for el in l: 25 | if isinstance(el, (tuple, list)): 26 | yield from flatten(el) 27 | else: 28 | yield el 29 | 30 | 31 | def _check_delayed_eval(args, kwargs): 32 | check = lambda x: isinstance(x, Intention) 33 | delay = any([a for a in flatten(_recursive_apply(check, args))]) 34 | delay = delay or any([v for v in flatten(_recursive_apply(check, list(kwargs.values())))]) 35 | return delay 36 | 37 | 38 | def _context_args(args): 39 | return lambda x: _recursive_apply(partial(contextualize, context=x), args) 40 | 41 | 42 | def _context_kwargs(kwargs): 43 | values_ = lambda x: _recursive_apply(partial(contextualize, context=x), 44 | list(kwargs.values())) 45 | return lambda x: {k: v for k, v in zip(kwargs.keys(), values_(x))} 46 | 47 | 48 | def _delayed_function(function, args, kwargs): 49 | return lambda x: function(*_context_args(args)(x), 50 | **_context_kwargs(kwargs)(x)) 51 | 52 | 53 | def make_symbolic(f): 54 | def wrapper(*args, **kwargs): 55 | delay = _check_delayed_eval(args, kwargs) 56 | if delay: 57 | delayed = _delayed_function(f, args, kwargs) 58 | return Intention(delayed) 59 | else: 60 | return f(*args, **kwargs) 61 | 62 | return wrapper 63 | 64 | 65 | class Intention(object): 66 | def __init__(self, function=lambda x: x, invert=False): 67 | self.function = function 68 | self.inverted = invert 69 | 70 | def evaluate(self, context): 71 | return self.function(context) 72 | 73 | def __getattr__(self, attribute): 74 | return Intention(lambda x: getattr(self.function(x), attribute), 75 | invert=self.inverted) 76 | 77 | def __invert__(self): 78 | return Intention(self.function, invert=not self.inverted) 79 | 80 | def __call__(self, *args, **kwargs): 81 | return Intention(lambda x: self.function(x)(*_context_args(args)(x), 82 | **_context_kwargs(kwargs)(x)), 83 | invert=self.inverted) 84 | 85 | 86 | _magic_method_names = [ 87 | '__abs__', '__add__', '__and__', '__cmp__', '__complex__', '__contains__', 88 | '__delattr__', '__delete__', '__delitem__', '__delslice__', '__div__', 89 | '__divmod__', '__enter__', '__eq__', '__exit__', '__float__', 90 | '__floordiv__', '__ge__', '__get__', '__getitem__', '__getslice__', 91 | '__gt__', '__hash__', '__hex__', '__iadd__', '__iand__', '__idiv__', 92 | '__ifloordiv__', '__ilshift__', '__imod__', '__imul__', '__index__', 93 | '__int__', '__ior__', '__ipow__', '__irshift__', '__isub__', 94 | '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__long__', 95 | '__lshift__', '__lt__', '__mod__', 
'__mul__', '__ne__', '__neg__',
96 | '__nonzero__', '__oct__', '__or__', '__pos__', '__pow__', '__radd__',
97 | '__rand__', '__rcmp__', '__rdiv__', '__rdivmod__', # '__repr__',
98 | '__reversed__', '__rfloordiv__', '__rlshift__', '__rmod__', '__rmul__',
99 | '__ror__', '__rpow__', '__rrshift__', '__rshift__', '__rsub__',
100 | '__rtruediv__', '__rxor__', '__set__', '__setitem__', '__setslice__',
101 | '__sub__', '__truediv__', '__unicode__', '__xor__', '__str__',
102 | ]
103 | 
104 | 
105 | def _set_magic_method(name):
106 | def magic_method(self, *args, **kwargs):
107 | return Intention(lambda x: getattr(self.function(x), name)(*_context_args(args)(x),
108 | **_context_kwargs(kwargs)(x)),
109 | invert=self.inverted)
110 | 
111 | return magic_method
112 | 
113 | 
114 | for name in _magic_method_names:
115 | setattr(Intention, name, _set_magic_method(name))
116 | 
117 | # Initialize the global X symbol
118 | X = Intention()
119 | 
120 | 
121 | class pipe(object):
122 | __name__ = "pipe"
123 | 
124 | def __init__(self, function):
125 | self.function = function
126 | self.__doc__ = function.__doc__
127 | 
128 | self.chained_pipes = []
129 | 
130 | def __rshift__(self, other):
131 | assert isinstance(other, pipe)
132 | self.chained_pipes.append(other)
133 | return self
134 | 
135 | def __rrshift__(self, other):
136 | other_copy = other.copy()
137 | 
138 | with warnings.catch_warnings():
139 | warnings.simplefilter("ignore")
140 | other_copy._grouped_by = getattr(other, '_grouped_by', None)
141 | 
142 | result = self.function(other_copy)
143 | 
144 | for p in self.chained_pipes:
145 | result = p.__rrshift__(result)
146 | return result
147 | 
148 | def __call__(self, *args, **kwargs):
149 | return pipe(lambda x: self.function(x, *args, **kwargs))
150 | 
151 | 
152 | class IntentionEvaluator(object):
153 | """
154 | Parent class for symbolic argument decorators.
155 | Default behavior is to recursively turn the arguments and keyword
156 | arguments of a decorated function into delayed `Intention` objects that
157 | can be evaluated against a pandas DataFrame as it comes down a pipe.
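Three per-argument evaluation modes are supported: plain symbolic evaluation (the `Intention` is simply called on the DataFrame), label evaluation (the result is additionally converted to column name(s)), and selector evaluation (the result is converted to a +/-1 selection vector over the columns, used for column selection).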
158 | """ 159 | 160 | __name__ = "IntentionEvaluator" 161 | 162 | def __init__(self, function, eval_symbols=True, eval_as_label=[], 163 | eval_as_selector=[]): 164 | super(IntentionEvaluator, self).__init__() 165 | self.function = function 166 | self.__doc__ = function.__doc__ 167 | 168 | self.eval_symbols = eval_symbols 169 | self.eval_as_label = eval_as_label 170 | self.eval_as_selector = eval_as_selector 171 | 172 | def _evaluate(self, df, arg): 173 | if isinstance(arg, Intention): 174 | negate = arg.inverted 175 | arg = arg.evaluate(df) 176 | if negate: 177 | arg = ~arg 178 | return arg 179 | 180 | def _evaluate_label(self, df, arg): 181 | arg = self._evaluate(df, arg) 182 | 183 | cols = list(df.columns) 184 | if isinstance(arg, pd.Series): 185 | arg = arg.name 186 | if isinstance(arg, pd.Index): 187 | arg = list(arg) 188 | if isinstance(arg, int): 189 | arg = cols[arg] 190 | return arg 191 | 192 | def _evaluate_selector(self, df, arg): 193 | negate = False 194 | if isinstance(arg, Intention): 195 | negate = arg.inverted 196 | arg = arg.evaluate(df) 197 | 198 | cols = list(df.columns) 199 | if isinstance(arg, pd.Series): 200 | arg = [cols.index(arg.name)] 201 | if isinstance(arg, pd.Index): 202 | arg = [cols.index(i) for i in list(arg)] 203 | if isinstance(arg, pd.DataFrame): 204 | arg = [cols.index(i) for i in arg.columns] 205 | if isinstance(arg, int): 206 | arg = [arg] 207 | if isinstance(arg, str): 208 | arg = [cols.index(arg)] 209 | if isinstance(arg, (list, tuple)): 210 | arg = [cols.index(i) if isinstance(i, str) else i for i in arg] 211 | 212 | selection_vector = np.zeros(df.shape[1]) 213 | col_idx = np.array(arg) 214 | 215 | if negate and len(col_idx) > 0: 216 | selection_vector[col_idx] = -1 217 | elif len(col_idx) > 0: 218 | selection_vector[col_idx] = 1 219 | return selection_vector 220 | 221 | def _evaluator_loop(self, df, arg, eval_func): 222 | if isinstance(arg, (list, tuple)): 223 | return [self._evaluator_loop(df, a_, eval_func) for a_ in arg] 224 | else: 225 | return eval_func(df, arg) 226 | 227 | def _symbolic_eval(self, df, arg): 228 | return self._evaluator_loop(df, arg, self._evaluate) 229 | 230 | def _symbolic_to_label(self, df, arg): 231 | return self._evaluator_loop(df, arg, self._evaluate_label) 232 | 233 | def _symbolic_to_selector(self, df, arg): 234 | return self._evaluator_loop(df, arg, self._evaluate_selector) 235 | 236 | def _recursive_arg_eval(self, df, args): 237 | eval_symbols = self._find_eval_args(self.eval_symbols, args) 238 | eval_as_label = self._find_eval_args(self.eval_as_label, args) 239 | eval_as_selector = self._find_eval_args(self.eval_as_selector, args) 240 | 241 | return [ 242 | self._symbolic_to_label(df, a) if i in eval_as_label 243 | else self._symbolic_to_selector(df, a) if i in eval_as_selector 244 | else self._symbolic_eval(df, a) if i in eval_symbols 245 | else a 246 | for i, a in enumerate(args) 247 | ] 248 | 249 | def _recursive_kwarg_eval(self, df, kwargs): 250 | eval_symbols = self._find_eval_kwargs(self.eval_symbols, kwargs) 251 | eval_as_label = self._find_eval_kwargs(self.eval_as_label, kwargs) 252 | eval_as_selector = self._find_eval_kwargs(self.eval_as_selector, kwargs) 253 | 254 | return { 255 | k: (self._symbolic_to_label(df, v) if k in eval_as_label 256 | else self._symbolic_to_selector(df, v) if k in eval_as_selector 257 | else self._symbolic_eval(df, v) if k in eval_symbols 258 | else v) 259 | for k, v in kwargs.items() 260 | } 261 | 262 | def _find_eval_args(self, request, args): 263 | if (request == True) or ('*' in 
request):
264 | return [i for i in range(len(args))]
265 | elif request in [None, False]:
266 | return []
267 | return request
268 | 
269 | def _find_eval_kwargs(self, request, kwargs):
270 | if (request == True) or ('**' in request):
271 | return [k for k in kwargs.keys()]
272 | elif request in [None, False]:
273 | return []
274 | return request
275 | 
276 | def __call__(self, *args, **kwargs):
277 | df = args[0]
278 | 
279 | args = self._recursive_arg_eval(df, args[1:])
280 | kwargs = self._recursive_kwarg_eval(df, kwargs)
281 | 
282 | return self.function(df, *args, **kwargs)
283 | 
284 | 
285 | def symbolic_evaluation(function=None, eval_symbols=True, eval_as_label=[],
286 | eval_as_selector=[]):
287 | if function:
288 | return IntentionEvaluator(function)
289 | else:
290 | # `function` is None on this branch (the decorator was called with
291 | # arguments), so there is nothing to wrap yet; return the configured decorator.
292 | def wrapper(function):
293 | return IntentionEvaluator(function, eval_symbols=eval_symbols,
294 | eval_as_label=eval_as_label,
295 | eval_as_selector=eval_as_selector)
296 | return wrapper
297 | 
298 | 
299 | class group_delegation(object):
300 | __name__ = "group_delegation"
301 | 
302 | def __init__(self, function):
303 | self.function = function
304 | self.__doc__ = function.__doc__
305 | 
306 | def _apply(self, df, *args, **kwargs):
307 | grouped = df.groupby(df._grouped_by)
308 | 
309 | dff = grouped.apply(self.function, *args, **kwargs)
310 | # Save all the metadata attributes back into the new data frame
311 | for field in df._metadata:
312 | setattr(dff, field, getattr(df, field))
313 | df = dff
314 | 
315 | for name in df.index.names[:-1]:
316 | if name in df:
317 | df.reset_index(level=0, drop=True, inplace=True)
318 | else:
319 | df.reset_index(level=0, inplace=True)
320 | 
321 | if (df.index == 0).all():
322 | df.reset_index(drop=True, inplace=True)
323 | 
324 | return df
325 | 
326 | def __call__(self, *args, **kwargs):
327 | grouped_by = getattr(args[0], '_grouped_by', None)
328 | if (grouped_by is None) or not all([g in args[0].columns for g in grouped_by]):
329 | return self.function(*args, **kwargs)
330 | else:
331 | applied = self._apply(args[0], *args[1:], **kwargs)
332 | 
333 | with warnings.catch_warnings():
334 | warnings.simplefilter("ignore")
335 | applied._grouped_by = grouped_by
336 | 
337 | return applied
338 | 
339 | 
340 | def dfpipe(f):
341 | return pipe(
342 | group_delegation(
343 | symbolic_evaluation(f)
344 | )
345 | )
346 | 
--------------------------------------------------------------------------------
/dfply/reshape.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | import re
3 | 
4 | 
5 | # ------------------------------------------------------------------------------
6 | # Sorting
7 | # ------------------------------------------------------------------------------
8 | 
9 | @dfpipe
10 | def arrange(df, *args, **kwargs):
11 | """Calls `pandas.DataFrame.sort_values` to sort a DataFrame according to
12 | criteria.
13 | 
14 | See:
15 | http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
16 | 
17 | for a list of the keyword arguments accepted by `sort_values`; `arrange`
18 | passes them through unchanged.
19 | 
20 | Args:
21 | *args: Symbolic, string, integer or lists of those types indicating
22 | columns to sort the DataFrame by.
23 | 
24 | Kwargs:
25 | **kwargs: Any keyword arguments will be passed through to the pandas
26 | `DataFrame.sort_values` function.
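Example (illustrative calls adapted from the test suite; output omitted):

    diamonds >> arrange('depth', ascending=False) >> head(5)
    diamonds >> arrange(desc(X.cut), desc(X.price)) >> head(5)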
27 | """ 28 | 29 | flat_args = [a for a in flatten(args)] 30 | 31 | series = [df[arg] if isinstance(arg, str) else 32 | df.iloc[:, arg] if isinstance(arg, int) else 33 | pd.Series(arg) for arg in flat_args] 34 | 35 | sorter = pd.concat(series, axis=1).reset_index(drop=True) 36 | sorter = sorter.sort_values(sorter.columns.tolist(), **kwargs) 37 | return df.iloc[sorter.index, :] 38 | 39 | 40 | # ------------------------------------------------------------------------------ 41 | # Renaming 42 | # ------------------------------------------------------------------------------ 43 | 44 | @pipe 45 | @symbolic_evaluation(eval_as_label=True) 46 | def rename(df, **kwargs): 47 | """Renames columns, where keyword argument values are the current names 48 | of columns and keys are the new names. 49 | 50 | Args: 51 | df (:obj:`pandas.DataFrame`): DataFrame passed in via `>>` pipe. 52 | 53 | Kwargs: 54 | **kwargs: key:value pairs where keys are new names for columns and 55 | values are current names of columns. 56 | """ 57 | 58 | return df.rename(columns={v: k for k, v in kwargs.items()}) 59 | 60 | 61 | # ------------------------------------------------------------------------------ 62 | # Elongate 63 | # ------------------------------------------------------------------------------ 64 | 65 | @pipe 66 | @symbolic_evaluation(eval_as_label=['*']) 67 | def gather(df, key, values, *args, **kwargs): 68 | """ 69 | Melts the specified columns in your DataFrame into two key:value columns. 70 | 71 | Args: 72 | key (str): Name of identifier column. 73 | values (str): Name of column that will contain values for the key. 74 | *args (str, int, symbolic): Columns to "melt" into the new key and 75 | value columns. If no args are specified, all columns are melted 76 | into they key and value columns. 77 | 78 | Kwargs: 79 | add_id (bool): Boolean value indicating whether to add a `"_ID"` 80 | column that will preserve information about the original rows 81 | (useful for being able to re-widen the data later). 82 | 83 | Example: 84 | diamonds >> gather('variable', 'value', ['price', 'depth','x','y','z']) >> head(5) 85 | 86 | carat cut color clarity table variable value 87 | 0 0.23 Ideal E SI2 55.0 price 326.0 88 | 1 0.21 Premium E SI1 61.0 price 326.0 89 | 2 0.23 Good E VS1 65.0 price 327.0 90 | 3 0.29 Premium I VS2 58.0 price 334.0 91 | 4 0.31 Good J SI2 58.0 price 335.0 92 | """ 93 | 94 | if len(args) == 0: 95 | args = df.columns.tolist() 96 | else: 97 | args = [a for a in flatten(args)] 98 | 99 | if kwargs.get('add_id', False): 100 | df = df.assign(_ID=np.arange(df.shape[0])) 101 | 102 | columns = df.columns.tolist() 103 | id_vars = [col for col in columns if col not in args] 104 | return pd.melt(df, id_vars, list(args), key, values) 105 | 106 | 107 | # ------------------------------------------------------------------------------ 108 | # Widen 109 | # ------------------------------------------------------------------------------ 110 | 111 | def convert_type(df, columns): 112 | """ 113 | Helper function that attempts to convert columns into their appropriate 114 | data type. 
115 | """ 116 | # taken in part from the dplython package 117 | out_df = df.copy() 118 | for col in columns: 119 | column_values = pd.Series(out_df[col].unique()) 120 | column_values = column_values[~column_values.isnull()] 121 | # empty 122 | if len(column_values) == 0: 123 | continue 124 | # boolean 125 | if set(column_values.values) < {'True', 'False'}: 126 | out_df[col] = out_df[col].map({'True': True, 'False': False}) 127 | continue 128 | # numeric 129 | if pd.to_numeric(column_values, errors='coerce').isnull().sum() == 0: 130 | out_df[col] = pd.to_numeric(out_df[col], errors='ignore') 131 | continue 132 | # datetime 133 | if pd.to_datetime(column_values, errors='coerce').isnull().sum() == 0: 134 | out_df[col] = pd.to_datetime(out_df[col], errors='ignore', 135 | infer_datetime_format=True) 136 | continue 137 | 138 | return out_df 139 | 140 | 141 | @pipe 142 | @symbolic_evaluation(eval_as_label=['*']) 143 | def spread(df, key, values, convert=False): 144 | """ 145 | Transforms a "long" DataFrame into a "wide" format using a key and value 146 | column. 147 | 148 | If you have a mixed datatype column in your long-format DataFrame then the 149 | default behavior is for the spread columns to be of type `object`, or 150 | string. If you want to try to convert dtypes when spreading, you can set 151 | the convert keyword argument in spread to True. 152 | 153 | Args: 154 | key (str, int, or symbolic): Label for the key column. 155 | values (str, int, or symbolic): Label for the values column. 156 | 157 | Kwargs: 158 | convert (bool): Boolean indicating whether or not to try and convert 159 | the spread columns to more appropriate data types. 160 | 161 | 162 | Example: 163 | widened = elongated >> spread(X.variable, X.value) 164 | widened >> head(5) 165 | 166 | _ID carat clarity color cut depth price table x y z 167 | 0 0 0.23 SI2 E Ideal 61.5 326 55 3.95 3.98 2.43 168 | 1 1 0.21 SI1 E Premium 59.8 326 61 3.89 3.84 2.31 169 | 2 10 0.3 SI1 J Good 64 339 55 4.25 4.28 2.73 170 | 3 100 0.75 SI1 D Very Good 63.2 2760 56 5.8 5.75 3.65 171 | 4 1000 0.75 SI1 D Ideal 62.3 2898 55 5.83 5.8 3.62 172 | """ 173 | 174 | # Taken mostly from dplython package 175 | columns = df.columns.tolist() 176 | id_cols = [col for col in columns if not col in [key, values]] 177 | 178 | temp_index = ['' for i in range(len(df))] 179 | for id_col in id_cols: 180 | temp_index += df[id_col].map(str) 181 | 182 | out_df = df.assign(temp_index=temp_index) 183 | out_df = out_df.set_index('temp_index') 184 | spread_data = out_df[[key, values]] 185 | 186 | if not all(spread_data.groupby([spread_data.index, key]).agg( 187 | 'count').reset_index()[values] < 2): 188 | raise ValueError('Duplicate identifiers') 189 | 190 | spread_data = spread_data.pivot(columns=key, values=values) 191 | 192 | if convert and (out_df[values].dtype.kind in 'OSaU'): 193 | columns_to_convert = [col for col in spread_data if col not in columns] 194 | spread_data = convert_type(spread_data, columns_to_convert) 195 | 196 | out_df = out_df[id_cols].drop_duplicates() 197 | out_df = out_df.merge(spread_data, left_index=True, right_index=True).reset_index(drop=True) 198 | 199 | out_df = (out_df >> arrange(id_cols)).reset_index(drop=True) 200 | 201 | return out_df 202 | 203 | 204 | # ------------------------------------------------------------------------------ 205 | # Separate columns 206 | # ------------------------------------------------------------------------------ 207 | 208 | @pipe 209 | @symbolic_evaluation(eval_as_label=['*']) 210 | def separate(df, column, 
into, sep="[\W_]+", remove=True, convert=False, 211 | extra='drop', fill='right'): 212 | """ 213 | Splits columns into multiple columns. 214 | 215 | Args: 216 | df (pandas.DataFrame): DataFrame passed in through the pipe. 217 | column (str, symbolic): Label of column to split. 218 | into (list): List of string names for new columns. 219 | 220 | Kwargs: 221 | sep (str or list): If a string, the regex string used to split the 222 | column. If a list, a list of integer positions to split strings 223 | on. 224 | remove (bool): Boolean indicating whether to remove the original column. 225 | convert (bool): Boolean indicating whether the new columns should be 226 | converted to the appropriate type. 227 | extra (str): either `'drop'`, where split pieces beyond the specified 228 | new columns are dropped, or `'merge'`, where the final split piece 229 | contains the remainder of the original column. 230 | fill (str): either `'right'`, where `np.nan` values are filled in the 231 | right-most columns for missing pieces, or `'left'` where `np.nan` 232 | values are filled in the left-most columns. 233 | """ 234 | 235 | assert isinstance(into, (tuple, list)) 236 | 237 | if isinstance(sep, (tuple, list)): 238 | inds = [0] + list(sep) 239 | if len(inds) > len(into): 240 | if extra == 'drop': 241 | inds = inds[:len(into) + 1] 242 | elif extra == 'merge': 243 | inds = inds[:len(into)] + [None] 244 | else: 245 | inds = inds + [None] 246 | 247 | splits = df[column].map(lambda x: [str(x)[slice(inds[i], inds[i + 1])] 248 | if i < len(inds) - 1 else np.nan 249 | for i in range(len(into))]) 250 | 251 | else: 252 | maxsplit = len(into) - 1 if extra == 'merge' else 0 253 | splits = df[column].map(lambda x: re.split(sep, x, maxsplit)) 254 | 255 | right_filler = lambda x: x + [np.nan for i in range(len(into) - len(x))] 256 | left_filler = lambda x: [np.nan for i in range(len(into) - len(x))] + x 257 | 258 | if fill == 'right': 259 | splits = [right_filler(x) for x in splits] 260 | elif fill == 'left': 261 | splits = [left_filler(x) for x in splits] 262 | 263 | for i, split_col in enumerate(into): 264 | df[split_col] = [x[i] if not x[i] == '' else np.nan for x in splits] 265 | 266 | if convert: 267 | df = convert_type(df, into) 268 | 269 | if remove: 270 | df.drop(column, axis=1, inplace=True) 271 | 272 | return df 273 | 274 | 275 | # ------------------------------------------------------------------------------ 276 | # Unite columns 277 | # ------------------------------------------------------------------------------ 278 | 279 | @pipe 280 | @symbolic_evaluation(eval_as_label=['*']) 281 | def unite(df, colname, *args, **kwargs): 282 | """ 283 | Does the inverse of `separate`, joining columns together by a specified 284 | separator. 285 | 286 | Any columns that are not strings will be converted to strings. 287 | 288 | Args: 289 | df (pandas.DataFrame): DataFrame passed in through the pipe. 290 | colname (str): the name of the new joined column. 291 | *args: list of columns to be joined, which can be strings, symbolic, or 292 | integer positions. 293 | 294 | Kwargs: 295 | sep (str): the string separator to join the columns with. 296 | remove (bool): Boolean indicating whether or not to remove the 297 | original columns. 298 | na_action (str): can be one of `'maintain'` (the default), 299 | '`ignore'`, or `'as_string'`. The default will make the new column 300 | row a `NaN` value if any of the original column cells at that 301 | row contained `NaN`. 
274 | 
275 | # ------------------------------------------------------------------------------
276 | # Unite columns
277 | # ------------------------------------------------------------------------------
278 | 
279 | @pipe
280 | @symbolic_evaluation(eval_as_label=['*'])
281 | def unite(df, colname, *args, **kwargs):
282 |     """
283 |     Does the inverse of `separate`, joining columns together by a specified
284 |     separator.
285 | 
286 |     Any columns that are not strings will be converted to strings.
287 | 
288 |     Args:
289 |         df (pandas.DataFrame): DataFrame passed in through the pipe.
290 |         colname (str): the name of the new joined column.
291 |         *args: list of columns to be joined, which can be strings, symbolic, or
292 |             integer positions.
293 | 
294 |     Kwargs:
295 |         sep (str): the string separator to join the columns with.
296 |         remove (bool): Boolean indicating whether or not to remove the
297 |             original columns.
298 |         na_action (str): can be one of `'maintain'` (the default),
299 |             `'ignore'`, or `'as_string'`. The default will make the new column
300 |             row a `NaN` value if any of the original column cells at that
301 |             row contained `NaN`. `'ignore'` will treat any `NaN` value as an
302 |             empty string during joining. `'as_string'` will convert any `NaN`
303 |             value to the string `'nan'` prior to joining.
304 |     """
305 | 
306 |     to_unite = [a for a in flatten(args)]
307 |     sep = kwargs.get('sep', '_')
308 |     remove = kwargs.get('remove', True)
309 |     # possible na_action values
310 |     # ignore: empty string
311 |     # maintain: keep as np.nan (default)
312 |     # as_string: becomes string 'nan'
313 |     na_action = kwargs.get('na_action', 'maintain')
314 | 
315 |     # print(to_unite, sep, remove, na_action)
316 | 
317 |     if na_action == 'maintain':
318 |         df[colname] = df[to_unite].apply(lambda x: np.nan if any(x.isnull())
319 |                                          else sep.join(x.map(str)), axis=1)
320 |     elif na_action == 'ignore':
321 |         df[colname] = df[to_unite].apply(lambda x: sep.join(x[~x.isnull()].map(str)),
322 |                                          axis=1)
323 |     elif na_action == 'as_string':
324 |         df[colname] = df[to_unite].astype(str).apply(lambda x: sep.join(x), axis=1)
325 | 
326 |     if remove:
327 |         df.drop(to_unite, axis=1, inplace=True)
328 | 
329 |     return df
330 | 
--------------------------------------------------------------------------------
/test/test_summary_functions.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from dfply import *
3 | 
4 | 
5 | ##==============================================================================
6 | ## transform summary functions
7 | ##==============================================================================
8 | 
9 | def test_mean():
10 |     df = diamonds >> select(X.cut, X.x) >> head(5)
11 |     # straight summarize
12 |     t = df >> summarize(m=mean(X.x))
13 |     df_truth = pd.DataFrame({'m': [4.086]})
14 |     assert t.equals(df_truth)
15 |     # grouped summarize
16 |     t = df >> group_by(X.cut) >> summarize(m=mean(X.x))
17 |     df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
18 |                              'm': [4.195, 3.950, 4.045]})
19 |     assert t.equals(df_truth)
20 |     # straight mutate
21 |     t = df >> mutate(m=mean(X.x))
22 |     df_truth = df.copy()
23 |     df_truth['m'] = df_truth.x.mean()
24 |     assert t.equals(df_truth)
25 |     # grouped mutate
26 |     t = df >> group_by(X.cut) >> mutate(m=mean(X.x))
27 |     df_truth['m'] = pd.Series([3.950, 4.045, 4.195, 4.045, 4.195])
28 |     assert t.sort_index().equals(df_truth)
29 | 
30 | 
31 | def test_first():
32 |     df = diamonds >> select(X.cut, X.x) >> head(5)
33 |     # straight summarize
34 |     t = df >> summarize(f=first(X.x))
35 |     df_truth = pd.DataFrame({'f': [3.95]})
36 |     assert t.equals(df_truth)
37 |     # grouped summarize
38 |     t = df >> group_by(X.cut) >> summarize(f=first(X.x))
39 |     df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
40 |                              'f': [4.05, 3.95, 3.89]})
41 |     assert t.equals(df_truth)
42 |     # summarize with order_by
43 |     t = df >> summarize(f=first(X.x, order_by=desc(X.cut)))
44 |     assert pd.DataFrame({'f': [3.89]}).equals(t)
45 |     # straight mutate
46 |     t = df >> mutate(f=first(X.x))
47 |     df_truth = df.copy()
48 |     df_truth['f'] = df_truth.x.iloc[0]
49 |     assert t.equals(df_truth)
50 |     # grouped mutate
51 |     t = df >> group_by(X.cut) >> mutate(f=first(X.x))
52 |     df_truth['f'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
53 |     assert t.sort_index().equals(df_truth)
54 | 
55 | 
56 | def test_last():
57 |     df = diamonds >> select(X.cut, X.x) >> head(5)
58 |     # straight summarize
59 |     t = df >> summarize(l=last(X.x))
60 |     df_truth = pd.DataFrame({'l': [4.34]})
61 |     assert t.equals(df_truth)
62 |     # grouped summarize
63 |     t = df >> group_by(X.cut) >> summarize(l=last(X.x))
64 |     df_truth = pd.DataFrame({'cut': ['Good',
'Ideal', 'Premium'], 65 | 'l': [4.34, 3.95, 4.20]}) 66 | assert t.equals(df_truth) 67 | # summarize with order_by 68 | #t = df >> summarize(f=last(X.x, order_by=desc(X.cut))) 69 | t = df >> summarize(f=last(X.x, order_by=[desc(X.cut), desc(X.x)])) 70 | df_truth = pd.DataFrame({'f':[4.05]}) 71 | assert df_truth.equals(t) 72 | # straight mutate 73 | t = df >> mutate(l=last(X.x)) 74 | df_truth = df.copy() 75 | df_truth['l'] = df_truth.x.iloc[4] 76 | assert t.equals(df_truth) 77 | # grouped mutate 78 | t = df >> group_by(X.cut) >> mutate(l=last(X.x)) 79 | df_truth['l'] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34]) 80 | assert t.sort_index().equals(df_truth) 81 | 82 | 83 | def test_nth(): 84 | df = diamonds >> select(X.cut, X.x) >> head(10) 85 | # straight summarize 86 | t = df >> summarize(second=nth(X.x, 1)) 87 | df_truth = pd.DataFrame({'second': [3.89]}) 88 | assert t.equals(df_truth) 89 | # grouped summarize 90 | t = df >> group_by(X.cut) >> summarize(first=nth(X.x, 0)) 91 | df_truth = pd.DataFrame({'cut': ['Fair','Good', 'Ideal', 'Premium','Very Good'], 92 | 'first': [3.87,4.05,3.95,3.89,3.94]}) 93 | assert t.equals(df_truth) 94 | # summarize with order_by 95 | t = df >> summarize(last=nth(X.x, -1, order_by=[desc(X.cut), desc(X.x)])) 96 | #print(t) 97 | df_truth = pd.DataFrame({'last':[3.87]}) 98 | #print(df_truth) 99 | #print(t) 100 | assert df_truth.equals(t) 101 | # straight mutate 102 | t = df >> mutate(out_of_range=nth(X.x, 500)) 103 | df_truth = df.copy() 104 | df_truth['out_of_range'] = np.nan 105 | assert t.equals(df_truth) 106 | # grouped mutate 107 | t = df >> group_by(X.cut) >> mutate(penultimate=nth(X.x, -2)) 108 | df_truth = df.copy() 109 | df_truth['penultimate'] = pd.Series([np.nan,3.89,4.05,3.89,4.05,4.07, 110 | 4.07,4.07,np.nan,4.07]) 111 | print(t) 112 | print(df_truth) 113 | assert t.sort_index().equals(df_truth) 114 | 115 | 116 | def test_n(): 117 | df = diamonds >> select(X.cut, X.x) >> head(5) 118 | # straight summarize 119 | t = df >> summarize(n=n(X.x)) 120 | df_truth = pd.DataFrame({'n': [5]}) 121 | assert t.equals(df_truth) 122 | # grouped summarize 123 | t = df >> group_by(X.cut) >> summarize(n=n(X.x)) 124 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'], 125 | 'n': [2, 1, 2]}) 126 | assert t.equals(df_truth) 127 | # straight mutate 128 | t = df >> mutate(n=n(X.x)) 129 | df_truth = df.copy() 130 | df_truth['n'] = 5 131 | assert t.equals(df_truth) 132 | # grouped mutate 133 | t = df >> group_by(X.cut) >> mutate(n=n(X.x)) 134 | df_truth['n'] = pd.Series([1, 2, 2, 2, 2, 2]) 135 | print(t) 136 | print(df_truth) 137 | assert t.sort_index().equals(df_truth) 138 | 139 | 140 | def test_n_distinct(): 141 | df = pd.DataFrame({'col_1': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c'], 142 | 'col_2': [1, 1, 1, 2, 3, 3, 4, 5]}) 143 | # straight summarize 144 | t = df >> summarize(n=n_distinct(X.col_2)) 145 | df_truth = pd.DataFrame({'n': [5]}) 146 | assert t.equals(df_truth) 147 | # grouped summarize 148 | t = df >> group_by(X.col_1) >> summarize(n=n_distinct(X.col_2)) 149 | df_truth = pd.DataFrame({'col_1': ['a', 'b', 'c'], 150 | 'n': [1, 2, 2]}) 151 | assert t.equals(df_truth) 152 | # straight mutate 153 | t = df >> mutate(n=n_distinct(X.col_2)) 154 | df_truth = df.copy() 155 | df_truth['n'] = 5 156 | assert t.equals(df_truth) 157 | # grouped mutate 158 | t = df >> group_by(X.col_1) >> mutate(n=n_distinct(X.col_2)) 159 | df_truth['n'] = pd.Series([1, 1, 1, 2, 2, 2, 2, 2]) 160 | assert t.equals(df_truth) 161 | 162 | 163 | def test_IQR(): 164 | df = diamonds >> 
select(X.cut, X.x) >> head(5) 165 | # straight summarize 166 | t = df >> summarize(i=IQR(X.x)) 167 | df_truth = pd.DataFrame({'i': [.25]}) 168 | assert t.equals(df_truth) 169 | # grouped summarize 170 | t = df >> group_by(X.cut) >> summarize(i=IQR(X.x)) 171 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'], 172 | 'i': [0.145, 0.000, 0.155]}) 173 | test_vector = abs(t.i - df_truth.i) 174 | assert all(test_vector < 0.000000001) 175 | # straight mutate 176 | t = df >> mutate(i=IQR(X.x)) 177 | df_truth = df.copy() 178 | df_truth['i'] = 0.25 179 | assert t.equals(df_truth) 180 | # grouped mutate 181 | t = df >> group_by(X.cut) >> mutate(i=IQR(X.x)) 182 | df_truth['i'] = pd.Series([0.000, 0.155, 0.145, 0.155, 0.145]) 183 | test_vector = abs(t.i - df_truth.i) 184 | assert all(test_vector < 0.000000001) 185 | 186 | 187 | def test_colmin(): 188 | df = diamonds >> select(X.cut, X.x) >> head(5) 189 | # straight summarize 190 | t = df >> summarize(m=colmin(X.x)) 191 | df_truth = pd.DataFrame({'m': [3.89]}) 192 | assert t.equals(df_truth) 193 | # grouped summarize 194 | t = df >> group_by(X.cut) >> summarize(m=colmin(X.x)) 195 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'], 196 | 'm': [4.05, 3.95, 3.89]}) 197 | assert t.equals(df_truth) 198 | # straight mutate 199 | t = df >> mutate(m=colmin(X.x)) 200 | df_truth = df.copy() 201 | df_truth['m'] = 3.89 202 | assert t.equals(df_truth) 203 | # grouped mutate 204 | t = df >> group_by(X.cut) >> mutate(m=colmin(X.x)) 205 | df_truth['m'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05]) 206 | assert t.sort_index().equals(df_truth) 207 | 208 | 209 | def test_colmax(): 210 | df = diamonds >> select(X.cut, X.x) >> head(5) 211 | # straight summarize 212 | t = df >> summarize(m=colmax(X.x)) 213 | df_truth = pd.DataFrame({'m': [4.34]}) 214 | assert t.equals(df_truth) 215 | # grouped summarize 216 | t = df >> group_by(X.cut) >> summarize(m=colmax(X.x)) 217 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'], 218 | 'm': [4.34, 3.95, 4.20]}) 219 | assert t.equals(df_truth) 220 | # straight mutate 221 | t = df >> mutate(m=colmax(X.x)) 222 | df_truth = df.copy() 223 | df_truth['m'] = 4.34 224 | assert t.equals(df_truth) 225 | # grouped mutate 226 | print(df.groupby('cut')['x'].agg(np.max)) 227 | print(df) 228 | t = df >> group_by(X.cut) >> mutate(m=colmax(X.x)) 229 | df_truth['m'] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34]) 230 | print(t) 231 | print(df_truth) 232 | assert t.sort_index().equals(df_truth) 233 | 234 | 235 | def test_median(): 236 | df = diamonds >> group_by(X.cut) >> head(3) >> select(X.cut, X.x) >> ungroup() 237 | # straight summarize 238 | t = df >> summarize(m=median(X.x)) 239 | df_truth = pd.DataFrame({'m': [4.05]}) 240 | assert t.equals(df_truth) 241 | 242 | # grouped summarize 243 | t = df >> group_by(X.cut) >> summarize(m=median(X.x)) 244 | df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'], 245 | 'm': [6.27, 4.25, 3.95, 3.89, 3.95]}) 246 | assert t.equals(df_truth) 247 | # straight mutate 248 | t = df >> mutate(m=median(X.x)) 249 | df_truth = df.copy() 250 | df_truth['m'] = 4.05 251 | assert t.equals(df_truth) 252 | # grouped mutate 253 | # t = df >> group_by(X.cut) >> mutate(m=median(X.x)) 254 | # df_truth['m'] = pd.Series( 255 | # [6.27, 6.27, 6.27, 4.25, 4.25, 4.25, 3.95, 3.95, 3.95, 3.89, 3.89, 3.89, 3.95, 3.95, 3.95], 256 | # index=t.index) 257 | # assert t.equals(df_truth) 258 | # make sure it handles case with even counts properly 259 | df = diamonds >> group_by(X.cut) >> 
head(2) >> select(X.cut, X.x)
260 |     t = df >> group_by(X.cut) >> summarize(m=median(X.x))
261 |     df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
262 |                              'm': [5.160, 4.195, 3.940, 4.045, 3.945]})
263 |     test_vector = abs(t.m - df_truth.m)
264 |     assert all(test_vector < .000000001)
265 | 
266 | 
267 | def test_var():
268 |     df = diamonds >> group_by(X.cut) >> head(3) >> select(X.cut, X.x) >> ungroup()
269 | 
270 |     # straight summarize
271 |     t = df >> summarize(v=var(X.x))
272 |     df_truth = pd.DataFrame({'v': [0.687392]})
273 |     test_vector = abs(t.v - df_truth.v)
274 |     print(t.v)
275 |     print(df_truth.v)
276 |     assert all(test_vector < .00001)
277 | 
278 |     # grouped summarize
279 |     t = df >> group_by(X.cut) >> summarize(v=var(X.x))
280 |     df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
281 |                              'v': [2.074800, 0.022033, 0.056133, 0.033100, 0.005233]})
282 |     test_vector = abs(t.v - df_truth.v)
283 |     assert all(test_vector < .00001)
284 |     # straight mutate
285 |     t = df >> mutate(v=var(X.x))
286 |     df_truth = df.copy()
287 |     df_truth['v'] = 0.687392
288 |     test_vector = abs(t.v - df_truth.v)
289 |     assert all(test_vector < .00001)
290 |     # grouped mutate
291 |     # t = df >> group_by(X.cut) >> mutate(v=var(X.x))
292 |     # df_truth['v'] = pd.Series([2.074800, 2.074800, 2.074800, 0.022033, 0.022033, 0.022033,
293 |     #                            0.056133, 0.056133, 0.056133, 0.033100, 0.033100, 0.033100,
294 |     #                            0.005233, 0.005233, 0.005233],
295 |     #                           index=t.index)
296 |     # test_vector = abs(t.v - df_truth.v)
297 |     # assert all(test_vector < .00001)
298 |     # test with single value (var undefined)
299 |     df = diamonds >> group_by(X.cut) >> head(1) >> select(X.cut, X.x)
300 |     t = df >> group_by(X.cut) >> summarize(v=var(X.x))
301 |     df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
302 |                              'v': [np.nan, np.nan, np.nan, np.nan, np.nan]})
303 |     assert t.equals(df_truth)
304 | 
305 | 
306 | def test_sd():
307 |     df = diamonds >> group_by(X.cut) >> head(3) >> select(X.cut, X.x) >> ungroup()
308 |     # straight summarize
309 |     t = df >> summarize(s=sd(X.x))
310 |     df_truth = pd.DataFrame({'s': [0.829091]})
311 |     test_vector = abs(t.s - df_truth.s)
312 |     print(t)
313 |     print(t.s)
314 |     print(df_truth.s)
315 |     assert all(test_vector < .00001)
316 |     # grouped summarize
317 |     t = df >> group_by(X.cut) >> summarize(s=sd(X.x))
318 |     df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
319 |                              's': [1.440417, 0.148436, 0.236925, 0.181934, 0.072342]})
320 |     test_vector = abs(t.s - df_truth.s)
321 |     assert all(test_vector < .00001)
322 |     # straight mutate
323 |     t = df >> mutate(s=sd(X.x))
324 |     df_truth = df.copy()
325 |     df_truth['s'] = 0.829091
326 |     test_vector = abs(t.s - df_truth.s)
327 |     assert all(test_vector < .00001)
328 |     # grouped mutate
329 |     t = df >> group_by(X.cut) >> mutate(s=sd(X.x))
330 |     # df_truth['s'] = pd.Series([1.440417, 1.440417, 1.440417, 0.148436, 0.148436, 0.148436,
331 |     #                            0.236925, 0.236925, 0.236925, 0.181934, 0.181934, 0.181934,
332 |     #                            0.072342, 0.072342, 0.072342],
333 |     #                           index=t.index)
334 |     # test_vector = abs(t.s - df_truth.s)
335 |     # print(t)
336 |     # print(df_truth)
337 |     # assert all(test_vector < .00001)  # disabled along with the truth block above; the bare assert only re-checked the straight-mutate vector
338 |     # test with single value (var undefined)
339 |     df = diamonds >> group_by(X.cut) >> head(1) >> select(X.cut, X.x)
340 |     t = df >> group_by(X.cut) >> summarize(s=sd(X.x))
341 |     df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
342 |                              's': [np.nan, np.nan, np.nan, np.nan, np.nan]})
343 |     assert t.equals(df_truth)
344 | 
--------------------------------------------------------------------------------
/examples/basics-extending-functionality.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "\n",
14 | "from dfply import *"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "### Case #1: A custom pipe function\n",
22 | "---\n",
23 | "\n",
24 | "Pandas has a function `pd.crosstab` which can generate a cross-tabulation of factors. Let's say we wanted to build a pipe function that wrapped around this. The docstring of the Pandas function is below:\n",
25 | "\n",
26 | "Compute a simple cross-tabulation of two (or more) factors. By default\n",
27 | "computes a frequency table of the factors unless an array of values and an\n",
28 | "aggregation function are passed\n",
29 | "\n",
30 | "    Parameters\n",
31 | "    ----------\n",
32 | "    index : array-like, Series, or list of arrays/Series\n",
33 | "        Values to group by in the rows\n",
34 | "    columns : array-like, Series, or list of arrays/Series\n",
35 | "        Values to group by in the columns\n",
36 | "    values : array-like, optional\n",
37 | "        Array of values to aggregate according to the factors.\n",
38 | "        Requires `aggfunc` be specified.\n",
39 | "    aggfunc : function, optional\n",
40 | "        If specified, requires `values` be specified as well\n",
41 | "    rownames : sequence, default None\n",
42 | "        If passed, must match number of row arrays passed\n",
43 | "    colnames : sequence, default None\n",
44 | "        If passed, must match number of column arrays passed\n",
45 | "    margins : boolean, default False\n",
46 | "        Add row/column margins (subtotals)\n",
47 | "    dropna : boolean, default True\n",
48 | "        Do not include columns whose entries are all NaN\n",
49 | "    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False\n",
50 | "        Normalize by dividing all values by the sum of values.\n",
51 | "\n",
52 | "        - If passed 'all' or `True`, will normalize over all values.\n",
53 | "        - If passed 'index' will normalize over each row.\n",
54 | "        - If passed 'columns' will normalize over each column.\n",
55 | "        - If margins is `True`, will also normalize margin values.\n",
56 | "    \n",
57 | "\n",
58 | "**To keep it simple, let's build a reduced version of this that takes only:**\n",
59 | "- `index`\n",
60 | "- `columns`\n",
61 | "- `values`\n",
62 | "- `aggfunc`\n",
63 | "\n",
64 | "Below is a function that wraps around the call to `pd.crosstab`. "
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "def crosstab(index, columns, values=None, aggfunc=None):\n",
76 | "    return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 6,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "data": {
86 | "text/html": [
87 | "<div>
\n", 88 | "\n", 101 | "\n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | "
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
\n", 146 | "
" 147 | ], 148 | "text/plain": [ 149 | " carat cut color clarity depth table price x y z\n", 150 | "0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n", 151 | "1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31" 152 | ] 153 | }, 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "diamonds.head(2)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
\n", 172 | "\n", 185 | "\n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | "
colorDEFGHIJ
cut
Fair163224312314303175119
Good662933909871702522307
Ideal283439033826488431152093896
Premium160323372331292423601428808
Very Good151324002164229918241204678
\n", 261 | "
" 262 | ], 263 | "text/plain": [ 264 | "color D E F G H I J\n", 265 | "cut \n", 266 | "Fair 163 224 312 314 303 175 119\n", 267 | "Good 662 933 909 871 702 522 307\n", 268 | "Ideal 2834 3903 3826 4884 3115 2093 896\n", 269 | "Premium 1603 2337 2331 2924 2360 1428 808\n", 270 | "Very Good 1513 2400 2164 2299 1824 1204 678" 271 | ] 272 | }, 273 | "execution_count": 7, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "crosstab(diamonds.cut, diamonds.color)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "If you want your function to be part of a dfply pipe chain, the first argument _must_ be a dataframe, which is implicitly passed through during the evaluation of the chain! We will need to redefine the function to have the implicit `df` passed in as the first argument.\n", 287 | "\n", 288 | "The most common and straightforward way to convert a custom function to a dfply piping function is to use the `@dfpipe` decorator. \n", 289 | "\n", 290 | "> Note: the `@dfpipe` decorator is in fact a convenience decorator that stacks three dfply decorators together: \n", 291 | "\n", 292 | "> `@pipe` \n", 293 | "`@group_delegation` \n", 294 | "`@symbolic_evaluation` \n", 295 | "\n", 296 | "> `@pipe` ensures that the function will work in the dfply piping syntax and take an implicit DataFrame, `@group_delegation` makes the function work with groupings applied prior in the chain, and `@symbolic_evaluation` enables you to use and evaluate symbolic arguments like `X.cut` that are placeholders for incoming data." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 8, 302 | "metadata": { 303 | "collapsed": true 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "@dfpipe\n", 308 | "def crosstab(df, index, columns, values=None, aggfunc=None):\n", 309 | " return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/html": [ 320 | "
\n", 321 | "\n", 334 | "\n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | "
colorDEFGHIJ
cut
Fair163224312314303175119
Good662933909871702522307
Ideal283439033826488431152093896
Premium160323372331292423601428808
Very Good151324002164229918241204678
\n", 410 | "
" 411 | ], 412 | "text/plain": [ 413 | "color D E F G H I J\n", 414 | "cut \n", 415 | "Fair 163 224 312 314 303 175 119\n", 416 | "Good 662 933 909 871 702 522 307\n", 417 | "Ideal 2834 3903 3826 4884 3115 2093 896\n", 418 | "Premium 1603 2337 2331 2924 2360 1428 808\n", 419 | "Very Good 1513 2400 2164 2299 1824 1204 678" 420 | ] 421 | }, 422 | "execution_count": 9, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "diamonds >> crosstab(X.cut, X.color)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "### Case #2: A function that works with symbolic arguments\n", 436 | "---\n", 437 | "\n", 438 | "Many tasks are simpler and do not require the capacity to work as a pipe function. The `dfply` window functions are the common examples of this: functions that take a Series (or _symbolic_ Series) and return a modified version.\n", 439 | "\n", 440 | "Let's say we had a dataframe with dates represented by strings that we wanted to convert to pandas datetime objects using the `pd.to_datetime` function. Below is a tiny example dataframe with this issue." 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 10, 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "data": { 450 | "text/html": [ 451 | "
\n", 452 | "\n", 465 | "\n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | "
datesales
07/10/171220
17/11/171592
27/12/17908
37/13/171102
47/14/171395
\n", 501 | "
" 502 | ], 503 | "text/plain": [ 504 | " date sales\n", 505 | "0 7/10/17 1220\n", 506 | "1 7/11/17 1592\n", 507 | "2 7/12/17 908\n", 508 | "3 7/13/17 1102\n", 509 | "4 7/14/17 1395" 510 | ] 511 | }, 512 | "execution_count": 10, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "sales = pd.DataFrame(dict(date=['7/10/17','7/11/17','7/12/17','7/13/17','7/14/17'],\n", 519 | " sales=[1220, 1592, 908, 1102, 1395]))\n", 520 | "sales" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 11, 526 | "metadata": {}, 527 | "outputs": [ 528 | { 529 | "data": { 530 | "text/plain": [ 531 | "date object\n", 532 | "sales int64\n", 533 | "dtype: object" 534 | ] 535 | }, 536 | "execution_count": 11, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "sales.dtypes" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "In pandas we would use the `pd.to_datetime` function to convert the strings to date objects, and add it as a new column like so:" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 12, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/html": [ 560 | "
\n", 561 | "\n", 574 | "\n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | "
datesalespd_date
07/10/1712202017-07-10
17/11/1715922017-07-11
27/12/179082017-07-12
37/13/1711022017-07-13
47/14/1713952017-07-14
\n", 616 | "
" 617 | ], 618 | "text/plain": [ 619 | " date sales pd_date\n", 620 | "0 7/10/17 1220 2017-07-10\n", 621 | "1 7/11/17 1592 2017-07-11\n", 622 | "2 7/12/17 908 2017-07-12\n", 623 | "3 7/13/17 1102 2017-07-13\n", 624 | "4 7/14/17 1395 2017-07-14" 625 | ] 626 | }, 627 | "execution_count": 12, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | "sales['pd_date'] = pd.to_datetime(sales['date'], infer_datetime_format=True)\n", 634 | "sales" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 13, 640 | "metadata": { 641 | "collapsed": true 642 | }, 643 | "outputs": [], 644 | "source": [ 645 | "sales.drop('pd_date', axis=1, inplace=True)" 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": {}, 651 | "source": [ 652 | "What if you tried to use the `pd.to_datetime` function inside of a call to mutate, like so?\n", 653 | "\n", 654 | "```python\n", 655 | "sales >> mutate(pd_date=pd.to_datetime(X.date, infer_datetime_format=True))\n", 656 | "```\n", 657 | "\n", 658 | "This will unfortunately break. The `dfply` functions are special in that they \"know\" to delay their evaluation until the data is at that point in the chain. `pd.to_datetime` is not such a function, and will immediately try to evaluate `X.date`. With a symbolic `Intention` argument passed in, the function will fail as it does not know what to do with that.\n", 659 | "\n", 660 | "Instead, we will need to make a wrapper around `pd.to_datetime` that can handle these symbolic arguments and delay evaluation until the right time. \n", 661 | "\n", 662 | "This is quite simple: all you need to do is decorate a function with the `@make_symbolic` decorator, like so:" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 14, 668 | "metadata": { 669 | "collapsed": true 670 | }, 671 | "outputs": [], 672 | "source": [ 673 | "@make_symbolic\n", 674 | "def to_datetime(series, infer_datetime_format=True):\n", 675 | " return pd.to_datetime(series, infer_datetime_format=infer_datetime_format)" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 15, 681 | "metadata": {}, 682 | "outputs": [ 683 | { 684 | "data": { 685 | "text/html": [ 686 | "
\n", 687 | "\n", 700 | "\n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | "
datesalespd_date
07/10/1712202017-07-10
17/11/1715922017-07-11
27/12/179082017-07-12
37/13/1711022017-07-13
47/14/1713952017-07-14
\n", 742 | "
" 743 | ], 744 | "text/plain": [ 745 | " date sales pd_date\n", 746 | "0 7/10/17 1220 2017-07-10\n", 747 | "1 7/11/17 1592 2017-07-11\n", 748 | "2 7/12/17 908 2017-07-12\n", 749 | "3 7/13/17 1102 2017-07-13\n", 750 | "4 7/14/17 1395 2017-07-14" 751 | ] 752 | }, 753 | "execution_count": 15, 754 | "metadata": {}, 755 | "output_type": "execute_result" 756 | } 757 | ], 758 | "source": [ 759 | "sales >> mutate(pd_date=to_datetime(X.date))" 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "metadata": {}, 765 | "source": [ 766 | "And there you go. Able to delay the evaluation.\n", 767 | "\n", 768 | "What's particularly nice about the `@make_symbolic` decorator is that it has no trouble working with non-symbolic arguments too. If we were to pass in the series itself the function evaluates without a problem:" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 16, 774 | "metadata": {}, 775 | "outputs": [ 776 | { 777 | "data": { 778 | "text/plain": [ 779 | "0 2017-07-10\n", 780 | "1 2017-07-11\n", 781 | "2 2017-07-12\n", 782 | "3 2017-07-13\n", 783 | "4 2017-07-14\n", 784 | "Name: date, dtype: datetime64[ns]" 785 | ] 786 | }, 787 | "execution_count": 16, 788 | "metadata": {}, 789 | "output_type": "execute_result" 790 | } 791 | ], 792 | "source": [ 793 | "to_datetime(sales.date)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "Keep in mind, though, that if _any_ of the arguments or keyword arguments are symbolic `Intention` objects, the return will itself be an `Intention` object representing the function awaiting evaluation by a dataframe:" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": 17, 806 | "metadata": {}, 807 | "outputs": [ 808 | { 809 | "data": { 810 | "text/plain": [ 811 | "" 812 | ] 813 | }, 814 | "execution_count": 17, 815 | "metadata": {}, 816 | "output_type": "execute_result" 817 | } 818 | ], 819 | "source": [ 820 | "to_datetime(X.date)" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 19, 826 | "metadata": {}, 827 | "outputs": [ 828 | { 829 | "data": { 830 | "text/plain": [ 831 | "0 2017-07-10\n", 832 | "1 2017-07-11\n", 833 | "2 2017-07-12\n", 834 | "3 2017-07-13\n", 835 | "4 2017-07-14\n", 836 | "Name: date, dtype: datetime64[ns]" 837 | ] 838 | }, 839 | "execution_count": 19, 840 | "metadata": {}, 841 | "output_type": "execute_result" 842 | } 843 | ], 844 | "source": [ 845 | "awaiting = to_datetime(X.date)\n", 846 | "awaiting.evaluate(sales)" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": { 853 | "collapsed": true 854 | }, 855 | "outputs": [], 856 | "source": [] 857 | } 858 | ], 859 | "metadata": { 860 | "kernelspec": { 861 | "display_name": "Python 3", 862 | "language": "python", 863 | "name": "python3" 864 | }, 865 | "language_info": { 866 | "codemirror_mode": { 867 | "name": "ipython", 868 | "version": 3 869 | }, 870 | "file_extension": ".py", 871 | "mimetype": "text/x-python", 872 | "name": "python", 873 | "nbconvert_exporter": "python", 874 | "pygments_lexer": "ipython3", 875 | "version": "3.6.1" 876 | } 877 | }, 878 | "nbformat": 4, 879 | "nbformat_minor": 2 880 | } 881 | -------------------------------------------------------------------------------- /examples/.ipynb_checkpoints/basics-extending-functionality-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
"execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "\n", 14 | "from dfply import *" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Case #1: A custom pipe function\n", 22 | "---\n", 23 | "\n", 24 | "Pandas has a function `pd.crosstab` which can generate a cross-tabluation of factors. Let's say we wanted to build a pipe function that wrapped around this. The docstring of the Pandas function is below:\n", 25 | "\n", 26 | "Compute a simple cross-tabulation of two (or more) factors. By default\n", 27 | "computes a frequency table of the factors unless an array of values and an\n", 28 | "aggregation function are passed\n", 29 | "\n", 30 | " Parameters\n", 31 | " ----------\n", 32 | " index : array-like, Series, or list of arrays/Series\n", 33 | " Values to group by in the rows\n", 34 | " columns : array-like, Series, or list of arrays/Series\n", 35 | " Values to group by in the columns\n", 36 | " values : array-like, optional\n", 37 | " Array of values to aggregate according to the factors.\n", 38 | " Requires `aggfunc` be specified.\n", 39 | " aggfunc : function, optional\n", 40 | " If specified, requires `values` be specified as well\n", 41 | " rownames : sequence, default None\n", 42 | " If passed, must match number of row arrays passed\n", 43 | " colnames : sequence, default None\n", 44 | " If passed, must match number of column arrays passed\n", 45 | " margins : boolean, default False\n", 46 | " Add row/column margins (subtotals)\n", 47 | " dropna : boolean, default True\n", 48 | " Do not include columns whose entries are all NaN\n", 49 | " normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False\n", 50 | " Normalize by dividing all values by the sum of values.\n", 51 | "\n", 52 | " - If passed 'all' or `True`, will normalize over all values.\n", 53 | " - If passed 'index' will normalize over each row.\n", 54 | " - If passed 'columns' will normalize over each column.\n", 55 | " - If margins is `True`, will also normalize margin values.\n", 56 | " \n", 57 | "\n", 58 | "**To keep it simple, let's build a reduced version of this that takes only:**\n", 59 | "- `index`\n", 60 | "- `columns`\n", 61 | "- `values`\n", 62 | "- `aggfunc`\n", 63 | "\n", 64 | "Below is a function that wraps around the call to `pd.crosstab`. " 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "def crosstab(index, columns, values=None, aggfunc=None):\n", 76 | " return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/html": [ 87 | "
\n", 88 | "\n", 101 | "\n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | "
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
\n", 146 | "
" 147 | ], 148 | "text/plain": [ 149 | " carat cut color clarity depth table price x y z\n", 150 | "0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n", 151 | "1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31" 152 | ] 153 | }, 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "diamonds.head(2)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
\n", 172 | "\n", 185 | "\n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | "
colorDEFGHIJ
cut
Fair163224312314303175119
Good662933909871702522307
Ideal283439033826488431152093896
Premium160323372331292423601428808
Very Good151324002164229918241204678
\n", 261 | "
" 262 | ], 263 | "text/plain": [ 264 | "color D E F G H I J\n", 265 | "cut \n", 266 | "Fair 163 224 312 314 303 175 119\n", 267 | "Good 662 933 909 871 702 522 307\n", 268 | "Ideal 2834 3903 3826 4884 3115 2093 896\n", 269 | "Premium 1603 2337 2331 2924 2360 1428 808\n", 270 | "Very Good 1513 2400 2164 2299 1824 1204 678" 271 | ] 272 | }, 273 | "execution_count": 7, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "crosstab(diamonds.cut, diamonds.color)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "If you want your function to be part of a dfply pipe chain, the first argument _must_ be a dataframe, which is implicitly passed through during the evaluation of the chain! We will need to redefine the function to have the implicit `df` passed in as the first argument.\n", 287 | "\n", 288 | "The most common and straightforward way to convert a custom function to a dfply piping function is to use the `@dfpipe` decorator. \n", 289 | "\n", 290 | "> Note: the `@dfpipe` decorator is in fact a convenience decorator that stacks three dfply decorators together: \n", 291 | "\n", 292 | "> `@pipe` \n", 293 | "`@group_delegation` \n", 294 | "`@symbolic_evaluation` \n", 295 | "\n", 296 | "> `@pipe` ensures that the function will work in the dfply piping syntax and take an implicit DataFrame, `@group_delegation` makes the function work with groupings applied prior in the chain, and `@symbolic_evaluation` enables you to use and evaluate symbolic arguments like `X.cut` that are placeholders for incoming data." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 8, 302 | "metadata": { 303 | "collapsed": true 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "@dfpipe\n", 308 | "def crosstab(df, index, columns, values=None, aggfunc=None):\n", 309 | " return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/html": [ 320 | "
\n", 321 | "\n", 334 | "\n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | "
colorDEFGHIJ
cut
Fair163224312314303175119
Good662933909871702522307
Ideal283439033826488431152093896
Premium160323372331292423601428808
Very Good151324002164229918241204678
\n", 410 | "
" 411 | ], 412 | "text/plain": [ 413 | "color D E F G H I J\n", 414 | "cut \n", 415 | "Fair 163 224 312 314 303 175 119\n", 416 | "Good 662 933 909 871 702 522 307\n", 417 | "Ideal 2834 3903 3826 4884 3115 2093 896\n", 418 | "Premium 1603 2337 2331 2924 2360 1428 808\n", 419 | "Very Good 1513 2400 2164 2299 1824 1204 678" 420 | ] 421 | }, 422 | "execution_count": 9, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "diamonds >> crosstab(X.cut, X.color)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "### Case #2: A function that works with symbolic arguments\n", 436 | "---\n", 437 | "\n", 438 | "Many tasks are simpler and do not require the capacity to work as a pipe function. The `dfply` window functions are the common examples of this: functions that take a Series (or _symbolic_ Series) and return a modified version.\n", 439 | "\n", 440 | "Let's say we had a dataframe with dates represented by strings that we wanted to convert to pandas datetime objects using the `pd.to_datetime` function. Below is a tiny example dataframe with this issue." 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 10, 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "data": { 450 | "text/html": [ 451 | "
\n", 452 | "\n", 465 | "\n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | "
datesales
07/10/171220
17/11/171592
27/12/17908
37/13/171102
47/14/171395
\n", 501 | "
" 502 | ], 503 | "text/plain": [ 504 | " date sales\n", 505 | "0 7/10/17 1220\n", 506 | "1 7/11/17 1592\n", 507 | "2 7/12/17 908\n", 508 | "3 7/13/17 1102\n", 509 | "4 7/14/17 1395" 510 | ] 511 | }, 512 | "execution_count": 10, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "sales = pd.DataFrame(dict(date=['7/10/17','7/11/17','7/12/17','7/13/17','7/14/17'],\n", 519 | " sales=[1220, 1592, 908, 1102, 1395]))\n", 520 | "sales" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 11, 526 | "metadata": {}, 527 | "outputs": [ 528 | { 529 | "data": { 530 | "text/plain": [ 531 | "date object\n", 532 | "sales int64\n", 533 | "dtype: object" 534 | ] 535 | }, 536 | "execution_count": 11, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "sales.dtypes" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "In pandas we would use the `pd.to_datetime` function to convert the strings to date objects, and add it as a new column like so:" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 12, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/html": [ 560 | "
\n", 561 | "\n", 574 | "\n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | "
datesalespd_date
07/10/1712202017-07-10
17/11/1715922017-07-11
27/12/179082017-07-12
37/13/1711022017-07-13
47/14/1713952017-07-14
\n", 616 | "
" 617 | ], 618 | "text/plain": [ 619 | " date sales pd_date\n", 620 | "0 7/10/17 1220 2017-07-10\n", 621 | "1 7/11/17 1592 2017-07-11\n", 622 | "2 7/12/17 908 2017-07-12\n", 623 | "3 7/13/17 1102 2017-07-13\n", 624 | "4 7/14/17 1395 2017-07-14" 625 | ] 626 | }, 627 | "execution_count": 12, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | "sales['pd_date'] = pd.to_datetime(sales['date'], infer_datetime_format=True)\n", 634 | "sales" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 13, 640 | "metadata": { 641 | "collapsed": true 642 | }, 643 | "outputs": [], 644 | "source": [ 645 | "sales.drop('pd_date', axis=1, inplace=True)" 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": {}, 651 | "source": [ 652 | "What if you tried to use the `pd.to_datetime` function inside of a call to mutate, like so?\n", 653 | "\n", 654 | "```python\n", 655 | "sales >> mutate(pd_date=pd.to_datetime(X.date, infer_datetime_format=True))\n", 656 | "```\n", 657 | "\n", 658 | "This will unfortunately break. The `dfply` functions are special in that they \"know\" to delay their evaluation until the data is at that point in the chain. `pd.to_datetime` is not such a function, and will immediately try to evaluate `X.date`. With a symbolic `Intention` argument passed in, the function will fail as it does not know what to do with that.\n", 659 | "\n", 660 | "Instead, we will need to make a wrapper around `pd.to_datetime` that can handle these symbolic arguments and delay evaluation until the right time. \n", 661 | "\n", 662 | "This is quite simple: all you need to do is decorate a function with the `@make_symbolic` decorator, like so:" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 14, 668 | "metadata": { 669 | "collapsed": true 670 | }, 671 | "outputs": [], 672 | "source": [ 673 | "@make_symbolic\n", 674 | "def to_datetime(series, infer_datetime_format=True):\n", 675 | " return pd.to_datetime(series, infer_datetime_format=infer_datetime_format)" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 15, 681 | "metadata": {}, 682 | "outputs": [ 683 | { 684 | "data": { 685 | "text/html": [ 686 | "
\n", 687 | "\n", 700 | "\n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | "
datesalespd_date
07/10/1712202017-07-10
17/11/1715922017-07-11
27/12/179082017-07-12
37/13/1711022017-07-13
47/14/1713952017-07-14
\n", 742 | "
" 743 | ], 744 | "text/plain": [ 745 | " date sales pd_date\n", 746 | "0 7/10/17 1220 2017-07-10\n", 747 | "1 7/11/17 1592 2017-07-11\n", 748 | "2 7/12/17 908 2017-07-12\n", 749 | "3 7/13/17 1102 2017-07-13\n", 750 | "4 7/14/17 1395 2017-07-14" 751 | ] 752 | }, 753 | "execution_count": 15, 754 | "metadata": {}, 755 | "output_type": "execute_result" 756 | } 757 | ], 758 | "source": [ 759 | "sales >> mutate(pd_date=to_datetime(X.date))" 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "metadata": {}, 765 | "source": [ 766 | "And there you go. Able to delay the evaluation.\n", 767 | "\n", 768 | "What's particularly nice about the `@make_symbolic` decorator is that it has no trouble working with non-symbolic arguments too. If we were to pass in the series itself the function evaluates without a problem:" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 16, 774 | "metadata": {}, 775 | "outputs": [ 776 | { 777 | "data": { 778 | "text/plain": [ 779 | "0 2017-07-10\n", 780 | "1 2017-07-11\n", 781 | "2 2017-07-12\n", 782 | "3 2017-07-13\n", 783 | "4 2017-07-14\n", 784 | "Name: date, dtype: datetime64[ns]" 785 | ] 786 | }, 787 | "execution_count": 16, 788 | "metadata": {}, 789 | "output_type": "execute_result" 790 | } 791 | ], 792 | "source": [ 793 | "to_datetime(sales.date)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "Keep in mind, though, that if _any_ of the arguments or keyword arguments are symbolic `Intention` objects, the return will itself be an `Intention` object representing the function awaiting evaluation by a dataframe:" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": 17, 806 | "metadata": {}, 807 | "outputs": [ 808 | { 809 | "data": { 810 | "text/plain": [ 811 | "" 812 | ] 813 | }, 814 | "execution_count": 17, 815 | "metadata": {}, 816 | "output_type": "execute_result" 817 | } 818 | ], 819 | "source": [ 820 | "to_datetime(X.date)" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 19, 826 | "metadata": {}, 827 | "outputs": [ 828 | { 829 | "data": { 830 | "text/plain": [ 831 | "0 2017-07-10\n", 832 | "1 2017-07-11\n", 833 | "2 2017-07-12\n", 834 | "3 2017-07-13\n", 835 | "4 2017-07-14\n", 836 | "Name: date, dtype: datetime64[ns]" 837 | ] 838 | }, 839 | "execution_count": 19, 840 | "metadata": {}, 841 | "output_type": "execute_result" 842 | } 843 | ], 844 | "source": [ 845 | "awaiting = to_datetime(X.date)\n", 846 | "awaiting.evaluate(sales)" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": { 853 | "collapsed": true 854 | }, 855 | "outputs": [], 856 | "source": [] 857 | } 858 | ], 859 | "metadata": { 860 | "kernelspec": { 861 | "display_name": "Python 3", 862 | "language": "python", 863 | "name": "python3" 864 | }, 865 | "language_info": { 866 | "codemirror_mode": { 867 | "name": "ipython", 868 | "version": 3 869 | }, 870 | "file_extension": ".py", 871 | "mimetype": "text/x-python", 872 | "name": "python", 873 | "nbconvert_exporter": "python", 874 | "pygments_lexer": "ipython3", 875 | "version": "3.6.1" 876 | } 877 | }, 878 | "nbformat": 4, 879 | "nbformat_minor": 2 880 | } 881 | --------------------------------------------------------------------------------