├── test
│   ├── __init__.py
│   ├── test_group.py
│   ├── test_base.py
│   ├── test_summarize.py
│   ├── test_transform.py
│   ├── test_vector.py
│   ├── test_join.py
│   ├── test_subset.py
│   ├── test_reshape.py
│   ├── test_window_functions.py
│   ├── test_select.py
│   └── test_summary_functions.py
├── dfply.egg-info
│   ├── dependency_links.txt
│   ├── requires.txt
│   ├── top_level.txt
│   ├── PKG-INFO
│   └── SOURCES.txt
├── requirements.txt
├── MANIFEST.in
├── .travis.yml~
├── dfply
│   ├── data
│   │   └── __init__.py
│   ├── group.py
│   ├── __init__.py
│   ├── summarize.py
│   ├── subset.py
│   ├── transform.py
│   ├── summary_functions.py
│   ├── select.py
│   ├── window_functions.py
│   ├── set_ops.py
│   ├── vector.py
│   ├── join.py
│   ├── base.py
│   └── reshape.py
├── .gitignore
├── setup.py
├── .travis.yml
├── RELEASES.txt
└── examples
    ├── basics-extending-functionality.ipynb
    └── .ipynb_checkpoints
        └── basics-extending-functionality-checkpoint.ipynb
/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dfply.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dfply.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | -------------------------------------------------------------------------------- /dfply.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | dfply 2 | test 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.11.1 2 | pandas>=0.18.1 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the license file 2 | include LICENSE.md 3 | -------------------------------------------------------------------------------- /.travis.yml~: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.6 4 | - 2.7 5 | install: 6 | - pip install .
7 | - pip install -r requirements.txt 8 | script: python -m pytest test/ 9 | -------------------------------------------------------------------------------- /dfply/data/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | root = os.path.abspath(os.path.dirname(__file__)) 5 | diamonds = pd.read_csv(os.path.join(root, "diamonds.csv")) 6 | -------------------------------------------------------------------------------- /dfply/group.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | 3 | 4 | @pipe 5 | @symbolic_evaluation(eval_as_label=True) 6 | def group_by(df, *args): 7 | df._grouped_by = list(args) 8 | return df 9 | 10 | 11 | @pipe 12 | def ungroup(df): 13 | df._grouped_by = None 14 | return df 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Unit test / coverage reports 7 | .cache 8 | 9 | # Annoying Mac File 10 | .DS_Store 11 | 12 | # workbook test files 13 | test/feature_workbook.ipynb 14 | test/.ipynb_checkpoints/* 15 | test/worksheet.py 16 | 17 | # distribution 18 | dist 19 | build 20 | 21 | # egg_info 22 | dfply.egg-info 23 | -------------------------------------------------------------------------------- /test/test_group.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | ##============================================================================== 6 | ## grouping test functions 7 | ##============================================================================== 8 | 9 | 10 | def test_group_attributes(): 11 | d = diamonds >> group_by('cut') 12 | assert hasattr(d, '_grouped_by') 13 | assert d._grouped_by == ['cut',] 14 | -------------------------------------------------------------------------------- /dfply.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: dfply 3 | Version: 0.3.0 4 | Summary: dplyr-style piping operations for pandas dataframes 5 | Home-page: https://github.com/kieferk/dfply 6 | Author: Kiefer Katovich 7 | Author-email: kiefer.katovich@gmail.com 8 | License: GNU General Public License v3.0 9 | Description: See https://github.com/kieferk/dfply/blob/master/README.md for details.
10 | Keywords: pandas dplyr 11 | Platform: UNKNOWN 12 | -------------------------------------------------------------------------------- /dfply/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .group import * 3 | from .join import * 4 | from .reshape import * 5 | from .select import * 6 | from .set_ops import * 7 | from .subset import * 8 | from .summarize import * 9 | from .transform import * 10 | from .data import diamonds 11 | from .summary_functions import * 12 | from .window_functions import * 13 | from .vector import * 14 | 15 | for verb in dir(): 16 | if 'ize' in verb: 17 | exec(verb.replace('ize', 'ise') + '=' + verb) 18 | -------------------------------------------------------------------------------- /test/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | 6 | ##============================================================================== 7 | ## pipe tests 8 | ##============================================================================== 9 | 10 | @dfpipe 11 | def blank_function(df): 12 | return df 13 | 14 | 15 | def test_pipe(): 16 | d = diamonds >> blank_function() 17 | assert diamonds.equals(d) 18 | d = diamonds >> blank_function() >> blank_function() 19 | assert diamonds.equals(d) 20 | 21 | 22 | def test_inplace_pipe(): 23 | df = diamonds[['price','carat']].head(5) 24 | d = diamonds.copy() 25 | d >>= select(X.price, X.carat) >> head(5) 26 | print(df) 27 | print(d) 28 | assert df.equals(d) 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name = 'dfply', 5 | version = '0.3.3', 6 | author = 'Kiefer Katovich', 7 | author_email = 'kiefer.katovich@gmail.com', 8 | keywords = 'pandas dplyr', 9 | packages = find_packages(), 10 | include_package_data=True, 11 | package_data={'dfply': ['data/diamonds.csv']}, 12 | package_dir={'dfply':'dfply'}, 13 | install_requires=['numpy', 'pandas'], 14 | description = 'dplyr-style piping operations for pandas dataframes', 15 | long_description = 'See https://github.com/kieferk/dfply/blob/master/README.md for details.', 16 | license = 'GNU General Public License v3.0', 17 | url = 'https://github.com/kieferk/dfply', 18 | test_suite='test', 19 | ) 20 | -------------------------------------------------------------------------------- /dfply/summarize.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | 3 | 4 | @dfpipe 5 | def summarize(df, **kwargs): 6 | return pd.DataFrame({k: [v] for k, v in kwargs.items()}) 7 | 8 | 9 | @dfpipe 10 | def summarize_each(df, functions, *args): 11 | columns, values = [], [] 12 | for arg in args: 13 | if isinstance(arg, pd.Series): 14 | varname = arg.name 15 | col = arg 16 | elif isinstance(arg, str): 17 | varname = arg 18 | col = df[varname] 19 | elif isinstance(arg, int): 20 | varname = df.columns[arg] 21 | col = df.iloc[:, arg] 22 | 23 | for f in functions: 24 | fname = f.__name__ 25 | columns.append('_'.join([varname, fname])) 26 | values.append(f(col)) 27 | 28 | return pd.DataFrame([values], columns=columns) 29 | -------------------------------------------------------------------------------- /dfply.egg-info/SOURCES.txt:
-------------------------------------------------------------------------------- 1 | LICENSE.md 2 | MANIFEST.in 3 | setup.py 4 | dfply/__init__.py 5 | dfply/base.py 6 | dfply/group.py 7 | dfply/join.py 8 | dfply/reshape.py 9 | dfply/select.py 10 | dfply/set_ops.py 11 | dfply/subset.py 12 | dfply/summarize.py 13 | dfply/summary_functions.py 14 | dfply/transform.py 15 | dfply/vector.py 16 | dfply/window_functions.py 17 | dfply.egg-info/PKG-INFO 18 | dfply.egg-info/SOURCES.txt 19 | dfply.egg-info/dependency_links.txt 20 | dfply.egg-info/requires.txt 21 | dfply.egg-info/top_level.txt 22 | dfply/data/__init__.py 23 | test/__init__.py 24 | test/test_base.py 25 | test/test_group.py 26 | test/test_join.py 27 | test/test_reshape.py 28 | test/test_select.py 29 | test/test_subset.py 30 | test/test_summarize.py 31 | test/test_summary_functions.py 32 | test/test_transform.py 33 | test/test_vector.py 34 | test/test_window_functions.py -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.6' 4 | - '3.7' 5 | install: 6 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 7 | - bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda 8 | - export PATH="$HOME/miniconda/bin:$PATH" 9 | - hash -r 10 | - conda config --set always_yes yes --set changeps1 no 11 | - conda update -q conda 12 | - conda info -a 13 | - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION pip pytest numpy 14 | pandas>=0.18.1 15 | - source activate test-environment 16 | - python setup.py install 17 | script: python -m pytest test/ 18 | deploy: 19 | provider: pypi 20 | user: TokenNobody 21 | password: 22 | secure: XazMtrRpb6i/jtdeBIDV5mWZNQr2dPlmspgF/qqt9AbZRCu/Y28DaI/12KGSFgVJc2lzREp+cxKNq60bDT8mB3t0+YtYeHsmQXawInyXAFACfmRI5/nigiYLMhQ1OV/RHtXQcXeHJF1MbKeF2WjWdBKh9m9cBi5NVxGot/knGOALkwyiPG4Ykf5fVD4bCeJTkdrBav/XLYqYPntpw6GT0PA8yvt3E1lQfL+uTV8+ZcwsqXh8ebWNI0aU86lurE6b1cJn6xpTZYzSqiJqHuikCZC7alqd311kpm/sKuHMb2V9tKiHiJFN7fcKfdaVuAjQE22Tc7R7uC2ph9tBvL8xHnzi48Wj9Ri5QYLATN2u28d3rkCS+zN+tC3MT9bjDcyuqPdbx3Sx5bFJC6P0HFcof5lpnan80TW4VQSM2GV8rqwPgm0kLi0k/DG5yvRWecNdlvvCDZ5e6M9eiOcer9guimDYITtQCfuiUZLUbzgw+u7QE3jY9Exnv7Ekdi150Zd+ubPS+yU1ZG5tgB2ijw7n2bTxEy77d6Zm0quDnQ6gVBi7STp2si3397TTQH/nV+eaX51VOxTufZXDW0eiaVoRhH32xUllhFeAzJSezAVJ0WuLEuSLXGkxep7VNofK0Kyjxg4S2ED41lV7LtucdQe7L/LlGTfmYCgzSDaDW98CqIM= 23 | on: 24 | tags: true 25 | distributions: sdist bdist_wheel 26 | repo: kieferk/dfply 27 | branch: master 28 | -------------------------------------------------------------------------------- /dfply/subset.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | import warnings 3 | import numpy as np 4 | 5 | 6 | # ------------------------------------------------------------------------------ 7 | # `head` and `tail` 8 | # ------------------------------------------------------------------------------ 9 | 10 | @dfpipe 11 | def head(df, n=5): 12 | return df.head(n) 13 | 14 | 15 | @dfpipe 16 | def tail(df, n=5): 17 | return df.tail(n) 18 | 19 | 20 | # ------------------------------------------------------------------------------ 21 | # Sampling 22 | # ------------------------------------------------------------------------------ 23 | 24 | 25 | @dfpipe 26 | def sample(df, *args, **kwargs): 27 | return df.sample(*args, **kwargs) 28 | 29 | 30 | @pipe 31 | @group_delegation 32 | 
@symbolic_evaluation(eval_as_label=['*']) 33 | def distinct(df, *args, **kwargs): 34 | if not args: 35 | return df.drop_duplicates(**kwargs) 36 | return df.drop_duplicates(list(args), **kwargs) 37 | 38 | 39 | @dfpipe 40 | def row_slice(df, indices): 41 | if isinstance(indices, (tuple, list)): 42 | indices = np.array(indices) 43 | if isinstance(indices, int): 44 | indices = np.array([indices]) 45 | if isinstance(indices, pd.Series): 46 | indices = indices.values 47 | 48 | if indices.dtype == bool: 49 | return df.loc[indices, :] 50 | else: 51 | return df.iloc[indices, :] 52 | 53 | 54 | # ------------------------------------------------------------------------------ 55 | # Filtering/masking 56 | # ------------------------------------------------------------------------------ 57 | 58 | @dfpipe 59 | def mask(df, *args): 60 | mask = pd.Series(np.ones(df.shape[0], dtype=bool)) 61 | for arg in args: 62 | if arg.dtype != bool: 63 | raise Exception("Arguments must be boolean.") 64 | mask = mask & arg.reset_index(drop=True) 65 | return df[mask.values] 66 | 67 | 68 | filter_by = mask # alias for mask() 69 | 70 | 71 | @dfpipe 72 | def top_n(df, n=None, ascending=True, col=None): 73 | if not n: 74 | raise ValueError('n must be specified') 75 | if not isinstance(col, pd.Series): 76 | col = df.columns[-1] 77 | else: 78 | col = col._name 79 | index = df[[col]].copy() 80 | index['ranks'] = index[col].rank(ascending=ascending) 81 | index = index[index['ranks'] >= index['ranks'].nlargest(n).min()] 82 | return df.reindex(index.index) 83 | 84 | 85 | @dfpipe 86 | def pull(df, column=-1): 87 | return df.iloc[:, column] if isinstance(column, int) else df.loc[:, column] 88 | -------------------------------------------------------------------------------- /test/test_summarize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | 6 | ##============================================================================== 7 | ## summarization test functions 8 | ##============================================================================== 9 | 10 | def test_summarize(): 11 | p = pd.DataFrame({ 12 | 'price_mean':[diamonds.price.mean()], 13 | 'price_std':[diamonds.price.std()] 14 | }) 15 | assert p.equals(diamonds >> summarize(price_mean=X.price.mean(), 16 | price_std=X.price.std())) 17 | 18 | pcut = pd.DataFrame({ 19 | 'cut':['Fair','Good','Ideal','Premium','Very Good'] 20 | }) 21 | pcut['price_mean'] = [diamonds[diamonds.cut == c].price.mean() for c in pcut.cut.values] 22 | pcut['price_std'] = [diamonds[diamonds.cut == c].price.std() for c in pcut.cut.values] 23 | assert pcut.equals(diamonds >> group_by('cut') >> 24 | summarize(price_mean=X.price.mean(), price_std=X.price.std())) 25 | 26 | 27 | def test_summarize_each(): 28 | to_match = pd.DataFrame({ 29 | 'price_mean':[np.mean(diamonds.price)], 30 | 'price_var':[np.var(diamonds.price)], 31 | 'depth_mean':[np.mean(diamonds.depth)], 32 | 'depth_var':[np.var(diamonds.depth)] 33 | }) 34 | to_match = to_match[['price_mean','price_var','depth_mean','depth_var']] 35 | 36 | test1 = diamonds >> summarize_each([np.mean, np.var], X.price, 4) 37 | test2 = diamonds >> summarize_each([np.mean, np.var], X.price, 'depth') 38 | assert to_match.equals(test1) 39 | assert to_match.equals(test2) 40 | 41 | group = pd.DataFrame({ 42 | 'cut':['Fair','Good','Ideal','Premium','Very Good'] 43 | }) 44 | group['price_mean'] = [np.mean(diamonds[diamonds.cut == c].price) for c in group.cut.values] 45 | group['price_var'] = [np.var(diamonds[diamonds.cut == c].price) for c
in group.cut.values] 46 | group['depth_mean'] = [np.mean(diamonds[diamonds.cut == c].depth) for c in group.cut.values] 47 | group['depth_var'] = [np.var(diamonds[diamonds.cut == c].depth) for c in group.cut.values] 48 | 49 | group = group[['cut','price_mean','price_var','depth_mean','depth_var']] 50 | 51 | test1 = (diamonds >> group_by(X.cut) >> 52 | summarize_each([np.mean, np.var], X.price, 4)) 53 | test2 = (diamonds >> group_by('cut') >> 54 | summarize_each([np.mean, np.var], X.price, 'depth')) 55 | 56 | assert group.equals(test1) 57 | assert group.equals(test2) 58 | -------------------------------------------------------------------------------- /test/test_transform.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | 6 | ##============================================================================== 7 | ## transform test functions 8 | ##============================================================================== 9 | 10 | def test_mutate(): 11 | df = diamonds.copy() 12 | df['testcol'] = 1 13 | assert df.equals(diamonds >> mutate(testcol=1)) 14 | df['testcol'] = df['x'] 15 | assert df.equals(diamonds >> mutate(testcol=X.x)) 16 | df['testcol'] = df['x'] * df['y'] 17 | assert df.equals(diamonds >> mutate(testcol=X.x * X.y)) 18 | df['testcol'] = df['x'].mean() 19 | assert df.equals(diamonds >> mutate(testcol=np.mean(X.x))) 20 | 21 | 22 | def group_mutate_helper(df): 23 | df['testcol'] = df['x']*df.shape[0] 24 | return df 25 | 26 | 27 | def test_group_mutate(): 28 | df = diamonds.copy() 29 | df = df.groupby('cut').apply(group_mutate_helper) 30 | d = diamonds >> group_by('cut') >> mutate(testcol=X.x*X.shape[0]) >> ungroup() 31 | assert df.equals(d.sort_index()) 32 | 33 | 34 | def test_transmute(): 35 | df = diamonds.copy() 36 | df['testcol'] = df['x'] * df['y'] 37 | df = df[['testcol']] 38 | assert df.equals(diamonds >> transmute(testcol=X.x * X.y)) 39 | 40 | 41 | def test_group_transmute(): 42 | df = diamonds.copy() 43 | df = df.groupby('cut').apply(group_mutate_helper).reset_index(drop=True) 44 | df = df[['cut','testcol']] 45 | d = diamonds >> group_by('cut') >> transmute(testcol=X.x*X.shape[0]) 46 | print(d.head()) 47 | print(df.head()) 48 | assert df.equals(d.sort_index()) 49 | 50 | 51 | def test_mutate_if(): 52 | df = diamonds.copy() 53 | for col in df: 54 | try: 55 | if max(df[col]) < 10: 56 | df[col] *= 2 57 | except: 58 | pass 59 | assert df.equals(diamonds >> mutate_if(lambda col: max(col) < 10, lambda row: row * 2)) 60 | df = diamonds.copy() 61 | for col in df: 62 | try: 63 | if any(df[col].str.contains('.')): 64 | df[col] = df[col].str.lower() 65 | except: 66 | pass 67 | assert df.equals(diamonds >> mutate_if(lambda col: any(col.str.contains('.')), lambda row: row.str.lower())) 68 | df = diamonds.copy() 69 | for col in df: 70 | try: 71 | if min(df[col]) < 1 and mean(df[col]) < 4: 72 | df[col] *= -1 73 | except: 74 | pass 75 | assert df.equals(diamonds >> mutate_if(lambda col: min(col) < 1 and mean(col) < 4, lambda row: -row)) 76 | -------------------------------------------------------------------------------- /dfply/transform.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | 3 | 4 | @dfpipe 5 | def mutate(df, **kwargs): 6 | """ 7 | Creates new variables (columns) in the DataFrame specified by keyword 8 | argument pairs, where the key is the column name and the value is the 9 | new column value(s). 
10 | 11 | Args: 12 | df (pandas.DataFrame): data passed in through the pipe. 13 | 14 | Kwargs: 15 | **kwargs: keys are the names of the new columns, values indicate 16 | what the new column values will be. 17 | 18 | Example: 19 | diamonds >> mutate(x_plus_y=X.x + X.y) >> select_from('x') >> head(3) 20 | 21 | x y z x_plus_y 22 | 0 3.95 3.98 2.43 7.93 23 | 1 3.89 3.84 2.31 7.73 24 | 2 4.05 4.07 2.31 8.12 25 | """ 26 | 27 | return df.assign(**kwargs) 28 | 29 | 30 | @dfpipe 31 | def mutate_if(df, predicate, fun): 32 | """ 33 | Modifies columns in place if the specified predicate is true. 34 | Args: 35 | df (pandas.DataFrame): data passed in through the pipe. 36 | predicate: a function applied to columns that returns a boolean value 37 | fun: a function that will be applied to columns where predicate returns True 38 | 39 | Example: 40 | diamonds >> mutate_if(lambda col: min(col) < 1 and mean(col) < 4, lambda row: 2 * row) >> head(3) 41 | carat cut color clarity depth table price x y z 42 | 0 0.46 Ideal E SI2 61.5 55.0 326 3.95 3.98 4.86 43 | 1 0.42 Premium E SI1 59.8 61.0 326 3.89 3.84 4.62 44 | 2 0.46 Good E VS1 56.9 65.0 327 4.05 4.07 4.62 45 | (columns 'carat' and 'z', both having a min < 1 and mean < 4, are doubled, while the 46 | other columns remain as they were) 47 | """ 48 | cols = list() 49 | for col in df: 50 | try: 51 | if predicate(df[col]): 52 | cols.append(col) 53 | except: 54 | pass 55 | df[cols] = df[cols].apply(fun) 56 | return df 57 | 58 | # df2 = df.copy() 59 | # df2[cols] = df2[cols].apply(fun) 60 | # return df2 61 | 62 | 63 | @dfpipe 64 | def transmute(df, *keep_columns, **kwargs): 65 | """ 66 | Creates columns and then returns those new columns and optionally specified 67 | original columns from the DataFrame. 68 | 69 | This works like `mutate`, but is designed to discard the original columns used 70 | to create the new ones. 71 | 72 | Args: 73 | *keep_columns: Column labels to keep. Can be string, symbolic, or 74 | integer position. 75 | 76 | Kwargs: 77 | **kwargs: keys are the names of the new columns, values indicate 78 | what the new column values will be.
79 | 80 | Example: 81 | diamonds >> transmute(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> head(3) 82 | 83 | y_div_z x_plus_y 84 | 0 1.637860 7.93 85 | 1 1.662338 7.73 86 | 2 1.761905 8.12 87 | """ 88 | 89 | keep_cols = [] 90 | for col in flatten(keep_columns): 91 | try: 92 | keep_cols.append(col.name) 93 | except: 94 | if isinstance(col, str): 95 | keep_cols.append(col) 96 | elif isinstance(col, int): 97 | keep_cols.append(df.columns[col]) 98 | 99 | df = df.assign(**kwargs) 100 | columns = [k for k in kwargs.keys()] + list(keep_cols) 101 | return df[columns] 102 | -------------------------------------------------------------------------------- /test/test_vector.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dfply import * 3 | 4 | ##============================================================================== 5 | ## desc, order by tests 6 | ##============================================================================== 7 | 8 | def test_desc(): 9 | 10 | df = diamonds >> select(X.cut, X.x) >> head(10) 11 | t = df >> summarize(last=nth(X.x, -1, order_by=[desc(X.cut), desc(X.x)])) 12 | 13 | series_num = pd.Series([4,1,3,2]) 14 | series_bool = pd.Series([True,False,True,False]) 15 | series_str = pd.Series(['d','a','c','b']) 16 | 17 | num_truth = series_num.rank(method='min',ascending=False) 18 | bool_truth = series_bool.rank(method='min',ascending=False) 19 | str_truth = series_str.rank(method='min',ascending=False) 20 | 21 | assert desc(series_num).equals(num_truth) 22 | assert desc(series_bool).equals(bool_truth) 23 | assert desc(series_str).equals(str_truth) 24 | 25 | 26 | def test_order_series_by(): 27 | series = pd.Series([1,2,3,4,5,6,7,8]) 28 | order1 = pd.Series(['A','B','A','B','A','B','A','B']) 29 | ordered1 = order_series_by(series, order1).reset_index(drop=True) 30 | true1 = pd.Series([1,3,5,7,2,4,6,8]) 31 | assert ordered1.equals(true1) 32 | 33 | order2 = pd.Series([2,2,2,2,1,1,1,1]) 34 | ordered2 = order_series_by(series, [order1, order2]).reset_index(drop=True) 35 | true2 = pd.Series([5,7,1,3,6,8,2,4]) 36 | assert ordered2.equals(true2) 37 | 38 | 39 | ##============================================================================== 40 | ## coalesce test 41 | ##============================================================================== 42 | 43 | def test_coalesce(): 44 | df = pd.DataFrame({ 45 | 'a':[1,np.nan,np.nan,np.nan,np.nan], 46 | 'b':[2,3,np.nan,np.nan,np.nan], 47 | 'c':[np.nan,np.nan,4,5,np.nan], 48 | 'd':[6,7,8,9,np.nan] 49 | }) 50 | truth_df = df.assign(coal=[1,3,4,5,np.nan]) 51 | d = df >> mutate(coal=coalesce(X.a, X.b, X.c, X.d)) 52 | assert truth_df.equals(d) 53 | 54 | 55 | ##============================================================================== 56 | ## case_when test 57 | ##============================================================================== 58 | 59 | def test_case_when(): 60 | df = pd.DataFrame({ 61 | 'num':np.arange(31) 62 | }) 63 | df_truth = df.assign(strnum=['fizzbuzz' if (i % 15 == 0) else 64 | 'fizz' if (i % 3 == 0) else 65 | 'buzz' if (i % 5 == 0) else 66 | str(i) for i in np.arange(31)]) 67 | d = df >> mutate(strnum=case_when([X.num % 15 == 0, 'fizzbuzz'], 68 | [X.num % 3 == 0, 'fizz'], 69 | [X.num % 5 == 0, 'buzz'], 70 | [True, X.num.astype(str)])) 71 | print(df_truth) 72 | print(d) 73 | assert df_truth.equals(d) 74 | 75 | 76 | ##============================================================================== 77 | ## if_else test 78 | 
##============================================================================== 79 | 80 | def test_if_else(): 81 | df = pd.DataFrame({ 82 | 'a':[1,2,3,4,5,6,7,8,9] 83 | }) 84 | b_truth = ['odd','even','odd','even','odd','even','odd','even','odd'] 85 | d = df >> mutate(b=if_else(X.a % 2 == 0, 'even', 'odd')) 86 | assert d.equals(df.assign(b=b_truth)) 87 | 88 | df = pd.DataFrame({ 89 | 'a':[0,0,0,1,1,1,2,2,2] 90 | }) 91 | b_truth = [5,5,5,5,5,5,9,9,9] 92 | d = df >> mutate(b=if_else(X.a < 2, [5,5,5,5,5,5,5,5,5], [9,9,9,9,9,9,9,9,9])) 93 | assert d.equals(df.assign(b=b_truth)) 94 | 95 | 96 | ##============================================================================== 97 | ## na_if test 98 | ##============================================================================== 99 | 100 | def test_na_if(): 101 | df = pd.DataFrame({ 102 | 'a':[1,2,3,4,5] 103 | }) 104 | d = df >> mutate(b=na_if(X.a, 3), c=na_if(X.a,1,2,3)) 105 | d = d[['a','b','c']] 106 | df_true = df.assign(b=[1,2,np.nan,4,5], c=[np.nan,np.nan,np.nan,4,5]) 107 | assert df_true.equals(d) 108 | -------------------------------------------------------------------------------- /test/test_join.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | ##============================================================================== 6 | ## join test functions 7 | ##============================================================================== 8 | 9 | @pytest.fixture(scope='module') 10 | def dfA(): 11 | a = pd.DataFrame({ 12 | 'x1':['A','B','C'], 13 | 'x2':[1,2,3] 14 | }) 15 | return a 16 | 17 | 18 | @pytest.fixture(scope='module') 19 | def dfB(): 20 | b = pd.DataFrame({ 21 | 'x1':['A','B','D'], 22 | 'x3':[True,False,True] 23 | }) 24 | return b 25 | 26 | @pytest.fixture(scope='module') 27 | def dfC(): 28 | c = pd.DataFrame({ 29 | 'x1':['B','C','D'], 30 | 'x2':[2,3,4] 31 | }) 32 | return c 33 | 34 | 35 | def test_inner_join(dfA, dfB): 36 | ab = pd.DataFrame({ 37 | 'x1':['A','B'], 38 | 'x2':[1,2], 39 | 'x3':[True, False] 40 | }) 41 | 42 | c = dfA >> inner_join(dfB, by='x1') 43 | assert c.equals(ab) 44 | 45 | 46 | def test_outer_join(dfA, dfB): 47 | ab = pd.DataFrame({ 48 | 'x1':['A','B','C','D'], 49 | 'x2':[1,2,3,np.nan], 50 | 'x3':[True, False,np.nan,True] 51 | }) 52 | 53 | c = dfA >> outer_join(dfB, by='x1') 54 | assert c.equals(ab) 55 | c = dfA >> full_join(dfB, by='x1') 56 | assert c.equals(ab) 57 | 58 | 59 | def test_left_join(dfA, dfB): 60 | ab = pd.DataFrame({ 61 | 'x1':['A','B','C'], 62 | 'x2':[1,2,3], 63 | 'x3':[True, False, np.nan] 64 | }) 65 | 66 | c = dfA >> left_join(dfB, by='x1') 67 | assert c.equals(ab) 68 | 69 | 70 | def test_right_join(dfA, dfB): 71 | ab = pd.DataFrame({ 72 | 'x1':['A','B','D'], 73 | 'x2':[1,2,np.nan], 74 | 'x3':[True, False, True] 75 | }) 76 | 77 | c = dfA >> right_join(dfB, by='x1') 78 | assert c.equals(ab) 79 | 80 | def test_semi_join(dfA, dfB): 81 | ab = pd.DataFrame({ 82 | 'x1':['A', 'B'], 83 | 'x2':[1, 2] 84 | }) 85 | 86 | c = dfA >> semi_join(dfB, by='x1') 87 | assert c.equals(ab) 88 | 89 | 90 | def test_anti_join(dfA, dfB): 91 | ab = pd.DataFrame({ 92 | 'x1':['C'], 93 | 'x2':[3] 94 | }, index=[2]) 95 | 96 | c = dfA >> anti_join(dfB, by='x1') 97 | assert c.equals(ab) 98 | 99 | 100 | ##============================================================================== 101 | ## set operation (row join) test functions 102 | ##============================================================================== 103 | 104 | def test_union(dfA,
dfC): 105 | ac = pd.DataFrame({ 106 | 'x1': ['A', 'B', 'C', 'D'], 107 | 'x2': [1, 2, 3, 4] 108 | }, index=[0, 1, 2, 2]) 109 | 110 | d = dfA >> union(dfC) 111 | assert d.equals(ac) 112 | 113 | 114 | def test_intersect(dfA, dfC): 115 | ac = pd.DataFrame({ 116 | 'x1': ['B', 'C'], 117 | 'x2': [2, 3] 118 | }) 119 | 120 | d = dfA >> intersect(dfC) 121 | assert d.equals(ac) 122 | 123 | 124 | def test_set_diff(dfA, dfC): 125 | ac = pd.DataFrame({ 126 | 'x1': ['A'], 127 | 'x2': [1] 128 | }) 129 | 130 | d = dfA >> set_diff(dfC) 131 | assert d.equals(ac) 132 | 133 | 134 | ##============================================================================== 135 | ## bind rows, cols 136 | ##============================================================================== 137 | 138 | def test_bind_rows(dfA, dfB): 139 | inner = pd.DataFrame({ 140 | 'x1':['A','B','C','A','B','D'] 141 | }) 142 | outer = pd.DataFrame({ 143 | 'x1':['A','B','C','A','B','D'], 144 | 'x2':[1,2,3,np.nan,np.nan,np.nan], 145 | 'x3':[np.nan,np.nan,np.nan,True,False,True] 146 | }) 147 | ab_inner = dfA >> bind_rows(dfB, join='inner') 148 | ab_outer = dfA >> bind_rows(dfB, join='outer') 149 | assert inner.equals(ab_inner.reset_index(drop=True)) 150 | assert outer.equals(ab_outer.reset_index(drop=True)) 151 | 152 | 153 | def test_bind_cols(dfA, dfB): 154 | dfB.columns = ['x3','x4'] 155 | df = pd.DataFrame({ 156 | 'x1':['A','B','C'], 157 | 'x2':[1,2,3], 158 | 'x3':['A','B','D'], 159 | 'x4':[True,False,True] 160 | }) 161 | d = dfA >> bind_cols(dfB) 162 | assert df.equals(d) 163 | -------------------------------------------------------------------------------- /test/test_subset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | 6 | ##============================================================================== 7 | ## subset test functions 8 | ##============================================================================== 9 | 10 | def test_head(): 11 | df = diamonds.head(2) 12 | d = diamonds >> head(2) 13 | assert df.equals(d) 14 | 15 | 16 | def test_grouped_head(): 17 | df = diamonds.groupby(['cut','color']).apply(lambda x: x.head(2)).reset_index(drop=True) 18 | d = diamonds >> group_by('cut','color') >> head(2) 19 | assert df.equals(d.reset_index(drop=True)) 20 | 21 | 22 | def test_tail(): 23 | df = diamonds.tail(2) 24 | d = diamonds >> tail(2) 25 | assert df.equals(d) 26 | 27 | 28 | def test_grouped_tail(): 29 | df = diamonds.groupby(['cut','color']).apply(lambda x: x.tail(2)).reset_index(drop=True) 30 | d = diamonds >> group_by('cut','color') >> tail(2) 31 | assert df.equals(d.reset_index(drop=True)) 32 | 33 | 34 | def test_distinct(): 35 | d = diamonds >> distinct('depth') 36 | df = diamonds.drop_duplicates('depth') 37 | assert df.equals(d) 38 | 39 | d = diamonds >> distinct(X.cut, 'depth') 40 | df = diamonds.drop_duplicates(['cut','depth']) 41 | assert df.equals(d) 42 | 43 | df = diamonds[['carat', 'cut']].drop_duplicates() 44 | d = diamonds >> select(X.carat, X.cut) >> distinct() 45 | assert df.equals(d) 46 | 47 | df = diamonds[['carat', 'cut']].drop_duplicates(keep='last') 48 | d = diamonds >> select(X.carat, X.cut) >> distinct(keep='last') 49 | assert df.equals(d) 50 | 51 | 52 | def test_sample(): 53 | random_state = 55 54 | 55 | d = diamonds >> sample(n=10, random_state=random_state) 56 | df = diamonds.sample(n=10, random_state=random_state) 57 | assert df.equals(d) 58 | 59 | d = diamonds >> sample(frac=0.001, random_state=random_state) 60 | df = 
diamonds.sample(frac=0.001, random_state=random_state) 61 | assert df.equals(d) 62 | 63 | d = diamonds >> group_by(X.cut) >> sample(n=10, random_state=random_state) 64 | d = d.reset_index(drop=True) 65 | df = diamonds.groupby('cut').apply(lambda x: x.sample(n=10, random_state=random_state)) 66 | df = df.reset_index(drop=True) 67 | assert df.equals(d) 68 | 69 | 70 | def test_row_slice(): 71 | df = diamonds.iloc[[0,1],:] 72 | assert df.equals(diamonds >> row_slice([0,1])) 73 | df = diamonds.groupby('cut').apply(lambda df: df.iloc[0,:]).reset_index(drop=True) 74 | d = diamonds >> group_by(X.cut) >> row_slice(0) 75 | assert df.equals(d.reset_index(drop=True)) 76 | df = diamonds.loc[diamonds.table > 61, :] 77 | assert df.equals(diamonds >> row_slice(X.table > 61)) 78 | 79 | 80 | def test_mask(): 81 | test1 = diamonds >> mask(X.cut == 'Ideal') 82 | df = diamonds[diamonds.cut == 'Ideal'] 83 | assert df.equals(test1) 84 | 85 | test2 = diamonds >> mask(X.cut == 'Ideal', X.color == 'E', 86 | X.table < 55, X.price < 500) 87 | df_mask = (diamonds.cut == 'Ideal') & (diamonds.color == 'E') 88 | df_mask = df_mask & (diamonds.table < 55) & (diamonds.price < 500) 89 | df = diamonds[df_mask] 90 | assert df.equals(test2) 91 | 92 | 93 | # def test_mask_small(): 94 | # a = (diamonds >> group_by(X.cut) >> arrange(X.price) >> 95 | # head(3) >> ungroup() >> mask(X.carat < 0.23)) 96 | # print(a) 97 | # assert False 98 | 99 | # d = diamonds >> group_by(X.cut) >> mutate(price_lag=lag(X.price)) >> head(2) >> select(X.cut, X.price_lag) 100 | 101 | def test_top_n(): 102 | with pytest.raises(ValueError): 103 | diamonds >> top_n() 104 | test2 = diamonds >> top_n(n=6) 105 | df2 = diamonds.sort_values('z', ascending=False).head(6).sort_index() 106 | assert test2.equals(df2) 107 | test3 = diamonds >> top_n(col=X.x, n=5) 108 | df3 = diamonds.sort_values('x', ascending=False).head(5).sort_index() 109 | assert test3.equals(df3) 110 | test4 = diamonds >> top_n(col=X.cut, n=1) 111 | df4 = diamonds[diamonds.cut == 'Very Good'] 112 | assert test4.equals(df4) 113 | test5 = diamonds >> group_by(X.cut) >> top_n(n=2) 114 | df5 = diamonds.loc[[27415, 27630, 23539, 27517, 27518, 24297, 24328, 24067, 25999, 26444, 48410]] 115 | assert test5.equals(df5) 116 | test6 = diamonds >> top_n(col=X.x, ascending=False, n=5) 117 | df6 = diamonds.sort_values('x', ascending=True).head(8).sort_index() 118 | assert test6.equals(df6) 119 | -------------------------------------------------------------------------------- /dfply/summary_functions.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .vector import * 3 | 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Series summary functions 7 | # ------------------------------------------------------------------------------ 8 | 9 | 10 | @make_symbolic 11 | def mean(series): 12 | """ 13 | Returns the mean of a series. 14 | 15 | Args: 16 | series (pandas.Series): column to summarize. 17 | """ 18 | 19 | if np.issubdtype(series.dtype, np.number): 20 | return series.mean() 21 | else: 22 | return np.nan 23 | 24 | 25 | @make_symbolic 26 | def first(series, order_by=None): 27 | """ 28 | Returns the first value of a series. 29 | 30 | Args: 31 | series (pandas.Series): column to summarize. 32 | 33 | Kwargs: 34 | order_by: a pandas.Series or list of series (can be symbolic) to order 35 | the input series by before summarization.
36 | """ 37 | 38 | if order_by is not None: 39 | series = order_series_by(series, order_by) 40 | first_s = series.iloc[0] 41 | return first_s 42 | 43 | 44 | @make_symbolic 45 | def last(series, order_by=None): 46 | """ 47 | Returns the last value of a series. 48 | 49 | Args: 50 | series (pandas.Series): column to summarize. 51 | 52 | Kwargs: 53 | order_by: a pandas.Series or list of series (can be symbolic) to order 54 | the input series by before summarization. 55 | """ 56 | 57 | if order_by is not None: 58 | series = order_series_by(series, order_by) 59 | last_s = series.iloc[series.size - 1] 60 | return last_s 61 | 62 | 63 | @make_symbolic 64 | def nth(series, n, order_by=None): 65 | """ 66 | Returns the nth value of a series. 67 | 68 | Args: 69 | series (pandas.Series): column to summarize. 70 | n (integer): position of desired value. Returns `NaN` if out of range. 71 | 72 | Kwargs: 73 | order_by: a pandas.Series or list of series (can be symbolic) to order 74 | the input series by before summarization. 75 | """ 76 | 77 | if order_by is not None: 78 | series = order_series_by(series, order_by) 79 | try: 80 | return series.iloc[n] 81 | except: 82 | return np.nan 83 | 84 | 85 | @make_symbolic 86 | def n(series): 87 | """ 88 | Returns the length of a series. 89 | 90 | Args: 91 | series (pandas.Series): column to summarize. 92 | """ 93 | 94 | n_s = series.size 95 | return n_s 96 | 97 | 98 | @make_symbolic 99 | def n_distinct(series): 100 | """ 101 | Returns the number of distinct values in a series. 102 | 103 | Args: 104 | series (pandas.Series): column to summarize. 105 | """ 106 | 107 | n_distinct_s = series.unique().size 108 | return n_distinct_s 109 | 110 | 111 | @make_symbolic 112 | def IQR(series): 113 | """ 114 | Returns the inter-quartile range (IQR) of a series. 115 | 116 | The IRQ is defined as the 75th quantile minus the 25th quantile values. 117 | 118 | Args: 119 | series (pandas.Series): column to summarize. 120 | """ 121 | 122 | iqr_s = series.quantile(.75) - series.quantile(.25) 123 | return iqr_s 124 | 125 | 126 | @make_symbolic 127 | def colmin(series): 128 | """ 129 | Returns the minimum value of a series. 130 | 131 | Args: 132 | series (pandas.Series): column to summarize. 133 | """ 134 | 135 | min_s = series.min() 136 | return min_s 137 | 138 | 139 | @make_symbolic 140 | def colmax(series): 141 | """ 142 | Returns the maximum value of a series. 143 | 144 | Args: 145 | series (pandas.Series): column to summarize. 146 | """ 147 | 148 | max_s = series.max() 149 | return max_s 150 | 151 | 152 | @make_symbolic 153 | def median(series): 154 | """ 155 | Returns the median value of a series. 156 | 157 | Args: 158 | series (pandas.Series): column to summarize. 159 | """ 160 | 161 | if np.issubdtype(series.dtype, np.number): 162 | return series.median() 163 | else: 164 | return np.nan 165 | 166 | 167 | @make_symbolic 168 | def var(series): 169 | """ 170 | Returns the variance of values in a series. 171 | 172 | Args: 173 | series (pandas.Series): column to summarize. 174 | """ 175 | if np.issubdtype(series.dtype, np.number): 176 | return series.var() 177 | else: 178 | return np.nan 179 | 180 | 181 | @make_symbolic 182 | def sd(series): 183 | """ 184 | Returns the standard deviation of values in a series. 185 | 186 | Args: 187 | series (pandas.Series): column to summarize. 
188 | """ 189 | 190 | if np.issubdtype(series.dtype, np.number): 191 | return series.std() 192 | else: 193 | return np.nan 194 | -------------------------------------------------------------------------------- /RELEASES.txt: -------------------------------------------------------------------------------- 1 | v0.3.3 2 | - Hotfix for parsing left_on and right_on 3 | 4 | TODO: Need to figure out fix to the inversion of symbol issue. Somewhat complicated. 5 | 6 | v0.3.2 7 | Various PRs added fixing bugs, etc. 8 | 9 | v0.3.1 10 | This update is almost solely the pull requests by @bleearmstrong that were sitting 11 | in the repo waiting. There were some minor bug-fixes and changes too. 12 | 13 | - `select_if` and `drop_if` are now available to perform selection according to a function 14 | - `mutate_if` allows variable creation if a criterion function is evaluated as True 15 | - `row_number` window function is available (same as rank(method='first')) 16 | - `distinct` can take no arguments, making it equivalent to `drop_duplicates` with no arguments 17 | 18 | v0.3.0 19 | Lots and lots of big changes here. Total reworking of the internal functionality. 20 | The good news is that it should (basically) work the same as before, but 21 | hopefully better. Obviously keep in mind that this is still beta and there will 22 | be plenty of bugs to work out on the horizon, but the preexisting tests pass for now... 23 | There is not backward compatibility with old versions as the decorator names 24 | have changed, but again, the functionality is otherwise the same. 25 | 26 | Some major things (see readme for details): 27 | - Moved entirely to python 3 support only. It may still work in python 2, but don't count on it. 28 | - pandas-ply is no longer required; It was brittle and so rolled my own stuff. 29 | - Selection "subfunctions" are now working and the selection functions have been changed in light of this. 30 | 31 | 32 | v0.2.4 33 | - Bug fixed in semi-join and anti-join 34 | - top_n added 35 | 36 | v0.2.3 37 | Inplace piping added using the `>>=` operator. The `pipe` decorator internals 38 | have been changed to make this possible through the addition of an `__rshift__` 39 | implementation and chaining pipes together until evaluated against a 40 | DataFrame. 41 | 42 | 43 | v0.2.2 44 | - Added docstrings to functions and classes. 45 | - Added the `case_when` function. 46 | - Fixed `arrange` to work with symbolic functions like `desc` in the function 47 | call. 48 | - Added `cumany` and `cumall` window functions. 49 | - Added `if_else` function. 50 | - Added `na_if` function. 51 | - Added `percent_rank` function. 52 | - Reorganization of decorator functions (better subclassing). 53 | 54 | 55 | v0.2.1 56 | Fixed an issue with the way the decorators were structured, particularly 57 | the @make_symbolic, that would cause problems with returning Call objects 58 | that would not evaluate properly. Hopefully this is now resolved. 59 | 60 | The "coalesce" function was added from dplyr. 61 | 62 | Some code was moved from base.py to the new vector.py file. The vector.py 63 | file now contains functions that specifically perform operations on 64 | series or numpy arrays (such as coalesce, desc, etc.). Test files have been 65 | reorganized accordingly. 66 | 67 | 68 | v0.2.0 69 | This release now introduces the @make_symbolic decorator, which can wrap 70 | functions to allow them to evaluate later. This is particularly (and perhaps 71 | only) useful when you embed functions as arguments to other functions. 
For 72 | example, the summary and window functions. 73 | 74 | The code for the symbolic handling decorators has been reworked. They now 75 | inherit from a common class since they shared patterns in their code. 76 | 77 | - @make_symbolic decorator 78 | - README updates 79 | - desc() and order_series_by() functions 80 | - re-imagining of the code for @symbolic_evaluation, @symbolic_reference 81 | (the functionality remains unchanged) 82 | - window and summary functions, along with their tests, have been moved around 83 | to other files. 84 | 85 | 86 | v0.1.10 87 | - `separate` and `unite` functions added. 88 | - Summary functions added for series operations. 89 | - README improved dramatically. 90 | - Function docstrings added to more functions (still not all). 91 | 92 | v0.1.9 93 | Moved unit tests into individual files that reflect the categories of the 94 | functions/features they are testing. Some small bugs have been fixed as well. 95 | 96 | v0.1.8 97 | The pipe decorator now copies the dataframe upon each chained function, along 98 | with the `_grouped_by` attribute, if any. Before, operations with the pipe 99 | functions were modifying the original dataframe (such as `mutate`). 100 | 101 | v0.1.7 102 | Restructuring of package to include `diamonds.csv` with pip installation 103 | and require `six` and `pandas-ply` rather than coming pre-packaged with them. 104 | 105 | v0.1.6 106 | Added window functions: 107 | `dense_rank` 108 | `min_rank` 109 | `cumsum` 110 | `cummean` 111 | `cummax` 112 | `cummin` 113 | `cumprod` 114 | 115 | 116 | v0.1.5 117 | dplyr set operations added thanks to bleearmstrong. 118 | 119 | `df >> union(other)` 120 | Rows that appear in either `df` or `other`. 121 | 122 | `df >> intersect(other)` 123 | Rows that appear in both `df` and `other`. 124 | 125 | `df >> set_diff(other)` 126 | Rows that appear in `df` but not `other`.
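The v0.2.3 entry above describes the `>>` / `>>=` piping mechanic in prose only. Below is a minimal, hypothetical sketch of how such an operator can work; it is not dfply's actual `pipe` decorator (which additionally handles grouping and symbolic argument evaluation), and the `Pipe` class and `head2` name are invented here for illustration.

import pandas as pd


class Pipe:
    """Defer a DataFrame -> DataFrame function until a DataFrame is piped in."""

    def __init__(self, function):
        self.function = function

    def __rrshift__(self, df):
        # Handles `df >> pipe` (and therefore `df >>= pipe`). pandas DataFrames
        # do not define `>>`, so Python falls back to the right operand's
        # __rrshift__, which evaluates the deferred function against a copy.
        return self.function(df.copy())

    def __rshift__(self, other):
        # Handles `pipe >> pipe`: compose into a single deferred pipe instead
        # of evaluating, so a chain only runs once a real DataFrame arrives.
        return Pipe(lambda df: other.function(self.function(df)))


head2 = Pipe(lambda df: df.head(2))
df = pd.DataFrame({'a': [1, 2, 3, 4]})
result = df >> head2 >> head2  # evaluates left to right
df >>= head2                   # the in-place form simply rebinds the name

Copying the frame in `__rrshift__` mirrors the v0.1.8 note above: each piped step operates on a copy, so chained operations do not mutate the original DataFrame.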
127 | -------------------------------------------------------------------------------- /dfply/select.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .base import * 4 | 5 | 6 | # ------------------------------------------------------------------------------ 7 | # Select and drop operators 8 | # ------------------------------------------------------------------------------ 9 | 10 | def selection_context(arg, context): 11 | if isinstance(arg, Intention): 12 | arg = arg.evaluate(context) 13 | if isinstance(arg, pd.Index): 14 | arg = list(arg) 15 | if isinstance(arg, pd.Series): 16 | arg = arg.name 17 | return arg 18 | 19 | 20 | def selection_filter(f): 21 | def wrapper(*args, **kwargs): 22 | return Intention(lambda x: f(list(x.columns), 23 | *(selection_context(a, x) for a in args), 24 | **{k: selection_context(v, x) for k, v in kwargs.items()})) 25 | 26 | return wrapper 27 | 28 | 29 | def resolve_selection(df, *args, drop=False): 30 | if len(args) > 0: 31 | args = [a for a in flatten(args)] 32 | ordering = [] 33 | column_indices = np.zeros(df.shape[1]) 34 | for selector in args: 35 | visible = np.where(selector != 0)[0] 36 | if not drop: 37 | column_indices[visible] = selector[visible] 38 | else: 39 | column_indices[visible] = selector[visible] * -1 40 | for selection in np.where(selector == 1)[0]: 41 | if not df.columns[selection] in ordering: 42 | ordering.append(df.columns[selection]) 43 | else: 44 | ordering = list(df.columns) 45 | column_indices = np.ones(df.shape[1]) 46 | return ordering, column_indices 47 | 48 | 49 | @pipe 50 | @group_delegation 51 | @symbolic_evaluation(eval_as_selector=True) 52 | def select(df, *args): 53 | ordering, column_indices = resolve_selection(df, *args) 54 | if (column_indices == 0).all(): 55 | return df[[]] 56 | selection = np.where((column_indices == np.max(column_indices)) & 57 | (column_indices >= 0))[0] 58 | df = df.iloc[:, selection] 59 | if all([col in ordering for col in df.columns]): 60 | ordering = [c for c in ordering if c in df.columns] 61 | return df[ordering] 62 | else: 63 | return df 64 | 65 | 66 | @pipe 67 | @group_delegation 68 | @symbolic_evaluation(eval_as_selector=True) 69 | def drop(df, *args): 70 | _, column_indices = resolve_selection(df, *args, drop=True) 71 | if (column_indices == 0).all(): 72 | return df[[]] 73 | selection = np.where((column_indices == np.max(column_indices)) & 74 | (column_indices >= 0))[0] 75 | return df.iloc[:, selection] 76 | 77 | 78 | @pipe 79 | def select_if(df, fun): 80 | """Selects columns where fun(ction) is true 81 | Args: 82 | fun: a function that will be applied to columns 83 | """ 84 | 85 | def _filter_f(col): 86 | try: 87 | return fun(df[col]) 88 | except: 89 | return False 90 | 91 | cols = list(filter(_filter_f, df.columns)) 92 | return df[cols] 93 | 94 | 95 | @pipe 96 | def drop_if(df, fun): 97 | """Drops columns where fun(ction) is true 98 | Args: 99 | fun: a function that will be applied to columns 100 | """ 101 | 102 | def _filter_f(col): 103 | try: 104 | return fun(df[col]) 105 | except: 106 | return False 107 | 108 | cols = list(filter(_filter_f, df.columns)) 109 | return df.drop(cols, axis=1) 110 | 111 | 112 | @selection_filter 113 | def starts_with(columns, prefix): 114 | return [c for c in columns if c.startswith(prefix)] 115 | 116 | 117 | @selection_filter 118 | def ends_with(columns, suffix): 119 | return [c for c in columns if c.endswith(suffix)] 120 | 121 | 122 | @selection_filter 123 | def contains(columns, substr): 124 | 
return [c for c in columns if substr in c] 125 | 126 | 127 | @selection_filter 128 | def matches(columns, pattern): 129 | return [c for c in columns if re.search(pattern, c)] 130 | 131 | 132 | @selection_filter 133 | def everything(columns): 134 | return columns 135 | 136 | 137 | @selection_filter 138 | def num_range(columns, prefix, range): 139 | colnames = [prefix + str(i) for i in range] 140 | return [c for c in columns if c in colnames] 141 | 142 | 143 | @selection_filter 144 | def one_of(columns, specified): 145 | return [c for c in columns if c in specified] 146 | 147 | 148 | @selection_filter 149 | def columns_between(columns, start_col, end_col, inclusive=True): 150 | if isinstance(start_col, str): 151 | start_col = columns.index(start_col) 152 | if isinstance(end_col, str): 153 | end_col = columns.index(end_col) 154 | return columns[start_col:end_col + int(inclusive)] 155 | 156 | 157 | @selection_filter 158 | def columns_from(columns, start_col): 159 | if isinstance(start_col, str): 160 | start_col = columns.index(start_col) 161 | return columns[start_col:] 162 | 163 | 164 | @selection_filter 165 | def columns_to(columns, end_col, inclusive=False): 166 | if isinstance(end_col, str): 167 | end_col = columns.index(end_col) 168 | return columns[:end_col + int(inclusive)] 169 | -------------------------------------------------------------------------------- /dfply/window_functions.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | 3 | 4 | # ------------------------------------------------------------------------------ 5 | # Window functions 6 | # ------------------------------------------------------------------------------ 7 | 8 | @make_symbolic 9 | def lead(series, i=1): 10 | """ 11 | Returns a series shifted forward by a value. `NaN` values will be filled 12 | in at the end. 13 | 14 | Same as a call to `series.shift(-i)` 15 | 16 | Args: 17 | series: column to shift forward. 18 | i (int): number of positions to shift forward. 19 | """ 20 | 21 | shifted = series.shift(i * -1) 22 | return shifted 23 | 24 | 25 | @make_symbolic 26 | def lag(series, i=1): 27 | """ 28 | Returns a series shifted backwards by a value. `NaN` values will be filled 29 | in at the beginning. 30 | 31 | Same as a call to `series.shift(i)` 32 | 33 | Args: 34 | series: column to shift backward. 35 | i (int): number of positions to shift backward. 36 | """ 37 | 38 | shifted = series.shift(i) 39 | return shifted 40 | 41 | 42 | @make_symbolic 43 | def between(series, a, b, inclusive=False): 44 | """ 45 | Returns a boolean series specifying whether rows of the input series 46 | are between values `a` and `b`. 47 | 48 | Args: 49 | series: column to compare, typically symbolic. 50 | a: value series must be greater than (or equal to if `inclusive=True`) 51 | for the output series to be `True` at that position. 52 | b: value series must be less than (or equal to if `inclusive=True`) for 53 | the output series to be `True` at that position. 54 | 55 | Kwargs: 56 | inclusive (bool): If `True`, comparison is done with `>=` and `<=`. 57 | If `False` (the default), comparison uses `>` and `<`. 58 | """ 59 | 60 | if inclusive: 61 | met_condition = (series >= a) & (series <= b) 62 | else: 63 | met_condition = (series > a) & (series < b) 64 | return met_condition 65 | 66 | 67 | @make_symbolic 68 | def dense_rank(series, ascending=True): 69 | """ 70 | Equivalent to `series.rank(method='dense', ascending=ascending)`.
71 | 72 | Args: 73 | series: column to rank. 74 | 75 | Kwargs: 76 | ascending (bool): whether to rank in ascending order (default is `True`). 77 | """ 78 | 79 | ranks = series.rank(method='dense', ascending=ascending) 80 | return ranks 81 | 82 | 83 | @make_symbolic 84 | def min_rank(series, ascending=True): 85 | """ 86 | Equivalent to `series.rank(method='min', ascending=ascending)`. 87 | 88 | Args: 89 | series: column to rank. 90 | 91 | Kwargs: 92 | ascending (bool): whether to rank in ascending order (default is `True`). 93 | """ 94 | 95 | ranks = series.rank(method='min', ascending=ascending) 96 | return ranks 97 | 98 | 99 | @make_symbolic 100 | def cumsum(series): 101 | """ 102 | Calculates cumulative sum of values. Equivalent to `series.cumsum()`. 103 | 104 | Args: 105 | series: column to compute cumulative sum for. 106 | """ 107 | 108 | sums = series.cumsum() 109 | return sums 110 | 111 | 112 | @make_symbolic 113 | def cummean(series): 114 | """ 115 | Calculates cumulative mean of values. Equivalent to 116 | `series.expanding().mean()`. 117 | 118 | Args: 119 | series: column to compute cumulative mean for. 120 | """ 121 | 122 | means = series.expanding().mean() 123 | return means 124 | 125 | 126 | @make_symbolic 127 | def cummax(series): 128 | """ 129 | Calculates cumulative maximum of values. Equivalent to 130 | `series.expanding().max()`. 131 | 132 | Args: 133 | series: column to compute cumulative maximum for. 134 | """ 135 | 136 | maxes = series.expanding().max() 137 | return maxes 138 | 139 | 140 | @make_symbolic 141 | def cummin(series): 142 | """ 143 | Calculates cumulative minimum of values. Equivalent to 144 | `series.expanding().min()`. 145 | 146 | Args: 147 | series: column to compute cumulative minimum for. 148 | """ 149 | 150 | mins = series.expanding().min() 151 | return mins 152 | 153 | 154 | @make_symbolic 155 | def cumprod(series): 156 | """ 157 | Calculates cumulative product of values. Equivalent to 158 | `series.cumprod()`. 159 | 160 | Args: 161 | series: column to compute cumulative product for. 162 | """ 163 | 164 | prods = series.cumprod() 165 | return prods 166 | 167 | 168 | @make_symbolic 169 | def cumany(series): 170 | """ 171 | Calculates cumulative any of values. Equivalent to 172 | `series.expanding().apply(np.any).astype(bool)`. 173 | 174 | Args: 175 | series: column to compute cumulative any for. 176 | """ 177 | 178 | anys = series.expanding().apply(np.any).astype(bool) 179 | return anys 180 | 181 | 182 | @make_symbolic 183 | def cumall(series): 184 | """ 185 | Calculates cumulative all of values. Equivalent to 186 | `series.expanding().apply(np.all).astype(bool)`. 187 | 188 | Args: 189 | series: column to compute cumulative all for. 190 | """ 191 | 192 | alls = series.expanding().apply(np.all).astype(bool) 193 | return alls 194 | 195 | 196 | @make_symbolic 197 | def percent_rank(series, ascending=True): 198 | if series.size == 1: 199 | return 0 200 | percents = (series.rank(method='min', ascending=ascending) - 1) / (series.size - 1) 201 | return percents 202 | 203 | 204 | @make_symbolic 205 | def row_number(series, ascending=True): 206 | """ 207 | Returns row number based on column rank 208 | Equivalent to `series.rank(method='first', ascending=ascending)`. 209 | 210 | Args: 211 | series: column to rank. 212 | 213 | Kwargs: 214 | ascending (bool): whether to rank in ascending order (default is `True`). 
215 | 216 | Usage: 217 | diamonds >> head() >> mutate(rn=row_number(X.x)) 218 | 219 | carat cut color clarity depth table price x y z rn 220 | 0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 2.0 221 | 1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 1.0 222 | 2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 3.0 223 | 3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 4.0 224 | 4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 5.0 225 | """ 226 | 227 | series_rank = series.rank(method='first', ascending=ascending) 228 | return series_rank 229 | -------------------------------------------------------------------------------- /dfply/set_ops.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | import warnings 3 | import pandas as pd 4 | 5 | 6 | def validate_set_ops(df, other): 7 | """ 8 | Helper function to ensure that DataFrames are valid for set operations. 9 | Columns must be the same name in the same order, and indices must be of the 10 | same dimension with the same names. 11 | """ 12 | 13 | if df.columns.values.tolist() != other.columns.values.tolist(): 14 | not_in_df = [col for col in other.columns if col not in df.columns] 15 | not_in_other = [col for col in df.columns if col not in other.columns] 16 | error_string = 'Error: not compatible.' 17 | if len(not_in_df): 18 | error_string += ' Cols in y but not x: ' + str(not_in_df) + '.' 19 | if len(not_in_other): 20 | error_string += ' Cols in x but not y: ' + str(not_in_other) + '.' 21 | raise ValueError(error_string) 22 | if len(df.index.names) != len(other.index.names): 23 | raise ValueError('Index dimension mismatch') 24 | if df.index.names != other.index.names: 25 | raise ValueError('Index mismatch') 26 | else: 27 | return 28 | 29 | 30 | # ------------------------------------------------------------------------------ 31 | # `union` 32 | # ------------------------------------------------------------------------------ 33 | 34 | @pipe 35 | def union(df, other, index=False, keep='first'): 36 | """ 37 | Returns rows that appear in either DataFrame. 38 | 39 | Args: 40 | df (pandas.DataFrame): data passed in through the pipe. 41 | other (pandas.DataFrame): other DataFrame to use for set operation with 42 | the first. 43 | 44 | Kwargs: 45 | index (bool): Boolean indicating whether to consider the pandas index 46 | as part of the set operation (default `False`). 47 | keep (str): Indicates which duplicate should be kept. Options are `'first'` 48 | and `'last'`. 49 | """ 50 | validate_set_ops(df, other) 51 | stacked = pd.concat([df, other]) 52 | if index: 53 | stacked_reset_indexes = stacked.reset_index() 54 | index_cols = [col for col in stacked_reset_indexes.columns if col not in df.columns] 55 | index_name = df.index.names 56 | return_df = stacked_reset_indexes.drop_duplicates(keep=keep).set_index(index_cols) 57 | return_df.index.names = index_name 58 | return return_df 59 | else: 60 | return stacked.drop_duplicates(keep=keep) 61 | 62 | 63 | # ------------------------------------------------------------------------------ 64 | # `intersect` 65 | # ------------------------------------------------------------------------------ 66 | 67 | 68 | @pipe 69 | def intersect(df, other, index=False, keep='first'): 70 | """ 71 | Returns rows that appear in both DataFrames. 72 | 73 | Args: 74 | df (pandas.DataFrame): data passed in through the pipe. 75 | other (pandas.DataFrame): other DataFrame to use for set operation with 76 | the first.
77 | 78 | Kwargs: 79 | index (bool): Boolean indicating whether to consider the pandas index 80 | as part of the set operation (default `False`). 81 | keep (str): Indicates which duplicate should be kept. Options are `'first'` 82 | and `'last'`. 83 | """ 84 | 85 | validate_set_ops(df, other) 86 | if index: 87 | df_reset_index = df.reset_index() 88 | other_reset_index = other.reset_index() 89 | index_cols = [col for col in df_reset_index.columns if col not in df.columns] 90 | df_index_names = df.index.names 91 | return_df = (pd.merge(df_reset_index, other_reset_index, 92 | how='inner', 93 | left_on=df_reset_index.columns.values.tolist(), 94 | right_on=df_reset_index.columns.values.tolist()) 95 | .set_index(index_cols)) 96 | return_df.index.names = df_index_names 97 | return_df = return_df.drop_duplicates(keep=keep) 98 | return return_df 99 | else: 100 | return_df = pd.merge(df, other, 101 | how='inner', 102 | left_on=df.columns.values.tolist(), 103 | right_on=df.columns.values.tolist()) 104 | return_df = return_df.drop_duplicates(keep=keep) 105 | return return_df 106 | 107 | 108 | # ------------------------------------------------------------------------------ 109 | # `set_diff` 110 | # ------------------------------------------------------------------------------ 111 | 112 | 113 | @pipe 114 | def set_diff(df, other, index=False, keep='first'): 115 | """ 116 | Returns rows that appear in the first DataFrame but not the second. 117 | 118 | Args: 119 | df (pandas.DataFrame): data passed in through the pipe. 120 | other (pandas.DataFrame): other DataFrame to use for set operation with 121 | the first. 122 | 123 | Kwargs: 124 | index (bool): Boolean indicating whether to consider the pandas index 125 | as part of the set operation (default `False`). 126 | keep (str): Indicates which duplicate should be kept. Options are `'first'` 127 | and `'last'`. 
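Example (an illustrative sketch; `df` and `other` are hypothetical frames sharing the same columns):

    df = pd.DataFrame({'x': [1, 1, 2], 'y': ['a', 'b', 'a']})
    other = pd.DataFrame({'x': [1, 2], 'y': ['b', 'c']})
    df >> set_diff(other)

       x  y
    0  1  a
    2  2  a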
128 | """ 129 | 130 | validate_set_ops(df, other) 131 | if index: 132 | df_reset_index = df.reset_index() 133 | other_reset_index = other.reset_index() 134 | index_cols = [col for col in df_reset_index.columns if col not in df.columns] 135 | df_index_names = df.index.names 136 | return_df = (pd.merge(df_reset_index, other_reset_index, 137 | how='left', 138 | left_on=df_reset_index.columns.values.tolist(), 139 | right_on=other_reset_index.columns.values.tolist(), 140 | indicator=True) 141 | .set_index(index_cols)) 142 | return_df = return_df[return_df._merge == 'left_only'] 143 | return_df.index.names = df_index_names 144 | return_df = return_df.drop_duplicates(keep=keep)[df.columns] 145 | return return_df 146 | else: 147 | return_df = pd.merge(df, other, 148 | how='left', 149 | left_on=df.columns.values.tolist(), 150 | right_on=df.columns.values.tolist(), 151 | indicator=True) 152 | return_df = return_df[return_df._merge == 'left_only'] 153 | return_df = return_df.drop_duplicates(keep=keep)[df.columns] 154 | return return_df 155 | -------------------------------------------------------------------------------- /test/test_reshape.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | ##============================================================================== 6 | ## reshape test functions 7 | ##============================================================================== 8 | 9 | 10 | def arrange_apply_helperfunc(df): 11 | df = df.sort_values('depth', ascending=False) 12 | df = df.head(5) 13 | return df 14 | 15 | # def test_arrange_small(): 16 | # d = diamonds >> arrange(desc(X.cut), desc(X.price)) 17 | # print(d.head(25)) 18 | # assert False 19 | 20 | 21 | def test_arrange(): 22 | df = diamonds.groupby('cut').apply(arrange_apply_helperfunc).reset_index(drop=True) 23 | d = (diamonds >> group_by('cut') >> arrange('depth', ascending=False) >> 24 | head(5) >> ungroup()).reset_index(drop=True) 25 | #print('df', df, df.shape) 26 | #print('d', d, d.shape) 27 | assert df.equals(d) 28 | 29 | d = (diamonds >> group_by('cut') >> arrange(X.depth, ascending=False) >> 30 | head(5) >> ungroup()).reset_index(drop=True) 31 | assert df.equals(d) 32 | 33 | print(type(d), type(df), type(diamonds)) 34 | 35 | df = diamonds.sort_values(['cut','price'], ascending=False) 36 | d = diamonds >> arrange(desc(X.cut), desc(X.price)) 37 | print('df', df >> head(5)) 38 | print('d', d >> head(5)) 39 | assert df.equals(d) 40 | 41 | 42 | def test_rename(): 43 | df = diamonds.rename(columns={'cut':'Cut','table':'Table','carat':'Carat'}) 44 | d = diamonds >> rename(Cut=X.cut, Table=X.table, Carat='carat') 45 | assert df.equals(d) 46 | 47 | 48 | @pytest.fixture 49 | def elongated(): 50 | elongated = diamonds >> gather('variable', 'value', add_id=True) 51 | return elongated 52 | 53 | 54 | def test_gather(elongated): 55 | d = diamonds >> gather('variable', 'value', ['price', 'depth','x','y','z']) 56 | 57 | variables = ['price','depth','x','y','z'] 58 | id_vars = [c for c in diamonds.columns if c not in variables] 59 | df = pd.melt(diamonds, id_vars, variables, 'variable', 'value') 60 | 61 | assert df.equals(d) 62 | 63 | d = diamonds >> gather('variable', 'value') 64 | 65 | variables = diamonds.columns.tolist() 66 | id_vars = [] 67 | df = pd.melt(diamonds, id_vars, variables, 'variable', 'value') 68 | 69 | assert df.equals(d) 70 | 71 | df = diamonds.copy() 72 | df['_ID'] = np.arange(df.shape[0]) 73 | df = pd.melt(df, ['_ID'], variables, 
'variable', 'value') 74 | 75 | assert df.equals(elongated) 76 | 77 | 78 | def test_spread(elongated): 79 | 80 | columns = elongated.columns.tolist() 81 | id_cols = ['_ID'] 82 | 83 | df = elongated.copy() 84 | df['temp_index'] = df['_ID'].values 85 | df = df.set_index('temp_index') 86 | spread_data = df[['variable','value']] 87 | 88 | spread_data = spread_data.pivot(columns='variable', values='value') 89 | converted_spread = spread_data.copy() 90 | 91 | columns_to_convert = [col for col in spread_data if col not in columns] 92 | converted_spread = convert_type(converted_spread, columns_to_convert) 93 | 94 | df = df[['_ID']].drop_duplicates() 95 | 96 | df_spread = df.merge(spread_data, left_index=True, right_index=True).reset_index(drop=True) 97 | df_conv = df.merge(converted_spread, left_index=True, right_index=True).reset_index(drop=True) 98 | 99 | d_spread = elongated >> spread('variable', 'value') 100 | d_spread_conv = elongated >> spread('variable', 'value', convert=True) 101 | 102 | assert df_spread.equals(d_spread) 103 | assert df_conv.equals(d_spread_conv) 104 | 105 | 106 | def test_separate(): 107 | 108 | d = pd.DataFrame({ 109 | 'a':['1-a-3','1-b','1-c-3-4','9-d-1','10'] 110 | }) 111 | 112 | test1 = d >> separate(X.a, ['a1','a2','a3'], 113 | remove=True, convert=False, 114 | extra='merge', fill='right') 115 | 116 | true1 = pd.DataFrame({ 117 | 'a1':['1','1','1','9','10'], 118 | 'a2':['a','b','c','d',np.nan], 119 | 'a3':['3',np.nan,'3-4','1',np.nan] 120 | }) 121 | print(test1) 122 | print(true1) 123 | assert true1.equals(test1) 124 | 125 | test2 = d >> separate(X.a, ['a1','a2','a3'], 126 | remove=True, convert=False, 127 | extra='merge', fill='left') 128 | 129 | true2 = pd.DataFrame({ 130 | 'a1':['1',np.nan,'1','9',np.nan], 131 | 'a2':['a','1','c','d',np.nan], 132 | 'a3':['3','b','3-4','1','10'] 133 | }) 134 | assert true2.equals(test2) 135 | 136 | test3 = d >> separate(X.a, ['a1','a2','a3'], 137 | remove=True, convert=True, 138 | extra='merge', fill='right') 139 | 140 | true3 = pd.DataFrame({ 141 | 'a1':[1,1,1,9,10], 142 | 'a2':['a','b','c','d',np.nan], 143 | 'a3':['3',np.nan,'3-4','1',np.nan] 144 | }) 145 | assert true3.equals(test3) 146 | 147 | test4 = d >> separate(X.a, ['col1','col2'], sep=[1,3], 148 | remove=True, convert=False, extra='drop', fill='left') 149 | 150 | true4 = pd.DataFrame({ 151 | 'col1':['1','1','1','9','1'], 152 | 'col2':['-a','-b','-c','-d','0'] 153 | }) 154 | assert true4.equals(test4) 155 | 156 | test5 = d >> separate(X.a, ['col1','col2'], sep=[1,3], 157 | remove=False, convert=False, extra='drop', fill='left') 158 | 159 | true5 = pd.DataFrame({ 160 | 'a':['1-a-3','1-b','1-c-3-4','9-d-1','10'], 161 | 'col1':['1','1','1','9','1'], 162 | 'col2':['-a','-b','-c','-d','0'] 163 | }) 164 | assert true5.equals(test5) 165 | 166 | test6 = d >> separate(X.a, ['col1','col2','col3'], sep=[30], 167 | remove=True, convert=False, extra='drop', fill='left') 168 | 169 | true6 = pd.DataFrame({ 170 | 'col1':['1-a-3','1-b','1-c-3-4','9-d-1','10'], 171 | 'col2':[np.nan,np.nan,np.nan,np.nan,np.nan], 172 | 'col3':[np.nan,np.nan,np.nan,np.nan,np.nan] 173 | }) 174 | assert true6.equals(test6) 175 | 176 | 177 | def test_unite(): 178 | d = pd.DataFrame({ 179 | 'a':[1,2,3], 180 | 'b':['a','b','c'], 181 | 'c':[True, False, np.nan] 182 | }) 183 | 184 | test1 = d >> unite('united', X.a, 'b', 2, remove=True, na_action='maintain') 185 | true1 = pd.DataFrame({ 186 | 'united':['1_a_True','2_b_False',np.nan] 187 | }) 188 | assert true1.equals(test1) 189 | 190 | test2 = d >> unite('united', 
['a','b','c'], remove=True, na_action='ignore', 191 | sep='*') 192 | true2 = pd.DataFrame({ 193 | 'united':['1*a*True','2*b*False','3*c'] 194 | }) 195 | assert test2.equals(true2) 196 | 197 | test3 = d >> unite('united', d.columns, remove=True, na_action='as_string') 198 | true3 = pd.DataFrame({ 199 | 'united':['1_a_True','2_b_False','3_c_nan'] 200 | }) 201 | assert true3.equals(test3) 202 | 203 | test4 = d >> unite('united', d.columns, remove=False, na_action='as_string') 204 | true4 = pd.DataFrame({ 205 | 'a':[1,2,3], 206 | 'b':['a','b','c'], 207 | 'c':[True, False, np.nan], 208 | 'united':['1_a_True','2_b_False','3_c_nan'] 209 | }) 210 | 211 | print(true4) 212 | print(test4) 213 | assert true4.equals(test4) 214 | -------------------------------------------------------------------------------- /test/test_window_functions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | ##============================================================================== 6 | ## window function tests 7 | ##============================================================================== 8 | 9 | 10 | def test_lead(): 11 | d = diamonds >> mutate(price_lag = lead(X.price, i=2)) 12 | df = diamonds.assign(price_lag = diamonds.price.shift(-2)) 13 | assert df.equals(d) 14 | 15 | 16 | def test_lag(): 17 | d = diamonds >> mutate(price_lag = lag(X.price, i=2)) 18 | df = diamonds.assign(price_lag = diamonds.price.shift(2)) 19 | assert df.equals(d) 20 | 21 | 22 | def test_between(): 23 | d = diamonds >> mutate(z_btwn_x_y = between(X.z, X.x, X.y)) 24 | df = diamonds.copy() 25 | df['z_btwn_x_y'] = (df.z > df.x) & (df.z < df.y) 26 | assert df.equals(d) 27 | 28 | 29 | def test_dense_rank(): 30 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 31 | df_dr = df >> mutate(dr=dense_rank(X.x)) 32 | df_truth = df 33 | df_truth['dr'] = pd.Series([2.0, 1.0, 3.0, 4.0, 5.0]) 34 | assert df_dr.equals(df_truth) 35 | df_dr = df >> mutate(dr=dense_rank(X.cut)) 36 | df_truth['dr'] = pd.Series([2.0, 3.0, 1.0, 3.0, 1.0]) 37 | assert df_dr.equals(df_truth) 38 | df_dr = df >> group_by(X.cut) >> mutate(dr=dense_rank(X.x)) 39 | df_truth['dr'] = pd.Series([1.0, 1.0, 1.0, 2.0, 2.0]) 40 | assert df_dr.sort_index().equals(df_truth) 41 | df_dr = df >> mutate(dr=dense_rank(X.x, ascending=False)) 42 | df_truth['dr'] = pd.Series([4.0, 5.0, 3.0, 2.0, 1.0]) 43 | assert df_dr.equals(df_truth) 44 | 45 | 46 | def test_min_rank(): 47 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 48 | df_mr = df >> mutate(mr=min_rank(X.x)) 49 | df_truth = df 50 | df_truth['mr'] = pd.Series([2.0, 1.0, 3.0, 4.0, 5.0]) 51 | assert df_mr.equals(df_truth) 52 | df_mr = df >> mutate(mr=min_rank(X.cut)) 53 | df_truth['mr'] = pd.Series([3.0, 4.0, 1.0, 4.0, 1.0]) 54 | assert df_mr.equals(df_truth) 55 | df_mr = df >> group_by(X.cut) >> mutate(mr=min_rank(X.x)) 56 | df_truth['mr'] = pd.Series([1.0, 1.0, 1.0, 2.0, 2.0]) 57 | assert df_mr.sort_index().equals(df_truth) 58 | df_mr = df >> mutate(mr=min_rank(X.x, ascending=False)) 59 | df_truth['mr'] = pd.Series([4.0, 5.0, 3.0, 2.0, 1.0]) 60 | assert df_mr.equals(df_truth) 61 | 62 | 63 | def test_cumsum(): 64 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 65 | df_cs = df >> mutate(cs=cumsum(X.x)) 66 | df_truth = df 67 | df_truth['cs'] = pd.Series([3.95, 7.84, 11.89, 16.09, 20.43]) 68 | pd.util.testing.assert_frame_equal(df_cs, df_truth) 69 | #assert df_cs.equals(df_truth) 70 | df_cs = df >> group_by(X.cut) >> mutate(cs=cumsum(X.x)) 71 | 
df_truth['cs'] = pd.Series([3.95, 3.89, 4.05, 8.09, 8.39]) 72 | pd.util.testing.assert_frame_equal(df_cs.sort_index(), df_truth) 73 | #assert df_cs.equals(df_truth) 74 | 75 | 76 | def test_cummean(): 77 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 78 | df_cm = df >> mutate(cm=cummean(X.x)) 79 | df_truth = df 80 | df_truth['cm'] = pd.Series([3.950000, 3.920000, 3.963333, 4.022500, 4.086000]) 81 | pd.util.testing.assert_frame_equal(df_cm, df_truth) 82 | #assert df_cm.equals(df_truth) 83 | df_cm = df >> group_by(X.cut) >> mutate(cm=cummean(X.x)) 84 | df_truth['cm'] = pd.Series([3.950, 3.890, 4.050, 4.045, 4.195]) 85 | pd.util.testing.assert_frame_equal(df_cm.sort_index(), df_truth) 86 | #assert df_cm.equals(df_truth) 87 | 88 | 89 | def test_cummax(): 90 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 91 | df_cm = df >> mutate(cm=cummax(X.x)) 92 | df_truth = df 93 | df_truth['cm'] = pd.Series([3.95, 3.95, 4.05, 4.20, 4.34]) 94 | pd.util.testing.assert_frame_equal(df_cm, df_truth) 95 | #assert df_cm.equals(df_truth) 96 | df_cm = df >> group_by(X.cut) >> mutate(cm=cummax(X.x)) 97 | df_truth['cm'] = pd.Series([3.95, 3.89, 4.05, 4.20, 4.34]) 98 | pd.util.testing.assert_frame_equal(df_cm.sort_index(), df_truth) 99 | #assert df_cm.equals(df_truth) 100 | 101 | 102 | def test_cummin(): 103 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 104 | df_cm = df >> mutate(cm=cummin(X.x)) 105 | df_truth = df 106 | df_truth['cm'] = pd.Series([3.95, 3.89, 3.89, 3.89, 3.89]) 107 | pd.util.testing.assert_frame_equal(df_cm, df_truth) 108 | #assert df_cm.equals(df_truth) 109 | df_cm = df >> group_by(X.cut) >> mutate(cm=cummin(X.x)) 110 | df_truth['cm'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05]) 111 | pd.util.testing.assert_frame_equal(df_cm.sort_index(), df_truth) 112 | #assert df_cm.equals(df_truth) 113 | 114 | 115 | def test_cumprod(): 116 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 117 | df_cp = df >> mutate(cp=cumprod(X.x)) 118 | df_truth = df.copy() 119 | df_truth['cp'] = pd.Series([3.950000, 15.365500, 62.230275, 261.367155, 1134.333453]) 120 | pd.util.testing.assert_frame_equal(df_cp, df_truth) 121 | #assert df_cp.equals(df_truth) 122 | df_cp = df >> group_by(X.cut) >> mutate(cp=cumprod(X.x)) 123 | df_truth['cp'] = pd.Series([3.950, 3.890, 4.050, 16.338, 17.577]) 124 | # some tricky floating point stuff going on here 125 | diffs = df_cp.sort_index().cp - df_truth.cp 126 | assert all(diffs < .0000001) 127 | 128 | 129 | def test_cumany(): 130 | df = pd.DataFrame({ 131 | 'a':[False,False,True,True,False,True], 132 | 'b':['x','x','x','x','y','y'] 133 | }) 134 | 135 | d = df >> mutate(ca=cumany(X.a)) 136 | assert d.equals(df.assign(ca=[False,False,True,True,True,True])) 137 | 138 | d = df >> group_by(X.b) >> mutate(ca=cumany(X.a)) 139 | assert d.sort_index().equals(df.assign(ca=[False,False,True,True,False,True])) 140 | 141 | 142 | def test_cumall(): 143 | df = pd.DataFrame({ 144 | 'a':[True,True,False,True,False,True], 145 | 'b':['x','x','x','y','y','y'] 146 | }) 147 | 148 | d = df >> mutate(ca=cumall(X.a)) 149 | assert d.equals(df.assign(ca=[True,True,False,False,False,False])) 150 | 151 | d = df >> group_by(X.b) >> mutate(ca=cumall(X.a)) 152 | assert d.sort_index().equals(df.assign(ca=[True,True,False,True,False,False])) 153 | 154 | 155 | def test_percent_rank(): 156 | df = diamonds.copy() >> head(5) >> select(X.cut, X.x) 157 | df_pr = df >> mutate(pr=percent_rank(X.x)) 158 | df_truth = df.copy() 159 | assert df_pr.equals(df_truth.assign(pr=[.25, 0.00, 0.50, 0.75, 1.00])) 
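    # A sketch of the arithmetic behind the assertion above: percent_rank
    # rescales min-rank to [0, 1] via (min_rank - 1) / (n - 1). The five x
    # values [3.95, 3.89, 4.05, 4.20, 4.34] have min ranks [2, 1, 3, 4, 5],
    # so (rank - 1) / 4 gives [.25, 0.00, 0.50, 0.75, 1.00].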
160 | df_pr = df >> mutate(pr=percent_rank(X.cut)) 161 | assert df_pr.equals(df_truth.assign(pr=[0.50, 0.75, 0.00, 0.75, 0.00])) 162 | df_pr = df >> group_by(X.cut) >> mutate(pr=percent_rank(X.x)) 163 | assert df_pr.sort_index().equals(df_truth.assign(pr=[0.0, 0.0, 0.0, 1.0, 1.0])) 164 | df_pr = df >> mutate(pr=percent_rank(X.x, ascending=False)) 165 | assert df_pr.equals(df_truth.assign(pr=[0.75, 1.0, 0.50, 0.25, 0.00])) 166 | 167 | 168 | def test_row_number(): 169 | df = diamonds.copy().head(5).sort_values(by='x') 170 | df['rn'] = range(1, df.shape[0] + 1) 171 | df['rn'] = df['rn'].astype(float) 172 | df.sort_index(inplace=True) 173 | assert df.equals(diamonds >> head(5) >> mutate(rn=row_number(X.x))) 174 | # test 2: row number with desc() option 175 | df = diamonds.copy().head(5).sort_values(by='x', ascending=False) 176 | df['rn'] = range(1, df.shape[0] + 1) 177 | df['rn'] = df['rn'].astype(float) 178 | df.sort_index(inplace=True) 179 | assert df.equals(diamonds >> head(5) >> mutate(rn=row_number(desc(X.x)))) 180 | # test 3: row number with ascending keyword 181 | df = diamonds.copy().head(5).sort_values(by='x', ascending=False) 182 | df['rn'] = range(1, df.shape[0] + 1) 183 | df['rn'] = df['rn'].astype(float) 184 | df.sort_index(inplace=True) 185 | assert df.equals(diamonds >> head(5) >> mutate(rn=row_number(X.x, ascending=False))) 186 | # test 4: with a group by 187 | df = diamonds.copy().head(5) 188 | df['rn'] = [1, 1, 1, 2, 2] 189 | df['rn'] = df['rn'].astype(float) 190 | assert df.equals((diamonds >> head(5) >> group_by(X.cut) >> mutate(rn=row_number(X.x))).sort_index()) 191 | -------------------------------------------------------------------------------- /dfply/vector.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | import collections 3 | 4 | 5 | # ------------------------------------------------------------------------------ 6 | # series ordering 7 | # ------------------------------------------------------------------------------ 8 | 9 | @make_symbolic 10 | def order_series_by(series, order_series): 11 | """ 12 | Orders one series according to another series, or a list of other 13 | series. If a list of other series are specified, ordering is done hierarchically 14 | like when a list of columns is supplied to `.sort_values()`. 15 | 16 | Args: 17 | series (:obj:`pandas.Series`): the pandas Series object to be reordered. 18 | order_series: either a pandas Series object or a list of pandas Series 19 | objects. These will be sorted using `.sort_values()` with 20 | `ascending=True`, and the new order will be used to reorder the 21 | Series supplied in the first argument. 22 | 23 | Returns: 24 | reordered `pandas.Series` object 25 | """ 26 | 27 | if isinstance(order_series, (list, tuple)): 28 | sorter = pd.concat(order_series, axis=1) 29 | sorter_columns = ['_sorter' + str(i) for i in range(len(order_series))] 30 | sorter.columns = sorter_columns 31 | sorter['series'] = series.values 32 | sorted_series = sorter.sort_values(sorter_columns)['series'] 33 | return sorted_series 34 | else: 35 | sorted_series = pd.DataFrame({ 36 | 'series': series.values, 37 | 'order': order_series.values 38 | }).sort_values('order', ascending=True)['series'] 39 | return sorted_series 40 | 41 | 42 | @make_symbolic 43 | def desc(series): 44 | """ 45 | Mimics the functionality of the R desc function. Essentially inverts a 46 | series object to make ascending sort act like descending sort. 
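Because the inversion is implemented with `series.rank(method='min', ascending=False)`, the result can be passed anywhere an ascending sort is later applied, such as inside `arrange` or as an `order_by` argument.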
47 | 
48 | Args:
49 | series (:obj:`pandas.Series`): pandas series to be inverted prior to
50 | ordering/sorting.
51 | 
52 | Returns:
53 | inverted `pandas.Series`. The returned series holds numeric ranks (floats),
54 | regardless of the type of the original series.
55 | 
56 | Example:
57 | 
58 | First group by cut, then find the first value of price when ordering by
59 | price ascending, and when ordering by price descending using the `desc` function.
60 | 
61 | diamonds >> group_by(X.cut) >> summarize(price_low=first(X.price, order_by=X.price),
62 | price_high=first(X.price, order_by=desc(X.price)))
63 | 
64 | cut price_high price_low
65 | 0 Fair 18574 337
66 | 1 Good 18788 327
67 | 2 Ideal 18806 326
68 | 3 Premium 18823 326
69 | 4 Very Good 18818 336
70 | """
71 | 
72 | return series.rank(method='min', ascending=False)
73 | 
74 | 
75 | # ------------------------------------------------------------------------------
76 | # coalesce
77 | # ------------------------------------------------------------------------------
78 | 
79 | @make_symbolic
80 | def coalesce(*series):
81 | """
82 | Takes the first non-NaN value in order across the specified series,
83 | returning a new series. Mimics the coalesce function in dplyr and SQL.
84 | 
85 | Args:
86 | *series: Series objects, typically represented in their symbolic form
87 | (like X.series).
88 | 
89 | Example:
90 | df = pd.DataFrame({
91 | 'a':[1,np.nan,np.nan,np.nan,np.nan],
92 | 'b':[2,3,np.nan,np.nan,np.nan],
93 | 'c':[np.nan,np.nan,4,5,np.nan],
94 | 'd':[6,7,8,9,np.nan]
95 | })
96 | df >> transmute(coal=coalesce(X.a, X.b, X.c, X.d))
97 | 
98 | coal
99 | 0 1.0
100 | 1 3.0
101 | 2 4.0
102 | 3 5.0
103 | 4 NaN
104 | """
105 | 
106 | series = [pd.Series(s) for s in series]
107 | coalescer = pd.concat(series, axis=1)
108 | min_nonna = np.argmin(pd.isnull(coalescer).values, axis=1)
109 | min_nonna = [coalescer.columns[i] for i in min_nonna]
110 | return coalescer.lookup(np.arange(coalescer.shape[0]), min_nonna)
111 | 
112 | 
113 | # ------------------------------------------------------------------------------
114 | # case_when
115 | # ------------------------------------------------------------------------------
116 | 
117 | @make_symbolic
118 | def case_when(*conditions):
119 | """
120 | Functions as a switch statement, creating a new series out of logical
121 | conditions specified by 2-item lists where the left-hand item is the
122 | logical condition and the right-hand item is the value where that
123 | condition is true.
124 | 
125 | Conditions should go from the most specific to the most general. A
126 | conditional that appears earlier in the argument list will "overwrite" one that
127 | appears later. Think of it like a series of if-else statements.
128 | 
129 | The logicals and values of the condition pairs must all be the same
130 | length, or length 1. Logicals can be vectors of booleans or a single
131 | boolean (`True`, for example, can be the logical statement for the
132 | final conditional to catch all remaining cases).
133 | 
134 | Args:
135 | *conditions: Each condition should be a list with two values. The first
136 | value is a boolean or vector of booleans that specify indices in
137 | which the condition is met. The second value is a vector of values
138 | or single value specifying the outcome where that condition is met.
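Returns: a series the same length as the condition vectors. Rows that satisfy none of the conditions come out as `np.nan`, since each outcome is masked with `np.nan` where its logical is false and the masked outcomes are then combined with `coalesce`.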
139 | 
140 | Example:
141 | df = pd.DataFrame({
142 | 'num':np.arange(16)
143 | })
144 | df >> mutate(strnum=case_when([X.num % 15 == 0, 'fizzbuzz'],
145 | [X.num % 3 == 0, 'fizz'],
146 | [X.num % 5 == 0, 'buzz'],
147 | [True, X.num.astype(str)]))
148 | 
149 | num strnum
150 | 0 0 fizzbuzz
151 | 1 1 1
152 | 2 2 2
153 | 3 3 fizz
154 | 4 4 4
155 | 5 5 buzz
156 | 6 6 fizz
157 | 7 7 7
158 | 8 8 8
159 | 9 9 fizz
160 | 10 10 buzz
161 | 11 11 11
162 | 12 12 fizz
163 | 13 13 13
164 | 14 14 14
165 | 15 15 fizzbuzz
166 | """
167 | 
168 | lengths = []
169 | for logical, outcome in conditions:
170 | if isinstance(logical, collections.Iterable):
171 | lengths.append(len(logical))
172 | if isinstance(outcome, collections.Iterable) and not isinstance(outcome, str):
173 | lengths.append(len(outcome))
174 | unique_lengths = np.unique(lengths)
175 | assert len(unique_lengths) == 1
176 | output_len = unique_lengths[0]
177 | 
178 | output = []
179 | for logical, outcome in conditions:
180 | if isinstance(logical, bool):
181 | logical = np.repeat(logical, output_len)
182 | if isinstance(logical, pd.Series):
183 | logical = logical.values
184 | if not isinstance(outcome, collections.Iterable) or isinstance(outcome, str):
185 | outcome = pd.Series(np.repeat(outcome, output_len))
186 | outcome[~logical] = np.nan
187 | output.append(outcome)
188 | 
189 | return coalesce(*output)
190 | 
191 | 
192 | # ------------------------------------------------------------------------------
193 | # if_else
194 | # ------------------------------------------------------------------------------
195 | 
196 | @make_symbolic
197 | def if_else(condition, when_true, otherwise):
198 | """
199 | Wraps creation of a series based on if-else conditional logic into a function
200 | call.
201 | 
202 | Provide a boolean vector condition, value(s) when true, and value(s)
203 | when false, and a vector the same length as the condition will be
204 | returned, filled element-wise according to the logical statement.
205 | 
206 | Args:
207 | condition: A boolean vector representing the condition. This is often
208 | a logical statement with a symbolic series.
209 | when_true: A vector the same length as the condition vector or a single
210 | value to apply when the condition is `True`.
211 | otherwise: A vector the same length as the condition vector or a single
212 | value to apply when the condition is `False`.
213 | 
214 | Example:
215 | diamonds >> mutate(size=if_else(X.carat > 1.0, 'big', 'small'))
216 | """
217 | 
218 | if not isinstance(when_true, collections.Iterable) or isinstance(when_true, str):
219 | when_true = np.repeat(when_true, len(condition))
220 | if not isinstance(otherwise, collections.Iterable) or isinstance(otherwise, str):
221 | otherwise = np.repeat(otherwise, len(condition))
222 | assert (len(condition) == len(when_true)) and (len(condition) == len(otherwise))
223 | 
224 | if isinstance(when_true, pd.Series):
225 | when_true = when_true.values
226 | if isinstance(otherwise, pd.Series):
227 | otherwise = otherwise.values
228 | 
229 | output = np.array([when_true[i] if c else otherwise[i]
230 | for i, c in enumerate(condition)])
231 | return output
232 | 
233 | 
234 | # ------------------------------------------------------------------------------
235 | # na_if
236 | # ------------------------------------------------------------------------------
237 | 
238 | @make_symbolic
239 | def na_if(series, *values):
240 | """
241 | If values in a series match a specified value, change them to `np.nan`.
242 | 
243 | Args:
244 | series: Series or vector, often symbolic.
245 | *values: Value(s) to convert to `np.nan` in the series.
246 | """
247 | 
248 | series = pd.Series(series)
249 | series[series.isin(values)] = np.nan
250 | return series
251 | 
--------------------------------------------------------------------------------
/dfply/join.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | 
3 | 
4 | # ------------------------------------------------------------------------------
5 | # SQL-style joins
6 | # ------------------------------------------------------------------------------
7 | 
8 | def get_join_parameters(join_kwargs):
9 | """
10 | Convenience function to determine the columns to join the right and
11 | left DataFrames on, as well as any suffixes for the columns.
12 | """
13 | 
14 | by = join_kwargs.get('by', None)
15 | suffixes = join_kwargs.get('suffixes', ('_x', '_y'))
16 | if isinstance(by, tuple):
17 | left_on, right_on = by
18 | elif isinstance(by, list):
19 | by = [x if isinstance(x, tuple) else (x, x) for x in by]
20 | left_on, right_on = (list(x) for x in zip(*by))
21 | else:
22 | left_on, right_on = by, by
23 | return left_on, right_on, suffixes
24 | 
25 | 
26 | @pipe
27 | def inner_join(df, other, **kwargs):
28 | """
29 | Joins on values present in both DataFrames.
30 | 
31 | Args:
32 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
33 | other (pandas.DataFrame): Right DataFrame
34 | 
35 | Kwargs:
36 | by (str or list): Columns to join on. If a single string, will join
37 | on that column. If a list of strings and/or `(left, right)` tuples,
38 | the corresponding left/right columns to join on.
39 | suffixes (list): String suffixes to append to column names in left
40 | and right DataFrames.
41 | 
42 | Example:
43 | a >> inner_join(b, by='x1')
44 | 
45 | x1 x2 x3
46 | 0 A 1 True
47 | 1 B 2 False
48 | """
49 | 
50 | left_on, right_on, suffixes = get_join_parameters(kwargs)
51 | joined = df.merge(other, how='inner', left_on=left_on,
52 | right_on=right_on, suffixes=suffixes)
53 | return joined
54 | 
55 | 
56 | @pipe
57 | def full_join(df, other, **kwargs):
58 | """
59 | Joins on values present in either DataFrame. (Alternate to `outer_join`)
60 | 
61 | Args:
62 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
63 | other (pandas.DataFrame): Right DataFrame
64 | 
65 | Kwargs:
66 | by (str or list): Columns to join on. If a single string, will join
67 | on that column. If a list of strings and/or `(left, right)` tuples,
68 | the corresponding left/right columns to join on.
69 | suffixes (list): String suffixes to append to column names in left
70 | and right DataFrames.
71 | 
72 | Example:
73 | a >> full_join(b, by='x1')
74 | 
75 | x1 x2 x3
76 | 0 A 1.0 True
77 | 1 B 2.0 False
78 | 2 C 3.0 NaN
79 | 3 D NaN True
80 | """
81 | 
82 | left_on, right_on, suffixes = get_join_parameters(kwargs)
83 | joined = df.merge(other, how='outer', left_on=left_on,
84 | right_on=right_on, suffixes=suffixes)
85 | return joined
86 | 
87 | 
88 | @pipe
89 | def outer_join(df, other, **kwargs):
90 | """
91 | Joins on values present in either DataFrame. (Alternate to `full_join`)
92 | 
93 | Args:
94 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
95 | other (pandas.DataFrame): Right DataFrame
96 | 
97 | Kwargs:
98 | by (str or list): Columns to join on. If a single string, will join
99 | on that column. If a list of strings and/or `(left, right)` tuples,
100 | the corresponding left/right columns to join on.
101 | suffixes (list): String suffixes to append to column names in left
102 | and right DataFrames.
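Note: a 2-tuple such as `by=('x1', 'x2')` joins the left DataFrame on `x1` and the right DataFrame on `x2` (see `get_join_parameters`).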
103 | 
104 | Example:
105 | a >> outer_join(b, by='x1')
106 | 
107 | x1 x2 x3
108 | 0 A 1.0 True
109 | 1 B 2.0 False
110 | 2 C 3.0 NaN
111 | 3 D NaN True
112 | """
113 | 
114 | left_on, right_on, suffixes = get_join_parameters(kwargs)
115 | joined = df.merge(other, how='outer', left_on=left_on,
116 | right_on=right_on, suffixes=suffixes)
117 | return joined
118 | 
119 | 
120 | @pipe
121 | def left_join(df, other, **kwargs):
122 | """
123 | Joins on values present in the left DataFrame.
124 | 
125 | Args:
126 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
127 | other (pandas.DataFrame): Right DataFrame
128 | 
129 | Kwargs:
130 | by (str or list): Columns to join on. If a single string, will join
131 | on that column. If a list of strings and/or `(left, right)` tuples,
132 | the corresponding left/right columns to join on.
133 | suffixes (list): String suffixes to append to column names in left
134 | and right DataFrames.
135 | 
136 | Example:
137 | a >> left_join(b, by='x1')
138 | 
139 | x1 x2 x3
140 | 0 A 1 True
141 | 1 B 2 False
142 | 2 C 3 NaN
143 | """
144 | 
145 | left_on, right_on, suffixes = get_join_parameters(kwargs)
146 | joined = df.merge(other, how='left', left_on=left_on,
147 | right_on=right_on, suffixes=suffixes)
148 | return joined
149 | 
150 | 
151 | @pipe
152 | def right_join(df, other, **kwargs):
153 | """
154 | Joins on values present in the right DataFrame.
155 | 
156 | Args:
157 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
158 | other (pandas.DataFrame): Right DataFrame
159 | 
160 | Kwargs:
161 | by (str or list): Columns to join on. If a single string, will join
162 | on that column. If a list of strings and/or `(left, right)` tuples,
163 | the corresponding left/right columns to join on.
164 | suffixes (list): String suffixes to append to column names in left
165 | and right DataFrames.
166 | 
167 | Example:
168 | a >> right_join(b, by='x1')
169 | 
170 | x1 x2 x3
171 | 0 A 1.0 True
172 | 1 B 2.0 False
173 | 2 D NaN True
174 | """
175 | 
176 | left_on, right_on, suffixes = get_join_parameters(kwargs)
177 | joined = df.merge(other, how='right', left_on=left_on,
178 | right_on=right_on, suffixes=suffixes)
179 | return joined
180 | 
181 | 
182 | @pipe
183 | def semi_join(df, other, **kwargs):
184 | """
185 | Returns all of the rows in the left DataFrame that have a match
186 | in the right DataFrame.
187 | 
188 | Args:
189 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
190 | other (pandas.DataFrame): Right DataFrame
191 | 
192 | Kwargs:
193 | by (str or list): Columns to join on. If a single string, will join
194 | on that column. If a list of strings and/or `(left, right)` tuples,
195 | the corresponding left/right columns to join on.
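If `by` is omitted, the join is performed on all columns shared by the two DataFrames.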
196 | 
197 | Example:
198 | a >> semi_join(b, by='x1')
199 | 
200 | x1 x2
201 | 0 A 1
202 | 1 B 2
203 | """
204 | 
205 | left_on, right_on, suffixes = get_join_parameters(kwargs)
206 | if not right_on:
207 | right_on = [col_name for col_name in df.columns.values.tolist() if col_name in other.columns.values.tolist()]
208 | left_on = right_on
209 | elif not isinstance(right_on, (list, tuple)):
210 | right_on = [right_on]
211 | other_reduced = other[right_on].drop_duplicates()
212 | joined = df.merge(other_reduced, how='inner', left_on=left_on,
213 | right_on=right_on, suffixes=('', '_y'),
214 | indicator=True).query('_merge=="both"')[df.columns.values.tolist()]
215 | return joined
216 | 
217 | 
218 | @pipe
219 | def anti_join(df, other, **kwargs):
220 | """
221 | Returns all of the rows in the left DataFrame that do not have a
222 | match in the right DataFrame.
223 | 
224 | Args:
225 | df (pandas.DataFrame): Left DataFrame (passed in via pipe)
226 | other (pandas.DataFrame): Right DataFrame
227 | 
228 | Kwargs:
229 | by (str or list): Columns to join on. If a single string, will join
230 | on that column. If a list of strings and/or `(left, right)` tuples,
231 | the corresponding left/right columns to join on.
232 | 
233 | Example:
234 | a >> anti_join(b, by='x1')
235 | 
236 | x1 x2
237 | 2 C 3
238 | """
239 | 
240 | left_on, right_on, suffixes = get_join_parameters(kwargs)
241 | if not right_on:
242 | right_on = [col_name for col_name in df.columns.values.tolist() if col_name in other.columns.values.tolist()]
243 | left_on = right_on
244 | elif not isinstance(right_on, (list, tuple)):
245 | right_on = [right_on]
246 | other_reduced = other[right_on].drop_duplicates()
247 | joined = df.merge(other_reduced, how='left', left_on=left_on,
248 | right_on=right_on, suffixes=('', '_y'),
249 | indicator=True).query('_merge=="left_only"')[df.columns.values.tolist()]
250 | return joined
251 | 
252 | 
253 | # ------------------------------------------------------------------------------
254 | # Binding
255 | # ------------------------------------------------------------------------------
256 | 
257 | @pipe
258 | def bind_rows(df, other, join='outer', ignore_index=False):
259 | """
260 | Binds DataFrames "vertically", stacking them together. This is equivalent
261 | to `pd.concat` with `axis=0`.
262 | 
263 | Args:
264 | df (pandas.DataFrame): Top DataFrame (passed in via pipe).
265 | other (pandas.DataFrame): Bottom DataFrame.
266 | 
267 | Kwargs:
268 | join (str): One of `"outer"` or `"inner"`. Outer join will preserve
269 | columns not present in both DataFrames, whereas inner joining will
270 | drop them.
271 | ignore_index (bool): If `True`, relabel the result with a fresh 0..n-1
272 | index rather than keeping the original indices (defaults to `False`).
273 | """
274 | 
275 | df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=0)
276 | return df
277 | 
278 | 
279 | @pipe
280 | def bind_cols(df, other, join='outer', ignore_index=False):
281 | """
282 | Binds DataFrames "horizontally". This is equivalent to `pd.concat` with
283 | `axis=1`.
284 | 
285 | Args:
286 | df (pandas.DataFrame): Left DataFrame (passed in via pipe).
287 | other (pandas.DataFrame): Right DataFrame.
288 | 
289 | Kwargs:
290 | join (str): One of `"outer"` or `"inner"`. Outer join will preserve
291 | rows not present in both DataFrames, whereas inner joining will
292 | drop them.
293 | ignore_index (bool): If `True`, relabel the result with a fresh 0..n-1
294 | index rather than keeping the original indices (defaults to `False`).
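Example (an illustrative sketch; `a` and `b` are hypothetical frames with aligned indices):

    a = pd.DataFrame({'x1': ['A', 'B'], 'x2': [1, 2]})
    b = pd.DataFrame({'x3': [True, False]})
    a >> bind_cols(b)

      x1  x2     x3
    0  A   1   True
    1  B   2  False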
295 | """ 296 | 297 | df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=1) 298 | return df 299 | -------------------------------------------------------------------------------- /test/test_select.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dfply import * 4 | 5 | ##============================================================================== 6 | ## select and drop test functions 7 | ##============================================================================== 8 | 9 | # 0 1 2 3 4 5 6 7 8 9 10 | # carat cut color clarity depth table price x y z 11 | # 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 12 | 13 | def test_select(): 14 | df = diamonds[['carat','cut','price']] 15 | assert df.equals(diamonds >> select('carat','cut','price')) 16 | assert df.equals(diamonds >> select(0, 1, 6)) 17 | assert df.equals(diamonds >> select(0, 1, 'price')) 18 | assert df.equals(diamonds >> select([0, X.cut], X.price)) 19 | assert df.equals(diamonds >> select(X.carat, X['cut'], X.price)) 20 | assert df.equals(diamonds >> select(X[['carat','cut','price']])) 21 | assert df.equals(diamonds >> select(X[['carat','cut']], X.price)) 22 | assert df.equals(diamonds >> select(X.iloc[:,[0,1,6]])) 23 | assert df.equals(diamonds >> select([X.loc[:, ['carat','cut','price']]])) 24 | 25 | 26 | def test_select_inversion(): 27 | df = diamonds.iloc[:, 3:] 28 | d = diamonds >> select(~X.carat, ~X.cut, ~X.color) 29 | print(df.head()) 30 | print(d.head()) 31 | assert df.equals(d) 32 | 33 | 34 | def test_drop(): 35 | df = diamonds.drop(['carat','cut','price'], axis=1) 36 | assert df.equals(diamonds >> drop('carat','cut','price')) 37 | assert df.equals(diamonds >> drop(0, 1, 6)) 38 | assert df.equals(diamonds >> drop(0, 1, 'price')) 39 | assert df.equals(diamonds >> drop([0, X.cut], X.price)) 40 | assert df.equals(diamonds >> drop(X.carat, X['cut'], X.price)) 41 | assert df.equals(diamonds >> drop(X[['carat','cut','price']])) 42 | assert df.equals(diamonds >> drop(X[['carat','cut']], X.price)) 43 | assert df.equals(diamonds >> drop(X.iloc[:,[0,1,6]])) 44 | assert df.equals(diamonds >> drop([X.loc[:, ['carat','cut','price']]])) 45 | 46 | 47 | def test_select_containing(): 48 | df = diamonds[['carat','cut','color','clarity','price']] 49 | assert df.equals(diamonds >> select(contains('c'))) 50 | 51 | 52 | def test_drop_containing(): 53 | df = diamonds[['depth','table','x','y','z']] 54 | assert df.equals(diamonds >> drop(contains('c'))) 55 | 56 | 57 | def test_select_matches(): 58 | df = diamonds[['carat','cut','color','clarity','price']] 59 | assert df.equals(diamonds >> select(matches('^c[auol]|pri'))) 60 | 61 | 62 | def test_drop_matches(): 63 | df = diamonds[['depth','table','x','y','z']] 64 | assert df.equals(diamonds >> drop(matches('^c[auol]|p.i'))) 65 | 66 | 67 | def test_select_startswith(): 68 | df = diamonds[['carat','cut','color','clarity']] 69 | assert df.equals(diamonds >> select(starts_with('c'))) 70 | 71 | 72 | def test_drop_startswith(): 73 | df = diamonds[['depth','table','price','x','y','z']] 74 | assert df.equals(diamonds >> drop(starts_with('c'))) 75 | 76 | 77 | def test_select_endswith(): 78 | df = diamonds[['table','price']] 79 | assert df.equals(diamonds >> select(ends_with('e'))) 80 | 81 | 82 | def test_drop_endswith(): 83 | df = diamonds.drop('z', axis=1) 84 | assert df.equals(diamonds >> drop(ends_with('z'))) 85 | 86 | 87 | def test_select_between(): 88 | df = diamonds[['cut','color','clarity']] 89 | assert 
df.equals(diamonds >> select(columns_between(X.cut, X.clarity))) 90 | assert df.equals(diamonds >> select(columns_between('cut', 'clarity'))) 91 | assert df.equals(diamonds >> select(columns_between(1, 3))) 92 | 93 | df = diamonds[['x','y','z']] 94 | assert df.equals(diamonds >> select(columns_between('x', 20))) 95 | 96 | 97 | 98 | def test_drop_between(): 99 | df = diamonds[['carat','z']] 100 | assert df.equals(diamonds >> drop(columns_between('cut','y'))) 101 | assert df.equals(diamonds >> drop(columns_between(X.cut, 8))) 102 | 103 | df = diamonds[['carat','cut']] 104 | assert df.equals(diamonds >> drop(columns_between(X.color, 20))) 105 | 106 | 107 | def test_select_from(): 108 | df = diamonds[['x','y','z']] 109 | assert df.equals(diamonds >> select(columns_from('x'))) 110 | assert df.equals(diamonds >> select(columns_from(X.x))) 111 | assert df.equals(diamonds >> select(columns_from(7))) 112 | 113 | assert diamonds[[]].equals(diamonds >> select(columns_from(100))) 114 | 115 | 116 | def test_drop_from(): 117 | df = diamonds[['carat','cut']] 118 | assert df.equals(diamonds >> drop(columns_from('color'))) 119 | assert df.equals(diamonds >> drop(columns_from(X.color))) 120 | assert df.equals(diamonds >> drop(columns_from(2))) 121 | 122 | #print(diamonds >> drop(columns_from(0))) 123 | assert diamonds[[]].equals(diamonds >> drop(columns_from(0))) 124 | 125 | 126 | def test_select_to(): 127 | df = diamonds[['carat','cut']] 128 | assert df.equals(diamonds >> select(columns_to('color'))) 129 | assert df.equals(diamonds >> select(columns_to(X.color))) 130 | assert df.equals(diamonds >> select(columns_to(2))) 131 | 132 | 133 | def test_drop_to(): 134 | df = diamonds[['x','y','z']] 135 | assert df.equals(diamonds >> drop(columns_to('x'))) 136 | assert df.equals(diamonds >> drop(columns_to(X.x))) 137 | assert df.equals(diamonds >> drop(columns_to(7))) 138 | 139 | 140 | def select_through(): 141 | df = diamonds[['carat','cut','color']] 142 | assert df.equals(diamonds >> select(columns_to('color', inclusive=True))) 143 | assert df.equals(diamonds >> select(columns_to(X.color, inclusive=True))) 144 | assert df.equals(diamonds >> select(columns_to(2, inclusive=True))) 145 | 146 | 147 | def drop_through(): 148 | df = diamonds[['y','z']] 149 | assert df.equals(diamonds >> drop(columns_to('x', inclusive=True))) 150 | assert df.equals(diamonds >> drop(columns_to(X.x, inclusive=True))) 151 | assert df.equals(diamonds >> drop(columns_to(7, inclusive=True))) 152 | 153 | 154 | 155 | def test_select_if(): 156 | # test 1: manually build diamonds subset where columns are numeric and 157 | # mean is greater than 3 158 | cols = list() 159 | for col in diamonds: 160 | try: 161 | if mean(diamonds[col]) > 3: 162 | cols.append(col) 163 | except: 164 | pass 165 | df_if = diamonds[cols] 166 | assert df_if.equals(diamonds >> select_if(lambda col: mean(col) > 3)) 167 | # test 2: use and 168 | cols = list() 169 | for col in diamonds: 170 | try: 171 | if mean(diamonds[col]) > 3 and max(diamonds[col]) < 50: 172 | cols.append(col) 173 | except: 174 | pass 175 | df_if = diamonds[cols] 176 | assert df_if.equals(diamonds >> select_if(lambda col: mean(col) > 3 and max(col) < 50)) 177 | # test 3: use or 178 | cols = list() 179 | for col in diamonds: 180 | try: 181 | if mean(diamonds[col]) > 3 or max(diamonds[col]) < 6: 182 | cols.append(col) 183 | except: 184 | pass 185 | df_if = diamonds[cols] 186 | assert df_if.equals(diamonds >> select_if(lambda col: mean(col) > 3 or max(col) < 6)) 187 | # test 4: string operations - contain a 
specific string 188 | cols = list() 189 | for col in diamonds: 190 | try: 191 | if any(diamonds[col].str.contains('Ideal')): 192 | cols.append(col) 193 | except: 194 | pass 195 | df_if = diamonds[cols] 196 | assert df_if.equals(diamonds >> select_if(lambda col: any(col.str.contains('Ideal')))) 197 | # test 5: get any text columns 198 | # uses the special '.' regex symbol to find any text value 199 | cols = list() 200 | for col in diamonds: 201 | try: 202 | if any(diamonds[col].str.contains('.')): 203 | cols.append(col) 204 | except: 205 | pass 206 | df_if = diamonds[cols] 207 | assert df_if.equals(diamonds >> select_if(lambda col: any(col.str.contains('.')))) 208 | 209 | 210 | def test_drop_if(): 211 | # test 1: returns a dataframe where any column does not have a mean greater than 3 212 | # this means numeric columns with mean less than 3, and also any non-numeric column 213 | # (since it does not have a mean) 214 | cols = list() 215 | for col in diamonds: 216 | try: 217 | if mean(diamonds[col]) > 3: 218 | cols.append(col) 219 | except: 220 | pass 221 | inverse_cols = [col for col in diamonds if col not in cols] 222 | df_if = diamonds[inverse_cols] 223 | assert df_if.equals(diamonds >> drop_if(lambda col: mean(col) > 3)) 224 | # test 2: use and 225 | # return colums where both conditions are false: 226 | # the mean greater than 3, and max < 50 227 | # again, this will include non-numeric columns 228 | cols = list() 229 | for col in diamonds: 230 | try: 231 | if mean(diamonds[col]) > 3 and max(diamonds[col]) < 50: 232 | cols.append(col) 233 | except: 234 | pass 235 | inverse_cols = [col for col in diamonds if col not in cols] 236 | df_if = diamonds[inverse_cols] 237 | assert df_if.equals(diamonds >> drop_if(lambda col: mean(col) > 3 and max(col) < 50)) 238 | # test 3: use or 239 | # this will return a dataframe where either of the two conditions are false: 240 | # the mean is greater than 3, or the max < 6 241 | cols = list() 242 | for col in diamonds: 243 | try: 244 | if mean(diamonds[col]) > 3 or max(diamonds[col]) < 6: 245 | cols.append(col) 246 | except: 247 | pass 248 | inverse_cols = [col for col in diamonds if col not in cols] 249 | df_if = diamonds[inverse_cols] 250 | assert df_if.equals(diamonds >> drop_if(lambda col: mean(col) > 3 or max(col) < 6)) 251 | # test 4: string operations - contain a specific string 252 | # this will drop any columns if they contain the word 'Ideal' 253 | cols = list() 254 | for col in diamonds: 255 | try: 256 | if any(diamonds[col].str.contains('Ideal')): 257 | cols.append(col) 258 | except: 259 | pass 260 | inverse_cols = [col for col in diamonds if col not in cols] 261 | df_if = diamonds[inverse_cols] 262 | assert df_if.equals(diamonds >> drop_if(lambda col: any(col.str.contains('Ideal')))) 263 | # test 5: drop any text columns 264 | # uses the special '.' 
regex symbol to find any text value 265 | cols = list() 266 | for col in diamonds: 267 | try: 268 | if any(diamonds[col].str.contains('.')): 269 | cols.append(col) 270 | except: 271 | pass 272 | inverse_cols = [col for col in diamonds if col not in cols] 273 | df_if = diamonds[inverse_cols] 274 | assert df_if.equals(diamonds >> drop_if(lambda col: any(col.str.contains('.')))) 275 | -------------------------------------------------------------------------------- /dfply/base.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import warnings 4 | from functools import partial, wraps 5 | 6 | 7 | def _recursive_apply(f, l): 8 | if isinstance(l, (list, tuple)): 9 | out = [_recursive_apply(f, l_) for l_ in l] 10 | if isinstance(l, tuple): 11 | out = tuple(out) 12 | return out 13 | else: 14 | return f(l) 15 | 16 | 17 | def contextualize(arg, context): 18 | if isinstance(arg, Intention): 19 | arg = arg.evaluate(context) 20 | return arg 21 | 22 | 23 | def flatten(l): 24 | for el in l: 25 | if isinstance(el, (tuple, list)): 26 | yield from flatten(el) 27 | else: 28 | yield el 29 | 30 | 31 | def _check_delayed_eval(args, kwargs): 32 | check = lambda x: isinstance(x, Intention) 33 | delay = any([a for a in flatten(_recursive_apply(check, args))]) 34 | delay = delay or any([v for v in flatten(_recursive_apply(check, list(kwargs.values())))]) 35 | return delay 36 | 37 | 38 | def _context_args(args): 39 | return lambda x: _recursive_apply(partial(contextualize, context=x), args) 40 | 41 | 42 | def _context_kwargs(kwargs): 43 | values_ = lambda x: _recursive_apply(partial(contextualize, context=x), 44 | list(kwargs.values())) 45 | return lambda x: {k: v for k, v in zip(kwargs.keys(), values_(x))} 46 | 47 | 48 | def _delayed_function(function, args, kwargs): 49 | return lambda x: function(*_context_args(args)(x), 50 | **_context_kwargs(kwargs)(x)) 51 | 52 | 53 | def make_symbolic(f): 54 | def wrapper(*args, **kwargs): 55 | delay = _check_delayed_eval(args, kwargs) 56 | if delay: 57 | delayed = _delayed_function(f, args, kwargs) 58 | return Intention(delayed) 59 | else: 60 | return f(*args, **kwargs) 61 | 62 | return wrapper 63 | 64 | 65 | class Intention(object): 66 | def __init__(self, function=lambda x: x, invert=False): 67 | self.function = function 68 | self.inverted = invert 69 | 70 | def evaluate(self, context): 71 | return self.function(context) 72 | 73 | def __getattr__(self, attribute): 74 | return Intention(lambda x: getattr(self.function(x), attribute), 75 | invert=self.inverted) 76 | 77 | def __invert__(self): 78 | return Intention(self.function, invert=not self.inverted) 79 | 80 | def __call__(self, *args, **kwargs): 81 | return Intention(lambda x: self.function(x)(*_context_args(args)(x), 82 | **_context_kwargs(kwargs)(x)), 83 | invert=self.inverted) 84 | 85 | 86 | _magic_method_names = [ 87 | '__abs__', '__add__', '__and__', '__cmp__', '__complex__', '__contains__', 88 | '__delattr__', '__delete__', '__delitem__', '__delslice__', '__div__', 89 | '__divmod__', '__enter__', '__eq__', '__exit__', '__float__', 90 | '__floordiv__', '__ge__', '__get__', '__getitem__', '__getslice__', 91 | '__gt__', '__hash__', '__hex__', '__iadd__', '__iand__', '__idiv__', 92 | '__ifloordiv__', '__ilshift__', '__imod__', '__imul__', '__index__', 93 | '__int__', '__ior__', '__ipow__', '__irshift__', '__isub__', 94 | '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__long__', 95 | '__lshift__', '__lt__', '__mod__', 
'__mul__', '__ne__', '__neg__',
96 | '__nonzero__', '__oct__', '__or__', '__pos__', '__pow__', '__radd__',
97 | '__rand__', '__rcmp__', '__rdiv__', '__rdivmod__', # '__repr__',
98 | '__reversed__', '__rfloordiv__', '__rlshift__', '__rmod__', '__rmul__',
99 | '__ror__', '__rpow__', '__rrshift__', '__rshift__', '__rsub__',
100 | '__rtruediv__', '__rxor__', '__set__', '__setitem__', '__setslice__',
101 | '__sub__', '__truediv__', '__unicode__', '__xor__', '__str__',
102 | ]
103 | 
104 | 
105 | def _set_magic_method(name):
106 | def magic_method(self, *args, **kwargs):
107 | return Intention(lambda x: getattr(self.function(x), name)(*_context_args(args)(x),
108 | **_context_kwargs(kwargs)(x)),
109 | invert=self.inverted)
110 | 
111 | return magic_method
112 | 
113 | 
114 | for name in _magic_method_names:
115 | setattr(Intention, name, _set_magic_method(name))
116 | 
117 | # Initialize the global X symbol
118 | X = Intention()
119 | 
120 | 
121 | class pipe(object):
122 | __name__ = "pipe"
123 | 
124 | def __init__(self, function):
125 | self.function = function
126 | self.__doc__ = function.__doc__
127 | 
128 | self.chained_pipes = []
129 | 
130 | def __rshift__(self, other):
131 | assert isinstance(other, pipe)
132 | self.chained_pipes.append(other)
133 | return self
134 | 
135 | def __rrshift__(self, other):
136 | other_copy = other.copy()
137 | 
138 | with warnings.catch_warnings():
139 | warnings.simplefilter("ignore")
140 | other_copy._grouped_by = getattr(other, '_grouped_by', None)
141 | 
142 | result = self.function(other_copy)
143 | 
144 | for p in self.chained_pipes:
145 | result = p.__rrshift__(result)
146 | return result
147 | 
148 | def __call__(self, *args, **kwargs):
149 | return pipe(lambda x: self.function(x, *args, **kwargs))
150 | 
151 | 
152 | class IntentionEvaluator(object):
153 | """
154 | Parent class for symbolic argument decorators.
155 | Default behavior is to recursively turn the arguments and keyword
156 | arguments of a decorated function into delayed `Intention` objects that
157 | can be evaluated against a pandas DataFrame as it comes down a pipe.
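Three per-argument evaluation modes are supported: plain symbolic evaluation (the `Intention` is simply called on the DataFrame), label evaluation (the result is additionally converted to column name(s)), and selector evaluation (the result is converted to a +/-1 selection vector over the columns, used for column selection).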
158 | """ 159 | 160 | __name__ = "IntentionEvaluator" 161 | 162 | def __init__(self, function, eval_symbols=True, eval_as_label=[], 163 | eval_as_selector=[]): 164 | super(IntentionEvaluator, self).__init__() 165 | self.function = function 166 | self.__doc__ = function.__doc__ 167 | 168 | self.eval_symbols = eval_symbols 169 | self.eval_as_label = eval_as_label 170 | self.eval_as_selector = eval_as_selector 171 | 172 | def _evaluate(self, df, arg): 173 | if isinstance(arg, Intention): 174 | negate = arg.inverted 175 | arg = arg.evaluate(df) 176 | if negate: 177 | arg = ~arg 178 | return arg 179 | 180 | def _evaluate_label(self, df, arg): 181 | arg = self._evaluate(df, arg) 182 | 183 | cols = list(df.columns) 184 | if isinstance(arg, pd.Series): 185 | arg = arg.name 186 | if isinstance(arg, pd.Index): 187 | arg = list(arg) 188 | if isinstance(arg, int): 189 | arg = cols[arg] 190 | return arg 191 | 192 | def _evaluate_selector(self, df, arg): 193 | negate = False 194 | if isinstance(arg, Intention): 195 | negate = arg.inverted 196 | arg = arg.evaluate(df) 197 | 198 | cols = list(df.columns) 199 | if isinstance(arg, pd.Series): 200 | arg = [cols.index(arg.name)] 201 | if isinstance(arg, pd.Index): 202 | arg = [cols.index(i) for i in list(arg)] 203 | if isinstance(arg, pd.DataFrame): 204 | arg = [cols.index(i) for i in arg.columns] 205 | if isinstance(arg, int): 206 | arg = [arg] 207 | if isinstance(arg, str): 208 | arg = [cols.index(arg)] 209 | if isinstance(arg, (list, tuple)): 210 | arg = [cols.index(i) if isinstance(i, str) else i for i in arg] 211 | 212 | selection_vector = np.zeros(df.shape[1]) 213 | col_idx = np.array(arg) 214 | 215 | if negate and len(col_idx) > 0: 216 | selection_vector[col_idx] = -1 217 | elif len(col_idx) > 0: 218 | selection_vector[col_idx] = 1 219 | return selection_vector 220 | 221 | def _evaluator_loop(self, df, arg, eval_func): 222 | if isinstance(arg, (list, tuple)): 223 | return [self._evaluator_loop(df, a_, eval_func) for a_ in arg] 224 | else: 225 | return eval_func(df, arg) 226 | 227 | def _symbolic_eval(self, df, arg): 228 | return self._evaluator_loop(df, arg, self._evaluate) 229 | 230 | def _symbolic_to_label(self, df, arg): 231 | return self._evaluator_loop(df, arg, self._evaluate_label) 232 | 233 | def _symbolic_to_selector(self, df, arg): 234 | return self._evaluator_loop(df, arg, self._evaluate_selector) 235 | 236 | def _recursive_arg_eval(self, df, args): 237 | eval_symbols = self._find_eval_args(self.eval_symbols, args) 238 | eval_as_label = self._find_eval_args(self.eval_as_label, args) 239 | eval_as_selector = self._find_eval_args(self.eval_as_selector, args) 240 | 241 | return [ 242 | self._symbolic_to_label(df, a) if i in eval_as_label 243 | else self._symbolic_to_selector(df, a) if i in eval_as_selector 244 | else self._symbolic_eval(df, a) if i in eval_symbols 245 | else a 246 | for i, a in enumerate(args) 247 | ] 248 | 249 | def _recursive_kwarg_eval(self, df, kwargs): 250 | eval_symbols = self._find_eval_kwargs(self.eval_symbols, kwargs) 251 | eval_as_label = self._find_eval_kwargs(self.eval_as_label, kwargs) 252 | eval_as_selector = self._find_eval_kwargs(self.eval_as_selector, kwargs) 253 | 254 | return { 255 | k: (self._symbolic_to_label(df, v) if k in eval_as_label 256 | else self._symbolic_to_selector(df, v) if k in eval_as_selector 257 | else self._symbolic_eval(df, v) if k in eval_symbols 258 | else v) 259 | for k, v in kwargs.items() 260 | } 261 | 262 | def _find_eval_args(self, request, args): 263 | if (request == True) or ('*' in 
request):
264 | return [i for i in range(len(args))]
265 | elif request in [None, False]:
266 | return []
267 | return request
268 | 
269 | def _find_eval_kwargs(self, request, kwargs):
270 | if (request == True) or ('**' in request):
271 | return [k for k in kwargs.keys()]
272 | elif request in [None, False]:
273 | return []
274 | return request
275 | 
276 | def __call__(self, *args, **kwargs):
277 | df = args[0]
278 | 
279 | args = self._recursive_arg_eval(df, args[1:])
280 | kwargs = self._recursive_kwarg_eval(df, kwargs)
281 | 
282 | return self.function(df, *args, **kwargs)
283 | 
284 | 
285 | def symbolic_evaluation(function=None, eval_symbols=True, eval_as_label=[],
286 | eval_as_selector=[]):
287 | if function:
288 | return IntentionEvaluator(function)
289 | else:
290 | # `function` is None on this branch (the decorator was called with
291 | # arguments), so there is nothing to wrap yet; return the configured decorator.
292 | def wrapper(function):
293 | return IntentionEvaluator(function, eval_symbols=eval_symbols,
294 | eval_as_label=eval_as_label,
295 | eval_as_selector=eval_as_selector)
296 | return wrapper
297 | 
298 | 
299 | class group_delegation(object):
300 | __name__ = "group_delegation"
301 | 
302 | def __init__(self, function):
303 | self.function = function
304 | self.__doc__ = function.__doc__
305 | 
306 | def _apply(self, df, *args, **kwargs):
307 | grouped = df.groupby(df._grouped_by)
308 | 
309 | dff = grouped.apply(self.function, *args, **kwargs)
310 | # Save all the metadata attributes back into the new data frame
311 | for field in df._metadata:
312 | setattr(dff, field, getattr(df, field))
313 | df = dff
314 | 
315 | for name in df.index.names[:-1]:
316 | if name in df:
317 | df.reset_index(level=0, drop=True, inplace=True)
318 | else:
319 | df.reset_index(level=0, inplace=True)
320 | 
321 | if (df.index == 0).all():
322 | df.reset_index(drop=True, inplace=True)
323 | 
324 | return df
325 | 
326 | def __call__(self, *args, **kwargs):
327 | grouped_by = getattr(args[0], '_grouped_by', None)
328 | if (grouped_by is None) or not all([g in args[0].columns for g in grouped_by]):
329 | return self.function(*args, **kwargs)
330 | else:
331 | applied = self._apply(args[0], *args[1:], **kwargs)
332 | 
333 | with warnings.catch_warnings():
334 | warnings.simplefilter("ignore")
335 | applied._grouped_by = grouped_by
336 | 
337 | return applied
338 | 
339 | 
340 | def dfpipe(f):
341 | return pipe(
342 | group_delegation(
343 | symbolic_evaluation(f)
344 | )
345 | )
346 | 
--------------------------------------------------------------------------------
/dfply/reshape.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | import re
3 | 
4 | 
5 | # ------------------------------------------------------------------------------
6 | # Sorting
7 | # ------------------------------------------------------------------------------
8 | 
9 | @dfpipe
10 | def arrange(df, *args, **kwargs):
11 | """Calls `pandas.DataFrame.sort_values` to sort a DataFrame according to
12 | criteria.
13 | 
14 | See:
15 | http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
16 | 
17 | for a list of the keyword arguments accepted by `sort_values`; `arrange`
18 | passes them through unchanged.
19 | 
20 | Args:
21 | *args: Symbolic, string, integer or lists of those types indicating
22 | columns to sort the DataFrame by.
23 | 
24 | Kwargs:
25 | **kwargs: Any keyword arguments will be passed through to the pandas
26 | `DataFrame.sort_values` function.
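Example (illustrative calls adapted from the test suite; output omitted):

    diamonds >> arrange('depth', ascending=False) >> head(5)
    diamonds >> arrange(desc(X.cut), desc(X.price)) >> head(5)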
27 | """ 28 | 29 | flat_args = [a for a in flatten(args)] 30 | 31 | series = [df[arg] if isinstance(arg, str) else 32 | df.iloc[:, arg] if isinstance(arg, int) else 33 | pd.Series(arg) for arg in flat_args] 34 | 35 | sorter = pd.concat(series, axis=1).reset_index(drop=True) 36 | sorter = sorter.sort_values(sorter.columns.tolist(), **kwargs) 37 | return df.iloc[sorter.index, :] 38 | 39 | 40 | # ------------------------------------------------------------------------------ 41 | # Renaming 42 | # ------------------------------------------------------------------------------ 43 | 44 | @pipe 45 | @symbolic_evaluation(eval_as_label=True) 46 | def rename(df, **kwargs): 47 | """Renames columns, where keyword argument values are the current names 48 | of columns and keys are the new names. 49 | 50 | Args: 51 | df (:obj:`pandas.DataFrame`): DataFrame passed in via `>>` pipe. 52 | 53 | Kwargs: 54 | **kwargs: key:value pairs where keys are new names for columns and 55 | values are current names of columns. 56 | """ 57 | 58 | return df.rename(columns={v: k for k, v in kwargs.items()}) 59 | 60 | 61 | # ------------------------------------------------------------------------------ 62 | # Elongate 63 | # ------------------------------------------------------------------------------ 64 | 65 | @pipe 66 | @symbolic_evaluation(eval_as_label=['*']) 67 | def gather(df, key, values, *args, **kwargs): 68 | """ 69 | Melts the specified columns in your DataFrame into two key:value columns. 70 | 71 | Args: 72 | key (str): Name of identifier column. 73 | values (str): Name of column that will contain values for the key. 74 | *args (str, int, symbolic): Columns to "melt" into the new key and 75 | value columns. If no args are specified, all columns are melted 76 | into they key and value columns. 77 | 78 | Kwargs: 79 | add_id (bool): Boolean value indicating whether to add a `"_ID"` 80 | column that will preserve information about the original rows 81 | (useful for being able to re-widen the data later). 82 | 83 | Example: 84 | diamonds >> gather('variable', 'value', ['price', 'depth','x','y','z']) >> head(5) 85 | 86 | carat cut color clarity table variable value 87 | 0 0.23 Ideal E SI2 55.0 price 326.0 88 | 1 0.21 Premium E SI1 61.0 price 326.0 89 | 2 0.23 Good E VS1 65.0 price 327.0 90 | 3 0.29 Premium I VS2 58.0 price 334.0 91 | 4 0.31 Good J SI2 58.0 price 335.0 92 | """ 93 | 94 | if len(args) == 0: 95 | args = df.columns.tolist() 96 | else: 97 | args = [a for a in flatten(args)] 98 | 99 | if kwargs.get('add_id', False): 100 | df = df.assign(_ID=np.arange(df.shape[0])) 101 | 102 | columns = df.columns.tolist() 103 | id_vars = [col for col in columns if col not in args] 104 | return pd.melt(df, id_vars, list(args), key, values) 105 | 106 | 107 | # ------------------------------------------------------------------------------ 108 | # Widen 109 | # ------------------------------------------------------------------------------ 110 | 111 | def convert_type(df, columns): 112 | """ 113 | Helper function that attempts to convert columns into their appropriate 114 | data type. 
115 | """ 116 | # taken in part from the dplython package 117 | out_df = df.copy() 118 | for col in columns: 119 | column_values = pd.Series(out_df[col].unique()) 120 | column_values = column_values[~column_values.isnull()] 121 | # empty 122 | if len(column_values) == 0: 123 | continue 124 | # boolean 125 | if set(column_values.values) < {'True', 'False'}: 126 | out_df[col] = out_df[col].map({'True': True, 'False': False}) 127 | continue 128 | # numeric 129 | if pd.to_numeric(column_values, errors='coerce').isnull().sum() == 0: 130 | out_df[col] = pd.to_numeric(out_df[col], errors='ignore') 131 | continue 132 | # datetime 133 | if pd.to_datetime(column_values, errors='coerce').isnull().sum() == 0: 134 | out_df[col] = pd.to_datetime(out_df[col], errors='ignore', 135 | infer_datetime_format=True) 136 | continue 137 | 138 | return out_df 139 | 140 | 141 | @pipe 142 | @symbolic_evaluation(eval_as_label=['*']) 143 | def spread(df, key, values, convert=False): 144 | """ 145 | Transforms a "long" DataFrame into a "wide" format using a key and value 146 | column. 147 | 148 | If you have a mixed datatype column in your long-format DataFrame then the 149 | default behavior is for the spread columns to be of type `object`, or 150 | string. If you want to try to convert dtypes when spreading, you can set 151 | the convert keyword argument in spread to True. 152 | 153 | Args: 154 | key (str, int, or symbolic): Label for the key column. 155 | values (str, int, or symbolic): Label for the values column. 156 | 157 | Kwargs: 158 | convert (bool): Boolean indicating whether or not to try and convert 159 | the spread columns to more appropriate data types. 160 | 161 | 162 | Example: 163 | widened = elongated >> spread(X.variable, X.value) 164 | widened >> head(5) 165 | 166 | _ID carat clarity color cut depth price table x y z 167 | 0 0 0.23 SI2 E Ideal 61.5 326 55 3.95 3.98 2.43 168 | 1 1 0.21 SI1 E Premium 59.8 326 61 3.89 3.84 2.31 169 | 2 10 0.3 SI1 J Good 64 339 55 4.25 4.28 2.73 170 | 3 100 0.75 SI1 D Very Good 63.2 2760 56 5.8 5.75 3.65 171 | 4 1000 0.75 SI1 D Ideal 62.3 2898 55 5.83 5.8 3.62 172 | """ 173 | 174 | # Taken mostly from dplython package 175 | columns = df.columns.tolist() 176 | id_cols = [col for col in columns if not col in [key, values]] 177 | 178 | temp_index = ['' for i in range(len(df))] 179 | for id_col in id_cols: 180 | temp_index += df[id_col].map(str) 181 | 182 | out_df = df.assign(temp_index=temp_index) 183 | out_df = out_df.set_index('temp_index') 184 | spread_data = out_df[[key, values]] 185 | 186 | if not all(spread_data.groupby([spread_data.index, key]).agg( 187 | 'count').reset_index()[values] < 2): 188 | raise ValueError('Duplicate identifiers') 189 | 190 | spread_data = spread_data.pivot(columns=key, values=values) 191 | 192 | if convert and (out_df[values].dtype.kind in 'OSaU'): 193 | columns_to_convert = [col for col in spread_data if col not in columns] 194 | spread_data = convert_type(spread_data, columns_to_convert) 195 | 196 | out_df = out_df[id_cols].drop_duplicates() 197 | out_df = out_df.merge(spread_data, left_index=True, right_index=True).reset_index(drop=True) 198 | 199 | out_df = (out_df >> arrange(id_cols)).reset_index(drop=True) 200 | 201 | return out_df 202 | 203 | 204 | # ------------------------------------------------------------------------------ 205 | # Separate columns 206 | # ------------------------------------------------------------------------------ 207 | 208 | @pipe 209 | @symbolic_evaluation(eval_as_label=['*']) 210 | def separate(df, column, 
into, sep="[\W_]+", remove=True, convert=False, 211 | extra='drop', fill='right'): 212 | """ 213 | Splits columns into multiple columns. 214 | 215 | Args: 216 | df (pandas.DataFrame): DataFrame passed in through the pipe. 217 | column (str, symbolic): Label of column to split. 218 | into (list): List of string names for new columns. 219 | 220 | Kwargs: 221 | sep (str or list): If a string, the regex string used to split the 222 | column. If a list, a list of integer positions to split strings 223 | on. 224 | remove (bool): Boolean indicating whether to remove the original column. 225 | convert (bool): Boolean indicating whether the new columns should be 226 | converted to the appropriate type. 227 | extra (str): either `'drop'`, where split pieces beyond the specified 228 | new columns are dropped, or `'merge'`, where the final split piece 229 | contains the remainder of the original column. 230 | fill (str): either `'right'`, where `np.nan` values are filled in the 231 | right-most columns for missing pieces, or `'left'` where `np.nan` 232 | values are filled in the left-most columns. 233 | """ 234 | 235 | assert isinstance(into, (tuple, list)) 236 | 237 | if isinstance(sep, (tuple, list)): 238 | inds = [0] + list(sep) 239 | if len(inds) > len(into): 240 | if extra == 'drop': 241 | inds = inds[:len(into) + 1] 242 | elif extra == 'merge': 243 | inds = inds[:len(into)] + [None] 244 | else: 245 | inds = inds + [None] 246 | 247 | splits = df[column].map(lambda x: [str(x)[slice(inds[i], inds[i + 1])] 248 | if i < len(inds) - 1 else np.nan 249 | for i in range(len(into))]) 250 | 251 | else: 252 | maxsplit = len(into) - 1 if extra == 'merge' else 0 253 | splits = df[column].map(lambda x: re.split(sep, x, maxsplit)) 254 | 255 | right_filler = lambda x: x + [np.nan for i in range(len(into) - len(x))] 256 | left_filler = lambda x: [np.nan for i in range(len(into) - len(x))] + x 257 | 258 | if fill == 'right': 259 | splits = [right_filler(x) for x in splits] 260 | elif fill == 'left': 261 | splits = [left_filler(x) for x in splits] 262 | 263 | for i, split_col in enumerate(into): 264 | df[split_col] = [x[i] if not x[i] == '' else np.nan for x in splits] 265 | 266 | if convert: 267 | df = convert_type(df, into) 268 | 269 | if remove: 270 | df.drop(column, axis=1, inplace=True) 271 | 272 | return df 273 | 274 | 275 | # ------------------------------------------------------------------------------ 276 | # Unite columns 277 | # ------------------------------------------------------------------------------ 278 | 279 | @pipe 280 | @symbolic_evaluation(eval_as_label=['*']) 281 | def unite(df, colname, *args, **kwargs): 282 | """ 283 | Does the inverse of `separate`, joining columns together by a specified 284 | separator. 285 | 286 | Any columns that are not strings will be converted to strings. 287 | 288 | Args: 289 | df (pandas.DataFrame): DataFrame passed in through the pipe. 290 | colname (str): the name of the new joined column. 291 | *args: list of columns to be joined, which can be strings, symbolic, or 292 | integer positions. 293 | 294 | Kwargs: 295 | sep (str): the string separator to join the columns with. 296 | remove (bool): Boolean indicating whether or not to remove the 297 | original columns. 298 | na_action (str): can be one of `'maintain'` (the default), 299 | '`ignore'`, or `'as_string'`. The default will make the new column 300 | row a `NaN` value if any of the original column cells at that 301 | row contained `NaN`. 
274 | 
275 | # ------------------------------------------------------------------------------
276 | # Unite columns
277 | # ------------------------------------------------------------------------------
278 | 
279 | @pipe
280 | @symbolic_evaluation(eval_as_label=['*'])
281 | def unite(df, colname, *args, **kwargs):
282 |     """
283 |     Does the inverse of `separate`, joining columns together by a specified
284 |     separator.
285 | 
286 |     Any columns that are not strings will be converted to strings.
287 | 
288 |     Args:
289 |         df (pandas.DataFrame): DataFrame passed in through the pipe.
290 |         colname (str): the name of the new joined column.
291 |         *args: list of columns to be joined, which can be strings, symbolic, or
292 |             integer positions.
293 | 
294 |     Kwargs:
295 |         sep (str): the string separator to join the columns with.
296 |         remove (bool): Boolean indicating whether or not to remove the
297 |             original columns.
298 |         na_action (str): can be one of `'maintain'` (the default),
299 |             `'ignore'`, or `'as_string'`. The default will make the new column
300 |             row a `NaN` value if any of the original column cells at that
301 |             row contained `NaN`. `'ignore'` will treat any `NaN` value as an
302 |             empty string during joining. `'as_string'` will convert any `NaN`
303 |             value to the string `'nan'` prior to joining.
304 |     """
305 | 
306 |     to_unite = [a for a in flatten(args)]
307 |     sep = kwargs.get('sep', '_')
308 |     remove = kwargs.get('remove', True)
309 |     # possible na_action values
310 |     # ignore: empty string
311 |     # maintain: keep as np.nan (default)
312 |     # as_string: becomes string 'nan'
313 |     na_action = kwargs.get('na_action', 'maintain')
314 | 
315 |     # print(to_unite, sep, remove, na_action)
316 | 
317 |     if na_action == 'maintain':
318 |         df[colname] = df[to_unite].apply(lambda x: np.nan if any(x.isnull())
319 |                                          else sep.join(x.map(str)), axis=1)
320 |     elif na_action == 'ignore':
321 |         df[colname] = df[to_unite].apply(lambda x: sep.join(x[~x.isnull()].map(str)),
322 |                                          axis=1)
323 |     elif na_action == 'as_string':
324 |         df[colname] = df[to_unite].astype(str).apply(lambda x: sep.join(x), axis=1)
325 | 
326 |     if remove:
327 |         df.drop(to_unite, axis=1, inplace=True)
328 | 
329 |     return df
330 | 
--------------------------------------------------------------------------------
/test/test_summary_functions.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from dfply import *
3 | 
4 | 
5 | ##==============================================================================
6 | ## transform summary functions
7 | ##==============================================================================
8 | 
9 | def test_mean():
10 |     df = diamonds >> select(X.cut, X.x) >> head(5)
11 |     # straight summarize
12 |     t = df >> summarize(m=mean(X.x))
13 |     df_truth = pd.DataFrame({'m': [4.086]})
14 |     assert t.equals(df_truth)
15 |     # grouped summarize
16 |     t = df >> group_by(X.cut) >> summarize(m=mean(X.x))
17 |     df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
18 |                              'm': [4.195, 3.950, 4.045]})
19 |     assert t.equals(df_truth)
20 |     # straight mutate
21 |     t = df >> mutate(m=mean(X.x))
22 |     df_truth = df.copy()
23 |     df_truth['m'] = df_truth.x.mean()
24 |     assert t.equals(df_truth)
25 |     # grouped mutate
26 |     t = df >> group_by(X.cut) >> mutate(m=mean(X.x))
27 |     df_truth['m'] = pd.Series([3.950, 4.045, 4.195, 4.045, 4.195])
28 |     assert t.sort_index().equals(df_truth)
29 | 
30 | 
31 | def test_first():
32 |     df = diamonds >> select(X.cut, X.x) >> head(5)
33 |     # straight summarize
34 |     t = df >> summarize(f=first(X.x))
35 |     df_truth = pd.DataFrame({'f': [3.95]})
36 |     assert t.equals(df_truth)
37 |     # grouped summarize
38 |     t = df >> group_by(X.cut) >> summarize(f=first(X.x))
39 |     df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'],
40 |                              'f': [4.05, 3.95, 3.89]})
41 |     assert t.equals(df_truth)
42 |     # summarize with order_by
43 |     t = df >> summarize(f=first(X.x, order_by=desc(X.cut)))
44 |     assert pd.DataFrame({'f': [3.89]}).equals(t)
45 |     # straight mutate
46 |     t = df >> mutate(f=first(X.x))
47 |     df_truth = df.copy()
48 |     df_truth['f'] = df_truth.x.iloc[0]
49 |     assert t.equals(df_truth)
50 |     # grouped mutate
51 |     t = df >> group_by(X.cut) >> mutate(f=first(X.x))
52 |     df_truth['f'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
53 |     assert t.sort_index().equals(df_truth)
54 | 
55 | 
56 | def test_last():
57 |     df = diamonds >> select(X.cut, X.x) >> head(5)
58 |     # straight summarize
59 |     t = df >> summarize(l=last(X.x))
60 |     df_truth = pd.DataFrame({'l': [4.34]})
61 |     assert t.equals(df_truth)
62 |     # grouped summarize
63 |     t = df >> group_by(X.cut) >> summarize(l=last(X.x))
64 |     df_truth = pd.DataFrame({'cut': ['Good',
'Ideal', 'Premium'], 65 | 'l': [4.34, 3.95, 4.20]}) 66 | assert t.equals(df_truth) 67 | # summarize with order_by 68 | #t = df >> summarize(f=last(X.x, order_by=desc(X.cut))) 69 | t = df >> summarize(f=last(X.x, order_by=[desc(X.cut), desc(X.x)])) 70 | df_truth = pd.DataFrame({'f':[4.05]}) 71 | assert df_truth.equals(t) 72 | # straight mutate 73 | t = df >> mutate(l=last(X.x)) 74 | df_truth = df.copy() 75 | df_truth['l'] = df_truth.x.iloc[4] 76 | assert t.equals(df_truth) 77 | # grouped mutate 78 | t = df >> group_by(X.cut) >> mutate(l=last(X.x)) 79 | df_truth['l'] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34]) 80 | assert t.sort_index().equals(df_truth) 81 | 82 | 83 | def test_nth(): 84 | df = diamonds >> select(X.cut, X.x) >> head(10) 85 | # straight summarize 86 | t = df >> summarize(second=nth(X.x, 1)) 87 | df_truth = pd.DataFrame({'second': [3.89]}) 88 | assert t.equals(df_truth) 89 | # grouped summarize 90 | t = df >> group_by(X.cut) >> summarize(first=nth(X.x, 0)) 91 | df_truth = pd.DataFrame({'cut': ['Fair','Good', 'Ideal', 'Premium','Very Good'], 92 | 'first': [3.87,4.05,3.95,3.89,3.94]}) 93 | assert t.equals(df_truth) 94 | # summarize with order_by 95 | t = df >> summarize(last=nth(X.x, -1, order_by=[desc(X.cut), desc(X.x)])) 96 | #print(t) 97 | df_truth = pd.DataFrame({'last':[3.87]}) 98 | #print(df_truth) 99 | #print(t) 100 | assert df_truth.equals(t) 101 | # straight mutate 102 | t = df >> mutate(out_of_range=nth(X.x, 500)) 103 | df_truth = df.copy() 104 | df_truth['out_of_range'] = np.nan 105 | assert t.equals(df_truth) 106 | # grouped mutate 107 | t = df >> group_by(X.cut) >> mutate(penultimate=nth(X.x, -2)) 108 | df_truth = df.copy() 109 | df_truth['penultimate'] = pd.Series([np.nan,3.89,4.05,3.89,4.05,4.07, 110 | 4.07,4.07,np.nan,4.07]) 111 | print(t) 112 | print(df_truth) 113 | assert t.sort_index().equals(df_truth) 114 | 115 | 116 | def test_n(): 117 | df = diamonds >> select(X.cut, X.x) >> head(5) 118 | # straight summarize 119 | t = df >> summarize(n=n(X.x)) 120 | df_truth = pd.DataFrame({'n': [5]}) 121 | assert t.equals(df_truth) 122 | # grouped summarize 123 | t = df >> group_by(X.cut) >> summarize(n=n(X.x)) 124 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'], 125 | 'n': [2, 1, 2]}) 126 | assert t.equals(df_truth) 127 | # straight mutate 128 | t = df >> mutate(n=n(X.x)) 129 | df_truth = df.copy() 130 | df_truth['n'] = 5 131 | assert t.equals(df_truth) 132 | # grouped mutate 133 | t = df >> group_by(X.cut) >> mutate(n=n(X.x)) 134 | df_truth['n'] = pd.Series([1, 2, 2, 2, 2, 2]) 135 | print(t) 136 | print(df_truth) 137 | assert t.sort_index().equals(df_truth) 138 | 139 | 140 | def test_n_distinct(): 141 | df = pd.DataFrame({'col_1': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c'], 142 | 'col_2': [1, 1, 1, 2, 3, 3, 4, 5]}) 143 | # straight summarize 144 | t = df >> summarize(n=n_distinct(X.col_2)) 145 | df_truth = pd.DataFrame({'n': [5]}) 146 | assert t.equals(df_truth) 147 | # grouped summarize 148 | t = df >> group_by(X.col_1) >> summarize(n=n_distinct(X.col_2)) 149 | df_truth = pd.DataFrame({'col_1': ['a', 'b', 'c'], 150 | 'n': [1, 2, 2]}) 151 | assert t.equals(df_truth) 152 | # straight mutate 153 | t = df >> mutate(n=n_distinct(X.col_2)) 154 | df_truth = df.copy() 155 | df_truth['n'] = 5 156 | assert t.equals(df_truth) 157 | # grouped mutate 158 | t = df >> group_by(X.col_1) >> mutate(n=n_distinct(X.col_2)) 159 | df_truth['n'] = pd.Series([1, 1, 1, 2, 2, 2, 2, 2]) 160 | assert t.equals(df_truth) 161 | 162 | 163 | def test_IQR(): 164 | df = diamonds >> 
select(X.cut, X.x) >> head(5) 165 | # straight summarize 166 | t = df >> summarize(i=IQR(X.x)) 167 | df_truth = pd.DataFrame({'i': [.25]}) 168 | assert t.equals(df_truth) 169 | # grouped summarize 170 | t = df >> group_by(X.cut) >> summarize(i=IQR(X.x)) 171 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'], 172 | 'i': [0.145, 0.000, 0.155]}) 173 | test_vector = abs(t.i - df_truth.i) 174 | assert all(test_vector < 0.000000001) 175 | # straight mutate 176 | t = df >> mutate(i=IQR(X.x)) 177 | df_truth = df.copy() 178 | df_truth['i'] = 0.25 179 | assert t.equals(df_truth) 180 | # grouped mutate 181 | t = df >> group_by(X.cut) >> mutate(i=IQR(X.x)) 182 | df_truth['i'] = pd.Series([0.000, 0.155, 0.145, 0.155, 0.145]) 183 | test_vector = abs(t.i - df_truth.i) 184 | assert all(test_vector < 0.000000001) 185 | 186 | 187 | def test_colmin(): 188 | df = diamonds >> select(X.cut, X.x) >> head(5) 189 | # straight summarize 190 | t = df >> summarize(m=colmin(X.x)) 191 | df_truth = pd.DataFrame({'m': [3.89]}) 192 | assert t.equals(df_truth) 193 | # grouped summarize 194 | t = df >> group_by(X.cut) >> summarize(m=colmin(X.x)) 195 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'], 196 | 'm': [4.05, 3.95, 3.89]}) 197 | assert t.equals(df_truth) 198 | # straight mutate 199 | t = df >> mutate(m=colmin(X.x)) 200 | df_truth = df.copy() 201 | df_truth['m'] = 3.89 202 | assert t.equals(df_truth) 203 | # grouped mutate 204 | t = df >> group_by(X.cut) >> mutate(m=colmin(X.x)) 205 | df_truth['m'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05]) 206 | assert t.sort_index().equals(df_truth) 207 | 208 | 209 | def test_colmax(): 210 | df = diamonds >> select(X.cut, X.x) >> head(5) 211 | # straight summarize 212 | t = df >> summarize(m=colmax(X.x)) 213 | df_truth = pd.DataFrame({'m': [4.34]}) 214 | assert t.equals(df_truth) 215 | # grouped summarize 216 | t = df >> group_by(X.cut) >> summarize(m=colmax(X.x)) 217 | df_truth = pd.DataFrame({'cut': ['Good', 'Ideal', 'Premium'], 218 | 'm': [4.34, 3.95, 4.20]}) 219 | assert t.equals(df_truth) 220 | # straight mutate 221 | t = df >> mutate(m=colmax(X.x)) 222 | df_truth = df.copy() 223 | df_truth['m'] = 4.34 224 | assert t.equals(df_truth) 225 | # grouped mutate 226 | print(df.groupby('cut')['x'].agg(np.max)) 227 | print(df) 228 | t = df >> group_by(X.cut) >> mutate(m=colmax(X.x)) 229 | df_truth['m'] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34]) 230 | print(t) 231 | print(df_truth) 232 | assert t.sort_index().equals(df_truth) 233 | 234 | 235 | def test_median(): 236 | df = diamonds >> group_by(X.cut) >> head(3) >> select(X.cut, X.x) >> ungroup() 237 | # straight summarize 238 | t = df >> summarize(m=median(X.x)) 239 | df_truth = pd.DataFrame({'m': [4.05]}) 240 | assert t.equals(df_truth) 241 | 242 | # grouped summarize 243 | t = df >> group_by(X.cut) >> summarize(m=median(X.x)) 244 | df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'], 245 | 'm': [6.27, 4.25, 3.95, 3.89, 3.95]}) 246 | assert t.equals(df_truth) 247 | # straight mutate 248 | t = df >> mutate(m=median(X.x)) 249 | df_truth = df.copy() 250 | df_truth['m'] = 4.05 251 | assert t.equals(df_truth) 252 | # grouped mutate 253 | # t = df >> group_by(X.cut) >> mutate(m=median(X.x)) 254 | # df_truth['m'] = pd.Series( 255 | # [6.27, 6.27, 6.27, 4.25, 4.25, 4.25, 3.95, 3.95, 3.95, 3.89, 3.89, 3.89, 3.95, 3.95, 3.95], 256 | # index=t.index) 257 | # assert t.equals(df_truth) 258 | # make sure it handles case with even counts properly 259 | df = diamonds >> group_by(X.cut) >> 
head(2) >> select(X.cut, X.x)
260 |     t = df >> group_by(X.cut) >> summarize(m=median(X.x))
261 |     df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
262 |                              'm': [5.160, 4.195, 3.940, 4.045, 3.945]})
263 |     test_vector = abs(t.m - df_truth.m)
264 |     assert all(test_vector < .000000001)
265 | 
266 | 
267 | def test_var():
268 |     df = diamonds >> group_by(X.cut) >> head(3) >> select(X.cut, X.x) >> ungroup()
269 | 
270 |     # straight summarize
271 |     t = df >> summarize(v=var(X.x))
272 |     df_truth = pd.DataFrame({'v': [0.687392]})
273 |     test_vector = abs(t.v - df_truth.v)
274 |     print(t.v)
275 |     print(df_truth.v)
276 |     assert all(test_vector < .00001)
277 | 
278 |     # grouped summarize
279 |     t = df >> group_by(X.cut) >> summarize(v=var(X.x))
280 |     df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
281 |                              'v': [2.074800, 0.022033, 0.056133, 0.033100, 0.005233]})
282 |     test_vector = abs(t.v - df_truth.v)
283 |     assert all(test_vector < .00001)
284 |     # straight mutate
285 |     t = df >> mutate(v=var(X.x))
286 |     df_truth = df.copy()
287 |     df_truth['v'] = 0.687392
288 |     test_vector = abs(t.v - df_truth.v)
289 |     assert all(test_vector < .00001)
290 |     # grouped mutate
291 |     # t = df >> group_by(X.cut) >> mutate(v=var(X.x))
292 |     # df_truth['v'] = pd.Series([2.074800, 2.074800, 2.074800, 0.022033, 0.022033, 0.022033,
293 |     #                            0.056133, 0.056133, 0.056133, 0.033100, 0.033100, 0.033100,
294 |     #                            0.005233, 0.005233, 0.005233],
295 |     #                           index=t.index)
296 |     # test_vector = abs(t.v - df_truth.v)
297 |     # assert all(test_vector < .00001)
298 |     # test with single value (var undefined)
299 |     df = diamonds >> group_by(X.cut) >> head(1) >> select(X.cut, X.x)
300 |     t = df >> group_by(X.cut) >> summarize(v=var(X.x))
301 |     df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
302 |                              'v': [np.nan, np.nan, np.nan, np.nan, np.nan]})
303 |     assert t.equals(df_truth)
304 | 
305 | 
306 | def test_sd():
307 |     df = diamonds >> group_by(X.cut) >> head(3) >> select(X.cut, X.x) >> ungroup()
308 |     # straight summarize
309 |     t = df >> summarize(s=sd(X.x))
310 |     df_truth = pd.DataFrame({'s': [0.829091]})
311 |     test_vector = abs(t.s - df_truth.s)
312 |     print(t)
313 |     print(t.s)
314 |     print(df_truth.s)
315 |     assert all(test_vector < .00001)
316 |     # grouped summarize
317 |     t = df >> group_by(X.cut) >> summarize(s=sd(X.x))
318 |     df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
319 |                              's': [1.440417, 0.148436, 0.236925, 0.181934, 0.072342]})
320 |     test_vector = abs(t.s - df_truth.s)
321 |     assert all(test_vector < .00001)
322 |     # straight mutate
323 |     t = df >> mutate(s=sd(X.x))
324 |     df_truth = df.copy()
325 |     df_truth['s'] = 0.829091
326 |     test_vector = abs(t.s - df_truth.s)
327 |     assert all(test_vector < .00001)
328 |     # grouped mutate
329 |     t = df >> group_by(X.cut) >> mutate(s=sd(X.x))
330 |     # df_truth['s'] = pd.Series([1.440417, 1.440417, 1.440417, 0.148436, 0.148436, 0.148436,
331 |     #                            0.236925, 0.236925, 0.236925, 0.181934, 0.181934, 0.181934,
332 |     #                            0.072342, 0.072342, 0.072342],
333 |     #                           index=t.index)
334 |     # test_vector = abs(t.s - df_truth.s)
335 |     # print(t)
336 |     # print(df_truth)
337 |     # assert all(test_vector < .00001)  # disabled along with the truth block above; the bare assert only re-checked the straight-mutate vector
338 |     # test with single value (var undefined)
339 |     df = diamonds >> group_by(X.cut) >> head(1) >> select(X.cut, X.x)
340 |     t = df >> group_by(X.cut) >> summarize(s=sd(X.x))
341 |     df_truth = pd.DataFrame({'cut': ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'],
342 |                              's': [np.nan, np.nan, np.nan, np.nan, np.nan]})
343 |     assert t.equals(df_truth)
344 | 
--------------------------------------------------------------------------------
/examples/basics-extending-functionality.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "\n",
14 | "from dfply import *"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "### Case #1: A custom pipe function\n",
22 | "---\n",
23 | "\n",
24 | "Pandas has a function `pd.crosstab` which can generate a cross-tabulation of factors. Let's say we wanted to build a pipe function that wrapped around this. The docstring of the Pandas function is below:\n",
25 | "\n",
26 | "Compute a simple cross-tabulation of two (or more) factors. By default\n",
27 | "computes a frequency table of the factors unless an array of values and an\n",
28 | "aggregation function are passed\n",
29 | "\n",
30 | "    Parameters\n",
31 | "    ----------\n",
32 | "    index : array-like, Series, or list of arrays/Series\n",
33 | "        Values to group by in the rows\n",
34 | "    columns : array-like, Series, or list of arrays/Series\n",
35 | "        Values to group by in the columns\n",
36 | "    values : array-like, optional\n",
37 | "        Array of values to aggregate according to the factors.\n",
38 | "        Requires `aggfunc` be specified.\n",
39 | "    aggfunc : function, optional\n",
40 | "        If specified, requires `values` be specified as well\n",
41 | "    rownames : sequence, default None\n",
42 | "        If passed, must match number of row arrays passed\n",
43 | "    colnames : sequence, default None\n",
44 | "        If passed, must match number of column arrays passed\n",
45 | "    margins : boolean, default False\n",
46 | "        Add row/column margins (subtotals)\n",
47 | "    dropna : boolean, default True\n",
48 | "        Do not include columns whose entries are all NaN\n",
49 | "    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False\n",
50 | "        Normalize by dividing all values by the sum of values.\n",
51 | "\n",
52 | "        - If passed 'all' or `True`, will normalize over all values.\n",
53 | "        - If passed 'index' will normalize over each row.\n",
54 | "        - If passed 'columns' will normalize over each column.\n",
55 | "        - If margins is `True`, will also normalize margin values.\n",
56 | "    \n",
57 | "\n",
58 | "**To keep it simple, let's build a reduced version of this that takes only:**\n",
59 | "- `index`\n",
60 | "- `columns`\n",
61 | "- `values`\n",
62 | "- `aggfunc`\n",
63 | "\n",
64 | "Below is a function that wraps around the call to `pd.crosstab`. "
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "def crosstab(index, columns, values=None, aggfunc=None):\n",
76 | "    return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 6,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "data": {
86 | "text/html": [
87 | "<div>
\n", 88 | "\n", 101 | "\n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | "
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
\n", 146 | "
" 147 | ], 148 | "text/plain": [ 149 | " carat cut color clarity depth table price x y z\n", 150 | "0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n", 151 | "1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31" 152 | ] 153 | }, 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "diamonds.head(2)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
\n", 172 | "\n", 185 | "\n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | "
colorDEFGHIJ
cut
Fair163224312314303175119
Good662933909871702522307
Ideal283439033826488431152093896
Premium160323372331292423601428808
Very Good151324002164229918241204678
\n", 261 | "
" 262 | ], 263 | "text/plain": [ 264 | "color D E F G H I J\n", 265 | "cut \n", 266 | "Fair 163 224 312 314 303 175 119\n", 267 | "Good 662 933 909 871 702 522 307\n", 268 | "Ideal 2834 3903 3826 4884 3115 2093 896\n", 269 | "Premium 1603 2337 2331 2924 2360 1428 808\n", 270 | "Very Good 1513 2400 2164 2299 1824 1204 678" 271 | ] 272 | }, 273 | "execution_count": 7, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "crosstab(diamonds.cut, diamonds.color)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "If you want your function to be part of a dfply pipe chain, the first argument _must_ be a dataframe, which is implicitly passed through during the evaluation of the chain! We will need to redefine the function to have the implicit `df` passed in as the first argument.\n", 287 | "\n", 288 | "The most common and straightforward way to convert a custom function to a dfply piping function is to use the `@dfpipe` decorator. \n", 289 | "\n", 290 | "> Note: the `@dfpipe` decorator is in fact a convenience decorator that stacks three dfply decorators together: \n", 291 | "\n", 292 | "> `@pipe` \n", 293 | "`@group_delegation` \n", 294 | "`@symbolic_evaluation` \n", 295 | "\n", 296 | "> `@pipe` ensures that the function will work in the dfply piping syntax and take an implicit DataFrame, `@group_delegation` makes the function work with groupings applied prior in the chain, and `@symbolic_evaluation` enables you to use and evaluate symbolic arguments like `X.cut` that are placeholders for incoming data." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 8, 302 | "metadata": { 303 | "collapsed": true 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "@dfpipe\n", 308 | "def crosstab(df, index, columns, values=None, aggfunc=None):\n", 309 | " return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/html": [ 320 | "
\n", 321 | "\n", 334 | "\n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | "
colorDEFGHIJ
cut
Fair163224312314303175119
Good662933909871702522307
Ideal283439033826488431152093896
Premium160323372331292423601428808
Very Good151324002164229918241204678
\n", 410 | "
" 411 | ], 412 | "text/plain": [ 413 | "color D E F G H I J\n", 414 | "cut \n", 415 | "Fair 163 224 312 314 303 175 119\n", 416 | "Good 662 933 909 871 702 522 307\n", 417 | "Ideal 2834 3903 3826 4884 3115 2093 896\n", 418 | "Premium 1603 2337 2331 2924 2360 1428 808\n", 419 | "Very Good 1513 2400 2164 2299 1824 1204 678" 420 | ] 421 | }, 422 | "execution_count": 9, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "diamonds >> crosstab(X.cut, X.color)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "### Case #2: A function that works with symbolic arguments\n", 436 | "---\n", 437 | "\n", 438 | "Many tasks are simpler and do not require the capacity to work as a pipe function. The `dfply` window functions are the common examples of this: functions that take a Series (or _symbolic_ Series) and return a modified version.\n", 439 | "\n", 440 | "Let's say we had a dataframe with dates represented by strings that we wanted to convert to pandas datetime objects using the `pd.to_datetime` function. Below is a tiny example dataframe with this issue." 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 10, 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "data": { 450 | "text/html": [ 451 | "
\n", 452 | "\n", 465 | "\n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | "
datesales
07/10/171220
17/11/171592
27/12/17908
37/13/171102
47/14/171395
\n", 501 | "
" 502 | ], 503 | "text/plain": [ 504 | " date sales\n", 505 | "0 7/10/17 1220\n", 506 | "1 7/11/17 1592\n", 507 | "2 7/12/17 908\n", 508 | "3 7/13/17 1102\n", 509 | "4 7/14/17 1395" 510 | ] 511 | }, 512 | "execution_count": 10, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "sales = pd.DataFrame(dict(date=['7/10/17','7/11/17','7/12/17','7/13/17','7/14/17'],\n", 519 | " sales=[1220, 1592, 908, 1102, 1395]))\n", 520 | "sales" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 11, 526 | "metadata": {}, 527 | "outputs": [ 528 | { 529 | "data": { 530 | "text/plain": [ 531 | "date object\n", 532 | "sales int64\n", 533 | "dtype: object" 534 | ] 535 | }, 536 | "execution_count": 11, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "sales.dtypes" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "In pandas we would use the `pd.to_datetime` function to convert the strings to date objects, and add it as a new column like so:" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 12, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/html": [ 560 | "
\n", 561 | "\n", 574 | "\n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | "
datesalespd_date
07/10/1712202017-07-10
17/11/1715922017-07-11
27/12/179082017-07-12
37/13/1711022017-07-13
47/14/1713952017-07-14
\n", 616 | "
" 617 | ], 618 | "text/plain": [ 619 | " date sales pd_date\n", 620 | "0 7/10/17 1220 2017-07-10\n", 621 | "1 7/11/17 1592 2017-07-11\n", 622 | "2 7/12/17 908 2017-07-12\n", 623 | "3 7/13/17 1102 2017-07-13\n", 624 | "4 7/14/17 1395 2017-07-14" 625 | ] 626 | }, 627 | "execution_count": 12, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | "sales['pd_date'] = pd.to_datetime(sales['date'], infer_datetime_format=True)\n", 634 | "sales" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 13, 640 | "metadata": { 641 | "collapsed": true 642 | }, 643 | "outputs": [], 644 | "source": [ 645 | "sales.drop('pd_date', axis=1, inplace=True)" 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": {}, 651 | "source": [ 652 | "What if you tried to use the `pd.to_datetime` function inside of a call to mutate, like so?\n", 653 | "\n", 654 | "```python\n", 655 | "sales >> mutate(pd_date=pd.to_datetime(X.date, infer_datetime_format=True))\n", 656 | "```\n", 657 | "\n", 658 | "This will unfortunately break. The `dfply` functions are special in that they \"know\" to delay their evaluation until the data is at that point in the chain. `pd.to_datetime` is not such a function, and will immediately try to evaluate `X.date`. With a symbolic `Intention` argument passed in, the function will fail as it does not know what to do with that.\n", 659 | "\n", 660 | "Instead, we will need to make a wrapper around `pd.to_datetime` that can handle these symbolic arguments and delay evaluation until the right time. \n", 661 | "\n", 662 | "This is quite simple: all you need to do is decorate a function with the `@make_symbolic` decorator, like so:" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 14, 668 | "metadata": { 669 | "collapsed": true 670 | }, 671 | "outputs": [], 672 | "source": [ 673 | "@make_symbolic\n", 674 | "def to_datetime(series, infer_datetime_format=True):\n", 675 | " return pd.to_datetime(series, infer_datetime_format=infer_datetime_format)" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 15, 681 | "metadata": {}, 682 | "outputs": [ 683 | { 684 | "data": { 685 | "text/html": [ 686 | "
\n", 687 | "\n", 700 | "\n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | "
datesalespd_date
07/10/1712202017-07-10
17/11/1715922017-07-11
27/12/179082017-07-12
37/13/1711022017-07-13
47/14/1713952017-07-14
\n", 742 | "
" 743 | ], 744 | "text/plain": [ 745 | " date sales pd_date\n", 746 | "0 7/10/17 1220 2017-07-10\n", 747 | "1 7/11/17 1592 2017-07-11\n", 748 | "2 7/12/17 908 2017-07-12\n", 749 | "3 7/13/17 1102 2017-07-13\n", 750 | "4 7/14/17 1395 2017-07-14" 751 | ] 752 | }, 753 | "execution_count": 15, 754 | "metadata": {}, 755 | "output_type": "execute_result" 756 | } 757 | ], 758 | "source": [ 759 | "sales >> mutate(pd_date=to_datetime(X.date))" 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "metadata": {}, 765 | "source": [ 766 | "And there you go. Able to delay the evaluation.\n", 767 | "\n", 768 | "What's particularly nice about the `@make_symbolic` decorator is that it has no trouble working with non-symbolic arguments too. If we were to pass in the series itself the function evaluates without a problem:" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 16, 774 | "metadata": {}, 775 | "outputs": [ 776 | { 777 | "data": { 778 | "text/plain": [ 779 | "0 2017-07-10\n", 780 | "1 2017-07-11\n", 781 | "2 2017-07-12\n", 782 | "3 2017-07-13\n", 783 | "4 2017-07-14\n", 784 | "Name: date, dtype: datetime64[ns]" 785 | ] 786 | }, 787 | "execution_count": 16, 788 | "metadata": {}, 789 | "output_type": "execute_result" 790 | } 791 | ], 792 | "source": [ 793 | "to_datetime(sales.date)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "Keep in mind, though, that if _any_ of the arguments or keyword arguments are symbolic `Intention` objects, the return will itself be an `Intention` object representing the function awaiting evaluation by a dataframe:" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": 17, 806 | "metadata": {}, 807 | "outputs": [ 808 | { 809 | "data": { 810 | "text/plain": [ 811 | "" 812 | ] 813 | }, 814 | "execution_count": 17, 815 | "metadata": {}, 816 | "output_type": "execute_result" 817 | } 818 | ], 819 | "source": [ 820 | "to_datetime(X.date)" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 19, 826 | "metadata": {}, 827 | "outputs": [ 828 | { 829 | "data": { 830 | "text/plain": [ 831 | "0 2017-07-10\n", 832 | "1 2017-07-11\n", 833 | "2 2017-07-12\n", 834 | "3 2017-07-13\n", 835 | "4 2017-07-14\n", 836 | "Name: date, dtype: datetime64[ns]" 837 | ] 838 | }, 839 | "execution_count": 19, 840 | "metadata": {}, 841 | "output_type": "execute_result" 842 | } 843 | ], 844 | "source": [ 845 | "awaiting = to_datetime(X.date)\n", 846 | "awaiting.evaluate(sales)" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": { 853 | "collapsed": true 854 | }, 855 | "outputs": [], 856 | "source": [] 857 | } 858 | ], 859 | "metadata": { 860 | "kernelspec": { 861 | "display_name": "Python 3", 862 | "language": "python", 863 | "name": "python3" 864 | }, 865 | "language_info": { 866 | "codemirror_mode": { 867 | "name": "ipython", 868 | "version": 3 869 | }, 870 | "file_extension": ".py", 871 | "mimetype": "text/x-python", 872 | "name": "python", 873 | "nbconvert_exporter": "python", 874 | "pygments_lexer": "ipython3", 875 | "version": "3.6.1" 876 | } 877 | }, 878 | "nbformat": 4, 879 | "nbformat_minor": 2 880 | } 881 | -------------------------------------------------------------------------------- /examples/.ipynb_checkpoints/basics-extending-functionality-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
"execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "\n", 14 | "from dfply import *" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Case #1: A custom pipe function\n", 22 | "---\n", 23 | "\n", 24 | "Pandas has a function `pd.crosstab` which can generate a cross-tabluation of factors. Let's say we wanted to build a pipe function that wrapped around this. The docstring of the Pandas function is below:\n", 25 | "\n", 26 | "Compute a simple cross-tabulation of two (or more) factors. By default\n", 27 | "computes a frequency table of the factors unless an array of values and an\n", 28 | "aggregation function are passed\n", 29 | "\n", 30 | " Parameters\n", 31 | " ----------\n", 32 | " index : array-like, Series, or list of arrays/Series\n", 33 | " Values to group by in the rows\n", 34 | " columns : array-like, Series, or list of arrays/Series\n", 35 | " Values to group by in the columns\n", 36 | " values : array-like, optional\n", 37 | " Array of values to aggregate according to the factors.\n", 38 | " Requires `aggfunc` be specified.\n", 39 | " aggfunc : function, optional\n", 40 | " If specified, requires `values` be specified as well\n", 41 | " rownames : sequence, default None\n", 42 | " If passed, must match number of row arrays passed\n", 43 | " colnames : sequence, default None\n", 44 | " If passed, must match number of column arrays passed\n", 45 | " margins : boolean, default False\n", 46 | " Add row/column margins (subtotals)\n", 47 | " dropna : boolean, default True\n", 48 | " Do not include columns whose entries are all NaN\n", 49 | " normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False\n", 50 | " Normalize by dividing all values by the sum of values.\n", 51 | "\n", 52 | " - If passed 'all' or `True`, will normalize over all values.\n", 53 | " - If passed 'index' will normalize over each row.\n", 54 | " - If passed 'columns' will normalize over each column.\n", 55 | " - If margins is `True`, will also normalize margin values.\n", 56 | " \n", 57 | "\n", 58 | "**To keep it simple, let's build a reduced version of this that takes only:**\n", 59 | "- `index`\n", 60 | "- `columns`\n", 61 | "- `values`\n", 62 | "- `aggfunc`\n", 63 | "\n", 64 | "Below is a function that wraps around the call to `pd.crosstab`. " 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "def crosstab(index, columns, values=None, aggfunc=None):\n", 76 | " return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/html": [ 87 | "
\n", 88 | "\n", 101 | "\n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | "
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
\n", 146 | "
" 147 | ], 148 | "text/plain": [ 149 | " carat cut color clarity depth table price x y z\n", 150 | "0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n", 151 | "1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31" 152 | ] 153 | }, 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "diamonds.head(2)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
\n", 172 | "\n", 185 | "\n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | "
colorDEFGHIJ
cut
Fair163224312314303175119
Good662933909871702522307
Ideal283439033826488431152093896
Premium160323372331292423601428808
Very Good151324002164229918241204678
\n", 261 | "
" 262 | ], 263 | "text/plain": [ 264 | "color D E F G H I J\n", 265 | "cut \n", 266 | "Fair 163 224 312 314 303 175 119\n", 267 | "Good 662 933 909 871 702 522 307\n", 268 | "Ideal 2834 3903 3826 4884 3115 2093 896\n", 269 | "Premium 1603 2337 2331 2924 2360 1428 808\n", 270 | "Very Good 1513 2400 2164 2299 1824 1204 678" 271 | ] 272 | }, 273 | "execution_count": 7, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "crosstab(diamonds.cut, diamonds.color)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "If you want your function to be part of a dfply pipe chain, the first argument _must_ be a dataframe, which is implicitly passed through during the evaluation of the chain! We will need to redefine the function to have the implicit `df` passed in as the first argument.\n", 287 | "\n", 288 | "The most common and straightforward way to convert a custom function to a dfply piping function is to use the `@dfpipe` decorator. \n", 289 | "\n", 290 | "> Note: the `@dfpipe` decorator is in fact a convenience decorator that stacks three dfply decorators together: \n", 291 | "\n", 292 | "> `@pipe` \n", 293 | "`@group_delegation` \n", 294 | "`@symbolic_evaluation` \n", 295 | "\n", 296 | "> `@pipe` ensures that the function will work in the dfply piping syntax and take an implicit DataFrame, `@group_delegation` makes the function work with groupings applied prior in the chain, and `@symbolic_evaluation` enables you to use and evaluate symbolic arguments like `X.cut` that are placeholders for incoming data." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 8, 302 | "metadata": { 303 | "collapsed": true 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "@dfpipe\n", 308 | "def crosstab(df, index, columns, values=None, aggfunc=None):\n", 309 | " return pd.crosstab(index, columns, values=values, aggfunc=aggfunc)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/html": [ 320 | "
\n", 321 | "\n", 334 | "\n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | "
colorDEFGHIJ
cut
Fair163224312314303175119
Good662933909871702522307
Ideal283439033826488431152093896
Premium160323372331292423601428808
Very Good151324002164229918241204678
\n", 410 | "
" 411 | ], 412 | "text/plain": [ 413 | "color D E F G H I J\n", 414 | "cut \n", 415 | "Fair 163 224 312 314 303 175 119\n", 416 | "Good 662 933 909 871 702 522 307\n", 417 | "Ideal 2834 3903 3826 4884 3115 2093 896\n", 418 | "Premium 1603 2337 2331 2924 2360 1428 808\n", 419 | "Very Good 1513 2400 2164 2299 1824 1204 678" 420 | ] 421 | }, 422 | "execution_count": 9, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "diamonds >> crosstab(X.cut, X.color)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "### Case #2: A function that works with symbolic arguments\n", 436 | "---\n", 437 | "\n", 438 | "Many tasks are simpler and do not require the capacity to work as a pipe function. The `dfply` window functions are the common examples of this: functions that take a Series (or _symbolic_ Series) and return a modified version.\n", 439 | "\n", 440 | "Let's say we had a dataframe with dates represented by strings that we wanted to convert to pandas datetime objects using the `pd.to_datetime` function. Below is a tiny example dataframe with this issue." 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 10, 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "data": { 450 | "text/html": [ 451 | "
\n", 452 | "\n", 465 | "\n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | "
datesales
07/10/171220
17/11/171592
27/12/17908
37/13/171102
47/14/171395
\n", 501 | "
" 502 | ], 503 | "text/plain": [ 504 | " date sales\n", 505 | "0 7/10/17 1220\n", 506 | "1 7/11/17 1592\n", 507 | "2 7/12/17 908\n", 508 | "3 7/13/17 1102\n", 509 | "4 7/14/17 1395" 510 | ] 511 | }, 512 | "execution_count": 10, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "sales = pd.DataFrame(dict(date=['7/10/17','7/11/17','7/12/17','7/13/17','7/14/17'],\n", 519 | " sales=[1220, 1592, 908, 1102, 1395]))\n", 520 | "sales" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 11, 526 | "metadata": {}, 527 | "outputs": [ 528 | { 529 | "data": { 530 | "text/plain": [ 531 | "date object\n", 532 | "sales int64\n", 533 | "dtype: object" 534 | ] 535 | }, 536 | "execution_count": 11, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "sales.dtypes" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "In pandas we would use the `pd.to_datetime` function to convert the strings to date objects, and add it as a new column like so:" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 12, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/html": [ 560 | "
\n", 561 | "\n", 574 | "\n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | "
datesalespd_date
07/10/1712202017-07-10
17/11/1715922017-07-11
27/12/179082017-07-12
37/13/1711022017-07-13
47/14/1713952017-07-14
\n", 616 | "
" 617 | ], 618 | "text/plain": [ 619 | " date sales pd_date\n", 620 | "0 7/10/17 1220 2017-07-10\n", 621 | "1 7/11/17 1592 2017-07-11\n", 622 | "2 7/12/17 908 2017-07-12\n", 623 | "3 7/13/17 1102 2017-07-13\n", 624 | "4 7/14/17 1395 2017-07-14" 625 | ] 626 | }, 627 | "execution_count": 12, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | "sales['pd_date'] = pd.to_datetime(sales['date'], infer_datetime_format=True)\n", 634 | "sales" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 13, 640 | "metadata": { 641 | "collapsed": true 642 | }, 643 | "outputs": [], 644 | "source": [ 645 | "sales.drop('pd_date', axis=1, inplace=True)" 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": {}, 651 | "source": [ 652 | "What if you tried to use the `pd.to_datetime` function inside of a call to mutate, like so?\n", 653 | "\n", 654 | "```python\n", 655 | "sales >> mutate(pd_date=pd.to_datetime(X.date, infer_datetime_format=True))\n", 656 | "```\n", 657 | "\n", 658 | "This will unfortunately break. The `dfply` functions are special in that they \"know\" to delay their evaluation until the data is at that point in the chain. `pd.to_datetime` is not such a function, and will immediately try to evaluate `X.date`. With a symbolic `Intention` argument passed in, the function will fail as it does not know what to do with that.\n", 659 | "\n", 660 | "Instead, we will need to make a wrapper around `pd.to_datetime` that can handle these symbolic arguments and delay evaluation until the right time. \n", 661 | "\n", 662 | "This is quite simple: all you need to do is decorate a function with the `@make_symbolic` decorator, like so:" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 14, 668 | "metadata": { 669 | "collapsed": true 670 | }, 671 | "outputs": [], 672 | "source": [ 673 | "@make_symbolic\n", 674 | "def to_datetime(series, infer_datetime_format=True):\n", 675 | " return pd.to_datetime(series, infer_datetime_format=infer_datetime_format)" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 15, 681 | "metadata": {}, 682 | "outputs": [ 683 | { 684 | "data": { 685 | "text/html": [ 686 | "
\n", 687 | "\n", 700 | "\n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | "
datesalespd_date
07/10/1712202017-07-10
17/11/1715922017-07-11
27/12/179082017-07-12
37/13/1711022017-07-13
47/14/1713952017-07-14
\n", 742 | "
" 743 | ], 744 | "text/plain": [ 745 | " date sales pd_date\n", 746 | "0 7/10/17 1220 2017-07-10\n", 747 | "1 7/11/17 1592 2017-07-11\n", 748 | "2 7/12/17 908 2017-07-12\n", 749 | "3 7/13/17 1102 2017-07-13\n", 750 | "4 7/14/17 1395 2017-07-14" 751 | ] 752 | }, 753 | "execution_count": 15, 754 | "metadata": {}, 755 | "output_type": "execute_result" 756 | } 757 | ], 758 | "source": [ 759 | "sales >> mutate(pd_date=to_datetime(X.date))" 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "metadata": {}, 765 | "source": [ 766 | "And there you go. Able to delay the evaluation.\n", 767 | "\n", 768 | "What's particularly nice about the `@make_symbolic` decorator is that it has no trouble working with non-symbolic arguments too. If we were to pass in the series itself the function evaluates without a problem:" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 16, 774 | "metadata": {}, 775 | "outputs": [ 776 | { 777 | "data": { 778 | "text/plain": [ 779 | "0 2017-07-10\n", 780 | "1 2017-07-11\n", 781 | "2 2017-07-12\n", 782 | "3 2017-07-13\n", 783 | "4 2017-07-14\n", 784 | "Name: date, dtype: datetime64[ns]" 785 | ] 786 | }, 787 | "execution_count": 16, 788 | "metadata": {}, 789 | "output_type": "execute_result" 790 | } 791 | ], 792 | "source": [ 793 | "to_datetime(sales.date)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "Keep in mind, though, that if _any_ of the arguments or keyword arguments are symbolic `Intention` objects, the return will itself be an `Intention` object representing the function awaiting evaluation by a dataframe:" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": 17, 806 | "metadata": {}, 807 | "outputs": [ 808 | { 809 | "data": { 810 | "text/plain": [ 811 | "" 812 | ] 813 | }, 814 | "execution_count": 17, 815 | "metadata": {}, 816 | "output_type": "execute_result" 817 | } 818 | ], 819 | "source": [ 820 | "to_datetime(X.date)" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": 19, 826 | "metadata": {}, 827 | "outputs": [ 828 | { 829 | "data": { 830 | "text/plain": [ 831 | "0 2017-07-10\n", 832 | "1 2017-07-11\n", 833 | "2 2017-07-12\n", 834 | "3 2017-07-13\n", 835 | "4 2017-07-14\n", 836 | "Name: date, dtype: datetime64[ns]" 837 | ] 838 | }, 839 | "execution_count": 19, 840 | "metadata": {}, 841 | "output_type": "execute_result" 842 | } 843 | ], 844 | "source": [ 845 | "awaiting = to_datetime(X.date)\n", 846 | "awaiting.evaluate(sales)" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": { 853 | "collapsed": true 854 | }, 855 | "outputs": [], 856 | "source": [] 857 | } 858 | ], 859 | "metadata": { 860 | "kernelspec": { 861 | "display_name": "Python 3", 862 | "language": "python", 863 | "name": "python3" 864 | }, 865 | "language_info": { 866 | "codemirror_mode": { 867 | "name": "ipython", 868 | "version": 3 869 | }, 870 | "file_extension": ".py", 871 | "mimetype": "text/x-python", 872 | "name": "python", 873 | "nbconvert_exporter": "python", 874 | "pygments_lexer": "ipython3", 875 | "version": "3.6.1" 876 | } 877 | }, 878 | "nbformat": 4, 879 | "nbformat_minor": 2 880 | } 881 | --------------------------------------------------------------------------------